# Prompting

Notebook to perform prompting with OpenPrompt on the models. 

One of the few resources found to Prefix Templates with OpenPrompt

> https://github.com/thunlp/OpenPrompt/blob/main/tutorial/2.1_conditional_generation.py

---

## Notebook setup

In [None]:
# ! pip install -q openprompt transformers dataset

In [2]:
from openprompt import PromptDataLoader, PromptForGeneration
from openprompt.data_utils import InputExample
from openprompt.plms import load_plm
from openprompt.prompts import PrefixTuningTemplate
from openprompt.utils.metrics import generation_metric
from sklearn.model_selection import train_test_split
from datasets.dataset_dict import DatasetDict, Dataset
# from torch.utils.data import Dataset, DataLoader, random_split, RandomSampler, SequentialSampler
from tqdm import tqdm
from transformers.optimization import get_linear_schedule_with_warmup
import torch
from torch.optim import AdamW
import pandas as pd
from pathlib import Path


---

## Set paths and parametes

In [3]:
base_path = "./evaluations/"
csv_data = "./df_songs.csv"
used_model = "gpt2"

train_split = 0.7
epochs = 10
batch_size = 8

---

## Model / Data Preparation

### Dataset

Read the CSV, remove everything except the lyrics. Add index for flavour.

In [None]:
lyrics_df = pd.read_csv(csv_data)
lyrics_df = lyrics_df.drop(
    columns=["Artist", "Song", "LyricsWordCount"], errors="ignore"
).reset_index(level=0)


Split the dataset and create an DatasetDict

In [None]:
train_df, validation_df = train_test_split(lyrics_df, train_size=train_split)
train_dataset, validation_dataset = Dataset.from_pandas(train_df), Dataset.from_pandas(
    validation_df
)
raw_dataset = DatasetDict({"train": train_dataset, "validation": validation_dataset})


Create a new dataset with a mapped `InputExample` for each sample

In [None]:
dataset = {}
for split in ['train', 'validation']:
    dataset[split] = []
    for data in raw_dataset[split]:
        # input_example = InputExample(text_a = data['premise'], text_b = data['hypothesis'], label=int(data['label']), guid=data['idx'])
        input_example = InputExample(text_a = data['Lyrics'], guid=data['index'])
        dataset[split].append(input_example)

Steal this dataloader wrapper function 🐱‍👤

In [None]:
def get_dataloader(
    dataset_split, template, tokenizer, wrapper_class, shuffle=False, batch_size=32
):
    """Returns a prompt data load for a given dataset split and template"""

    return PromptDataLoader(
        dataset=dataset_split,
        template=template,
        tokenizer=tokenizer,
        tokenizer_wrapper_class=wrapper_class,
        max_seq_length=256,
        decoder_max_length=256,
        batch_size=batch_size,
        shuffle=shuffle,
        teacher_forcing=False,
        predict_eos_token=True,
        truncate_method="head",
    )


### Model (PLM)

In [None]:
plm, tokenizer, model_config, WrapperClass = load_plm(used_model, used_model)


Using pad_token, but it is not set yet.


In [None]:
# # tokenizer = GPT2Tokenizer.from_pretrained(used_model, bos_token='<|startoftext|>', eos_token='<|endoftext|>', pad_token='<|pad|>') #gpt2-medium

# tokenizer.bos_token = "<|startoftext|>"
# tokenizer.eos_token = "<|endoftext|>"
# tokenizer.pad_token = "<|pad|>"


---

## Prompt-Based Fine-Tuning

Create a template.
The used template (line 1) equals the last template (line 7), so that the text param can be omitted.

In [None]:
template = PrefixTuningTemplate(model=plm, tokenizer=tokenizer, text=' {"placeholder":"text_a"} {"special": "<eos>"} {"mask"} ', using_decoder_past_key_values=False)

# Are the tokens necessary? Probably not:
# # You may observe that the example doesn't end with <|endoftext|> token. Don't worry, adding specific end-of-text token
# # is a language-model-specific token. we will add it for you in the TokenizerWrapper once you pass `predict_eos_token=True`

# template = PrefixTuningTemplate(model=plm, tokenizer=tokenizer, text='<|startoftext|>{"placeholder":"text_a"} {"mask"}<|endoftext|>')

Ignore using_decoder_past_key_values=False in a decoder-only LM.


Create one example and print it, to see what it looks like.

In [None]:
print(template.wrap_one_example(dataset['train'][0]))

[[{'text': ' Still Not A Player Clean Version Lyrics\nDont wanna be a player TS \nI dont wanna be a playa no more\nIm not a playa I just crush a lot\nBut Big Punisher still got what youre lookin for\nFor my thugs I dont wanna be for my thugs I dont wanna be a playa no more \nI dont wanna be a playa no more\nIm not a playa I just crush a lot\nBut you know Big Punisher still down by law\nWhos down to crush a lot\nHey yo Im still not a player but you still a hater\nElevator to the top hah see you later Im gone\nPenthouse suite Penthouse freaks\nIn house beach French countess ten thou piece\nRentout lease with a option to buy\nCoppin a fiveoh Benz for when Im not far up in the sky\nPuffin the lye from my Twinzito\nUp in the Benzito with my kiko from Queens nicknamed Perico\nWe go back like PAs and wearin PJs\nNow we reach the peakage runnin trains for three days\nWho wanna ride it wont cost you a dollar\nWhether soft or harder of course you still gonna holla\nMy my Im big huh I rip my thro

In [None]:
train_dataloader = get_dataloader(
    dataset["train"],
    template,
    tokenizer,
    WrapperClass,
    shuffle=True,
    batch_size=batch_size,
)
validation_dataloader = get_dataloader(
    dataset["validation"],
    template,
    tokenizer,
    WrapperClass,
    shuffle=False,
    batch_size=batch_size,
)


tokenizing: 0it [00:00, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1029 > 1024). Running this sequence through the model will result in indexing errors
tokenizing: 5714it [00:25, 225.69it/s]
tokenizing: 2449it [00:09, 244.91it/s]


In [None]:
prompt_model = PromptForGeneration(plm=plm,template=template, freeze_plm=True,tokenizer=tokenizer)
prompt_model = prompt_model.cuda()

In [None]:
# Follow PrefixTuning（https://github.com/XiangLi1999/PrefixTuning), we also fix the language model
# only include the template's parameters in training.

no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
    {
        "params": [
            p
            for n, p in template.named_parameters()
            if (not any(nd in n for nd in no_decay)) and p.requires_grad
        ],
        "weight_decay": 0.0,
    },
    {
        "params": [
            p
            for n, p in template.named_parameters()
            if any(nd in n for nd in no_decay) and p.requires_grad
        ],
        "weight_decay": 0.0,
    },
]

optimizer = AdamW(optimizer_grouped_parameters, lr=3e-5, eps=1e-8)


In [None]:
tot_step  = len(train_dataloader)*5
scheduler = get_linear_schedule_with_warmup(optimizer, 0, tot_step)

In [20]:
generation_arguments = {
    "max_length": 512,
    "max_new_tokens": None,
    "min_length": 5,
    "temperature": 1.0,
    "do_sample": False,
    "top_k": 0,
    "top_p": 0.9,
    "repetition_penalty": 1.0,
    "num_beams": 5,
    "bad_words_ids": [[628], [198]]
}

def evaluate(prompt_model, dataloader):
    generated_sentence = []
    prompt_model.eval()

    for step, inputs in enumerate(dataloader):
        inputs = inputs.cuda()
        _, output_sentence = prompt_model.generate(inputs, **generation_arguments)
        generated_sentence.extend(output_sentence)
    return generated_sentence

In [None]:
# training and generation.
global_step = 0
tot_loss = 0
log_loss = 0
for epoch in range(epochs):
    prompt_model.train()
    for step, inputs in tqdm(enumerate(train_dataloader)):
        inputs = inputs.cuda()
        global_step +=1
        loss = prompt_model(inputs)
        loss.backward()
        tot_loss += loss.item()
        torch.nn.utils.clip_grad_norm_(template.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
        if global_step %500 ==0:
            print("Epoch {}, global_step {} average loss: {} lr: {}".format(epoch, global_step, (tot_loss-log_loss)/500, scheduler.get_last_lr()[0]), flush=True)
            log_loss = tot_loss

generated_sentence = evaluate(prompt_model, validation_dataloader)

499it [06:59,  1.19it/s]

Epoch 0, global_step 500 average loss: 1.5598060819145378 lr: 2.5804195804195803e-05


715it [09:59,  1.19it/s]
284it [03:58,  1.19it/s]

Epoch 1, global_step 1000 average loss: 4.750476747403809e-06 lr: 2.160839160839161e-05


715it [09:58,  1.19it/s]
69it [00:57,  1.19it/s]

Epoch 2, global_step 1500 average loss: 1.2531490560832025e-06 lr: 1.7412587412587412e-05


569it [07:56,  1.19it/s]

Epoch 2, global_step 2000 average loss: 0.0001111410981204699 lr: 1.3216783216783218e-05


715it [09:58,  1.20it/s]
354it [04:55,  1.20it/s]

Epoch 3, global_step 2500 average loss: 1.649537864523154e-07 lr: 9.02097902097902e-06


715it [09:57,  1.20it/s]
139it [01:56,  1.20it/s]

Epoch 4, global_step 3000 average loss: 9.813892097554344e-08 lr: 4.8251748251748255e-06


639it [08:54,  1.20it/s]

Epoch 4, global_step 3500 average loss: 1.1652641319415125e-07 lr: 6.293706293706294e-07


715it [09:57,  1.20it/s]
424it [05:53,  1.20it/s]

Epoch 5, global_step 4000 average loss: 1.026688223646488e-07 lr: 0.0


715it [09:55,  1.20it/s]
209it [02:54,  1.20it/s]

Epoch 6, global_step 4500 average loss: 9.489052081335103e-08 lr: 0.0


709it [09:51,  1.20it/s]

Epoch 6, global_step 5000 average loss: 1.0597574510029517e-07 lr: 0.0


715it [09:56,  1.20it/s]
494it [06:51,  1.20it/s]

Epoch 7, global_step 5500 average loss: 9.727467613629414e-08 lr: 0.0


715it [09:54,  1.20it/s]
279it [03:52,  1.20it/s]

Epoch 8, global_step 6000 average loss: 1.2564512326207478e-07 lr: 0.0


715it [09:55,  1.20it/s]
64it [00:53,  1.20it/s]

Epoch 9, global_step 6500 average loss: 9.843688849286992e-08 lr: 0.0


564it [07:49,  1.20it/s]

Epoch 9, global_step 7000 average loss: 1.2248712528162287e-07 lr: 0.0


715it [09:54,  1.20it/s]


TypeError: 'NoneType' object is not iterable

In [21]:
generated_sentence = evaluate(prompt_model, validation_dataloader)
generated_sentence

['',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',


In [30]:
test_example = evaluate(prompt_model, get_dataloader(
    [InputExample(text_a =text, guid=data['index']) for text in ["Snow outside", "What is love", "Give me a song", "<|startoftext|>"]],
    template,
    tokenizer,
    WrapperClass,
    shuffle=False,
    batch_size=batch_size,
))
test_example

tokenizing: 4it [00:00, 896.99it/s]


['ShareShare', 'The', '', '']

In [None]:
with open(base_path + "generated_sentences.txt",'w') as f:
    for i in generated_sentence:
        f.write(i+"\n")