In [1]:
import torch
import pandas as pd
from transformers import pipeline, AutoTokenizer
from datasets import load_dataset

from trl import AutoModelForCausalLMWithValueHead
from trl.core import LengthSampler

In [2]:
# Run on the GPU when torch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [3]:
# Show which device string was selected.
print(device)

cpu


In [23]:
# Checkpoint names used throughout the notebook.
model_name = "lvwerra/gpt2-imdb-pos-v2"   # loaded as `model`
ref_model_name = "lvwerra/gpt2-imdb"      # loaded as `ref`; also supplies the tokenizer
reward_model = "lvwerra/distilbert-imdb"  # name only; the callable pipeline is `reward_pipe`

# Number of copies of each query generated in one batch for best-of-n sampling.
N_BEST_OF = 4

In [None]:
# Reference model: produces the baseline and the best-of-n candidates.
ref = AutoModelForCausalLMWithValueHead.from_pretrained(ref_model_name)
# Model whose generations are compared against the reference.
model = AutoModelForCausalLMWithValueHead.from_pretrained(model_name)
# The reward is the classification output of this sentiment pipeline.
reward_pipe = pipeline("sentiment-analysis", model=reward_model, device=device)

In [25]:
# The tokenizer is loaded from the reference checkpoint and used for every
# encode/decode call in this notebook.
tokenizer = AutoTokenizer.from_pretrained(ref_model_name)
# Reuse EOS as the padding token (gen_kwargs pads with the EOS id as well).
tokenizer.pad_token = tokenizer.eos_token

In [27]:
# Sampling settings forwarded to every `generate` call.
# NOTE(review): top_k=0.0 / top_p=1.0 presumably switch off top-k and nucleus
# filtering so tokens are sampled from the full distribution -- confirm.
gen_kwargs = dict(
    min_length=-1,
    top_k=0.0,
    top_p=1.0,
    do_sample=True,
    pad_token_id=tokenizer.eos_token_id,
)
# Settings forwarded to the reward pipeline on every call.
# NOTE(review): function_to_apply="none" presumably returns raw scores
# without softmax/sigmoid -- confirm against the pipeline documentation.
sent_kwargs = dict(
    top_k=None,
    function_to_apply="none",
    batch_size=16,
)

In [None]:
# Sanity check: score a clearly positive review.
# `reward_model` is only the checkpoint name (a str) and cannot be called;
# the classifier itself is `reward_pipe`.
text = "this movie was really good!!"
reward_pipe(text, **sent_kwargs)

In [None]:
# Sanity check: score a clearly negative review.
# `reward_model` is only the checkpoint name (a str) and cannot be called;
# the classifier itself is `reward_pipe`.
text = "this movie was really bad!!"
reward_pipe(text, **sent_kwargs)

In [None]:
# Move both language models to the device selected at the top of the notebook.
# `.cuda()` fails on a machine without a GPU, whereas `.to(device)` follows
# the same cuda/cpu choice that the reward pipeline and the inputs use.
model.to(device)
ref.to(device)

In [19]:
# Quick look at LengthSampler: calling the sampler object returns an integer
# length (6 in the recorded output for the bounds 6 and 8).
sizer = LengthSampler(6, 8)
sizer()

6

In [20]:
def build_dataset(tokenizer,
                  dataset_name="imdb",
                  input_min_text_length=2,
                  input_max_text_length=8):
    """Turn a review dataset into short query prompts.

    Loads the ``train`` split of ``dataset_name``, renames its ``text`` column
    to ``review``, keeps only reviews longer than 200 characters and adds two
    columns to every remaining sample:

    * ``input_ids`` -- the leading token ids of the review, cut at a length
      drawn per sample from ``LengthSampler(input_min_text_length,
      input_max_text_length)``;
    * ``query`` -- those token ids decoded back to text.

    Args:
        tokenizer: object providing ``encode`` and ``decode``.
        dataset_name: name passed to ``load_dataset``.
        input_min_text_length: first argument of the prompt ``LengthSampler``.
        input_max_text_length: second argument of the prompt ``LengthSampler``.

    Returns:
        The processed dataset with its format set to ``"torch"``.
    """
    def is_long_enough(example):
        # Keep reviews with more than 200 characters; shorter ones are dropped.
        return len(example["review"]) > 200

    def add_query(example):
        # The whole review is tokenized and only the leading tokens are kept;
        # the cut-off is re-drawn for every sample.
        token_ids = tokenizer.encode(example["review"])
        example["input_ids"] = token_ids[: draw_prompt_len()]
        example["query"] = tokenizer.decode(example["input_ids"])
        return example

    reviews = load_dataset(dataset_name, split="train")
    reviews = reviews.rename_columns({"text": "review"})
    reviews = reviews.filter(is_long_enough, batched=False)

    draw_prompt_len = LengthSampler(input_min_text_length, input_max_text_length)
    reviews = reviews.map(add_query, batched=False)
    reviews.set_format(type="torch")
    return reviews

In [26]:
# Build the prompt dataset with build_dataset's default arguments.
updated_imdb = build_dataset(tokenizer)

Filter:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/24895 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1168 > 1024). Running this sequence through the model will result in indexing errors


In [30]:
# Peek at the first five truncated prompts (token ids of varying length).
updated_imdb['input_ids'][:5]

[tensor([   40, 26399,   314]),
 tensor([    1,    40,  1703, 44269]),
 tensor([1532,  691,  284, 3368, 1642]),
 tensor([1212, 2646,  373, 2192, 7867,  416, 1793]),
 tensor([5812,   11, 3956,  986, 8499, 4854])]

In [31]:
# Bounds handed to the sampler that picks the number of new tokens per query.
output_min_length = 4
output_max_length = 16

output_length_sampler = LengthSampler(output_min_length, output_max_length)

#### get a batch from the dataset
bs = 16
output_data = dict()

# Read the dataset through a pandas-formatted *view*. `set_format("pandas")`
# would change `updated_imdb` itself and leave it in pandas format for every
# later cell, including the PPO trainer, which iterates over this dataset.
df_batch = updated_imdb.with_format("pandas")[:].sample(bs)

# only a single batch is sampled
output_data["query"] = df_batch["query"].tolist()

query_tensors = df_batch["input_ids"].tolist()

In [34]:
# Sanity check: one entry per sampled query (should equal bs).
len(query_tensors)

16

In [35]:
# Decoded generations, one string per query, from the reference model ...
response_tensors_ref = []
# ... and from the tuned model.
response_tensors = []
# Best-of-n: one list of decoded candidates per query.
response_tensors_best_of = []

In [36]:
# Take the first sampled query as a tensor to try out the batching below.
quer1 = torch.tensor(query_tensors[0])

In [37]:
# Number of copies of each query used for best-of-n sampling.
N_BEST_OF

4

In [38]:
# Preview of the best-of-n batching: stack N_BEST_OF copies of the query as
# rows. Uses the constant instead of a literal 4 so the preview stays in step
# with the generation loop when N_BEST_OF changes.
quer1.repeat((N_BEST_OF, 1))

tensor([[23318,   428],
        [23318,   428],
        [23318,   428],
        [23318,   428]], dtype=torch.int32)

In [None]:
# Start from empty result lists so that re-running this cell replaces the
# previous generations instead of appending a second batch to them.
response_tensors_ref, response_tensors = [], []
response_tensors_best_of = []

for i in range(bs):
    gen_len = output_length_sampler()
    # The sampled queries are plain sequences; turn this one into a tensor and
    # build the (1, seq_len) batch once for both single-sample generations.
    query = torch.tensor(query_tensors[i])
    prompt = query.unsqueeze(dim=0).to(device)

    # Baseline: one continuation from the reference model.
    output = ref.generate(prompt, max_new_tokens=gen_len, **gen_kwargs)[0]
    response_tensors_ref.append(tokenizer.decode(output))

    # One continuation from the tuned model.
    output = model.generate(prompt, max_new_tokens=gen_len, **gen_kwargs)[0]
    response_tensors.append(tokenizer.decode(output))

    # Best-of-n: the reference model is sampled again, this time on N_BEST_OF
    # copies of the same query, so that the scoring step can keep the
    # candidate with the highest reward.
    queries = query.repeat((N_BEST_OF, 1)).to(device)
    outputs = ref.generate(queries, max_new_tokens=gen_len, **gen_kwargs)
    # Keep the batch dimension: squeezing would collapse a batch of one
    # (N_BEST_OF == 1) and batch_decode would then decode token by token.
    response_tensors_best_of.append(tokenizer.batch_decode(outputs))

In [None]:
# Label of the class whose score is used as the reward.
# NOTE(review): assumes the reward checkpoint names its classes
# NEGATIVE / POSITIVE -- confirm via reward_pipe.model.config.id2label.
_POSITIVE_LABEL = "POSITIVE"


def _positive_scores(texts):
    """Return the POSITIVE-class score of the reward pipeline for each text.

    The score is looked up by label rather than by position: with
    ``top_k=None`` the pipeline orders the classes of each result by score,
    so index 0 is whichever class scored highest, not the positive class.

    Args:
        texts: list of strings to score.

    Returns:
        A list with one float per input text.

    Raises:
        KeyError: if a result contains no entry for the positive label.
    """
    scores = []
    for class_entries in reward_pipe(texts, **sent_kwargs):
        score_by_label = {entry["label"]: entry["score"] for entry in class_entries}
        if _POSITIVE_LABEL not in score_by_label:
            raise KeyError(
                f"label {_POSITIVE_LABEL!r} not found in pipeline output: "
                f"{sorted(score_by_label)}"
            )
        scores.append(score_by_label[_POSITIVE_LABEL])
    return scores


scores_ref = _positive_scores(response_tensors_ref)

scores = _positive_scores(response_tensors)

# One tensor of candidate scores per query (same order as the candidates).
scores_best_of = [
    torch.tensor(_positive_scores(candidates))
    for candidates in response_tensors_best_of
]
     

In [None]:
# For every query keep the best-of-n candidate with the highest score,
# together with that score.
best_responses, best_scores = [], []
for candidates, candidate_scores in zip(response_tensors_best_of, scores_best_of):
    best_responses.append(candidates[candidate_scores.argmax().item()])
    best_scores.append(candidate_scores.max().item())

# Columns are added in display order.
output_data.update({
    "response (ref)": response_tensors_ref,
    "scores (ref)": scores_ref,
    "response (RLHF)": response_tensors,
    "scores (RLHF)": scores,
    "response (best_of)": best_responses,
    "scores (best_of)": best_scores,
})

# store results in a dataframe
df_results = pd.DataFrame(output_data)
df_results

In this notebook we fine-tune GPT2 (small) to generate positive movie reviews based on the IMDB dataset. The model gets the start of a real review and is tasked to produce positive continuations. 

To reward positive continuations, we use a BERT classifier to analyse the sentiment of the produced sentences and use the classifier's outputs as reward signals for PPO training.

In [40]:
from trl import PPOTrainer, PPOConfig

ppo_config = PPOConfig(
    model_name=ref_model_name,
    learning_rate=1.4e-5,
    # No experiment tracker. This must be the value None: the string 'none'
    # is not a tracker name and is rejected when the trainer sets up logging.
    log_with=None,
)
# NOTE: this replaces the `sent_kwargs` defined earlier in the notebook.
# With return_all_scores every pipeline result lists an entry per class,
# and the cells below read the entry at index 1.
sent_kwargs = {
    "return_all_scores": True,
    "function_to_apply": "none",
    "batch_size": 16
}

In [41]:
def collator(data):
    """Regroup a list of per-sample dicts into one dict of per-key lists.

    The keys are taken from the first sample; every sample must provide them.
    """
    batch = {}
    for key in data[0]:
        batch[key] = [sample[key] for sample in data]
    return batch

In [42]:
# The batch-sampling cell may have switched `updated_imdb` to pandas format,
# so the trainer is given a torch-formatted view of it (the format
# build_dataset originally set).
ppo_trainer = PPOTrainer(ppo_config,
                         model,
                         ref,
                         tokenizer,
                         updated_imdb.with_format("torch"),
                         data_collator=collator)

NameError: name 'model' is not defined

Training loop

The training loop consists of the following main steps:

    - Get the query responses from the policy network (GPT-2)

    - Get sentiments for query/responses from BERT

    - Optimize policy with PPO using the (query, response, reward) triplet

In [None]:
from tqdm import tqdm

output_min_length = 4
output_max_length = 16
output_length_sampler = LengthSampler(output_min_length, output_max_length)

# tqdm wraps the dataloader itself (not the enumerate object) so the progress
# bar knows the number of batches. Each iteration is one batch / one PPO step.
for step, batch in enumerate(tqdm(ppo_trainer.dataloader)):
    query_tensors = batch["input_ids"]

    #### Get response from gpt2
    response_tensors = []
    for query in query_tensors:
        gen_len = output_length_sampler()
        # Use a per-call copy of the generation settings. Writing
        # max_new_tokens into the shared `gen_kwargs` would break every cell
        # that passes `max_new_tokens=...` next to `**gen_kwargs`
        # (duplicate keyword argument).
        step_gen_kwargs = {**gen_kwargs, "max_new_tokens": gen_len}
        response = ppo_trainer.generate(query, **step_gen_kwargs)
        # keep only the last gen_len tokens, i.e. drop the query prefix
        response_tensors.append(response.squeeze()[-gen_len:])
    batch["response"] = [tokenizer.decode(r.squeeze()) for r in response_tensors]

    #### Compute sentiment score
    texts = [q + r for q, r in zip(batch["query"], batch["response"])]
    pipe_outputs = reward_pipe(texts, **sent_kwargs)
    # Entry 1 of each result is used as the reward.
    # NOTE(review): presumably the positive class -- this relies on the
    # return_all_scores form of sent_kwargs; confirm the class order.
    rewards = [torch.tensor(output[1]["score"]) for output in pipe_outputs]

    #### Run PPO step
    stats = ppo_trainer.step(query_tensors, response_tensors, rewards)
    ppo_trainer.log_stats(stats, batch, rewards)

In [None]:
#### get a batch from the dataset
bs = 16
game_data = dict()
# Read through a pandas-formatted view so `updated_imdb` itself keeps its
# format (set_format would change the dataset in place).
df_batch = updated_imdb.with_format("pandas")[:].sample(bs)
game_data["query"] = df_batch["query"].tolist()
query_tensors = df_batch["input_ids"].tolist()

response_tensors_ref, response_tensors = [], []

#### get response from gpt2 and gpt2_ref
for i in range(bs):
    gen_len = output_length_sampler()
    # Merge the length into a copy of the settings instead of passing
    # `max_new_tokens=...` next to `**gen_kwargs`: the training loop stores
    # max_new_tokens inside gen_kwargs, which would make the call fail with a
    # duplicate keyword argument.
    sample_kwargs = {**gen_kwargs, "max_new_tokens": gen_len}
    # (1, seq_len) batch shared by both models
    prompt = torch.tensor(query_tensors[i]).unsqueeze(dim=0).to(device)

    # keep only the last gen_len tokens, i.e. drop the query prefix
    output = ref.generate(prompt, **sample_kwargs).squeeze()[-gen_len:]
    response_tensors_ref.append(output)

    output = model.generate(prompt, **sample_kwargs).squeeze()[-gen_len:]
    response_tensors.append(output)

#### decode responses
game_data["response (before)"] = [tokenizer.decode(response_tensors_ref[i]) for i in range(bs)]
game_data["response (after)"] = [tokenizer.decode(response_tensors[i]) for i in range(bs)]

#### sentiment analysis of query/response pairs before/after
# Entry 1 of each pipeline result is used as the reward
# (NOTE(review): presumably the positive class -- confirm the class order).
texts = [q + r for q, r in zip(game_data["query"], game_data["response (before)"])]
game_data["rewards (before)"] = [output[1]["score"] for output in reward_pipe(texts, **sent_kwargs)]

texts = [q + r for q, r in zip(game_data["query"], game_data["response (after)"])]
game_data["rewards (after)"] = [output[1]["score"] for output in reward_pipe(texts, **sent_kwargs)]

# store results in a dataframe
df_results = pd.DataFrame(game_data)
df_results

In [None]:
# Compare the reward columns before and after: mean first, then median.
reward_columns = ["rewards (before)", "rewards (after)"]
reward_table = df_results[reward_columns]

print("mean:")
display(reward_table.mean())
print()
print("median:")
display(reward_table.median())