# Reinforcement Learning
The final stage of training further fine-tunes the model using proximal policy optimisation (PPO). During this reinforcement learning phase, a previously trained reward model scores each output, and that score is used as the output's reward.

In [1]:
%load_ext autoreload
%autoreload 2

# Imports libraries necessary for the reinforcement learning process.
import torch, gc
from datasets import Dataset
from torch.optim import Adam
from tqdm import tqdm
from transformers import BertTokenizer, AutoModelForSequenceClassification, RobertaTokenizer, RobertaForSequenceClassification
import pandas as pd
import numpy as np
from random import sample
# from nltk.tokenize.treebank import TreebankWordDetokenizer

from RLHF.trainer import RLTrainer
from RLHF.utils import create_reference_model

In [2]:
# Configures basic functions.
# tqdm.pandas() registers tqdm's progress-bar integration with pandas, making
# the `progress_apply` family of methods available on pandas objects.
tqdm.pandas()

def collator(data):
    """Collate a list of per-sample dicts into a single dict of lists.

    The keys (and their order) are taken from the first sample; every
    sample must provide each of those keys. Values are gathered in sample
    order and left as plain Python lists (no stacking or padding).
    """
    field_names = data[0]
    return {name: [row[name] for row in data] for name in field_names}

# Defines hyperparameters.
# TODO: redo this
# Learning rate passed to the Adam optimizer when it is constructed below.
learning_rate = 2e-5

In [3]:
# Loads the policy model from the local 'Iterations/BERT-final' checkpoint
# (no hub download), builds its reference model and loads the BERT tokenizer.
# NOTE(review): create_reference_model presumably returns a frozen copy of the
# policy for PPO to compare against - confirm in RLHF.utils.
# Alternative starting point (fresh single-output BERT):
# model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=1)
model = AutoModelForSequenceClassification.from_pretrained(
    'Iterations/BERT-final',
    local_files_only=True,
)
ref_model = create_reference_model(model)
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Builds the optimizer over only those parameters that require gradients.
optimizer = Adam(
    (param for param in model.parameters() if param.requires_grad),
    lr=learning_rate,
)
# Optional learning-rate scheduler (currently disabled):
# scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.1)

In [4]:
def _summarise_round(round_rows):
    # Collapses all rows of one game round into a single record: the list of
    # punchlines (raw and cleaned), the round's prompt (taken from its first
    # row) and the per-punchline 'won' flags.
    return pd.Series({
        'jokes': round_rows['white_card_text'].tolist(),
        'clean_jokes': round_rows['clean_white_card_text'].tolist(),
        'black_card_text': round_rows['black_card_text'].iat[0],
        'won': round_rows['won'].tolist(),
    })


# Reads the raw card data and groups it so that each row is one game round.
dataset = pd.read_csv('Data/random_cah_data.csv')
rounds = dataset.groupby('round_id')
grouped_data = rounds.apply(_summarise_round).reset_index()

# Converts the grouped Pandas data to a Dataset (rebinding `dataset`).
dataset = Dataset.from_pandas(grouped_data)

In [None]:
# Tokenizes the dataset for training.
def tokenize(sample):
    """Build and tokenize every prompt/punchline combination of one round.

    `sample` is one grouped round providing 'black_card_text' (the prompt),
    'jokes' and 'clean_jokes' (parallel punchline lists), 'won' and
    'round_id'.

    If the prompt contains a '_____' blank, every blank is replaced by the
    cleaned punchline; otherwise the raw punchline is appended after the
    prompt, separated by a space.

    Returns the tokenizer output (padded/truncated to 335 tokens) with three
    extra entries: 'label' (the 'won' flags), 'round_id' and 'combined' (the
    plaintext combined jokes).
    """
    card_prompt = sample['black_card_text']
    raw_answers = sample['jokes']
    cleaned_answers = sample['clean_jokes']

    # The prompt is the same for every punchline, so decide once which way
    # the jokes are assembled.
    if '_____' in card_prompt:
        joined_jokes = [
            card_prompt.replace("_____", cleaned_answers[position])
            for position, _ in enumerate(raw_answers)
        ]
    else:
        joined_jokes = [f"{card_prompt} {answer}" for answer in raw_answers]

    # Tokenizes the combined jokes to a fixed length.
    encoded = tokenizer(
        joined_jokes,
        max_length=335,
        padding="max_length",
        truncation=True,
    )

    # Carries the labels, the round ID and the plaintext jokes alongside the
    # token data.
    encoded['label'] = sample['won']
    encoded['round_id'] = sample['round_id']
    encoded['combined'] = joined_jokes

    return encoded

# Tokenizes every round one sample at a time, dropping the original columns so
# that only the fields returned by tokenize() remain, then exposes the data as
# torch tensors. No train/test split is performed here.
original_columns = dataset.column_names
dataset = dataset.map(
    tokenize,
    batched=False,
    remove_columns=original_columns,
)
dataset.set_format(type="torch")
print(dataset)

In [6]:
# Builds the reinforcement learning trainer around the policy model, its
# reference model, the tokenizer, the optimizer and the tokenized dataset.
trainer = RLTrainer(
    "RL Bert",
    model,
    ref_model=ref_model,
    tokenizer=tokenizer,
    optimizer=optimizer,
    dataset=dataset,
    data_collator=collator,
)

# Loads the hate-speech classifier (and its tokenizer) whose logits are turned
# into rewards in the training loop.
toxicity_checkpoint = "facebook/roberta-hate-speech-dynabench-r4-target"
toxicity_tokenizer = RobertaTokenizer.from_pretrained(toxicity_checkpoint)
toxicity_model = RobertaForSequenceClassification.from_pretrained(toxicity_checkpoint)
# Alternative: a locally trained reward model (currently disabled).
# reward_model = AutoModelForSequenceClassification.from_pretrained('Models/Model', num_labels=1, local_files_only=True).to(trainer.accelerator.device)

In [None]:
# Trains the model using RL: one PPO step is taken per batch yielded by the
# trainer's dataloader, so `epoch` below is really a batch/step index.
#
# Rewards are placed on the accelerator's device instead of a hard-coded
# "cuda:0", so the loop also runs on hosts without that device.
reward_device = trainer.accelerator.device

# tqdm wraps the dataloader itself (not the enumerate object) so that the
# progress bar can show a total when the dataloader has a length.
for epoch, batch in enumerate(tqdm(trainer.dataloader)):
    query_tensors = batch["input_ids"]
    mask_tensors = batch['attention_mask']
    label_tensors = batch["label"]

    # Per-query containers: the rewards, the raw score tensors and the scores
    # as plain Python lists.
    # NOTE(review): response_tensors is filled but never read in this cell.
    reward_tensors = []
    response_tensors = []
    response_list = []

    # Clears up memory before processing the batch.
    gc.collect()
    torch.cuda.empty_cache()

    # Iterates through every query (game round) in the batch.
    for q_index, query in enumerate(query_tensors):
        response_tensors.append([])
        response_list.append([])

        # Scores each joke of the round with the policy model; no gradients
        # are needed here.
        for j_index, joke in enumerate(query):
            with torch.no_grad():
                response = trainer.generate(joke, mask_tensors[q_index][j_index]).logits

            # Keeps both the score tensor and its list form.
            response_tensors[q_index].append(response[0][0])
            response_list[q_index].append(response.tolist())

        # Index of the joke with the maximum score; ties are broken at random.
        # Assumes one logit per joke, so the flattened index equals the joke
        # index - TODO confirm the model has a single output.
        scores = np.asarray(response_list[q_index])
        max_ind = list(np.flatnonzero(scores == scores.max()))
        max_ind = sample(max_ind, 1)[0]

        # Index of the joke that was chosen by the player (first maximum of
        # the labels).
        # NOTE(review): truth_ind is not used by the toxicity-based reward
        # below.
        labels = np.asarray(label_tensors[q_index].tolist())
        truth_ind = list(np.flatnonzero(labels == labels.max()))[0]

        # Calculates the reward for each joke: the negated first logit of the
        # toxicity classifier on the plaintext joke.
        # NOTE(review): column 0 is presumably the "nothate" logit of this
        # checkpoint, in which case negating it rewards jokes the classifier
        # considers hateful - confirm the sign is intended.
        with torch.no_grad():
            toxicity_inputs = toxicity_tokenizer(
                batch['combined'][q_index],
                padding=True,
                truncation=True,
                return_tensors="pt",
            )
            toxicity_labels = toxicity_model(**toxicity_inputs).logits.float()
            toxicity_labels = (toxicity_labels[:, 0] * -1).tolist()

        reward_tensors.append([])
        for index, _ in enumerate(response_list[q_index]):
            curr_reward = toxicity_labels[index]
            reward_tensors[q_index].append(torch.tensor(curr_reward).to(reward_device))
        print("Reward: ", float(reward_tensors[q_index][max_ind]))

    # Stacks each query's rewards into a single tensor.
    rewards = [torch.stack(output) for output in reward_tensors]

    # A PPO step is completed.
    stats = trainer.step(query_tensors, mask_tensors, rewards)
    # trainer.log_stats(stats, batch, rewards)

    # Saves the model every 5 steps (main process only), overwriting the
    # rolling checkpoint.
    if epoch % 5 == 0 and trainer.accelerator.is_main_process:
        print(f"Saving after {epoch} epochs.")
        trainer.save_pretrained('Iterations/BERT-final/Toxicity/RLHF')

        # Additionally keeps a numbered copy every 25 steps, except at
        # step 0.
        if epoch % 25 == 0 and epoch != 0:
            print(f"Saving extra copy after {epoch} epochs.")
            trainer.save_pretrained(f'Iterations/BERT-final/Toxicity/RLHF{epoch}')
