# Evaluation
The code within this notebook is primarily for evaluating the model's performance in an isolated setting.

Thus, it will support:
1. Testing the model's performance against the test dataset.
2. Testing the performance of other models/methodologies on the test dataset.
3. Evaluating the model on random samples of the test dataset.
4. Testing the performance of the model on individual, randomly generated, game rounds.

The following metrics will be printed for each evaluation:
- The success rate of predictions.
- The most misclassified prompts & punchlines.
- The most accurately classified prompts & punchlines.
- The average reward of predictions, if enabled.

In [1]:
# Imports libraries relevant to the evaluation process.
from datasets import Dataset
from transformers import AutoTokenizer, pipeline, RobertaTokenizer, RobertaForSequenceClassification
from transformers.pipelines.pt_utils import KeyDataset
from random import sample
from tqdm import tqdm
import numpy as np
import torch, gc, random, re, csv
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rc

In [2]:
# Evaluates the trained model off an input dataset.
def eval_model(dataset, model_name='Iterations/BERT-fixed', reward=False):
    """Score every round in `dataset` and aggregate the evaluation metrics.

    Args:
        dataset: Sized iterable of rounds. Each round is indexed by column
            name and must provide 'won', 'black_card_text' and
            'white_card_text', plus 'inputs' (used for model scoring and
            rewards) or 'user_choice' (used by the "popularity" baseline).
        model_name: Path/name of the text-classification model to load, or
            "popularity" to skip the model and give the round's
            'user_choice' index the only non-zero score.
        reward: When True, also accumulate the negated first logit of the
            toxicity classifier for the predicted and for the winning joke.

    Returns:
        A dict with 't1_accuracy', 't2_accuracy', 't3_accuracy', 'rewards'
        and 'truth_rewards' (averaged over the rounds), and 'classified',
        'misclassified' and 'incorrect' (text -> count mappings ordered by
        descending count).
    """
    use_baseline = model_name == "popularity"

    # Defines necessary evaluation variables.
    if not use_baseline:
        tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
        tokenizer_kwargs = {'padding':True, 'truncation':True, 'max_length':512}
        generator = pipeline("text-classification", model=model_name, batch_size = 10, tokenizer=tokenizer, device=0, function_to_apply="none")

    if reward:
        toxicity_tokenizer = RobertaTokenizer.from_pretrained("facebook/roberta-hate-speech-dynabench-r4-target")
        toxicity_model = RobertaForSequenceClassification.from_pretrained("facebook/roberta-hate-speech-dynabench-r4-target")

        # Returns the negated first logit of the toxicity classifier for one joke.
        def toxicity_reward(text):
            encoded = toxicity_tokenizer(text, padding=True, truncation=True, return_tensors="pt")
            # Inference only: no autograd graph is needed for the logits.
            with torch.no_grad():
                logits = toxicity_model(**encoded).logits
            return logits.tolist()[0][0] * -1

    # Counts how often each text occurs, ordered by descending count.
    def count_desc(texts):
        return sort_results(dict(zip(*np.unique(texts, return_counts=True))))

    batch_len = len(dataset)

    # Defines the evaluation metrics.
    t1_accuracy, t2_accuracy, t3_accuracy, rewards, truth_rewards = 0, 0, 0, 0, 0
    classified = {"prompts": [], "punchlines":[]}
    misclassified = {"prompts": [], "punchlines": []}
    incorrect = {"punchlines": []}

    # Iterates through each batch in the data, with each batch representing a round played.
    for batch in tqdm(dataset, desc="Evaluation progress"):
        # Generates a score for each joke in the batch.
        if use_baseline:
            # Sized from the round itself rather than assuming ten candidates.
            scores = [0] * len(batch['won'])
            user_choice = int(batch['user_choice'][0])
            scores[user_choice] = 1
        else:
            scores = [out['score'] for out in generator(KeyDataset(batch, "inputs"), **tokenizer_kwargs)]

        # Selects the index of the maximum score. If there is more than one as the maximum, one is randomly selected.
        max_ind = list(np.flatnonzero(scores == np.max(scores)))
        max_ind = sample(max_ind,1)[0]

        # Creates a list of the three highest scored indices.
        t3_max_ind = np.argsort(np.array(scores))[-3:][::-1]

        # The winning row is the first one holding the round's maximum 'won' value.
        truth_ind = list(np.flatnonzero(batch['won'] == np.max(batch['won'])))[0]
        if max_ind == truth_ind:
            t1_accuracy += 1
            classified['prompts'].append(batch['black_card_text'][max_ind])
            classified['punchlines'].append(batch['white_card_text'][max_ind])
        else:
            misclassified['prompts'].append(batch['black_card_text'][truth_ind])
            misclassified['punchlines'].append(batch['white_card_text'][truth_ind])

            incorrect['punchlines'].append(batch['white_card_text'][max_ind])

        if truth_ind in t3_max_ind[:2]:
            t2_accuracy += 1
        if truth_ind in t3_max_ind:
            t3_accuracy += 1

        if reward:
            rewards += toxicity_reward(batch['inputs'][max_ind])
            truth_rewards += toxicity_reward(batch['inputs'][truth_ind])

        # Resets the pipeline's call count to avoid any warnings.
        if not use_baseline:
            generator.call_count = 0

    # Clears the memory/cache used from the generation process.
    gc.collect()
    torch.cuda.empty_cache()

    # Averages the metrics; an empty dataset yields zeros instead of a ZeroDivisionError.
    rounds = batch_len if batch_len else 1
    t1_accuracy /= rounds
    t2_accuracy /= rounds
    t3_accuracy /= rounds
    rewards /= rounds
    truth_rewards /= rounds

    for tally in (classified, misclassified, incorrect):
        for field in tally:
            tally[field] = count_desc(tally[field])

    results = {"t1_accuracy": t1_accuracy, "t2_accuracy": t2_accuracy, "t3_accuracy": t3_accuracy, "classified": classified, "misclassified": misclassified, "incorrect": incorrect, "rewards": rewards, "truth_rewards": truth_rewards}
    return results

# Orders a {key: count} dictionary from the highest count to the lowest.
def sort_results(dictionary):
    """Return a new dict with the same items, ordered by descending value.

    Items with equal values keep their original relative order.
    """
    def count_of(pair):
        _, count = pair
        return count

    ranked = list(dictionary.items())
    ranked.sort(key=count_of, reverse=True)
    return {key: count for key, count in ranked}

In [27]:
# Prints the metrics held in a results dictionary.
def display_results(results, reward=False):
    """Print the accuracy figures and the leading entries of every tally.

    Args:
        results: Dict with 't1_accuracy', 't2_accuracy', 't3_accuracy',
            the 'classified' / 'misclassified' / 'incorrect' tallies and,
            when `reward` is set, 'rewards' and 'truth_rewards'.
        reward: When True, the two average rewards are printed as well.
    """
    for rank in (1, 2, 3):
        accuracy = results[f"t{rank}_accuracy"]
        print(f"Top-{rank} accuracy: {accuracy * 100}%")

    if reward:
        print(f"Average reward: {results['rewards']}")
        print(f"Truth average reward: {results['truth_rewards']}")

    # (heading, tally, field) for every listing, in the order it is printed.
    sections = (
        ("\nCorrectly predicted prompts:", "classified", "prompts"),
        ("\nCorrectly predicted punchlines:", "classified", "punchlines"),
        ("\nUnpredicted winning prompts:", "misclassified", "prompts"),
        ("\nUnpredicted winning punchlines:", "misclassified", "punchlines"),
        ("\nIncorrectly predicted punchlines:", "incorrect", "punchlines"),
    )

    for heading, tally, field in sections:
        print(heading)
        # At most the first six entries of each tally are listed.
        leading = list(results[tally][field].items())[:6]
        for text, count in leading:
            print(f"{text} - {count}")

# Plots the results of two models (and optionally a reward series) on one graph.
def display_graph(list_1, list_2, title, metric, reward=None):
    """Draw the given (x, y) series as line plots on a shared figure.

    Args:
        list_1: Iterable of (x, y) pairs, plotted as 'Model 1'.
        list_2: Iterable of (x, y) pairs, plotted as 'Model 2'.
        title: Title of the figure.
        metric: Name of the metric; capitalised for both axis labels.
        reward: Optional iterable of (x, y) pairs, plotted as 'Reward'.
    """
    model_series = [zip(*points) for points in (list_1, list_2)]
    model_styles = [('Model 1', 'o'), ('Model 2', 'x')]

    for (xs, ys), (label, marker) in zip(model_series, model_styles):
        plt.plot(xs, ys, label=label, marker=marker)

    if reward is not None:
        reward_xs, reward_ys = zip(*reward)
        plt.plot(reward_xs, reward_ys, label="Reward", marker='*')

    metric_name = metric.capitalize()
    plt.xlabel(metric_name + "s")
    plt.ylabel(metric_name + '% Increase')

    plt.title(title)
    plt.grid(True)
    plt.legend()
    plt.tick_params(axis='x', labelrotation=80)

    plt.show()

def save_results(results, name="cah_results"):
    """Write one CSV row per evaluated model to `<name>.csv`.

    Args:
        results: Iterable of result dicts, each providing 'model_name',
            't1_accuracy', 't2_accuracy', 't3_accuracy' and 'rewards'.
        name: Output path without the '.csv' extension.

    Raises:
        KeyError: If a result dict lacks one of the keys above (raised
            before the output file is opened).
    """
    header = ['name', 't1_accuracy', 't2_accuracy', 't3_accuracy', 'rewards']
    keys = ('model_name', 't1_accuracy', 't2_accuracy', 't3_accuracy', 'rewards')

    # Rows are built up front so a malformed result cannot leave a half-written file.
    rows = [[result[key] for key in keys] for result in results]

    # The context manager closes the file; no explicit close() is needed.
    with open(f"{name}.csv", mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(header)
        writer.writerows(rows)

# Displays a graph for the results of an evaluated set of models.
def display_models(results, results_2=None, legend=None, iterations=25, reward=False, save=False):
    """Plot Top-1/2/3 accuracy (and optionally average reward) per checkpoint.

    Args:
        results: List of result dicts ordered by checkpoint; each provides
            't1_accuracy', 't2_accuracy', 't3_accuracy' and, when `reward`
            is set, 'rewards'.
        results_2: Optional second list of result dicts, drawn alongside
            `results` on the Top-1 and reward graphs.
        legend: Two labels for `results` and `results_2`; defaults to
            ['Model 1', 'Model 2'] when a second series is given.
        iterations: Spacing on the x-axis between consecutive checkpoints.
        reward: When True, the average-reward graph is drawn as well.
        save: When True, each figure is also written to a PNG file in the
            working directory.
    """
    # Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8*' and later dropped the old names.
    style_name = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
    plt.style.use(style_name) # I personally prefer seaborn for the graph style, but you may choose whichever you want.
    params = {"ytick.color" : "black",
              "xtick.color" : "black",
              "axes.labelcolor" : "black",
              "axes.edgecolor" : "black",
              "text.usetex" : True,
              "font.family" : "serif",
              "font.serif" : ["Computer Modern Serif"],
              "font.weight" : "bold",
              "font.size" : 15,
              "legend.fontsize" : 12,
              "axes.labelsize" : 14,
              "axes.titlesize" : 16,
              "xtick.labelsize" : 13,
              "ytick.labelsize" : 13}
    plt.rcParams.update(params)

    if results_2 and legend is None:
        legend = ['Model 1', 'Model 2']

    # Each series gets an x-axis of its own length, so the two may differ in size.
    def x_axis(rows):
        return [num * iterations for num in range(len(rows))]

    # Draws, optionally saves, and shows the graph for a single metric.
    def plot_metric(key, ylabel, title, filename, compare=False):
        if compare and results_2:
            plt.plot(x_axis(results), [row[key] for row in results], color='#70000c', label=legend[0])
            plt.plot(x_axis(results_2), [row[key] for row in results_2], color='#376c67', label=legend[1])
            plt.legend()
        else:
            plt.plot(x_axis(results), [row[key] for row in results], color='#70000c')
        plt.xlabel(r'\textbf{Iterations}')
        plt.ylabel(ylabel)
        plt.title(title)

        if save:
            plt.savefig(filename, bbox_inches="tight")
        plt.show()

    # Plots the Top-1 accuracy.
    plot_metric('t1_accuracy', r'\textbf{Accuracy}', r'\textbf{PPO iterations trained against accuracy}', "t1-acc.png", compare=True)

    # Plots the Top-2 accuracy.
    plot_metric('t2_accuracy', r'\textbf{Top-2 Accuracy}', r'\textbf{PPO iterations trained against Top-2 Accuracy}', "t2-acc.png")

    # Plots the Top-3 accuracy.
    plot_metric('t3_accuracy', r'\textbf{Top-3 Accuracy}', r'\textbf{PPO iterations trained against Top-3 Accuracy}', "t3-acc.png")

    # Plots the average rewards; 'rewards' is only read when the graph is requested.
    if reward:
        plot_metric('rewards', r'\textbf{Average Reward}', r'\textbf{PPO iterations trained against average reward}', "reward.png", compare=True)

In [None]:
import csv

# Reads previously saved evaluation results from .csv files and graphs them.
def load_results(dir, dir_2=None):
    """Load `cah_results.csv` from one or two directories and plot the rows.

    Args:
        dir: Directory holding the first `cah_results.csv`.
        dir_2: Optional directory holding a second `cah_results.csv`; when
            given, both series are plotted with a fixed two-entry legend.
    """
    # Parses one results file, turning every value that reads as a number into a float.
    def read_rows(folder):
        parsed = []
        with open(f'{folder}/cah_results.csv', 'r') as handle:
            for record in csv.DictReader(handle):
                for column, raw in record.items():
                    try:
                        record[column] = float(raw)
                    except ValueError:
                        # Non-numeric cells (e.g. the model name) stay as text.
                        pass
                parsed.append(record)
        return parsed

    results = read_rows(dir)

    if dir_2:
        legend = ['Humicroedit', 'Cards Against Humanity']
        results_2 = read_rows(dir_2)
    else:
        legend = None
        results_2 = None

    display_models(results, results_2, legend, reward=True, save=True)

load_results("Iterations/BERT-final/Toxicity")

---
### Individual Models
This section is primarily for measuring the performance or style of individual models, by themselves.

In [None]:
# Runs the 'popularity' evaluation on some, or all, of the smaller test data.
def eval_data(num=0, seed=None, reward=False):
    """Evaluate the rounds of the smaller test set and print the results.

    Args:
        num: Number of rounds to sample; 0 (or less) keeps every round.
        seed: Random state used when sampling rounds.
        reward: Forwarded to `eval_model` and `display_results`.
    """
    frame = pd.read_csv('Data/smaller_test_cah_data.csv')

    if num > 0:
        sampled_ids = pd.Series(frame['round_id'].unique()).sample(n=num, random_state=seed)
        frame = frame[frame['round_id'].isin(sampled_ids)].reset_index(drop=True)

    # Fills the prompt's blank with the cleaned punchline, or appends the raw punchline when there is no blank.
    def build_joke(row):
        prompt = row['black_card_text']
        if "_____" in prompt:
            return prompt.replace("_____", str(row['clean_white_card_text']))
        return prompt + " " + str(row['white_card_text'])

    frame['inputs'] = frame.apply(build_joke, axis=1)

    # One Dataset per round.
    rounds = [Dataset.from_pandas(group) for _, group in frame.groupby('round_id')]

    results = eval_model(rounds, 'popularity', reward)
    display_results(results, reward)

eval_data(reward=False)

In [None]:
# Evaluates the model on every round of the older processed data that is absent from the training data.
def eval_complete_data(num=0, seed=None, reward=False):
    """Evaluate 'Iterations/BERT-final' on rounds not present in the training set.

    Args:
        num: Number of rounds to sample; 0 (or less) keeps every round.
        seed: Random state used when sampling rounds.
        reward: Forwarded to `eval_model` and `display_results`.
    """
    train_frame = pd.read_csv('Data/train_cah_data.csv')
    frame = pd.read_csv('Data/Old/proc_cah_data.csv')

    # Keeps only the rounds whose id never appears in the training data.
    seen_ids = train_frame['round_id'].unique()
    held_out_ids = frame[~frame["round_id"].isin(seen_ids)]['round_id'].unique()
    frame = frame[frame['round_id'].isin(held_out_ids)].reset_index(drop=True)

    if num > 0:
        sampled_ids = pd.Series(frame['round_id'].unique()).sample(n=num, random_state=seed)
        frame = frame[frame['round_id'].isin(sampled_ids)].reset_index(drop=True)

    # Fills the prompt's blank with the punchline, or appends the punchline when there is no blank.
    def build_joke(row):
        prompt = row['black_card_text']
        if "_____" in prompt:
            return prompt.replace("_____", row['white_card_text'])
        return prompt + " " + row['white_card_text']

    frame['inputs'] = frame.apply(build_joke, axis=1)

    rounds = [Dataset.from_pandas(group) for _, group in frame.groupby('round_id')]
    results = eval_model(rounds, 'Iterations/BERT-final', reward)
    display_results(results, reward)

eval_complete_data(num=10000, reward=False)

In [None]:
# Evaluates the model on the test rows whose domain ratio does not exceed the mean domain ratio.
def eval_metrics(num=0, seed=None, reward=False):
    """Evaluate a checkpoint on the at-or-below-average 'domain_ratio' rows.

    Args:
        num: Number of rounds to sample; 0 (or less) keeps every round.
        seed: Random state used when sampling rounds.
        reward: Forwarded to `eval_model` and `display_results`.
    """
    frame = pd.read_csv('Data/test_cah_data.csv')

    # The threshold is the mean over the whole file, computed before filtering.
    ratio_threshold = frame['domain_ratio'].mean()
    frame = frame[frame['domain_ratio'] <= ratio_threshold]

    if num > 0:
        sampled_ids = pd.Series(frame['round_id'].unique()).sample(n=num, random_state=seed)
        frame = frame[frame['round_id'].isin(sampled_ids)].reset_index(drop=True)

    # Fills the prompt's blank with the cleaned punchline, or appends the raw punchline when there is no blank.
    def build_joke(row):
        prompt = row['black_card_text']
        if "_____" in prompt:
            return prompt.replace("_____", str(row['clean_white_card_text']))
        return prompt + " " + str(row['white_card_text'])

    frame['inputs'] = frame.apply(build_joke, axis=1)

    rounds = [Dataset.from_pandas(group) for _, group in frame.groupby('round_id')]

    results = eval_model(rounds, 'Iterations/BERT-final/Positivity/125', reward)
    display_results(results, reward)

eval_metrics(reward=False)

In [None]:
# Evaluates the model on rounds built from punchlines that do not occur in the training data.
# With win=True a round qualifies when its winning row has an unseen punchline;
# otherwise a round qualifies when exactly ten of its rows have unseen punchlines.
def eval_unseen(win=False):
    """Evaluate 'Iterations/BERT-fixed/' on rounds with unseen punchlines.

    Args:
        win: Selects how rounds qualify (see the comment above). It also
            selects which training column defines "seen":
            'chosen_white_card' when True, 'white_card_text' otherwise.
    """
    train_frame = pd.read_csv('Data/train_cah_data.csv')
    frame = pd.read_csv('Data/proc_cah_data.csv')

    seen_column = "chosen_white_card" if win else "white_card_text"
    seen_punchlines = train_frame[seen_column].unique()
    is_unseen = ~frame["white_card_text"].isin(seen_punchlines)

    if win:
        round_ids = frame[is_unseen & (frame["won"] == 1)]["round_id"].unique()
    else:
        unseen_per_round = frame[is_unseen]["round_id"].value_counts()
        round_ids = unseen_per_round[unseen_per_round == 10].index
    frame = frame[frame["round_id"].isin(round_ids)]

    # Fills the prompt's blank with the punchline, or appends the punchline when there is no blank.
    def build_joke(row):
        prompt = row['black_card_text']
        if "_____" in prompt:
            return prompt.replace("_____", row['white_card_text'])
        return prompt + " " + row['white_card_text']

    frame['inputs'] = frame.apply(build_joke, axis=1)

    rounds = [Dataset.from_pandas(group) for _, group in frame.groupby('round_id')]
    results = eval_model(rounds, 'Iterations/BERT-fixed/')
    display_results(results)

eval_unseen(win=True)

In [None]:
# Scores randomly generated rounds with the model and prints its best and worst pick for each.
def eval_random(round_num):
    """Generate `round_num` random rounds and print the model's choices.

    Each round pairs one random prompt with ten random punchlines drawn
    (with replacement) from the unique values in the processed data.

    Args:
        round_num: Number of rounds to generate.
    """
    # Defines the scoring pipeline.
    tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
    generator = pipeline("text-classification", model="Iterations/BERT-final/Positivity/250", batch_size = 10, tokenizer=tokenizer, device=0, function_to_apply="none")

    # Collects every unique prompt and punchline.
    frame = pd.read_csv('Data/proc_cah_data.csv')
    prompts = frame['black_card_text'].unique()
    punchlines = frame['white_card_text'].unique()

    # Appends the punchline, or drops it into the blank with its first letter lowered and its last character removed.
    def build_joke(prompt, punchline):
        if "_____" not in prompt:
            return prompt + " " + punchline
        return prompt.replace("_____", punchline[0].lower() + punchline[1:-1])

    # Picks one index, at random among ties, whose score equals `target`.
    def pick_index(scores, target):
        candidates = list(np.flatnonzero(scores == target))
        return sample(candidates, 1)[0]

    for _ in range(round_num):
        prompt = random.choice(prompts)
        hand = [random.choice(punchlines) for _ in range(10)]
        jokes = []
        scores = []

        # Builds, prints and scores every joke of the round.
        print("Choices: ")
        for card in hand:
            g_joke = build_joke(prompt, card)
            jokes.append(g_joke)
            print(g_joke)
            scores.append(generator(g_joke)[0]['score'])

        best = pick_index(scores, np.max(scores))
        worst = pick_index(scores, np.min(scores))

        # Prints out the highest and lowest scoring jokes for the round.
        print("\nHighest scored joke:")
        print(jokes[best])
        print("\nLowest scored joke:")
        print(jokes[worst] + "\n")

        print("Scores:",scores)

        # Resets the pipeline's call count to avoid any warnings.
        generator.call_count = 0

eval_random(10)

In [None]:
import shap
import numpy as np
import torch

# Computes SHAP values for a model over the first 1000 jokes of the Microedit test data.
def eval_shap(model_name="BERT-final"):
    """Explain a model's scores with SHAP.

    Args:
        model_name: Directory name of the model under 'Iterations/'.

    Returns:
        The object returned by the SHAP explainer for the first 1000 inputs.
    """
    tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
    generator = pipeline("text-classification", model=f'Iterations/{model_name}', tokenizer=tokenizer, device=0, function_to_apply="none")
    explainer = shap.Explainer(generator)

    # Fills the prompt's blank with the punchline, or appends the punchline when there is no blank.
    def build_joke(row):
        prompt = row['black_card_text']
        if "_____" in prompt:
            return prompt.replace("_____", row['white_card_text'])
        return prompt + " " + row['white_card_text']

    frame = pd.read_csv('Data/Microedit/test_micro_data.csv')
    frame['inputs'] = frame.apply(build_joke, axis=1)
    jokes = Dataset.from_pandas(frame)['inputs']

    return explainer(jokes[:1000], fixed_context=1)

shap_values = eval_shap()

In [None]:
from transformers import AutoModelForSequenceClassification
from bertviz import head_view

# Evaluates a model via BertViz.
def eval_bertviz():
    """Show the BertViz head view for one randomly chosen winning joke.

    Reads the test data, keeps the rows where 'won' equals 1, builds the
    joke text for each, and visualises the attention of
    'Iterations/BERT-final' on a single random row.
    """
    dataset = pd.read_csv('Data/test_cah_data.csv')
    # Copy the filtered rows so the column assignment below does not target a slice of the original frame.
    dataset = dataset[dataset['won'] == 1].copy()
    dataset['inputs'] = dataset.apply(lambda row: row['black_card_text'].replace("_____", row['clean_white_card_text'])
                                           if "_____" in row['black_card_text']
                                           else row['black_card_text'] + " " + row['white_card_text'], axis=1)
    dataset.reset_index(drop=True, inplace=True)

    model = AutoModelForSequenceClassification.from_pretrained('Iterations/BERT-final', local_files_only=True, output_attentions=True)
    tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

    # randrange excludes the upper bound; randint(0, len) could pick an index one past the last row.
    num = random.randrange(len(dataset['inputs']))
    inputs = tokenizer.encode(dataset['inputs'][num], return_tensors='pt')
    outputs = model(inputs)
    # With output_attentions=True the attentions are the last element of the model output.
    attention = outputs[-1]
    tokens = tokenizer.convert_ids_to_tokens(inputs[0])

    head_view(attention, tokens)

eval_bertviz()

In [None]:
import os

import together

# Reads the key from the TOGETHER_API_KEY environment variable so the real key does not have to be
# stored in the notebook; the 'KEY' placeholder remains the fallback when the variable is unset.
together.api_key = os.environ.get('TOGETHER_API_KEY', 'KEY')
together.Models.start('togethercomputer/llama-2-70b')

In [None]:
# Asks an LLM (Llama 70B) to pick the funniest joke of every test round and records its picks.
def eval_LLM(cont=False):
    """Query the LLM once per round and mark its selection in 'llm_pred'.

    The test CSV is rewritten after every round. Rounds that already
    contain a row with 'llm_pred' equal to 1 are skipped.

    Args:
        cont: When False, 'llm_pred' is reset to 0 for every row first.
            When True, the existing 'llm_pred' column of the CSV is reused.
    """
    fsc_prompt = "SELECT THE FUNNIEST JOKE:\n(jokes removed as offensive)\nSELECTION:"
    data_path = "Data/test_cah_data.csv"

    dataset = pd.read_csv(data_path)
    save_dataset = pd.read_csv(data_path)

    # Fills the prompt's blank with the cleaned punchline, or appends the raw punchline when there is no blank.
    def build_joke(row):
        prompt = row['black_card_text']
        if "_____" in prompt:
            return prompt.replace("_____", row['clean_white_card_text'])
        return prompt + " " + row['white_card_text']

    joke_texts = dataset.apply(build_joke, axis=1)

    if not cont:
        save_dataset['llm_pred'] = 0
    save_dataset['inputs'] = joke_texts
    dataset['inputs'] = joke_texts

    rounds = [Dataset.from_pandas(group) for _, group in dataset.groupby('round_id')]

    for batch in tqdm(rounds, desc="Evaluation progress"):
        round_id = batch['round_id'][0]
        in_round = save_dataset['round_id'] == round_id
        if (save_dataset[in_round]["llm_pred"] == 1).any():
            continue

        # Index of the round's winning joke; not used by the steps below.
        truth_ind = list(np.flatnonzero(batch['won'] == np.max(batch['won'])))[0]

        # Few-shot prefix, then the round's jokes one per line, then the selection cue.
        lines = [fsc_prompt + "\n\nSELECT THE FUNNIEST JOKE:"]
        lines.extend(batch['inputs'])
        jokes = "\n".join(lines) + "\nSELECTION:  "

        output = together.Complete.create(
          prompt = jokes,
          model = "togethercomputer/llama-2-70b",
          max_tokens = 512,
          temperature = 0.7,
          top_k = 50,
          top_p = 0.7,
          repetition_penalty = 1,
          stop = ['SELECT THE FUNNIEST JOKE:']
        )

        # Keeps only the text after the last selection cue, without the stop phrase.
        completion = output['prompt'][0] + output['output']['choices'][0]['text']
        answer = completion.split("SELECTION:  ")[-1]
        selection = answer.replace("SELECT THE FUNNIEST JOKE:", "").strip()

        # Marks the row of this round whose joke text equals the selection, if any.
        save_dataset.loc[in_round & (save_dataset['inputs'] == selection), 'llm_pred'] = 1

        save_dataset.to_csv(data_path, index=False)

    save_dataset.drop(columns=['inputs'], inplace=True)
    save_dataset.to_csv(data_path, index=False)

eval_LLM(True)

In [None]:
# Evaluates the performance of models on the Reddit data.
def eval_reddit(seed=None, reward=False):
    """Group shuffled Reddit jokes into rounds of ten and evaluate the model.

    The row with the highest 'normalized_score' in each round is marked as
    the winner. The grouped data is written to 'Data/train_reddit_data.csv'
    before the evaluation runs.

    Args:
        seed: Random state for the shuffle; None gives a different order
            on every run.
        reward: Forwarded to `eval_model` and `display_results`.
    """
    dataset = pd.read_csv('Data/Experimental Data/train_reddit_data.csv')
    # The shuffle honours `seed`, so a run can be reproduced.
    dataset = dataset.sample(frac=1, random_state=seed).reset_index(drop=True)
    dataset['inputs'] = dataset['joke']
    dataset['black_card_text'] = dataset['body']
    dataset['white_card_text'] = dataset['punchline']

    # Assigns ids 1..num_rounds to consecutive runs of ten rows; leftover rows each get an id of their own.
    num_rounds = len(dataset) // 10
    round_ids = np.repeat(range(1, num_rounds + 1), 10)
    round_ids = np.append(round_ids, np.arange(num_rounds * 10, len(dataset)) + 1)
    dataset['round_id'] = round_ids

    # Drops every round that does not contain exactly ten rows.
    round_counts = dataset['round_id'].value_counts()
    valid_rounds = round_counts[round_counts == 10].index
    dataset = dataset[dataset['round_id'].isin(valid_rounds)]

    # Sorts each round by descending score so its first row is the winner.
    dataset = dataset.groupby('round_id').apply(lambda x: x.sort_values('normalized_score', ascending=False)).reset_index(drop=True)
    dataset['won'] = 0.0
    dataset.loc[dataset.groupby('round_id').head(1).index, 'won'] = 1.0
    dataset.to_csv('Data/train_reddit_data.csv', index=False)
    dataset = [Dataset.from_pandas(group) for _, group in dataset.groupby('round_id')]

    results = eval_model(dataset, 'Iterations/BERT-reddit', reward)
    display_results(results, reward)

eval_reddit()

---
### Multiple Models
This section is centred around comparing separate models and their performance or styles.

In [None]:
# Performs an evaluation of the model on all of the test data.
def eval_data(models=None, iterations=0, reward=False, name="cah_results"):
    """Evaluate one or more models on the full test set.

    Args:
        models: List of model paths to evaluate; defaults to
            ['Iterations/BERT-final'].
        iterations: Spacing between checkpoints. When greater than 0 the
            collected results are saved to `<name>.csv` and graphed.
        reward: Forwarded to `eval_model`, `display_results` and
            `display_models`.
        name: Base name of the CSV written by `save_results`.
    """
    if models is None:
        models = ['Iterations/BERT-final']

    dataset = pd.read_csv('Data/test_cah_data.csv')

    dataset['inputs'] = dataset.apply(lambda row: row['black_card_text'].replace("_____", str(row['clean_white_card_text']))
                                       if "_____" in row['black_card_text']
                                       else row['black_card_text'] + " " + str(row['white_card_text']), axis=1)

    dataset = [Dataset.from_pandas(group) for _, group in dataset.groupby('round_id')]

    # Iterates through every specified model, displaying and storing their results as they are evaluated.
    total_results = []
    for model in models:
        results = eval_model(dataset, model, reward)
        display_results(results, reward)
        results['model_name'] = model

        total_results.append(results)

    # Displays the total results on a graph if we are evaluating iterations of a model.
    if iterations > 0:
        save_results(total_results, name)
        # Keyword arguments are required: passed positionally, these would land in `results_2` and `legend`.
        display_models(total_results, iterations=iterations, reward=reward)

# Base model followed by its twenty RLHF checkpoints, saved every 25 iterations (25 .. 500).
models = ['Iterations/BERT-final']
for num in range(20):
    itr = 25 * (num + 1)
    models.append(f'Iterations/BERT-final/RLHF{itr}')

eval_data(models, 25, reward=True)

In [None]:
# Performs an evaluation of three models on randomly generated rounds.
def eval_random(round_num):
    """Compare three checkpoints on `round_num` randomly generated rounds.

    Each round pairs one random prompt with ten random punchlines drawn
    (with replacement) from the unique values in the processed data. For
    every model the highest and lowest scored jokes are printed, each
    followed by the negated first logit of the toxicity classifier.

    Args:
        round_num: Number of rounds to generate.
    """
    # Defines variables necessary for the process.
    tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
    model_paths = [
        "Iterations/BERT-fixed",
        "Iterations/BERT-fixed/RawStyle/Toxicity2/25",
        "Iterations/BERT-fixed/RawStyle/Toxicity2/75",
    ]
    generators = [
        pipeline("text-classification", model=path, batch_size = 10, tokenizer=tokenizer, device=0, function_to_apply="none")
        for path in model_paths
    ]

    toxicity_tokenizer = RobertaTokenizer.from_pretrained("facebook/roberta-hate-speech-dynabench-r4-target")
    toxicity_model = RobertaForSequenceClassification.from_pretrained("facebook/roberta-hate-speech-dynabench-r4-target")

    # Returns the negated first logit of the toxicity classifier for one joke.
    def toxicity_reward(text):
        encoded = toxicity_tokenizer(text, padding=True, truncation=True, return_tensors='pt')
        # Inference only: no autograd graph is needed for the logits.
        with torch.no_grad():
            logits = toxicity_model(**encoded).logits
        return logits.tolist()[0][0] * -1

    # Picks one index, at random among ties, whose score equals `target`.
    def pick_index(scores, target):
        candidates = list(np.flatnonzero(scores == target))
        return sample(candidates, 1)[0]

    # Reads the data and collects every unique prompt and punchline.
    dataset = pd.read_csv('Data/proc_cah_data.csv')
    prompts = dataset['black_card_text'].unique()
    punchlines = dataset['white_card_text'].unique()

    # For the specified number of rounds, samples a prompt and ten punchlines for the models to select a winning joke from within.
    for n in range(round_num):
        prompt = random.choice(prompts)
        punchline = [random.choice(punchlines) for num in range(0,10)]
        jokes = []
        model_scores = [[] for _ in generators]

        # Combines the prompt and punchlines into jokes and generates a score for them.
        print("Choices: ")
        for joke in punchline:
            if prompt.count("_____") == 0:
                g_joke = prompt + " " + joke
            else:
                g_joke = prompt.replace("_____", joke[0].lower() + joke[1:-1])
            jokes.append(g_joke)
            # Lists the choice under the "Choices:" header.
            print(g_joke)

            for scores, generator in zip(model_scores, generators):
                scores.append(generator(g_joke)[0]['score'])
                # Resets every pipeline's call count to avoid any warnings.
                generator.call_count = 0

        # Prints out the highest and lowest scoring jokes of the round for each model.
        for label, scores in enumerate(model_scores, start=1):
            max_ind = pick_index(scores, np.max(scores))
            min_ind = pick_index(scores, np.min(scores))

            print(f"[{label}] Highest scored joke:")
            print(f"{jokes[max_ind]} [{toxicity_reward(jokes[max_ind])}]")
            print(f"\n[{label}] Lowest scored joke:")
            print(f"{jokes[min_ind]} [{toxicity_reward(jokes[min_ind])}]\n")

eval_random(10)

In [None]:
# Strips every character that is not an ASCII letter or digit from a word.
def remove_characters(word):
    """Return `word` reduced to its ASCII letters and digits, in order."""
    kept_runs = (match.group(0) for match in re.finditer(r'[a-zA-Z0-9]+', word))
    return ''.join(kept_runs)

# Keeps only the first entry for each distinct leading element of a list.
def remove_duplicates(word_list):
    """Return the entries of `word_list` whose first element has not appeared earlier.

    Entries are compared by `entry[0]` only; the first occurrence wins and the
    original ordering is preserved.
    """
    encountered = set()
    unique_entries = []

    for entry in word_list:
        key = entry[0]

        # Skip anything whose key was already emitted.
        if key in encountered:
            continue

        encountered.add(key)
        unique_entries.append(entry)

    return unique_entries

# Scores a text with the toxicity model and turns the result into a shifted reward.
def _toxicity_reward(text, toxicity_tokenizer, toxicity_model, scale):
    """Return the negated first toxicity logit for `text`, shifted up by `scale`."""
    encoded = toxicity_tokenizer(text, padding=True, truncation=True, return_tensors="pt")

    # Inference only, so gradient tracking is skipped.
    with torch.no_grad():
        logits = toxicity_model(**encoded).logits.tolist()

    return (logits[0][0] * -1) + scale


# Selects the most extreme entries of both models and aligns them on the same keys.
def _paired_extremes(scores, scores_2, descending, top_n=5):
    """Return two equally ordered (key, value) lists covering both models' extremes.

    The keys are the `top_n` extremes of `scores` followed by the `top_n`
    extremes of `scores_2`; the first list carries values from `scores`, the
    second from `scores_2`. Works with fewer than `top_n` entries (including none).
    """
    extremes = sorted(scores.items(), key=lambda item: item[1], reverse=descending)[:top_n]
    extremes_2 = sorted(scores_2.items(), key=lambda item: item[1], reverse=descending)[:top_n]

    aligned = extremes + [(key, scores[key]) for key, _ in extremes_2]
    aligned_2 = [(key, scores_2[key]) for key, _ in extremes] + extremes_2
    return aligned, aligned_2


# Measures the impacts certain words have on the overall rating of a joke.
def eval_style(num, style="words", reward=False, scale=5, seed=None):
    """Graph which words or punchlines shift two models' joke scores the most.

    Args:
        num: Number of distinct `round_id` values sampled from the dataset.
        style: "word" (alias "words") to measure the score ratio caused by each
            appended word, or "punchline" (alias "punchlines") to measure the
            ratio between a completed joke and its bare prompt.
        reward: If true, also tracks a toxicity-based reward per punchline.
            Rewards are only computed in punchline mode.
        scale: Constant added to every raw score before ratios are taken
            (presumably to keep the scores positive — confirm).
        seed: Random state used when sampling the rounds.

    Raises:
        ValueError: If `style` is not one of the supported values.
    """
    # The default used to be "words", which matched neither branch below and
    # crashed later on empty results; plural spellings are now accepted as aliases.
    style = {"words": "word", "punchlines": "punchline"}.get(style, style)
    if style not in ("word", "punchline"):
        raise ValueError(f"Unsupported style {style!r}; expected 'word' or 'punchline'.")

    # Rewards are only accumulated per punchline, so skip the reward model otherwise.
    track_reward = reward and style == "punchline"

    # Defines variables necessary for the process.
    tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
    generator = pipeline("text-classification", model="Iterations/BERT-fixed/RawStyle/Toxicity2/25", batch_size = 1, tokenizer=tokenizer, device=0, function_to_apply="none")
    generator_2 = pipeline("text-classification", model="Iterations/BERT-fixed/RawStyle/Toxicity2/75", batch_size = 1, tokenizer=tokenizer, device=0, function_to_apply="none")

    # If reward is required, we also define the reward models.
    if track_reward:
        toxicity_tokenizer = RobertaTokenizer.from_pretrained("facebook/roberta-hate-speech-dynabench-r4-target")
        toxicity_model = RobertaForSequenceClassification.from_pretrained("facebook/roberta-hate-speech-dynabench-r4-target")

    # Reads the data and keeps only the rows belonging to the sampled rounds.
    dataset = pd.read_csv('Data/proc_cah_data.csv')

    ids = dataset['round_id'].unique()
    ids = pd.Series(ids).sample(n=num, random_state=seed)
    dataset = dataset[dataset['round_id'].isin(ids)].reset_index(drop=True)

    # Builds the full joke text: fill the blank if the prompt has one, else append.
    dataset['inputs'] = dataset.apply(lambda row: row['black_card_text'].replace("_____", row['white_card_text'])
                                       if "_____" in row['black_card_text']
                                       else row['black_card_text'] + " " + row['white_card_text'], axis=1)

    dataset = [Dataset.from_pandas(group) for _, group in dataset.groupby('round_id')]

    # Accumulated score ratios per key for each model, plus rewards and occurrence counts.
    word_list = {}
    word_list_2 = {}
    reward_list = {}
    word_count = {}
    for batch in tqdm(dataset, desc="Evaluation progress"):
        # If we track punchlines, get the base score from the score of the joke prompt.
        if style == "punchline":
            prompt = batch['black_card_text'][0]
            base_score = generator(prompt)[0]['score'] + scale
            base_score_2 = generator_2(prompt)[0]['score'] + scale

            if track_reward:
                base_reward = _toxicity_reward(prompt, toxicity_tokenizer, toxicity_model, scale)

        for index, joke in enumerate(batch['inputs']):
            # If the desired metric is per word then iterate through every word in the jokes.
            if style == "word":
                # A previous score of 0 marks "no prefix scored yet".
                prev_score = 0
                prev_score_2 = 0
                prefix = ""

                for raw_word in joke.split():
                    # Words are joined with spaces so the models score natural text
                    # (previously they were concatenated without any separator).
                    prefix = f"{prefix} {raw_word}" if prefix else raw_word
                    word = remove_characters(raw_word)

                    new_score = generator(prefix)[0]['score'] + scale
                    new_score_2 = generator_2(prefix)[0]['score'] + scale
                    # NOTE(review): presumably resets the pipelines' call counters to
                    # silence their sequential-usage warning — confirm.
                    generator.call_count = 0
                    generator_2.call_count = 0

                    # The first word has nothing to be compared against.
                    if prev_score == 0:
                        prev_score = new_score
                        prev_score_2 = new_score_2
                        continue

                    word_list[word] = word_list.get(word, 0) + new_score/prev_score
                    word_list_2[word] = word_list_2.get(word, 0) + new_score_2/prev_score_2
                    word_count[word] = word_count.get(word, 0) + 1

                    prev_score = new_score
                    prev_score_2 = new_score_2

            # If the desired metric is per punchline then iterate through every joke.
            else:
                punchline = batch['white_card_text'][index]
                score = generator(joke)[0]['score'] + scale
                score_2 = generator_2(joke)[0]['score'] + scale
                generator.call_count = 0
                generator_2.call_count = 0

                # Keyed by punchline (the old check used the joke text, which never
                # matched and reset the totals on every occurrence).
                word_list[punchline] = word_list.get(punchline, 0) + score/base_score
                word_list_2[punchline] = word_list_2.get(punchline, 0) + score_2/base_score_2
                word_count[punchline] = word_count.get(punchline, 0) + 1

                if track_reward:
                    curr_reward = _toxicity_reward(joke, toxicity_tokenizer, toxicity_model, scale)
                    reward_list[punchline] = reward_list.get(punchline, 0) + (curr_reward/base_reward)/(scale * 2) + 0.5

    # Averages every score shift value by the number of times a word appears.
    for key, count in word_count.items():
        word_list[key] /= count
        word_list_2[key] /= count

        if key in reward_list:
            reward_list[key] /= count

    # Grabs the 5 highest and lowest score changing words for each model, aligned on the same keys.
    d_word_list, d_word_list_2 = _paired_extremes(word_list, word_list_2, descending=False)
    a_word_list, a_word_list_2 = _paired_extremes(word_list, word_list_2, descending=True)

    # Rewards are reported for the same keys as the score lists, when they were tracked.
    d_reward_list, a_reward_list = None, None
    if track_reward:
        d_reward_list = remove_duplicates([(key, reward_list[key]) for key, _ in d_word_list])
        a_reward_list = remove_duplicates([(key, reward_list[key]) for key, _ in a_word_list])

    display_graph(remove_duplicates(a_word_list), remove_duplicates(a_word_list_2), "Top % increasing "  + style + "s", style, a_reward_list)
    display_graph(remove_duplicates(d_word_list), remove_duplicates(d_word_list_2), "Top % decreasing " + style + "s", style, d_reward_list)

# Runs the style evaluation on 100 sampled rounds, measuring per-punchline
# score shifts with reward tracking enabled.
eval_style(100, "punchline", True)