# Evaluation
The code within this notebook is primarily for evaluating the model's performance in an isolated setting.

Thus, it will support:
1. Testing the model's performance against the test dataset.
2. Testing the performance of other models/methodologies on the test dataset.
3. Evaluating the model on random samples of the test dataset.
4. Testing the performance of the model on individual, randomly generated, game rounds.

The following metrics will be printed for each evaluation:
- The success rate of predictions (accuracy): the fraction of rounds in which the model's highest-scoring joke was a winning joke.
- The most misclassified prompts & punchlines.
- The most accurately classified prompts & punchlines.
(For now, these track the prompts and punchlines of correctly predicted jokes, and of winning jokes that the model failed to pick.)

In [32]:
# Imports libraries relevant to the evaluation process.
from datasets import Dataset
from transformers import AutoTokenizer, pipeline
from transformers.pipelines.pt_utils import KeyDataset
from random import sample
from tqdm import tqdm
import numpy as np
import torch, gc, random, tabulate
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Evaluates the trained model off an input dataset.
def eval_model(dataset):
    """Score every round in ``dataset`` and summarise how often the model picks a winner.

    Args:
        dataset: A sized collection of rounds (``len()`` is taken and it is
            iterated once). Each round is wrapped in ``KeyDataset(round, "inputs")``
            and must expose 'won', 'black_card_text' and 'white_card_text'
            columns, indexable by position.

    Returns:
        A dict with three keys:
        - 'accuracy': fraction of rounds whose top-scoring joke has ``won > 0``
          (0.0 when ``dataset`` is empty).
        - 'classified': {'prompts': {...}, 'punchlines': {...}} mapping each card
          text of a correctly picked joke to its count, in descending count order.
        - 'misclassified': the same layout, counting the cards of the winning joke
          in every round the model got wrong.
    """
    batch_len = len(dataset)

    # Defines the evaluation metrics.
    correct = 0
    classified = {"prompts": [], "punchlines": []}
    misclassified = {"prompts": [], "punchlines": []}

    # Nothing to score: return empty metrics instead of loading the model and
    # dividing by zero further down.
    if batch_len == 0:
        return {
            "accuracy": 0.0,
            "classified": {"prompts": {}, "punchlines": {}},
            "misclassified": {"prompts": {}, "punchlines": {}},
        }

    # Defines necessary evaluation variables. Falls back to the CPU (-1) when no GPU is visible.
    tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
    device = 0 if torch.cuda.is_available() else -1
    generator = pipeline("text-classification", model="Iterations/BERT/AT-3", batch_size=10,
                         tokenizer=tokenizer, device=device)

    # Iterates through each batch in the data, with each batch representing a round played.
    for batch in tqdm(dataset, desc="Evaluation progress"):
        # Generates a score for each joke in the batch.
        # NOTE(review): 'score' is the pipeline's score for its top label; this assumes a
        # higher score always means "more likely to win" (e.g. a single-logit model) - confirm.
        scores = np.asarray([out['score'] for out in generator(KeyDataset(batch, "inputs"))])

        # Selects the index of the maximum score. If there is more than one as the maximum, one is randomly selected.
        max_ind = int(sample(list(np.flatnonzero(scores == scores.max())), 1)[0])

        # 'won' may be stored as a float, so a positive check is used rather than equality with 1.
        won = np.asarray(batch['won'])
        if won[max_ind] > 0:
            correct += 1
            classified['prompts'].append(batch['black_card_text'][max_ind])
            classified['punchlines'].append(batch['white_card_text'][max_ind])
        else:
            # Records the first joke holding the highest 'won' value as the missed winner.
            truth_ind = int(np.argmax(won))
            misclassified['prompts'].append(batch['black_card_text'][truth_ind])
            misclassified['punchlines'].append(batch['white_card_text'][truth_ind])

        # Resets the pipeline's call count to avoid any warnings.
        generator.call_count = 0

    # Clears the memory/cache used from the generation process. The pipeline reference is
    # dropped first, otherwise the model it holds cannot be reclaimed by these calls.
    del generator
    gc.collect()
    torch.cuda.empty_cache()

    # Collapses each list of card texts into a {text: count} dict sorted by descending count.
    for outcome in (classified, misclassified):
        for field in ("prompts", "punchlines"):
            outcome[field] = sort_results(dict(zip(*np.unique(outcome[field], return_counts=True))))

    # Stores the evaluation metrics and returns them.
    return {"accuracy": correct / batch_len, "classified": classified, "misclassified": misclassified}

# Orders a dictionary of counts from the largest count to the smallest.
def sort_results(dictionary):
    """Return a new dict with the items of ``dictionary`` ordered by value, largest first.

    Items with equal values keep the relative order they had in ``dictionary``.
    """
    ranked = list(dictionary.items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return dict(ranked)

In [45]:
# Displays the results.
def display_results(results, top_n=6):
    """Print the accuracy followed by the leading entries of each tally.

    Args:
        results: Dict with an 'accuracy' number and 'classified' / 'misclassified'
            dicts, each holding 'prompts' and 'punchlines' mappings of text -> count.
            The mappings are printed in their existing iteration order.
        top_n: Maximum number of entries printed per section. Defaults to 6, which is
            what the previous hard-coded loops printed. ``None`` prints every entry.
    """
    print(f"Accuracy: {results['accuracy'] * 100}%")

    sections = (
        ("\nCorrectly predicted prompts:", "classified", "prompts"),
        ("\nCorrectly predicted punchlines:", "classified", "punchlines"),
        ("\nIncorrectly predicted prompts:", "misclassified", "prompts"),
        ("\nIncorrectly predicted punchlines:", "misclassified", "punchlines"),
    )
    for heading, outcome, field in sections:
        print(heading)
        _print_top(results[outcome][field], top_n)


# Prints at most `limit` "text - count" lines from a mapping, in iteration order.
def _print_top(counts, limit):
    for shown, (key, value) in enumerate(counts.items()):
        if limit is not None and shown >= limit:
            break
        print(f"{key} - {value}")

In [None]:
# Runs the model evaluation over every round stored in the processed data file.
def eval_data():
    """Load 'Data/proc_cah_data.csv', build the joke text for each row, and evaluate by round.

    Each row's joke is the prompt with every "_____" replaced by the punchline, or the
    prompt followed by a space and the punchline when the prompt has no blank. Rows are
    then grouped by 'round_id', evaluated with ``eval_model`` and shown with
    ``display_results``.
    """
    blank = "_____"

    def build_input(row):
        prompt = row['black_card_text']
        punchline = str(row['white_card_text'])
        if blank in prompt:
            return prompt.replace(blank, punchline)
        return prompt + " " + punchline

    frame = pd.read_csv('Data/proc_cah_data.csv')
    frame['inputs'] = frame.apply(build_input, axis=1)

    # One Dataset per round, in the order groupby yields the round ids.
    rounds = [Dataset.from_pandas(members) for _, members in frame.groupby('round_id')]
    display_results(eval_model(rounds))

# Runs the full-dataset evaluation and prints its results when this cell is executed.
eval_data()

In [None]:
# Performs an evaluation of the model on a specified number of games sampled from the test data.
def eval_samples(num, seed=None):
    """Evaluate the model on ``num`` rounds drawn at random from 'Data/proc_cah_data.csv'.

    Args:
        num: Number of distinct rounds ('round_id' values) to sample without replacement.
            If it exceeds the number of rounds in the file, every round is used.
        seed: Passed to pandas as ``random_state``; ``None`` gives a different sample
            on each run.
    """
    dataset = pd.read_csv('Data/proc_cah_data.csv')

    # Samples whole rounds (not rows) so every selected round keeps all of its jokes.
    ids = pd.Series(dataset['round_id'].unique())
    # Clamps the request so asking for more rounds than exist does not raise.
    ids = ids.sample(n=min(num, len(ids)), random_state=seed)
    dataset = dataset[dataset['round_id'].isin(ids)].reset_index(drop=True)

    # Builds the joke text: fills the blank(s) if the prompt has any, otherwise appends
    # the punchline. The punchline is cast with str() to match eval_data, so non-string
    # values (e.g. NaN) do not raise a TypeError.
    blank = "_____"
    dataset['inputs'] = [
        prompt.replace(blank, str(punchline)) if blank in prompt
        else prompt + " " + str(punchline)
        for prompt, punchline in zip(dataset['black_card_text'], dataset['white_card_text'])
    ]

    dataset = [Dataset.from_pandas(group) for _, group in dataset.groupby('round_id')]
    results = eval_model(dataset)
    display_results(results)

# Evaluates the model on 10,000 sampled rounds; no seed is passed, so the sample is not fixed.
eval_samples(10000)

In [None]:
# Performs an evaluation of the model on randomly generated rounds.
def eval_random(round_num, hand_size=10):
    """Generate random rounds and print the model's highest and lowest scoring joke for each.

    For every round one prompt and ``hand_size`` punchlines are drawn (with replacement)
    from the unique card texts in 'Data/proc_cah_data.csv', combined into jokes, printed,
    and scored by the model.

    Args:
        round_num: Number of rounds to generate.
        hand_size: Number of punchlines drawn per round. Defaults to 10.

    Raises:
        ValueError: If ``hand_size`` is less than 1.
    """
    if hand_size < 1:
        raise ValueError("hand_size must be at least 1")

    # Defines variables necessary for the process. Falls back to the CPU (-1) when no GPU is visible.
    tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
    device = 0 if torch.cuda.is_available() else -1
    generator = pipeline("text-classification", model="Iterations/BERT/AT-3", batch_size=10,
                         tokenizer=tokenizer, device=device)

    # Reads the data and collects every unique prompt and punchline. Missing values are
    # dropped and the rest cast to str so the string handling below cannot hit a NaN.
    dataset = pd.read_csv('Data/proc_cah_data.csv')
    prompts = list(dataset['black_card_text'].dropna().astype(str).unique())
    punchlines = list(dataset['white_card_text'].dropna().astype(str).unique())

    # For the specified number of rounds, samples a prompt and punchlines for the model to select a winning joke from within.
    for _ in range(round_num):
        prompt = random.choice(prompts)
        hand = [random.choice(punchlines) for _ in range(hand_size)]

        # Combines the prompt and punchlines into jokes.
        jokes = [_fill_prompt(prompt, card) for card in hand]
        print("Choices: ")
        for joke in jokes:
            print(joke)

        # Scores the whole hand in one pipeline call so the configured batch size is used.
        scores = np.asarray([out['score'] for out in generator(jokes)])

        # Finds the index of the maximum and minimum scores, breaking ties at random.
        max_ind = sample(list(np.flatnonzero(scores == scores.max())), 1)[0]
        min_ind = sample(list(np.flatnonzero(scores == scores.min())), 1)[0]

        # Prints out the highest and lowest scoring jokes for the round.
        print("\nHighest scored joke:")
        print(jokes[max_ind])
        print("\nLowest scored joke:")
        print(jokes[min_ind] + "\n")

        # Resets the pipeline's call count to avoid any warnings.
        generator.call_count = 0


# Builds a joke: appends the punchline when the prompt has no blank, otherwise fills every
# blank with the punchline, lower-casing its first letter and dropping one trailing
# sentence-ending mark so it reads naturally mid-sentence.
def _fill_prompt(prompt, punchline):
    if "_____" not in prompt:
        return prompt + " " + punchline
    # Only strips the last character when it really is punctuation; unpunctuated
    # punchlines previously lost their final letter.
    if len(punchline) > 1 and punchline[-1] in ".!?":
        punchline = punchline[:-1]
    return prompt.replace("_____", punchline[:1].lower() + punchline[1:])

# Generates 10 random rounds and prints the model's highest and lowest scoring joke for each.
eval_random(10)

In [10]:
# Performs an evaluation of other models on the test data.