# Fine-tuning
In the initial phase of the training process, we fine-tune a pre-trained model on the processed data. For the purposes of this preliminary analysis, we use the BERT-base uncased model.

Several considerations are made:
1. We train a regression model to score each prompt-punchline combination. This allows the combinations to be ranked by score further down the line.
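2. To achieve this, we must replace each instance of "_____" in the prompt with the chosen punchline for that combination. If the prompt contains no "_____", the punchline is appended to the end of the prompt instead.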
3. We tokenize each combination and use it to fine-tune the regression model.

In [1]:
# Imports libraries relevant to the fine-tuning process.
import evaluate
from datasets import load_dataset
from transformers import AutoTokenizer, TrainingArguments, Trainer, AutoModelForSequenceClassification
import numpy as np

In [None]:
# Loads the data into a dataset, with an 80:20 training/test split.
# The split is seeded so that re-running the notebook (e.g. to resume training
# from a checkpoint) produces the same train/test partition rather than
# reshuffling rows between the two splits.
dataset = load_dataset("csv", data_files="data/proc_cah_data.csv", split="train")
dataset = dataset.train_test_split(test_size=0.2, seed=42)

In [None]:
# Loads the tokenizer and the sequence-classification model from one checkpoint
# name, so the two cannot drift apart.
# A single label (num_labels=1) is requested to obtain a regression model.
MODEL_CHECKPOINT = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_CHECKPOINT, num_labels=1)

In [11]:
# Pre-processes the dataset to match the fine-tuning process.
def preprocess_function(examples):
    """Combine each prompt with its punchline and tokenize the resulting joke.

    Args:
        examples: A batch of dataset columns. Reads ``black_card_text``
            (prompts), ``white_card_text`` (punchlines) and ``won`` (the
            value used as the regression target).

    Returns:
        The tokenizer output for the combined jokes, with an added ``label``
        entry holding the ``won`` values converted to floats.
    """
    blank = "_____"

    # Fills every blank in the prompt with the punchline. If the prompt contains
    # no blank, the punchline is appended to the end of the prompt instead.
    combined = [
        prompt.replace(blank, punchline) if blank in prompt else f"{prompt} {punchline}"
        for prompt, punchline in zip(examples['black_card_text'], examples['white_card_text'])
    ]

    # Tokenizes the jokes formed through these combinations. Every example is
    # padded (or truncated) to exactly 512 tokens.
    tokenized_examples = tokenizer(combined, max_length=512, padding="max_length", truncation=True)

    # The model is used as a single-output regressor, so the targets are stored
    # as floats; integer or boolean targets would not match the float
    # predictions when the regression loss is computed.
    tokenized_examples['label'] = [float(won) for won in examples['won']]
    return tokenized_examples


# Applies the pre-processing to both splits in batches. The raw CSV columns are
# removed afterwards, leaving only the tokenizer outputs and the label.
original_columns = dataset["train"].column_names
tokenized_dataset = dataset.map(
    preprocess_function,
    batched=True,
    num_proc=1,
    remove_columns=original_columns,
)
print(tokenized_dataset)

Map:   0%|          | 0/1957088 [00:00<?, ? examples/s]

Map:   0%|          | 0/489272 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'label'],
        num_rows: 1957088
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'label'],
        num_rows: 489272
    })
})


In [None]:
# Sets the evaluation metric as mse. It is computed with squared=False below,
# i.e. the root of the mean squared error is requested.
metric = evaluate.load("mse")

# Computes the metrics for the evaluation phases.
def compute_metrics(eval_pred):
    """Score the model's regression outputs against the reference labels.

    Args:
        eval_pred: A ``(logits, labels)`` pair as supplied by the trainer
            during evaluation.

    Returns:
        The result of the ``mse`` metric computed with ``squared=False``.
    """
    logits, labels = eval_pred

    # The model has a single output per example (num_labels=1), so the logits
    # themselves are the predicted scores. Taking argmax over that single
    # column would always yield 0 and make the metric meaningless, so the
    # outputs are only flattened to one value per example.
    predictions = np.asarray(logits).reshape(-1)
    references = np.asarray(labels).reshape(-1)
    return metric.compute(predictions=predictions, references=references, squared=False)

# Configures the run: checkpoints are written under the output directory,
# evaluation is scheduled per epoch, and the number of saved checkpoints is capped.
training_args = TrainingArguments(
    output_dir="Iterations/BERT/AT-3",
    evaluation_strategy="epoch",
    save_total_limit=2,
)

# Wires the model, the tokenized splits and the metric function into the trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
    compute_metrics=compute_metrics,
)

# Starts a fresh training run and saves the resulting model.
# To continue an interrupted run instead, call
# trainer.train(resume_from_checkpoint=True) in place of trainer.train().
trainer.train()
trainer.save_model()