In [None]:
# !pip uninstall torch -y

In [None]:
# !pip install torch==2.2.1 -q

In [None]:
!pip install datasets -q

In [None]:
import pandas as pd
import numpy as np
from datasets import load_dataset
import json
import ast

In [None]:
# Load the DailyDialog corpus through the Hugging Face `datasets` library; its "train",
# "validation" and "test" splits are read in the next cell.
# NOTE: the name `dataset` is rebound to a TensorDataset in the train tokenization cell
# further down, so re-run this cell before re-running the split extraction.
dataset = load_dataset("daily_dialog")

In [None]:
train = pd.DataFrame(dataset["train"]) # Training split, used to fine-tune the model
validation = pd.DataFrame(dataset["validation"]) # Held-out validation split
test = pd.DataFrame(dataset["test"]) # Held-out test split

In [None]:
# Display the training DataFrame as the cell output.
train

In [None]:
def format_GPT2_last_utterance(row):
    """Render a dialog as one prompt whose final emotion slot is left blank.

    Speakers alternate, starting with ``<Speaker_1>``. Every utterance except
    the last is written together with its emotion and closed with
    ``<|endofutterance|>``; the last utterance ends at ``<Emotion>: `` so its
    emotion can serve as the prediction target.

    Args:
        row: Mapping-like record with ``'dialog'`` (sequence of utterances)
            and ``'emotion'`` (sequence of emotion labels, one per utterance).

    Returns:
        tuple: ``(prompt, label)`` where ``prompt`` is the formatted string
        and ``label`` is the last entry of ``row['emotion']``.
    """
    utterances = row['dialog']
    emotion_ids = row['emotion']
    final_index = len(utterances) - 1

    pieces = []
    for position, (text, emotion_id) in enumerate(zip(utterances, emotion_ids)):
        tag = "<Speaker_2>" if position % 2 else "<Speaker_1>"
        if position >= final_index:
            # Final utterance: leave the emotion slot open.
            pieces.append(f"{tag}: {text} | <Emotion>: ")
        else:
            pieces.append(f"{tag}: {text} | <Emotion>: {emotion_id} <|endofutterance|> ")

    return "".join(pieces), emotion_ids[-1]

In [None]:
def format_GPT2_no_last_utterance(row):
    """Format a dialog without its final utterance and encode its class label.

    Args:
        row: Mapping-like record with
            ``'dialog'`` - the utterances, either as a sequence or as the
            string representation of a Python list, and
            ``'classification'`` - one of ``'No Derailment'``,
            ``'Positive Derailment'`` or ``'Negative Derailment'``.

    Returns:
        pd.Series: ``[formatted_dialog, numeric_label]``. The dialog text has
        alternating ``<Speaker_1>`` / ``<Speaker_2>`` prefixes and each
        utterance is followed by ``<|endofutterance|> ``; the label is
        0, 1 or 2 respectively for the three classes listed above.

    Raises:
        KeyError: if ``row['classification']`` is not one of the three classes.
    """
    dialog = row['dialog']
    # Only parse when the dialog is still a string; ast.literal_eval rejects
    # an already-materialised list.
    if isinstance(dialog, str):
        dialog = ast.literal_eval(dialog)
    dialog = dialog[:-1]  # drop the last utterance

    # Map classification to numeric labels
    label_map = {'No Derailment': 0, 'Positive Derailment': 1, 'Negative Derailment': 2}
    numeric_label = label_map[row['classification']]

    formatted_dialog_GPT2 = "".join(
        f"{'<Speaker_1>' if i % 2 == 0 else '<Speaker_2>'}: {utterance}<|endofutterance|> "
        for i, utterance in enumerate(dialog)
    )

    return pd.Series([formatted_dialog_GPT2, numeric_label])

In [None]:
## Formatting the training set

# `formatted_dialogs` collects one prompt string per training dialog and `labels`
# the emotion of that dialog's final utterance (the classification target).
formatted_dialogs = []
labels = []
for _, row in train.iterrows():
    formatted_dialog, label = format_GPT2_last_utterance(row)
    formatted_dialogs.append(formatted_dialog)
    labels.append(label)

# Preview a few prompts; echoing the whole list would flood the cell output.
formatted_dialogs[:3]

In [None]:
## Formatting the validation set

# Read from the `validation` split. Iterating over `test` here would make the
# validation inputs identical to the test inputs, so model selection and early
# stopping would be driven by the test data.
formatted_dialogs_val = []
labels_val = []
for _, row in validation.iterrows():
    formatted_dialog_val, label_val = format_GPT2_last_utterance(row)
    formatted_dialogs_val.append(formatted_dialog_val)
    labels_val.append(label_val)

In [None]:
## Formatting the test set

# Format every test dialog once, then split the (prompt, label) pairs into two parallel lists.
test_pairs = [format_GPT2_last_utterance(row) for _, row in test.iterrows()]
formatted_dialogs_test = [prompt for prompt, _ in test_pairs]
labels_test = [emotion for _, emotion in test_pairs]

In [None]:
# pip install torch

In [None]:
import torch
from torch.utils.data import DataLoader, TensorDataset, random_split
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification, AdamW
from tqdm import tqdm

# Run on a CUDA GPU when one is visible (e.g. a Colab T4); otherwise fall back to the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda") if use_cuda else torch.device("cpu")
print(f"Using device: {device}")

In [None]:
# Show the installed PyTorch version.
torch.__version__

In [None]:
# Load the pretrained GPT-2 tokenizer and a GPT-2 sequence classifier with 7 output labels.
tokenizer = GPT2Tokenizer.from_pretrained(
    'gpt2',
    add_bos_token=True,
    add_eos_token=True,
)
model = GPT2ForSequenceClassification.from_pretrained('gpt2', num_labels=7)
model.to(device)

# Register the markers used in the formatted dialogs as special tokens, then resize the
# embedding matrix so it covers the enlarged vocabulary.
special_tokens = dict(
    bos_token='<|startoftext|>',
    sep_token='<|endofutterance|>',
    additional_special_tokens=['<Speaker_1>', '<Speaker_2>', '<Emotion>'],
)
tokenizer.add_special_tokens(special_tokens)
model.resize_token_embeddings(len(tokenizer))



In [None]:
# Tokenizer vocabulary size (the same value the embedding matrix is resized to).
len(tokenizer)

In [None]:
# Reuse the EOS token as the padding token so that batches can be padded.
tokenizer.pad_token = tokenizer.eos_token

# Only the final expression of a cell is rendered, so show all four tokens in one mapping.
{
    "pad_token": tokenizer.pad_token,
    "sep_token": tokenizer.sep_token,
    "bos_token": tokenizer.bos_token,
    "eos_token": tokenizer.eos_token,
}

In [None]:
# Tokenize train dataset
# All prompts are padded to a common length, capped at max_length=1024, and returned as
# PyTorch tensors.
# NOTE(review): if truncation happens on the right (the usual tokenizer default — confirm),
# a dialog longer than the cap loses its final utterance, which is the one being classified.
inputs = tokenizer(formatted_dialogs, padding=True, truncation=True, return_tensors="pt", max_length= 1024)
# Create TensorDataset
input_ids = inputs['input_ids']
attention_mask = inputs['attention_mask']
labels_tensor = torch.tensor(labels) #COME BACK TO THIS#
# NOTE(review): this rebinds `dataset`, which earlier held the loaded DailyDialog splits.
dataset = TensorDataset(input_ids, attention_mask, labels_tensor)

In [None]:
from collections import Counter
# Count how often each label occurs among the training targets (class balance check).
label_count = Counter(labels)
print(label_count)

In [None]:
# Inspect the padded token-id tensor.
input_ids


In [None]:
# Decode the first row of `input_ids` back to text to check the special tokens and padding.
tokenizer.decode(input_ids[0])

In [None]:
# Tokenize the prompts in `formatted_dialogs_val` with the same settings as the training set
# (padding to a common length, truncation at max_length=1024, PyTorch tensors).
val_inputs = tokenizer(formatted_dialogs_val, padding=True, truncation=True, return_tensors="pt", max_length= 1024)
# Bundle token ids, attention mask and labels into a TensorDataset for the validation loader
val_input_ids = val_inputs['input_ids']
val_attention_mask = val_inputs['attention_mask']
val_labels_tensor = torch.tensor(labels_val) #COME BACK TO THIS#
val_dataset = TensorDataset(val_input_ids, val_attention_mask, val_labels_tensor)

In [None]:
# Tokenize the prompts in `formatted_dialogs_test` with the same settings as the training set
# (padding to a common length, truncation at max_length=1024, PyTorch tensors).
test_inputs = tokenizer(formatted_dialogs_test, padding=True, truncation=True, return_tensors="pt", max_length= 1024)
# Bundle token ids, attention mask and labels into a TensorDataset for the test loader
test_input_ids = test_inputs['input_ids']
test_attention_mask = test_inputs['attention_mask']
test_labels_tensor = torch.tensor(labels_test) #COME BACK TO THIS#
test_dataset = TensorDataset(test_input_ids, test_attention_mask, test_labels_tensor)

In [None]:
## Creating the data loaders
# All three loaders use the same batch size. Only the training loader shuffles;
# the validation and test loaders iterate in dataset order.
batch_size = 2
train_dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

In [None]:
import torch

In [None]:
from torch.optim import AdamW

In [None]:
# Defining the optimizer
# AdamW over all model parameters with a learning rate of 1e-5.
# NOTE: `AdamW` is imported twice in this notebook (from `transformers` and from
# `torch.optim`); when cells are run in order the later `torch.optim` import is the one bound here.
optimizer = AdamW(model.parameters(), lr=1e-5)

In [None]:
import os
# Debugging aid: asks CUDA to run kernels synchronously so that errors surface at the call
# that caused them.
# NOTE(review): this is set after the model has already been moved to the device; the
# variable is presumably read when CUDA initialises, so it may have no effect here — confirm,
# and either move it above the first CUDA use or drop it (synchronous launches slow training).
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

In [None]:
import copy

from sklearn.metrics import f1_score

# Training and Validation Loop
#
# Each epoch runs one optimisation pass over `train_dataloader`, then evaluates the model
# on `val_dataloader` (mean loss and macro F1). The weights with the lowest validation
# loss seen so far are snapshotted into `best_model`, and the loop stops once `patience`
# consecutive epochs fail to improve on that loss.
#
# NOTE: with num_epochs = 5 and patience = 4 the stop condition can only become true after
# the final epoch (barring a NaN validation loss), so it never shortens the run; lower
# `patience` or raise `num_epochs` for early stopping to take effect.

# GPT2ForSequenceClassification uses the padding id to find the last real token of each
# padded sequence. The id is constant, so set it once rather than on every batch.
model.config.pad_token_id = tokenizer.pad_token_id

model.train()
num_epochs = 5
train_losses = []
val_losses = []
f1_scores = []
best_val_loss = float('inf')
patience = 4
epochs_no_improve = 0
early_stop = False

for epoch in tqdm(range(num_epochs), desc = "Epochs"):
    if early_stop:
        print("Early stopping")
        break

    print(f"Epoch {epoch+1} of {num_epochs}")

    # Training
    total_train_loss = 0
    for batch in tqdm(train_dataloader, desc='Training Progress', miniters=10):
        # Batch-local names: reusing `input_ids` / `attention_mask` here would overwrite
        # the notebook-level tensors holding the full tokenized training set.
        batch_input_ids, batch_attention_mask, batch_labels = [tensor.to(device) for tensor in batch]
        optimizer.zero_grad()
        outputs = model(input_ids=batch_input_ids, attention_mask=batch_attention_mask, labels=batch_labels)
        loss = outputs.loss
        total_train_loss += loss.item()
        loss.backward()
        optimizer.step()

    avg_train_loss = total_train_loss / len(train_dataloader)
    train_losses.append(avg_train_loss)
    print(f"Total training loss: {total_train_loss}")
    print(f"Training loss: {avg_train_loss}")

    # Validation
    model.eval()
    val_predictions = []
    val_true_labels = []
    total_val_loss = 0
    with torch.no_grad():
        for batch in tqdm(val_dataloader, desc='Validation Progress', miniters=10):
            batch_input_ids, batch_attention_mask, batch_labels = [tensor.to(device) for tensor in batch]
            outputs = model(input_ids=batch_input_ids, attention_mask=batch_attention_mask, labels=batch_labels)
            loss = outputs.loss
            total_val_loss += loss.item()

            # Softmax is monotonic, so the argmax of the raw logits is already the predicted class.
            logits = outputs.logits
            pred_classes = torch.argmax(logits, dim=1)

            val_predictions.extend(pred_classes.cpu().tolist())
            val_true_labels.extend(batch_labels.cpu().tolist())

    avg_val_loss = total_val_loss / len(val_dataloader)
    val_losses.append(avg_val_loss)
    val_macro_f1 = f1_score(val_true_labels, val_predictions, average='macro')
    f1_scores.append(val_macro_f1)
    print(f"Validation loss: {avg_val_loss}")
    print(f"Validation Macro F1 score: {val_macro_f1}")

    # Early Stopping
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        # Deep copy: state_dict() returns references to the live parameters, which keep
        # changing as training continues.
        best_model = copy.deepcopy(model.state_dict())
        epochs_no_improve = 0
    else:
        epochs_no_improve += 1
        if epochs_no_improve >= patience:
            early_stop = True


    # Set the model back to training mode
    model.train()