# Deeplearning with Transformer Architectures

(Explanation of method here)

## Relevant Imports and PIP Installs

In [101]:
!pip3 install sentence-transformers
!pip3 inst

Defaulting to user installation because normal site-packages is not writeable


In [239]:
# Import all relevant modules

import torch
import torch.nn as nn
from sentence_transformers import SentenceTransformer
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import nltk
import os
import numpy as np
from sklearn.metrics import precision_recall_curve, f1_score

import warnings
warnings.filterwarnings("ignore")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

torch.manual_seed(42)

cuda


<torch._C.Generator at 0x742cc81fb550>

## Dataset Class and Preprocessing Function

In [187]:
# Define Preprocessing

def preprocess_line(line: str, params: set, tokenizer=None) -> list[str]:
    '''
    Preprocesses a line of text

    :param line:
    :param params:
    :return:

    Program flow:
        If the line contains an email, trims out the email header
        Tokenises the line
        Applies various transformations
            Removes stop words
            Stems
            Lemmatises
            Remvoes non-alphanumeric characters
            Sets all lowercase
        Returns the list of tokens
    '''

    if "trim email" in params:
        if "-- Forwarded by" in line:
            before_email = line.split("-")[0].strip()
            email = "-" + line.split("-", 1)[1] if "-" in line else ""
            email_subject = email.split("Subject:")[-1].strip() # if there's no subject, this keeps the whole email
            line = before_email + " " + email_subject
            line = line.strip()
        line
    
    if tokenizer == None:
        tokens = nltk.tokenize.word_tokenize(line)
    else:
        tokens = tokenizer.tokenize(line)
        starts_with_space_labels = [1 if "Ġ" in token else 0 for token in tokens]
        tokens = [token.replace("Ġ","") if "Ġ" in token else token for token in tokens ]
        if len(tokens) != len(starts_with_space_labels):
            print(len(tokens))
            print(len(starts_with_space_labels))
            raise Exception("A token has been removed, labels dont align")

    operations = {
        "stop words": lambda tokens: [token if token.lower() not in nltk.corpus.stopwords.words('english') else "" for token in tokens],
        "stem": lambda tokens: [nltk.PorterStemmer().stem(token) for token in tokens],
        "lemmatise": lambda tokens: [nltk.WordNetLemmatizer().lemmatize(token) for token in tokens],
        "alphanumeric": lambda tokens: ["".join(filter(str.isalnum, token)) for token in tokens],
        "lowercase": lambda tokens: [token.lower() for token in tokens]
    }

    # Apply each operation if we define it in the params set
    for key, action in operations.items():
        if key in params:
            tokens = action(tokens)
    
    if tokenizer == None:
        return [token for token in tokens if token != ""]
    else:
        if len(tokens) != len(starts_with_space_labels):
            print(len(tokens))
            print(len(starts_with_space_labels))
            raise Exception("A token has been removed, labels dont align")
        return [tokens[i] if not starts_with_space_labels[i] else "".join(["Ġ", tokens[i]]) for i in range(len(tokens))]

In [205]:
# Define and create dataset and data loader

class reformattedDataset(Dataset):
    def __init__(self, df_name, parameters, reset_cache=False):
        # Name is in form <path>/<name>.csv
        name = df_name.split("/")[-1]
        name = name.split(".")[0]
        print(name)

        df = pd.read_csv(df_name)
        
        if os.path.exists(f"Cached_MPNet/{name}_output1.pt") and os.path.exists(f"Cached_MPNet/{name}_output2.pt") and reset_cache == False:
            print("Loading cached MPNet outputs...")
            loaded_o1 = torch.load(f"Cached_MPNet/{name}_output1.pt", weights_only=True)
            self.output1 = torch.stack([torch.tensor(t) for t in loaded_o1])
            self.output2 = torch.stack([torch.tensor(t) for t in torch.load(f"Cached_MPNet/{name}_output2.pt", weights_only=True)])
            print(f"Successfully loaded {len(self.output1)} MPNet outputs")

            self.labels = df["label"][:len(self.output1)].reset_index(drop=True)

        else:
            mpnet = SentenceTransformer('sentence-transformers/all-mpnet-base-v2', device=device)

            texts1 = df["text_1"].astype(str).tolist()
            texts2 = df["text_2"].astype(str).tolist()

            self.output1 = torch.tensor(mpnet.encode(texts1, batch_size=16, convert_to_tensor=True, show_progress_bar=True))
            self.output2 = torch.tensor(mpnet.encode(texts2, batch_size=16, convert_to_tensor=True, show_progress_bar=True))
            
            self.labels = df["label"]

            torch.save(self.output1, f"Cached_MPNet/{name}_output1.pt")
            torch.save(self.output2, f"Cached_MPNet/{name}_output2.pt")

    def __len__(self) -> int:
        return len(self.output1)

    def __getitem__(self, i):

        return self.output1[i], self.output2[i], torch.tensor(self.labels.iloc[i], dtype=torch.float)

class MPNetTestDataset(Dataset):
    def __init__(self, df_name, parameters, reset_cache=False):
        # Name is in form <path>/<name>.csv
        name = df_name.split("/")[-1]
        name = name.split(".")[0]
        print(name)

        df = pd.read_csv(df_name)
        
        if os.path.exists(f"Cached_MPNet/{name}_output1.pt") and os.path.exists(f"Cached_MPNet/{name}_output2.pt") and reset_cache == False:
            print("Loading cached MPNet outputs...")
            loaded_o1 = torch.load(f"Cached_MPNet/{name}_output1.pt", weights_only=True)
            self.output1 = torch.stack([torch.tensor(t) for t in loaded_o1])
            self.output2 = torch.stack([torch.tensor(t) for t in torch.load(f"Cached_MPNet/{name}_output2.pt", weights_only=True)])
            print(f"Successfully loaded {len(self.output1)} MPNet outputs")

        else:
            mpnet = SentenceTransformer('sentence-transformers/all-mpnet-base-v2', device=device)

            texts1 = df["text_1"].astype(str).tolist()
            texts2 = df["text_2"].astype(str).tolist()

            self.output1 = torch.tensor(mpnet.encode(texts1, batch_size=16, convert_to_tensor=True, show_progress_bar=True))
            self.output2 = torch.tensor(mpnet.encode(texts2, batch_size=16, convert_to_tensor=True, show_progress_bar=True))
            
            torch.save(self.output1, f"Cached_MPNet/{name}_output1.pt")
            torch.save(self.output2, f"Cached_MPNet/{name}_output2.pt")

    def __len__(self) -> int:
        return len(self.output1)

    def __getitem__(self, i):

        return self.output1[i], self.output2[i]

# dataset = reformattedDataset("Data/train.csv", parameters={}, reset_cache=False) #TODO try parameters
# val_dataset = reformattedDataset("Data/dev.csv", parameters={}, reset_cache=True)
# test_dataset = reformattedDataset("Data/AV_trial.csv", parameters={}, reset_cache=True)
# dataloader = DataLoader(dataset, batch_size=128, shuffle=True)
# val_dataloader = DataLoader(val_dataset, batch_size=1, shuffle=False)

## Classifier Definition

In [228]:
# define our architecture

class classifierMPNet(nn.Module):
    def __init__(self):
        super(classifierMPNet, self).__init__()

        # add our custom binary classifier layer to end
        self.fc1 = nn.Linear(768*4 + 1, 1024)
        self.fc2 = nn.Linear(1024, 512)
        self.fc3 = nn.Linear(512, 1)
        
        self.classifier = nn.Sequential(
            self.fc1,
            nn.ReLU(),
            nn.LayerNorm(1024),
            nn.Dropout(0.3),
            self.fc2, 
            nn.ReLU(),
            nn.LayerNorm(512),
            nn.Dropout(0.3),
            self.fc3
        )
        self.sigmoid = nn.Sigmoid() #to convert classifier output to probability

    def forward(self, input1, input2):
        # use pre computed MPNet tensors
        cos_sim = nn.functional.cosine_similarity(input1, input2, dim=1).unsqueeze(1)
        combined_inputs = torch.cat((input1, input2, torch.abs(input1-input2), input1*input2, cos_sim), dim=1)

        # pass result through our appended classifier layers
        classifier_output = self.classifier(combined_inputs)
        return classifier_output


class AverageClassifier(nn.Module):
    def __init__(self):
        super(AverageClassifier, self).__init__()
        self.clfs = nn.ModuleList([classifierMPNet() for _ in range(5)])

    def forward(self, input1, input2):
        outs = [clf(input1, input2) for clf in self.clfs]
        stack = torch.stack(outs, dim=0)
        return torch.mean(stack, dim=0)

## Train and Test Functions

In [255]:
def train_classifier():
    # Hyper-Parameters:
    batch_size = 64
    lr = 0.001
    epochs = 20
    threshold = 0
    weight_decay = 0.01

    model = AverageClassifier().to(device)
    criterion = nn.BCEWithLogitsLoss()
    optimiser = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
    
    train_ds = reformattedDataset("Data/train.csv", parameters={}, reset_cache=False)
    val_ds = reformattedDataset("Data/dev.csv", parameters={}, reset_cache=False)
    train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
    val_dl = DataLoader(val_ds, batch_size=1, shuffle=False)
    
    for epoch in range(epochs):
        model.train()
        loss_list = []

        for s1, s2, l in train_dl:
            s1, s2, l = s1.to(device), s2.to(device), l.to(device)
            
            optimiser.zero_grad()
            output = model(s1, s2).reshape(l.shape[0])
            loss = criterion(output, l)
            loss_list.append(loss.detach().cpu().numpy())
            loss.backward()
            optimiser.step()

        print(f"Mean training loss for epoch {epoch+1}: {np.mean(loss_list)}")

        loss_list = []
        preds = []
        true_labels = []

        model.eval()
        with torch.no_grad():
            for s1, s2, l in val_dl:
                s1, s2, l = s1.to(device), s2.to(device), l.to(device)
                output = model(s1, s2).reshape(l.shape[0])
                loss = criterion(output, l)
                loss_list.append(loss.detach().cpu().numpy())
                preds.append(1 if output[0] > threshold else 0)
                true_labels.append(l.detach().cpu().numpy())

        correct = 0
        TPs, FPs, FNs, TNs = 0, 0, 0, 0
        for j in range(len(true_labels)):
            if true_labels[j] == preds[j]:
                correct += 1
                if true_labels[j] == 1:
                    TPs += 1
                else:
                    TNs += 1
            else:
                if true_labels[j] == 1:
                    FNs += 1
                else:
                    FPs += 1

        accuracy = correct / len(true_labels)
        recall = 0 if TPs + FNs == 0 else TPs / (TPs + FNs)
        precision = 0 if TPs + FPs == 0 else TPs / (TPs + FPs)
        f1 = 0 if precision + recall == 0 else 2 * (precision * recall) / (precision + recall)

        print(f"Testing Stats for Epoch {epoch+1}")
        print(f"    Mean Loss: {np.mean(loss_list)}")
        print(f"    Accuracy: {accuracy}")
        print(f"    _____________________________")
        print(f"    |True\Pred|Positive|Negative|")
        print(f"    |---------|--------|--------|")
        print(f"    |Positive |TPs: {TPs}|FNs: {FNs}|")
        print(f"    |---------|--------|--------|")
        print(f"    |Negative |FPs: {FPs}|TNs: {TNs}|")
        print(f"    |---------|--------|--------|")
        print(f"    ")
        print(f"    Precision: {precision}")
        print(f"    Recall: {recall}")
        print(f"    F1 Score: {f1}")

    return model
        

In [281]:
def eval_classifier(model):
    val_ds = reformattedDataset("Data/dev.csv", parameters={}, reset_cache=False)
    val_dl = DataLoader(val_ds, batch_size=1, shuffle=False)

    criterion = nn.BCEWithLogitsLoss()
    
    model.eval()
    loss_list = []
    preds = []
    true_labels = []

    threshold = 0
    
    model.eval()
    with torch.no_grad():
        for s1, s2, l in val_dl:
            s1, s2, l = s1.to(device), s2.to(device), l.to(device)
            output = model(s1, s2).reshape(l.shape[0])
            loss = criterion(output, l)
            loss_list.append(loss.detach().cpu().numpy())
            preds.append(1 if output[0] > threshold else 0)
            true_labels.append(l.detach().cpu().numpy())

    correct = 0
    TPs, FPs, FNs, TNs = 0, 0, 0, 0
    for j in range(len(true_labels)):
        if true_labels[j] == preds[j]:
            correct += 1
            if true_labels[j] == 1:
                TPs += 1
            else:
                TNs += 1
        else:
            if true_labels[j] == 1:
                FNs += 1
            else:
                FPs += 1

    accuracy = correct / len(true_labels)
    recall = 0 if TPs + FNs == 0 else TPs / (TPs + FNs)
    precision = 0 if TPs + FPs == 0 else TPs / (TPs + FPs)
    f1 = 0 if precision + recall == 0 else 2 * (precision * recall) / (precision + recall)

    print(f"Testing Stats for MPNet Classifer (Threshold: {threshold})")
    print(f"    Mean Loss: {np.mean(loss_list)}")
    print(f"    Accuracy: {accuracy}")
    print(f"    _____________________________")
    print(f"    |True\Pred|Positive|Negative|")
    print(f"    |---------|--------|--------|")
    print(f"    |Positive |TPs: {TPs}|FNs: {FNs}|")
    print(f"    |---------|--------|--------|")
    print(f"    |Negative |FPs: {FPs}|TNs: {TNs}|")
    print(f"    |---------|--------|--------|")
    print(f"    ")
    print(f"    Precision: {precision}")
    print(f"    Recall: {recall}")
    print(f"    F1 Score: {f1}")

In [282]:
def test_classifier(model, data_path):
    test_ds = MPNetTestDataset(test_data_path, parameters={}, reset_cache=False)
    test_dl = DataLoader(test_ds, batch_size=1, shuffle=False)

    criterion = nn.BCEWithLogitsLoss()

    loss_list = []
    preds = []
    true_labels = []

    threshold = 0

    model.eval()

    with torch.no_grad():
        for s1, s2 in test_dl:
            s1, s2 = s1.to(device), s2.to(device)

            output = model(s1, s2)
            preds.append(1 if output[0] > threshold else 0)

    return preds

## Notebook "Engine"

In [283]:
# Notebook parameters - decides which of the next cells actually run
generate_new_encodings = False
train_new_classifier = False
test_the_classifier = True
generate_evaluation_csv = True
evaluate_classifier = True

test_data_path = "Data/test.csv"

In [284]:
# Encode dataset - run_next usually set to False as this means you can spam shift+enter to get through notebook
if generate_new_encodings:
    dataset = reformattedDataset("Data/train.csv", parameters={}, reset_cache=True) #TODO try parameters
    val_dataset = reformattedDataset("Data/dev.csv", parameters={}, reset_cache=True)

In [285]:
if train_new_classifier:
    mpnet_classifier = train_classifier()
    torch.save(mpnet_classifier.state_dict(), "Cached_MPNet/trained_classifier_weights.pt")

In [286]:
if evaluate_classifier:
    mpnet_classifier = AverageClassifier().to(device)
    mpnet_classifier.load_state_dict(torch.load("Cached_MPNet/trained_classifier_weights.pt", weights_only=True))
    eval_classifier(mpnet_classifier)

dev
Loading cached MPNet outputs...
Successfully loaded 5993 MPNet outputs
Testing Stats for MPNet Classifer (Threshold: 0)
    Mean Loss: 1.5296602249145508
    Accuracy: 0.6821291506757884
    _____________________________
    |True\Pred|Positive|Negative|
    |---------|--------|--------|
    |Positive |TPs: 2023|FNs: 1033|
    |---------|--------|--------|
    |Negative |FPs: 872|TNs: 2065|
    |---------|--------|--------|
    
    Precision: 0.6987910189982729
    Recall: 0.661976439790576
    F1 Score: 0.6798857334901698


In [209]:
if test_the_classifier:
    mpnet_classifier = AverageClassifier().to(device)
    mpnet_classifier.load_state_dict(torch.load("Cached_MPNet/trained_classifier_weights.pt", weights_only=True))
    predictions = test_classifier(mpnet_classifier, test_data_path)
    preds_df = pd.DataFrame(predictions)
    preds_df.to_csv("Group_17_C.csv")

test
Loading cached MPNet outputs...


  self.output1 = torch.stack([torch.tensor(t) for t in loaded_o1])
  self.output2 = torch.stack([torch.tensor(t) for t in torch.load(f"Cached_MPNet/{name}_output2.pt", weights_only=True)])


Successfully loaded 5985 MPNet outputs


In [220]:
if generate_evaluation_csv:
    mpnet_classifier = AverageClassifier().to(device)
    mpnet_classifier.load_state_dict(torch.load("Cached_MPNet/trained_classifier_weights.pt", weights_only=True))
    predictions = test_classifier(mpnet_classifier, "Data/dev.csv")
    print("Done Test")
    predictions = ["prediction"] + predictions
    preds_df = pd.DataFrame(predictions)
    preds_df.to_csv("MPNet_Eval.csv", index=False, header=False)

test
Loading cached MPNet outputs...
Successfully loaded 5985 MPNet outputs


  self.output1 = torch.stack([torch.tensor(t) for t in loaded_o1])
  self.output2 = torch.stack([torch.tensor(t) for t in torch.load(f"Cached_MPNet/{name}_output2.pt", weights_only=True)])


Done Test
