# Deep Learning with Transformer Architectures

There are six sections to this implementation.
- Relevant Imports and PIP Installs (self-explanatory).
- Dataset class and preprocessing function. Creates and stores the MPNet embeddings on the pre-processed dataset.
- Classifier Definition. Defines the ensemble as well as its binary classifier members.
- Functions for training and testing.
- The Notebook "Engine", which controls which cells will execute. For example, sometimes we might want to skip training or evaluation.
- Demo block (this requires running the previous cells to define the needed functions).

## Relevant Imports and PIP Installs

In [1]:
!pip3 install sentence-transformers
!pip3 inst




[notice] A new release of pip is available: 24.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip
ERROR: unknown command "inst" - maybe you meant "list"



In [2]:
# Import all relevant modules

import torch
import torch.nn as nn
from sentence_transformers import SentenceTransformer
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import nltk
import os
import numpy as np
from sklearn.metrics import precision_recall_curve, f1_score
from torch.utils.data import Subset

import warnings
# Silence every Python warning for the rest of the session. This keeps the
# cell output compact but also hides deprecation and runtime warnings.
warnings.filterwarnings("ignore")

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
# `device` is read by the dataset classes and the train/eval/test functions.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

# Seeds PyTorch's random number generator. NOTE(review): no NumPy seed is set
# in this cell, so the np.random.choice sampling in train_classifier is not
# covered by this seed.
torch.manual_seed(42)

  from tqdm.autonotebook import tqdm, trange



cuda


<torch._C.Generator at 0x226330a1af0>

## Dataset Class and Preprocessing Function

In [3]:
# Define Preprocessing

def preprocess_line(line: str, params: set, tokenizer=None) -> list[str]:
    '''
    Preprocesses a line of text into a list of tokens

    :param line: the raw text to preprocess
    :param params: names of the steps to apply. Recognised names are
        "trim email", "stop words", "stem", "lemmatise", "alphanumeric" and
        "lowercase"; any other name is ignored
    :param tokenizer: optional object with a ``tokenize(str)`` method returning
        a list of string tokens. When omitted, nltk.tokenize.word_tokenize is
        used. Tokens containing the "Ġ" marker have it stripped before the
        transformations run, and a single leading "Ġ" is put back afterwards
    :return: the processed tokens. Without a tokenizer, tokens that end up
        empty are dropped. With a tokenizer every token position is kept
        (an emptied token comes back as "" or "Ġ") so the output stays aligned
        with the tokenizer's output
    :raises Exception: if a transformation changed the number of tokens while a
        custom tokenizer is in use

    Program flow:
        If the line contains a forwarded email, trims out the email header
        Tokenises the line
        Applies the requested transformations, always in this order
            Removes stop words
            Stems
            Lemmatises
            Removes non-alphanumeric characters
            Sets all lowercase
        Returns the list of tokens
    '''

    if "trim email" in params and "-- Forwarded by" in line:
        # The marker guarantees the line contains at least one "-".
        # Everything before the first "-" is kept as the text preceding the email.
        before_email, _, rest = line.partition("-")
        email = "-" + rest
        # If there's no subject, this keeps the whole email
        email_subject = email.split("Subject:")[-1].strip()
        line = (before_email.strip() + " " + email_subject).strip()

    if tokenizer is None:
        tokens = nltk.tokenize.word_tokenize(line)
        starts_with_space_labels = None
    else:
        raw_tokens = tokenizer.tokenize(line)
        # Remember which tokens carried the "Ġ" marker so it can be restored
        # after the transformations (which would otherwise mangle it).
        starts_with_space_labels = ["Ġ" in token for token in raw_tokens]
        tokens = [token.replace("Ġ", "") for token in raw_tokens]

    # Each resource is built once per call instead of once per token.
    if "stop words" in params:
        stop_words = set(nltk.corpus.stopwords.words('english'))
        # Stop words are blanked rather than removed so positions stay aligned.
        tokens = [token if token.lower() not in stop_words else "" for token in tokens]
    if "stem" in params:
        stemmer = nltk.PorterStemmer()
        tokens = [stemmer.stem(token) for token in tokens]
    if "lemmatise" in params:
        lemmatiser = nltk.WordNetLemmatizer()
        tokens = [lemmatiser.lemmatize(token) for token in tokens]
    if "alphanumeric" in params:
        tokens = ["".join(filter(str.isalnum, token)) for token in tokens]
    if "lowercase" in params:
        tokens = [token.lower() for token in tokens]

    if starts_with_space_labels is None:
        return [token for token in tokens if token != ""]

    # Guard against a future transformation that adds or drops tokens.
    if len(tokens) != len(starts_with_space_labels):
        print(len(tokens))
        print(len(starts_with_space_labels))
        raise Exception("A token has been removed, labels dont align")
    return [
        "Ġ" + token if had_marker else token
        for token, had_marker in zip(tokens, starts_with_space_labels)
    ]

In [None]:
# Define and create dataset and data loader

class reformattedDataset(Dataset):
    '''
    Labelled dataset of pre-computed MPNet sentence-embedding pairs.

    Reads a CSV with "text_1", "text_2" and "label" columns. The embeddings of
    both text columns are loaded from Cached_MPNet/<name>_output{1,2}.pt when
    both files exist, otherwise they are computed with all-mpnet-base-v2 and
    written to those files.

    Each item is (embedding of text_1, embedding of text_2, label as a float tensor).
    '''

    def __init__(self, df_name, parameters, reset_cache=False):
        '''
        :param df_name: path to the CSV, in the form <path>/<name>.csv
        :param parameters: accepted for interface compatibility; not used here
        :param reset_cache: when True, ignore any cached embeddings, recompute
            them and overwrite the cache files
        '''
        # Name is in form <path>/<name>.csv
        name = df_name.split("/")[-1]
        name = name.split(".")[0]
        print(name)

        df = pd.read_csv(df_name)

        cache_path1 = f"Cached_MPNet/{name}_output1.pt"
        cache_path2 = f"Cached_MPNet/{name}_output2.pt"

        # load stored embeddings if found
        if not reset_cache and os.path.exists(cache_path1) and os.path.exists(cache_path2):
            print("Loading cached MPNet outputs...")
            self.output1 = self._as_tensor(torch.load(cache_path1, weights_only=True, map_location=device))
            self.output2 = self._as_tensor(torch.load(cache_path2, weights_only=True, map_location=device))
            print(f"Successfully loaded {len(self.output1)} MPNet outputs")

            # The cache is keyed by file name only, so it can go stale when the
            # CSV changes. Make the mismatch visible instead of silently
            # pairing embeddings with the wrong labels.
            if len(self.output1) != len(df):
                print(f"Warning: cache holds {len(self.output1)} embeddings but {df_name} has {len(df)} rows; "
                      "pass reset_cache=True to rebuild the cache")

            self.labels = df["label"][:len(self.output1)].reset_index(drop=True)

        # create embeddings if not found
        else:
            mpnet = SentenceTransformer('sentence-transformers/all-mpnet-base-v2', device=device)

            texts1 = df["text_1"].astype(str).tolist()
            texts2 = df["text_2"].astype(str).tolist()

            # encode(..., convert_to_tensor=True) already returns a tensor;
            # clone().detach() is the recommended way to take an independent copy.
            self.output1 = mpnet.encode(texts1, batch_size=16, convert_to_tensor=True, show_progress_bar=True).clone().detach()
            self.output2 = mpnet.encode(texts2, batch_size=16, convert_to_tensor=True, show_progress_bar=True).clone().detach()

            self.labels = df["label"]

            # torch.save does not create missing parent directories.
            os.makedirs("Cached_MPNet", exist_ok=True)
            torch.save(self.output1, cache_path1)
            torch.save(self.output2, cache_path2)

    @staticmethod
    def _as_tensor(loaded):
        '''Return `loaded` as one tensor, stacking it first if it is a sequence of rows.'''
        if isinstance(loaded, torch.Tensor):
            return loaded
        return torch.stack([torch.as_tensor(row) for row in loaded])

    def __len__(self) -> int:
        return len(self.output1)

    def __getitem__(self, i):
        return self.output1[i], self.output2[i], torch.tensor(self.labels.iloc[i], dtype=torch.float)

# same as above class, except no label
class MPNetTestDataset(Dataset):
    '''
    Unlabelled dataset of pre-computed MPNet sentence-embedding pairs.

    Reads a CSV with "text_1" and "text_2" columns. The embeddings of both
    columns are loaded from Cached_MPNet/<name>_output{1,2}.pt when both files
    exist, otherwise they are computed with all-mpnet-base-v2 and written to
    those files.

    Each item is (embedding of text_1, embedding of text_2).
    '''

    def __init__(self, df_name, parameters, reset_cache=False):
        '''
        :param df_name: path to the CSV, in the form <path>/<name>.csv
        :param parameters: accepted for interface compatibility; not used here
        :param reset_cache: when True, ignore any cached embeddings, recompute
            them and overwrite the cache files
        '''
        # Name is in form <path>/<name>.csv
        name = df_name.split("/")[-1]
        name = name.split(".")[0]
        print(name)

        df = pd.read_csv(df_name)

        cache_path1 = f"Cached_MPNet/{name}_output1.pt"
        cache_path2 = f"Cached_MPNet/{name}_output2.pt"

        # load stored embeddings if found
        if not reset_cache and os.path.exists(cache_path1) and os.path.exists(cache_path2):
            print("Loading cached MPNet outputs...")
            self.output1 = self._as_tensor(torch.load(cache_path1, weights_only=True, map_location=device))
            self.output2 = self._as_tensor(torch.load(cache_path2, weights_only=True, map_location=device))
            print(f"Successfully loaded {len(self.output1)} MPNet outputs")

            # The cache is keyed by file name only, so it can go stale when
            # the CSV changes; make the mismatch visible.
            if len(self.output1) != len(df):
                print(f"Warning: cache holds {len(self.output1)} embeddings but {df_name} has {len(df)} rows; "
                      "pass reset_cache=True to rebuild the cache")

        # create embeddings if not found
        else:
            mpnet = SentenceTransformer('sentence-transformers/all-mpnet-base-v2', device=device)

            texts1 = df["text_1"].astype(str).tolist()
            texts2 = df["text_2"].astype(str).tolist()

            # encode(..., convert_to_tensor=True) already returns a tensor;
            # clone().detach() is the recommended way to take an independent copy.
            self.output1 = mpnet.encode(texts1, batch_size=16, convert_to_tensor=True, show_progress_bar=True).clone().detach()
            self.output2 = mpnet.encode(texts2, batch_size=16, convert_to_tensor=True, show_progress_bar=True).clone().detach()

            # torch.save does not create missing parent directories.
            os.makedirs("Cached_MPNet", exist_ok=True)
            torch.save(self.output1, cache_path1)
            torch.save(self.output2, cache_path2)

    @staticmethod
    def _as_tensor(loaded):
        '''Return `loaded` as one tensor, stacking it first if it is a sequence of rows.'''
        if isinstance(loaded, torch.Tensor):
            return loaded
        return torch.stack([torch.as_tensor(row) for row in loaded])

    def __len__(self) -> int:
        return len(self.output1)

    def __getitem__(self, i):
        return self.output1[i], self.output2[i]


## Classifier Definition

In [None]:
# define our architecture

class classifierMPNet(nn.Module):
    '''
    One ensemble member: an MLP head over a pair of pre-computed embeddings.

    The two embeddings are combined into [u, v, |u - v|, u * v, cos(u, v)]
    (768 * 4 + 1 features) and passed through 1024 -> 512 -> 1 linear layers
    with ReLU, LayerNorm and Dropout(0.3) in between.
    '''

    def __init__(self,child_num):
        super().__init__()

        # index of this member inside the ensemble
        self.child_num = child_num

        embed_dim, hidden_a, hidden_b = 768, 1024, 512

        # The linear layers are kept as attributes as well as inside the
        # Sequential below, so they appear under both names in the state_dict.
        self.fc1 = nn.Linear(embed_dim * 4 + 1, hidden_a)
        self.fc2 = nn.Linear(hidden_a, hidden_b)
        self.fc3 = nn.Linear(hidden_b, 1)

        stages = [
            self.fc1, nn.ReLU(), nn.LayerNorm(hidden_a), nn.Dropout(0.3),
            self.fc2, nn.ReLU(), nn.LayerNorm(hidden_b), nn.Dropout(0.3),
            self.fc3,
        ]
        self.classifier = nn.Sequential(*stages)

        # converts a logit to a probability; forward() does not apply it
        self.sigmoid = nn.Sigmoid()

    def forward(self, input1, input2):
        '''
        :param input1: pre-computed embeddings of the first texts, (batch, 768)
        :param input2: pre-computed embeddings of the second texts, (batch, 768)
        :return: raw logits of shape (batch, 1)
        '''
        similarity = nn.functional.cosine_similarity(input1, input2, dim=1)
        features = torch.cat(
            [
                input1,
                input2,
                (input1 - input2).abs(),
                input1 * input2,
                similarity.unsqueeze(1),
            ],
            dim=1,
        )
        return self.classifier(features)


class AverageClassifier(nn.Module):
    '''Ensemble whose output is the mean of its members' outputs.'''

    def __init__(self, children=5):
        super().__init__()

        # create ensemble members, each tagged with its index
        members = []
        for child_num in range(children):
            members.append(classifierMPNet(child_num))
        self.clfs = nn.ModuleList(members)

    def forward(self, input1, input2):
        '''Run every member on the same input pair and average the results element-wise.'''
        member_outputs = torch.stack([member(input1, input2) for member in self.clfs], dim=0)
        return member_outputs.mean(dim=0)

## Train and Test Functions

In [None]:
def train_classifier():
    '''
    Trains the bagged ensemble of MPNet pair classifiers and returns it

    Program flow:
        Loads (or builds) the train and dev embedding datasets
        Draws one bootstrap sample of the training set per ensemble member
        Trains each member independently on its own sample
        After each member finishes, evaluates the WHOLE ensemble on the dev
        set and prints loss, accuracy, confusion matrix, precision, recall, F1

    :return: the trained AverageClassifier, on `device`
    '''
    # Hyper-Parameters:
    batch_size = 64
    lr = 0.001
    epochs = 40
    threshold = 0  # applied to raw logits; a logit of 0 is a probability of 0.5
    weight_decay = 0.01

    num_children = 5
    model = AverageClassifier(num_children).to(device)
    criterion = nn.BCEWithLogitsLoss()

    train_ds = reformattedDataset("Data/train.csv", parameters={}, reset_cache=False)
    val_ds = reformattedDataset("Data/dev.csv", parameters={}, reset_cache=False)
    val_dl = DataLoader(val_ds, batch_size=batch_size, shuffle=False)

    # Bagging: each child gets its own bootstrap sample (drawn WITH replacement)
    # of 80% of the training-set size.
    subset_ratio = 0.8
    subset_size = int(len(train_ds) * subset_ratio)
    train_ds_splits = [
        Subset(train_ds, np.random.choice(len(train_ds), subset_size, replace=True))
        for _ in range(num_children)
    ]

    # train each child on its own data split
    for child_num, child_model in enumerate(model.clfs):
        print(f"Training child #{child_num + 1}")

        # A fresh optimiser per child, covering only that child's parameters
        optimiser = torch.optim.AdamW(child_model.parameters(), lr=lr, weight_decay=weight_decay)
        child_dl = DataLoader(train_ds_splits[child_num], batch_size=batch_size, shuffle=True)
        for epoch in range(epochs):
            child_model.train()
            loss_list = []

            for s1, s2, l in child_dl:
                s1, s2, l = s1.to(device), s2.to(device), l.to(device)

                optimiser.zero_grad()
                output = child_model(s1, s2).reshape(l.shape[0])
                loss = criterion(output, l)
                loss_list.append(loss.item())
                loss.backward()
                optimiser.step()

            print(f"Child: {child_num+1}, Epoch {epoch+1}, Loss: {np.mean(loss_list)}")

        # Evaluate the whole ensemble on the dev set. Children that have not
        # been trained yet still contribute their untrained outputs, so only
        # the stats printed after the last child describe the finished model.
        total_loss = 0.0
        preds = []
        true_labels = []

        model.eval()
        with torch.no_grad():
            for s1, s2, l in val_dl:
                s1, s2, l = s1.to(device), s2.to(device), l.to(device)
                output = model(s1, s2).reshape(l.shape[0])
                # Weight each batch mean by its size so the overall mean is
                # per-sample, whatever the batch size.
                total_loss += criterion(output, l).item() * l.shape[0]
                preds.extend((output > threshold).int().tolist())
                true_labels.extend(l.tolist())

        correct = 0
        TPs, FPs, FNs, TNs = 0, 0, 0, 0
        for truth, pred in zip(true_labels, preds):
            if truth == pred:
                correct += 1
                if truth == 1:
                    TPs += 1
                else:
                    TNs += 1
            elif truth == 1:
                FNs += 1
            else:
                FPs += 1

        num_samples = len(true_labels)
        mean_loss = total_loss / num_samples if num_samples else float("nan")
        accuracy = correct / num_samples if num_samples else 0
        recall = 0 if TPs + FNs == 0 else TPs / (TPs + FNs)
        precision = 0 if TPs + FPs == 0 else TPs / (TPs + FPs)
        f1 = 0 if precision + recall == 0 else 2 * (precision * recall) / (precision + recall)

        print(f"Testing Stats for ensemble after training child #{child_num + 1}")
        print(f"    Mean Loss: {mean_loss}")
        print(f"    Accuracy: {accuracy}")
        print(f"    _____________________________")
        print(f"    |True\\Pred|Positive|Negative|")
        print(f"    |---------|--------|--------|")
        print(f"    |Positive |TPs: {TPs}|FNs: {FNs}|")
        print(f"    |---------|--------|--------|")
        print(f"    |Negative |FPs: {FPs}|TNs: {TNs}|")
        print(f"    |---------|--------|--------|")
        print(f"    ")
        print(f"    Precision: {precision}")
        print(f"    Recall: {recall}")
        print(f"    F1 Score: {f1}")

    return model
        

In [None]:
# test a finished trained model
def eval_classifier(model):
    '''
    Evaluates a trained model on the dev set and prints its statistics

    Loads Data/dev.csv through reformattedDataset (using cached embeddings when
    present), then prints mean loss, accuracy, the confusion matrix, precision,
    recall and F1.

    :param model: module called as model(embeddings_1, embeddings_2) that
        returns one logit per sample
    :return: None
    '''
    val_ds = reformattedDataset("Data/dev.csv", parameters={}, reset_cache=False)
    val_dl = DataLoader(val_ds, batch_size=64, shuffle=False)

    criterion = nn.BCEWithLogitsLoss()

    total_loss = 0.0
    preds = []
    true_labels = []

    threshold = 0  # applied to raw logits; a logit of 0 is a probability of 0.5

    model.eval()
    with torch.no_grad():
        for s1, s2, l in val_dl:
            s1, s2, l = s1.to(device), s2.to(device), l.to(device)
            output = model(s1, s2).reshape(l.shape[0])
            # Weight each batch mean by its size so the overall mean is
            # per-sample, whatever the batch size.
            total_loss += criterion(output, l).item() * l.shape[0]
            preds.extend((output > threshold).int().tolist())
            true_labels.extend(l.tolist())

    correct = 0
    TPs, FPs, FNs, TNs = 0, 0, 0, 0
    for truth, pred in zip(true_labels, preds):
        if truth == pred:
            correct += 1
            if truth == 1:
                TPs += 1
            else:
                TNs += 1
        elif truth == 1:
            FNs += 1
        else:
            FPs += 1

    num_samples = len(true_labels)
    mean_loss = total_loss / num_samples if num_samples else float("nan")
    accuracy = correct / num_samples if num_samples else 0
    recall = 0 if TPs + FNs == 0 else TPs / (TPs + FNs)
    precision = 0 if TPs + FPs == 0 else TPs / (TPs + FPs)
    f1 = 0 if precision + recall == 0 else 2 * (precision * recall) / (precision + recall)

    print(f"Testing Stats for MPNet Classifer (Threshold: {threshold})")
    print(f"    Mean Loss: {mean_loss}")
    print(f"    Accuracy: {accuracy}")
    print(f"    _____________________________")
    print(f"    |True\\Pred|Positive|Negative|")
    print(f"    |---------|--------|--------|")
    print(f"    |Positive |TPs: {TPs}|FNs: {FNs}|")
    print(f"    |---------|--------|--------|")
    print(f"    |Negative |FPs: {FPs}|TNs: {TNs}|")
    print(f"    |---------|--------|--------|")
    print(f"    ")
    print(f"    Precision: {precision}")
    print(f"    Recall: {recall}")
    print(f"    F1 Score: {f1}")

In [8]:
def test_classifier(model, data_path):
    '''
    Predicts a binary label for every row of an unlabelled CSV

    :param model: module called as model(embeddings_1, embeddings_2) that
        returns one logit per sample
    :param data_path: path to a CSV readable by MPNetTestDataset. Cached
        embeddings for that file name are reused when present
    :return: list of int predictions (1 or 0), one per row, in file order
    '''
    test_ds = MPNetTestDataset(data_path, parameters={}, reset_cache=False)
    # shuffle=False keeps predictions aligned with the rows of the CSV
    test_dl = DataLoader(test_ds, batch_size=64, shuffle=False)

    threshold = 0  # applied to raw logits; a logit of 0 is a probability of 0.5
    preds = []

    model.eval()
    with torch.no_grad():
        for s1, s2 in test_dl:
            s1, s2 = s1.to(device), s2.to(device)

            # Flatten to one logit per sample so any batch size works
            output = model(s1, s2).reshape(-1)
            preds.extend((output > threshold).int().tolist())

    return preds

## Notebook "Engine"

In [9]:
# Notebook parameters - decides which of the next cells actually run
# Re-encode Data/train.csv and Data/dev.csv and overwrite the cached embeddings
generate_new_encodings = False
# Train a new ensemble and save its weights to Cached_MPNet/
train_new_classifier = False
# Predict on test_data_path and write Group_17_B.csv
test_the_classifier = False
# Predict on Data/dev.csv and write MPNet_Eval.csv
generate_evaluation_csv = False
# Load the saved weights and print dev-set metrics
evaluate_classifier = False

# CSV used by the cell guarded by test_the_classifier
test_data_path = "Data/test.csv"

In [None]:
# Encode dataset - generate_new_encodings is usually set to False as this means you can spam shift+enter to get through notebook
# reset_cache=True makes the dataset ignore any cached embeddings, recompute
# them with MPNet and overwrite the cached .pt files.
if generate_new_encodings:
    dataset = reformattedDataset("Data/train.csv", parameters={}, reset_cache=True)
    val_dataset = reformattedDataset("Data/dev.csv", parameters={}, reset_cache=True)

In [11]:
# Train a fresh ensemble and persist its weights. The evaluation, test and
# demo cells all load their model from this same file.
if train_new_classifier:
    mpnet_classifier = train_classifier()
    torch.save(mpnet_classifier.state_dict(), "Cached_MPNet/trained_classifier_weights.pt")

train
Loading cached MPNet outputs...
Successfully loaded 27643 MPNet outputs
dev
Loading cached MPNet outputs...
Successfully loaded 5993 MPNet outputs
Training child #1
Child: 1, Epoch 1, Loss: 0.7068110108375549
Child: 1, Epoch 2, Loss: 0.6446200013160706
Child: 1, Epoch 3, Loss: 0.5968915224075317
Child: 1, Epoch 4, Loss: 0.5501615405082703
Child: 1, Epoch 5, Loss: 0.49667781591415405
Child: 1, Epoch 6, Loss: 0.4278343915939331
Child: 1, Epoch 7, Loss: 0.359526664018631
Child: 1, Epoch 8, Loss: 0.29951244592666626
Child: 1, Epoch 9, Loss: 0.2529545724391937
Child: 1, Epoch 10, Loss: 0.20428334176540375
Child: 1, Epoch 11, Loss: 0.17147284746170044
Child: 1, Epoch 12, Loss: 0.1475798785686493
Child: 1, Epoch 13, Loss: 0.12940286099910736
Child: 1, Epoch 14, Loss: 0.11391938477754593
Child: 1, Epoch 15, Loss: 0.09809981286525726
Child: 1, Epoch 16, Loss: 0.08831790834665298
Child: 1, Epoch 17, Loss: 0.08057603985071182
Child: 1, Epoch 18, Loss: 0.07367617636919022
Child: 1, Epoch 19,

In [None]:
# Rebuild the ensemble, load the saved weights and print dev-set metrics.
# children=5 matches the number of members train_classifier builds; the saved
# state_dict only loads into an ensemble with the same number of members.
if evaluate_classifier:
    mpnet_classifier = AverageClassifier(children=5).to(device)
    mpnet_classifier.load_state_dict(torch.load("Cached_MPNet/trained_classifier_weights.pt", weights_only=True, map_location=device))
    eval_classifier(mpnet_classifier)

dev
Loading cached MPNet outputs...
Successfully loaded 5993 MPNet outputs
Testing Stats for MPNet Classifer (Threshold: 0)
    Mean Loss: 1.2083250284194946
    Accuracy: 0.6704488569998331
    _____________________________
    |True\Pred|Positive|Negative|
    |---------|--------|--------|
    |Positive |TPs: 2062|FNs: 994|
    |---------|--------|--------|
    |Negative |FPs: 981|TNs: 1956|
    |---------|--------|--------|
    
    Precision: 0.6776207689779823
    Recall: 0.6747382198952879
    F1 Score: 0.6761764223643221


In [None]:
# Predict labels for the unlabelled test set and write them to Group_17_B.csv.
# test_classifier reuses cached embeddings for this file name when they exist.
# NOTE(review): to_csv is called with its defaults here, so the file contains
# the row index and a "0" column header, whereas the evaluation-CSV cell writes
# a single "prediction" column without an index - confirm which format the
# consumer of this file expects.
if test_the_classifier:
    mpnet_classifier = AverageClassifier().to(device)
    mpnet_classifier.load_state_dict(torch.load("Cached_MPNet/trained_classifier_weights.pt", weights_only=True, map_location=device))
    predictions = test_classifier(mpnet_classifier, test_data_path)
    preds_df = pd.DataFrame(predictions)
    preds_df.to_csv("Group_17_B.csv")

test
Loading cached MPNet outputs...
Successfully loaded 5985 MPNet outputs


In [None]:
# Predict labels for the dev set and write them to MPNet_Eval.csv as a single
# "prediction" column (header row first, no index column).
if generate_evaluation_csv:
    mpnet_classifier = AverageClassifier().to(device)
    mpnet_classifier.load_state_dict(torch.load("Cached_MPNet/trained_classifier_weights.pt", weights_only=True, map_location=device))
    predictions = test_classifier(mpnet_classifier, "Data/dev.csv")
    print("Done Test")
    # Name the column instead of prepending the header text to the data, which
    # turned the column into mixed str/int objects. The written file is the same.
    preds_df = pd.DataFrame({"prediction": predictions})
    preds_df.to_csv("MPNet_Eval.csv", index=False)

dev
Loading cached MPNet outputs...
Successfully loaded 5993 MPNet outputs
Done Test


# Demo block

In [None]:
# load pre-trained weights
# (requires the earlier cells to have been run so that AverageClassifier,
# test_classifier and device are defined)
mpnet_classifier = AverageClassifier().to(device)
mpnet_classifier.load_state_dict(torch.load("Cached_MPNet/trained_classifier_weights.pt", weights_only=True, map_location=device))

# demo.csv is read by MPNetTestDataset, which uses its text_1 and text_2 columns.
# NOTE: test_classifier passes reset_cache=False, so if cached embeddings named
# after this file already exist in Cached_MPNet/ they are reused even when the
# contents of demo.csv have changed - delete them to force re-encoding.
demo_path = "demo.csv"
predictions = test_classifier(mpnet_classifier, demo_path)
print("Created Demo Predictions")
# Written with to_csv defaults: row index plus a "0" column header.
preds_df = pd.DataFrame(predictions)
preds_df.to_csv("Demo_predictions.csv")