# Deeplearning with Transformer Architectures

(Explanation of method here)

## Training Section

In [1]:
# Import all relevant modules

import torch
import torch.nn as nn
from transformers import BigBirdModel, BigBirdTokenizer
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import nltk
import os
from tqdm import tqdm

# CUDA debugging
os.environ['CUDA_LAUNCH_BLOCKING']='1'

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [2]:
# Define Preprocessing

def preprocess_line(line: str, params: set, tokenizer=None) -> list[str]:
    '''
    Preprocesses a line of text

    :param line:
    :param params:
    :return:

    Program flow:
        If the line contains an email, trims out the email header
        Tokenises the line
        Applies various transformations
            Removes stop words
            Stems
            Lemmatises
            Remvoes non-alphanumeric characters
            Sets all lowercase
        Returns the list of tokens
    '''

    if "trim email" in params:
        if "-- Forwarded by" in line:
            before_email = line.split("-")[0].strip()
            email = "-" + line.split("-", 1)[1] if "-" in line else ""
            email_subject = email.split("Subject:")[-1].strip() # if there's no subject, this keeps the whole email
            line = before_email + " " + email_subject
            line = line.strip()
        line
    
    if tokenizer == None:
        tokens = nltk.tokenize.word_tokenize(line)
    else:
        tokens = tokenizer.tokenize(line)
        starts_with_space_labels = [1 if "Ġ" in token else 0 for token in tokens]
        tokens = [token.replace("Ġ","") if "Ġ" in token else token for token in tokens ]
        if len(tokens) != len(starts_with_space_labels):
            print(len(tokens))
            print(len(starts_with_space_labels))
            raise Exception("A token has been removed, labels dont align")

    operations = {
        "stop words": lambda tokens: [token if token.lower() not in nltk.corpus.stopwords.words('english') else "" for token in tokens],
        "stem": lambda tokens: [nltk.PorterStemmer().stem(token) for token in tokens],
        "lemmatise": lambda tokens: [nltk.WordNetLemmatizer().lemmatize(token) for token in tokens],
        "alphanumeric": lambda tokens: ["".join(filter(str.isalnum, token)) for token in tokens],
        "lowercase": lambda tokens: [token.lower() for token in tokens]
    }

    # Apply each operation if we define it in the params set
    for key, action in operations.items():
        if key in params:
            tokens = action(tokens)
    
    if tokenizer == None:
        return [token for token in tokens if token != ""]
    else:
        if len(tokens) != len(starts_with_space_labels):
            print(len(tokens))
            print(len(starts_with_space_labels))
            raise Exception("A token has been removed, labels dont align")
        return [tokens[i] if not starts_with_space_labels[i] else "".join(["Ġ", tokens[i]]) for i in range(len(tokens))]

In [3]:
# Define and create dataset and data loader

class reformattedDataset(Dataset):
    def __init__(self, df_name, parameters, reset_cache=False):
        # Name is in form <path>/<name>.csv
        name = df_name.split("/")[-1]
        name = name.split(".")[0]
        print(name)

        df = pd.read_csv(df_name, nrows=100)
        
        if os.path.exists(f"Cached_Roberta/{name}_output1_roberta.pt") and os.path.exists(f"Cached_Roberta/{name}_output2_roberta.pt") and reset_cache == False:
            print("Loading cached Roberta outputs...")
            loaded_o1 = torch.load(f"Cached_Roberta/{name}_output1_roberta.pt", weights_only=True)
            self.output1_roberta = torch.stack([torch.tensor(t) for t in loaded_o1])
            self.output2_roberta = torch.stack([torch.tensor(t) for t in torch.load(f"Cached_Roberta/{name}_output2_roberta.pt", weights_only=True)])
            print(f"Successfully loaded {len(self.output1_roberta)} Roberta outputs")

            self.labels = df["label"][:len(self.output1_roberta)].reset_index(drop=True)

        else:
            tokenizer = BigBirdTokenizer.from_pretrained("google/bigbird-roberta-base")

            print("Preprocessing and converting tokens to strings...")
            texts1 = pd.Series([tokenizer.convert_tokens_to_string(preprocess_line(row,parameters,tokenizer)) for row in tqdm(df["text_1"], desc="Processing text_1", position=0, leave=True)])
            texts2 = pd.Series([tokenizer.convert_tokens_to_string(preprocess_line(row,parameters,tokenizer)) for row in tqdm(df["text_2"], desc="Processing text_2", position=0, leave=True)])

            print("Tokenising with BigBird")
            encodeds1 = pd.Series([tokenizer(text, return_tensors = "pt", padding="max_length", max_length=514, truncation=True) for text in tqdm(texts1, desc="Tokenising text_1", position=0, leave=True)])
            encodeds2 = pd.Series([tokenizer(text, return_tensors = "pt", padding="max_length", max_length=514, truncation=True) for text in tqdm(texts2, desc="Tokenising text-2", position=0, leave=True)])

            self.inputs1 = pd.Series([encoded["input_ids"].squeeze(0) for encoded in encodeds1])
            self.attention_masks1 = pd.Series([encoded["attention_mask"].squeeze(0) for encoded in encodeds1])
            self.inputs2 = pd.Series([encoded["input_ids"].squeeze(0) for encoded in encodeds2])
            self.attention_masks2 = pd.Series([encoded["attention_mask"].squeeze(0) for encoded in encodeds2])

            # pre compute roberta tensors to save time
            print("Loading BigBird")
            self.roberta = BigBirdModel.from_pretrained("google/bigbird-roberta-base").to(device)
            for parameter in self.roberta.parameters():
                parameter.requires_grad = False

            print("Encoding with BigBird")
            outputs = []
            self.roberta.eval()
            batch_size = 16
            with torch.no_grad():
                for i in tqdm(range(0, len(self.inputs1), batch_size), desc="Encoding input1", position=0, leave=True):
                    batch_inputs = list(self.inputs1[i:i+batch_size])
                    batch_masks = list(self.attention_masks1[i:i+batch_size])

                    batch_inputs_tensor = torch.stack(batch_inputs).to(device)
                    batch_masks_tensor = torch.stack(batch_masks).to(device)

                    batch_outputs = self.roberta(batch_inputs_tensor, attention_mask=batch_masks_tensor).last_hidden_state[:, 0, :]

                    outputs.extend(batch_outputs.cpu())
                    torch.cuda.empty_cache()
            
            self.output1_roberta = torch.stack(outputs)
            torch.save(self.output1_roberta.tolist(), f"Cached_Roberta/{name}_output1_roberta.pt")

            outputs = []
            with torch.no_grad():
                for i in tqdm(range(0, len(self.inputs2), batch_size), desc="Encoding input2", position=0, leave=True):
                    batch_inputs = list(self.inputs2[i:i+batch_size])
                    batch_masks = list(self.attention_masks2[i:i+batch_size])

                    batch_inputs_tensor = torch.stack(batch_inputs).to(device)
                    batch_masks_tensor = torch.stack(batch_masks).to(device)

                    batch_outputs = self.roberta(batch_inputs_tensor, attention_mask=batch_masks_tensor).last_hidden_state[:, 0, :]

                    outputs.extend(batch_outputs.cpu())
                    torch.cuda.empty_cache()

            self.output2_roberta = torch.stack(outputs)
            torch.save(self.output2_roberta.tolist(), f"Cached_Roberta/{name}_output2_roberta.pt")
            
            # self.output1_roberta = torch.stack([self.roberta(input1.unsqueeze(0).to(device), attention_mask=attention_mask1.unsqueeze(0).to(device)).last_hidden_state[:, 0, :].squeeze(0).detach() for input1, attention_mask1 in tqdm(zip(self.inputs1, self.attention_masks1), total=len(self.inputs1), desc="Encoding input1", position=0, leave=True)])
            
            
            # self.output2_roberta = torch.stack([self.roberta(input2.unsqueeze(0).to(device), attention_mask=attention_mask2.unsqueeze(0).to(device)).last_hidden_state[:, 0, :].squeeze(0).detach() for input2, attention_mask2 in tqdm(zip(self.inputs2, self.attention_masks2), total=len(self.inputs2), desc="Encoding input2", position=0, leave=True)])
            # torch.save(self.output2_roberta.tolist(), "Cached_Roberta/output2_roberta.pt")
            # save roberta tensors to file, prevents rerunning each time
            


            self.labels = df["label"]

    def __len__(self) -> int:
        return len(self.output1_roberta)

    def __getitem__(self, i):

        return self.output1_roberta[i], self.output2_roberta[i], torch.tensor(self.labels.iloc[i], dtype=torch.float)

dataset = reformattedDataset("Data/train.csv", parameters={}, reset_cache=False) #TODO try parameters
val_dataset = reformattedDataset("Data/dev.csv", parameters={}, reset_cache=False)
test_dataset = reformattedDataset("Data/AV_trial.csv", parameters={}, reset_cach=True)
dataloader = DataLoader(dataset, batch_size=128, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=1, shuffle=False)

train
Loading cached Roberta outputs...
Successfully loaded 100 Roberta outputs
dev
Loading cached Roberta outputs...
Successfully loaded 100 Roberta outputs


In [4]:
# define our architecture

class classifierRoberta(nn.Module):
    def __init__(self):
        super(classifierRoberta, self).__init__()

        # load and freeze pretrained Roberta
        self.roberta = BigBirdModel.from_pretrained("google/bigbird-roberta-base")
        for parameter in self.roberta.parameters():
            parameter.requires_grad = False 
        
        # add our custom binary classifier layer to end
        self.fc1 = nn.Linear(768*2, 1024)
        self.fc2 = nn.Linear(1024, 512)
        self.fc3 = nn.Linear(512, 1)
        
        self.classifier = nn.Sequential(self.fc1, self.fc2, self.fc3)
        self.sigmoid = nn.Sigmoid() #to convert classifier output to probability

    def forward(self, input1, input2):
        
        # use pre computed Roberta tensors
        combined_inputs = torch.cat((input1.squeeze(1), input2.squeeze(1)), dim=1)

        # pass result through our appended classifier layers
        classifier_output = self.classifier(combined_inputs)
        return classifier_output.squeeze()
        # probability = self.sigmoid(classifier_output)
        # return probability.squeeze() #convert to 1D


In [5]:
# Create Model and Hyperparams
model = classifierRoberta().to(device)
optimiser = torch.optim.AdamW(model.classifier.parameters(), lr=0.00001, weight_decay=0.01)
loss_model = nn.BCEWithLogitsLoss()

In [6]:
# train model

epochs = 25
for epoch in range(epochs):
    
    model.train()
    total_loss = 0

    for s1, s2, l in dataloader:
        s1, s2, l = s1.to(device), s2.to(device), l.to(device)

        optimiser.zero_grad()
        predictions = model(s1, s2)
        
        loss = loss_model(predictions, l)
        total_loss += loss.item()
        loss.backward()
        optimiser.step()

    if epoch % 1000 == 0:
        print(f"Loss {total_loss/len(dataloader)}  Epoch: {epoch} ") 

# save model
torch.save(model.state_dict(), "Cached_Roberta/trained_classifier_weights.pt")

# load model
# model = classifierRoberta().to(device)
# model.load_state_dict(torch.load("Cached_Roberta/trained_classifier_weights.pt", weights_only=True))
# model.eval()




Loss 0.6956040263175964  Epoch: 0 


In [7]:
# TODO find optimal threshhold after training

T = 0.4

In [None]:
def test_classifier(model):
    test_data = torch.load(

In [8]:
model.eval()

def makePrediction(index):
    data = dataset[index]
    input_1 = data["input1"].to(device)
    input_2 = data["input2"].to(device)
    label = data["label"].to(device)

    prediction = model(input_1.unsqueeze(0), input_2.unsqueeze(0))
    return prediction.item(), label.item()

predictions = []
for i in range(len(dataset)):
    prediction, label = makePrediction(i)
    prediction = 1 if prediction > T else 0 
    predictions.append((prediction,int(label)))
    print(predictions[i])

correct = sum(pred == label for pred, label in predictions)
accuracy = correct / len(predictions)
print(f"Accuracy: {accuracy:.3f}")


TypeError: tuple indices must be integers or slices, not str