# Deep Learning with Transformer Architectures

(Explanation of method here)

## Training Section

In [1]:
# Import all relevant modules

import torch
import torch.nn as nn
from transformers import BigBirdModel, BigBirdTokenizer
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import nltk
import os

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

  from .autonotebook import tqdm as notebook_tqdm


cuda


In [2]:
# Define Preprocessing

def preprocess_line(line: str, params: set, tokenizer=None) -> list[str]:
    '''
    Preprocesses a line of text

    :param line: The raw text to preprocess.
    :param params: Collection of option names selecting which steps run. Only
        membership tests (``in``) are performed on it. Recognised names:
        "trim email", "stop words", "stem", "lemmatise", "alphanumeric",
        "lowercase".
    :param tokenizer: Optional object exposing ``tokenize(str) -> list[str]``.
        When omitted, ``nltk.tokenize.word_tokenize`` is used instead.
    :return: The list of processed tokens. Without a tokenizer, tokens that
        became empty are dropped. With a tokenizer, the token count is
        preserved (emptied tokens stay as "") and the "Ġ" marker is put back
        on the front of every token that originally contained it.
    :raises Exception: If a tokenizer was given and the number of tokens
        changed while the operations were applied.

    Program flow:
        If the line contains a forwarded email, trims out the email header
        Tokenises the line
        Applies the selected transformations, always in this order
            Blanks out stop words
            Stems
            Lemmatises
            Removes non-alphanumeric characters
            Sets all lowercase
        Returns the list of tokens
    '''

    forward_marker = "-- Forwarded by"
    if "trim email" in params and forward_marker in line:
        # Cut at the forwarded-mail marker rather than at the first "-" in the
        # line, so hyphenated words in the preceding text are not truncated.
        # The marker normally ends a longer run of dashes, so back up over it.
        prefix = line[:line.find(forward_marker)].rstrip("-")
        email = line[len(prefix):]
        email_subject = email.split("Subject:")[-1].strip()  # if there's no subject, this keeps the whole email
        line = (prefix.strip() + " " + email_subject).strip()

    if tokenizer is None:
        tokens = nltk.tokenize.word_tokenize(line)
        starts_with_space_labels = None
    else:
        tokens = tokenizer.tokenize(line)
        # Remember which tokens carried the "Ġ" marker so it can be restored
        # after the text transformations, then strip it for processing.
        # NOTE(review): "Ġ" is the word-start marker of byte-level BPE
        # vocabularies; confirm the tokenizer passed in actually emits it.
        starts_with_space_labels = ["Ġ" in token for token in tokens]
        tokens = [token.replace("Ġ", "") for token in tokens]

    # Heavy NLTK resources are created once per call, and only when the
    # corresponding operation is actually requested.
    def _blank_stop_words(tokens):
        stop_words = set(nltk.corpus.stopwords.words('english'))
        return [token if token.lower() not in stop_words else "" for token in tokens]

    def _stem(tokens):
        stemmer = nltk.PorterStemmer()
        return [stemmer.stem(token) for token in tokens]

    def _lemmatise(tokens):
        lemmatiser = nltk.WordNetLemmatizer()
        return [lemmatiser.lemmatize(token) for token in tokens]

    # Insertion order of this dict is the order in which operations are applied.
    operations = {
        "stop words": _blank_stop_words,
        "stem": _stem,
        "lemmatise": _lemmatise,
        "alphanumeric": lambda tokens: ["".join(filter(str.isalnum, token)) for token in tokens],
        "lowercase": lambda tokens: [token.lower() for token in tokens],
    }

    # Apply each operation if we define it in the params set
    for key, action in operations.items():
        if key in params:
            tokens = action(tokens)

    if tokenizer is None:
        return [token for token in tokens if token != ""]

    # Every operation must map tokens one-to-one, otherwise the saved marker
    # flags no longer line up with the tokens.
    if len(tokens) != len(starts_with_space_labels):
        print(len(tokens))
        print(len(starts_with_space_labels))
        raise Exception("A token has been removed, labels dont align")
    return [
        "Ġ" + token if had_marker else token
        for token, had_marker in zip(tokens, starts_with_space_labels)
    ]

In [None]:
# Define and create dataset and data loader

class reformattedDataset(Dataset):
    '''
    Dataset of pre-computed encoder embeddings for pairs of texts.

    Each item is a dict with keys "input1" and "input2" (the embedding taken
    at sequence position 0 of the frozen BigBird encoder's last hidden state
    for ``text_1`` / ``text_2``) and "label" (a float tensor on ``device``).

    Embeddings are cached under ``Cached_Roberta/`` so they only have to be
    computed once.

    NOTE(review): the cache is not keyed on ``df`` or ``parameters``; pass
    ``reset_cache=True`` whenever either of them changes.
    '''

    def __init__(self, df, parameters, reset_cache=False):
        '''
        :param df: DataFrame with columns "text_1", "text_2" and "label".
        :param parameters: Preprocessing options forwarded to ``preprocess_line``.
        :param reset_cache: When True, ignore any cached embeddings, recompute
            them and overwrite the cache files.
        :raises ValueError: If the cache holds more rows than ``df``.
        '''
        cache_dir = "Cached_Roberta"
        cache_path1 = "Cached_Roberta/output1_roberta.pt"
        cache_path2 = "Cached_Roberta/output2_roberta.pt"

        if not reset_cache and os.path.exists(cache_path1) and os.path.exists(cache_path2):
            print("Loading cached Roberta outputs...")
            self.output1_roberta = self._to_embedding_tensor(
                torch.load(cache_path1, weights_only=True, map_location="cpu")).to(device)
            self.output2_roberta = self._to_embedding_tensor(
                torch.load(cache_path2, weights_only=True, map_location="cpu")).to(device)
            print(f"Successfully loaded {len(self.output1_roberta)} Roberta outputs")

            if len(self.output1_roberta) > len(df):
                # Without this check the mismatch only surfaces later as an
                # IndexError inside __getitem__.
                raise ValueError(
                    f"Cache holds {len(self.output1_roberta)} embeddings but df has only "
                    f"{len(df)} rows; rebuild it with reset_cache=True")
            self.labels = df["label"][:len(self.output1_roberta)].reset_index(drop=True)

        else:
            tokenizer = BigBirdTokenizer.from_pretrained("google/bigbird-roberta-base")
            texts1 = pd.Series([tokenizer.convert_tokens_to_string(preprocess_line(row, parameters, tokenizer)) for row in df["text_1"]])
            texts2 = pd.Series([tokenizer.convert_tokens_to_string(preprocess_line(row, parameters, tokenizer)) for row in df["text_2"]])
            print(texts1)

            # Every text is padded/truncated to the same fixed length of 514 tokens.
            encodeds1 = pd.Series([tokenizer(text, return_tensors="pt", padding="max_length", max_length=514, truncation=True) for text in texts1])
            encodeds2 = pd.Series([tokenizer(text, return_tensors="pt", padding="max_length", max_length=514, truncation=True) for text in texts2])

            self.inputs1 = pd.Series([encoded["input_ids"].squeeze(0) for encoded in encodeds1])
            self.attention_masks1 = pd.Series([encoded["attention_mask"].squeeze(0) for encoded in encodeds1])
            self.inputs2 = pd.Series([encoded["input_ids"].squeeze(0) for encoded in encodeds2])
            self.attention_masks2 = pd.Series([encoded["attention_mask"].squeeze(0) for encoded in encodeds2])

            # pre compute roberta tensors to save time
            self.roberta = BigBirdModel.from_pretrained("google/bigbird-roberta-base").to(device)
            for parameter in self.roberta.parameters():
                parameter.requires_grad = False
            self.output1_roberta = self._embed(self.inputs1, self.attention_masks1)
            self.output2_roberta = self._embed(self.inputs2, self.attention_masks2)

            # save roberta tensors to file, prevents rerunning each time.
            # Stored as CPU tensors so the cache loads on machines without a GPU.
            os.makedirs(cache_dir, exist_ok=True)
            torch.save(self.output1_roberta.cpu(), cache_path1)
            torch.save(self.output2_roberta.cpu(), cache_path2)

            self.labels = df["label"]

    @staticmethod
    def _to_embedding_tensor(cached):
        '''
        Converts a loaded cache object into one stacked tensor.

        Accepts a tensor (returned unchanged), a sequence of tensors, or the
        nested lists of floats written by earlier versions of this class.
        '''
        if isinstance(cached, torch.Tensor):
            return cached
        if len(cached) > 0 and all(isinstance(item, torch.Tensor) for item in cached):
            return torch.stack(list(cached))
        return torch.tensor(cached, dtype=torch.float)

    def _embed(self, input_ids, attention_masks):
        '''
        Runs the frozen encoder over each text, one at a time, and stacks the
        hidden state taken at sequence position 0 of every text.
        '''
        embeddings = []
        # The encoder is frozen, so no autograd graph is needed.
        with torch.no_grad():
            for ids, mask in zip(input_ids, attention_masks):
                hidden = self.roberta(
                    ids.unsqueeze(0).to(device),
                    attention_mask=mask.unsqueeze(0).to(device),
                ).last_hidden_state
                embeddings.append(hidden[:, 0, :].squeeze(0).detach())
        return torch.stack(embeddings)

    def __len__(self) -> int:
        '''Returns the number of text pairs with a stored embedding.'''
        return len(self.output1_roberta)

    def __getitem__(self, i):
        '''Returns the two embeddings and the float label of pair ``i``.'''

        return {
            "input1": self.output1_roberta[i],
            "input2": self.output2_roberta[i],
            "label": torch.tensor(self.labels.iloc[i], dtype=torch.float).to(device)
        }

# Build the training set from the first 27644 rows of the CSV. No preprocessing
# options are enabled yet; an empty set is passed because preprocess_line
# declares `params` as a set (`{}` would create a dict).
# reset_cache=True forces the embeddings to be recomputed on every run.
dataset = reformattedDataset(pd.read_csv("Data/train.csv", nrows=27644), parameters=set(), reset_cache=True) #TODO try parameters
dataloader = DataLoader(dataset, batch_size=128, shuffle=True)

0     halloween with kids is fun - getting them to b...
1     everybody sing! urlLink http://www.smokeybear....
2     'The preservation of health is a duty. Few see...
3     Following 1940's SAPS AT SEA , Stan Laurel and...
4     Marybeth: The call's ended and I'll be availab...
                            ...                        
95    I have created a new subdirectory aptly called...
96    Well there isnt much to wirte about god damn t...
97    urlLink Unstuff this file in Applications/Ghos...
98    urlLink FOXNews.com - Top Stories - World Join...
99    ACCESS.BENEFITS.2001 Benefits Customer Service...
Length: 100, dtype: object


Attention type 'block_sparse' is not possible if sequence_length: 514 <= num global tokens: 2 * config.block_size + min. num sliding tokens: 3 * config.block_size + config.num_random_blocks * config.block_size + additional buffer: config.num_random_blocks * config.block_size = 704 with config.block_size = 64, config.num_random_blocks = 3. Changing attention type to 'original_full'...


In [4]:
# define our architecture

class classifierRoberta(nn.Module):
    '''
    Binary classifier over a pair of pre-computed text embeddings.

    The two 768-dimensional embeddings are concatenated and passed through a
    single linear layer followed by a sigmoid, giving one probability per pair.
    '''

    def __init__(self):
        super().__init__()

        # load and freeze pretrained Roberta
        # NOTE(review): forward() never calls this encoder (the embeddings are
        # pre-computed), but it is registered as a submodule, so its weights
        # are part of state_dict(). Removing it would change the saved keys.
        self.roberta = BigBirdModel.from_pretrained("google/bigbird-roberta-base")
        for parameter in self.roberta.parameters():
            parameter.requires_grad = False

        # add our custom binary classifier layer to end
        self.classifier = nn.Linear(768*2, 1)
        self.sigmoid = nn.Sigmoid() #to convert classifier output to probability

    def forward(self, input1, input2):
        '''
        :param input1: Embeddings of the first texts; a size-1 dimension at
            index 1, if present, is squeezed away before concatenation.
        :param input2: Embeddings of the second texts, same layout as input1.
        :return: 1-D tensor with one probability per pair in the batch.
        '''

        # use pre computed Roberta tensors
        combined_inputs = torch.cat((input1.squeeze(1), input2.squeeze(1)), dim=1)

        # pass result through our appended classifier layers
        classifier_output = self.classifier(combined_inputs)
        probability = self.sigmoid(classifier_output)
        # Drop only the trailing feature dimension. A bare squeeze() would also
        # collapse a batch of one to a 0-d tensor, which no longer matches the
        # shape of the label batch.
        return probability.squeeze(-1) #convert to 1D


In [5]:
# Create and train model

# Only the classifier head's parameters are handed to the optimiser.
model = classifierRoberta().to(device)
optimiser = torch.optim.AdamW(model.classifier.parameters(), lr=0.00001, weight_decay=0.01)
# The model returns sigmoid probabilities, which is the input BCELoss expects.
loss_model = nn.BCELoss()

epochs = 25_000
for epoch in range(epochs):
    
    model.train()
    total_loss = 0  # sum of per-batch losses for this epoch

    for batch in dataloader:
        
        input_1 = batch["input1"].to(device)
        input_2 = batch["input2"].to(device)
        label = batch["label"].to(device)

        # Standard step: clear old gradients, forward, loss, backward, update.
        optimiser.zero_grad()
        predictions = model(input_1, input_2)
        
        loss = loss_model(predictions, label)
        total_loss += loss.item()
        loss.backward()
        optimiser.step()

    # Report the mean per-batch loss every 1000 epochs.
    if epoch % 1000 == 0:
        print(f"Loss {total_loss/len(dataloader)}  Epoch: {epoch} ") 

# save model
# NOTE(review): state_dict() covers every registered submodule, not only the
# trained classifier layer.
torch.save(model.state_dict(), "Cached_Roberta/trained_classifier_weights.pt")

# load model
# model = classifierRoberta().to(device)
# model.load_state_dict(torch.load("Cached_Roberta/trained_classifier_weights.pt", weights_only=True))
# model.eval()




Loss 0.7076087594032288  Epoch: 0 
Loss 0.6445196270942688  Epoch: 1000 
Loss 0.6001457571983337  Epoch: 2000 
Loss 0.5600441694259644  Epoch: 3000 
Loss 0.523388147354126  Epoch: 4000 
Loss 0.4896755814552307  Epoch: 5000 
Loss 0.4585280120372772  Epoch: 6000 
Loss 0.4296446144580841  Epoch: 7000 
Loss 0.4027692973613739  Epoch: 8000 
Loss 0.3776927888393402  Epoch: 9000 
Loss 0.35425063967704773  Epoch: 10000 
Loss 0.33230605721473694  Epoch: 11000 
Loss 0.3117361068725586  Epoch: 12000 
Loss 0.2924351692199707  Epoch: 13000 
Loss 0.27430781722068787  Epoch: 14000 
Loss 0.25726643204689026  Epoch: 15000 
Loss 0.2412378191947937  Epoch: 16000 
Loss 0.2261602133512497  Epoch: 17000 
Loss 0.21198096871376038  Epoch: 18000 
Loss 0.1986524909734726  Epoch: 19000 
Loss 0.18612729012966156  Epoch: 20000 
Loss 0.17435844242572784  Epoch: 21000 
Loss 0.1633017659187317  Epoch: 22000 
Loss 0.15291500091552734  Epoch: 23000 
Loss 0.1431588977575302  Epoch: 24000 


In [6]:
# TODO find optimal threshold after training

# Decision threshold: a predicted probability above T is classified as 1,
# anything else as 0.
T = 0.4

In [10]:
# Put the model into evaluation mode before running inference.
model.eval()

def makePrediction(index):
    '''
    Runs the model on a single dataset item.

    :param index: Position of the item in the global ``dataset``.
    :return: Tuple ``(probability, label)`` as Python floats, where
        ``probability`` is the model output for the pair and ``label`` is the
        item's stored label.
    '''
    data = dataset[index]
    input_1 = data["input1"].to(device)
    input_2 = data["input2"].to(device)
    label = data["label"].to(device)

    # Inference only: disable autograd so no graph is built through the
    # trainable classifier layer on every call.
    with torch.no_grad():
        # unsqueeze(0) adds the batch dimension the model expects.
        prediction = model(input_1.unsqueeze(0), input_2.unsqueeze(0))
    return prediction.item(), label.item()

# Threshold every model output at T and pair it with its ground-truth label.
predictions = []
for i in range(len(dataset)):
    prediction, label = makePrediction(i)
    prediction = int(prediction > T)
    predictions.append((prediction, int(label)))
    print(predictions[-1])

# Accuracy is the fraction of pairs whose thresholded prediction equals the label.
correct = sum(1 for pred, label in predictions if pred == label)
accuracy = correct / len(predictions)
print(f"Accuracy: {accuracy:.3f}")


(0, 0)
(0, 0)
(1, 1)
(0, 0)
(1, 1)
(0, 0)
(1, 1)
(1, 1)
(0, 0)
(0, 0)
(0, 0)
(1, 1)
(0, 0)
(1, 1)
(0, 0)
(0, 0)
(1, 1)
(0, 0)
(1, 1)
(0, 0)
(1, 1)
(1, 1)
(1, 1)
(1, 1)
(0, 0)
(1, 1)
(1, 1)
(1, 1)
(0, 0)
(0, 0)
(0, 0)
(0, 0)
(1, 1)
(1, 1)
(1, 1)
(0, 0)
(0, 0)
(0, 0)
(1, 1)
(1, 1)
(1, 1)
(0, 0)
(1, 1)
(1, 1)
(1, 1)
(1, 1)
(1, 1)
(0, 0)
(1, 1)
(0, 0)
(1, 1)
(1, 1)
(0, 0)
(0, 0)
(0, 0)
(1, 1)
(1, 1)
(0, 0)
(1, 1)
(0, 0)
(0, 0)
(1, 1)
(1, 1)
(0, 0)
(1, 1)
(1, 1)
(1, 1)
(1, 1)
(1, 1)
(1, 1)
(0, 0)
(0, 0)
(0, 0)
(0, 0)
(1, 1)
(1, 1)
(1, 1)
(0, 0)
(0, 0)
(0, 0)
(0, 0)
(1, 1)
(0, 0)
(0, 0)
(0, 0)
(0, 0)
(0, 0)
(0, 0)
(1, 1)
(0, 0)
(0, 0)
(1, 1)
(1, 1)
(0, 0)
(0, 0)
(0, 0)
(1, 1)
(0, 0)
(0, 0)
(1, 1)
Accuracy: 1.000
