# Pytorch Training UI

### Import Libraries

In [1]:
# detect whether this is a google environment

COLAB_ENVIRONMENT = False

try:
    from google.colab import drive
    drive.mount('/content/drive')
    COLAB_ENVIRONMENT = True
except:
    pass

In [2]:
import sys
import os
import pickle

if COLAB_ENVIRONMENT:
    py_file_location = "./drive/MyDrive/LAB/COMP90051-A1__Groupwork__Py/PrivatePackages/pytorch" # my private packages are stored here
    home_directory = './drive/MyDrive/LAB/COMP90051-A1__Groupwork__Py/' # my home directory is stored in ./LAB of google drive
    !pip install einops
else:
    py_file_location = './PrivatePackages/pytorch'
    home_directory = './'

sys.path.append(os.path.abspath(py_file_location))

from environment import *
from utils import *

from collections import defaultdict as dd
from typing import Tuple

from sklearn.model_selection import train_test_split

In [3]:
from model.model_class import LSTM, BERT

### Set Seed and Load Data

In [4]:
SEED = 2608

In [5]:
data1 = []
with open(home_directory + '/data/raw/comp90051-2024s1-project-1/domain1_train_data.json', 'r') as f:
    for line in f:
        data1.append(json.loads(line))

data2 = []
with open(home_directory + './data/raw/comp90051-2024s1-project-1/domain2_train_data.json', 'r') as f:
    for line in f:
        data2.append(json.loads(line))

# create domain labels for data
for i in range(len(data1)):
    data1[i]['domain'] = 1
for i in range(len(data2)):
    data2[i]['domain'] = 2

In [6]:
# Train Val Test Split

# get labels for stratification
label1 = [instance['label'] for instance in data1]
label2 = [instance['label'] for instance in data2]

train_ix_1, val_test_ix_1 = train_test_split(range(len(data1)), test_size=0.3, random_state=SEED, stratify = label1)
train_ix_2, val_test_ix_2 = train_test_split(range(len(data2)), test_size=0.3, random_state=SEED, stratify = label2)
val_ix_1, test_ix_1 = train_test_split(val_test_ix_1, test_size=0.5, random_state=SEED, stratify = [data1[i]['label'] for i in val_test_ix_1])
val_ix_2, test_ix_2 = train_test_split(val_test_ix_2, test_size=0.5, random_state=SEED, stratify = [data2[i]['label'] for i in val_test_ix_2])

# split data according to the index from train_test_split
train_data_1 = [data1[i] for i in train_ix_1]
val_data_1 = [data1[i] for i in val_ix_1]
test_data_1 = [data1[i] for i in test_ix_1]
train_data_2 = [data2[i] for i in train_ix_2]
val_data_2 = [data2[i] for i in val_ix_2]
test_data_2 = [data2[i] for i in test_ix_2]

# combine the data
train_data = train_data_1 + train_data_2
val_data = val_data_1 + val_data_2
test_data = test_data_1 + test_data_2

---
Preprocess data

In [7]:
def crop_sentence_length(data: list, max_sentence_length: int, make_cropped_remains_into_new_instance: bool) -> dict:
    
    """ For every sentence, reduce the length of sentence to a fixed value 
    if it is over the max_length """

    new_data = []

    for instance in tqdm(data):
        for i in range(len(data)//max_sentence_length+1):

            new_instance = {}
            
            cropped_text = instance['text'][i*max_sentence_length:(i+1)*max_sentence_length]

            if len(cropped_text) == 0:
                continue

            if i == 0 or make_cropped_remains_into_new_instance:

                new_instance['text'] = cropped_text
                new_instance['label'] = instance['label'] 
                new_instance['domain'] = instance['domain']
                new_instance['id'] = instance['id'] + (i*0.01) # TODO
                
                new_instance['remains'] = 0 if i == 0 else 1 
                
                new_data.append(new_instance)

            else:
                break

    return new_data

In [8]:
def get_raw_token_pytorch_map(data: list, min_frequency: int = 0) -> dict:

    """ get a mapping that:
        1. places restrictions on how many times an instance is observed to be eligible to be learnt in model as an embedding
        2. get a map of raw token values to compressed and sorted token values, with addition of CLS, Padding and Unknown.
    """
    
    tally_of_unique_train_tokens = dd(int)
    for instance in tqdm(data):
        for token in instance['text']:
            tally_of_unique_train_tokens[token] += 1
    
    if 1 not in tally_of_unique_train_tokens: # ensure that 1 is in there so not to mess up our mapping (1 is this dataset's unknown)
        tally_of_unique_train_tokens[1] = np.inf

    list_of_unique_train_tokens = list(tally_of_unique_train_tokens.items())
    list_of_unique_train_tokens = [token_count[0] for token_count in list_of_unique_train_tokens if token_count[1] > min_frequency]
    list_of_unique_train_tokens.sort() # so to keep our original tokens in order

    # Need to compress the used tokens (in training) onto a denser map for pytorch embedding
    raw_token_pytorch_map = {token: i+2 for i, token in enumerate(list_of_unique_train_tokens)}
    raw_token_pytorch_map['CLS'] = 0 # CLS takes on 0 in our map
    raw_token_pytorch_map['PAD'] = 1 # Padding takes on 1 in our map
    raw_token_pytorch_map['UNK'] = 2 # Unknown takes on 2 in our map as per data (in data 1 was value of unknown)

    # even if we don't decide to use CLS, it doesn't affect our model at all.

    return raw_token_pytorch_map

In [9]:
def ReTokenise_Tokens(data, raw_token_pytorch_map, max_sentence_length, CLS=True):
    
    """ Convert the Token index into one useable by Pytorch Embedding Layer """
    
    data = copy.deepcopy(data)
    for instance in tqdm(data):
        instance['text'] = instance['text'] = [raw_token_pytorch_map[token] if token in raw_token_pytorch_map else raw_token_pytorch_map['UNK'] for token in instance['text']]
        if CLS: 
            instance['text'] = [raw_token_pytorch_map['CLS']] + instance['text']

    if max_sentence_length:
        for instance in data:
            if len(instance['text']) < max_sentence_length:
                instance['text'] = instance['text'] + [raw_token_pytorch_map['PAD']] * (max_sentence_length - len(instance['text'])) # 1 is pad in our map
            else:
                instance['text'] = instance['text'][:max_sentence_length]

    return data

In [10]:
def Data_Factory(train_data, val_data, test_data, max_sentence_length, raw_token_pytorch_map, CLS=True):
    
    """ Convert our (cropped) data into train x, train y, val x, val y, test x, test y etc """

    train_data_transformed = ReTokenise_Tokens(train_data, raw_token_pytorch_map, max_sentence_length, CLS)
    val_data_transformed = ReTokenise_Tokens(val_data, raw_token_pytorch_map, max_sentence_length, CLS)
    test_data_transformed = ReTokenise_Tokens(test_data, raw_token_pytorch_map, max_sentence_length, CLS)

    train_x = [instance['text'] for instance in train_data_transformed]
    train_y = [instance['label'] for instance in train_data_transformed]
    val_x = [instance['text'] for instance in val_data_transformed]
    val_y = [instance['label'] for instance in val_data_transformed]
    test_x = [instance['text'] for instance in test_data_transformed]
    test_y = [instance['label'] for instance in test_data_transformed]

    train_y = [[0, 1] if label == 1 else [1, 0] for label in train_y] #TODO: check this
    val_y = [[0, 1] if label == 1 else [1, 0] for label in val_y]
    test_y = [[0, 1] if label == 1 else [1, 0] for label in test_y]

    return train_x, train_y, val_x, val_y, test_x, test_y
    

In [11]:
def get_distribution(train_y) -> Tuple[float, float]:

    """ get the distribution of labels in this set - for processing the loss function """

    label = [y[1] for y in train_y]

    return np.mean(label), 1-np.mean(label)

In [12]:
def W2V_DataFactory(data: list, context_window: int, seed: int, raw_token_pytorch_map: dict, k) -> list:

    """ Get W2V training data """
    
    assert context_window % 2 == 1, 'context window must be odd'

    np.random.seed(seed)

    MAX_SAMPLED_NEGATIVE_TOKENS = 10000

    retokenised_keys = list(raw_token_pytorch_map.keys())

    negative_tokens = np.random.choice(retokenised_keys, MAX_SAMPLED_NEGATIVE_TOKENS)

    negative_up_to = 0

    w2v_data = []

    for instance in tqdm(data):
        tokens = [context_window//2 * 'CLS'] + instance['text'] + [context_window//2 * raw_token_pytorch_map['PAD']]

        for i in range(context_window//2, len(tokens) - context_window//2):
            
            focus_token_retokenised = raw_token_pytorch_map.get(tokens[i], raw_token_pytorch_map['UNK'])
            context_words = set()

            for j in range(-context_window//2, context_window//2+1):
                if j != 0: # don't want to make positive sample with self
                    if tokens[j] in context_words: # CLS and Padding (being start and end) being repeated
                        continue 
                    
                    new_instance = {'token': focus_token_retokenised, 'context': raw_token_pytorch_map.get(tokens[j], raw_token_pytorch_map['UNK']), 'label': 1}
                    w2v_data.append(new_instance)
                    context_words.add(tokens[j])
            
            for j in range(len(context_words)): # sample the same number of negatives
                # TODO: different for each round?
                while True:
                    
                    if negative_up_to == MAX_SAMPLED_NEGATIVE_TOKENS:
                        negative_up_to = 0
                        #TODO: shuffle

                    sampled_negative_retokenised = negative_tokens[negative_up_to]
                    negative_up_to += 1
                    if sampled_negative_retokenised not in context_words: # didn't sample a positive case
                        break

                new_instance = {'token': focus_token_retokenised, 'context': sampled_negative_retokenised, 'label': 0}
                w2v_data.append(new_instance)
    
    return w2v_data

In [13]:
MAX_SENTENCE_LENGTH = 128
MIN_FREQUENCY = 40 # because 40 is statistical sample requirement
MAKE_CROPPED_REMAINS_INTO_NEW_INSTANCE = False
W2V_CONTEXT_WINDOW = 5 # 2 to left, 2 to right

In [14]:
cropped_train_data = crop_sentence_length(train_data, max_sentence_length = MAX_SENTENCE_LENGTH, make_cropped_remains_into_new_instance =MAKE_CROPPED_REMAINS_INTO_NEW_INSTANCE)
cropped_val_data = crop_sentence_length(val_data, max_sentence_length =  MAX_SENTENCE_LENGTH, make_cropped_remains_into_new_instance = MAKE_CROPPED_REMAINS_INTO_NEW_INSTANCE)
cropped_test_data = crop_sentence_length(test_data, max_sentence_length = MAX_SENTENCE_LENGTH, make_cropped_remains_into_new_instance =MAKE_CROPPED_REMAINS_INTO_NEW_INSTANCE)

 30%|██▉       | 3766/12600 [00:00<00:00, 37654.74it/s]

100%|██████████| 12600/12600 [00:00<00:00, 26908.93it/s]
100%|██████████| 2700/2700 [00:00<00:00, 94999.63it/s]
100%|██████████| 2700/2700 [00:00<00:00, 120369.69it/s]


In [15]:
raw_token_pytorch_map = get_raw_token_pytorch_map(data = cropped_train_data, min_frequency = MIN_FREQUENCY) 

  0%|          | 0/12600 [00:00<?, ?it/s]

100%|██████████| 12600/12600 [00:00<00:00, 48064.47it/s]


In [16]:
train_x, train_y, val_x, val_y, test_x, test_y = Data_Factory(cropped_train_data, \
                                                              cropped_val_data, \
                                                                cropped_test_data, \
                                                                    MAX_SENTENCE_LENGTH, \
                                                                        raw_token_pytorch_map, \
                                                                            CLS=True)

100%|██████████| 12600/12600 [00:00<00:00, 70358.86it/s]
100%|██████████| 2700/2700 [00:00<00:00, 60702.95it/s]
100%|██████████| 2700/2700 [00:00<00:00, 55708.92it/s]


In [17]:
pos_prior, neg_prior = get_distribution(train_y)

In [18]:
train_w2v_data = W2V_DataFactory(train_data, context_window= W2V_CONTEXT_WINDOW, seed = SEED, raw_token_pytorch_map = raw_token_pytorch_map)
val_w2v_data = W2V_DataFactory(val_data, context_window= W2V_CONTEXT_WINDOW, seed = SEED, raw_token_pytorch_map = raw_token_pytorch_map)

TypeError: W2V_DataFactory() missing 1 required positional argument: 'k'

In [None]:
# class Dataset():
#     """ Pytorch style dataset """

#     def __init__(self, data, maxlen):
#         self.data = data
#         self.maxlen = maxlen
    
#     def __len__(self):
#         return len(self.data)
    
#     def __getitem__(self, index):
#         return self.data[index]['text'], self.data[index]['label']
#         # return self.data[index]['text'], self.data[index]['label'], self.data[index]['domain']

---
### Models

#### 1. Prediction only

In [18]:
# LSTM

class LSTM_config:
    # ----------------- architectual hyperparameters ----------------- #
    d_model = 256
    n_recurrent_layers = 1
    bidirectional = True
    n_heads = 8
    dropout = 0.1
    n_mlp_layers = 0
    flatten = False
    activation = nn.ReLU()
    res_learning = False
    mask_flag = False # TODO
    train_embedding = False
    # ----------------- optimisation hyperparameters ----------------- #
    random_state = SEED
    batch_size = 128
    epochs = 32
    lr = 1e-3
    patience = 5
    # loss = nn.BCELoss()
    loss = WeightedBinaryCrossEntropyLoss(1-pos_prior, 1-neg_prior)
    # validation_loss = nn.BCELoss()
    validation_loss = WeightedBinaryCrossEntropyLoss(1-pos_prior, 1-neg_prior)
    regularisation_loss = None
    scheduler = True
    grad_clip = True
    # ----------------- operation hyperparameters ----------------- #
    d_output = 2
    seq_len = MAX_SENTENCE_LENGTH
    n_unique_tokens = len(raw_token_pytorch_map)
    # ----------------- saving hyperparameters ----------------- #
    rootpath = home_directory + './'
    saving_address = home_directory +  f'./results/'
    name = f'LSTM_Classifier'
    


model = LSTM(LSTM_config) # initialise the model

# train the model (all cells except this one will print training log and evaluation at each batch)
best_epoch = model.fit(train_x, train_y, val_x, val_y)
print()

# as model automatically saves best epoch, will now load the best epoch and evaluate on test set
model.load()
model.eval(val_x, val_y, best_epoch, evaluation_mode = True)

  from .autonotebook import tqdm as notebook_tqdm
  nn.utils.clip_grad_norm(self.model.parameters(), 2)
  3%|▎         | 3/99 [00:22<11:54,  7.44s/it]


KeyboardInterrupt: 

In [None]:
# BERT

class BERT_config:
    # ----------------- architectual hyperparameters ----------------- #
    d_model = 128
    d_ff = 512 # = 4* d_model
    n_heads = 8
    dropout = 0.1
    e_layers = 3 
    embedding_aggregation = 'cls' # TODO
    n_mlp_layers = 0
    res_learning = False
    activation = nn.ReLU()
    mask_flag = False # causal mask
    train_embedding = False
    # ----------------- optimisation hyperparameters ----------------- #
    random_state = SEED
    batch_size = 128
    epochs = 2
    lr = 1e-3
    patience = 2
    loss = nn.BCELoss()
    validation_loss = nn.BCELoss()
    regularisation_loss = None
    scheduler = False
    grad_clip = False
    # ----------------- operation hyperparameters ----------------- #
    d_output = 2
    seq_len = MAX_SENTENCE_LENGTH
    n_unique_tokens = len(raw_token_pytorch_map)
    # ----------------- saving hyperparameters ----------------- #
    rootpath = home_directory + './'
    saving_address = home_directory + f'./results/'
    name = f'BERT_Classifier'
    


model = BERT(BERT_config) # initialise the model

# train the model (all cells except this one will print training log and evaluation at each batch)
best_epoch = model.fit(train_x, train_y, val_x, val_y)
print()

# as model automatically saves best epoch, will now load the best epoch and evaluate on test set
model.load()
model.eval(val_x, val_y, best_epoch, evaluation_mode = True)

 99%|█████████▉| 98/99 [03:31<00:02,  2.99s/it]

--- 
### W2V

In [None]:
import torch
import torch.nn as nn

class SkipGramLoss(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(SkipGramLoss, self).__init__()
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        
        # Define the embedding layer
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        
        # Initialize the weights of the embedding layer
        self.embeddings.weight.data.uniform_(-1, 1)
        
        # Define the output layer
        self.output_layer = nn.Linear(embedding_dim, vocab_size)
        
    def forward(self, input_word_indices, context_word_indices):
        # Lookup embeddings for input word indices
        input_embeddings = self.embeddings(input_word_indices)
        
        # Predict context word logits
        output_logits = self.output_layer(input_embeddings)
        
        # Calculate the loss
        loss = nn.CrossEntropyLoss()(output_logits, context_word_indices)
        
        return loss


---
### Domain Adversarial Network