### Import Libraries

In [1]:
import sys
import os
sys.path.append(os.path.abspath('./PrivatePackages/pytorch/'))

from environment import *
from utils import *

from sklearn.model_selection import train_test_split

In [2]:
from model.model_class import LSTM, BERT

  from .autonotebook import tqdm as notebook_tqdm


### Set Seed and Load Data

In [3]:
# Single random seed reused by the stratified splits and by both model configs in this notebook.
SEED = 2608

In [4]:
def _load_jsonl(path, domain):
    """Read a JSON-lines file and tag every record with its domain id.

    Args:
        path: Location of a file holding one JSON object per line.
        domain: Value stored under the ``'domain'`` key of each record.

    Returns:
        List of the decoded records, in file order.
    """
    records = []
    # Explicit encoding so the result does not depend on the platform default.
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank / trailing empty lines instead of crashing in json.loads
            record = json.loads(line)
            record['domain'] = domain  # create domain label for the record
            records.append(record)
    return records


data1 = _load_jsonl('./data/raw/comp90051-2024s1-project-1/domain1_train_data.json', 1)
data2 = _load_jsonl('./data/raw/comp90051-2024s1-project-1/domain2_train_data.json', 2)

In [5]:
# Train / validation / test split: 30% is held out of each domain and then halved,
# with every split stratified on the label.

def _stratified_indices(records, seed):
    """Split one domain's records into stratified index sets.

    Returns:
        (labels, train_ix, val_test_ix, val_ix, test_ix), where ``val_ix`` and
        ``test_ix`` are the two halves of the held-out ``val_test_ix``.
    """
    labels = [record['label'] for record in records]
    train_ix, val_test_ix = train_test_split(
        range(len(records)), test_size=0.3, random_state=seed, stratify=labels
    )
    val_ix, test_ix = train_test_split(
        val_test_ix,
        test_size=0.5,
        random_state=seed,
        stratify=[records[i]['label'] for i in val_test_ix],
    )
    return labels, train_ix, val_test_ix, val_ix, test_ix


def _select(records, indices):
    """Pick the records at the given positions, keeping the index order."""
    return [records[i] for i in indices]


label1, train_ix_1, val_test_ix_1, val_ix_1, test_ix_1 = _stratified_indices(data1, SEED)
label2, train_ix_2, val_test_ix_2, val_ix_2, test_ix_2 = _stratified_indices(data2, SEED)

# materialise the per-domain splits from the indices
train_data_1, val_data_1, test_data_1 = (
    _select(data1, train_ix_1), _select(data1, val_ix_1), _select(data1, test_ix_1)
)
train_data_2, val_data_2, test_data_2 = (
    _select(data2, train_ix_2), _select(data2, val_ix_2), _select(data2, test_ix_2)
)

# concatenate the two domains (domain 1 records first)
train_data = train_data_1 + train_data_2
val_data = val_data_1 + val_data_2
test_data = test_data_1 + test_data_2

In [6]:
# Gather every distinct raw token that occurs in the training split.
set_of_unique_train_tokens = set()
for instance in train_data:
    set_of_unique_train_tokens.update(instance['text'])

# Re-index the training vocabulary onto a dense range for the pytorch embedding.
# Dense ids start at 2: 0 is assigned to CLS just below and DataFactory pads with 1.
# NOTE(review): DataFactory's unknown-token fallback is also 2, i.e. the same id as the
# first token enumerated here - confirm whether that sharing is intended.
raw_token_pytorch_map = dict(
    zip(set_of_unique_train_tokens, range(2, 2 + len(set_of_unique_train_tokens)))
)
raw_token_pytorch_map[0] = 0  # CLS takes on 0 in our map

In [7]:
def DataFactory(data, raw_token_pytorch_map, max_len, CLS=True, unk_id=2, pad_id=1, cls_id=0):
    """Convert raw token ids into ids usable by a Pytorch Embedding layer.

    The input is deep-copied first, so the caller's records are never modified.

    Args:
        data: List of dict records; each record has a ``'text'`` list of raw tokens.
        raw_token_pytorch_map: Mapping from raw token to dense embedding id.
        max_len: Target sequence length. A falsy value (``None`` / ``0``) disables
            both padding and truncation.
        CLS: When true, ``cls_id`` is prepended to every sequence before it is
            padded / truncated.
        unk_id: Id used for tokens missing from ``raw_token_pytorch_map``.
            NOTE(review): the default 2 is also the first id handed out when the
            map is built with ``i+2`` - confirm that sharing is intended.
        pad_id: Id appended to sequences shorter than ``max_len``.
        cls_id: Id prepended when ``CLS`` is true.

    Returns:
        The transformed deep copy of ``data``; only the ``'text'`` field of each
        record differs from the input.
    """
    data = copy.deepcopy(data)
    for instance in tqdm(data):
        ids = [raw_token_pytorch_map.get(token, unk_id) for token in instance['text']]
        if CLS:
            ids = [cls_id] + ids

        if max_len:
            shortfall = max_len - len(ids)
            if shortfall > 0:
                ids = ids + [pad_id] * shortfall
            else:
                ids = ids[:max_len]

        instance['text'] = ids

    return data

In [12]:
# Encode the three splits (train, then val, then test) into length-512 id sequences.
train_data_transformed, val_data_transformed, test_data_transformed = [
    DataFactory(split, raw_token_pytorch_map, 512)
    for split in (train_data, val_data, test_data)
]

100%|██████████| 12600/12600 [00:00<00:00, 26995.93it/s]
100%|██████████| 2700/2700 [00:00<00:00, 27885.05it/s]
100%|██████████| 2700/2700 [00:00<00:00, 26039.96it/s]


In [13]:
def _texts_and_labels(records):
    """Return the ``'text'`` and ``'label'`` fields of the records as two parallel lists."""
    texts = [record['text'] for record in records]
    labels = [record['label'] for record in records]
    return texts, labels


train_x, train_y = _texts_and_labels(train_data_transformed)
val_x, val_y = _texts_and_labels(val_data_transformed)
test_x, test_y = _texts_and_labels(test_data_transformed)

In [14]:
class Dataset():
    """ Pytorch style dataset: indexable, sized view over a list of record dicts.

    Each record must provide ``'text'`` and ``'label'`` keys. ``maxlen`` is only
    stored on the instance; nothing in this class reads it.
    """

    def __init__(self, data, maxlen):
        self.data, self.maxlen = data, maxlen

    def __len__(self):
        """Number of records."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the ``(text, label)`` pair of the record at ``index``."""
        record = self.data[index]
        # the record's 'domain' field is deliberately left out of the returned pair
        return record['text'], record['label']

---
### Models

#### 1. Prediction only

In [16]:
# A-LSTM

class LSTM_config:
    """Hyperparameter bundle passed to ``LSTM`` (imported from ``model.model_class``).

    The class itself is handed to the model constructor; how each attribute is
    interpreted is defined in that module, which is not part of this notebook.
    """
    # ----------------- architectural hyperparameters ----------------- #
    d_model = 128
    n_recurrent_layers = 1
    bidirectional = True
    n_heads = 0
    dropout = 0.1
    n_mlp_layers = 0
    flatten = False
    activation = nn.ReLU()
    res_learning = False
    mask_flag = False
    # ----------------- optimisation hyperparameters ----------------- #
    random_state = SEED  # same seed as the data splits
    batch_size = 128
    epochs = 32
    lr = 1e-5
    patience = 10
    loss = nn.CrossEntropyLoss()
    validation_loss = nn.CrossEntropyLoss()
    regularisation_loss = None
    scheduler = False
    grad_clip = True
    # ----------------- operation hyperparameters ----------------- #
    d_output = 2  # presumably one output per label class - TODO confirm
    seq_len = 512  # matches the max_len the splits were padded / truncated to via DataFactory
    # Number of keys in the token map, not the largest id in it.
    # NOTE(review): ids in the map start at 2, so the largest id can be >= this count;
    # confirm the model sizes its embedding table accordingly.
    n_unique_tokens = len(raw_token_pytorch_map)
    # ----------------- saving hyperparameters ----------------- #
    rootpath = './'
    saving_address = f'./results/'
    name = f'LSTM_Classifier'
    


model = LSTM(LSTM_config) # initialise the model

# train the model on the training split, using the validation split for monitoring;
# the value returned by fit is kept as the best epoch
best_epoch = model.fit(train_x, train_y, val_x, val_y)
print()

# as model automatically saves best epoch, will now load the best epoch and evaluate on test set
# (previously this passed `val_y, val_y`, i.e. labels in place of the inputs)
model.load()
model.eval(test_x, test_y, best_epoch, evaluation_mode = True)

  nn.utils.clip_grad_norm(self.model.parameters(), 2)
100%|██████████| 99/99 [05:06<00:00,  3.10s/it]


Epoch 1 Train | Loss:  0.6874 | Accuracy:  0.7759| F1:  0.0014 | Balanced Accuracy:  0.4990 
Epoch 1 Val   | Loss:  0.6816 | Accuracy:  0.7778| F1:  0.0000 | Balanced Accuracy:  0.5000 


  nn.utils.clip_grad_norm(self.model.parameters(), 2)
 52%|█████▏    | 51/99 [02:33<02:23,  3.00s/it]

In [11]:
# BERT

class BERT_config:
    """Hyperparameter bundle passed to ``BERT`` (imported from ``model.model_class``).

    The class itself is handed to the model constructor; how each attribute is
    interpreted is defined in that module, which is not part of this notebook.
    """
    # ----------------- architectural hyperparameters ----------------- #
    d_model = 128
    d_ff = 512
    n_heads = 8
    dropout = 0.1
    e_layers = 3
    embedding_aggregation = 'cls'
    n_mlp_layers = 0
    res_learning = False
    activation = nn.ReLU()
    mask_flag = False # causal mask
    # ----------------- optimisation hyperparameters ----------------- #
    random_state = SEED  # same seed as the data splits
    batch_size = 128
    epochs = 32
    lr = 1e-5
    patience = 10
    loss = nn.CrossEntropyLoss()
    validation_loss = nn.CrossEntropyLoss()
    regularisation_loss = None
    scheduler = False
    grad_clip = True
    # ----------------- operation hyperparameters ----------------- #
    d_output = 2  # presumably one output per label class - TODO confirm
    seq_len = 512  # matches the max_len the splits were padded / truncated to via DataFactory
    # Number of keys in the token map, not the largest id in it.
    # NOTE(review): ids in the map start at 2, so the largest id can be >= this count;
    # confirm the model sizes its embedding table accordingly.
    n_unique_tokens = len(raw_token_pytorch_map)
    # ----------------- saving hyperparameters ----------------- #
    rootpath = './'
    saving_address = f'./results/'
    name = f'BERT_Classifier'
    


model = BERT(BERT_config) # initialise the model

# train the model on the training split, using the validation split for monitoring;
# the value returned by fit is kept as the best epoch
best_epoch = model.fit(train_x, train_y, val_x, val_y)
print()

# as model automatically saves best epoch, will now load the best epoch and evaluate on test set
# (previously this passed `val_y, val_y`, i.e. labels in place of the inputs)
model.load()
model.eval(test_x, test_y, best_epoch, evaluation_mode = True)

  nn.utils.clip_grad_norm(self.model.parameters(), 2)
100%|██████████| 2/2 [00:53<00:00, 26.84s/it]


Epoch 1 Train | Loss:  4.7970 | Accuracy:  0.5300| F1:  0.6928 | Balanced Accuracy:  0.5000 
Epoch 1 Val   | Loss:  4.7977 | Accuracy:  0.4900| F1:  0.6577 | Balanced Accuracy:  0.5000 


  0%|          | 0/2 [00:30<?, ?it/s]


KeyboardInterrupt: 

--- 
### W2V

In [None]:
import torch
import torch.nn as nn

class SkipGramLoss(nn.Module):
    """Skip-gram objective with a full-softmax output layer.

    Each input (centre) word is embedded, projected to one logit per vocabulary
    entry, and scored with cross-entropy against its context word.

    Args:
        vocab_size: Number of rows in the embedding table and number of output logits.
        embedding_dim: Width of each word embedding.
    """

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim

        # Embedding table, re-initialised uniformly in [-1, 1]
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        nn.init.uniform_(self.embeddings.weight, -1, 1)

        # Projection from an embedding to vocabulary logits
        self.output_layer = nn.Linear(embedding_dim, vocab_size)

    def forward(self, input_word_indices, context_word_indices):
        """Return the cross-entropy loss (default reduction) for the given pairs.

        Args:
            input_word_indices: Integer tensor of centre-word ids, any shape.
            context_word_indices: Integer tensor of context-word ids holding the
                same number of elements as ``input_word_indices``.

        Returns:
            Scalar loss tensor.
        """
        # Lookup embeddings for input word indices
        input_embeddings = self.embeddings(input_word_indices)

        # Predict context word logits
        output_logits = self.output_layer(input_embeddings)

        # Flatten to (pairs, vocab) / (pairs,) so inputs with extra leading
        # dimensions are scored pair-by-pair; 1-D inputs are unaffected.
        return nn.functional.cross_entropy(
            output_logits.reshape(-1, self.vocab_size),
            context_word_indices.reshape(-1),
        )


---
### Domain Adversarial Network