### Import Libraries

In [1]:
# Detect whether the notebook runs on Google Colab; if it does, mount Google Drive.

COLAB_ENVIRONMENT = False

try:
    from google.colab import drive
    drive.mount('/content/drive')
    COLAB_ENVIRONMENT = True
except ImportError:
    # google.colab could not be imported: treat this as a local (non-Colab) run.
    pass
except Exception as mount_error:
    # Still best-effort (the flag stays False and local paths are used), but report
    # the reason instead of hiding it. Unlike a bare `except:`, this lets
    # KeyboardInterrupt / SystemExit propagate.
    print(f'Google Drive mount failed; continuing with local paths: {mount_error!r}')

In [20]:
import os
import pickle
import sys
from collections import Counter

# Pick the location of the private packages and of the project home directory,
# depending on whether the Colab-detection cell managed to mount Google Drive.
# All paths are relative to the current working directory.
if COLAB_ENVIRONMENT:
    py_file_location = "./drive/MyDrive/LAB/COMP90051-A1__Groupwork__Py/PrivatePackages/pytorch" # private packages on the mounted Drive
    home_directory = './drive/MyDrive/LAB/COMP90051-A1__Groupwork__Py/' # project home inside the LAB folder on Drive
    # NOTE(review): unpinned install through a shell escape; IPython's `%pip` magic
    # installs into the kernel's own environment - consider `%pip install einops==<version>`.
    !pip install einops
else:
    py_file_location = './PrivatePackages/pytorch'
    home_directory = './'

# Make the private packages importable; the `environment`, `utils` and `model`
# imports that follow presumably resolve from this directory - confirm.
sys.path.append(os.path.abspath(py_file_location))

from environment import *
from utils import *

from sklearn.model_selection import train_test_split

In [3]:
from model.model_class import LSTM, BERT

### Set Seed and Load Data

In [4]:
# One seed for the whole notebook: used as `random_state` in every train/val/test
# split and as `random_state` in both model config classes.
SEED = 2608

In [5]:
def _load_domain_records(path, domain):
    """Read a JSON-lines file and tag every record with its domain id.

    Args:
        path: location of a text file holding one JSON object per line.
        domain: value stored under the ``'domain'`` key of every record.

    Returns:
        list with the decoded records, in file order.
    """
    records = []
    # Explicit UTF-8 so decoding does not depend on the platform's default encoding.
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines instead of failing inside json.loads
            record = json.loads(line)
            record['domain'] = domain  # create domain labels for data
            records.append(record)
    return records


# NOTE(review): these paths are relative to the working directory and do not use
# `home_directory`, so they presumably only resolve on a local run - confirm for Colab.
data1 = _load_domain_records('./data/raw/comp90051-2024s1-project-1/domain1_train_data.json', 1)
data2 = _load_domain_records('./data/raw/comp90051-2024s1-project-1/domain2_train_data.json', 2)

In [6]:
# Train / validation / test split (70% / 15% / 15%), done separately per domain

# labels of each domain, used to stratify every split
label1 = [record['label'] for record in data1]
label2 = [record['label'] for record in data2]

# stage 1: hold out 30% of the indices of each domain
train_ix_1, val_test_ix_1 = train_test_split(
    range(len(data1)), test_size=0.3, random_state=SEED, stratify=label1
)
train_ix_2, val_test_ix_2 = train_test_split(
    range(len(data2)), test_size=0.3, random_state=SEED, stratify=label2
)

# stage 2: halve the held-out indices into validation and test
val_ix_1, test_ix_1 = train_test_split(
    val_test_ix_1, test_size=0.5, random_state=SEED,
    stratify=[label1[ix] for ix in val_test_ix_1],
)
val_ix_2, test_ix_2 = train_test_split(
    val_test_ix_2, test_size=0.5, random_state=SEED,
    stratify=[label2[ix] for ix in val_test_ix_2],
)

# materialise each split from the indices returned by train_test_split
train_data_1, val_data_1, test_data_1 = (
    [data1[ix] for ix in split] for split in (train_ix_1, val_ix_1, test_ix_1)
)
train_data_2, val_data_2, test_data_2 = (
    [data2[ix] for ix in split] for split in (train_ix_2, val_ix_2, test_ix_2)
)

# both domains together: domain 1 records first, then domain 2
train_data = [*train_data_1, *train_data_2]
val_data = [*val_data_1, *val_data_2]
test_data = [*test_data_1, *test_data_2]

In [7]:
# Vocabulary: every distinct token seen in the first 64 positions of a training text
set_of_unique_train_tokens = {
    token for instance in train_data for token in instance['text'][:64]
}
list_of_unique_train_tokens = sorted(set_of_unique_train_tokens)

# Compress the tokens used in training onto a dense index range for the pytorch embedding.
# Indices 0 and 1 are kept for the special entries, so real tokens start at 2.
raw_token_pytorch_map = {
    token: dense_ix for dense_ix, token in enumerate(list_of_unique_train_tokens, start=2)
}
raw_token_pytorch_map['CLS'] = 0  # CLS takes on 0 in our map
raw_token_pytorch_map['PAD'] = 1

In [None]:
# Token frequencies per domain and overall, counted over the full (untruncated) texts.
# After this cell:
#   word_freq_1 / word_freq_2 - lists of (token, count) pairs, most frequent first
#   word_freq                 - sorted list of the tokens occurring more than
#                               FREQUENT_TOKEN_MIN_COUNT times across both domains
FREQUENT_TOKEN_MIN_COUNT = 100  # strict lower bound on the combined count

domain_1_counts = Counter(word for instance in data1 for word in instance['text'])
domain_2_counts = Counter(word for instance in data2 for word in instance['text'])
combined_counts = domain_1_counts + domain_2_counts

# most_common() sorts by count, descending, and keeps first-seen order between ties
word_freq_1 = domain_1_counts.most_common()
word_freq_2 = domain_2_counts.most_common()

word_freq = sorted(
    word for word, count in combined_counts.items() if count > FREQUENT_TOKEN_MIN_COUNT
)

In [8]:
def DataFactory(data, raw_token_pytorch_map, max_len, CLS=True, unk_index=2):
    """Convert raw tokens into the dense indices used by the PyTorch embedding layer.

    The input is deep-copied first, so the caller's records are left untouched.

    Args:
        data: iterable of dict records; each record's ``'text'`` entry is a
            sequence of raw tokens.
        raw_token_pytorch_map: mapping from raw token to dense index. It must also
            hold ``'CLS'`` when ``CLS`` is true and ``'PAD'`` when padding is needed.
        max_len: final length of every ``'text'``. Shorter texts are right-padded
            with the ``'PAD'`` index, longer ones are cut. A falsy value leaves
            the lengths unchanged.
        CLS: when true, the ``'CLS'`` index is prepended; it counts towards ``max_len``.
        unk_index: index substituted for tokens missing from the map. Defaults to 2,
            the value that used to be hard-coded. Note that the map built in this
            notebook starts real tokens at index 2, so with the default an unknown
            token shares its index with the smallest training token; pass a
            dedicated index to keep them apart.

    Returns:
        Deep copy of ``data`` in which every ``'text'`` holds dense indices.
    """
    data = copy.deepcopy(data)
    for instance in tqdm(data):
        encoded = [raw_token_pytorch_map.get(token, unk_index) for token in instance['text']]
        if CLS:  # CLS takes on 0 in our map
            encoded.insert(0, raw_token_pytorch_map['CLS'])

        if max_len:
            shortfall = max_len - len(encoded)
            if shortfall > 0:
                encoded.extend([raw_token_pytorch_map['PAD']] * shortfall)  # 1 is pad in our map
            else:
                encoded = encoded[:max_len]

        instance['text'] = encoded

    return data

In [9]:
# Encode every split into index sequences of fixed length 64.
# Only the first 200 encoded training and validation records are kept; the test split stays whole.
# NOTE(review): the 200-record cap looks like a smoke-test setting. `train_data` lists
# domain 1 before domain 2, so the kept records presumably all come from domain 1 - confirm.
train_data_transformed = DataFactory(train_data, raw_token_pytorch_map, max_len=64)[:200]
val_data_transformed = DataFactory(val_data, raw_token_pytorch_map, max_len=64)[:200]
test_data_transformed = DataFactory(test_data, raw_token_pytorch_map, max_len=64)

100%|██████████| 12600/12600 [00:00<00:00, 23884.62it/s]
100%|██████████| 2700/2700 [00:00<00:00, 27390.08it/s]
100%|██████████| 2700/2700 [00:00<00:00, 30750.03it/s]


In [10]:
# Separate each encoded split into its inputs (index sequences) and its raw labels
train_x, train_y = (
    [sample['text'] for sample in train_data_transformed],
    [sample['label'] for sample in train_data_transformed],
)
val_x, val_y = (
    [sample['text'] for sample in val_data_transformed],
    [sample['label'] for sample in val_data_transformed],
)
test_x, test_y = (
    [sample['text'] for sample in test_data_transformed],
    [sample['label'] for sample in test_data_transformed],
)

In [11]:
def _one_hot(label):
    """Two-way one-hot target: label 1 -> [0, 1], any other label -> [1, 0]."""
    return [0, 1] if label == 1 else [1, 0]


# Encode from the labels stored in the transformed records rather than from the
# `*_y` lists themselves: re-running this cell on already-encoded lists would turn
# every target into [1, 0], because a list never equals 1.
train_y = [_one_hot(instance['label']) for instance in train_data_transformed]
val_y = [_one_hot(instance['label']) for instance in val_data_transformed]
test_y = [_one_hot(instance['label']) for instance in test_data_transformed]

In [12]:
# class Dataset():
#     """ Pytorch style dataset """

#     def __init__(self, data, maxlen):
#         self.data = data
#         self.maxlen = maxlen
    
#     def __len__(self):
#         return len(self.data)
    
#     def __getitem__(self, index):
#         return self.data[index]['text'], self.data[index]['label']
#         # return self.data[index]['text'], self.data[index]['label'], self.data[index]['domain']

---
### Models

#### 1. Prediction only

In [13]:
# LSTM

class LSTM_config:
    # Settings passed to LSTM(...). How each one is consumed is decided by the LSTM
    # class in model.model_class, which is not part of this notebook.
    # ----------------- architectural hyperparameters ----------------- #
    d_model = 128
    n_recurrent_layers = 1
    bidirectional = True
    n_heads = 0
    dropout = 0.1
    n_mlp_layers = 0
    flatten = False
    activation = nn.ReLU()
    res_learning = False
    mask_flag = False
    # ----------------- optimisation hyperparameters ----------------- #
    random_state = SEED  # same seed as the data splits
    batch_size = 128
    epochs = 2
    lr = 0.001
    patience = 5
    loss = nn.BCELoss()
    validation_loss = nn.BCELoss()
    regularisation_loss = None
    scheduler = False
    grad_clip = True
    # ----------------- operation hyperparameters ----------------- #
    d_output = 2  # the targets are two-element one-hot lists
    seq_len = 64  # equals the max_len given to DataFactory
    n_unique_tokens = len(raw_token_pytorch_map)  # the CLS and PAD entries are counted too
    # ----------------- saving hyperparameters ----------------- #
    rootpath = './'
    saving_address = './results/'
    name = 'LSTM_Classifier'
    


model = LSTM(LSTM_config) # initialise the model

# train the model; the logged output reports train and validation metrics once per epoch
# (the returned value is kept as `best_epoch` and passed on to eval below)
best_epoch = model.fit(train_x, train_y, val_x, val_y)
print()

# per the original note the model saves its best epoch automatically; reload it and
# report the metrics on the VALIDATION split
# NOTE(review): the earlier comment said "test set", but the call below uses val_x / val_y;
# test_x / test_y are not evaluated in this cell.
model.load()
model.eval(val_x, val_y, best_epoch, evaluation_mode = True)

  from .autonotebook import tqdm as notebook_tqdm
  nn.utils.clip_grad_norm(self.model.parameters(), 2)
100%|██████████| 2/2 [00:00<00:00,  2.01it/s]


Epoch 1 Train | Loss:  0.6919 | Accuracy:  0.5050| F1:  0.5520 | Balanced Accuracy:  0.5005 
Epoch 1 Val   | Loss:  0.6878 | Accuracy:  0.5750| F1:  0.5933 | Balanced Accuracy:  0.5761 


  nn.utils.clip_grad_norm(self.model.parameters(), 2)
100%|██████████| 2/2 [00:00<00:00,  2.98it/s]


Epoch 2 Train | Loss:  0.6818 | Accuracy:  0.6700| F1:  0.6857 | Balanced Accuracy:  0.6694 
Epoch 2 Val   | Loss:  0.6837 | Accuracy:  0.5550| F1:  0.5782 | Balanced Accuracy:  0.5563 

Epoch 2 Val   | Loss:  0.6837 | Accuracy:  0.5550| F1:  0.5782 | Balanced Accuracy:  0.5563 


In [14]:
# Peek at the first ten LSTM predictions on the training inputs (two values per row)
model.predict(train_x)[:10]

[[0.4894677698612213, 0.5105322599411011],
 [0.6054280996322632, 0.3945719301700592],
 [0.4914986193180084, 0.508501410484314],
 [0.48853105306625366, 0.5114689469337463],
 [0.49079567193984985, 0.5092043280601501],
 [0.49549156427383423, 0.504508376121521],
 [0.4960257112979889, 0.5039742588996887],
 [0.4931079149246216, 0.5068920850753784],
 [0.48357969522476196, 0.5164202451705933],
 [0.47911831736564636, 0.520881712436676]]

In [15]:
# Peek at the first ten LSTM predictions on the validation inputs (two values per row)
model.predict(val_x)[:10]

[[0.5132982730865479, 0.4867016673088074],
 [0.6065260767936707, 0.39347392320632935],
 [0.4989036023616791, 0.5010964274406433],
 [0.4935510754585266, 0.5064489841461182],
 [0.4965856969356537, 0.5034143924713135],
 [0.4941680133342743, 0.5058320164680481],
 [0.4849659204483032, 0.5150341391563416],
 [0.5000945925712585, 0.49990543723106384],
 [0.5082995295524597, 0.49170055985450745],
 [0.4829690158367157, 0.5170309543609619]]

In [16]:
# BERT

class BERT_config:
    # Settings passed to BERT(...). How each one is consumed is decided by the BERT
    # class in model.model_class, which is not part of this notebook.
    # ----------------- architectural hyperparameters ----------------- #
    d_model = 128
    d_ff = 512
    n_heads = 8
    dropout = 0.1
    e_layers = 3
    embedding_aggregation = 'cls'
    n_mlp_layers = 0
    res_learning = False
    activation = nn.ReLU()
    mask_flag = False  # causal mask
    # ----------------- optimisation hyperparameters ----------------- #
    random_state = SEED  # same seed as the data splits
    batch_size = 128
    epochs = 2
    lr = 0.001
    patience = 2
    loss = nn.BCELoss()
    validation_loss = nn.BCELoss()
    regularisation_loss = None
    scheduler = False
    grad_clip = False
    # ----------------- operation hyperparameters ----------------- #
    d_output = 2  # the targets are two-element one-hot lists
    seq_len = 64  # equals the max_len given to DataFactory
    n_unique_tokens = len(raw_token_pytorch_map)  # the CLS and PAD entries are counted too
    # ----------------- saving hyperparameters ----------------- #
    rootpath = './'
    saving_address = './results/'
    name = 'BERT_Classifier'
    


model = BERT(BERT_config) # initialise the model (rebinds `model`, replacing the LSTM from the earlier cell)

# train the model; the logged output reports train and validation metrics once per epoch
# (the returned value is kept as `best_epoch` and passed on to eval below)
best_epoch = model.fit(train_x, train_y, val_x, val_y)
print()

# per the original note the model saves its best epoch automatically; reload it and
# report the metrics on the VALIDATION split
# NOTE(review): the earlier comment said "test set", but the call below uses val_x / val_y;
# test_x / test_y are not evaluated in this cell.
model.load()
model.eval(val_x, val_y, best_epoch, evaluation_mode = True)

100%|██████████| 2/2 [00:02<00:00,  1.15s/it]


Epoch 1 Train | Loss:  1.4933 | Accuracy:  0.4900| F1:  0.5641 | Balanced Accuracy:  0.4815 
Epoch 1 Val   | Loss:  1.2334 | Accuracy:  0.5100| F1:  0.0000 | Balanced Accuracy:  0.5000 


100%|██████████| 2/2 [00:01<00:00,  1.56it/s]


Epoch 2 Train | Loss:  0.9996 | Accuracy:  0.4700| F1:  0.0000 | Balanced Accuracy:  0.5000 
Epoch 2 Val   | Loss:  0.8670 | Accuracy:  0.4900| F1:  0.6577 | Balanced Accuracy:  0.5000 

Epoch 2 Val   | Loss:  0.8670 | Accuracy:  0.4900| F1:  0.6577 | Balanced Accuracy:  0.5000 


In [17]:
# Peek at the first ten BERT predictions on the training inputs (two values per row)
# NOTE(review): the rows displayed for this cell are nearly identical to each other - worth checking.
model.predict(train_x)[:10]

[[0.23434628546237946, 0.7656537294387817],
 [0.24878013134002686, 0.7512198686599731],
 [0.2295651137828827, 0.7704349160194397],
 [0.2324276715517044, 0.7675723433494568],
 [0.23661142587661743, 0.7633885145187378],
 [0.23522336781024933, 0.7647765874862671],
 [0.2274104803800583, 0.7725894451141357],
 [0.22060248255729675, 0.7793975472450256],
 [0.24059750139713287, 0.7594025135040283],
 [0.2309846580028534, 0.769015371799469]]

In [18]:
# Peek at the first ten BERT predictions on the validation inputs (two values per row)
# NOTE(review): the rows displayed for this cell are nearly identical to each other - worth checking.
model.predict(val_x)[:10]

[[0.22902269661426544, 0.7709773182868958],
 [0.2724650204181671, 0.7275350093841553],
 [0.22975383698940277, 0.7702462077140808],
 [0.23741821944713593, 0.7625817060470581],
 [0.2260596752166748, 0.77394038438797],
 [0.24640795588493347, 0.7535920143127441],
 [0.232538640499115, 0.7674614191055298],
 [0.22662198543548584, 0.7733779549598694],
 [0.23393823206424713, 0.7660617828369141],
 [0.23056721687316895, 0.7694327235221863]]

--- 
### W2V

In [19]:
import torch
import torch.nn as nn

class SkipGramLoss(nn.Module):
    """Skip-gram model whose forward pass returns the training loss.

    An input word index is embedded, projected onto vocabulary-sized logits by a
    linear layer, and scored against the context word index with cross-entropy.

    Args:
        vocab_size: number of rows of the embedding table and number of output logits.
        embedding_dim: width of every embedding vector.
    """

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim

        # Embedding table, re-initialised uniformly in [-1, 1]
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        nn.init.uniform_(self.embeddings.weight, -1, 1)

        # Projection from an embedding to one logit per vocabulary entry
        self.output_layer = nn.Linear(embedding_dim, vocab_size)

    def forward(self, input_word_indices, context_word_indices):
        """Return the mean cross-entropy of predicting the context words.

        Args:
            input_word_indices: integer tensor of input (centre) word indices.
            context_word_indices: integer tensor of context word indices, one per
                input word index.

        Returns:
            Scalar tensor holding the loss averaged over all (input, context) pairs.
        """
        input_embeddings = self.embeddings(input_word_indices)
        output_logits = self.output_layer(input_embeddings)

        if input_word_indices.dim() > 1:
            # cross_entropy expects the class axis at position 1; for index tensors
            # with extra batch axes, fold every pair into one flat batch so the
            # vocabulary axis is treated as the class axis.
            output_logits = output_logits.reshape(-1, output_logits.size(-1))
            context_word_indices = context_word_indices.reshape(-1)

        # Functional form: no loss module has to be rebuilt on every call
        return nn.functional.cross_entropy(output_logits, context_word_indices)


---
### Domain Adversarial Network