In [1]:
# Detect whether this notebook runs on Google Colab and, if so, mount Google Drive.
# COLAB_ENVIRONMENT stays False unless the import AND the mount both succeed.

COLAB_ENVIRONMENT = False

try:
    from google.colab import drive
except ImportError:
    # google.colab cannot be imported here, so treat this as a local run
    pass
else:
    try:
        drive.mount('/content/drive')
        COLAB_ENVIRONMENT = True
    except Exception as mount_error:
        # stay best-effort (fall back to local paths) but say why, instead of
        # silently swallowing the failure; KeyboardInterrupt/SystemExit still propagate
        print(f'Google Drive mount failed, falling back to local paths: {mount_error}')

In [2]:
import sys
import os
import pickle

if COLAB_ENVIRONMENT:
    py_file_location = "./drive/MyDrive/LAB/COMP90051-A1__Groupwork__Py/PrivatePackages/pytorch" # my private packages are stored here
    home_directory = './drive/MyDrive/LAB/COMP90051-A1__Groupwork__Py/' # my home directory is stored in ./LAB of google drive
    !pip install einops
else:
    py_file_location = './PrivatePackages/pytorch'
    home_directory = './'

sys.path.append(os.path.abspath(py_file_location))

from environment import *
from utils import *

from sklearn.model_selection import train_test_split

In [3]:
from model.model_class import LSTM, BERT, LSTM_DANN, BERT_DANN, LSTM_DCE_DANN, BERT_DCE_DANN, LSTM_Hinge, BERT_Hinge, W2V

In [4]:
SEED = 2608  # shared random seed: passed as random_state to train_test_split, as seed to W2V_DataFactory, and stored in W2V_Config.random_state

In [5]:
def load_json_lines(path: str) -> list:
    """Read a JSON-lines file.

    Parameters
    ----------
    path : str
        Location of a text file holding one JSON document per line.

    Returns
    -------
    list
        The parsed documents, in file order. Blank lines are skipped.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]


# all raw files live in the same folder; os.path.join avoids the mixed '/data' vs './data' suffixes
RAW_DATA_DIR = os.path.join(home_directory, 'data', 'raw', 'comp90051-2024s1-project-1')

data1 = load_json_lines(os.path.join(RAW_DATA_DIR, 'domain1_train_data.json'))
data2 = load_json_lines(os.path.join(RAW_DATA_DIR, 'domain2_train_data.json'))
data_test = load_json_lines(os.path.join(RAW_DATA_DIR, 'test_data.json'))

# create domain labels for data (domain1 -> 0, domain2 -> 1)
for instance in data1:
    instance['domain'] = 0
for instance in data2:
    instance['domain'] = 1

In [6]:
# Train / validation / test split: 70% / 15% / 15%, stratified by label within each domain

# labels of every instance, used as the stratification target
label1 = [instance['label'] for instance in data1]
label2 = [instance['label'] for instance in data2]

# step 1: keep 70% of each domain's indices for training, hold out the other 30%
train_ix_1, val_test_ix_1 = train_test_split(
    range(len(data1)),
    test_size=0.3,
    random_state=SEED,
    stratify=label1,
)
train_ix_2, val_test_ix_2 = train_test_split(
    range(len(data2)),
    test_size=0.3,
    random_state=SEED,
    stratify=label2,
)

# step 2: cut each hold-out set in half -> validation and test indices
val_ix_1, test_ix_1 = train_test_split(
    val_test_ix_1,
    test_size=0.5,
    random_state=SEED,
    stratify=[label1[i] for i in val_test_ix_1],
)
val_ix_2, test_ix_2 = train_test_split(
    val_test_ix_2,
    test_size=0.5,
    random_state=SEED,
    stratify=[label2[i] for i in val_test_ix_2],
)

# materialise the instances selected by each index list
train_data_1, val_data_1, test_data_1 = (
    [data1[i] for i in indices] for indices in (train_ix_1, val_ix_1, test_ix_1)
)
train_data_2, val_data_2, test_data_2 = (
    [data2[i] for i in indices] for indices in (train_ix_2, val_ix_2, test_ix_2)
)

# merge the two domains (domain 1 first, then domain 2)
train_data = train_data_1 + train_data_2
val_data = val_data_1 + val_data_2
test_data = test_data_1 + test_data_2

In [7]:
# ----------------- preprocessing options ----------------- #
MAX_SENTENCE_LENGTH = 256
MIN_FREQUENCY = 40 # because 40 is statistical sample requirement
MAKE_CROPPED_REMAINS_INTO_NEW_INSTANCE = False
LOW_FREQ_TOKEN = False
CLS = True
PAD_FRONT = False
W2V_CONTEXT_WINDOW = 5 # 2 to left, 2 to right

# ----------------- crop every split to the maximum length ----------------- #
cropped_train_data = crop_sentence_length(
    train_data,
    max_sentence_length=MAX_SENTENCE_LENGTH,
    make_cropped_remains_into_new_instance=MAKE_CROPPED_REMAINS_INTO_NEW_INSTANCE,
)
cropped_val_data = crop_sentence_length(
    val_data,
    max_sentence_length=MAX_SENTENCE_LENGTH,
    make_cropped_remains_into_new_instance=False,
)
cropped_test_data = crop_sentence_length(
    test_data,
    max_sentence_length=MAX_SENTENCE_LENGTH,
    make_cropped_remains_into_new_instance=False,
)
cropped_future_data = crop_sentence_length(
    data_test,
    max_sentence_length=MAX_SENTENCE_LENGTH,
    make_cropped_remains_into_new_instance=False,
)

# ----------------- token map built from the training split only ----------------- #
raw_token_pytorch_map = get_raw_token_pytorch_map(data=cropped_train_data, min_frequency=MIN_FREQUENCY)

# ----------------- model inputs / targets ----------------- #
(
    train_x, train_y,
    val_x, val_y,
    test_x, test_y,
    train_dom, val_dom, test_dom,
    future_x,
) = Data_Factory(
    cropped_train_data,
    cropped_val_data,
    cropped_test_data,
    cropped_future_data,
    MAX_SENTENCE_LENGTH,
    raw_token_pytorch_map,
    CLS=CLS,
    low_freq_special_token=LOW_FREQ_TOKEN,
    pad_front=PAD_FRONT,
)

# ----------------- class / domain distributions ----------------- #
pos_prior, neg_prior = get_distribution(train_y)
print('class prior:', pos_prior, neg_prior)

pos_dom_prior, neg_dom_prior = get_distribution(train_dom)
print('domain prior:', pos_dom_prior, neg_dom_prior)

# the raw labels are turned into two-element one-hot rows before calling get_distribution
dom1_pos_prior, dom1_neg_prior = get_distribution(
    [[0, 1] if label == 1 else [1, 0] for label in label1]
)
print('dom1 class prior:', dom1_pos_prior, dom1_neg_prior)

dom2_pos_prior, dom2_neg_prior = get_distribution(
    [[0, 1] if label == 1 else [1, 0] for label in label2]
)
print('dom2 class prior:', dom2_pos_prior, dom2_neg_prior)
# pretrain_x, pretrain_y, pretrain_mask, pretrain_dom, preval_x, preval_y, preval_mask, preval_dom = BERT_pretrain_DataFactory(train_data, val_data, SEED, raw_token_pytorch_map, MAX_SENTENCE_LENGTH)

print('---')

100%|██████████| 12600/12600 [00:00<00:00, 27317.66it/s]
100%|██████████| 2700/2700 [00:00<00:00, 88180.81it/s]
100%|██████████| 2700/2700 [00:00<00:00, 125557.08it/s]
100%|██████████| 4000/4000 [00:00<00:00, 91272.29it/s]
100%|██████████| 12600/12600 [00:00<00:00, 39957.50it/s]
100%|██████████| 12600/12600 [00:00<00:00, 44726.71it/s]
100%|██████████| 2700/2700 [00:00<00:00, 44775.33it/s]
100%|██████████| 2700/2700 [00:00<00:00, 45783.61it/s]
100%|██████████| 4000/4000 [00:00<00:00, 45908.00it/s]

class prior: 0.2222222222222222 0.7777777777777778
domain prior: 0.7222222222222222 0.2777777777777778
dom1 class prior: 0.5 0.5
dom2 class prior: 0.11538461538461539 0.8846153846153846
---





In [61]:
def W2V_DataFactory(data: list, context_window: int, seed: int, raw_token_pytorch_map: dict, k: int) -> list:

    """ Get W2V training data.

    For every token of every sentence (the focus token) one training instance is
    emitted per *distinct* token found in the surrounding window. Each instance
    holds the index of the focus token and a `mask` list whose first entry is the
    index of the context (positive) token, followed by `k` indices of sampled
    negative tokens that do not occur in the window.

    Parameters
    ----------
    data : list
        Instances; each one is a mapping whose 'text' entry is the list of raw tokens.
    context_window : int
        Full window width (focus token included); must be odd.
    seed : int
        Seed given to `np.random.seed` before the negatives are drawn
        (this reseeds NumPy's global generator).
    raw_token_pytorch_map : dict
        Maps raw tokens and the special keys 'CLS', 'PAD', 'UNK' to indices.
        Tokens missing from the map are encoded with the 'UNK' index.
    k : int
        Number of negative samples attached to each positive pair.

    Returns
    -------
    list
        Dicts of the form {'token': focus_index, 'mask': [positive_index, negative_index, ...]}.

    Raises
    ------
    AssertionError
        If `context_window` is even.
    KeyError
        If 'UNK' is not a key of `raw_token_pytorch_map`.
    ValueError
        If `k > 0` and every sampled negative candidate occurs in a context window,
        so no negative could ever be drawn.
    """

    assert context_window % 2 == 1, 'context window must be odd'

    np.random.seed(seed)

    MAX_SAMPLED_NEGATIVE_TOKENS = 10000

    half_window = context_window // 2
    unk_index = raw_token_pytorch_map['UNK']

    # Draw the pool of negative candidates once, uniformly with replacement, and cycle
    # through it. Sampling positions (not the keys themselves) keeps every key's
    # original type: putting mixed int/str keys into a NumPy array would turn them
    # all into strings.
    vocabulary_keys = list(raw_token_pytorch_map.keys())
    sampled_positions = np.random.randint(0, len(vocabulary_keys), size=MAX_SAMPLED_NEGATIVE_TOKENS)
    negative_tokens = [vocabulary_keys[position] for position in sampled_positions]
    negative_pool = set(negative_tokens)

    negative_up_to = 0

    w2v_data = []

    for instance in tqdm(data): # every sentence
        # pad with `half_window` sentinel tokens on each side so the first and last
        # real tokens get a full window; the sentinels are raw keys that are
        # translated to indices by the same lookup as ordinary tokens
        tokens = ['CLS'] * half_window + list(instance['text']) + ['PAD'] * half_window

        for i in range(half_window, len(tokens) - half_window): # every real token, sentinels excluded

            focus_token_retokenised = raw_token_pytorch_map.get(tokens[i], unk_index)

            # neighbours in window order; the focus position itself (offset 0) is left out
            neighbours = [tokens[i + offset] for offset in range(-half_window, half_window + 1) if offset != 0]
            context_words = set(neighbours)

            if k > 0 and neighbours and negative_pool <= context_words:
                # every candidate would be rejected below -> the rejection loop would never end
                raise ValueError('cannot draw negative samples: every sampled candidate occurs in the context window')

            already_emitted = set()

            for neighbour in neighbours: # every neighbour in window
                if neighbour in already_emitted: # repeated context token (e.g. CLS / PAD sentinels)
                    continue
                already_emitted.add(neighbour)

                mask = [raw_token_pytorch_map.get(neighbour, unk_index)]

                for _ in range(k): # k negatives per positive pair
                    while True:
                        if negative_up_to == MAX_SAMPLED_NEGATIVE_TOKENS:
                            negative_up_to = 0 # wrap around and reuse the pool

                        sampled_negative = negative_tokens[negative_up_to]
                        negative_up_to += 1

                        if sampled_negative not in context_words: # didn't sample a positive case
                            mask.append(raw_token_pytorch_map[sampled_negative])
                            break

                w2v_data.append({'token': focus_token_retokenised, 'mask': mask})

    return w2v_data

In [62]:
# Build the W2V (focus token, [positive, negatives...]) instances for training and validation.
# NOTE(review): W2V_CONTEXT_WINDOW (= 5) is defined earlier but a window of 3 is used here,
# and W2V_Config sets k = 5 while 4 negatives are sampled here — confirm which values are intended.
w2v_data = W2V_DataFactory(
    data=train_data,
    context_window=3,
    seed=SEED,
    raw_token_pytorch_map=raw_token_pytorch_map,
    k=4,
)
val_w2v_data = W2V_DataFactory(
    data=val_data,
    context_window=3,
    seed=SEED,
    raw_token_pytorch_map=raw_token_pytorch_map,
    k=4,
)

100%|██████████| 12600/12600 [01:00<00:00, 207.00it/s]
100%|██████████| 2700/2700 [00:03<00:00, 696.21it/s]


In [63]:
# Split the W2V instances into focus-token inputs and their [positive, negatives...] masks.
train_x = [x['token'] for x in w2v_data]
train_mask = [x['mask'] for x in w2v_data]

# validation inputs must come from the validation instances so that val_x and val_mask
# line up (they were previously taken from the training instances)
val_x = [x['token'] for x in val_w2v_data]
val_mask = [x['mask'] for x in val_w2v_data]

In [19]:
# train_x = train_x[:100]
# train_mask = train_mask[:100]

# val_x = val_x[:100]
# val_mask = val_mask[:100]

In [64]:
# W2V embedding pre-training - CELoss

class W2V_Config:
    """Settings object handed to the W2V model constructor."""

    # ----------------- architectual hyperparameters ----------------- #
    d_model = 256
    # NOTE(review): the W2V instances in this notebook are built with k=4 negatives — confirm this k means the same thing
    k = 5

    # ----------------- optimisation hyperparameters ----------------- #
    random_state = SEED
    batch_size = 8
    epochs = 32
    lr = 1e-5
    patience = 10
    pretrain_loss = nn.CrossEntropyLoss()
    pretrain_validation_loss = nn.CrossEntropyLoss()
    regularisation_loss = None
    scheduler = False
    grad_clip = False

    # ----------------- operation hyperparameters ----------------- #
    n_unique_tokens = len(raw_token_pytorch_map) # one entry per key of the token map

    # ----------------- saving hyperparameters ----------------- #
    rootpath = home_directory + './'
    saving_address = home_directory + './results/'
    name = 'W2V_Pretrain_Embeddings'

model = W2V(W2V_Config) # build the W2V model from the configuration class above

# Train on the W2V training pairs, using the validation pairs as the second data set;
# fit() returns the epoch that is later passed to eval() as best_epoch.
best_epoch = model.fit(train_x, train_mask, val_x, val_mask)
print()

# Reload the saved model state and evaluate it on the validation pairs (val_x / val_mask) —
# no separate W2V test split is built in this notebook.
# NOTE(review): presumably fit() checkpoints the best epoch and load() restores it — confirm in model.model_class.
model.load()
model.eval(val_x, val_mask, best_epoch, evaluation_mode = True)

  1%|          | 7400/765490 [01:19<2:15:44, 93.08it/s] 


KeyboardInterrupt: 