# PyTorch Training UI

### Import Libraries

In [1]:
# Detect whether this notebook runs on Google Colab and, if so, mount Google Drive.
# COLAB_ENVIRONMENT ends up True only when the import AND the mount both succeed.

COLAB_ENVIRONMENT = False

try:
    from google.colab import drive
except ImportError:
    # google.colab is not importable here, so keep the local configuration
    pass
else:
    try:
        drive.mount('/content/drive')
        COLAB_ENVIRONMENT = True
    except Exception as mount_error:
        # Still best effort (fall back to local paths), but the failure is now
        # reported, and KeyboardInterrupt / SystemExit are no longer swallowed
        # the way the previous bare `except:` did.
        print(f'Could not mount Google Drive, falling back to local paths: {mount_error}')

import sys
import os
import pickle

if COLAB_ENVIRONMENT:
    py_file_location = "./drive/MyDrive/LAB/COMP90051-A1__Groupwork__Py/PrivatePackages/pytorch" # my private packages are stored here
    home_directory = './drive/MyDrive/LAB/COMP90051-A1__Groupwork__Py/' # my home directory is stored in ./LAB of google drive
    !pip install einops
else:
    py_file_location = './PrivatePackages/pytorch'
    home_directory = './'

sys.path.append(os.path.abspath(py_file_location))

from environment import *
from utils import *

from sklearn.model_selection import train_test_split

from model.model_class import LSTM, BERT, LSTM_DANN, BERT_DANN, LSTM_DCE_DANN, BERT_DCE_DANN, BERT_Hinge, W2V, \
BERT_DCE_DANN_DoubleDecoder, BERT_DoubleDecoder, BERT_DANN_DoubleDecoder, LSTM_DCE_DANN_DoubleDecoder, LSTM_DoubleDecoder, LSTM_DANN_DoubleDecoder

### Set Seed and Load Data

In [3]:
SEED = 2608


def _load_jsonl(path):
    """Read a JSON-lines file into a list.

    Parameters
    ----------
    path : str
        Location of a text file holding one JSON document per line.

    Returns
    -------
    list
        The decoded documents in file order; blank lines are skipped.
    """
    # NOTE(review): `json` is not imported explicitly in the visible cells;
    # presumably it arrives through the star imports above — confirm.
    with open(path, 'r') as f:
        return [json.loads(line) for line in f if line.strip()]


# Build every path the same way instead of mixing '/data' and './data' suffixes.
_DATA_DIR = os.path.join(home_directory, 'data', 'curated', 'comp90051-2024s1-project-1')

data1 = _load_jsonl(os.path.join(_DATA_DIR, 'domain1_train_data.json'))
data2 = _load_jsonl(os.path.join(_DATA_DIR, 'domain2_train_data.json'))
data_test = _load_jsonl(os.path.join(_DATA_DIR, 'test_data.json'))

# tag every record with the domain it was loaded from (0 for data1, 1 for data2)
for record in data1:
    record['domain'] = 0
for record in data2:
    record['domain'] = 1


# Train / validation / test split, carried out separately for each domain
# the class labels are what the splits are stratified on
label1 = [record['label'] for record in data1]
label2 = [record['label'] for record in data2]

# stage 1: hold out 30% of the indices of each domain
train_ix_1, val_test_ix_1 = train_test_split(
    range(len(data1)), test_size=0.3, random_state=SEED, stratify=label1
)
train_ix_2, val_test_ix_2 = train_test_split(
    range(len(data2)), test_size=0.3, random_state=SEED, stratify=label2
)

# stage 2: halve the held-out indices into validation and test
val_ix_1, test_ix_1 = train_test_split(
    val_test_ix_1, test_size=0.5, random_state=SEED, stratify=[label1[ix] for ix in val_test_ix_1]
)
val_ix_2, test_ix_2 = train_test_split(
    val_test_ix_2, test_size=0.5, random_state=SEED, stratify=[label2[ix] for ix in val_test_ix_2]
)

# look the records up from the index lists
train_data_1 = [data1[ix] for ix in train_ix_1]
train_data_2 = [data2[ix] for ix in train_ix_2]
val_data_1 = [data1[ix] for ix in val_ix_1]
val_data_2 = [data2[ix] for ix in val_ix_2]
test_data_1 = [data1[ix] for ix in test_ix_1]
test_data_2 = [data2[ix] for ix in test_ix_2]

# pool the two domains, domain 1 first
train_data = train_data_1 + train_data_2
val_data = val_data_1 + val_data_2
test_data = test_data_1 + test_data_2

---
### Load Pretraining

In [3]:
# # BERT - CELoss

# MAX_SENTENCE_LENGTH = 256
# MIN_FREQUENCY = 40 # because 40 is statistical sample requirement
# MAKE_CROPPED_REMAINS_INTO_NEW_INSTANCE = False
# LOW_FREQ_TOKEN = False
# CLS = True
# PAD_FRONT = False
# W2V_CONTEXT_WINDOW = 5 # 2 to left, 2 to right
# cropped_train_data = crop_sentence_length(train_data, max_sentence_length = MAX_SENTENCE_LENGTH, make_cropped_remains_into_new_instance = MAKE_CROPPED_REMAINS_INTO_NEW_INSTANCE)
# cropped_val_data = crop_sentence_length(val_data, max_sentence_length =  MAX_SENTENCE_LENGTH, make_cropped_remains_into_new_instance = False)
# cropped_test_data = crop_sentence_length(test_data, max_sentence_length = MAX_SENTENCE_LENGTH, make_cropped_remains_into_new_instance = False)
# cropped_future_data = crop_sentence_length(data_test, max_sentence_length = MAX_SENTENCE_LENGTH, make_cropped_remains_into_new_instance = False)
# raw_token_pytorch_map = get_raw_token_pytorch_map(data = cropped_train_data, min_frequency = MIN_FREQUENCY)
# train_x, train_y, val_x, val_y, test_x, test_y, train_dom, val_dom, test_dom, future_x = Data_Factory(cropped_train_data, \
#                                                               cropped_val_data, \
#                                                                 cropped_test_data, \
#                                                                     cropped_future_data, \
#                                                                         MAX_SENTENCE_LENGTH, \
#                                                                             raw_token_pytorch_map, \
#                                                                                 CLS=CLS, \
#                                                                                     low_freq_special_token=LOW_FREQ_TOKEN, \
#                                                                                         pad_front=PAD_FRONT)
# pos_prior, neg_prior = get_distribution(train_y)
# print('class prior:', pos_prior, neg_prior)
# pos_dom_prior, neg_dom_prior = get_distribution(train_dom)
# print('domain prior:', pos_dom_prior, neg_dom_prior)
# dom1_pos_prior, dom1_neg_prior = get_distribution([[0, 1] if label == 1 else [1, 0] for label in label1])
# print('dom1 class prior:', dom1_pos_prior, dom1_neg_prior)
# dom2_pos_prior, dom2_neg_prior = get_distribution([[0, 1] if label == 1 else [1, 0] for label in label2])
# print('dom2 class prior:', dom2_pos_prior, dom2_neg_prior)

# class W2V_Config:
#     # ----------------- architectural hyperparameters ----------------- #
#     d_model = 256
#     k=5
#     # ----------------- optimisation hyperparameters ----------------- #
#     random_state = SEED
#     batch_size = 8
#     epochs = 32
#     lr = 1e-5
#     patience = 10
#     pretrain_loss = nn.CrossEntropyLoss()
#     pretrain_validation_loss = nn.CrossEntropyLoss()
#     regularisation_loss = None
#     scheduler = False
#     grad_clip = False
#     train_embedding = False
#     # ----------------- operation hyperparameters ----------------- #
#     n_unique_tokens = len(raw_token_pytorch_map)
#     # ----------------- saving hyperparameters ----------------- #
#     rootpath = home_directory + './'
#     saving_address = home_directory + f'./results/'
#     name = f'W2V_Pretrain_Embeddings-pretrained'

# pretrained = W2V(W2V_Config) # initialise the model

# pretrained.load()

# w2v_embed = pretrained.model.embed.weight
# w2v_combined_embed = (pretrained.model.embed.weight + pretrained.model.linear.weight)/2

In [4]:
# MAX_SENTENCE_LENGTH = 256
# MIN_FREQUENCY = 40 # because 40 is statistical sample requirement
# MAKE_CROPPED_REMAINS_INTO_NEW_INSTANCE = False
# LOW_FREQ_TOKEN = False
# CLS = True
# PAD_FRONT = False
# W2V_CONTEXT_WINDOW = 5 # 2 to left, 2 to right
# cropped_train_data = crop_sentence_length(train_data, max_sentence_length = MAX_SENTENCE_LENGTH, make_cropped_remains_into_new_instance = MAKE_CROPPED_REMAINS_INTO_NEW_INSTANCE)
# cropped_val_data = crop_sentence_length(val_data, max_sentence_length =  MAX_SENTENCE_LENGTH, make_cropped_remains_into_new_instance = False)
# cropped_test_data = crop_sentence_length(test_data, max_sentence_length = MAX_SENTENCE_LENGTH, make_cropped_remains_into_new_instance = False)
# cropped_future_data = crop_sentence_length(data_test, max_sentence_length = MAX_SENTENCE_LENGTH, make_cropped_remains_into_new_instance = False)
# raw_token_pytorch_map = get_raw_token_pytorch_map(data = cropped_train_data, min_frequency = MIN_FREQUENCY)
# train_x, train_y, val_x, val_y, test_x, test_y, train_dom, val_dom, test_dom, future_x = Data_Factory(cropped_train_data, \
#                                                               cropped_val_data, \
#                                                                 cropped_test_data, \
#                                                                     cropped_future_data, \
#                                                                         MAX_SENTENCE_LENGTH, \
#                                                                             raw_token_pytorch_map, \
#                                                                                 CLS=CLS, \
#                                                                                     low_freq_special_token=LOW_FREQ_TOKEN, \
#                                                                                         pad_front=PAD_FRONT)
# pos_prior, neg_prior = get_distribution(train_y)
# print('class prior:', pos_prior, neg_prior)
# pos_dom_prior, neg_dom_prior = get_distribution(train_dom)
# print('domain prior:', pos_dom_prior, neg_dom_prior)
# dom1_pos_prior, dom1_neg_prior = get_distribution([[0, 1] if label == 1 else [1, 0] for label in label1])
# print('dom1 class prior:', dom1_pos_prior, dom1_neg_prior)
# dom2_pos_prior, dom2_neg_prior = get_distribution([[0, 1] if label == 1 else [1, 0] for label in label2])
# print('dom2 class prior:', dom2_pos_prior, dom2_neg_prior)
# # pretrain_x, pretrain_y, pretrain_mask, pretrain_dom, preval_x, preval_y, preval_mask, preval_dom = BERT_pretrain_DataFactory(train_data, val_data, SEED, raw_token_pytorch_map, MAX_SENTENCE_LENGTH)

# print('---')

# class BERT_config:
#     # ----------------- architectural hyperparameters ----------------- #
#     d_model = 256
#     d_ff = 1024 # = 4* d_model
#     n_heads = 8
#     dropout = 0.1
#     e_layers = 8
#     embedding_aggregation = 'cls' # TODO
#     n_mlp_layers = 1
#     res_learning = False
#     activation = nn.ReLU()
#     mask_flag = False # causal mask
#     train_embedding = True
#     # ----------------- optimisation hyperparameters ----------------- #
#     random_state = SEED
#     batch_size = 8
#     epochs = 32
#     lr = 1e-5
#     patience = 10
#     loss = nn.CrossEntropyLoss()
#     # loss = nn.CrossEntropyLoss(weight=torch.FloatTensor([pos_prior, neg_prior]))
#     validation_loss = nn.CrossEntropyLoss()
#     # validation_loss = nn.CrossEntropyLoss(weight=torch.FloatTensor([pos_prior, neg_prior]))
#     pretrain_loss = nn.CrossEntropyLoss()
#     pretrain_validation_loss = nn.CrossEntropyLoss()
#     regularisation_loss = None
#     scheduler = False
#     grad_clip = False
#     # ----------------- operation hyperparameters ----------------- #
#     d_output = 2
#     seq_len = MAX_SENTENCE_LENGTH
#     n_unique_tokens = len(raw_token_pytorch_map)
#     # ----------------- saving hyperparameters ----------------- #
#     rootpath = home_directory + './'
#     saving_address = home_directory + f'./results/'
#     name = f'BERT_Classifier_pretrained'

# pretrained_bert = BERT(BERT_config) # initialise the model
# pretrained_bert.load()

# pretrained_bert_embed = pretrained_bert.model.embedding.embedding.weight

---
### Models


#### 1. DANN

In [None]:
# ----------------- preprocessing settings for the LSTM DANN run ----------------- #
AUX_FEATURES = ['perplexity', 'burstiness', 'length', 'unique_word_ratio', 'domain']
MAX_SENTENCE_LENGTH = 128
MIN_FREQUENCY = 0  # NOTE(review): the inherited comment cited a 40-sample requirement, but the threshold set here is 0 — confirm intent
MAKE_CROPPED_REMAINS_INTO_NEW_INSTANCE = False
LOW_FREQ_TOKEN = False
CLS = True
PAD_FRONT = False
W2V_CONTEXT_WINDOW = 5  # 2 to left, 2 to right

# Crop every split with the same length limit; only the training split is
# given the MAKE_CROPPED_REMAINS_INTO_NEW_INSTANCE setting, the others pass False.
cropped_train_data = crop_sentence_length(
    train_data,
    max_sentence_length=MAX_SENTENCE_LENGTH,
    make_cropped_remains_into_new_instance=MAKE_CROPPED_REMAINS_INTO_NEW_INSTANCE,
)
cropped_val_data, cropped_test_data, cropped_future_data = (
    crop_sentence_length(split, max_sentence_length=MAX_SENTENCE_LENGTH, make_cropped_remains_into_new_instance=False)
    for split in (val_data, test_data, data_test)
)

# the token map is built from the cropped training split only
raw_token_pytorch_map = get_raw_token_pytorch_map(data=cropped_train_data, min_frequency=MIN_FREQUENCY)

(
    train_x, train_y,
    val_x, val_y,
    test_x, test_y,
    train_dom, val_dom, test_dom,
    future_x, future_dom,
) = Data_Factory(
    cropped_train_data,
    cropped_val_data,
    cropped_test_data,
    cropped_future_data,
    MAX_SENTENCE_LENGTH,
    raw_token_pytorch_map,
    CLS=CLS,
    low_freq_special_token=LOW_FREQ_TOKEN,
    pad_front=PAD_FRONT,
)
train_aux, val_aux, test_aux, future_aux = Data_Factory_aux(
    cropped_train_data, cropped_val_data, cropped_test_data, cropped_future_data, AUX_FEATURES
)

# class / domain distributions, printed for reference
pos_prior, neg_prior = get_distribution(train_y)
print('class prior:', pos_prior, neg_prior)
pos_dom_prior, neg_dom_prior = get_distribution(train_dom)
print('domain prior:', pos_dom_prior, neg_dom_prior)
# per-domain priors are computed over ALL labels of the domain (label1 / label2), encoded as two-element lists
dom1_pos_prior, dom1_neg_prior = get_distribution([[1, 0] if y != 1 else [0, 1] for y in label1])
print('dom1 class prior:', dom1_pos_prior, dom1_neg_prior)
dom2_pos_prior, dom2_neg_prior = get_distribution([[1, 0] if y != 1 else [0, 1] for y in label2])
print('dom2 class prior:', dom2_pos_prior, dom2_neg_prior)

print('---')

class LSTM_DANN_config:
    """Hyperparameter container handed to ``LSTM_DANN`` when the model is built."""

    # ----------------- architectural hyperparameters ----------------- #
    d_model = 512
    n_recurrent_layers = 2
    bidirectional = False
    n_heads = 8
    dropout = 0.1
    n_mlp_clf_layers = 1
    n_mlp_dom_layers = 1
    flatten = False
    activation = nn.ReLU()
    res_learning = True
    mask_flag = False  # TODO
    train_embedding = True
    d_extra_decoder_features = len(AUX_FEATURES)  # equals the number of auxiliary features

    # ----------------- optimisation hyperparameters ----------------- #
    random_state = SEED
    batch_size = 128
    epochs = 32
    lr = 1e-5
    patience = 5
    loss = nn.BCELoss()
    # prior-weighted alternative (disabled):
    # loss = nn.BCELoss(weight=torch.FloatTensor([pos_prior, neg_prior]))
    validation_loss = nn.BCELoss()
    # validation_loss = nn.BCELoss(weight=torch.FloatTensor([pos_prior, neg_prior]))
    domain_loss = nn.BCELoss()
    pretrain_loss = nn.CrossEntropyLoss()
    pretrain_validation_loss = nn.CrossEntropyLoss()
    gradient_reversal_every_n_epoch = 1
    alpha = 0.1
    scheduler = True
    grad_clip = False

    # ----------------- operation hyperparameters ----------------- #
    optimised_metric = 'dom1'  # one of 'global', 'dom1', 'dom2'
    d_output = 2
    seq_len = MAX_SENTENCE_LENGTH
    n_unique_tokens = len(raw_token_pytorch_map)

    # ----------------- saving hyperparameters ----------------- #
    rootpath = home_directory + './'
    saving_address = home_directory + './results/'
    name = 'LSTM_DANN'

# build the model from its config class
model = LSTM_DANN(LSTM_DANN_config)

# fit; the auxiliary features are only forwarded when AUX_FEATURES is non-empty
fit_aux_kwargs = {'train_aux': train_aux, 'val_aux': val_aux} if len(AUX_FEATURES) != 0 else {}
best_epoch = model.fit(train_x, train_y, train_dom, val_x, val_y, val_dom, **fit_aux_kwargs)
print()

# reload the saved model (per the original note, fit stores the best epoch automatically),
# then evaluate on the validation split followed by the test split
model.load()
for eval_x, eval_y, eval_dom, eval_aux in ((val_x, val_y, val_dom, val_aux), (test_x, test_y, test_dom, test_aux)):
    eval_aux_kwargs = {'aux': eval_aux} if len(AUX_FEATURES) != 0 else {}
    model.eval(eval_x, eval_y, eval_dom, best_epoch, **eval_aux_kwargs, evaluation_mode=True)

In [None]:
# ----------------- preprocessing settings for the BERT DANN run ----------------- #
AUX_FEATURES = ['perplexity', 'burstiness', 'length', 'unique_word_ratio', 'domain']
MAX_SENTENCE_LENGTH = 256
MIN_FREQUENCY = 40  # because 40 is statistical sample requirement
MAKE_CROPPED_REMAINS_INTO_NEW_INSTANCE = False
LOW_FREQ_TOKEN = False
CLS = True
PAD_FRONT = False
W2V_CONTEXT_WINDOW = 5  # 2 to left, 2 to right

# Crop every split with the same length limit; only the training split is
# given the MAKE_CROPPED_REMAINS_INTO_NEW_INSTANCE setting, the others pass False.
cropped_train_data = crop_sentence_length(
    train_data,
    max_sentence_length=MAX_SENTENCE_LENGTH,
    make_cropped_remains_into_new_instance=MAKE_CROPPED_REMAINS_INTO_NEW_INSTANCE,
)
cropped_val_data, cropped_test_data, cropped_future_data = (
    crop_sentence_length(split, max_sentence_length=MAX_SENTENCE_LENGTH, make_cropped_remains_into_new_instance=False)
    for split in (val_data, test_data, data_test)
)

# the token map is built from the cropped training split only
raw_token_pytorch_map = get_raw_token_pytorch_map(data=cropped_train_data, min_frequency=MIN_FREQUENCY)

(
    train_x, train_y,
    val_x, val_y,
    test_x, test_y,
    train_dom, val_dom, test_dom,
    future_x, future_dom,
) = Data_Factory(
    cropped_train_data,
    cropped_val_data,
    cropped_test_data,
    cropped_future_data,
    MAX_SENTENCE_LENGTH,
    raw_token_pytorch_map,
    CLS=CLS,
    low_freq_special_token=LOW_FREQ_TOKEN,
    pad_front=PAD_FRONT,
)
train_aux, val_aux, test_aux, future_aux = Data_Factory_aux(
    cropped_train_data, cropped_val_data, cropped_test_data, cropped_future_data, AUX_FEATURES
)

# class / domain distributions, printed for reference
pos_prior, neg_prior = get_distribution(train_y)
print('class prior:', pos_prior, neg_prior)
pos_dom_prior, neg_dom_prior = get_distribution(train_dom)
print('domain prior:', pos_dom_prior, neg_dom_prior)
# per-domain priors are computed over ALL labels of the domain (label1 / label2), encoded as two-element lists
dom1_pos_prior, dom1_neg_prior = get_distribution([[1, 0] if y != 1 else [0, 1] for y in label1])
print('dom1 class prior:', dom1_pos_prior, dom1_neg_prior)
dom2_pos_prior, dom2_neg_prior = get_distribution([[1, 0] if y != 1 else [0, 1] for y in label2])
print('dom2 class prior:', dom2_pos_prior, dom2_neg_prior)

print('---')

class BERT_DANN_config:
    """Hyperparameter container handed to ``BERT_DANN`` when the model is built."""

    # ----------------- architectural hyperparameters ----------------- #
    d_model = 256
    d_ff = 1024  # = 4 * d_model
    n_heads = 8
    dropout = 0.1
    e_layers = 8
    embedding_aggregation = 'cls'  # TODO
    n_mlp_clf_layers = 1
    n_mlp_dom_layers = 1
    res_learning = False
    activation = nn.ReLU()
    mask_flag = False  # causal mask
    train_embedding = True
    d_extra_decoder_features = len(AUX_FEATURES)  # equals the number of auxiliary features

    # ----------------- optimisation hyperparameters ----------------- #
    random_state = SEED
    batch_size = 8
    epochs = 32
    lr = 1e-5
    patience = 10
    loss = nn.CrossEntropyLoss()
    # prior-weighted alternatives (disabled):
    # loss = nn.CrossEntropyLoss(weight=torch.FloatTensor([2*pos_prior, 2*neg_prior]))
    validation_loss = nn.CrossEntropyLoss()
    # validation_loss = nn.CrossEntropyLoss(weight=torch.FloatTensor([2*pos_prior, 2*neg_prior]))
    domain_loss = nn.CrossEntropyLoss()
    # domain_loss = nn.CrossEntropyLoss(weight=torch.FloatTensor([2*pos_dom_prior, 2*neg_dom_prior]))
    pretrain_loss = nn.CrossEntropyLoss()
    pretrain_validation_loss = nn.CrossEntropyLoss()
    alpha = 0.1
    gradient_reversal_every_n_epoch = 1
    scheduler = False
    grad_clip = False

    # ----------------- operation hyperparameters ----------------- #
    optimised_metric = 'dom1'  # one of 'global', 'dom1', 'dom2'
    d_output = 2
    seq_len = MAX_SENTENCE_LENGTH
    n_unique_tokens = len(raw_token_pytorch_map)

    # ----------------- saving hyperparameters ----------------- #
    rootpath = home_directory
    saving_address = home_directory + './results/'
    name = 'BERT_DANN'

# build the model from its config class
model = BERT_DANN(BERT_DANN_config)

## PRETRAIN (disabled): options for initialising from the pretrained embeddings / encoder
# model.model.embedding.embedding = nn.Embedding.from_pretrained(w2v_combined_embed, freeze=False)
# model.model.embedding.embedding = nn.Embedding.from_pretrained(w2v_embed, freeze=False)
# model.model.embedding.embedding = nn.Embedding.from_pretrained(pretrained_bert_embed, freeze=False)
# model.model.encoder = pretrained_bert.model.encoder

# fit; the auxiliary features are only forwarded when AUX_FEATURES is non-empty
fit_aux_kwargs = {'train_aux': train_aux, 'val_aux': val_aux} if len(AUX_FEATURES) != 0 else {}
best_epoch = model.fit(train_x, train_y, train_dom, val_x, val_y, val_dom, **fit_aux_kwargs)
print()

# reload the saved model (per the original note, fit stores the best epoch automatically),
# then evaluate on the validation split followed by the test split
model.load()
for eval_x, eval_y, eval_dom, eval_aux in ((val_x, val_y, val_dom, val_aux), (test_x, test_y, test_dom, test_aux)):
    eval_aux_kwargs = {'aux': eval_aux} if len(AUX_FEATURES) != 0 else {}
    model.eval(eval_x, eval_y, eval_dom, best_epoch, **eval_aux_kwargs, evaluation_mode=True)

In [None]:
# EXPERIMENT_NAME = '8bert_1mlpc_1mlpd_256d1024_256d40_8h_0.1_embed_4batch_bwce_low_freq'

# future_pred_y, future_pred_dom = model.predict(future_x)

# future_pred_y = [1 if x[1] > x[0] else 0 for x in future_pred_y]

# predictions = pd.DataFrame({'id': range(len(future_pred_y)), 'class': future_pred_y})
# predictions.to_csv(home_directory + f'predictions/{EXPERIMENT_NAME}_classification.csv', index=False)

100%|██████████| 12600/12600 [00:00<00:00, 15485.68it/s]
100%|██████████| 2700/2700 [00:00<00:00, 25772.33it/s]
100%|██████████| 2700/2700 [00:00<00:00, 3127.14it/s]
100%|██████████| 4000/4000 [00:00<00:00, 41878.65it/s]
100%|██████████| 12600/12600 [00:01<00:00, 12264.92it/s]
100%|██████████| 12600/12600 [00:00<00:00, 37174.52it/s]
100%|██████████| 2700/2700 [00:00<00:00, 40789.31it/s]
100%|██████████| 2700/2700 [00:00<00:00, 35825.73it/s]
100%|██████████| 4000/4000 [00:00<00:00, 39010.34it/s]


class prior: 0.2222222222222222 0.7777777777777778
domain prior: 0.7222222222222222 0.2777777777777778
dom1 class prior: 0.5 0.5
dom2 class prior: 0.11538461538461539 0.8846153846153846


100%|██████████| 12600/12600 [00:02<00:00, 4654.85it/s]
100%|██████████| 2700/2700 [00:00<00:00, 6566.89it/s]


---
Epoch 21 Val   | Classification Loss:  0.1625 | Accuracy:  0.8574| F1:  0.7150 | Balanced Accuracy:  0.8387 | Dom Avg Accuracy:  0.8010 |
                            Domain Loss:  0.5397 | Domain Accuracy:  0.7581 |  
                            Domain 1 Accuracy:  0.7920| Domain 1 F1:  0.8050 | Domain 1 Balanced Accuracy:  0.7920 |  
                            Domain 2 Accuracy:  0.8826| Domain 2 F1:  0.5844 | Domain 2 Balanced Accuracy:  0.8100
Epoch 21 Val   | Classification Loss:  0.1658 | Accuracy:  0.8589| F1:  0.7167 | Balanced Accuracy:  0.8390 | Dom Avg Accuracy:  0.8013 |
                            Domain Loss:  0.5434 | Domain Accuracy:  0.7607 |  
                            Domain 1 Accuracy:  0.7667| Domain 1 F1:  0.7804 | Domain 1 Balanced Accuracy:  0.7667 |  
                            Domain 2 Accuracy:  0.8944| Domain 2 F1:  0.6241 | Domain 2 Balanced Accuracy:  0.8359


---
#### 2. DCE_DANN

In [None]:
# ----------------- preprocessing settings for the LSTM DCE DANN run ----------------- #
AUX_FEATURES = ['perplexity', 'burstiness', 'length', 'unique_word_ratio', 'domain']
MAX_SENTENCE_LENGTH = 128
MIN_FREQUENCY = 0  # NOTE(review): the inherited comment cited a 40-sample requirement, but the threshold set here is 0 — confirm intent
MAKE_CROPPED_REMAINS_INTO_NEW_INSTANCE = False
LOW_FREQ_TOKEN = False
CLS = True
PAD_FRONT = False
W2V_CONTEXT_WINDOW = 5  # 2 to left, 2 to right

# Crop every split with the same length limit; only the training split is
# given the MAKE_CROPPED_REMAINS_INTO_NEW_INSTANCE setting, the others pass False.
cropped_train_data = crop_sentence_length(
    train_data,
    max_sentence_length=MAX_SENTENCE_LENGTH,
    make_cropped_remains_into_new_instance=MAKE_CROPPED_REMAINS_INTO_NEW_INSTANCE,
)
cropped_val_data, cropped_test_data, cropped_future_data = (
    crop_sentence_length(split, max_sentence_length=MAX_SENTENCE_LENGTH, make_cropped_remains_into_new_instance=False)
    for split in (val_data, test_data, data_test)
)

# the token map is built from the cropped training split only
raw_token_pytorch_map = get_raw_token_pytorch_map(data=cropped_train_data, min_frequency=MIN_FREQUENCY)

(
    train_x, train_y,
    val_x, val_y,
    test_x, test_y,
    train_dom, val_dom, test_dom,
    future_x, future_dom,
) = Data_Factory(
    cropped_train_data,
    cropped_val_data,
    cropped_test_data,
    cropped_future_data,
    MAX_SENTENCE_LENGTH,
    raw_token_pytorch_map,
    CLS=CLS,
    low_freq_special_token=LOW_FREQ_TOKEN,
    pad_front=PAD_FRONT,
)
train_aux, val_aux, test_aux, future_aux = Data_Factory_aux(
    cropped_train_data, cropped_val_data, cropped_test_data, cropped_future_data, AUX_FEATURES
)

# class / domain distributions, printed for reference and reused by the loss weights in the config
pos_prior, neg_prior = get_distribution(train_y)
print('class prior:', pos_prior, neg_prior)
pos_dom_prior, neg_dom_prior = get_distribution(train_dom)
print('domain prior:', pos_dom_prior, neg_dom_prior)
# per-domain priors are computed over ALL labels of the domain (label1 / label2), encoded as two-element lists
dom1_pos_prior, dom1_neg_prior = get_distribution([[1, 0] if y != 1 else [0, 1] for y in label1])
print('dom1 class prior:', dom1_pos_prior, dom1_neg_prior)
dom2_pos_prior, dom2_neg_prior = get_distribution([[1, 0] if y != 1 else [0, 1] for y in label2])
print('dom2 class prior:', dom2_pos_prior, dom2_neg_prior)

print('---')

class LSTM_DCE_DANN_config:
    """Hyperparameter container handed to ``LSTM_DCE_DANN`` when the model is built."""

    # ----------------- architectural hyperparameters ----------------- #
    d_model = 512
    n_recurrent_layers = 2
    bidirectional = False
    n_heads = 8
    dropout = 0.1
    n_mlp_clf_layers = 1
    n_mlp_dom_layers = 1
    flatten = False
    activation = nn.ReLU()
    res_learning = True
    mask_flag = False  # TODO
    train_embedding = True
    d_extra_decoder_features = len(AUX_FEATURES)  # equals the number of auxiliary features

    # ----------------- optimisation hyperparameters ----------------- #
    random_state = SEED
    batch_size = 128
    epochs = 32
    lr = 1e-5
    patience = 5
    # classification losses are weighted by twice the class priors; unweighted alternatives kept for reference
    # loss = nn.CrossEntropyLoss()
    loss = nn.CrossEntropyLoss(weight=torch.FloatTensor([2*pos_prior, 2*neg_prior]))
    # validation_loss = nn.CrossEntropyLoss()
    validation_loss = nn.CrossEntropyLoss(weight=torch.FloatTensor([2*pos_prior, 2*neg_prior]))
    # per-domain losses are unweighted; prior-weighted alternatives kept for reference
    domain_1_loss = nn.CrossEntropyLoss()
    # domain_1_loss = nn.CrossEntropyLoss(weight=torch.FloatTensor([2*dom1_pos_prior, 2*dom1_neg_prior]))
    domain_1_validation_loss = nn.CrossEntropyLoss()
    # domain_1_validation_loss = nn.CrossEntropyLoss(weight=torch.FloatTensor([2*dom1_pos_prior, 2*dom1_neg_prior]))
    domain_2_loss = nn.CrossEntropyLoss()
    # domain_2_loss = nn.CrossEntropyLoss(weight=torch.FloatTensor([2*dom2_pos_prior, 2*dom2_neg_prior]))
    domain_2_validation_loss = nn.CrossEntropyLoss()
    # domain_2_validation_loss = nn.CrossEntropyLoss(weight=torch.FloatTensor([2*dom2_pos_prior, 2*dom2_neg_prior]))
    domain_prior = [2*pos_dom_prior, 2*neg_dom_prior]
    # domain_prior = [1, 1]
    pretrain_loss = nn.CrossEntropyLoss()
    pretrain_validation_loss = nn.CrossEntropyLoss()
    domain_loss = nn.CrossEntropyLoss()
    # domain_loss = nn.CrossEntropyLoss(weight=torch.FloatTensor([2*pos_dom_prior, 2*neg_dom_prior]))
    gradient_reversal_every_n_epoch = 1
    alpha = 0.1
    scheduler = True
    grad_clip = False

    # ----------------- operation hyperparameters ----------------- #
    optimised_metric = 'dom1'  # one of 'global', 'dom1', 'dom2'
    d_output = 2
    seq_len = MAX_SENTENCE_LENGTH
    n_unique_tokens = len(raw_token_pytorch_map)

    # ----------------- saving hyperparameters ----------------- #
    rootpath = home_directory + './'
    saving_address = home_directory + './results/'
    name = 'LSTM_DCE_DANN'

# build the model from its config class
model = LSTM_DCE_DANN(LSTM_DCE_DANN_config)

# fit; the auxiliary features are only forwarded when AUX_FEATURES is non-empty
fit_aux_kwargs = {'train_aux': train_aux, 'val_aux': val_aux} if len(AUX_FEATURES) != 0 else {}
best_epoch = model.fit(train_x, train_y, train_dom, val_x, val_y, val_dom, **fit_aux_kwargs)
print()

# reload the saved model (per the original note, fit stores the best epoch automatically),
# then evaluate on the validation split followed by the test split
model.load()
for eval_x, eval_y, eval_dom, eval_aux in ((val_x, val_y, val_dom, val_aux), (test_x, test_y, test_dom, test_aux)):
    eval_aux_kwargs = {'aux': eval_aux} if len(AUX_FEATURES) != 0 else {}
    model.eval(eval_x, eval_y, eval_dom, best_epoch, **eval_aux_kwargs, evaluation_mode=True)

In [None]:
# ----------------- preprocessing settings for the BERT DCE DANN run ----------------- #
AUX_FEATURES = ['perplexity', 'burstiness', 'length', 'unique_word_ratio', 'domain']
MAX_SENTENCE_LENGTH = 256
MIN_FREQUENCY = 40  # because 40 is statistical sample requirement
MAKE_CROPPED_REMAINS_INTO_NEW_INSTANCE = False
LOW_FREQ_TOKEN = False
CLS = True
PAD_FRONT = False
W2V_CONTEXT_WINDOW = 5  # 2 to left, 2 to right

# Crop every split with the same length limit; only the training split is
# given the MAKE_CROPPED_REMAINS_INTO_NEW_INSTANCE setting, the others pass False.
cropped_train_data = crop_sentence_length(
    train_data,
    max_sentence_length=MAX_SENTENCE_LENGTH,
    make_cropped_remains_into_new_instance=MAKE_CROPPED_REMAINS_INTO_NEW_INSTANCE,
)
cropped_val_data, cropped_test_data, cropped_future_data = (
    crop_sentence_length(split, max_sentence_length=MAX_SENTENCE_LENGTH, make_cropped_remains_into_new_instance=False)
    for split in (val_data, test_data, data_test)
)

# the token map is built from the cropped training split only
raw_token_pytorch_map = get_raw_token_pytorch_map(data=cropped_train_data, min_frequency=MIN_FREQUENCY)

(
    train_x, train_y,
    val_x, val_y,
    test_x, test_y,
    train_dom, val_dom, test_dom,
    future_x, future_dom,
) = Data_Factory(
    cropped_train_data,
    cropped_val_data,
    cropped_test_data,
    cropped_future_data,
    MAX_SENTENCE_LENGTH,
    raw_token_pytorch_map,
    CLS=CLS,
    low_freq_special_token=LOW_FREQ_TOKEN,
    pad_front=PAD_FRONT,
)
train_aux, val_aux, test_aux, future_aux = Data_Factory_aux(
    cropped_train_data, cropped_val_data, cropped_test_data, cropped_future_data, AUX_FEATURES
)

# class / domain distributions, printed for reference and reused by the loss weights in the config
pos_prior, neg_prior = get_distribution(train_y)
print('class prior:', pos_prior, neg_prior)
pos_dom_prior, neg_dom_prior = get_distribution(train_dom)
print('domain prior:', pos_dom_prior, neg_dom_prior)
# per-domain priors are computed over ALL labels of the domain (label1 / label2), encoded as two-element lists
dom1_pos_prior, dom1_neg_prior = get_distribution([[1, 0] if y != 1 else [0, 1] for y in label1])
print('dom1 class prior:', dom1_pos_prior, dom1_neg_prior)
dom2_pos_prior, dom2_neg_prior = get_distribution([[1, 0] if y != 1 else [0, 1] for y in label2])
print('dom2 class prior:', dom2_pos_prior, dom2_neg_prior)

print('---')

class BERT_DCE_DANN_config:
    # Hyperparameter namespace for the BERT_DCE_DANN run in this cell. The class
    # object itself (not an instance) is passed to BERT_DCE_DANN(...) below.
    # Attribute values are evaluated once, when this class statement executes, so
    # they capture the notebook globals (priors, AUX_FEATURES, SEED,
    # MAX_SENTENCE_LENGTH, raw_token_pytorch_map, home_directory) as they are now.
    # ----------------- architectural hyperparameters ----------------- #
    d_model = 256
    d_ff = 1024 # = 4* d_model
    n_heads = 8
    dropout = 0.1
    e_layers = 8 # presumably the number of encoder layers -- confirm in model_class
    embedding_aggregation = 'cls' # TODO
    n_mlp_clf_layers = 1
    n_mlp_dom_layers = 1
    res_learning = False
    activation = nn.ReLU()
    mask_flag = False # causal mask
    train_embedding = True
    d_extra_decoder_features = len(AUX_FEATURES) # number of auxiliary feature names
    # ----------------- optimisation hyperparameters ----------------- #
    optimised_metric = 'global' # other values noted by the author: 'dom1', 'dom2'
    random_state = SEED
    batch_size = 8
    epochs = 32
    lr = 1e-5
    patience = 10
    # Unweighted alternative kept for reference:
    # loss = nn.CrossEntropyLoss()
    # Weighted cross-entropy: output index 0 is weighted by 2*pos_prior and
    # index 1 by 2*neg_prior (priors come from get_distribution(train_y)).
    loss = nn.CrossEntropyLoss(weight=torch.FloatTensor([2*pos_prior, 2*neg_prior]))
    # validation_loss = nn.CrossEntropyLoss()
    validation_loss = nn.CrossEntropyLoss(weight=torch.FloatTensor([2*pos_prior, 2*neg_prior]))
    # Per-domain losses are unweighted; the prior-weighted variants are kept
    # commented out underneath each one.
    domain_1_loss = nn.CrossEntropyLoss()
    # domain_1_loss = nn.CrossEntropyLoss(weight=torch.FloatTensor([2*dom1_pos_prior, 2*dom1_neg_prior]))
    domain_1_validation_loss = nn.CrossEntropyLoss()
    # domain_1_validation_loss = nn.CrossEntropyLoss(weight=torch.FloatTensor([2*dom1_pos_prior, 2*dom1_neg_prior]))
    domain_2_loss = nn.CrossEntropyLoss()
    # domain_2_loss = nn.CrossEntropyLoss(weight=torch.FloatTensor([2*dom2_pos_prior, 2*dom2_neg_prior]))
    domain_2_validation_loss = nn.CrossEntropyLoss()
    # domain_2_validation_loss = nn.CrossEntropyLoss(weight=torch.FloatTensor([2*dom2_pos_prior, 2*dom2_neg_prior]))
    # Derived from get_distribution(train_dom); how the model consumes this list
    # is not visible in the notebook.
    domain_prior = [2*pos_dom_prior, 2*neg_dom_prior]
    # domain_prior = [1, 1]
    pretrain_loss = nn.CrossEntropyLoss()
    pretrain_validation_loss = nn.CrossEntropyLoss()
    domain_loss = nn.CrossEntropyLoss()
    # domain_loss = nn.CrossEntropyLoss(weight=torch.FloatTensor([2*pos_dom_prior, 2*neg_dom_prior]))
    alpha = 0.1 # presumably the weight of the adversarial (domain) term -- TODO confirm
    gradient_reversal_every_n_epoch = 1
    regularisation_loss = None
    scheduler = False
    grad_clip = False
    # ----------------- operation hyperparameters ----------------- #
    d_output = 2
    seq_len = MAX_SENTENCE_LENGTH
    n_unique_tokens = len(raw_token_pytorch_map)
    # ----------------- saving hyperparameters ----------------- #
    rootpath = home_directory
    saving_address = home_directory + f'./results/'
    # NOTE(review): the other configs in this part of the notebook are named after
    # their model class; 'BERT_Classifier' may collide with another run's saved
    # results under the same saving_address -- confirm this name is intended.
    name = f'BERT_Classifier'

# Build the model from the config class defined above.
model = BERT_DCE_DANN(BERT_DCE_DANN_config)

## PRETRAIN (optional warm starts, currently disabled)
# model.model.embedding.embedding = nn.Embedding.from_pretrained(w2v_combined_embed, freeze=False)
# model.model.embedding.embedding = nn.Embedding.from_pretrained(w2v_embed, freeze=False)
# model.model.embedding.embedding = nn.Embedding.from_pretrained(pretrained_bert_embed, freeze=False)
# model.model.encoder = pretrained_bert.model.encoder

# Fit the model. The auxiliary feature arrays are forwarded only when at least
# one auxiliary feature name is configured.
if len(AUX_FEATURES) > 0:
    best_epoch = model.fit(
        train_x, train_y, train_dom,
        val_x, val_y, val_dom,
        train_aux=train_aux,
        val_aux=val_aux,
    )
else:
    best_epoch = model.fit(
        train_x, train_y, train_dom,
        val_x, val_y, val_dom,
    )
print()

# Per the author's note, fit() saves the best epoch automatically; reload it
# and evaluate on the validation split, then on the test split.
model.load()
if len(AUX_FEATURES) > 0:
    model.eval(val_x, val_y, val_dom, best_epoch, aux=val_aux, evaluation_mode=True)
    model.eval(test_x, test_y, test_dom, best_epoch, aux=test_aux, evaluation_mode=True)
else:
    model.eval(val_x, val_y, val_dom, best_epoch, evaluation_mode=True)
    model.eval(test_x, test_y, test_dom, best_epoch, evaluation_mode=True)

---
#### 3. Double Decoder DANN

In [None]:
# ---- preprocessing settings for the LSTM DANN double-decoder run ----
AUX_FEATURES = ['perplexity', 'burstiness', 'length', 'unique_word_ratio', 'domain']
MAX_SENTENCE_LENGTH = 128
MIN_FREQUENCY = 0  # forwarded to get_raw_token_pytorch_map as min_frequency
MAKE_CROPPED_REMAINS_INTO_NEW_INSTANCE = False
LOW_FREQ_TOKEN = False
CLS = True
PAD_FRONT = False
W2V_CONTEXT_WINDOW = 5  # author's note: 2 to left, 2 to right

# Crop every split to MAX_SENTENCE_LENGTH. Only the training split honours
# MAKE_CROPPED_REMAINS_INTO_NEW_INSTANCE; the other splits always pass False.
cropped_train_data = crop_sentence_length(
    train_data,
    max_sentence_length=MAX_SENTENCE_LENGTH,
    make_cropped_remains_into_new_instance=MAKE_CROPPED_REMAINS_INTO_NEW_INSTANCE,
)
cropped_val_data = crop_sentence_length(
    val_data,
    max_sentence_length=MAX_SENTENCE_LENGTH,
    make_cropped_remains_into_new_instance=False,
)
cropped_test_data = crop_sentence_length(
    test_data,
    max_sentence_length=MAX_SENTENCE_LENGTH,
    make_cropped_remains_into_new_instance=False,
)
cropped_future_data = crop_sentence_length(
    data_test,
    max_sentence_length=MAX_SENTENCE_LENGTH,
    make_cropped_remains_into_new_instance=False,
)

# The token map is built from the cropped training data only.
raw_token_pytorch_map = get_raw_token_pytorch_map(
    data=cropped_train_data,
    min_frequency=MIN_FREQUENCY,
)

# Model inputs, labels and domain labels for every split.
(
    train_x, train_y,
    val_x, val_y,
    test_x, test_y,
    train_dom, val_dom, test_dom,
    future_x, future_dom,
) = Data_Factory(
    cropped_train_data,
    cropped_val_data,
    cropped_test_data,
    cropped_future_data,
    MAX_SENTENCE_LENGTH,
    raw_token_pytorch_map,
    CLS=CLS,
    low_freq_special_token=LOW_FREQ_TOKEN,
    pad_front=PAD_FRONT,
)

# Auxiliary features named in AUX_FEATURES, one result per split.
train_aux, val_aux, test_aux, future_aux = Data_Factory_aux(
    cropped_train_data,
    cropped_val_data,
    cropped_test_data,
    cropped_future_data,
    AUX_FEATURES,
)

# Class and domain distributions of the training split.
pos_prior, neg_prior = get_distribution(train_y)
print('class prior:', pos_prior, neg_prior)
pos_dom_prior, neg_dom_prior = get_distribution(train_dom)
print('domain prior:', pos_dom_prior, neg_dom_prior)

# Per-domain class distributions. label1/label2 cover each whole domain file
# (not only the training split); labels are one-hot encoded first
# (label 1 -> [0, 1], anything else -> [1, 0]).
dom1_pos_prior, dom1_neg_prior = get_distribution(
    [([0, 1] if y == 1 else [1, 0]) for y in label1]
)
print('dom1 class prior:', dom1_pos_prior, dom1_neg_prior)
dom2_pos_prior, dom2_neg_prior = get_distribution(
    [([0, 1] if y == 1 else [1, 0]) for y in label2]
)
print('dom2 class prior:', dom2_pos_prior, dom2_neg_prior)

print('---')

class LSTM_DANN_DoubleDecoder_config:
    # Hyperparameter namespace for the LSTM_DANN_DoubleDecoder run in this cell.
    # The class object itself (not an instance) is passed to
    # LSTM_DANN_DoubleDecoder(...) below. Values are evaluated once, when this
    # class statement executes, from the notebook globals defined earlier
    # (AUX_FEATURES, SEED, MAX_SENTENCE_LENGTH, raw_token_pytorch_map, home_directory).
    # ----------------- architectural hyperparameters ----------------- #
    d_model = 512
    n_recurrent_layers = 2
    bidirectional = False
    n_heads = 8
    dropout = 0.1
    n_mlp_clf_layers = 1
    n_mlp_dom_layers = 1
    flatten = False
    activation = nn.ReLU()
    res_learning = True
    mask_flag = False # TODO
    train_embedding = True
    d_extra_decoder_features = len(AUX_FEATURES) # number of auxiliary feature names
    # ----------------- optimisation hyperparameters ----------------- #
    random_state = SEED
    batch_size = 128
    epochs = 32
    lr = 1e-5
    patience = 5
    # All losses in this run are unweighted; the prior-weighted alternatives are
    # kept commented out underneath each one.
    loss = nn.CrossEntropyLoss()
    # loss = nn.CrossEntropyLoss(weight=torch.FloatTensor([2*pos_prior, 2*neg_prior]))
    validation_loss = nn.CrossEntropyLoss()
    # validation_loss = nn.CrossEntropyLoss(weight=torch.FloatTensor([2*pos_prior, 2*neg_prior]))
    domain_loss = nn.CrossEntropyLoss()
    # domain_loss = nn.CrossEntropyLoss(weight=torch.FloatTensor([2*pos_dom_prior, 2*neg_dom_prior]))
    pretrain_loss = nn.CrossEntropyLoss()
    pretrain_validation_loss = nn.CrossEntropyLoss()
    gradient_reversal_every_n_epoch = 1
    alpha = 0.1 # presumably the weight of the adversarial (domain) term -- TODO confirm
    scheduler = True
    grad_clip = False
    # ----------------- operation hyperparameters ----------------- #
    optimised_metric = 'dom1' # 'global', 'dom1', 'dom2'
    d_output = 2
    seq_len = MAX_SENTENCE_LENGTH
    n_unique_tokens = len(raw_token_pytorch_map)
    # ----------------- saving hyperparameters ----------------- #
    rootpath = home_directory + './'
    saving_address = home_directory +  f'./results/'
    name = f'LSTM_DANN_DoubleDecoder'

# Build the model from the config class defined above.
model = LSTM_DANN_DoubleDecoder(LSTM_DANN_DoubleDecoder_config)

# Fit the model. The auxiliary feature arrays are forwarded only when at least
# one auxiliary feature name is configured.
if len(AUX_FEATURES) > 0:
    best_epoch = model.fit(
        train_x, train_y, train_dom,
        val_x, val_y, val_dom,
        train_aux=train_aux,
        val_aux=val_aux,
    )
else:
    best_epoch = model.fit(
        train_x, train_y, train_dom,
        val_x, val_y, val_dom,
    )
print()

# Per the author's note, fit() saves the best epoch automatically; reload it
# and evaluate on the validation split, then on the test split.
model.load()
if len(AUX_FEATURES) > 0:
    model.eval(val_x, val_y, val_dom, best_epoch, aux=val_aux, evaluation_mode=True)
    model.eval(test_x, test_y, test_dom, best_epoch, aux=test_aux, evaluation_mode=True)
else:
    model.eval(val_x, val_y, val_dom, best_epoch, evaluation_mode=True)
    model.eval(test_x, test_y, test_dom, best_epoch, evaluation_mode=True)

In [None]:
# ---- preprocessing settings for the BERT DANN double-decoder run ----
AUX_FEATURES = ['perplexity', 'burstiness', 'length', 'unique_word_ratio', 'domain']
MAX_SENTENCE_LENGTH = 256
MIN_FREQUENCY = 40 # because 40 is statistical sample requirement
MAKE_CROPPED_REMAINS_INTO_NEW_INSTANCE = False
LOW_FREQ_TOKEN = False
CLS = True
PAD_FRONT = False
W2V_CONTEXT_WINDOW = 5 # 2 to left, 2 to right

# Smoke-test switch. The original cell unconditionally cut the training and
# validation arrays down to their first 10 rows (a debugging leftover), so the
# "trained" model and its validation scores were based on 10 samples only.
# Leave this at None for a real run; set it to a small integer (e.g. 10) to
# reproduce the quick end-to-end check.
DEBUG_SUBSET_SIZE = None

# Crop every split to MAX_SENTENCE_LENGTH. Only the training split honours
# MAKE_CROPPED_REMAINS_INTO_NEW_INSTANCE; the other splits always pass False.
cropped_train_data = crop_sentence_length(train_data, max_sentence_length = MAX_SENTENCE_LENGTH, make_cropped_remains_into_new_instance = MAKE_CROPPED_REMAINS_INTO_NEW_INSTANCE)
cropped_val_data = crop_sentence_length(val_data, max_sentence_length = MAX_SENTENCE_LENGTH, make_cropped_remains_into_new_instance = False)
cropped_test_data = crop_sentence_length(test_data, max_sentence_length = MAX_SENTENCE_LENGTH, make_cropped_remains_into_new_instance = False)
cropped_future_data = crop_sentence_length(data_test, max_sentence_length = MAX_SENTENCE_LENGTH, make_cropped_remains_into_new_instance = False)

# The token map is built from the cropped training data only.
raw_token_pytorch_map = get_raw_token_pytorch_map(data = cropped_train_data, min_frequency = MIN_FREQUENCY)

# Model inputs, labels and domain labels for every split.
(
    train_x, train_y,
    val_x, val_y,
    test_x, test_y,
    train_dom, val_dom, test_dom,
    future_x, future_dom,
) = Data_Factory(
    cropped_train_data,
    cropped_val_data,
    cropped_test_data,
    cropped_future_data,
    MAX_SENTENCE_LENGTH,
    raw_token_pytorch_map,
    CLS=CLS,
    low_freq_special_token=LOW_FREQ_TOKEN,
    pad_front=PAD_FRONT,
)

# Auxiliary features named in AUX_FEATURES, one result per split.
train_aux, val_aux, test_aux, future_aux = Data_Factory_aux(cropped_train_data, cropped_val_data, cropped_test_data, cropped_future_data, AUX_FEATURES)

# Priors are computed before any debug truncation (same order as the original
# cell), so they always describe the full data.
pos_prior, neg_prior = get_distribution(train_y)
print('class prior:', pos_prior, neg_prior)
pos_dom_prior, neg_dom_prior = get_distribution(train_dom)
print('domain prior:', pos_dom_prior, neg_dom_prior)
# label1/label2 cover each whole domain file; labels are one-hot encoded first
# (label 1 -> [0, 1], anything else -> [1, 0]).
dom1_pos_prior, dom1_neg_prior = get_distribution([[0, 1] if label == 1 else [1, 0] for label in label1])
print('dom1 class prior:', dom1_pos_prior, dom1_neg_prior)
dom2_pos_prior, dom2_neg_prior = get_distribution([[0, 1] if label == 1 else [1, 0] for label in label2])
print('dom2 class prior:', dom2_pos_prior, dom2_neg_prior)

# Optional truncation for quick smoke tests; the test split is never truncated.
if DEBUG_SUBSET_SIZE is not None:
    train_x = train_x[:DEBUG_SUBSET_SIZE]
    train_y = train_y[:DEBUG_SUBSET_SIZE]
    train_dom = train_dom[:DEBUG_SUBSET_SIZE]
    train_aux = train_aux[:DEBUG_SUBSET_SIZE]
    val_x = val_x[:DEBUG_SUBSET_SIZE]
    val_y = val_y[:DEBUG_SUBSET_SIZE]
    val_dom = val_dom[:DEBUG_SUBSET_SIZE]
    val_aux = val_aux[:DEBUG_SUBSET_SIZE]

print('---')

class BERT_DANN_DoubleDecoder_config:
    # Hyperparameter namespace for the BERT_DANN_DoubleDecoder run in this cell.
    # The class object itself (not an instance) is passed to
    # BERT_DANN_DoubleDecoder(...) below. Values are evaluated once, when this
    # class statement executes, from the notebook globals defined earlier
    # (AUX_FEATURES, SEED, MAX_SENTENCE_LENGTH, raw_token_pytorch_map, home_directory).
    # ----------------- architectural hyperparameters ----------------- #
    d_model = 256
    d_ff = 1024 # = 4* d_model
    n_heads = 8
    dropout = 0.1
    e_layers = 8 # presumably the number of encoder layers -- confirm in model_class
    embedding_aggregation = 'cls' # TODO
    n_mlp_clf_layers = 1
    n_mlp_dom_layers = 1
    res_learning = False
    activation = nn.ReLU()
    mask_flag = False # causal mask
    train_embedding = True
    d_extra_decoder_features = len(AUX_FEATURES) # number of auxiliary feature names
    # ----------------- optimisation hyperparameters ----------------- #
    random_state = SEED
    batch_size = 8
    epochs = 32
    lr = 1e-5
    patience = 10
    # All losses in this run are unweighted; the prior-weighted alternatives are
    # kept commented out underneath each one.
    loss = nn.CrossEntropyLoss()
    # loss = nn.CrossEntropyLoss(weight=torch.FloatTensor([2*pos_prior, 2*neg_prior]))
    validation_loss = nn.CrossEntropyLoss()
    # validation_loss = nn.CrossEntropyLoss(weight=torch.FloatTensor([2*pos_prior, 2*neg_prior]))
    domain_loss = nn.CrossEntropyLoss()
    # domain_loss = nn.CrossEntropyLoss(weight=torch.FloatTensor([2*pos_dom_prior, 2*neg_dom_prior]))
    pretrain_loss = nn.CrossEntropyLoss()
    pretrain_validation_loss = nn.CrossEntropyLoss()
    alpha = 0.1 # presumably the weight of the adversarial (domain) term -- TODO confirm
    gradient_reversal_every_n_epoch = 1
    regularisation_loss = None
    scheduler = False
    grad_clip = False
    # ----------------- operation hyperparameters ----------------- #
    optimised_metric = 'dom1' # 'global', 'dom1', 'dom2'
    d_output = 2
    seq_len = MAX_SENTENCE_LENGTH
    n_unique_tokens = len(raw_token_pytorch_map)
    # ----------------- saving hyperparameters ----------------- #
    rootpath = home_directory
    saving_address = home_directory + f'./results/'
    name = f'BERT_DANN_DoubleDecoder'

# Build the model from the config class defined above.
model = BERT_DANN_DoubleDecoder(BERT_DANN_DoubleDecoder_config)

## PRETRAIN (optional warm starts, currently disabled)
# model.model.embedding.embedding = nn.Embedding.from_pretrained(w2v_combined_embed, freeze=False)
# model.model.embedding.embedding = nn.Embedding.from_pretrained(w2v_embed, freeze=False)
# model.model.embedding.embedding = nn.Embedding.from_pretrained(pretrained_bert_embed, freeze=False)
# model.model.encoder = pretrained_bert.model.encoder

# Fit the model. The auxiliary feature arrays are forwarded only when at least
# one auxiliary feature name is configured.
if len(AUX_FEATURES) > 0:
    best_epoch = model.fit(
        train_x, train_y, train_dom,
        val_x, val_y, val_dom,
        train_aux=train_aux,
        val_aux=val_aux,
    )
else:
    best_epoch = model.fit(
        train_x, train_y, train_dom,
        val_x, val_y, val_dom,
    )
print()

# Per the author's note, fit() saves the best epoch automatically; reload it
# and evaluate on the validation split, then on the test split.
model.load()
if len(AUX_FEATURES) > 0:
    model.eval(val_x, val_y, val_dom, best_epoch, aux=val_aux, evaluation_mode=True)
    model.eval(test_x, test_y, test_dom, best_epoch, aux=test_aux, evaluation_mode=True)
else:
    model.eval(val_x, val_y, val_dom, best_epoch, evaluation_mode=True)
    model.eval(test_x, test_y, test_dom, best_epoch, evaluation_mode=True)

---
#### 4. Double Decoder DANN_DCE

In [None]:
# ---- preprocessing settings for the LSTM DCE-DANN double-decoder run ----
AUX_FEATURES = ['perplexity', 'burstiness', 'length', 'unique_word_ratio', 'domain']
MAX_SENTENCE_LENGTH = 128
MIN_FREQUENCY = 0  # forwarded to get_raw_token_pytorch_map as min_frequency
MAKE_CROPPED_REMAINS_INTO_NEW_INSTANCE = False
LOW_FREQ_TOKEN = False
CLS = True
PAD_FRONT = False
W2V_CONTEXT_WINDOW = 5  # author's note: 2 to left, 2 to right

# Crop every split to MAX_SENTENCE_LENGTH. Only the training split honours
# MAKE_CROPPED_REMAINS_INTO_NEW_INSTANCE; the other splits always pass False.
cropped_train_data = crop_sentence_length(
    train_data,
    max_sentence_length=MAX_SENTENCE_LENGTH,
    make_cropped_remains_into_new_instance=MAKE_CROPPED_REMAINS_INTO_NEW_INSTANCE,
)
cropped_val_data = crop_sentence_length(
    val_data,
    max_sentence_length=MAX_SENTENCE_LENGTH,
    make_cropped_remains_into_new_instance=False,
)
cropped_test_data = crop_sentence_length(
    test_data,
    max_sentence_length=MAX_SENTENCE_LENGTH,
    make_cropped_remains_into_new_instance=False,
)
cropped_future_data = crop_sentence_length(
    data_test,
    max_sentence_length=MAX_SENTENCE_LENGTH,
    make_cropped_remains_into_new_instance=False,
)

# The token map is built from the cropped training data only.
raw_token_pytorch_map = get_raw_token_pytorch_map(
    data=cropped_train_data,
    min_frequency=MIN_FREQUENCY,
)

# Model inputs, labels and domain labels for every split.
(
    train_x, train_y,
    val_x, val_y,
    test_x, test_y,
    train_dom, val_dom, test_dom,
    future_x, future_dom,
) = Data_Factory(
    cropped_train_data,
    cropped_val_data,
    cropped_test_data,
    cropped_future_data,
    MAX_SENTENCE_LENGTH,
    raw_token_pytorch_map,
    CLS=CLS,
    low_freq_special_token=LOW_FREQ_TOKEN,
    pad_front=PAD_FRONT,
)

# Auxiliary features named in AUX_FEATURES, one result per split.
train_aux, val_aux, test_aux, future_aux = Data_Factory_aux(
    cropped_train_data,
    cropped_val_data,
    cropped_test_data,
    cropped_future_data,
    AUX_FEATURES,
)

# Class and domain distributions of the training split.
pos_prior, neg_prior = get_distribution(train_y)
print('class prior:', pos_prior, neg_prior)
pos_dom_prior, neg_dom_prior = get_distribution(train_dom)
print('domain prior:', pos_dom_prior, neg_dom_prior)

# Per-domain class distributions. label1/label2 cover each whole domain file
# (not only the training split); labels are one-hot encoded first
# (label 1 -> [0, 1], anything else -> [1, 0]).
dom1_pos_prior, dom1_neg_prior = get_distribution(
    [([0, 1] if y == 1 else [1, 0]) for y in label1]
)
print('dom1 class prior:', dom1_pos_prior, dom1_neg_prior)
dom2_pos_prior, dom2_neg_prior = get_distribution(
    [([0, 1] if y == 1 else [1, 0]) for y in label2]
)
print('dom2 class prior:', dom2_pos_prior, dom2_neg_prior)

print('---')

class LSTM_DANN_DCE_DoubleDecoder_config:
    # Hyperparameter namespace for the LSTM DCE-DANN double-decoder run in this
    # cell. The class object itself (not an instance) is handed to the model
    # constructor below. Values are evaluated once, when this class statement
    # executes, from the notebook globals defined earlier (priors, AUX_FEATURES,
    # SEED, MAX_SENTENCE_LENGTH, raw_token_pytorch_map, home_directory).
    # ----------------- architectural hyperparameters ----------------- #
    d_model = 512
    n_recurrent_layers = 2
    bidirectional = False
    n_heads = 8
    dropout = 0.1
    n_mlp_clf_layers = 1
    n_mlp_dom_layers = 1
    flatten = False
    activation = nn.ReLU()
    res_learning = True
    mask_flag = False # TODO
    train_embedding = True
    d_extra_decoder_features = len(AUX_FEATURES) # number of auxiliary feature names
    # ----------------- optimisation hyperparameters ----------------- #
    random_state = SEED
    batch_size = 128
    epochs = 32
    lr = 1e-5
    patience = 5
    # loss = nn.CrossEntropyLoss()
    # Weighted cross-entropy: output index 0 is weighted by 2*pos_prior and
    # index 1 by 2*neg_prior (priors come from get_distribution(train_y)).
    loss = nn.CrossEntropyLoss(weight=torch.FloatTensor([2*pos_prior, 2*neg_prior]))
    # validation_loss = nn.CrossEntropyLoss()
    validation_loss = nn.CrossEntropyLoss(weight=torch.FloatTensor([2*pos_prior, 2*neg_prior]))
    # Per-domain losses are unweighted; the prior-weighted variants are kept
    # commented out underneath each one.
    domain_1_loss = nn.CrossEntropyLoss()
    # domain_1_loss = nn.CrossEntropyLoss(weight=torch.FloatTensor([2*dom1_pos_prior, 2*dom1_neg_prior]))
    domain_1_validation_loss = nn.CrossEntropyLoss()
    # domain_1_validation_loss = nn.CrossEntropyLoss(weight=torch.FloatTensor([2*dom1_pos_prior, 2*dom1_neg_prior]))
    domain_2_loss = nn.CrossEntropyLoss()
    # domain_2_loss = nn.CrossEntropyLoss(weight=torch.FloatTensor([2*dom2_pos_prior, 2*dom2_neg_prior]))
    domain_2_validation_loss = nn.CrossEntropyLoss()
    # domain_2_validation_loss = nn.CrossEntropyLoss(weight=torch.FloatTensor([2*dom2_pos_prior, 2*dom2_neg_prior]))
    # Derived from get_distribution(train_dom); how the model consumes this list
    # is not visible in the notebook.
    domain_prior = [2*pos_dom_prior, 2*neg_dom_prior]
    # domain_prior = [1, 1]
    pretrain_loss = nn.CrossEntropyLoss()
    pretrain_validation_loss = nn.CrossEntropyLoss()
    domain_loss = nn.CrossEntropyLoss()
    # domain_loss = nn.CrossEntropyLoss(weight=torch.FloatTensor([2*pos_dom_prior, 2*neg_dom_prior]))
    gradient_reversal_every_n_epoch = 1
    alpha = 0.1 # presumably the weight of the adversarial (domain) term -- TODO confirm
    scheduler = True
    grad_clip = False
    # ----------------- operation hyperparameters ----------------- #
    optimised_metric = 'dom1' # 'global', 'dom1', 'dom2'
    d_output = 2
    seq_len = MAX_SENTENCE_LENGTH
    n_unique_tokens = len(raw_token_pytorch_map)
    # ----------------- saving hyperparameters ----------------- #
    rootpath = home_directory + './'
    saving_address = home_directory + './results/'
    # Was 'LSTM_DANN_DoubleDecoder', i.e. the same name (and saving_address) as
    # the plain LSTM_DANN_DoubleDecoder config earlier in the notebook, so the
    # two runs would share and overwrite each other's saved results. The new
    # name follows the BERT_DCE_DANN_DoubleDecoder naming used for the BERT run.
    name = 'LSTM_DCE_DANN_DoubleDecoder'

# Build the model from the config class defined above.
# The notebook imports the class as LSTM_DCE_DANN_DoubleDecoder; the original
# line called LSTM_DANN_DCE_DoubleDecoder, which is never imported and raised
# NameError on a fresh kernel.
model = LSTM_DCE_DANN_DoubleDecoder(LSTM_DANN_DCE_DoubleDecoder_config)

# Fit the model. The auxiliary feature arrays are forwarded only when at least
# one auxiliary feature name is configured.
if len(AUX_FEATURES) == 0:
    best_epoch = model.fit(train_x, train_y, train_dom, val_x, val_y, val_dom)
else:
    best_epoch = model.fit(train_x, train_y, train_dom, val_x, val_y, val_dom, train_aux = train_aux, val_aux = val_aux)
print()

# Per the author's note, fit() saves the best epoch automatically; reload it
# and evaluate on the validation split, then on the test split.
model.load()
if len(AUX_FEATURES) == 0:
    model.eval(val_x, val_y, val_dom, best_epoch, evaluation_mode = True)
    model.eval(test_x, test_y, test_dom, best_epoch, evaluation_mode = True)
else:
    model.eval(val_x, val_y, val_dom, best_epoch, aux = val_aux, evaluation_mode = True)
    model.eval(test_x, test_y, test_dom, best_epoch, aux = test_aux, evaluation_mode = True)

In [None]:
# BERT DCE-DANN double decoder, cross-entropy loss experiment
# (author's original note: "BERT DBCE - celoss expr - just bal domain").

# ---- preprocessing settings for this run ----
AUX_FEATURES = ['perplexity', 'burstiness', 'length', 'unique_word_ratio', 'domain']
MAX_SENTENCE_LENGTH = 256
MIN_FREQUENCY = 40  # author's note: 40 is the statistical sample requirement
MAKE_CROPPED_REMAINS_INTO_NEW_INSTANCE = False
LOW_FREQ_TOKEN = False
CLS = True
PAD_FRONT = False
W2V_CONTEXT_WINDOW = 5  # author's note: 2 to left, 2 to right

# Crop every split to MAX_SENTENCE_LENGTH. Only the training split honours
# MAKE_CROPPED_REMAINS_INTO_NEW_INSTANCE; the other splits always pass False.
cropped_train_data = crop_sentence_length(
    train_data,
    max_sentence_length=MAX_SENTENCE_LENGTH,
    make_cropped_remains_into_new_instance=MAKE_CROPPED_REMAINS_INTO_NEW_INSTANCE,
)
cropped_val_data = crop_sentence_length(
    val_data,
    max_sentence_length=MAX_SENTENCE_LENGTH,
    make_cropped_remains_into_new_instance=False,
)
cropped_test_data = crop_sentence_length(
    test_data,
    max_sentence_length=MAX_SENTENCE_LENGTH,
    make_cropped_remains_into_new_instance=False,
)
cropped_future_data = crop_sentence_length(
    data_test,
    max_sentence_length=MAX_SENTENCE_LENGTH,
    make_cropped_remains_into_new_instance=False,
)

# The token map is built from the cropped training data only.
raw_token_pytorch_map = get_raw_token_pytorch_map(
    data=cropped_train_data,
    min_frequency=MIN_FREQUENCY,
)

# Model inputs, labels and domain labels for every split.
(
    train_x, train_y,
    val_x, val_y,
    test_x, test_y,
    train_dom, val_dom, test_dom,
    future_x, future_dom,
) = Data_Factory(
    cropped_train_data,
    cropped_val_data,
    cropped_test_data,
    cropped_future_data,
    MAX_SENTENCE_LENGTH,
    raw_token_pytorch_map,
    CLS=CLS,
    low_freq_special_token=LOW_FREQ_TOKEN,
    pad_front=PAD_FRONT,
)

# Auxiliary features named in AUX_FEATURES, one result per split.
train_aux, val_aux, test_aux, future_aux = Data_Factory_aux(
    cropped_train_data,
    cropped_val_data,
    cropped_test_data,
    cropped_future_data,
    AUX_FEATURES,
)

# Class and domain distributions of the training split.
pos_prior, neg_prior = get_distribution(train_y)
print('class prior:', pos_prior, neg_prior)
pos_dom_prior, neg_dom_prior = get_distribution(train_dom)
print('domain prior:', pos_dom_prior, neg_dom_prior)

# Per-domain class distributions. label1/label2 cover each whole domain file
# (not only the training split); labels are one-hot encoded first
# (label 1 -> [0, 1], anything else -> [1, 0]).
dom1_pos_prior, dom1_neg_prior = get_distribution(
    [([0, 1] if y == 1 else [1, 0]) for y in label1]
)
print('dom1 class prior:', dom1_pos_prior, dom1_neg_prior)
dom2_pos_prior, dom2_neg_prior = get_distribution(
    [([0, 1] if y == 1 else [1, 0]) for y in label2]
)
print('dom2 class prior:', dom2_pos_prior, dom2_neg_prior)

print('---')

class BERT_DCE_DANN_DoubleDecoder_config:
    # Hyperparameter namespace for the BERT_DCE_DANN_DoubleDecoder run in this
    # cell. The class object itself (not an instance) is passed to
    # BERT_DCE_DANN_DoubleDecoder(...) below. Values are evaluated once, when
    # this class statement executes, from the notebook globals defined earlier
    # (priors, AUX_FEATURES, SEED, MAX_SENTENCE_LENGTH, raw_token_pytorch_map,
    # home_directory).
    # ----------------- architectural hyperparameters ----------------- #
    d_model = 256
    d_ff = 1024 # = 4* d_model
    n_heads = 8
    dropout = 0.1
    e_layers = 8 # presumably the number of encoder layers -- confirm in model_class
    embedding_aggregation = 'cls' # TODO
    n_mlp_clf_layers = 1
    n_mlp_dom_layers = 1
    res_learning = False
    activation = nn.ReLU()
    mask_flag = False # causal mask
    train_embedding = True
    d_extra_decoder_features = len(AUX_FEATURES) # number of auxiliary feature names
    # ----------------- optimisation hyperparameters ----------------- #
    optimised_metric = 'global' # other values noted by the author: 'dom1', 'dom2'
    random_state = SEED
    batch_size = 8
    epochs = 32
    lr = 1e-5
    patience = 10
    # loss = nn.CrossEntropyLoss()
    # Weighted cross-entropy: output index 0 is weighted by 2*pos_prior and
    # index 1 by 2*neg_prior (priors come from get_distribution(train_y)).
    loss = nn.CrossEntropyLoss(weight=torch.FloatTensor([2*pos_prior, 2*neg_prior]))
    # validation_loss = nn.CrossEntropyLoss()
    validation_loss = nn.CrossEntropyLoss(weight=torch.FloatTensor([2*pos_prior, 2*neg_prior]))
    # Per-domain losses are unweighted; the prior-weighted variants are kept
    # commented out underneath each one.
    domain_1_loss = nn.CrossEntropyLoss()
    # domain_1_loss = nn.CrossEntropyLoss(weight=torch.FloatTensor([2*dom1_pos_prior, 2*dom1_neg_prior]))
    domain_1_validation_loss = nn.CrossEntropyLoss()
    # domain_1_validation_loss = nn.CrossEntropyLoss(weight=torch.FloatTensor([2*dom1_pos_prior, 2*dom1_neg_prior]))
    domain_2_loss = nn.CrossEntropyLoss()
    # domain_2_loss = nn.CrossEntropyLoss(weight=torch.FloatTensor([2*dom2_pos_prior, 2*dom2_neg_prior]))
    domain_2_validation_loss = nn.CrossEntropyLoss()
    # domain_2_validation_loss = nn.CrossEntropyLoss(weight=torch.FloatTensor([2*dom2_pos_prior, 2*dom2_neg_prior]))
    # Derived from get_distribution(train_dom); how the model consumes this list
    # is not visible in the notebook.
    domain_prior = [2*pos_dom_prior, 2*neg_dom_prior]
    # domain_prior = [1, 1]
    pretrain_loss = nn.CrossEntropyLoss()
    pretrain_validation_loss = nn.CrossEntropyLoss()
    domain_loss = nn.CrossEntropyLoss()
    # domain_loss = nn.CrossEntropyLoss(weight=torch.FloatTensor([2*pos_dom_prior, 2*neg_dom_prior]))
    alpha = 0.1 # presumably the weight of the adversarial (domain) term -- TODO confirm
    gradient_reversal_every_n_epoch = 1
    regularisation_loss = None
    scheduler = False
    grad_clip = False
    # ----------------- operation hyperparameters ----------------- #
    d_output = 2
    seq_len = MAX_SENTENCE_LENGTH
    n_unique_tokens = len(raw_token_pytorch_map)
    # ----------------- saving hyperparameters ----------------- #
    rootpath = home_directory
    saving_address = home_directory + f'./results/'
    name = f'BERT_DCE_DANN_DoubleDecoder'

# Build the model from the config class defined above.
model = BERT_DCE_DANN_DoubleDecoder(BERT_DCE_DANN_DoubleDecoder_config)

## PRETRAIN (optional warm starts, currently disabled)
# model.model.embedding.embedding = nn.Embedding.from_pretrained(w2v_combined_embed, freeze=False)
# model.model.embedding.embedding = nn.Embedding.from_pretrained(w2v_embed, freeze=False)
# model.model.embedding.embedding = nn.Embedding.from_pretrained(pretrained_bert_embed, freeze=False)
# model.model.encoder = pretrained_bert.model.encoder

# Fit the model. The auxiliary feature arrays are forwarded only when at least
# one auxiliary feature name is configured.
if len(AUX_FEATURES) > 0:
    best_epoch = model.fit(
        train_x, train_y, train_dom,
        val_x, val_y, val_dom,
        train_aux=train_aux,
        val_aux=val_aux,
    )
else:
    best_epoch = model.fit(
        train_x, train_y, train_dom,
        val_x, val_y, val_dom,
    )
print()

# Per the author's note, fit() saves the best epoch automatically; reload it
# and evaluate on the validation split, then on the test split.
model.load()
if len(AUX_FEATURES) > 0:
    model.eval(val_x, val_y, val_dom, best_epoch, aux=val_aux, evaluation_mode=True)
    model.eval(test_x, test_y, test_dom, best_epoch, aux=test_aux, evaluation_mode=True)
else:
    model.eval(val_x, val_y, val_dom, best_epoch, evaluation_mode=True)
    model.eval(test_x, test_y, test_dom, best_epoch, evaluation_mode=True)

---
# Hinge Loss

In [None]:
# BERT - hinge work in progress

# ---- preprocessing settings for the hinge-loss run ----
MAX_SENTENCE_LENGTH = 256
MIN_FREQUENCY = 40 # because 40 is statistical sample requirement
MAKE_CROPPED_REMAINS_INTO_NEW_INSTANCE = False
LOW_FREQ_TOKEN = False
CLS = True
PAD_FRONT = False
W2V_CONTEXT_WINDOW = 5 # 2 to left, 2 to right

# Crop every split to MAX_SENTENCE_LENGTH. Only the training split honours
# MAKE_CROPPED_REMAINS_INTO_NEW_INSTANCE; the other splits always pass False.
cropped_train_data = crop_sentence_length(train_data, max_sentence_length = MAX_SENTENCE_LENGTH, make_cropped_remains_into_new_instance = MAKE_CROPPED_REMAINS_INTO_NEW_INSTANCE)
cropped_val_data = crop_sentence_length(val_data, max_sentence_length = MAX_SENTENCE_LENGTH, make_cropped_remains_into_new_instance = False)
cropped_test_data = crop_sentence_length(test_data, max_sentence_length = MAX_SENTENCE_LENGTH, make_cropped_remains_into_new_instance = False)
cropped_future_data = crop_sentence_length(data_test, max_sentence_length = MAX_SENTENCE_LENGTH, make_cropped_remains_into_new_instance = False)

# The token map is built from the cropped training data only.
raw_token_pytorch_map = get_raw_token_pytorch_map(data = cropped_train_data, min_frequency = MIN_FREQUENCY)

# Every other Data_Factory call in this notebook unpacks eleven values (the
# last being future_dom). The original ten-name target here dropped future_dom
# and would fail with "too many values to unpack".
(
    train_x, train_y,
    val_x, val_y,
    test_x, test_y,
    train_dom, val_dom, test_dom,
    future_x, future_dom,
) = Data_Factory(
    cropped_train_data,
    cropped_val_data,
    cropped_test_data,
    cropped_future_data,
    MAX_SENTENCE_LENGTH,
    raw_token_pytorch_map,
    CLS=CLS,
    low_freq_special_token=LOW_FREQ_TOKEN,
    pad_front=PAD_FRONT,
)


# Class and domain distributions of the training split.
pos_prior, neg_prior = get_distribution(train_y)
print('class prior:', pos_prior, neg_prior)
pos_dom_prior, neg_dom_prior = get_distribution(train_dom)
print('domain prior:', pos_dom_prior, neg_dom_prior)
# label1/label2 cover each whole domain file; labels are one-hot encoded first
# (label 1 -> [0, 1], anything else -> [1, 0]).
dom1_pos_prior, dom1_neg_prior = get_distribution([[0, 1] if label == 1 else [1, 0] for label in label1])
print('dom1 class prior:', dom1_pos_prior, dom1_neg_prior)
dom2_pos_prior, dom2_neg_prior = get_distribution([[0, 1] if label == 1 else [1, 0] for label in label2])
print('dom2 class prior:', dom2_pos_prior, dom2_neg_prior)
# pretrain_x, pretrain_y, pretrain_mask, pretrain_dom, preval_x, preval_y, preval_mask, preval_dom = BERT_pretrain_DataFactory(train_data, val_data, SEED, raw_token_pytorch_map, MAX_SENTENCE_LENGTH)

print('---')

class BERT_Hinge_config:
    # Hyperparameter namespace for the (work-in-progress) hinge-loss BERT run.
    # The class object itself (not an instance) is passed to BERT_Hinge(...)
    # below. Values are evaluated once, when this class statement executes, from
    # the notebook globals defined earlier (SEED, MAX_SENTENCE_LENGTH,
    # raw_token_pytorch_map, home_directory).
    # ----------------- architectural hyperparameters ----------------- #
    d_model = 256
    d_ff = 1024 # = 4* d_model
    n_heads = 8
    dropout = 0.1
    e_layers = 6 # presumably the number of encoder layers -- confirm in model_class
    embedding_aggregation = 'cls' # TODO
    n_mlp_layers = 1
    # n_mlp_dom_layers = 1
    res_learning = False
    activation = nn.ReLU()
    mask_flag = False # causal mask
    train_embedding = True
    # ----------------- optimisation hyperparameters ----------------- #
    random_state = SEED
    batch_size = 8
    epochs = 32
    lr = 1e-3
    patience = 10
    # BCE alternatives kept for reference:
    # loss = nn.BCELoss()
    # loss = nn.BCELoss(weight=torch.FloatTensor([pos_prior, neg_prior]))
    # HingeLoss is not imported by name in this notebook; presumably it comes in
    # through one of the star imports (environment / utils) -- TODO confirm.
    loss = HingeLoss()
    # validation_loss = nn.BCELoss()
    # validation_loss = nn.BCELoss(weight=torch.FloatTensor([pos_prior, neg_prior]))
    validation_loss = HingeLoss()
    domain_loss = nn.BCELoss()
    pretrain_loss = nn.CrossEntropyLoss()
    pretrain_validation_loss = nn.CrossEntropyLoss()
    alpha = 0
    gradient_reversal_every_n_epoch = 1
    regularisation_loss = None
    scheduler = False
    grad_clip = False
    # ----------------- operation hyperparameters ----------------- #
    d_output = 1 # single output here, unlike the d_output = 2 of the cross-entropy configs
    seq_len = MAX_SENTENCE_LENGTH
    n_unique_tokens = len(raw_token_pytorch_map)
    # ----------------- saving hyperparameters ----------------- #
    rootpath = home_directory + './'
    saving_address = home_directory + f'./results/'
    name = f'BERT_Hinge'

# Instantiate the hinge-loss model from the config class defined above.
# NOTE(review): this model is never fit or evaluated in this cell; `model` is
# rebound when the BERT_DANN model is created further down. Confirm whether a
# fit/eval step is missing here or the hinge experiment is simply parked.
model = BERT_Hinge(BERT_Hinge_config) # initialise the model

# BERT - DANN WCELoss

# ---- preprocessing settings for the BERT DANN (weighted CE) run ----
MAX_SENTENCE_LENGTH = 256
MIN_FREQUENCY = 40 # because 40 is statistical sample requirement
MAKE_CROPPED_REMAINS_INTO_NEW_INSTANCE = False
LOW_FREQ_TOKEN = False
CLS = True
PAD_FRONT = False
W2V_CONTEXT_WINDOW = 5 # 2 to left, 2 to right

# Crop every split to MAX_SENTENCE_LENGTH. Only the training split honours
# MAKE_CROPPED_REMAINS_INTO_NEW_INSTANCE; the other splits always pass False.
cropped_train_data = crop_sentence_length(train_data, max_sentence_length = MAX_SENTENCE_LENGTH, make_cropped_remains_into_new_instance = MAKE_CROPPED_REMAINS_INTO_NEW_INSTANCE)
cropped_val_data = crop_sentence_length(val_data, max_sentence_length = MAX_SENTENCE_LENGTH, make_cropped_remains_into_new_instance = False)
cropped_test_data = crop_sentence_length(test_data, max_sentence_length = MAX_SENTENCE_LENGTH, make_cropped_remains_into_new_instance = False)
cropped_future_data = crop_sentence_length(data_test, max_sentence_length = MAX_SENTENCE_LENGTH, make_cropped_remains_into_new_instance = False)

# The token map is built from the cropped training data only.
raw_token_pytorch_map = get_raw_token_pytorch_map(data = cropped_train_data, min_frequency = MIN_FREQUENCY)

# Every other Data_Factory call in this notebook unpacks eleven values (the
# last being future_dom). The original ten-name target here dropped future_dom
# and would fail with "too many values to unpack".
(
    train_x, train_y,
    val_x, val_y,
    test_x, test_y,
    train_dom, val_dom, test_dom,
    future_x, future_dom,
) = Data_Factory(
    cropped_train_data,
    cropped_val_data,
    cropped_test_data,
    cropped_future_data,
    MAX_SENTENCE_LENGTH,
    raw_token_pytorch_map,
    CLS=CLS,
    low_freq_special_token=LOW_FREQ_TOKEN,
    pad_front=PAD_FRONT,
)

# Class and domain distributions of the training split.
pos_prior, neg_prior = get_distribution(train_y)
print('class prior:', pos_prior, neg_prior)
pos_dom_prior, neg_dom_prior = get_distribution(train_dom)
print('domain prior:', pos_dom_prior, neg_dom_prior)
# label1/label2 cover each whole domain file; labels are one-hot encoded first
# (label 1 -> [0, 1], anything else -> [1, 0]).
dom1_pos_prior, dom1_neg_prior = get_distribution([[0, 1] if label == 1 else [1, 0] for label in label1])
print('dom1 class prior:', dom1_pos_prior, dom1_neg_prior)
dom2_pos_prior, dom2_neg_prior = get_distribution([[0, 1] if label == 1 else [1, 0] for label in label2])
print('dom2 class prior:', dom2_pos_prior, dom2_neg_prior)
# pretrain_x, pretrain_y, pretrain_mask, pretrain_dom, preval_x, preval_y, preval_mask, preval_dom = BERT_pretrain_DataFactory(train_data, val_data, SEED, raw_token_pytorch_map, MAX_SENTENCE_LENGTH)

print('---')

class BERT_DANN_config:
    """Hyperparameter bundle handed to ``BERT_DANN`` when the model is constructed.

    All settings are plain class attributes. Several of them are read from notebook
    globals (``SEED``, ``MAX_SENTENCE_LENGTH``, ``raw_token_pytorch_map``,
    ``pos_prior``, ``neg_prior``, ``home_directory``) at the moment this class body
    executes, so those names must already exist in the kernel.
    """

    # ===== architectural hyperparameters =====
    d_model = 256
    d_ff = 4 * d_model  # 1024, kept at four times d_model
    n_heads = 8
    dropout = 0.1
    e_layers = 8
    embedding_aggregation = 'cls'  # TODO (left open by the original author)
    n_mlp_clf_layers = 1
    n_mlp_dom_layers = 1
    res_learning = False
    activation = nn.ReLU()
    mask_flag = False  # causal mask
    train_embedding = True

    # ===== optimisation hyperparameters =====
    random_state = SEED
    batch_size = 8
    epochs = 32
    lr = 1e-5
    patience = 10
    # Cross-entropy weighted by [pos_prior, neg_prior] for both training and validation.
    # Unweighted alternative: nn.CrossEntropyLoss()
    loss = nn.CrossEntropyLoss(weight=torch.FloatTensor([pos_prior, neg_prior]))
    validation_loss = nn.CrossEntropyLoss(weight=torch.FloatTensor([pos_prior, neg_prior]))
    pretrain_loss = nn.CrossEntropyLoss()
    pretrain_validation_loss = nn.CrossEntropyLoss()
    domain_loss = nn.CrossEntropyLoss()
    alpha = 0.1
    gradient_reversal_every_n_epoch = 1
    regularisation_loss = None
    scheduler = False
    grad_clip = False

    # ===== operation hyperparameters =====
    d_output = 2
    seq_len = MAX_SENTENCE_LENGTH
    n_unique_tokens = len(raw_token_pytorch_map)

    # ===== saving hyperparameters =====
    rootpath = f'{home_directory}./'
    saving_address = f'{home_directory}./results/'
    name = 'BERT_DANN'

# Build the model from the BERT_DANN_config class defined earlier in this cell.
model = BERT_DANN(BERT_DANN_config) # initialise the model

# Optional, currently disabled: replace the embedding layer with pretrained weights
# (w2v_combined_embed / w2v_embed / pretrained_bert_embed are not defined in this cell).
# model.model.embedding.embedding = nn.Embedding.from_pretrained(w2v_combined_embed, freeze=False)
# model.model.embedding.embedding = nn.Embedding.from_pretrained(w2v_embed, freeze=False)
# model.model.embedding.embedding = nn.Embedding.from_pretrained(pretrained_bert_embed, freeze=False)

# Optional, currently disabled: reuse the encoder of a separately pretrained BERT model.
# model.model.encoder = pretrained_bert.model.encoder

# Train on the training split with the validation split passed alongside; the return
# value is kept as `best_epoch` and forwarded to the evaluation calls below.
# Original author's note: all cells except this one will print training log and evaluation at each batch.
best_epoch = model.fit(train_x, train_y, train_dom, val_x, val_y, val_dom)
print()

# Per the original author's note, the model saves its best epoch automatically, so
# `load()` is called before evaluating on the validation split and then the test split.
# NOTE(review): the exact effect of `evaluation_mode = True` is defined in the model class — confirm there.
model.load()
model.eval(val_x, val_y, val_dom, best_epoch, evaluation_mode = True)
model.eval(test_x, test_y, test_dom, best_epoch, evaluation_mode = True)

---

# Historic Best

In [None]:
# # BERT - SoftmaxBCELoss+OptBalAccu

# MAX_SENTENCE_LENGTH = 256
# MIN_FREQUENCY = 40 # because 40 is statistical sample requirement
# MAKE_CROPPED_REMAINS_INTO_NEW_INSTANCE = False
# LOW_FREQ_TOKEN = False
# CLS = True
# PAD_FRONT = False
# W2V_CONTEXT_WINDOW = 5 # 2 to left, 2 to right
# cropped_train_data = crop_sentence_length(train_data, max_sentence_length = MAX_SENTENCE_LENGTH, make_cropped_remains_into_new_instance = MAKE_CROPPED_REMAINS_INTO_NEW_INSTANCE)
# cropped_val_data = crop_sentence_length(val_data, max_sentence_length =  MAX_SENTENCE_LENGTH, make_cropped_remains_into_new_instance = False)
# cropped_test_data = crop_sentence_length(test_data, max_sentence_length = MAX_SENTENCE_LENGTH, make_cropped_remains_into_new_instance = False)
# cropped_future_data = crop_sentence_length(data_test, max_sentence_length = MAX_SENTENCE_LENGTH, make_cropped_remains_into_new_instance = False)
# raw_token_pytorch_map = get_raw_token_pytorch_map(data = cropped_train_data, min_frequency = MIN_FREQUENCY)
# train_x, train_y, val_x, val_y, test_x, test_y, train_dom, val_dom, test_dom, future_x = Data_Factory(cropped_train_data, \
#                                                               cropped_val_data, \
#                                                                 cropped_test_data, \
#                                                                     cropped_future_data, \
#                                                                         MAX_SENTENCE_LENGTH, \
#                                                                             raw_token_pytorch_map, \
#                                                                                 CLS=CLS, \
#                                                                                     low_freq_special_token=LOW_FREQ_TOKEN, \
#                                                                                         pad_front=PAD_FRONT)
# pos_prior, neg_prior = get_distribution(train_y)
# print('class prior:', pos_prior, neg_prior)
# pos_dom_prior, neg_dom_prior = get_distribution(train_dom)
# print('domain prior:', pos_dom_prior, neg_dom_prior)
# dom1_pos_prior, dom1_neg_prior = get_distribution([[0, 1] if label == 1 else [1, 0] for label in label1])
# print('dom1 class prior:', dom1_pos_prior, dom1_neg_prior)
# dom2_pos_prior, dom2_neg_prior = get_distribution([[0, 1] if label == 1 else [1, 0] for label in label2])
# print('dom2 class prior:', dom2_pos_prior, dom2_neg_prior)
# # pretrain_x, pretrain_y, pretrain_mask, pretrain_dom, preval_x, preval_y, preval_mask, preval_dom = BERT_pretrain_DataFactory(train_data, val_data, SEED, raw_token_pytorch_map, MAX_SENTENCE_LENGTH)

# print('---')

# class BERT_DANN_config:
#     # ----------------- architectual hyperparameters ----------------- #
#     d_model = 256
#     d_ff = 1024 # = 4* d_model
#     n_heads = 8
#     dropout = 0.1
#     e_layers = 6 # actually 8 was better
#     embedding_aggregation = 'cls' # TODO
#     n_mlp_clf_layers = 1
#     n_mlp_dom_layers = 1
#     res_learning = False
#     activation = nn.ReLU()
#     mask_flag = False # causal mask
#     train_embedding = True
#     # ----------------- optimisation hyperparameters ----------------- #
#     random_state = SEED
#     batch_size = 8
#     epochs = 32
#     lr = 1e-5
#     patience = 10
#     # loss = nn.BCELoss()
#     loss = nn.BCELoss(weight=torch.FloatTensor([pos_prior, neg_prior]))
#     # validation_loss = nn.BCELoss()
#     validation_loss = nn.BCELoss(weight=torch.FloatTensor([pos_prior, neg_prior]))
#     domain_loss = nn.BCELoss()
#     pretrain_loss = nn.CrossEntropyLoss()
#     pretrain_validation_loss = nn.CrossEntropyLoss()
#     alpha = 0.1
#     gradient_reversal_every_n_epoch = 1
#     regularisation_loss = None
#     scheduler = False
#     grad_clip = False
#     # ----------------- operation hyperparameters ----------------- #
#     d_output = 2
#     seq_len = MAX_SENTENCE_LENGTH
#     n_unique_tokens = len(raw_token_pytorch_map)
#     # ----------------- saving hyperparameters ----------------- #
#     rootpath = home_directory + './'
#     saving_address = home_directory + f'./results/'
#     name = f'BERT_DANN'

# model = BERT_DANN(BERT_DANN_config) # initialise the model

# # train the model (all cells except this one will print training log and evaluation at each batch)
# best_epoch = model.fit(train_x, train_y, train_dom, val_x, val_y, val_dom)
# print()

# # as model automatically saves best epoch, will now load the best epoch and evaluate on test set
# model.load()
# model.eval(val_x, val_y, val_dom, best_epoch, evaluation_mode = True)
# model.eval(test_x, test_y, test_dom, best_epoch, evaluation_mode = True)

In [None]:
# # BERT - with low freq token - SoftmaxBCELoss+OptBalAccu

# MAX_SENTENCE_LENGTH = 256
# MIN_FREQUENCY = 40 # because 40 is statistical sample requirement
# MAKE_CROPPED_REMAINS_INTO_NEW_INSTANCE = False
# LOW_FREQ_TOKEN = True
# CLS = True
# PAD_FRONT = False
# W2V_CONTEXT_WINDOW = 5 # 2 to left, 2 to right
# cropped_train_data = crop_sentence_length(train_data, max_sentence_length = MAX_SENTENCE_LENGTH, make_cropped_remains_into_new_instance = MAKE_CROPPED_REMAINS_INTO_NEW_INSTANCE)
# cropped_val_data = crop_sentence_length(val_data, max_sentence_length =  MAX_SENTENCE_LENGTH, make_cropped_remains_into_new_instance = False)
# cropped_test_data = crop_sentence_length(test_data, max_sentence_length = MAX_SENTENCE_LENGTH, make_cropped_remains_into_new_instance = False)
# cropped_future_data = crop_sentence_length(data_test, max_sentence_length = MAX_SENTENCE_LENGTH, make_cropped_remains_into_new_instance = False)
# raw_token_pytorch_map = get_raw_token_pytorch_map(data = cropped_train_data, min_frequency = MIN_FREQUENCY)
# train_x, train_y, val_x, val_y, test_x, test_y, train_dom, val_dom, test_dom, future_x = Data_Factory(cropped_train_data, \
#                                                               cropped_val_data, \
#                                                                 cropped_test_data, \
#                                                                     cropped_future_data, \
#                                                                         MAX_SENTENCE_LENGTH, \
#                                                                             raw_token_pytorch_map, \
#                                                                                 CLS=CLS, \
#                                                                                     low_freq_special_token=LOW_FREQ_TOKEN, \
#                                                                                         pad_front=PAD_FRONT)
# pos_prior, neg_prior = get_distribution(train_y)
# print('class prior:', pos_prior, neg_prior)
# pos_dom_prior, neg_dom_prior = get_distribution(train_dom)
# print('domain prior:', pos_dom_prior, neg_dom_prior)
# dom1_pos_prior, dom1_neg_prior = get_distribution([[0, 1] if label == 1 else [1, 0] for label in label1])
# print('dom1 class prior:', dom1_pos_prior, dom1_neg_prior)
# dom2_pos_prior, dom2_neg_prior = get_distribution([[0, 1] if label == 1 else [1, 0] for label in label2])
# print('dom2 class prior:', dom2_pos_prior, dom2_neg_prior)
# # pretrain_x, pretrain_y, pretrain_mask, pretrain_dom, preval_x, preval_y, preval_mask, preval_dom = BERT_pretrain_DataFactory(train_data, val_data, SEED, raw_token_pytorch_map, MAX_SENTENCE_LENGTH)

# print('---')

# class BERT_DANN_config:
#     # ----------------- architectual hyperparameters ----------------- #
#     d_model = 256
#     d_ff = 1024 # = 4* d_model
#     n_heads = 8
#     dropout = 0.1
#     e_layers = 6
#     embedding_aggregation = 'cls' # TODO
#     n_mlp_clf_layers = 1
#     n_mlp_dom_layers = 1
#     res_learning = False
#     activation = nn.ReLU()
#     mask_flag = False # causal mask
#     train_embedding = True
#     # ----------------- optimisation hyperparameters ----------------- #
#     random_state = SEED
#     batch_size = 8
#     epochs = 32
#     lr = 1e-5
#     patience = 10
#     # loss = nn.BCELoss()
#     loss = nn.BCELoss(weight=torch.FloatTensor([pos_prior, neg_prior]))
#     # validation_loss = nn.BCELoss()
#     validation_loss = nn.BCELoss(weight=torch.FloatTensor([pos_prior, neg_prior]))
#     domain_loss = nn.BCELoss()
#     pretrain_loss = nn.CrossEntropyLoss()
#     pretrain_validation_loss = nn.CrossEntropyLoss()
#     alpha = 0.1
#     gradient_reversal_every_n_epoch = 1
#     regularisation_loss = None
#     scheduler = False
#     grad_clip = False
#     # ----------------- operation hyperparameters ----------------- #
#     d_output = 2
#     seq_len = MAX_SENTENCE_LENGTH
#     n_unique_tokens = len(raw_token_pytorch_map)
#     # ----------------- saving hyperparameters ----------------- #
#     rootpath = home_directory + './'
#     saving_address = home_directory + f'./results/'
#     name = f'BERT_DANN'

# model = BERT_DANN(BERT_DANN_config) # initialise the model

# # train the model (all cells except this one will print training log and evaluation at each batch)
# best_epoch = model.fit(train_x, train_y, train_dom, val_x, val_y, val_dom)
# print()

# # as model automatically saves best epoch, will now load the best epoch and evaluate on test set
# model.load()
# model.eval(val_x, val_y, val_dom, best_epoch, evaluation_mode = True)
# model.eval(test_x, test_y, test_dom, best_epoch, evaluation_mode = True)

---
---
# LSTM Graveyard

In [None]:
# LSTM
# ---- preprocessing switches for this experiment ----
MAX_SENTENCE_LENGTH = 64
MIN_FREQUENCY = 0  # 0 here; the archived BERT cells use 40 ("statistical sample requirement")
MAKE_CROPPED_REMAINS_INTO_NEW_INSTANCE = False
LOW_FREQ_TOKEN = False
CLS = False
PAD_FRONT = True
W2V_CONTEXT_WINDOW = 5  # original note: 2 to left, 2 to right

# Crop every split to MAX_SENTENCE_LENGTH. Only the training split honours
# MAKE_CROPPED_REMAINS_INTO_NEW_INSTANCE; the other splits always pass False.
cropped_train_data = crop_sentence_length(
    train_data,
    max_sentence_length=MAX_SENTENCE_LENGTH,
    make_cropped_remains_into_new_instance=MAKE_CROPPED_REMAINS_INTO_NEW_INSTANCE,
)
cropped_val_data = crop_sentence_length(
    val_data,
    max_sentence_length=MAX_SENTENCE_LENGTH,
    make_cropped_remains_into_new_instance=False,
)
cropped_test_data = crop_sentence_length(
    test_data,
    max_sentence_length=MAX_SENTENCE_LENGTH,
    make_cropped_remains_into_new_instance=False,
)
cropped_future_data = crop_sentence_length(
    data_test,
    max_sentence_length=MAX_SENTENCE_LENGTH,
    make_cropped_remains_into_new_instance=False,
)

# The token map is derived from the cropped *training* split only.
raw_token_pytorch_map = get_raw_token_pytorch_map(
    data=cropped_train_data,
    min_frequency=MIN_FREQUENCY,
)

# Turn the four cropped datasets into model inputs, labels and domain labels.
(
    train_x, train_y,
    val_x, val_y,
    test_x, test_y,
    train_dom, val_dom, test_dom,
    future_x,
) = Data_Factory(
    cropped_train_data,
    cropped_val_data,
    cropped_test_data,
    cropped_future_data,
    MAX_SENTENCE_LENGTH,
    raw_token_pytorch_map,
    CLS=CLS,
    low_freq_special_token=LOW_FREQ_TOKEN,
    pad_front=PAD_FRONT,
)

# Report label / domain distributions for reference.
pos_prior, neg_prior = get_distribution(train_y)
print('class prior:', pos_prior, neg_prior)

pos_dom_prior, neg_dom_prior = get_distribution(train_dom)
print('domain prior:', pos_dom_prior, neg_dom_prior)

# Per-domain class priors: labels are first expanded to two-element one-hot lists
# ([1, 0] for anything other than label 1, [0, 1] for label 1).
dom1_pos_prior, dom1_neg_prior = get_distribution(
    [[0, 1] if lbl == 1 else [1, 0] for lbl in label1]
)
print('dom1 class prior:', dom1_pos_prior, dom1_neg_prior)

dom2_pos_prior, dom2_neg_prior = get_distribution(
    [[0, 1] if lbl == 1 else [1, 0] for lbl in label2]
)
print('dom2 class prior:', dom2_pos_prior, dom2_neg_prior)

# Disabled: BERT pre-training data preparation.
# pretrain_x, pretrain_y, pretrain_mask, pretrain_dom, preval_x, preval_y, preval_mask, preval_dom = BERT_pretrain_DataFactory(train_data, val_data, SEED, raw_token_pytorch_map, MAX_SENTENCE_LENGTH)

print('---')

class LSTM_config:
    """Hyperparameter bundle handed to ``LSTM`` when the model is constructed.

    All settings are plain class attributes. Several of them are read from notebook
    globals (``SEED``, ``MAX_SENTENCE_LENGTH``, ``raw_token_pytorch_map``,
    ``home_directory``) at the moment this class body executes, so those names must
    already exist in the kernel.
    """

    # ===== architectural hyperparameters =====
    d_model = 256
    n_recurrent_layers = 2
    bidirectional = False
    n_heads = 8
    dropout = 0.1
    n_mlp_layers = 1
    flatten = False
    activation = nn.ReLU()
    res_learning = True
    mask_flag = False  # TODO (left open by the original author)
    train_embedding = True

    # ===== optimisation hyperparameters =====
    random_state = SEED
    batch_size = 8
    epochs = 32
    lr = 1e-5
    patience = 4
    # Unweighted binary cross-entropy for both training and validation.
    # Weighted alternative: nn.BCELoss(weight=torch.FloatTensor([pos_prior, neg_prior]))
    loss = nn.BCELoss()
    validation_loss = nn.BCELoss()
    pretrain_loss = nn.CrossEntropyLoss()
    pretrain_validation_loss = nn.CrossEntropyLoss()
    regularisation_loss = None
    scheduler = True
    grad_clip = False

    # ===== operation hyperparameters =====
    d_output = 2
    seq_len = MAX_SENTENCE_LENGTH
    n_unique_tokens = len(raw_token_pytorch_map)

    # ===== saving hyperparameters =====
    rootpath = f'{home_directory}./'
    saving_address = f'{home_directory}./results/'
    name = 'LSTM_Classifier'



# Build the model from the LSTM_config class defined earlier in this cell.
model = LSTM(LSTM_config) # initialise the model

# Train on the training split with the validation split passed alongside; the return
# value is kept as `best_epoch` and forwarded to the evaluation calls below.
# Original author's note: all cells except this one will print training log and evaluation at each batch.
best_epoch = model.fit(train_x, train_y, train_dom, val_x, val_y, val_dom)
print()

# Per the original author's note, the model saves its best epoch automatically, so
# `load()` is called before evaluating on the validation split and then the test split.
# NOTE(review): the exact effect of `evaluation_mode = True` is defined in the model class — confirm there.
model.load()
model.eval(val_x, val_y, val_dom, best_epoch, evaluation_mode = True)
model.eval(test_x, test_y, test_dom, best_epoch, evaluation_mode = True)