In [1]:
import csv
import json
import numpy as np
import matplotlib.pyplot as plt
import time
from tqdm import tqdm

import torch
import torch.nn as nn
import torchtext
from torchtext.legacy.data import Field, LabelField, TabularDataset, BucketIterator

import transformers
from transformers import BertTokenizer, BertModel

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert = BertModel.from_pretrained('bert-base-uncased')

seed = 0
torch.manual_seed(seed)
np.random.seed(seed)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Current device:', torch.cuda.get_device_name() if torch.cuda.is_available() else 'CPU')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Current device: NVIDIA GeForce RTX 3070 Laptop GPU


In [2]:
# tokeniser test
tokenizer.encode('Yo this is Daniel!')

[101, 10930, 2023, 2003, 3817, 999, 102]

In [3]:
# load dataset: transfer to json type so that torchtext reads it
def prepare_train_dataset():
        
    # X = [[sentence1, word1], [sentence2, word2], ...], y = [score1, score2, ...]
    X_tr = []
    y_tr = []
    f = open("./datasets/Sub-task 1/lcp_single_train.tsv")
    read_tsv = csv.reader(f, delimiter="\t")
    next(read_tsv, None) # skip header
    
    train_json = open('./temp/train.json', 'w')
    
    for data in read_tsv:
        d = {'sentence': data[2], 'label': float(data[4])}
        json.dump(d, train_json)
        train_json.write('\n')
    f.close()

prepare_train_dataset()

In [None]:
# text and label preprocessing - TO FIX
batch_size = 32


sos_idx = tokenizer.cls_token_id
eos_idx = tokenizer.sep_token_id
pad_idx = tokenizer.pad_token_id
unk_idx = tokenizer.unk_token_id

print(f'Special token ids: \n SOS:{sos_idx}, EOS: {eos_idx}, PAD: {pad_idx}, UNK: {unk_idx}')

# define fields
text_field = Field(use_vocab=True,
                   tokenize=tokenizer.encode,
                   preprocessing = tokenizer.convert_tokens_to_ids,
                   batch_first=True,
                   pad_token=pad_idx,
                   unk_token=unk_idx,
                   init_token=sos_idx,
                   eos_token=eos_idx)

label_field = LabelField(use_vocab=False,
                    batch_first=True,
                    dtype=torch.float)

fields = {'sentence': ('sentence', text_field), 'label': ('label',label_field)}

# dataloader using json format
json_path = "./temp/"
train_data = TabularDataset(
    path =  json_path + 'train.json',
    format='json',
    fields = fields)

text_field.build_vocab(train_data)

# iterators
train_it = BucketIterator(train_data,
                          batch_size=batch_size,
                          sort_key= lambda x: len(x.sentence),
                          repeat=True,
                          sort=False,
                          shuffle=True,
                          device=device)
# valid_it ...
# test_it ...

print('Training terator created with', len(train_it), 'batches.')

Special token ids: 
 SOS:101, EOS: 102, PAD: 0, UNK: 100


In [5]:
# print an example
for batch in train_it:
    print(batch.sentence[0])
    print(tokenizer.convert_ids_to_tokens(batch.sentence[0]))
    print(batch.label)
    break

tensor([2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [None]:
# define model: BERT + biLSTM + MLP
class BERT_model(nn.Module):
    def __init__(self, bert, hidden_dim, output_dim, layers, bidirectional, dropout):
        super().__init__()
        self.bert = bert
        self.bidirectional = bidirectional
        emb_dim = bert.config.to_dict()['hidden_size']
        
        self.lstm = nn.LSTM(emb_dim, 
                            hidden_dim, 
                            layers,
                            bidirectional=bidirectional,
                            batch_first=True,
                            dropout = 0 if layers < 2 else dropout)
        
        if self.bidirectional:
            fc_input_dim = hidden_dim * 2
        else:
            fc_input_dim = hidden_dim
        
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(fc_input_dim, output_dim)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x):
        with torch.no_grad():
            embeddings = self.bert(x)[0] # [bs, len, emb_dim]
        
        _, hn = self.lstm(embeddings) # [l * d, bs, emb_dim]
        
        if self.bidirectional:
            hn = torch.cat((hn[-2,:,:], hn[-1,:,:]), dim=1)
        
        hn = self.relu(hn)
        hn = self.fc(hn)
        output = self.sigmoid(hn)
        
        return output

In [None]:
hidden_dim = 128
output_dim = 1
layers = 3
bidirectional = True
dropout = 0.1

model = BERT_model(bert=bert,
                   hidden_dim=hidden_dim,
                   output_dim=output_dim,
                   layers=layers,
                   bidirectional=bidirectional,
                   dropout=dropout)
model.to(device)

# freeze bert params
for name, param in model.named_parameters():                
    if name.startswith('bert'):
        param.requires_grad = False


print(f'The model has {sum(p.numel() for p in model.parameters()):,} total parameters')
print(f'The model has {sum(p.numel() for p in model.parameters() if p.requires_grad):,} trainable parameters')

In [None]:
optimiser = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = torch.nn.MSELoss()
criterion = criterion.to(device)

In [37]:
# train function
def train(iterator, model, optimiser, criterion):
    
    epoch_loss = 0
    
    model.train()
    for batch in iterator:
        
        optimizer = zero_grad()
        pred = model(batch.sentence)
        loss = criterion(pre, batch.label)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
    
    return eposs_loss / len(iterator)

In [38]:
# evaluation function
def eval(iterator, model, criterion):
    model.eval()
    pass

In [None]:
# main loop
n_epochs = 10
start = time.time()

for epoch in tqdm(n_epochs):
    
    epoch_start = time.time()
    train_loss = train(train_it, model, optimiser, criterion)
    # valid_loss ...
    
    epoch_end = time.time()
    