In [49]:
from utils import *
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import time
from tqdm import tqdm

import torch
import torch.nn as nn
import torchtext
from torchtext.legacy.data import Field, LabelField, TabularDataset, BucketIterator

import transformers
from transformers import BertTokenizer, BertModel

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert = BertModel.from_pretrained('bert-base-uncased')
# tokeniser test
print('Tokeniser test:', tokenizer.encode('Yo this is Daniel!'))

seed = 0
torch.manual_seed(seed)
np.random.seed(seed)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Current device:', torch.cuda.get_device_name() if torch.cuda.is_available() else 'CPU')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Tokeniser test: [101, 10930, 2023, 2003, 3817, 999, 102]
Current device: NVIDIA GeForce RTX 3070 Laptop GPU


In [2]:
# load datasets
src_path = './datasets/'
train_file = 'lcp_single_train.tsv.txt'
test_file = 'lcp_single_test_labels.tsv.txt'
temp_path = './temp/'

load_dataset_as_json(src_path, train_file, test_file, temp_path)

Loaded 7662 training entries and 917 test entreies.


In [90]:
# preprocessing
batch_size = 32
max_length = tokenizer.max_model_input_sizes['bert-base-uncased']

sos_idx = tokenizer.cls_token_id
eos_idx = tokenizer.sep_token_id
pad_idx = tokenizer.pad_token_id
unk_idx = tokenizer.unk_token_id

print(f'Special token ids: \n SOS:{sos_idx}, EOS: {eos_idx}, PAD: {pad_idx}, UNK: {unk_idx}')

# define fields
# text_field = Field(sequential=False,
#                    use_vocab=True,
#                    init_token=sos_idx,
#                    eos_token=eos_idx,
#                    tokenize=tokenizer.encode,
# #                    preprocessing = tokenizer.convert_tokens_to_ids,
#                    lower=False,
#                    batch_first=True,
#                    pad_token=pad_idx,
#                    unk_token=unk_idx,
#                    is_target=False)

text_field = Field(use_vocab=True,
                   tokenize=tokenizer.encode,
#                    preprocessing = tokenizer.convert_tokens_to_ids,
                   batch_first=True,
                   pad_token=pad_idx,
                   unk_token=unk_idx,
                   init_token=sos_idx,
                   eos_token=eos_idx)

label_field = LabelField(use_vocab=False,
                         batch_first=True,
                         is_target=True,
                         dtype=torch.float)

fields = {'sentence': ('sentence', text_field), 'complexity': ('complexity', label_field)}

# dataloader using json format
train_data = TabularDataset(path = temp_path + 'train.json', format='json', fields=fields)
test_data = TabularDataset(path = temp_path + 'test.json', format='json', fields=fields)

text_field.build_vocab(train_data)

# iterators
train_it = BucketIterator(train_data,
                          batch_size=batch_size,
                          sort_key= None,
                          repeat=True,
                          sort=False,
                          shuffle=True,
                          device=device)
# valid_it ...
# test_it ...

print('Training iterator created with', len(train_it), 'batches.')
vars(train_data.examples[0])

Special token ids: 
 SOS:101, EOS: 102, PAD: 0, UNK: 100
Training iterator created with 240 batches.


{'sentence': [100,
  100,
  100,
  100,
  100,
  100,
  100,
  100,
  100,
  100,
  100,
  100,
  100,
  100,
  100,
  100,
  100,
  100,
  100,
  100,
  100,
  100,
  100,
  100,
  100,
  100],
 'complexity': 0.0}

In [86]:
tokenizer.convert_tokens_to_ids(tokenizer.tokenize('hi I am'))

[7632, 1045, 2572]

In [89]:
# print an example
print('########Dataloader test########')
random_idx = np.random.randint(0, len(train_data))
print('Tokenised sentence: \n    ', vars(train_data.examples[random_idx])['sentence'])
print('Original sentence: \n    ', tokenizer.convert_ids_to_tokens(vars(train_data.examples[random_idx])['sentence']))
print('Lexical complexity label:\n    ', vars(train_data.examples[random_idx])['complexity'], '\n')

print('########Iterator test########')
for batch in train_it:
    sentence, complexity = batch.sentence[0].cpu().tolist(), batch.complexity[0].cpu().numpy()
    print('Tokenised sentence: \n    ', sentence)
    print('Original sentence: \n    ', tokenizer.convert_ids_to_tokens(sentence))
    print('Lexical complexity label:\n    ', complexity)
    break

########Dataloader test########
Tokenised sentence: 
     [101, 5678, 1010, 2048, 5022, 2007, 3972, 20624, 5644, 4593, 19129, 1996, 9666, 2475, 25206, 2031, 2351, 2013, 3674, 27480, 2019, 9626, 11983, 2164, 3729, 2232, 1031, 4229, 1522, 9932, 12740, 1033, 1012, 102]
Original sentence: 
     ['[CLS]', 'additionally', ',', 'two', 'patients', 'with', 'del', '##eti', '##ons', 'apparently', 'encompassing', 'the', 'fog', '##2', 'locus', 'have', 'died', 'from', 'multiple', 'congenital', 'an', '##oma', '##lies', 'including', 'cd', '##h', '[', '38', '‚', 'ai', '##40', ']', '.', '[SEP]']
Lexical complexity label:
     0.5277777777777778 

########Iterator test########
Tokenised sentence: 
     [2, 2, 467, 440, 17, 4, 546, 694, 1664, 1025, 1119, 191, 14, 555, 12, 922, 2357, 61, 36, 652, 5, 4, 3426, 8262, 1341, 13, 108, 153, 12, 622, 1467, 6, 171, 506, 2357, 61, 36, 652, 2088, 1562, 10, 5745, 1047, 6, 11, 439, 243, 8, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Original sentence: 
     ['[unus

In [None]:
# define model: BERT + biLSTM + MLP
class BERT_model(nn.Module):
    def __init__(self, bert, hidden_dim, output_dim, layers, bidirectional, dropout):
        super().__init__()
        self.bert = bert
        self.bidirectional = bidirectional
        emb_dim = bert.config.to_dict()['hidden_size']
        
        self.lstm = nn.LSTM(emb_dim, 
                            hidden_dim, 
                            layers,
                            bidirectional=bidirectional,
                            batch_first=True,
                            dropout = 0 if layers < 2 else dropout)
        
        if self.bidirectional:
            fc_input_dim = hidden_dim * 2
        else:
            fc_input_dim = hidden_dim
        
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(fc_input_dim, output_dim)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x):
        with torch.no_grad():
            embeddings = self.bert(x)[0] # [bs, len, emb_dim]
        
        _, hn = self.lstm(embeddings) # [l * d, bs, emb_dim]
        
        if self.bidirectional:
            hn = torch.cat((hn[-2,:,:], hn[-1,:,:]), dim=1)
        
        hn = self.relu(hn)
        hn = self.fc(hn)
        output = self.sigmoid(hn)
        
        return output

In [None]:
hidden_dim = 128
output_dim = 1
layers = 3
bidirectional = True
dropout = 0.1

model = BERT_model(bert=bert,
                   hidden_dim=hidden_dim,
                   output_dim=output_dim,
                   layers=layers,
                   bidirectional=bidirectional,
                   dropout=dropout)
model.to(device)

# freeze bert params
for name, param in model.named_parameters():                
    if name.startswith('bert'):
        param.requires_grad = False


print(f'The model has {sum(p.numel() for p in model.parameters()):,} total parameters')
print(f'The model has {sum(p.numel() for p in model.parameters() if p.requires_grad):,} trainable parameters')

In [None]:
optimiser = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = torch.nn.MSELoss()
criterion = criterion.to(device)

In [None]:
# train function
def train(iterator, model, optimiser, criterion):
    
    epoch_loss = 0
    
    model.train()
    for batch in iterator:
        
        optimizer = zero_grad()
        pred = model(batch.sentence)
        loss = criterion(pre, batch.label)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
    
    return eposs_loss / len(iterator)

In [None]:
# evaluation function
def eval(iterator, model, criterion):
    model.eval()
    pass

In [None]:
# main loop
n_epochs = 10
start = time.time()

for epoch in tqdm(n_epochs):
    
    epoch_start = time.time()
    train_loss = train(train_it, model, optimiser, criterion)
    # valid_loss ...
    
    epoch_end = time.time()
    