BI-LSTM Model training based on polarised or pre-trained embeddings

Based on the following tutorials:

https://github.com/bentrevett/pytorch-sentiment-analysis/blob/master/1%20-%20Simple%20Sentiment%20Analysis.ipynb

https://github.com/bentrevett/pytorch-sentiment-analysis/blob/master/2%20-%20Upgraded%20Sentiment%20Analysis.ipynb


In [0]:
# Mount Google Drive; the data, embedding and output paths used further down
# all live under "drive/My Drive/Colab Notebooks".
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
# Shell escape (IPython/Colab only): list the notebook folder on the mounted drive.
!ls "drive/My Drive/Colab Notebooks" 
# Shell escape (IPython/Colab only): install the fasttext package imported below.
!pip install fasttext

In [0]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils import data
import random
import time
from torchtext import data
from torchtext.vocab import Vectors
import fasttext
import os

In [0]:
def calc_accuracy(preds, gold):
    """Return the accuracy of one batch of binary predictions.

    Args:
        preds: tensor of raw model outputs (logits), one per example. They are
            passed through a sigmoid and rounded to 0 or 1.
        gold: gold labels, one per example, compared element-wise with the
            rounded predictions.

    Returns:
        The fraction of examples whose rounded prediction equals the gold
        label, as a Python float. An empty batch yields 0.0 instead of
        raising ZeroDivisionError.
    """
    # round to closest integer, because this is binary classification
    rounded_preds = torch.round(torch.sigmoid(preds))
    total = len(rounded_preds)
    if total == 0:
        return 0.0
    gold = torch.as_tensor(gold, device=rounded_preds.device)
    # A single vectorised comparison replaces the per-element Python loop,
    # which issued one tensor comparison (and device sync) per example.
    correct = (rounded_preds.reshape(-1) == gold.reshape(-1)).sum().item()
    return correct / total

def count_parameters(model):
    """Return how many scalar values the model's trainable parameters hold.

    Only parameters whose ``requires_grad`` flag is set are counted.
    """
    trainable_total = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable_total += param.numel()
    return trainable_total

def progress_tracker(total, current, start_time):
    """Print a progress line the first time training passes each 10% mark.

    The last reported mark is kept in the module-level variable ``milestone``,
    which must be assigned (e.g. to 0) before the first call; this function
    reads and updates it.

    Args:
        total: size of the whole epoch, in the same unit as ``current``.
        current: amount of work done so far.
        start_time: ``time.time()`` value taken when the epoch started; used
            to report the elapsed runtime.

    Returns:
        None. When ``total`` is 0 nothing is printed (there is no progress to
        measure, and dividing by it would raise ZeroDivisionError).
    """
    global milestone
    if total == 0:
        return
    per = (current / total) * 100
    cminutes, cseconds = epoch_time(start_time, time.time())
    ctime = '{0}m {1}s'.format(cminutes, cseconds)
    # Walk the marks from 90 down to 10: the first one exceeded is the band
    # we are currently in, so stop there whether or not it was already reported.
    for threshold in range(90, 0, -10):
        if per > threshold:
            if milestone != threshold:
                print('Progress training: {0}% | current runtime: {1}'.format(threshold, ctime))
                milestone = threshold
            break


def calc_metrics(preds, y):
    """Return the confusion-matrix counts for one batch of binary predictions.

    Despite the name, no F1 score is computed here; callers derive precision,
    recall and F1 from the returned counts.

    Args:
        preds: tensor of raw model outputs (logits), one per example. They are
            passed through a sigmoid and rounded to 0 or 1.
        y: gold labels, one per example.

    Returns:
        A tuple ``(tp, fp, fn, tn)`` of Python ints. An example predicted as
        positive counts as ``tp`` when its label is 1 and as ``fp`` otherwise.
        An example not predicted as positive counts as ``fn`` when its label
        is 1, as ``tn`` when its label is 0, and is ignored for any other
        label value.
    """
    rounded_preds = torch.round(torch.sigmoid(preds)).reshape(-1)
    y = torch.as_tensor(y, device=rounded_preds.device).reshape(-1)

    # Boolean masks replace the per-element Python loop.
    pred_pos = rounded_preds == 1
    pred_not_pos = rounded_preds != 1
    gold_pos = y == 1
    gold_not_pos = y != 1
    gold_neg = y == 0

    tp = int((pred_pos & gold_pos).sum())
    fp = int((pred_pos & gold_not_pos).sum())
    fn = int((pred_not_pos & gold_pos).sum())
    tn = int((pred_not_pos & gold_neg).sum())
    return tp, fp, fn, tn
            
   
def train(model, train_iterator, optimizer, criterion, start_time):
    """Run one training epoch over ``train_iterator`` and update the model.

    Args:
        model: module called as ``model(text, text_lengths)``.
        train_iterator: iterable of batches exposing ``batch.text`` (a
            ``(text, text_lengths)`` pair) and ``batch.label``; must support
            ``len()``.
        optimizer: optimizer stepping the model's parameters.
        criterion: loss called as ``criterion(predictions, batch.label)``.
        start_time: not used by this function; kept so existing callers keep
            working.

    Returns:
        A tuple ``(loss, acc, prec, reca)``: the loss and accuracy averaged
        over the batches, and precision/recall of the positive class over the
        whole epoch. ``prec`` and ``reca`` are the string ``'no TP'`` when the
        epoch produced no true positive.
    """
    epoch_loss = 0
    epoch_acc = 0
    tp = 0
    fp = 0
    fn = 0
    model.train()  # activate training mode
    for batch in train_iterator:
        optimizer.zero_grad()  # reset gradients from the previous batch
        text, text_lengths = batch.text
        predictions = model(text, text_lengths).squeeze(1)
        loss = criterion(predictions, batch.label)

        # Metrics only need the values, so keep them out of the autograd graph.
        detached_preds = predictions.detach()
        acc = calc_accuracy(detached_preds, batch.label)
        batch_tp, batch_fp, batch_fn, _ = calc_metrics(detached_preds, batch.label)
        tp += batch_tp
        fp += batch_fp
        fn += batch_fn

        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        epoch_acc += acc

    if tp != 0:
        prec = tp / (tp + fp)
        reca = tp / (tp + fn)
    else:
        prec = 'no TP'
        reca = 'no TP'

    # report the number of batches, then return the per-batch averages
    n_batches = len(train_iterator)
    print(n_batches)
    return epoch_loss / n_batches, epoch_acc / n_batches, prec, reca

    
def evaluate(model, dev_iterator, criterion):
    """Score the model on ``dev_iterator`` without updating it.

    Returns a tuple ``(loss, acc, prec, reca, tn, tp, fp, fn)``: loss and
    accuracy averaged over the batches, precision and recall of the positive
    class, and the raw confusion counts. When there is no true positive,
    ``prec`` is a string listing the counts and ``reca`` is ``'no TP'``.
    """
    model.eval()  # evaluation mode: no training-only behaviour
    loss_sum = 0
    acc_sum = 0
    # running confusion counts, kept in calc_metrics order: tp, fp, fn, tn
    totals = [0, 0, 0, 0]
    with torch.no_grad():
        for batch in dev_iterator:
            tokens, lengths = batch.text
            logits = model(tokens, lengths).squeeze(1)
            batch_loss = criterion(logits, batch.label)
            acc_sum += calc_accuracy(logits, batch.label)
            batch_counts = calc_metrics(logits, batch.label)
            totals = [seen + new for seen, new in zip(totals, batch_counts)]
            loss_sum += batch_loss.item()

    tp, fp, fn, tn = totals
    if tp == 0:
        prec = 'no TP (tp:{0},fp:{1},tn:{2},fn:{3})'.format(tp,fp,tn,fn)
        reca = 'no TP'
    else:
        prec = tp / (tp + fp)
        reca = tp / (tp + fn)

    n_batches = len(dev_iterator)
    return loss_sum / n_batches, acc_sum / n_batches, prec, reca, tn, tp, fp, fn


def epoch_time(start_time, end_time):
    """Split the span between two timestamps into whole minutes and seconds.

    Both arguments are in seconds; the result is ``(minutes, seconds)`` with
    fractional parts truncated.
    """
    span = end_time - start_time
    whole_minutes = int(span / 60)
    leftover_seconds = int(span - 60 * whole_minutes)
    return whole_minutes, leftover_seconds
    

class Classifier(nn.Module):
    """Bidirectional LSTM classifier over token-index sequences.

    Args:
        input_dim: number of rows in the embedding table (vocabulary size).
        embedding_dim: size of each embedding vector.
        hidden_dim: hidden size of each LSTM direction.
        output_dim: number of values produced per example.
        n_layers: number of stacked LSTM layers.
        dropout: dropout probability, used between LSTM layers and on the
            embeddings and the final hidden state.
        pad_idx: index of the padding token in the embedding table.
    """
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim, n_layers, dropout, pad_idx):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim, padding_idx=pad_idx)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True,
                           num_layers=n_layers, dropout=dropout)
        # forward and backward final states are concatenated, hence * 2
        self.out = nn.Linear(hidden_dim * 2, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text, text_lengths):
        """Return one row of ``output_dim`` values per sequence in the batch.

        Args:
            text: token indices, shaped [sent len, batch size].
            text_lengths: true length of every sequence in the batch, in the
                order expected by ``pack_padded_sequence`` (longest first).

        Returns:
            Tensor shaped [batch size, output_dim], also when batch size is 1.
        """
        # text = [sent len, batch size]
        embedded = self.dropout(self.embedding(text))
        # embedded = [sent len, batch size, emb dim]

        # Packing ensures the padded positions are not processed by the LSTM.
        # pack_padded_sequence requires a lengths tensor to live on the CPU,
        # even when the batch itself is on the GPU.
        lengths = text_lengths.cpu() if torch.is_tensor(text_lengths) else text_lengths
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, lengths)
        _, (hidden, _) = self.rnn(packed_embedded)

        # hidden = [num layers * num directions, batch size, hid dim]
        # Concatenate the final forward (hidden[-2]) and backward (hidden[-1])
        # states of the top layer and apply dropout.
        hidden = self.dropout(torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1))
        # hidden = [batch size, hid dim * num directions]

        # No squeeze here: squeezing dim 0 would drop the batch dimension for
        # a batch of one and break callers that squeeze dim 1 afterwards.
        return self.out(hidden)

In [0]:
# Experiment locations on the mounted Google Drive. Model checkpoints go to
# OUTPUT_DIR, evaluation reports go below BASE_REPORTS_DIR.
BASE_PATH ="drive/My Drive/Colab Notebooks/experiments"
TASK_NAME = '' # fill in the name of the task; it becomes part of both directory names
OUTPUT_DIR = f'{BASE_PATH}/outputs/{TASK_NAME}/'
BASE_REPORTS_DIR = f'{BASE_PATH}/reports/{TASK_NAME}_evaluation_report'
# Create both directories if they do not exist yet.
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)
if not os.path.exists(BASE_REPORTS_DIR):
    os.makedirs(BASE_REPORTS_DIR)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)
# Seed PyTorch's random number generator.
# NOTE(review): only torch is seeded here; Python's `random` module is not.
SEED = 111
torch.manual_seed(SEED)
# `data` is torchtext's module here: the later `from torchtext import data`
# replaces the earlier `from torch.utils import data`.
# include_lengths=True makes batch.text a (text, lengths) pair, which
# train() and evaluate() unpack.
TEXT = data.Field(include_lengths = True)
LABEL = data.LabelField(dtype = torch.float)
# Column order of the TSV files: text first, label second.
datafields = [("text", TEXT),
             ("label", LABEL)]
train_data  = data.TabularDataset( # select the right training data
            path="drive/My Drive/Colab Notebooks/experiments/data/fastexttagger/abusivetrain.tsv",
            format="tsv", fields=datafields)
dev_data  = data.TabularDataset( # select the right development data
            path="drive/My Drive/Colab Notebooks/experiments/data/fastexttagger/abusivedev.tsv",
            format="tsv", fields=datafields)

The code below approximates embeddings for out-of-vocabulary (OOV) words in the training data. Run it only the first time: the new embeddings are saved to `embeddings.vec`.

In [0]:
#fasttextmodel = fasttext.load_model("drive/My Drive/Colab Notebooks/embeddings/badwordembeddingsxxl.bin")
#wordlist = []
#with open('embeddings.vec', 'w') as f:
#    for x in range(len(train_data)):
#        for word in train_data[x].text:
#            if word not in wordlist:
#                vec = [x for x in fasttextmodel[word]]
#                print(word + ' ' + (' '.join(str(x) for x in vec)), file=f)
#                wordlist.append(word)                

Select pre-trained or polarised embeddings

In [0]:
# Load the word vectors and build the vocabularies from the training data.
# Exactly one of the two `vec = ...` lines below should be active.
print('loading vectors')
#pre-trained embeddings:
vec = Vectors(name='pretrained_embeddings.vec', cache='drive/My Drive/Colab Notebooks/embeddings')

#polarised embeddings:
#vec = Vectors(name='drive/My Drive/Colab Notebooks/embeddings/abusive_embeddings.vec', cache='/content/')

print('building vocabulary')
# unk_init is handed to torchtext as the initialiser for vocabulary words
# that have no vector in `vec` (presumably drawn from a normal distribution
# via torch.Tensor.normal_ -- confirm against the torchtext Vocab docs).
TEXT.build_vocab(train_data, unk_init = torch.Tensor.normal_,
                 vectors = vec) 
LABEL.build_vocab(train_data)

In [0]:
# Hyperparameters passed to Classifier(...) below.
INPUT_DIM = len(TEXT.vocab) 
EMBEDDING_DIM = 300 #should be same as embedding dimensions
HIDDEN_DIM = 300 
OUTPUT_DIM = 1 
N_LAYERS = 2
DROPOUT = 0.5
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

print('creating model')
model = Classifier(INPUT_DIM,EMBEDDING_DIM,HIDDEN_DIM,OUTPUT_DIM,N_LAYERS,DROPOUT,PAD_IDX)
model.to(device)
print(f'The model has {count_parameters(model):,} trainable parameters')
print('preparing model')


print('loading embeddings')
# Overwrite the randomly initialised embedding table with the vectors that
# were attached to the vocabulary by TEXT.build_vocab.
pretrained_embeddings = TEXT.vocab.vectors
model.embedding.weight.data.copy_(pretrained_embeddings)
# Zero the rows of the unknown and padding tokens.
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)
optimizer = optim.Adam(model.parameters(), lr=2e-5)
# The loss takes raw logits; calc_accuracy/calc_metrics apply the sigmoid
# themselves when turning logits into 0/1 predictions.
criterion = nn.BCEWithLogitsLoss()
criterion = criterion.to(device)
BATCH_SIZE = 24
# sort_key/sort_within_batch order every batch by text length, which
# Classifier.forward relies on when it packs the padded sequences.
train_iterator, dev_iterator = data.BucketIterator.splits(
    (train_data, dev_data),
    batch_size = BATCH_SIZE,
    device = device,
    sort_key=lambda x: len(x.text),
    sort_within_batch=True)

In [0]:
# Train for EPOCHS epochs. After every epoch the model is scored on the
# development set, a checkpoint is written to OUTPUT_DIR and the epoch's
# metrics are printed.
EPOCHS = 20
if not os.path.exists(BASE_REPORTS_DIR):
    os.makedirs(BASE_REPORTS_DIR)
for epoch in range(EPOCHS):
    milestone = 0  # module-level state read and updated by progress_tracker
    start_time = time.time()
    print('training...')
    train_loss, train_acc, train_prec, train_reca  = train(model, train_iterator, optimizer, criterion, start_time)
    print('testing on development set...')
    dev_loss, dev_acc, dev_prec, dev_reca, tn, tp, fp, fn = evaluate(model, dev_iterator, criterion)
    # Macro F1 = mean of the F1 scores of the positive and the negative class.
    # Any empty denominator (a class never predicted or never present) ends up
    # in the except branch.
    try:
        pos_prec = tp / (tp +fp) 
        pos_reca = tp / (tp + fn)
        neg_prec = tn / (tn + fn)
        neg_reca = tn / (tn + fp)
        pos_f1 = (2 * ((pos_prec * pos_reca) / (pos_prec + pos_reca)))
        neg_f1 = (2 * ((neg_prec * neg_reca) / (neg_prec + neg_reca)))
        macro_f1 = (pos_f1 + neg_f1) / 2
    except ZeroDivisionError:
        macro_f1 = 'zerodevision'
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    etime = '{0}m {1}s'.format(epoch_mins,epoch_secs)
    
    # One checkpoint per epoch, numbered from 0.
    torch.save(model.state_dict(), f'{OUTPUT_DIR}model_{epoch}.pt')

    print('Epoch: {0} / {1} | Epoch Time: {2}'.format(epoch + 1, EPOCHS, etime))
    print('Train Loss: {0} | Train Acc: {1}'.format(train_loss, train_acc * 100))
    print('Train Pos Prec: {0} | Train Pos Reca: {1}'.format(train_prec, train_reca))
    print('dev Loss: {0} | dev Acc: {1}'.format(dev_loss, dev_acc * 100))
    print('dev Prec: {0} | dev Reca: {1}'.format(dev_prec, dev_reca))
    # The macro F1 was computed above but never shown; report it with the rest.
    print('dev Macro F1: {0}'.format(macro_f1))

In [0]:
# Re-score every saved checkpoint on the development set and write one
# report file per epoch to REPORTS_DIR.
REPORTS_DIR = f'{BASE_REPORTS_DIR}/abusive/'
if not os.path.exists(REPORTS_DIR):
    os.makedirs(REPORTS_DIR)
dev_data  = data.TabularDataset(
            path="drive/My Drive/Colab Notebooks/experiments/data/fastexttagger/abusivedev.tsv",
            format="tsv", fields=datafields)
dev_iterator = data.BucketIterator(dev_data,
    batch_size = BATCH_SIZE,
    device = device,
    sort_key=lambda x: len(x.text),
    sort_within_batch=True)

# 20 must match the number of checkpoints written by the training loop.
for epoch in range(20):
    # map_location lets checkpoints saved on a GPU be loaded on a CPU-only runtime.
    model.load_state_dict(torch.load(f'{OUTPUT_DIR}model_{epoch}.pt', map_location=device))
    dev_loss, dev_acc, dev_prec, dev_reca, tn, tp, fp, fn = evaluate(model, dev_iterator, criterion)
    # Macro F1 = mean of the F1 scores of the positive and the negative class.
    # Any empty denominator ends up in the except branch.
    try:
        pos_prec = tp / (tp + fp)
        pos_reca = tp / (tp + fn)
        neg_prec = tn / (tn + fn)
        neg_reca = tn / (tn + fp)
        pos_f1 = (2 * ((pos_prec * pos_reca) / (pos_prec + pos_reca)))
        neg_f1 = (2 * ((neg_prec * neg_reca) / (neg_prec + neg_reca)))
        macro_f1 = (pos_f1 + neg_f1) / 2
    except ZeroDivisionError:
        macro_f1 = 'zerodevision'

    # All four report lines belong inside the `with` block so they reach the file.
    with open(f'{REPORTS_DIR}epoch{epoch}.txt', 'w') as f:
        print('dev Loss: {0} | dev Acc: {1}'.format(dev_loss, dev_acc * 100), file=f)
        print('dev Prec: {0} | dev Reca: {1}'.format(dev_prec, dev_reca), file=f)
        print('tp: {0} | tn: {1} | fp: {2} | fn: {3}'.format(tp,tn,fp,fn), file=f)
        print('Macro_F1: {0}'.format(macro_f1), file=f)
    print('Epoch: {0}'.format(epoch))
    print('dev Loss: {0} | dev Acc: {1}'.format(dev_loss, dev_acc * 100))
    print('dev Prec: {0} | dev Reca: {1}'.format(dev_prec, dev_reca))