In [1]:
import utils
from get_dataset import get_dataset
import random
import time
import os
import torch
import torch.nn as nn
import torchtext.data as data
from torch.optim import Adam
from torchtext.data import Iterator
from importlib import reload
import math

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [2]:
# If you have dark theme
from matplotlib.pyplot import rcParams
rcParams['figure.facecolor'] = 'white'

In [3]:
# Setup logger
import logging
logging.basicConfig(filename='log.txt',
                    level=logging.INFO,
                    format='%(asctime)s %(levelname)s : %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p')

In [4]:
dataset, emb_weights = get_dataset()
dataset.fields['review'].include_lengths = True

random.seed(43)
ds_train, ds_val, ds_test = dataset.split(split_ratio=[0.8, 0.1, 0.1], random_state=random.getstate())

<br>

In [5]:
from torchtext.data import Iterator
from torchtext.data import BucketIterator

In [6]:
padding_list = []
for i in range(1, 11):
    it = Iterator(dataset, 8, shuffle=True)
    for batch in it:
        lengths = batch.review[1]
        max_length = lengths.max().item()
        padding_sum = (max_length - lengths).tolist()
        padding_list.extend(padding_sum)
avg_pad_length = sum(padding_list)/len(padding_list)
print(avg_pad_length)

14.366


In [7]:
padding_list = []
for i in range(1, 11):
    it = BucketIterator(dataset, 8, shuffle=True, sort_key=lambda x: len(x.review))
    for batch in it:
        lengths = batch.review[1]
        max_length = lengths.max().item()
        padding_sum = (max_length - lengths).tolist()
        padding_list.extend(padding_sum)
avg_pad_length = sum(padding_list)/len(padding_list)
print(avg_pad_length)

14.366


In [8]:
class MyIterator(data.Iterator):
    def create_batches(self):
        if self.train:
            def pool(d, random_shuffler):
                for p in data.batch(d, self.batch_size * 50):
                    p_batch = data.batch(
                        sorted(p, key=self.sort_key),
                        self.batch_size, self.batch_size_fn)
                    for b in random_shuffler(list(p_batch)):
                        yield b
            self.batches = pool(self.data(), self.random_shuffler)
            
        else:
            self.batches = []
            for b in data.batch(self.data(), self.batch_size,
                                          self.batch_size_fn):
                self.batches.append(sorted(b, key=self.sort_key))

In [9]:
padding_list = []
for i in range(1, 11):
    it = MyIterator(dataset, 8, shuffle=True, train=True, sort_key=lambda x: len(x.review))
    for batch in it:
        lengths = batch.review[1]
        max_length = lengths.max().item()
        padding_sum = (max_length - lengths).tolist()
        padding_list.extend(padding_sum)
avg_pad_length = sum(padding_list)/len(padding_list)
print(avg_pad_length)

0.7153333333333334


<br>

In [5]:
class SelfAttention(nn.Module):
    def __init__(self, query_dim, n_outputs, dropout=0.1):
        super().__init__()
        self.W1 = nn.Linear(query_dim, query_dim//2)
        self.W2 = nn.Linear(query_dim//2, n_outputs)
        self.softmax = nn.Softmax(dim=2)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, query):
        attn_weights = self.W2(self.dropout(torch.tanh(self.W1(query))))
        attn_weights = attn_weights.permute(0,2,1)
        
        return self.softmax(attn_weights)

In [6]:
class GRU_SelfAttention_model(nn.Module):
    def __init__(self, vocab_size, embed_dim, embed_vecs=None, hidden_size=512,
                num_layers=1, attn_ouput_size=8, dropout=(0, 0), bidirectional=False):
        super().__init__()
        self.num_directions = 2 if bidirectional else 1
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.attn_output_size = attn_ouput_size
        if embed_vecs is not None:
            self.embedding = nn.Embedding.from_pretrained(embed_vecs)
        else:
            self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.gru = nn.GRU(input_size=embed_dim, hidden_size=hidden_size,
                          num_layers=num_layers, dropout=dropout[1],
                          bidirectional=bidirectional, batch_first=True)
        self.dropout = nn.Dropout(p=dropout[0])
        self.attention = SelfAttention(self.num_directions*self.hidden_size, attn_ouput_size, dropout[0])
        self.head = nn.Linear(self.attn_output_size*self.num_directions*self.hidden_size, 2)

        
    def forward(self, batch):
        batch, lengths = batch
        batch_dim, _ = batch.shape
        
        embedded = self.dropout(self.embedding(batch))
        embedded_packed = nn.utils.rnn.pack_padded_sequence(embedded, lengths, batch_first=True, enforce_sorted=False)
        
        outputs_packed, hiddens = self.gru(embedded_packed)
        
        outputs, lengths = nn.utils.rnn.pad_packed_sequence(outputs_packed, batch_first=True)
#         last_hidden = hiddens.view(self.num_layers, self.num_directions, batch_dim, self.hidden_size)[-1,:,:,:]
#         hidden_concat = last_hidden.transpose(1,0).reshape(batch_dim, self.num_directions*self.hidden_size)
        
        attn_weights = self.attention(self.dropout(outputs))
        attn_output = torch.bmm(attn_weights, outputs).view(batch_dim, -1)
        
        logging.debug('batch shape : {}'.format(batch.shape))
        logging.debug('embedding shape : {}'.format(embedded.shape))
        logging.debug('hiddens shape : {}'.format(hiddens.shape))
        logging.debug('outputs shape : {}'.format(outputs.shape))
#         logging.debug('hidden_concat shape : {}'.format(hidden_concat.shape))
        logging.debug('attn_weights shape : {}'.format(attn_weights.shape))
        logging.debug('attn_output shape : {}'.format(attn_output.shape))
        
        return self.head(self.dropout(attn_output))

In [43]:
import numpy as np
def validate(ds, loss_fn, model, bs=1, device=device):
    """
        Loops over a dataset (validation or test) and evaluates average
        loss and accuracy of a given model.
    """
    is_in_train = model.training
    model.eval()
    with torch.no_grad():
#         size = len(ds)
        predictions = []
        gt = []
        loss = 0
        for i, batch in enumerate(MyIterator(ds, bs, sort_key=lambda x: len(x.review), shuffle=False, train=False, device=device)):
            output = model(batch.review)
            predictions.extend(output.argmax(dim=1).tolist())
            gt.extend(batch.label.tolist())
            loss += loss_fn(output, batch.label).item()
        avg_loss = loss/(i+1)

    accuracy = np.mean(np.array(predictions) == np.array(gt))
    if is_in_train: model.train()
        
    return avg_loss, accuracy

In [44]:
def learner(model, loss_fn, optimiser, epochs=1, bs=4, device=device, grad_clip=None):
    start_time = time.time()
    for epoch in range(epochs):
        
        total_loss = 0
        for i, batch in enumerate(MyIterator(ds_train, bs, sort_key=lambda x: len(x.review), shuffle=True, device=device), 1):
            optimiser.zero_grad()
            
            output = model(batch.review)
            loss = loss_fn(output, batch.label)
            total_loss += loss.item()
        
            loss.backward()
            if grad_clip is not None:
                nn.utils.clip_grad_value_(model.parameters(), grad_clip)
            optimiser.step()

            if not i % (len(ds_train)//(bs*3)):
                avg_loss = total_loss / (len(ds_train)//(bs*3))
                val_loss, val_accuracy = validate(ds_val, loss_fn, model, bs=bs)
                print('Epoch : {}, batch : {}, train_loss = {:.4f}, val_loss = {:.4f}, val_accuracy : {:.3f}, time = {:.0f}s'.format(
                        epoch + 1, i, avg_loss, val_loss, val_accuracy, time.time() - start_time))
                total_loss = 0

In [54]:
vocab_size = len(emb_weights)
embed_size = 300

model = GRU_SelfAttention_model(vocab_size, embed_size, emb_weights.clone(), bidirectional=True,
                                num_layers=2, hidden_size=32, attn_ouput_size=8, dropout=(0.4, 0.5)).to(device)
loss_fn = nn.CrossEntropyLoss().to(device)

In [55]:
optimiser = Adam(model.parameters(), lr=3e-4)
learner(model, loss_fn, optimiser, epochs=8, bs=8)

Epoch : 1, batch : 100, train_loss = 0.6843, val_loss = 0.6942, val_accuracy : 0.480, time = 4s
Epoch : 1, batch : 200, train_loss = 0.6477, val_loss = 0.6067, val_accuracy : 0.657, time = 8s
Epoch : 1, batch : 300, train_loss = 0.6081, val_loss = 0.5375, val_accuracy : 0.750, time = 13s
Epoch : 2, batch : 100, train_loss = 0.5271, val_loss = 0.4929, val_accuracy : 0.750, time = 18s
Epoch : 2, batch : 200, train_loss = 0.4800, val_loss = 0.4609, val_accuracy : 0.783, time = 23s
Epoch : 2, batch : 300, train_loss = 0.4774, val_loss = 0.4398, val_accuracy : 0.790, time = 28s
Epoch : 3, batch : 100, train_loss = 0.4549, val_loss = 0.4245, val_accuracy : 0.817, time = 33s
Epoch : 3, batch : 200, train_loss = 0.4189, val_loss = 0.4035, val_accuracy : 0.803, time = 37s
Epoch : 3, batch : 300, train_loss = 0.4326, val_loss = 0.3917, val_accuracy : 0.833, time = 42s
Epoch : 4, batch : 100, train_loss = 0.4296, val_loss = 0.4257, val_accuracy : 0.823, time = 46s
Epoch : 4, batch : 200, train_lo

In [46]:
optimiser = Adam(model.parameters(), lr=3e-4)
learner(model, loss_fn, optimiser, epochs=8, bs=8)

Epoch : 1, batch : 100, train_loss = 0.6961, val_loss = 0.6725, val_accuracy : 0.647, time = 4s
Epoch : 1, batch : 200, train_loss = 0.6602, val_loss = 0.6044, val_accuracy : 0.720, time = 9s
Epoch : 1, batch : 300, train_loss = 0.5909, val_loss = 0.5619, val_accuracy : 0.697, time = 15s
Epoch : 2, batch : 100, train_loss = 0.5560, val_loss = 0.4787, val_accuracy : 0.777, time = 21s
Epoch : 2, batch : 200, train_loss = 0.4776, val_loss = 0.4868, val_accuracy : 0.773, time = 27s
Epoch : 2, batch : 300, train_loss = 0.4775, val_loss = 0.4192, val_accuracy : 0.813, time = 33s
Epoch : 3, batch : 100, train_loss = 0.4469, val_loss = 0.4023, val_accuracy : 0.830, time = 38s
Epoch : 3, batch : 200, train_loss = 0.4192, val_loss = 0.4156, val_accuracy : 0.823, time = 44s
Epoch : 3, batch : 300, train_loss = 0.4293, val_loss = 0.3928, val_accuracy : 0.837, time = 49s
Epoch : 4, batch : 100, train_loss = 0.4257, val_loss = 0.3824, val_accuracy : 0.847, time = 55s
Epoch : 4, batch : 200, train_lo

In [47]:
optimiser.param_groups[0]['lr'] = 1e-4
learner(model, loss_fn, optimiser, epochs=3, bs=8)

Epoch : 1, batch : 100, train_loss = 0.3509, val_loss = 0.3299, val_accuracy : 0.867, time = 4s
Epoch : 1, batch : 200, train_loss = 0.3282, val_loss = 0.3186, val_accuracy : 0.880, time = 9s
Epoch : 1, batch : 300, train_loss = 0.3177, val_loss = 0.3191, val_accuracy : 0.880, time = 15s
Epoch : 2, batch : 100, train_loss = 0.3431, val_loss = 0.3286, val_accuracy : 0.870, time = 22s
Epoch : 2, batch : 200, train_loss = 0.3039, val_loss = 0.3195, val_accuracy : 0.880, time = 28s
Epoch : 2, batch : 300, train_loss = 0.3219, val_loss = 0.3211, val_accuracy : 0.890, time = 33s
Epoch : 3, batch : 100, train_loss = 0.3441, val_loss = 0.3246, val_accuracy : 0.870, time = 39s
Epoch : 3, batch : 200, train_loss = 0.2867, val_loss = 0.3207, val_accuracy : 0.880, time = 45s
Epoch : 3, batch : 300, train_loss = 0.3514, val_loss = 0.3184, val_accuracy : 0.890, time = 51s


In [48]:
print('Test loss : {:.5f}, test accuracy : {:.03f}'.format(*validate(ds_test, loss_fn, model)))

Test loss : 0.26914, test accuracy : 0.883


In [49]:
model.embedding.weight.requires_grad_(True);
optimiser.param_groups[0]['lr'] = 1e-4
learner(model, loss_fn, optimiser, epochs=5, bs=8)

Epoch : 1, batch : 100, train_loss = 0.3011, val_loss = 0.3228, val_accuracy : 0.887, time = 7s
Epoch : 1, batch : 200, train_loss = 0.2996, val_loss = 0.3191, val_accuracy : 0.880, time = 14s
Epoch : 1, batch : 300, train_loss = 0.3170, val_loss = 0.3170, val_accuracy : 0.893, time = 21s
Epoch : 2, batch : 100, train_loss = 0.3007, val_loss = 0.3294, val_accuracy : 0.870, time = 29s
Epoch : 2, batch : 200, train_loss = 0.2744, val_loss = 0.3178, val_accuracy : 0.883, time = 37s
Epoch : 2, batch : 300, train_loss = 0.2973, val_loss = 0.3161, val_accuracy : 0.887, time = 44s
Epoch : 3, batch : 100, train_loss = 0.2928, val_loss = 0.3282, val_accuracy : 0.863, time = 52s
Epoch : 3, batch : 200, train_loss = 0.2599, val_loss = 0.3168, val_accuracy : 0.887, time = 60s
Epoch : 3, batch : 300, train_loss = 0.2856, val_loss = 0.3117, val_accuracy : 0.890, time = 67s
Epoch : 4, batch : 100, train_loss = 0.2620, val_loss = 0.3298, val_accuracy : 0.863, time = 75s
Epoch : 4, batch : 200, train_l

In [50]:
print('Test loss : {:.5f}, test accuracy : {:.03f}'.format(*utils.validate(ds_test, loss_fn, model)))

Test loss : 0.26950, test accuracy : 0.900


In [51]:
model.embedding.weight.requires_grad_(True);
optimiser.param_groups[0]['lr'] = 5e-5
learner(model, loss_fn, optimiser, epochs=5, bs=8)

Epoch : 1, batch : 100, train_loss = 0.2598, val_loss = 0.3224, val_accuracy : 0.893, time = 8s
Epoch : 1, batch : 200, train_loss = 0.2140, val_loss = 0.3202, val_accuracy : 0.890, time = 15s
Epoch : 1, batch : 300, train_loss = 0.2579, val_loss = 0.3179, val_accuracy : 0.893, time = 24s
Epoch : 2, batch : 100, train_loss = 0.2600, val_loss = 0.3229, val_accuracy : 0.890, time = 32s
Epoch : 2, batch : 200, train_loss = 0.2218, val_loss = 0.3190, val_accuracy : 0.887, time = 40s
Epoch : 2, batch : 300, train_loss = 0.2383, val_loss = 0.3162, val_accuracy : 0.890, time = 48s
Epoch : 3, batch : 100, train_loss = 0.2382, val_loss = 0.3197, val_accuracy : 0.880, time = 56s
Epoch : 3, batch : 200, train_loss = 0.1969, val_loss = 0.3192, val_accuracy : 0.890, time = 64s
Epoch : 3, batch : 300, train_loss = 0.2505, val_loss = 0.3161, val_accuracy : 0.893, time = 72s
Epoch : 4, batch : 100, train_loss = 0.2452, val_loss = 0.3199, val_accuracy : 0.887, time = 79s
Epoch : 4, batch : 200, train_l

In [52]:
print('Test loss : {:.5f}, test accuracy : {:.03f}'.format(*utils.validate(ds_test, loss_fn, model)))

Test loss : 0.27463, test accuracy : 0.897
