## LSTM version of the RNN code
One important addition is the use of pretrained GloVe embeddings for the vocabulary to populate the weight matrix of the embedding layer. This gives better results than leaving the weight matrix initialised with random numbers.

In [1]:
import torch
from torchtext.legacy import data

In [2]:
SEED = 1234

# Seed PyTorch's random number generator and request deterministic cuDNN kernels
# so that repeated runs of the notebook are comparable.
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# TEXT tokenizes the review text with spaCy's `en_core_web_sm` model.
# `include_lengths = True` makes `batch.text` a (tokens, lengths) pair, which the
# train/evaluate functions unpack and pass to the model for sequence packing.
TEXT = data.Field(tokenize = 'spacy',
                  tokenizer_language = 'en_core_web_sm',
                  include_lengths = True)
# LABEL is stored as a float, matching the BCEWithLogitsLoss criterion used later.
LABEL = data.LabelField(dtype = torch.float)

In [3]:
from torchtext.legacy import datasets

# Load the IMDB reviews through the TEXT/LABEL fields; `splits` returns the train and test sets.
# The recorded output below shows that the first run downloads aclImdb_v1.tar.gz (~84 MB).
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

downloading aclImdb_v1.tar.gz


100%|██████████| 84.1M/84.1M [00:07<00:00, 11.4MB/s]


In [5]:
import random

# Divide the training data into train and validation sets.
# `random.seed()` returns None, so passing its result as `random_state` only worked
# through the side effect of seeding the global generator. Seed first, then hand the
# generator state to `split` explicitly so the reproducibility is intentional.
# NOTE(review): this cell rebinds `train_data`, so re-running it on its own splits the
# already-reduced training set again — re-run from the IMDB loading cell instead.
# NOTE(review): the split ratio is left at the library default — confirm it is the intended one.
random.seed(SEED)
train_data, valid_data = train_data.split(random_state = random.getstate())

In [6]:
MAX_VOCAB_SIZE = 25_000
# Build the vocabulary from the training split only and attach the "glove.6B.100d"
# pretrained vectors to it. These vectors are later copied into the model's embedding
# layer; the recorded shape of `TEXT.vocab.vectors` is [25002, 100], i.e. two entries
# more than MAX_VOCAB_SIZE (presumably the unknown and padding tokens — TODO confirm).
# `unk_init` presumably controls how vectors are initialised for vocabulary words that
# have no GloVe vector (here: drawn from a normal distribution) — verify against torchtext docs.
TEXT.build_vocab(train_data, 
                 max_size = MAX_VOCAB_SIZE,
                 vectors = "glove.6B.100d",
                 unk_init = torch.Tensor.normal_)

.vector_cache/glove.6B.zip: 862MB [02:43, 5.28MB/s]                               
100%|█████████▉| 399999/400000 [00:10<00:00, 36875.98it/s]


In [7]:
# Build the label vocabulary from the training split.
LABEL.build_vocab(train_data)

In [8]:
BATCH_SIZE = 64
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Define one batch iterator per split; batches are placed on `device`.
# `sort_within_batch = True` matters here: the model packs each batch with
# `pack_padded_sequence` using its default settings, which expect the sequences
# in a batch to be ordered by decreasing length.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size = BATCH_SIZE,
    sort_within_batch = True,
    device = device)

In [9]:
import torch.nn as nn

class RNN(nn.Module):
    """LSTM sentence classifier for padded, length-annotated token batches.

    Pipeline: embedding -> dropout -> (optionally bidirectional, multi-layer) LSTM
    -> dropout on the final hidden state -> linear layer.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each token embedding.
        hidden_dim: LSTM hidden size per direction.
        output_dim: number of outputs produced by the final linear layer.
        n_layers: number of stacked LSTM layers.
        bidirectional: whether the LSTM reads the sequence in both directions.
        dropout: dropout probability applied to the embeddings, between stacked
            LSTM layers, and to the final hidden state.
        pad_idx: vocabulary index of the padding token (passed to nn.Embedding
            as `padding_idx`).
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, 
                 bidirectional, dropout, pad_idx):

        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)

        # nn.LSTM applies dropout only between stacked layers, so it has no effect
        # (and emits a warning) for a single layer; pass 0 in that case.
        self.rnn = nn.LSTM(embedding_dim,
                           hidden_dim,
                           num_layers=n_layers,
                           bidirectional=bidirectional,
                           dropout=dropout if n_layers > 1 else 0)

        # The classifier input is the final hidden state of every direction, so its
        # width must follow the `bidirectional` flag instead of assuming two directions.
        num_directions = 2 if bidirectional else 1
        self.fc = nn.Linear(hidden_dim * num_directions, output_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, text, text_lengths):
        """Return one row of `output_dim` raw scores (logits) per sequence.

        Args:
            text: token indices, shape [sent len, batch size].
            text_lengths: unpadded length of each sequence in the batch; with the
                default packing settings they must be in decreasing order.

        Returns:
            Tensor of shape [batch size, output dim].
        """

        #text = [sent len, batch size]

        embedded = self.dropout(self.embedding(text))

        #embedded = [sent len, batch size, emb dim]

        #pack sequence so the LSTM skips the padded positions
        # lengths need to be on CPU!
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths.to('cpu'))

        # Only the final hidden state is used; the per-step outputs and the cell
        # state are discarded, so there is no need to unpack the output sequence.
        _, (hidden, _) = self.rnn(packed_embedded)

        #hidden = [num layers * num directions, batch size, hid dim]

        if self.rnn.bidirectional:
            #concat the final forward (hidden[-2,:,:]) and backward (hidden[-1,:,:]) hidden layers
            final_hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)
        else:
            #single direction: the last entry is the top layer's final hidden state
            final_hidden = hidden[-1,:,:]

        #final_hidden = [batch size, hid dim * num directions]

        return self.fc(self.dropout(final_hidden))
    

In [10]:
# Model hyperparameters.
INPUT_DIM = len(TEXT.vocab)  # one embedding row per vocabulary entry
EMBEDDING_DIM = 100  # must match the width of the "glove.6B.100d" vectors copied in later
HIDDEN_DIM = 256
OUTPUT_DIM = 1  # a single logit per review, consumed by BCEWithLogitsLoss
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5
# Index of the padding token, passed to the embedding layer as `padding_idx`.
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

model = RNN(INPUT_DIM, 
            EMBEDDING_DIM, 
            HIDDEN_DIM, 
            OUTPUT_DIM, 
            N_LAYERS, 
            BIDIRECTIONAL, 
            DROPOUT, 
            PAD_IDX)


In [11]:
def count_parameters(model):
    """Return the total number of elements across the model's trainable parameters.

    Parameters whose `requires_grad` flag is False are not counted.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# Report the model size, with thousands separators for readability.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 4,810,857 trainable parameters


In [12]:
# Vectors attached to the vocabulary by `TEXT.build_vocab(..., vectors = "glove.6B.100d")`.
pretrained_embeddings = TEXT.vocab.vectors

# Sanity check: the shape must match the embedding layer, [vocab size, embedding dim].
print(pretrained_embeddings.shape)

torch.Size([25002, 100])


In [14]:
## Overwrite the model's embedding weights with the pretrained vectors that were
## attached to the vocabulary during build_vocab. `copy_` writes in place, so the
## embedding layer keeps the same parameter object.
model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[-1.1172e-01, -4.9659e-01,  1.6307e-01,  ...,  1.2647e+00,
         -2.7527e-01, -1.3254e-01],
        [-8.5549e-01, -7.2081e-01,  1.3755e+00,  ...,  8.2522e-02,
         -1.1314e+00,  3.9972e-01],
        [-3.8194e-02, -2.4487e-01,  7.2812e-01,  ..., -1.4590e-01,
          8.2780e-01,  2.7062e-01],
        ...,
        [-9.4070e-01, -7.6762e-04, -1.6590e+00,  ...,  1.0883e-01,
         -5.2538e-02, -7.8009e-01],
        [-4.0654e-01,  8.6993e-01,  5.8611e-01,  ..., -3.5978e-01,
          3.2215e-01,  2.5987e-01],
        [-1.0666e+00,  2.0201e-01, -8.8930e-01,  ..., -2.3666e+00,
         -7.2194e-01,  1.8340e-01]])

In [15]:
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

# Reset the embedding rows of the unknown and padding tokens to zeros. The rows copied
# from the vocabulary for these special tokens are presumably not real GloVe vectors
# (likely filled in by `unk_init`) — zeroing them gives both tokens a neutral start.
model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

# The first two rows of the printed matrix should now be all zeros.
print(model.embedding.weight.data)

tensor([[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [-3.8194e-02, -2.4487e-01,  7.2812e-01,  ..., -1.4590e-01,
          8.2780e-01,  2.7062e-01],
        ...,
        [-9.4070e-01, -7.6762e-04, -1.6590e+00,  ...,  1.0883e-01,
         -5.2538e-02, -7.8009e-01],
        [-4.0654e-01,  8.6993e-01,  5.8611e-01,  ..., -3.5978e-01,
          3.2215e-01,  2.5987e-01],
        [-1.0666e+00,  2.0201e-01, -8.8930e-01,  ..., -2.3666e+00,
         -7.2194e-01,  1.8340e-01]])


In [16]:
## Define the optimizer: Adam with its default settings over every model parameter,
## which includes the embedding weights, so the pretrained vectors are fine-tuned.
## NOTE(review): the optimizer is created before `model.to(device)` runs in a later
## cell; PyTorch guidance is to move the model first — confirm this ordering is intended.
import torch.optim as optim
optimizer = optim.Adam(model.parameters())

In [18]:
## Loss function: binary cross-entropy computed directly on the model's raw logits
## (the model's final layer applies no sigmoid).
criterion = nn.BCEWithLogitsLoss()

# Move both the model and the loss module to the selected device.
model = model.to(device)
criterion = criterion.to(device)


In [19]:
def binary_accuracy(preds, y):
    """
    Fraction of predictions in the batch that match the labels.

    `preds` are raw logits; they are passed through a sigmoid and rounded to 0 or 1
    before being compared with `y`. The result is a ratio (8/10 right gives 0.8, NOT 8),
    returned as a tensor.
    """
    probabilities = torch.sigmoid(preds)

    # 1.0 where the rounded probability equals the label, 0.0 elsewhere
    hits = probabilities.round().eq(y).float()

    return hits.sum() / len(hits)

In [21]:
## Run one training epoch: draw batches from the iterator and update the model on each
def train(model, iterator, optimizer, criterion):
    """Train `model` for one pass over `iterator`.

    Each batch must expose `batch.text` as a (tokens, lengths) pair and `batch.label`
    as the targets. For every batch the gradients are reset, the loss is
    back-propagated and the optimizer takes one step.

    Returns:
        (loss, accuracy), each the sum of the per-batch values divided by the
        number of batches in the iterator.
    """
    model.train()

    running_loss = 0
    running_acc = 0

    for batch in iterator:
        tokens, lengths = batch.text
        targets = batch.label

        optimizer.zero_grad()

        # the model returns [batch size, 1]; drop the trailing dim to match the labels
        logits = model(tokens, lengths).squeeze(1)

        batch_loss = criterion(logits, targets)
        batch_acc = binary_accuracy(logits, targets)

        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()
        running_acc += batch_acc.item()

    n_batches = len(iterator)
    return running_loss / n_batches, running_acc / n_batches

In [22]:
## Score the model on an iterator without updating any weights
def evaluate(model, iterator, criterion):
    """Compute loss and accuracy of `model` over `iterator` in evaluation mode.

    The model is switched to eval mode and gradient tracking is disabled for the
    whole pass. Each batch must expose `batch.text` as a (tokens, lengths) pair and
    `batch.label` as the targets.

    Returns:
        (loss, accuracy), each the sum of the per-batch values divided by the
        number of batches in the iterator.
    """
    model.eval()

    running_loss = 0
    running_acc = 0

    with torch.no_grad():
        for batch in iterator:
            tokens, lengths = batch.text
            targets = batch.label

            # the model returns [batch size, 1]; drop the trailing dim to match the labels
            logits = model(tokens, lengths).squeeze(1)

            running_loss += criterion(logits, targets).item()
            running_acc += binary_accuracy(logits, targets).item()

    n_batches = len(iterator)
    return running_loss / n_batches, running_acc / n_batches

In [23]:
import time

def epoch_time(start_time, end_time):
    """Split the interval between two timestamps (in seconds) into whole minutes and seconds.

    Returns:
        (minutes, seconds) as integers; fractional seconds are truncated.
    """
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    leftover_seconds = int(duration - 60 * whole_minutes)
    return whole_minutes, leftover_seconds

In [24]:
N_EPOCHS = 5

# Lowest validation loss seen so far; starts at infinity so the first epoch always saves.
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    # One full pass over the training data, then a scoring pass over the validation data.
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()

    # The reported epoch time covers both the training and the validation pass.
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    # Checkpoint only when the validation loss improves, so the file on disk always
    # holds the weights of the best epoch so far (it is reloaded for the test run).
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut_LSTM-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 0m 31s
	Train Loss: 0.683 | Train Acc: 55.56%
	 Val. Loss: 0.689 |  Val. Acc: 52.47%
Epoch: 02 | Epoch Time: 0m 32s
	Train Loss: 0.640 | Train Acc: 63.97%
	 Val. Loss: 0.661 |  Val. Acc: 62.61%
Epoch: 03 | Epoch Time: 0m 32s
	Train Loss: 0.639 | Train Acc: 63.14%
	 Val. Loss: 0.567 |  Val. Acc: 73.72%
Epoch: 04 | Epoch Time: 0m 32s
	Train Loss: 0.474 | Train Acc: 77.95%
	 Val. Loss: 0.374 |  Val. Acc: 83.87%
Epoch: 05 | Epoch Time: 0m 32s
	Train Loss: 0.356 | Train Acc: 84.95%
	 Val. Loss: 0.316 |  Val. Acc: 86.82%


In [25]:
## find the test accuracy of the model
# Restore the weights saved at the epoch with the lowest validation loss.
# `map_location` remaps the stored tensors onto the current device, so a checkpoint
# written on a GPU machine can also be loaded on a CPU-only one.
model.load_state_dict(torch.load('tut_LSTM-model.pt', map_location = device))

test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.354 | Test Acc: 85.07%
