#### Imports

In [8]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pack_padded_sequence
from gensim.models import FastText
import numpy as np
# import whatever you need

Choose model

In [3]:
chosen_embeddings = FastText.load("embeddings/cc.bn.300.model")
#chosen_embeddings = FastText.load("embeddings/ai4b_subset_sg.model")
#chosen_embeddings = FastText.load("embeddings/ai4b_subset_fair.model")

Create embedding matrix

In [None]:
def build_simple_embedding(gensim_model, keep_n = 150000):
    wv = gensim_model.wv  
    gensim_weights = torch.FloatTensor(wv.vectors[:keep_n])
    # sorted, so keeping top 150000 works
    
    pad_weight = torch.zeros(1, wv.vector_size)                     # <PAD> gets zeros
    special_weights = torch.randn(3, wv.vector_size) * 0.1          # <BOS>, <EOS>, <UNK> get random noise
    # scale down (x0.1) to match sparseness of other token vecs

    # combine <PAD>, <BOS>, <EOS> and <UNK> with other tokens
    all_weights = torch.cat([pad_weight, special_weights, gensim_weights], dim=0)
    
    # make the full embedding
    embedding_layer = nn.Embedding.from_pretrained(all_weights, freeze=False, padding_idx=0)
    
    # create mapping dictionary for token in new vocab, to index
    word2idx = {'<PAD>': 0, '<BOS>': 1, '<EOS>': 2, '<UNK>': 3}
    word2idx.update({word: idx + 4 for word, idx in wv.key_to_index.items()})
    
    return embedding_layer, word2idx

embedding_layer, word_to_index = build_simple_embedding(chosen_embeddings)
print('Vocabulary size:', len(word_to_index))

Vocabulary size: 1485027


(free up RAM)

In [6]:
del chosen_embeddings

Encoder - BiLSTM

In [9]:
class BiLSTMEncoder(nn.Module):
    def __init__(self, embedding_layer, hidden_size):
        super(BiLSTMEncoder, self).__init__()
        
        # AI agent - explain each of these, please!
        self.embedding = embedding_layer # loads embedding made with gensim
        self.hidden_size = hidden_size # neural net hidden size
        embed_size = embedding_layer.embedding_dim # 300 for us
        
        self.lstm = nn.LSTM(
            input_size=embed_size, # 300
            hidden_size=hidden_size, # suppose 256, for subsequent example
            num_layers=1,
            batch_first=True,
            bidirectional=True
        )

    def forward(self, x, lengths):
        # dry run: consider a batch (size 4) of question vectors, 
        # with the longest sequence's length as 8.
            # [ 1, 45,  89,  12,  56,  90,  34,   2]
            # [ 1, 19, 102,  77, 210,  14,   2,   0]
            # [ 1, 65,  23,  11,   2,   0,   0,   0]
            # [ 1, 99,  41,   2,   0,   0,   0,   0]
        # this is loaded into the 2D tensor, x | shape: (4, 8)
        # more generally, x | shape: (batch_size, longest_seq_len)

        # length (1D) is the length of each sequence in x
        # lengths = [8, 7, 5, 4] | shape: (4,)
        # more generally, lengths | shape: (batch_size,)
        

        # convert each word index to its vector with the embedding. 
        # for our example, that's 4 sequences, with each 8 tokens each, and each
        # token having a 'depth' of 300 (it's a vector now)

        # this is a 3D tensor, embedded | shape: (4, 8, 300)
        # more generally, embedded | shape: (batch_size, longest_seq_len, word_vec_embedding_dim)
        embedded = self.embedding(x)
        
        # tells PyTorch to mathematically gloss over <PAD> tokens by ignoring them based on the 
        # values in the length vector (1D tensor)

        # tells the neural net to fully ignore <PAD> tokens.
        # even though they are zeroed out, the LSTM tries to do some math
        # when encountering it using its 3 gates. this adds some redundancy 
        # and learning that it really doesn't need.
        packed_embedded = pack_padded_sequence(
            embedded, 
            lengths.cpu(), # [AI-agent: why CPU? # answer - test without CPU]
            batch_first=True, # our formatting puts the batch_size first
            enforce_sorted=False # sort the batch by sequence length (high to low)
        )
        
        # run the nice embeddings through the BiLSTM
        
        # hidden --- final hidden state (short term memory) | shape: (2, 4, 256)
        # 2 : forward + backward, 4 : sequences, 256 : hidden-size

        # cell --- final cell state (long term memory) | shape: (2, 4, 256)
        # (same logic)

        _, (hidden, cell) = self.lstm(packed_embedded)
        
        # hidden/cell tensors have shape (num_layers * num_directions, batch_size, hidden_size)
        # index 0 -> forward LSTM's final state; index 1 -> backward LSTM's final state

        h_forward = hidden[0, :, :] # shape: (1, 4, 256) [take forward direction]
        h_backward = hidden[1, :, :] # shape: (1, 4, 256) [take backward direction]
        # recall, hidden | shape: (2, 4, 256)
        
        # same logic
        c_forward = cell[0, :, :]
        c_backward = cell[1, :, :]
        
        # concatenate along the hidden_size dimension (dim=1)
        # h_context (c_context) | shape: (batch_size, hidden_size * 2) = (4, 256*2) = (4, 512)

        h_context = torch.cat((h_forward, h_backward), dim=1)
        c_context = torch.cat((c_forward, c_backward), dim=1)
        
        # compressing the context of each question (long term and short term)
        # into two vectors of size 2*256 = 512, for every sentence in the batch
        return h_context, c_context

Decoder - LSTM

In [None]:
class LSTMDecoder(nn.Module):
    def __init__(self, embedding_layer, hidden_size, vocab_size):
        super(LSTMDecoder, self).__init__()
        
        # same as before
        self.embedding = embedding_layer
        embed_size = embedding_layer.embedding_dim
        
        # double of BiLSTM hidden size
        self.hidden_size = hidden_size 
        
        self.lstm = nn.LSTM(
            input_size=embed_size,
            hidden_size=hidden_size,
            num_layers=1,
            batch_first=True
        )
        
        # setup to hold hidden dim vectors streched out as probabilities
        # over tokens in the vocab
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, hidden, cell):
        # x: input token for current step, shape: (batch_size) = (4) [suppose]
        
        # LSTM requires 3D input: (batch_size, sequence_length, embed_size).
        # since we process exactly 1 token at a time (per batch), the sequence_length is always 1
        # shape: (4, 1)
        x = x.unsqueeze(1)
        
        # convert each token to its vector
        # embedded | shape: (4, 1, 300)
        embedded = self.embedding(x)
        
        # pass embedded word and BiLSTM question contexts (long term, short term) into LSTM
        output, (hidden, cell) = self.lstm(embedded, (hidden, cell))
        # output shape: (batch_size, 1, hidden_size) = (4, 1, 512)

        # squeeze out the sequence length dimension because it is no longer needed
        # shape: (4, 512)
        output = output.squeeze(1)
        
        # push to linear layer to make prediction for current word
        # shape: (batch_size, vocab_size) = (4, 1485027)
        prediction = self.fc(output)
        
        # return guess and forward directional memory for next word
        return prediction, hidden, cell

Seq2Seq setup with Teacher Forcing

In [None]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super(Seq2Seq, self).__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        
    def forward(self, src, src_lengths, trg):
        
        # src: (batch_size, max_src_len) - padded bangla questions
        # src_lengths: (batch_size) - true lengths of the questions
        # trg: (batch_size, max_trg_len) - ground truth bangla answers
        
        batch_size = trg.shape[0]
        max_trg_len = trg.shape[1]
        vocab_size = self.decoder.fc.out_features
        
        # empty tensor to hold word by word predictions
        # outputs | shape: (batch_size, max_trg_len, vocab_size)
        outputs = torch.zeros(batch_size, max_trg_len, vocab_size).to(self.device)
        # shape: (4, 8, 1485027)
        
        # encode question
        h_context, c_context = self.encoder(src, src_lengths)
        
        # format for decoder: (batch_size, 512) -> (1, batch_size, 512)
        # LSTM class only accepts in this format
        hidden = h_context.unsqueeze(0)
        cell = c_context.unsqueeze(0)
        
        # first input to the decoder is ALWAYS the <BOS> token.
        input_token = trg[:, 0]
        # this is a column vector of <BOS> tokens, [<BOS>]
        
        # from first word onwards... 
        for t in range(1, max_trg_len):
            
            # pass the current word and the memory states into the decoder
            output, hidden, cell = self.decoder(input_token, hidden, cell)
            # shape: (4, 1485027)
            
            # store the prediction in our outputs tensor
            outputs[:, t, :] = output
            # t'th word across all batches and full vocabulary has been saved as output
            
            # TEACHER FORCING: 
            # ignore whatever the model output.
            # force the next input to be the TRUE target token from the dataset.
            input_token = trg[:, t] 
            # take next column of true answer words as input
            
        return outputs