# Employing an LSTM to create a generative NLP model

## This notebook used the following sources:

#### 1) http://mlexplained.com/2018/02/15/language-modeling-tutorial-in-torchtext-practical-torchtext-part-2/
#### 2) http://www.jessicayung.com/lstms-for-time-series-in-pytorch/


In [495]:
import numpy as np


import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.autograd
from torch.autograd import Variable as V

torch.manual_seed(1)

import torchtext
from torchtext import data
from torchtext.datasets import WikiText2

# Load Data

In [12]:
# Field created with torchtext's default settings. It is passed to WikiText2.splits
# below and later carries the vocabulary (TEXT.vocab) built from the training split.
TEXT = data.Field()
# NOTE(review): LABEL is not referenced by any other visible cell — confirm before removing.
LABEL = data.Field()

In [509]:
# Load the WikiText2 train / validation / test splits, processed with the TEXT field.
train, valid, test = WikiText2.splits(TEXT) 

In [79]:
# Build the vocabulary from the training split and attach the 200-dimensional GloVe "6B"
# vectors; they are read back later through TEXT.vocab.vectors to initialise the embedding.
TEXT.build_vocab(train, vectors= torchtext.vocab.GloVe(name='6B', dim=200))

In [191]:
# BPTTIterator produces batches exposing `.text` and `.target` (both are read by the
# training loop), which is what a next-word-prediction / text-generation model needs.
train_iter, valid_iter, test_iter = torchtext.data.BPTTIterator.splits(
    datasets=(train, valid, test),
    batch_size=30,  # must equal the batch_size given to RNN_Model (its hidden state is sized with it)
    bptt_len=10,    # presumably the number of time steps per batch — confirm against the torchtext docs
)

# Build a generative model using LSTM cells

In [469]:
class RNN_Model(nn.Module):
    """LSTM language model: embedding -> LSTM -> linear projection onto the vocabulary.

    The recurrent state lives on the instance as ``self.hidden`` and is carried
    from one ``forward`` call to the next. Call ``refresh_hidden`` between
    batches to cut the autograd history, and ``init_hidden`` to reset the state
    to zeros.

    Args:
        in_dim: size of each embedding vector (also the LSTM input size).
        num_vocab: number of rows in the embedding table and number of output logits.
        hid_dim: LSTM hidden size.
        n_layer: number of stacked LSTM layers.
        batch_size: batch size the stored hidden state is allocated for.
    """

    def __init__(self, in_dim, num_vocab, hid_dim, n_layer, batch_size):
        super().__init__()
        # store necessary parameters
        self.in_dim = in_dim
        self.num_vocab = num_vocab
        self.hid_dim = hid_dim
        self.n_layer = n_layer
        self.batch_size = batch_size

        # initialize hidden parameters (zeros, so no random numbers are consumed here)
        self.init_hidden()

        # Define the encoder, decoder, and the LSTM. The construction order is kept
        # as encoder -> decoder -> LSTM so seeded parameter initialisation is unchanged.
        self.encoder = nn.Embedding(embedding_dim=in_dim, num_embeddings=num_vocab)
        self.decoder = nn.Linear(in_features=hid_dim, out_features=num_vocab)
        # num_layers must agree with the (n_layer, batch, hid_dim) state allocated in
        # init_hidden; leaving it at the default of 1 breaks forward() for n_layer > 1.
        self.RNN = nn.LSTM(input_size=in_dim, hidden_size=hid_dim, num_layers=n_layer)

    def forward(self, input):
        """Return vocabulary logits for a batch of token ids.

        Args:
            input: integer tensor of token ids; its first two dimensions are
                preserved in the output (the LSTM is not batch_first, so this is
                presumably (seq_len, batch) — confirm against the iterator).

        Returns:
            Tensor of shape ``(input.size(0), input.size(1), num_vocab)``.
        """
        embedded = self.encoder(input)
        # self.hidden is a plain attribute, not a registered buffer, so moving the
        # module with .to()/.cuda() does not move it; align it with the input here.
        hidden = tuple(h.to(embedded.device) for h in self.hidden)
        output, self.hidden = self.RNN(embedded, hidden)
        # flatten the first two dimensions: (dim0 * dim1, hid_dim)
        logits = self.decoder(output.reshape(-1, output.size(2)))
        # restore the leading dimensions: (dim0, dim1, num_vocab)
        return logits.reshape(output.size(0), output.size(1), logits.size(1))

    def init_hidden(self):
        """Reset the stored (h, c) state to zeros of shape (n_layer, batch_size, hid_dim)."""
        self.hidden = (torch.zeros(self.n_layer, self.batch_size, self.hid_dim),
                       torch.zeros(self.n_layer, self.batch_size, self.hid_dim))

    def refresh_hidden(self):
        """Keep the current state values but detach them from the autograd graph.

        This stops backpropagation from reaching into earlier batches.
        """
        self.hidden = tuple(h.detach() for h in self.hidden)


In [473]:
# Pretrained word vectors attached to the vocabulary; one row per vocabulary entry.
weight_matrix = TEXT.vocab.vectors
n_tokens = weight_matrix.size(0)

# Embedding width and vocabulary size are taken from the pretrained matrix so that
# it can be copied straight into the encoder below.
model = RNN_Model(
    in_dim=weight_matrix.size(1),
    num_vocab=n_tokens,
    hid_dim=100,
    n_layer=1,
    batch_size=30,
)

# Start the embedding layer from the pretrained vectors instead of random values.
model.encoder.weight.data.copy_(weight_matrix)

# Loss function and optimizer used by the training loop.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3, betas=(0.7, 0.99))


In [517]:
def trainepochs(epochs):
    """Train the notebook-level ``model`` and print per-epoch losses.

    Uses the globals ``model``, ``train_iter``, ``valid_iter``, ``criterion``
    and ``optimizer`` defined in earlier cells. After every epoch it prints the
    mean per-token training loss and validation loss.

    Args:
        epochs: number of passes over ``train_iter``.
    """
    for epoch in range(epochs):
        # ---- training pass ----
        # Validation below switches the model to eval mode; switch back each epoch.
        model.train()
        # Start every pass from a zero state instead of whatever the previous pass
        # (training or validation) left behind.
        model.init_hidden()
        train_loss_sum = 0.0
        train_tokens = 0
        for batch in train_iter:
            # zero gradient
            model.zero_grad()

            # keep the previous hidden & cell values without backpropagating into
            # earlier batches
            model.refresh_hidden()

            text, target = batch.text, batch.target
            prediction = model(text)
            loss = criterion(prediction.reshape(-1, prediction.size(2)), target.reshape(-1))
            loss.backward()
            optimizer.step()

            # criterion is built as nn.CrossEntropyLoss() (mean reduction), so weight
            # each batch mean by its token count to get a true per-token average.
            batch_tokens = target.numel()
            train_loss_sum += loss.item() * batch_tokens
            train_tokens += batch_tokens
        train_loss = train_loss_sum / max(train_tokens, 1)

        # ---- validation pass ----
        model.eval()
        model.init_hidden()
        val_loss_sum = 0.0
        val_tokens = 0
        # No gradients are needed for evaluation; this avoids building autograd graphs.
        with torch.no_grad():
            for batch in valid_iter:
                model.refresh_hidden()
                text, target = batch.text, batch.target
                prediction = model(text)
                loss = criterion(prediction.reshape(-1, prediction.size(2)), target.reshape(-1))
                batch_tokens = target.numel()
                val_loss_sum += loss.item() * batch_tokens
                val_tokens += batch_tokens
        val_loss = val_loss_sum / max(val_tokens, 1)

        print('Epoch:', epoch, ', Training Loss:', train_loss, ', Validation Loss: ', val_loss)


# Now we train the model

In [None]:
# Number of passes over the training data; passed straight to trainepochs.
epochs = 2
trainepochs(epochs)

# Now we print out the words predicted by the trained model

In [519]:
def word_ids_to_sentence(id_tensor, vocab, join=None):
    """Convert a batch of word ids back into words.

    Args:
        id_tensor: integer ``torch.Tensor`` (any device, any integer dtype) or
            ``numpy.ndarray`` of word ids. A 2-D input is transposed before it is
            flattened, so the entries of each column come out next to each other.
        vocab: object whose ``itos`` sequence maps a word id to its word.
        join: ``None`` to get the list of words, otherwise the separator string
            used to join the words into a single string.

    Returns:
        A list of words, or one string when ``join`` is given.

    Raises:
        TypeError: if ``id_tensor`` is neither a tensor nor an ndarray.
    """
    if torch.is_tensor(id_tensor):
        ids = id_tensor.detach().cpu()
        # transpose(0, 1) needs at least two dimensions; a 1-D tensor is used as is
        if ids.dim() >= 2:
            ids = ids.transpose(0, 1)
        ids = ids.reshape(-1).tolist()
    elif isinstance(id_tensor, np.ndarray):
        ids = id_tensor.transpose().reshape(-1).tolist()
    else:
        raise TypeError(
            "id_tensor must be a torch.Tensor or numpy.ndarray, got "
            + type(id_tensor).__name__
        )

    words = [vocab.itos[idx] for idx in ids]  # denumericalize
    if join is None:
        return words
    return join.join(words)

In [518]:
# `b` is not defined in any earlier visible cell, so on a fresh kernel this cell would
# fail with a NameError. Draw one batch from the validation iterator explicitly.
b = next(iter(valid_iter))

# Inference only: eval mode, a zeroed hidden state, and no autograd bookkeeping.
model.eval()
model.init_hidden()
with torch.no_grad():
    arrs = model(b.text).cpu().numpy()

# argmax over the last (vocabulary) axis picks the most likely word at every position
word_ids_to_sentence(np.argmax(arrs, axis=2), TEXT.vocab, join=' ')