# The LSTM model shown in the KDnuggets article

https://www.kdnuggets.com/2020/07/pytorch-lstm-text-generation-tutorial.html

In [None]:
import torch
from torch import nn, optim
import numpy as np
import pandas as pd
from collections import Counter
from torch.utils.data import DataLoader
import re, string
from nltk.tokenize import WordPunctTokenizer
import matplotlib.pyplot as plt

In [None]:
# Mount Google Drive so this notebook can read and write files stored there.
# Files uploaded to Google Drive are not deleted by inactivity,
# but mounting requires an authorization code every time you use it.
from google.colab import drive
drive.mount(r'/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Base folder on the mounted Google Drive: the training text is read from
# here and checkpoints / trained weights are written here.
# This is Ryoko's Google Drive path -- please specify your own, or we can
# (probably) share a folder for it.
filepath = r'/content/drive/My Drive/RetardBot/'

In [None]:
# Run mode: True trains the model; False loads previously saved weights instead.
is_training = True

# Checkpointing, for saving/loading the model when Colab crashes or times out.
load_model  = True
load_file = "kdn_temp2"  # checkpoint file to resume from when load_model is True
tmp_name = "kdn_temp2"   # checkpoint file that training (re)writes after each epoch

# The device switch: use the first GPU when CUDA is available, otherwise fall
# back to the CPU so the notebook still runs on a CPU-only runtime.
# To force the CPU, replace the expression with torch.device('cpu').
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Parameters needed to run the model
# (these originally had to be specified from the terminal).
sequence_length = 4  # Default = 4
batch_size = 128     # Default = 256; reduce if the machine does not have enough RAM
max_epochs = 400     # Default = 10

### Notes on the model architecture

Based on model from https://www.kdnuggets.com/2020/07/pytorch-lstm-text-generation-tutorial.html

The model has three components:
1. **Embedding layer:** converts input of size (batch_size, sequence_length) to embedding of size (batch_size, sequence_length, embedding_dim)
2. **Stacked LSTM of 3 layers:** accepts the embedding and a tuple (previous hidden state, previous cell state), and gives an output of size (batch_size, sequence_length, lstm_size) together with the tuple (current hidden state, current cell state). The hidden state and cell state both have size (num_layers, sequence_length, lstm_size). In this model lstm_size and embedding_dim are both 128.
3. **Linear layer:** Maps the output of LSTM to logits for each word in vocab. Not a probability yet. Output size is  (batch_size, sequence_length, vocab_size)

In [None]:
class Model(nn.Module):
    """Word-level language model: embedding -> 3-layer LSTM -> linear layer.

    The vocabulary size is taken from ``len(dataset.uniq_words)``; it sizes
    both the embedding table and the output layer.

    NOTE(review): ``nn.LSTM`` is built without ``batch_first=True``, so per the
    PyTorch default it reads axis 0 of its input as the time axis and axis 1
    as the batch axis. That is why ``init_state`` puts its size argument on
    the middle axis of the state tensors. The behaviour is kept as-is so that
    existing checkpoints and callers keep working.
    """

    def __init__(self, dataset):
        """Build the layers.

        Args:
            dataset: any object exposing ``uniq_words``, a sized collection of
                the vocabulary.
        """
        super().__init__()
        self.lstm_size = 128
        self.embedding_dim = 128
        self.num_layers = 3  # stack 3 LSTM layers for abstract representation

        n_vocab = len(dataset.uniq_words)
        self.embedding = nn.Embedding(
            num_embeddings=n_vocab,
            embedding_dim=self.embedding_dim,
        )
        self.lstm = nn.LSTM(
            # The LSTM consumes the embedding output, so its input width must
            # be embedding_dim (previously lstm_size, which only worked
            # because both happen to be 128).
            input_size=self.embedding_dim,
            hidden_size=self.lstm_size,
            num_layers=self.num_layers,
            dropout=0.1,
        )
        self.fc = nn.Linear(self.lstm_size, n_vocab)

    def forward(self, x, prev_state):
        """Map word indices to next-word logits.

        Args:
            x: integer tensor of word indices.
            prev_state: tuple ``(hidden_state, cell_state)`` as returned by
                ``init_state`` or by a previous call.

        Returns:
            ``(logits, state)``: ``logits`` has one score per vocabulary word
            on its last axis (not yet a probability); ``state`` is the updated
            ``(hidden_state, cell_state)`` tuple.
        """
        embed = self.embedding(x)
        output, state = self.lstm(embed, prev_state)
        logits = self.fc(output)
        return logits, state

    def init_state(self, sequence_length):
        """Return zero-filled ``(hidden_state, cell_state)`` tensors.

        Each tensor has shape ``(num_layers, sequence_length, lstm_size)`` and
        is created on the device the model's parameters live on, so the state
        always matches the model regardless of notebook-level settings.
        """
        model_device = next(self.parameters()).device
        state_shape = (self.num_layers, sequence_length, self.lstm_size)
        return (torch.zeros(state_shape, device=model_device),
                torch.zeros(state_shape, device=model_device))

### Notes on the custom dataset

According to the PyTorch documentation, a custom dataset needs at least the functions \_\_len\_\_ and \_\_getitem\_\_. \_\_len\_\_ allows len(dataset) to return the size of the dataset and \_\_getitem\_\_ allows the ith element of the dataset to be fetched with dataset\[i\].

In this custom dataset, \_\_len\_\_ and \_\_getitem\_\_ are designed like this. Let's say the only sentence we have in the dataset is:

__*We are using LSTM to create the Retard-bot language model.*__

__\_\_len\_\_:__<br>
For this custom dataset it's defined as "the size of the dataset - sequence length". This is probably because this model is created to make predictions based on the first 4 words (default sequence length) given as prompt, but I can't say for certain. The subtraction also guarantees that every valid index still has a full input window, plus its target shifted by one word, inside the dataset. So in the example sentence above, it will return the length of "**to create the Retard-bot language model.**"

__\_\_getitem\_\_:__<br>
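It seems that this returns a tuple of n-grams with the n defined by sequence length. So if we say dataset\[0\] in the simple example, we would get (**We are using LSTM**, **are using LSTM to**). The second n-gram is the first one shifted forward by one word, so during training it serves as the next-word target for every position of the input.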

In [None]:
def preprocess(filename, clip=1):
    """Read a text file and return it as a flat list of lower-cased tokens.

    Processing steps, in order:
      1. split the raw text on single spaces;
      2. drop every chunk whose ``repr`` contains a backslash (escaped or
         non-printable characters, formerly emojis or similar). Because the
         split is on spaces only, a chunk that spans a line break contains
         a newline, which ``repr`` escapes, so it is dropped too;
      3. lower-case the remaining chunks;
      4. split punctuation from words with nltk's ``WordPunctTokenizer``;
      5. keep only the first ``1/clip`` of the resulting tokens.

    Args:
        filename: path of the text file to read.
        clip: positive number; the result is shortened to ``1/clip`` of the
            full token list (``clip=1`` keeps everything).

    Returns:
        list of str tokens.

    Raises:
        ValueError: if ``clip`` is not positive.
    """
    if clip <= 0:
        raise ValueError("clip must be a positive number, got %r" % (clip,))

    # Use a context manager so the file handle is always closed.
    with open(filename, "r") as f:
        raw_text = f.read()

    # One tokenizer instance is enough; it was previously rebuilt per word.
    tokenizer = WordPunctTokenizer()
    tokens = []
    for chunk in raw_text.split(' '):
        if '\\' in repr(chunk):
            continue
        tokens.extend(tokenizer.tokenize(chunk.lower()))

    # if passed, clip shortens the text to 1/clip of the original
    clip_idx = round(len(tokens) / clip)
    return tokens[:clip_idx]

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Sliding-window next-word dataset over the preprocessed corpus.

    Item ``i`` is a pair of index tensors, each ``sequence_length`` long: the
    words starting at position ``i`` and the same window shifted forward by
    one word (the next-word target for every input position).

    Attributes:
        words:         tokens of the whole corpus, as returned by preprocess
        uniq_words:    the unique words sorted by frequency (most frequent first)
        index_to_word: index to word dict {index0: word0, index1: word1...},
                       most frequent words have smaller indices
        word_to_index: word to index dict {word0: index0, word1: index1...},
                       most frequent words have smaller indices
        words_indexes: the corpus converted to indices using word_to_index
    """

    def __init__(
        self,
        sequence_length
    ):
        """Load the corpus and build the vocabulary lookup tables."""
        self.sequence_length = sequence_length
        self.words = self.load_words()
        self.uniq_words = self.get_uniq_words()
        self.index_to_word = dict(enumerate(self.uniq_words))
        self.word_to_index = {word: index for index, word in self.index_to_word.items()}
        self.words_indexes = [self.word_to_index[w] for w in self.words]

    def load_words(self):
        """Return the token list of clean_data.txt, clipped to 1/15 of its length."""
        # The original tutorial read the 'Joke' column of reddit-cleanjokes.csv here.
        return preprocess(filepath+"clean_data.txt", clip=15)

    def get_uniq_words(self):
        """Return the distinct words, most frequent first."""
        word_counts = Counter(self.words)
        return sorted(word_counts, key=word_counts.get, reverse=True)

    def __len__(self):
        """Number of windows that have a full input AND a full shifted target.

        Clamped at zero: a corpus shorter than ``sequence_length`` used to
        yield a negative value, which makes ``len()`` raise ``ValueError``.
        """
        return max(0, len(self.words_indexes) - self.sequence_length)

    def __getitem__(self, index):
        """Return ``(input_window, target_window)`` as two index tensors."""
        stop = index + self.sequence_length
        return (
            torch.tensor(self.words_indexes[index:stop]),
            torch.tensor(self.words_indexes[index + 1:stop + 1]),
        )

In [None]:
# Build the corpus/vocabulary first: the model sizes its embedding and output
# layers from the dataset's vocabulary, then is moved to the chosen device.
dataset = Dataset(sequence_length)
model = Model(dataset)
model = model.to(device)
# NOTE: the checkpoint name `tmp_name` is deliberately not re-assigned here.
# It is defined once in the settings cell; a second copy in this cell would
# silently override any change made there.

def train(dataset, model):
    """Train ``model`` on ``dataset``, writing a checkpoint after every epoch.

    Uses the notebook-level settings ``batch_size``, ``max_epochs``,
    ``sequence_length``, ``device``, ``filepath``, ``load_model``,
    ``load_file`` and ``tmp_name``.

    When ``load_model`` is truthy, training resumes from the checkpoint at
    ``filepath + load_file``: model weights, optimizer state, the list of
    per-epoch losses and the learning-rate schedule are restored, and
    training continues with the epoch AFTER the last finished one.

    Checkpoint keys: 'epoch' (index of the last finished epoch),
    'model_state_dict', 'optimizer_state_dict', 'scheduler_state_dict' and
    'epoch_losses' (summed batch loss of every finished epoch, including the
    one just completed).

    Args:
        dataset: dataset yielding ``(input_indices, target_indices)`` pairs.
        model: module providing ``init_state(n)`` and ``model(x, state)``
            returning ``(logits, state)``.
    """
    model.train()

    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    # Every scheduler step multiplies the base learning rate by another 0.8.
    scheduler = torch.optim.lr_scheduler.LambdaLR(
        optimizer, lr_lambda=lambda step: 0.8 ** step)

    if load_model:
        checkpoint = torch.load(filepath+load_file, map_location=device)
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        epoch_losses = checkpoint['epoch_losses']
        # 'epoch' is the index of the last epoch that finished, so resume with
        # the next one instead of training the finished epoch a second time.
        start_epoch = checkpoint['epoch'] + 1
        if 'scheduler_state_dict' in checkpoint:
            scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
        else:
            # Older checkpoints carry no scheduler state (and their loss list
            # is one entry short). The scheduler steps once per 10 finished
            # epochs, so its step count is recovered from the epoch counter;
            # otherwise the next step would reset the rate to 0.001 * 0.8.
            scheduler.last_epoch = start_epoch // 10
    else:
        start_epoch = 0
        epoch_losses = []

    for epoch in range(start_epoch, max_epochs):
        epoch_loss = 0.0
        batches_seen = 0
        state_h, state_c = model.init_state(sequence_length)

        # NOTE(review): the recurrent state is carried from one batch to the
        # next even though the loader shuffles; kept as in the original.
        for batches_seen, (x, y) in enumerate(dataloader, start=1):
            x, y = x.to(device), y.to(device)
            optimizer.zero_grad()
            y_pred, (state_h, state_c) = model(x, (state_h, state_c))
            # CrossEntropyLoss expects the class axis second, so the logits
            # are transposed to put the vocabulary axis at position 1.
            loss = criterion(y_pred.transpose(1, 2), y)

            # Cut the graph so later batches do not backpropagate into this one.
            state_h = state_h.detach()
            state_c = state_c.detach()

            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()

            if batches_seen % 100 == 0:
                print({ 'epoch': epoch+1, "batch": batches_seen, 'loss': epoch_loss/batches_seen })

        # Guard the average so an empty loader does not crash the epoch summary.
        mean_loss = epoch_loss/batches_seen if batches_seen else float('nan')
        print({ 'epoch': epoch+1, 'loss': mean_loss, 'epoch_losses len': len(epoch_losses)})

        if (epoch+1)%10 == 0:
            print("decrease learning rate")
            scheduler.step()

        # Record the loss BEFORE saving so the checkpoint includes this epoch.
        epoch_losses.append(epoch_loss)

        # this is needed for work in Colab because once the time limit is up,
        # it will automatically delete all files that are not saved in Google Drive
        print("saving model")
        torch.save({
              'epoch': epoch,
              'model_state_dict': model.state_dict(),
              'optimizer_state_dict': optimizer.state_dict(),
              'scheduler_state_dict': scheduler.state_dict(),
              'epoch_losses': epoch_losses,
              }, filepath + tmp_name)
      

In [None]:
def predict(dataset, model, text, next_words=100):
    """Generate ``next_words`` words that continue the prompt ``text``.

    The prompt is split on single spaces. At step ``i`` the model is fed the
    window ``words[i:]`` (always as long as the prompt, since one word is
    appended per step) and the next word is sampled from the softmax of the
    logits at the last position.

    Args:
        dataset: object providing ``word_to_index`` and ``index_to_word``.
        model: module providing ``init_state(n)``, ``parameters()`` and
            ``model(x, state)`` returning ``(logits, state)``.
        text: prompt; every word must be in ``dataset.word_to_index``.
        next_words: number of words to generate.

    Returns:
        list of str: the prompt words followed by the generated words.

    Raises:
        KeyError: if a prompt word is not in the vocabulary.
    """
    model.eval()

    words = text.split(' ')
    # Create inputs on the device the model actually lives on.
    model_device = next(model.parameters()).device
    state_h, state_c = model.init_state(len(words))

    # Inference only: without no_grad the autograd graph would keep growing
    # through the state that is carried from step to step.
    with torch.no_grad():
        for i in range(next_words):
            window = [dataset.word_to_index[w] for w in words[i:]]
            x = torch.tensor([window], device=model_device)
            y_pred, (state_h, state_c) = model(x, (state_h, state_c))

            last_word_logits = y_pred[0][-1]
            p = torch.nn.functional.softmax(last_word_logits, dim=0).cpu().numpy()
            p /= p.sum()  # this is to avoid an error numpy gives about probability not summing to 1

            word_index = np.random.choice(len(last_word_logits), p=p)
            words.append(dataset.index_to_word[word_index])

    return words

In [None]:
if not is_training:
    # Inference only: restore previously saved weights. They are deserialized
    # onto the CPU first and the model is then moved to the configured device.
    model.load_state_dict(
        torch.load(filepath + 'kdnuggets_temp', map_location=torch.device('cpu'))
    )
    model.to(device)
else:
    # Train (possibly resuming from a checkpoint), then store the final
    # weights under a name that records the epoch budget.
    train(dataset, model)
    file_name = 'kdnuggets_preprocess_' + str(max_epochs)
    torch.save(model.state_dict(), filepath + file_name)

{'epoch': 9, 'batch': 100, 'loss': 4.97307436466217}
{'epoch': 9, 'batch': 200, 'loss': 4.985718533992768}
{'epoch': 9, 'batch': 300, 'loss': 4.989361406962077}
{'epoch': 9, 'batch': 400, 'loss': 4.993049536943436}
{'epoch': 9, 'batch': 500, 'loss': 4.992406661987305}
{'epoch': 9, 'batch': 600, 'loss': 4.992355169455211}
{'epoch': 9, 'batch': 700, 'loss': 4.991387325695583}
{'epoch': 9, 'batch': 800, 'loss': 4.993124195933342}
{'epoch': 9, 'batch': 900, 'loss': 4.991148814095391}
{'epoch': 9, 'batch': 1000, 'loss': 4.992438296794892}
{'epoch': 9, 'batch': 1100, 'loss': 4.994581370353699}
{'epoch': 9, 'batch': 1200, 'loss': 4.99402241786321}
{'epoch': 9, 'batch': 1300, 'loss': 4.993597203034621}
{'epoch': 9, 'batch': 1400, 'loss': 4.99409601518086}
{'epoch': 9, 'batch': 1500, 'loss': 4.996076333999634}
{'epoch': 9, 'batch': 1600, 'loss': 4.997355302274227}
{'epoch': 9, 'batch': 1700, 'loss': 4.996597643179052}
{'epoch': 9, 'batch': 1800, 'loss': 4.997361216280195}
{'epoch': 9, 'batch': 

In [None]:
# Sample a continuation of the prompt 'what' (predict generates 100 words by
# default) and print the resulting word list, prompt included.
print(predict(dataset, model, text='what'))

['what', '70', 'price', 'is', 'buying', 'of', 'economic', 'effects', 'here', 'for', 'the', 'npl', 'volume', 'for', 'up', 'was', 'pricing', 'in', 'the', 'big', ',', 'peter', 'feddo', ',', '2019', 'expected', ',', 'while', 'the', 'top', 'president', 'and', 'rent', 'in', 'lieu', 'of', 'the', 'company', "'", 'll', 'be', 'met', 'the', 'company', "'", 'm', 'looking', 'at', 'approximately', '4', ':', 'abuse', 'and', 'mental', 'region', 'mooning', 'in', 'place', 'again', 'you', 'make', 'holding', '45', '-', 'work', ';', 'estimates', 'in', 'the', 'musk', 'came', 'to', 'make', 'the', 'botox', 'to', 'reflect', 'good', 'signal', ',', 'other', 'people', 'with', 'her', 'accounts', 'and', 'just', 'really', 'stumble', 'significantly', 'on', 'wednesday', ',', 'the', 'resected', 'lung', 'about', 'the', 'world', 'will', 'be']


In [None]:
# Sample a continuation of the two-word prompt 'invest in' and print the
# resulting word list, prompt included.
print(predict(dataset, model, text='invest in'))

['invest', 'in', 'wework', 'got', 'auto', 'manufacturers', 'losing', 'their', 'excess', ',', 'but', '...', 'just', 'like', 'major', 'domestic', 'stimulus', 'is', 'still', 'starting', 'out', 'the', 'cost', '-', 'a', 'out', 'these', 'garbage', 'stocks', 'usually', 'gets', 'circuit', 'breakers', ',', 'all', 'mods', 'can', "'", 's', 'consciousness', 'old', 'broker', 'to', 'divest', 'missouri', 'and', 'market', ',', 'which', 'i', 'am', 'going', 'into', 'this', 'year', 'periods', 'of', 'the', 'stock', 'has', 'averaged', 'the', 'fed', 'is', 'traveling', 'by', 'the', 'foot', ',', 'country', ',', 'as', 'extensions', 'of', 'came', 'up', '2', '%', 'from', 'the', 'french', 'king', ',', 'its', 'sss', '188', '.', '**', 'uncertainty', 'hopefully', 'on', 'my', 'taxes', 'on', '&', 'amp', ';', 'q', '=', 'day', 'moving', 'average']


In [None]:
# Sample a continuation of the prompt 'deal' and print the resulting word
# list, prompt included.
print(predict(dataset, model, text='deal'))

['deal', 'for', 'the', 'mainstream', '-', 'duty', 'or', 'office', '](', 'https', '://', 'imgur', '.', '4', '%', '|', 'lows', 'is', 'in', 'our', 'power', 'are', 'paying', 'for', 'good', 'for', 'the', 'rising', 'relatively', '8', '.', '98', 'shares', ',', 'they', 'expect', '-', '0', '.', 'org', '/', '8', '.', 'change', 'activity', 'for', '$', '2', '.', 'bullish', '25', ')', 'from', 'february', 'the', 'you', 'would', 'be', 'road', 'discuss', 'for', '$', '500', ',', 'seems', 'to', 'get', 'too', 'be', 'close', '|', '+', '0', '.', 'reddit', '.', '1tn', 'their', 'own', 'technical', 'point', 'to', '10', '.', '2', ')', 'is', 'no', 'new', 'zealand', "'", 're', 'about', 'both', 'parties', 'and', 'decide', 'from', 'u', '.', '00']
