# The LSTM model shown in the KDnuggets article

https://www.kdnuggets.com/2020/07/pytorch-lstm-text-generation-tutorial.html

In [1]:
import torch
from torch import nn, optim
import numpy as np
import pandas as pd
from collections import Counter
from torch.utils.data import DataLoader

In [2]:
# Mount Google Drive into the Colab runtime so this notebook can read and
# write files stored there. Files uploaded to Google Drive are not deleted
# by inactivity, but mounting requires an authorization code every time.
from google.colab import drive
drive.mount(r'/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Folder on the mounted Drive that holds the training text (data.txt) and the
# saved model weights. This is Ryoko's Google Drive path; please specify your
# own, or point it at a shared folder. Keep the trailing slash: file names are
# appended to this string by plain concatenation.
filepath = r'/content/drive/My Drive/RetardBot/'

In [4]:
# True trains a new model and saves its weights; False loads saved weights.
is_training = True

# Parameters needed to run the model. In the original tutorial script these
# were specified from the terminal.
sequence_length = 4  # words per training window (tutorial default: 4)
batch_size = 128  # tutorial default: 256; reduce if the machine runs out of RAM
max_epochs = 100  # tutorial default: 10

# Use the first GPU when one is available and fall back to the CPU otherwise,
# so the notebook also runs on CPU-only runtimes.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

### Notes on the model architecture

Based on model from https://www.kdnuggets.com/2020/07/pytorch-lstm-text-generation-tutorial.html

The model has three components:
1. **Embedding layer:** converts input of size (batch_size, sequence_length) to embedding of size (batch_size, sequence_length, embedding_dim)
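2. **Stacked LSTM of 3 layers:** accepts the embedding and a tuple (previous hidden state, previous cell state) and gives an output of size (batch_size, sequence_length, lstm_size) and the tuple (current hidden state, current cell state). The hidden state and cell state both have size (num_layers, sequence_length, lstm_size). Here lstm_size equals embedding_dim (both are 128). The middle dimension of the states is sequence_length rather than batch_size because the LSTM is created without batch_first=True, so it treats the second axis of its input as the batch axis.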
3. **Linear layer:** Maps the output of LSTM to logits for each word in vocab. Not a probability yet. Output size is  (batch_size, sequence_length, vocab_size)

In [5]:
class Model(nn.Module):
    """Word-level language model: embedding -> stacked LSTM -> linear head.

    The vocabulary size is taken from ``len(dataset.uniq_words)``.

    NOTE: ``nn.LSTM`` is built with its default ``batch_first=False``, so it
    treats axis 0 of the embedded input as the time axis and axis 1 as the
    batch axis. The state passed to ``forward`` must therefore have
    ``x.shape[1]`` as its middle dimension, which is why ``init_state`` is
    sized by the sequence length and not by the batch size.
    """

    def __init__(self, dataset):
        """Build the layers for the vocabulary of ``dataset``."""
        super().__init__()
        self.lstm_size = 128
        self.embedding_dim = 128
        self.num_layers = 3  # stack 3 LSTM layers for abstract representation

        n_vocab = len(dataset.uniq_words)
        self.embedding = nn.Embedding(
            num_embeddings=n_vocab,
            embedding_dim=self.embedding_dim,
        )
        self.lstm = nn.LSTM(
            # The LSTM consumes the embedding vectors, so its input width is
            # the embedding width (not the hidden width, even though both are
            # 128 here).
            input_size=self.embedding_dim,
            hidden_size=self.lstm_size,
            num_layers=self.num_layers,
            dropout=0.1,
        )
        # Maps each LSTM output vector to one logit per vocabulary word.
        self.fc = nn.Linear(self.lstm_size, n_vocab)

    def forward(self, x, prev_state):
        """Return ``(logits, state)`` for word indices ``x``.

        Args:
            x: integer tensor of word indices.
            prev_state: tuple ``(hidden, cell)`` as produced by ``init_state``
                or returned by a previous call.

        Returns:
            ``logits`` with the shape of ``x`` plus a trailing vocabulary
            axis (unnormalised scores, not probabilities), and the new
            ``(hidden, cell)`` tuple.
        """
        embed = self.embedding(x)
        output, state = self.lstm(embed, prev_state)
        logits = self.fc(output)
        return logits, state

    def init_state(self, sequence_length):
        """Return zeroed ``(hidden, cell)`` tensors on the model's device.

        Each tensor has shape ``(num_layers, sequence_length, lstm_size)``.
        """
        shape = (self.num_layers, sequence_length, self.lstm_size)
        # Follow the device the weights actually live on instead of relying
        # on a notebook-level global.
        target_device = self.fc.weight.device
        return (torch.zeros(shape, device=target_device),
                torch.zeros(shape, device=target_device))

### Notes on the custom dataset

According to the PyTorch documentation, a custom dataset needs at least the functions \_\_len\_\_ and \_\_getitem\_\_. \_\_len\_\_ allows len(dataset) to return the size of the dataset and \_\_getitem\_\_ allows the ith element of the dataset to be fetched with dataset\[i\].

In this custom dataset, \_\_len\_\_ and \_\_getitem\_\_ are designed like this. Let's say the only sentence we have in the dataset is:

__*We are using LSTM to create the Retard-bot language model.*__

__\_\_len\_\_:__<br>
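For this custom dataset it's defined as "the number of words in the dataset - sequence length". This is probably because this model is created to make predictions based on the first 4 words (default sequence length) given as a prompt, but I can't say for certain. A more direct reason is that every item needs sequence-length input words plus one more word for the shifted target, so only that many start positions fit in the text. So in the example sentence above, it will return the length of "**to create the Retard-bot language model.**" (6 words).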

__\_\_getitem\_\_:__<br>
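This returns a tuple of two n-grams, with n defined by the sequence length. So if we say dataset\[0\] in the simple example, we would get (**We are using LSTM**, **are using LSTM to**). The second n-gram is the first one shifted forward by one word: it supplies the target (the next word) for every position of the input, which is what the loss compares the model's predictions against during training.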

In [6]:
class Dataset(torch.utils.data.Dataset):
    """Sliding-window word dataset built from ``data.txt`` under ``filepath``.

    Attributes:
        words:         words in the entire dataset, split on single spaces
        uniq_words:    the unique words sorted by frequency (most frequent first)
        index_to_word: index to word dict {index0: word0, index1: word1...},
                       most frequent words have smaller indices
        word_to_index: word to index dict {word0: index0, word1: index1...},
                       most frequent words have smaller indices
        words_indexes: the words converted to their indices using word_to_index
    """

    def __init__(
        self,
        sequence_length
    ):
        """Load the corpus and build the vocabulary lookup tables."""
        self.sequence_length = sequence_length
        self.words = self.load_words()
        self.uniq_words = self.get_uniq_words()
        self.index_to_word = {index: word for index, word in enumerate(self.uniq_words)}
        self.word_to_index = {word: index for index, word in enumerate(self.uniq_words)}
        self.words_indexes = [self.word_to_index[w] for w in self.words]

    def load_words(self):
        """Read ``data.txt`` from ``filepath`` and split it on single spaces.

        The original tutorial instead joined the 'Joke' column of
        reddit-cleanjokes.csv with spaces.
        """
        # The context manager closes the file even if reading fails.
        with open(filepath + "data.txt", "r") as f:
            text = f.read()
        return text.split(' ')

    def get_uniq_words(self):
        """Return the distinct words, most frequent first."""
        word_counts = Counter(self.words)
        return sorted(word_counts, key=word_counts.get, reverse=True)

    def __len__(self):
        """Return the number of start positions that yield a full item.

        Clamped at 0 so that a corpus shorter than ``sequence_length`` gives
        an empty dataset instead of making ``len()`` raise on a negative value.
        """
        return max(0, len(self.words_indexes) - self.sequence_length)

    def __getitem__(self, index):
        """Return ``(inputs, targets)`` index tensors for the window at ``index``.

        ``targets`` is the same window shifted forward by one word.
        """
        stop = index + self.sequence_length
        return (
            torch.tensor(self.words_indexes[index:stop]),
            torch.tensor(self.words_indexes[index + 1:stop + 1]),
        )

In [7]:
# Load the corpus, then build the model sized to its vocabulary and place it
# on the selected device in one step.
dataset = Dataset(sequence_length)
model = Model(dataset).to(device)

def train(dataset, model):
    """Train ``model`` on ``dataset`` with Adam and cross-entropy loss.

    Reads the notebook-level settings ``batch_size``, ``max_epochs``,
    ``sequence_length`` and ``device``. Prints a dict with the epoch number
    and the mean batch loss after every 10th epoch and after the final epoch
    (once only, even when the final epoch is itself a multiple of 10).

    Raises:
        ValueError: if the dataloader yields no batches.
    """
    model.train()

    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    num_batches = len(dataloader)
    if num_batches == 0:
        raise ValueError("dataset is too small to produce a single training batch")
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    for epoch in range(max_epochs):
        state_h, state_c = model.init_state(sequence_length)
        epoch_loss = 0.0

        for x, y in dataloader:
            x, y = x.to(device), y.to(device)
            optimizer.zero_grad()
            y_pred, (state_h, state_c) = model(x, (state_h, state_c))
            # CrossEntropyLoss expects the class scores on axis 1, so move
            # the vocabulary axis from last to second position.
            loss = criterion(y_pred.transpose(1, 2), y)

            # Cut the carried state out of the autograd graph so backward()
            # does not reach into graphs of earlier batches.
            state_h = state_h.detach()
            state_c = state_c.detach()

            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()

        is_last_epoch = (epoch + 1) == max_epochs
        if (epoch + 1) % 10 == 0 or is_last_epoch:
            print({'epoch': epoch + 1, 'loss': epoch_loss / num_batches})

In [8]:
def predict(dataset, model, text, next_words=100):
    """Extend ``text`` by sampling ``next_words`` words from ``model``.

    Args:
        dataset: provides the ``word_to_index`` / ``index_to_word`` lookups.
        model: the trained model; it is switched to eval mode.
        text: prompt, split on single spaces; every word must be in the
            dataset vocabulary.
        next_words: number of words to generate.

    Returns:
        The prompt words followed by the generated words, as a list of str.

    Raises:
        KeyError: if the prompt contains a word missing from the vocabulary.
    """
    model.eval()

    words = text.split(' ')
    unknown = [w for w in words if w not in dataset.word_to_index]
    if unknown:
        raise KeyError(f"prompt words not in the vocabulary: {unknown}")
    state_h, state_c = model.init_state(len(words))

    # Inference only: skip building autograd graphs.
    with torch.no_grad():
        for i in range(next_words):
            # words[i:] always has as many entries as the prompt, because one
            # word is appended per step while the start index advances by one.
            window = [dataset.word_to_index[w] for w in words[i:]]
            x = torch.tensor([window], device=device)
            y_pred, (state_h, state_c) = model(x, (state_h, state_c))

            last_word_logits = y_pred[0][-1]
            p = torch.nn.functional.softmax(last_word_logits, dim=0).detach().cpu().numpy()

            # Sample (rather than argmax) the next word from the distribution.
            word_index = np.random.choice(len(last_word_logits), p=p)
            words.append(dataset.index_to_word[word_index])

    return words

In [None]:
# The checkpoint name encodes the epoch count. It is defined before the branch
# so that the loading path uses the same file (in the same Drive folder) that
# the training path writes.
file_name = 'kdnuggets' + str(max_epochs)
checkpoint_path = filepath + file_name
if is_training:
    train(dataset, model)
    torch.save(model.state_dict(), checkpoint_path)
else:
    # map_location lets weights saved on a GPU load on a CPU-only runtime.
    model.load_state_dict(torch.load(checkpoint_path, map_location=device))
    model.to(device)

In [None]:
# Sample a continuation (100 words by default) for a knock-knock prompt.
# Every prompt word must be in the dataset vocabulary, otherwise the lookup
# inside predict() raises KeyError.
print(predict(dataset, model, text='Knock knock. Whos there?'))

['Knock', 'knock.', 'Whos', 'there?', 'hearing!', 'diving', 'bartender', 'say', 'to', 'the', 'jumper', 'cables?', 'You', 'better', 'not', 'try', 'to', 'start', 'anything.', "Don't", 'you', 'hate', 'jokes', 'about', 'German', 'sausage?', "They're", 'the', 'wurst!', 'Two', 'artists', 'had', 'an', 'art', 'contest...', 'It', 'ended', 'in', 'a', 'draw', 'Why', 'did', 'the', 'chicken', 'cross', 'the', 'playground?', 'To', 'get', 'to', 'the', 'other', 'slide.', 'What', 'gun', 'do', 'you', 'use', 'to', 'hunt', 'a', 'moose?', 'A', 'moosecut!', 'If', 'life', 'gives', 'you', 'melons,', 'you', 'might', 'have', 'dyslexia.', 'Broken', 'pencils...', '...are', 'pointless.', 'What', 'did', 'one', 'snowman', 'say', 'to', 'the', 'other', 'snowman?', "'Do", 'you', 'smell', "carrots?'", 'How', 'many', 'hipsters', 'does', 'it', 'take', 'to', 'change', 'a', 'lightbulb?', "It's", 'a', 'really', 'obscure']


In [None]:
# Sample a continuation for a three-word prompt.
print(predict(dataset, model, text='What did the'))

['What', 'did', 'the', 'bartender', 'say', 'to', 'the', 'jumper', 'cables?', 'You', 'better', 'not', 'try', 'to', 'start', 'anything.', "Don't", 'you', 'hate', 'jokes', 'about', 'German', 'sausage?', "They're", 'the', 'wurst!', 'Two', 'artists', 'had', 'an', 'art', 'contest...', 'It', 'ended', 'in', 'a', 'draw', 'Why', 'did', 'the', 'chicken', 'cross', 'the', 'playground?', 'To', 'get', 'to', 'the', 'other', 'slide.', 'What', 'gun', 'do', 'you', 'use', 'to', 'hunt', 'a', 'moose?', 'A', 'moosecut!', 'If', 'life', 'gives', 'you', 'melons,', 'you', 'might', 'have', 'dyslexia.', 'Broken', 'pencils...', '...are', 'pointless.', 'What', 'did', 'one', 'snowman', 'say', 'to', 'the', 'other', 'snowman?', "'Do", 'you', 'smell', "carrots?'", 'How', 'many', 'hipsters', 'does', 'it', 'take', 'to', 'change', 'a', 'lightbulb?', "It's", 'a', 'really', 'obscure', 'number.', "You've"]


In [None]:
# Sample a continuation for a longer, seven-word prompt.
print(predict(dataset, model, text='Why did the chicken cross the road'))

['Why', 'did', 'the', 'chicken', 'cross', 'the', 'road', 'week...', 'favorite', 'Italian', 'asks', 'what', 'always', 'God', 'through', 'brown', 'two', 'year', 'first', 'job', 'wheels', 'that', 'work.', 'Why', 'do', 'galaxies', 'have', 'old', 'poker', 'Because', "they're", 'look', 'before', "that'd", 'episodes.', "What's", 'Sam', "Smith's", 'favorite', 'kind', 'with', 'paper.', 'What', 'do', 'you', 'call', 'a', 'pig', 'that', 'does', 'karate?', '*A', 'pork', 'chop.*', 'What', 'was', 'the', 'car', 'doing', 'in', 'the', 'dressing', 'room?', 'Changing', 'attire.', 'What', 'do', 'you', 'call', 'a', 'pile', 'of', 'dogs?', 'A', 'ruff', 'terrain.', 'How', 'do', 'you', 'prepare', 'for', 'a', 'party', 'in', 'space?', 'You', 'Planet', 'Thanks', 'u/BostonCentrist', 'What', 'do', 'you', 'get', 'when', 'you', 'cross', 'an', 'octopus', 'with', 'a', 'cow?', 'A', 'stern', 'rebuke', 'from', 'the', 'Ethics']
