# The LSTM model shown in the KDnuggets article

https://www.kdnuggets.com/2020/07/pytorch-lstm-text-generation-tutorial.html

In [1]:
import torch
from torch import nn, optim
import numpy as np
import pandas as pd
from collections import Counter
from torch.utils.data import DataLoader

In [2]:
# Mount Google Drive into the Colab runtime so this notebook can read and
# write files there. Files uploaded to Google Drive are not deleted by
# inactivity, but mounting requires an authorization code every time it is used.
from google.colab import drive
drive.mount(r'/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Google Drive folder used as the prefix for the training text ("data.txt")
# and for the saved model checkpoints. This is Ryoko's path; please specify
# your own, or we can (probably) share a folder for it.
filepath = r'/content/drive/My Drive/RetardBot/'

In [4]:
# True: train the model and save a checkpoint; False: load a saved checkpoint.
is_training = False
# Parameters needed to run the model.
# These originally needed to be specified from the terminal.
sequence_length = 4  # default = 4
batch_size = 128  # default = 256; reduce if the machine does not have enough RAM
max_epochs = 10  # default = 10
# Switch to the line below to run on the first CUDA GPU instead of the CPU.
#device = torch.device('cuda:0')
device = torch.device('cpu')

### Notes on the model architecture

Based on model from https://www.kdnuggets.com/2020/07/pytorch-lstm-text-generation-tutorial.html

The model has three components:
1. **Embedding layer:** converts input of size (batch_size, sequence_length) to embedding of size (batch_size, sequence_length, embedding_dim)
2. **Stacked LSTM of 3 layers:** accepts embedding and a tuple (previous hidden state, previous cell state) and gives an output of size (batch_size, sequence_length, embedding_dim) and the tuple (current hidden state, current cell state). The hidden state and cell state both have size (num_layers, sequence_length, embedding_dim).
3. **Linear layer:** Maps the output of LSTM to logits for each word in vocab. Not a probability yet. Output size is  (batch_size, sequence_length, vocab_size)

In [5]:
class Model(nn.Module):
    """Word-level language model: embedding -> 3-layer LSTM -> linear projection.

    The vocabulary size is taken from ``len(dataset.uniq_words)``.

    Layout note: the LSTM is created without ``batch_first=True``, so it
    reads dim 0 of its input as the time axis and dim 1 as the batch axis.
    The state passed to ``forward`` must therefore have shape
    ``(num_layers, x.size(1), lstm_size)``.
    """

    def __init__(self, dataset):
        """Build the layers.

        Args:
            dataset: any object exposing ``uniq_words`` (a sized collection
                of the vocabulary); only its length is used here.
        """
        super().__init__()
        self.lstm_size = 128
        self.embedding_dim = 128
        self.num_layers = 3  # stack 3 LSTM layers for abstract representation

        n_vocab = len(dataset.uniq_words)
        self.embedding = nn.Embedding(
            num_embeddings=n_vocab,
            embedding_dim=self.embedding_dim,
        )
        self.lstm = nn.LSTM(
            # The LSTM consumes the embedding output, so its input width is
            # the embedding width (it only coincides with lstm_size today).
            input_size=self.embedding_dim,
            hidden_size=self.lstm_size,
            num_layers=self.num_layers,
            dropout=0.1,
        )
        self.fc = nn.Linear(self.lstm_size, n_vocab)

    def forward(self, x, prev_state):
        """Run one forward pass.

        Args:
            x: integer tensor of word indices with two dims; dim 0 is
                treated as time and dim 1 as batch by the LSTM.
            prev_state: tuple ``(hidden, cell)``, each of shape
                ``(num_layers, x.size(1), lstm_size)``.

        Returns:
            ``(logits, state)`` where ``logits`` has shape
            ``x.shape + (n_vocab,)`` (unnormalised scores, not
            probabilities) and ``state`` is the updated ``(hidden, cell)``.
        """
        embed = self.embedding(x)
        output, state = self.lstm(embed, prev_state)
        logits = self.fc(output)
        return logits, state

    def init_state(self, sequence_length):
        """Return an all-zero ``(hidden, cell)`` state tuple.

        Args:
            sequence_length: size of the state's middle dimension. Despite
                the name, this must equal the size of dim 1 of the tensor
                that will be passed to ``forward``.

        Returns:
            Two zero tensors of shape
            ``(num_layers, sequence_length, lstm_size)``, created on the
            same device and with the same dtype as the model's weights.
        """
        # Follow the model's own weights instead of a notebook-level global,
        # so the state always matches wherever the model currently lives.
        reference = self.fc.weight
        shape = (self.num_layers, sequence_length, self.lstm_size)
        return (torch.zeros(shape, device=reference.device, dtype=reference.dtype),
                torch.zeros(shape, device=reference.device, dtype=reference.dtype))

### Notes on the custom dataset

According to the PyTorch documentation, a custom dataset needs at least the functions \_\_len\_\_ and \_\_getitem\_\_. \_\_len\_\_ allows len(dataset) to return the size of the dataset and \_\_getitem\_\_ allows the ith element of the dataset to be fetched with dataset\[i\].

In this custom dataset, \_\_len\_\_ and \_\_getitem\_\_ are designed like this. Let's say the only sentence we have in the dataset is:

__*We are using LSTM to create the Retard-bot language model.*__

__\_\_len\_\_:__<br>
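For this custom dataset it's defined as "the size of the dataset - sequence length". This is probably because this model is created to make predictions based on the first 4 words (default sequence length) given as prompt, but I can't say for certain. Looking at \_\_getitem\_\_, item i reads the words from position i up to position i + sequence length, so this value is exactly the number of start positions that still have a full window plus one following word. So in the example sentence above, it will return the length of "**to create the Retard-bot language model.**"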

__\_\_getitem\_\_:__<br>
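It seems that this returns a tuple of n-grams with the n defined by sequence length. So if we say dataset\[0\] in the simple example, we would get (**We are using LSTM**, **are using LSTM to**). Not sure why it does this, but the second n-gram is the first one shifted by one word, which matches how the training loop uses it: as the target, i.e. for every input position it holds the word that comes next.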

In [6]:
class Dataset(torch.utils.data.Dataset):
    """Sliding-window next-word dataset built from a single text file.

    Item ``i`` is the pair ``(window, shifted_window)``: the word indices at
    positions ``i .. i+sequence_length-1`` and the same window moved one
    word to the right.
    """

    def __init__(
        self,
        sequence_length,
        data_path=None,
    ):
        """
        Args:
            sequence_length: number of words in each window.
            data_path: optional path of the text file to read. When omitted,
                the notebook-level ``filepath`` + "data.txt" is used.

        Attributes:
            words:          words in entire dataset split by single spaces
            uniq_words:     the unique words sorted by frequency (most frequent first)
            index_to_word:  index to word dict {index0: word0, index1: word1...}, most frequent have smaller index
            word_to_index:  word to index dict {word0: index0, word1: index1...}, most frequent have smaller index
            words_indexes:  the words converted to their indices using word_to_index
        """
        self.sequence_length = sequence_length
        self.data_path = data_path
        self.words = self.load_words()
        self.uniq_words = self.get_uniq_words()
        self.index_to_word = {index: word for index, word in enumerate(self.uniq_words)}
        self.word_to_index = {word: index for index, word in enumerate(self.uniq_words)}
        self.words_indexes = [self.word_to_index[w] for w in self.words]

    def load_words(self):
        """Read the corpus and return it split on single spaces.

        Newlines and tabs are not separators, so they stay inside the tokens.
        """
        # The global is only looked up when no explicit path was supplied.
        path = self.data_path if self.data_path is not None else filepath + "data.txt"
        # NOTE(review): the file is opened with the platform default encoding,
        # as before; changing it could change the vocabulary and therefore
        # break previously saved checkpoints, so confirm before pinning one.
        with open(path, "r") as f:  # context manager closes the handle
            text = f.read()
        return text.split(' ')

    def get_uniq_words(self):
        """Return the distinct words ordered from most to least frequent."""
        word_counts = Counter(self.words)
        return sorted(word_counts, key=word_counts.get, reverse=True)

    def __len__(self):
        """Number of start positions that have a full window plus one next word."""
        # Clamp at zero: a negative value makes len() raise ValueError when
        # the corpus is shorter than the window.
        return max(0, len(self.words_indexes) - self.sequence_length)

    def __getitem__(self, index):
        """Return ``(input_window, target_window)`` as two index tensors.

        Raises:
            IndexError: if ``index`` is outside ``range(len(self))``. Without
                this, out-of-range slices silently produced short or empty
                tensors and plain iteration over the dataset never stopped.
        """
        if not 0 <= index < len(self):
            raise IndexError(f"index {index} is out of range for dataset of length {len(self)}")
        return (
            torch.tensor(self.words_indexes[index:index+self.sequence_length]),
            torch.tensor(self.words_indexes[index+1:index+self.sequence_length+1]),
        )

In [7]:
# Build the corpus first (the model sizes its vocabulary from it), then
# create the network and place it on the configured device in one step.
dataset = Dataset(sequence_length)
model = Model(dataset).to(device)

def train(dataset, model):
    """Train ``model`` on ``dataset`` with Adam and cross-entropy loss.

    Reads the notebook-level settings ``batch_size``, ``max_epochs``,
    ``device`` and ``filepath``. The running average loss is printed every
    100 batches, a checkpoint is written every 3000 batches, and the
    average loss is printed at the end of every epoch.

    Args:
        dataset: dataset yielding ``(input_window, target_window)`` pairs of
            word-index tensors of equal length.
        model: network exposing ``init_state(n)`` and
            ``model(x, state) -> (logits, state)``, where the LSTM inside
            treats dim 0 of ``x`` as time and dim 1 as batch.
    """
    model.train()

    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    for epoch in range(max_epochs):
        epoch_loss = 0.0
        batches_seen = 0

        for i, (x, y) in enumerate(dataloader):
            x, y = x.to(device), y.to(device)

            # The windows in a shuffled batch are unrelated to the previous
            # batch, so each batch starts from a fresh zero state with one
            # state slot per sample (the last batch may be smaller).
            state = model.init_state(x.size(0))

            optimizer.zero_grad()
            # The DataLoader yields (batch, seq) but the LSTM is time-major,
            # so feed (seq, batch); otherwise the recurrence would run across
            # the samples of the batch instead of along each window.
            y_pred, _ = model(x.t(), state)
            # CrossEntropyLoss wants (batch, classes, seq) against (batch, seq).
            loss = criterion(y_pred.permute(1, 2, 0), y)

            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
            batches_seen = i + 1

            if batches_seen % 100 == 0:
                print({ 'epoch': epoch+1, "batch": batches_seen, 'loss': epoch_loss/batches_seen })
            if batches_seen % 3000 == 0:
                # this is needed for work in Colab because once the time limit is up,
                # it will automatically delete all files that are not saved in Google Drive
                print("saving model")
                file_name = 'kdnuggets' + "testrun2"
                torch.save(model.state_dict(), filepath+file_name)

        # Per-epoch summary; skipped when the loader produced no batches so
        # there is no division by zero or use of an unset loop variable.
        if batches_seen:
            print({ 'epoch': epoch+1, 'loss': epoch_loss/batches_seen })

In [20]:
def predict(dataset, model, text, next_words=100):
    """Generate ``next_words`` words that continue the prompt ``text``.

    The prompt is split on single spaces, fed to the model as one
    time-major sequence, and then one word at a time is sampled from the
    softmax of the latest logits and fed back in. The model is switched to
    eval mode and left there.

    Args:
        dataset: object exposing ``word_to_index`` and ``index_to_word``.
        model: network exposing ``init_state(n)`` and
            ``model(x, state) -> (logits, state)``, with dim 0 of ``x``
            treated as time and dim 1 as batch.
        text: prompt; every space-separated token must be in the vocabulary.
        next_words: number of words to generate.

    Returns:
        List of the prompt words followed by the generated words.

    Raises:
        KeyError: if a prompt word is not in ``dataset.word_to_index``.
    """
    model.eval()

    words = text.split(' ')
    try:
        prompt_ids = [dataset.word_to_index[w] for w in words]
    except KeyError as err:
        raise KeyError(f"prompt word {err.args[0]!r} is not in the vocabulary") from err

    # Use the model's own device so input and weights always agree.
    model_device = next(model.parameters()).device

    # Inference only: without no_grad the carried state would keep the
    # autograd graph of every previous step alive.
    with torch.no_grad():
        # A single sequence, so the state has one batch slot.
        state = model.init_state(1)
        # Shape (len(prompt), 1): time on dim 0, batch of one on dim 1, so
        # the whole prompt conditions the first generated word.
        step_input = torch.tensor(prompt_ids, device=model_device).unsqueeze(1)

        for _ in range(next_words):
            y_pred, state = model(step_input, state)

            last_word_logits = y_pred[-1, 0]
            p = torch.nn.functional.softmax(last_word_logits, dim=0).cpu().numpy()
            # Renormalise in float64: float32 rounding can leave the sum far
            # enough from 1 for np.random.choice to reject the distribution.
            p = p.astype(np.float64)
            p /= p.sum()

            word_index = int(np.random.choice(len(p), p=p))
            words.append(dataset.index_to_word[word_index])

            # The state already summarises everything seen so far, so only
            # the newly sampled word is fed on the next step.
            step_input = torch.tensor([[word_index]], device=model_device)

    return words

In [31]:
if not is_training:
    # Inference run: restore previously trained weights from Drive. The
    # tensors are mapped to the CPU while loading and the model is then
    # moved to the configured device.
    model.load_state_dict(
        torch.load(
            filepath + 'kdnuggets_testrun',
            map_location=torch.device('cpu'),
        )
    )
    model.to(device)
else:
    # Training run: fit the model, then store the final weights under a
    # name that records the number of epochs.
    train(dataset, model)
    file_name = 'kdnuggets' + str(max_epochs)
    torch.save(model.state_dict(), filepath + file_name)

In [32]:
# Sample a continuation of the one-word prompt 'What' (predict's default of 100 generated words).
print(predict(dataset, model, text='What'))

['What', 'to', 'I', 'was', 'socially', 'corrupted', "didn't", 'mean', 'this', 'in', 'this', 'reads', 'against', 'the', 'job', 'down', 'lower', 'started', 'an', 'goose', 'about', 'downvoted', 'that', 'into', 'older', 'and', 'that', 'I', 'can', 'let', 'me', 'a', 'wife', 'would', 'be', 'a', 'little', 'PLEASE', 'THE', 'COMNFIDENCE', 'in', 'Vix', 'is', 'close', 'them', 'perfect.rQ´\x07\x00X\x1e\x00\x00\x00*Goes', 'itâ\x80\x99s', '258', 'Retirement', 'SUPER', 'older,', "it's", 'Sunday', 'with', 'that', 'create', 'Christmas', 'looking\n3:', 'instead', 'is', 'pessimistic', 'less', 'than', 'most', 'worse', 'classes', 'speak', 'your', 'board', 'everyone', 'will', 'have', 'through', 'the', 'game', '...â\x80\x9dbe', 'check', 'my', 'portfolio', 'value', 'will', 'lure', 'you', 'want', 'the', 'movie,', 'bouncing', 'resistance', 'Sentiment:\n\n**Sentiment**|**Comments**|**%**\n:--|:--|:--\nBullish|1417|59.12%\nNeutral|13481|-\nBearish|980|40.88%\n\n\nâ\x94\x80â\x94\x80â\x94\x80â\x94\x80â\x94\x80â\x94\

In [33]:
# Sample a continuation of the two-word prompt 'invest in' (predict's default of 100 generated words).
print(predict(dataset, model, text='invest in'))

['invest', 'in', 'too.r)Õ\x03\x00X-\x00\x00\x00tendies', 'added', 'the', 'country', "it's", 'now', 'using', 'his', 'options', 'are', 'down', 'here', 'when', 'you', 'supply', 'pound', 'unpartisan.r~á\x00\x00XM\x00\x00\x00Its', 'Samsung', 'Sales', 'Flags,', 'Check', 'charges', 'then', 'close,', 'huge', 'time', 'and', 'proceeds', 'to', "it's", 'the', 'best', 'week', 'on', 'the', 'club', 'meant', 'to', 'hang', 'into', 'today,', '\x00\x00\x00&amp;#x200B;\n\n**This', 'do', 'over', 'welfare', 'for', 'data', 'for', 'a', 'requirement', 'index', 'trap', 'and', 'as', 'leaked', 'when', 'it', 'on', 'us', 'when', 'RKT', 'can', 'brag', 'if', 'she', "doesn't", 'be', 'what', 'print', 'more', 'kinds', 'of', 'my', 'least', 'itâ\x80\x99s', 'not', 'a', 'couple', 'lotta', 'Earnings', '\\[2/3', 'Volatility\\]\n*', '7:00', '9/18', '$NKLA', '$NKLA', '$NKLA', '250', '9/18', '$NKLA', '250', '9/18', '$NKLA', '250', 'blah', 'done', 'or', 'â\x80\x9chow', 'is', 'a']


In [34]:
# Sample a continuation of the one-word prompt 'deal' (predict's default of 100 generated words).
print(predict(dataset, model, text='deal'))

['deal', 'margarine', 'baron', 'sheriff', 'in', 'another', 'year', 'or', 'that,', 'he', 'says', 'five', 'bj', 'on', '3', 'months', 'honestly', 'loss,', 'just', 'would', 'do', 'it', 'at', 'truly', 'signed', 'securities', 'but', 'he', 'squished', 'issued', 'include', 'hooked', 'with', 'bullets', 'in', 'buys', 'time', 'for', '1/2', 'of', 'all', 'of', 'members', 'â\x9c\x85', 'TA,', 'am', 'bUy', 'MOVERS:\n\n######(**source:', 'fold', 'my', 'head', 'from', 'the', 'airport', 'intraday', 'the', 'S&amp;P:**\n######', '***Monday', 'annually', 'and', 'buy', 'calls', 'to', 'shouting', 'are', 'underestimate', 'infinite', 'huge', 'hands', 'that', 'you', 'think', 'Elon', 'Iâ\x80\x99m', 'pink', 'thing', 'to', 'live', '10', 'opinion', 'is', '16.4%', 'by', 'Washington,', 'my', 'desk', 'or', 'major', 'pride', 'if', "it's", 'a', 'natural', "homes.r\x8c*\n\x00XL\x00\x00\x00That's", '2', '/u/ControlTheNarrative', 'I', 'would', 'want', 'to', 'have']
