# Contents

- [Loading the data](#loading-the-data)
- [Cleaning and preprocessing](#cleaning-&-preprocessing)
- [Dataset creation](#dataset-creation)
- [Model](#model)
- [Training the model](#train-the-model)

# Loading the data

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

In [None]:
# Relative path to the reddit_wsb.csv dataset file read below.
file_path = '../input/reddit-wallstreetsbets-posts/reddit_wsb.csv'

In [None]:
# Load the raw posts into a DataFrame.
df = pd.read_csv(file_path)

# Preview the first rows.
df.head()

In [None]:
# (rows, columns) of the raw DataFrame.
df.shape

First of all let's drop the rows that have an empty body.

In [None]:
# Keep only the posts that have a body and renumber the rows 0..n-1.
# reset_index(drop=True) discards the old index directly instead of first
# materialising it as an 'index' column and dropping that column afterwards
# (which fails if the frame already has an 'index' column), and it returns a
# new DataFrame rather than mutating the dropna() result in place.
body_df = df.dropna(subset=['body']).reset_index(drop=True)

body_df.head()

In [None]:
# (rows, columns) left after dropping the posts without a body.
body_df.shape

In [None]:
from torchtext.data.utils import get_tokenizer

# Tokenizer used for the length statistics here and for the vocabulary later.
tokenizer = get_tokenizer('basic_english')


def count_words(x):
    """Return how many tokens the tokenizer produces for ``x``."""
    return len(tokenizer(x))


# Token count of every post body (each body is converted to str first).
body_df['body_lenght'] = body_df['body'].map(lambda body: count_words(str(body)))

In [None]:
# Histogram of the post lengths (token counts) before any length filtering.
plt.hist(body_df.body_lenght, bins=25)
plt.show()

In [None]:
# Summary statistics of the DataFrame, now including the body_lenght column.
body_df.describe()

# Cleaning & Preprocessing

In any machine learning task, cleaning and preprocessing the data is as important as building the model, so in this section I'm going to apply some basic text preprocessing to the body of the WSB posts: removing URLs, collapsing whitespace, removing emoji and emoticons, stripping quotes and leftover markup characters, and so on. (Lowercasing is not done here; it is left to the tokenizer.)

In [None]:
import re

def remove_urls(text):
    """Return ``text`` with every ``http(s)://...`` or ``www....`` URL removed."""
    return re.sub(r'https?://\S+|www\.\S+', r'', text)

In [None]:
def remove_spaces(x):
    """Collapse every run of whitespace in ``x`` into a single space."""
    whitespace_run = re.compile('\\s+')
    return whitespace_run.sub(' ', x)

In [None]:
# Reference : https://gist.github.com/slowkow/7a7f61f495e3dbb7e3d767f97bd7304b
def remove_emoji(string):
    """Return ``string`` with emoji and other pictographic code points removed.

    The character class is very broad: besides the dedicated emoji ranges it
    contains every code point from U+24C2 to U+1F251 and the whole range
    U+10000-U+10FFFF, so non-emoji text inside those ranges (for example CJK
    ideographs) is removed as well.
    """
    code_point_ranges = (
        "\U0001F600-\U0001F64F",  # emoticons
        "\U0001F300-\U0001F5FF",  # symbols & pictographs
        "\U0001F680-\U0001F6FF",  # transport & map symbols
        "\U0001F1E0-\U0001F1FF",  # flags (iOS)
        "\U00002500-\U00002BEF",  # U+2500..U+2BEF (box drawing up to symbols and arrows)
        "\U00002702-\U000027B0",  # dingbats
        "\U00002702-\U000027B0",  # (same range again, as in the referenced gist)
        "\U000024C2-\U0001F251",  # catch-all; also covers CJK and other scripts
        "\U0001f926-\U0001f937",
        "\U00010000-\U0010ffff",  # every code point outside the BMP
        "\u2640-\u2642",
        "\u2600-\u2B55",
        "\u200d",  # zero-width joiner
        "\u23cf",
        "\u23e9",
        "\u231a",
        "\ufe0f",  # variation selector-16
        "\u3030",
    )
    emoji_pattern = re.compile("[" + "".join(code_point_ranges) + "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', string)

In [None]:
def remove_double_quotes(x):
    """Return ``x`` without any ``"`` characters."""
    return x.replace('"', '')


def remove_single_quotes(x):
    """Return ``x`` without any ``'`` characters."""
    return x.replace('\'', '')


def trim(x):
    """Return ``x`` with leading and trailing whitespace stripped."""
    return x.strip()

In [None]:
# Markup leftovers stripped from the post bodies, removed in list order.
# Order matters: '#' is removed before '&x200B', so an '&#x200B' entity has
# already been reduced to '&x200B' by the time that entry is processed.
# ' ;' and '&nbsp' are two separate entries (a missing comma used to fuse
# them into the single string ' ;&nbsp'). Both the backslash-prefixed and the
# bare left curly quote are listed: the former is what the old "\“" literal
# actually matched, the latter is the character itself.
other_chars = ['*', '#', '&x200B', '[', ']', '; ', ' ;', '&nbsp', '\\“', '“']


def remove_other_chars(x: str) -> str:
    """Return ``x`` with every substring listed in ``other_chars`` removed.

    The substrings are removed one after another, in list order.
    """
    for char in other_chars:
        x = x.replace(char, '')

    return x

In [None]:
# Cleaning steps; every post body goes through them in exactly this order.
funcs = [
    remove_urls,
    remove_spaces,
    remove_emoji,
    remove_double_quotes,
    remove_single_quotes,
    remove_other_chars,
    trim,
]


def _clean_body(text):
    """Run ``text`` through every step in ``funcs``, in order."""
    for step in funcs:
        text = step(text)
    return text


body_df['body'] = body_df['body'].apply(_clean_body)

Now that we've cleaned the body of the posts, let's remove all the posts whose length falls outside (roughly) the interquartile range, i.e. between the 1st quartile and the 3rd quartile: we keep only the posts that are longer than 20 and shorter than 300 tokens.

In [None]:
# Drop the posts that are at most 20 tokens long. The lengths in
# 'body_lenght' were measured before the cleaning step above.
is_too_short = body_df['body_lenght'] <= 20
index_names = body_df.loc[is_too_short].index

body_df.drop(index=index_names, inplace=True)

In [None]:
# Drop the posts that are 300 tokens or longer.
is_too_long = body_df['body_lenght'] >= 300
index_names = body_df.loc[is_too_long].index

body_df.drop(index=index_names, inplace=True)

In [None]:
# Reset the index (again) so the rows are numbered 0..n-1 after the drops.
# drop=True discards the old index directly instead of inserting it as an
# 'index' column that then has to be dropped in a second step.
body_df.reset_index(drop=True, inplace=True)

body_df

In [None]:
# Summary statistics again, now for the length-filtered posts only.
body_df.describe()

In [None]:
# Histogram of the post lengths that remain after the length filter.
plt.hist(body_df.body_lenght, bins=25)
plt.show()

In [None]:
# Plain Python list of the cleaned post bodies, one string per post.
body_data = body_df.body.tolist()

# Dataset Creation 

To let the model recognize where a post starts and ends, I'm going to add the *\<sos>* (start of sentence) special token at the beginning of every post and the *\<eos>* (end of sentence) special token at its end.

In [None]:
# Special tokens wrapped around every post: start and end of sentence.
SOS_token = "<sos>"
EOS_token = "<eos>"

In [None]:
# Wrap every post as "<sos> body <eos>", with single spaces as separators.
body_data = [" ".join((SOS_token, body, EOS_token)) for body in body_data]

In [None]:
from collections import Counter
import torch
from torchtext.vocab import Vocab

If you've done something related to NLP, you might know that to feed our model with words we must turn them into numbers, and in order to do that we need a vocabulary table that maps the split tokens to numerical indices. Instead of coding my own, I'm going to use **torchtext**, a useful PyTorch library equipped with a `Vocab` class that is going to build the vocabulary for us!

In [None]:
from collections import Counter
from torchtext.vocab import Vocab

# Token frequencies over all the (already <sos>/<eos>-wrapped) posts.
counter = Counter(token for body in body_data for token in tokenizer(body))

# Vocabulary built from those counts plus the four special tokens.
vocab = Vocab(counter, specials=['<unk>', '<pad>', SOS_token, EOS_token])

To make our data samplable using the `DataLoader` we must create our own dataset class by extending the `torch.utils.data.Dataset` class and overriding the `__len__` and `__getitem__` methods.

In [None]:
from torch.utils.data import Dataset
import itertools

def load_data(data: list, vocab) -> list:
    """Tokenize every item of ``data`` and return one flat list of vocab indices.

    The items are processed in order and their token indices are concatenated,
    so item boundaries are not preserved in the result.
    """
    token_ids = []
    for item in data:
        token_ids.extend(vocab[token] for token in tokenizer(item))
    return token_ids

class WSBDataset(Dataset):
    """Sliding-window dataset over the flattened stream of token indices.

    Sample ``idx`` is a pair ``(inputs, targets)`` of 1-D tensors, each
    ``sequence_length`` long, where ``targets`` is ``inputs`` shifted one
    token to the right in the stream.
    """

    def __init__(self, vocab, data, sequence_length):
        self.vocab = vocab
        self.sequence_length = sequence_length
        # Every item of `data`, tokenized and concatenated into one flat list.
        self.words = load_data(data, vocab)

    def __len__(self):
        # One sample per start position that still leaves room for the
        # target window, which ends one token later than the input window.
        return len(self.words) - self.sequence_length

    def __getitem__(self, idx):
        start = idx
        stop = idx + self.sequence_length
        inputs = torch.tensor(self.words[start:stop])
        targets = torch.tensor(self.words[start + 1:stop + 1])
        return (inputs, targets)

# Model

Since the text we are dealing with can be seen as a sequence of tokens, recurrent neural nets are a perfect fit for it. Nevertheless, to avoid the vanishing/exploding gradient problem I've chosen to use a more complex (and better) version of a vanilla RNN: an [LSTM](https://en.wikipedia.org/wiki/Long_short-term_memory) (long short-term memory).

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
import torch.nn as nn

class Model(nn.Module):
    """Word-level language model: embedding -> 3-layer LSTM -> linear layer.

    Inputs are batch-first: ``x`` is a ``(batch, sequence)`` tensor of
    vocabulary indices, which is the layout a ``DataLoader`` produces when it
    stacks 1-D samples.
    """

    def __init__(self, n_vocab=None):
        """Build the network.

        Args:
            n_vocab: Vocabulary size. When omitted, the size of the
                module-level ``vocab`` is used, so ``Model()`` keeps working.
        """
        super().__init__()
        self.lstm_size = 256
        self.embedding_dim = 256
        self.num_layers = 3
        if n_vocab is None:
            n_vocab = len(vocab)

        self.embedding = nn.Embedding(
            num_embeddings=n_vocab,
            embedding_dim=self.embedding_dim,
        )

        self.lstm = nn.LSTM(
            # The LSTM consumes the embeddings, so its input width is the
            # embedding size (not the hidden size, which only happened to
            # have the same value).
            input_size=self.embedding_dim,
            hidden_size=self.lstm_size,
            num_layers=self.num_layers,
            dropout=0.2,
            # x arrives as (batch, sequence). Without batch_first the LSTM
            # reads dimension 0 as time and would unroll over the samples of
            # the batch instead of over the tokens of each sample.
            batch_first=True,
        )

        self.fc = nn.Linear(self.lstm_size, n_vocab)

    def forward(self, x, prev_state=None):
        """Return ``(logits, state)`` for a batch of token-index sequences.

        Args:
            x: ``(batch, sequence)`` tensor of vocabulary indices.
            prev_state: ``(h, c)`` tuple to start from, or ``None`` for a
                zero state. A state whose batch dimension does not match
                ``x`` (for example one created by ``init_state`` with a
                different size, or one carried over from a batch of another
                size) is replaced by a zero state instead of raising.

        Returns:
            ``logits`` of shape ``(batch, sequence, n_vocab)`` and the new
            ``(h, c)`` state, each of shape ``(num_layers, batch, lstm_size)``.
        """
        embed = self.embedding(x)
        if prev_state is not None and prev_state[0].size(1) != x.size(0):
            # Passing None makes nn.LSTM start from zeros for this batch size.
            prev_state = None
        output, state = self.lstm(embed, prev_state)
        logits = self.fc(output)
        return logits, state

    def init_state(self, sequence_length):
        """Return a zero ``(h, c)`` state on the same device as the model.

        Args:
            sequence_length: Size of the second dimension of the state, which
                is the batch dimension the LSTM expects. The parameter keeps
                its historical name so existing callers continue to work.
        """
        # Follow the model's own parameters rather than a module-level global.
        state_device = self.fc.weight.device
        shape = (self.num_layers, sequence_length, self.lstm_size)
        return (
            torch.zeros(shape, device=state_device),
            torch.zeros(shape, device=state_device),
        )

# Train the model

In [None]:
from torch.utils.data import DataLoader
from torch import optim

# Training settings read by train() as module-level globals.
lr = 0.001 # learning rate
max_epochs = 10  # number of passes over the dataset
print_every = 200  # report the running loss every this many batches
batch_size = 256  # samples per DataLoader batch
sequence_length = 7  # tokens per training window
save_model_path = './wsb_lstm.chkpt'  # where the model state_dict is saved
all_losses = []  # per-batch loss values, appended to by train()

In [None]:
def train(dataset: torch.utils.data.Dataset, model):
    """Train ``model`` on ``dataset`` with Adam and cross-entropy loss.

    Reads the module-level settings ``batch_size``, ``lr``, ``max_epochs``,
    ``print_every``, ``sequence_length`` and ``save_model_path``. Every batch
    loss is appended to the module-level ``all_losses`` list. The model's
    ``state_dict`` is saved every 1000 batches and once more at the end, and
    the loss curve is plotted when training finishes.

    Args:
        dataset: Dataset yielding ``(inputs, targets)`` pairs of index tensors.
        model: Model exposing ``init_state`` and returning ``(logits, state)``.
    """
    model.train()

    dataloader = DataLoader(dataset, batch_size, shuffle=True)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    for epoch in range(max_epochs):
        state_h, state_c = model.init_state(sequence_length)

        for batch, (x, y) in enumerate(dataloader):
            optimizer.zero_grad()
            x = x.to(device)
            y = y.to(device)

            y_pred, (state_h, state_c) = model(x, (state_h, state_c))
            # CrossEntropyLoss wants the class dimension second.
            loss = criterion(y_pred.transpose(1, 2), y)

            # The state is carried into the next batch; detaching it keeps
            # backpropagation from reaching into earlier batches.
            state_h = state_h.detach()
            state_c = state_c.detach()

            loss.backward()
            optimizer.step()

            if batch != 0 and batch % 1000 == 0:
                torch.save(model.state_dict(), save_model_path)

            all_losses.append(loss.item())

            if batch != 0 and batch % print_every == 0:
                # all_losses keeps growing across epochs, so the most recent
                # window has to be taken from its tail; indexing it with the
                # per-epoch batch counter would keep reporting the losses of
                # the first epoch.
                recent_losses = all_losses[-print_every:]
                running_loss = sum(recent_losses) / len(recent_losses)
                print({ 'epoch': epoch, 'batch': batch, 'loss': running_loss })

    torch.save(model.state_dict(), save_model_path)
    # show losses
    plt.plot(all_losses)
    plt.show()

In [None]:
def predict(dataset, model, text, next_words=100):
    """Sample ``next_words`` tokens that continue the prompt ``text``.

    Args:
        dataset: Object whose ``vocab`` maps tokens to indices (``vocab[w]``)
            and indices back to tokens (``vocab.itos``).
        model: Trained model exposing ``init_state`` and returning
            ``(logits, state)``.
        text: Prompt to continue.
        next_words: Number of tokens to sample.

    Returns:
        The prompt tokens followed by the sampled tokens, as a list of str.
    """
    model.eval()

    # Tokenize the prompt with the tokenizer the vocabulary was built with, so
    # that casing and punctuation match the vocabulary entries. A plain
    # split(' ') leaves e.g. 'The' capitalised and therefore out of vocabulary.
    # An empty prompt falls back to a single unknown token so there is always
    # at least one input token.
    words = tokenizer(text) or ['<unk>']
    state_h, state_c = model.init_state(len(words))

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        for i in range(0, next_words):
            # One token is appended per step and the window start advances by
            # one, so the input keeps the length of the original prompt.
            x = torch.tensor([[dataset.vocab[w] for w in words[i:]]]).to(device)
            y_pred, (state_h, state_c) = model(x, (state_h, state_c))

            last_word_logits = y_pred[0][-1]
            p = torch.nn.functional.softmax(last_word_logits, dim=0).cpu().numpy()
            # Sample from the predicted distribution instead of taking argmax.
            word_index = np.random.choice(len(last_word_logits), p=p)
            words.append(dataset.vocab.itos[word_index])

    return words

In [None]:
# Build the sliding-window dataset and the model, then run the training loop.
dataset = WSBDataset(vocab, body_data, sequence_length)
model = Model().to(device)

train(dataset, model)

In [None]:
# Sample a continuation of the prompt from the trained model and print it.
print(predict(dataset, model, text='The ceo of nasdaq pushed to halt trading'))