In [17]:
try:
  import google.colab
  IN_COLAB = True
except:
  IN_COLAB = False

# keras module for building LSTM 
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
import keras.utils as ku 
import keras
import tensorflow as tf
from tensorflow.keras.utils import Sequence

if IN_COLAB:
  !pip install Keras-Preprocessing
from keras_preprocessing.sequence import pad_sequences
from keras.preprocessing.text import text_to_word_sequence

import pandas as pd
import numpy as np
import string, os 

# Text Generation with LSTM

In [2]:
# Get Data
train_text_file = keras.utils.get_file('train_text.txt', 'https://jrssbcrsefilesnait.blob.core.windows.net/3950data1/reddit_wsb.csv')
train_text = pd.read_csv(train_text_file)
train_text.sample(10)

Unnamed: 0,title,score,id,url,comms_num,created,body,timestamp
37809,"What do we say to the God of Debt? ""Not today.""",23,lw9bja,https://i.redd.it/126wuz8ytnk61.png,2,1614739000.0,,2021-03-03 04:42:49
34640,Bought another 1000 shares. PLTR to the moon 🚀🚀🚀,879,log9ur,https://i.redd.it/i9d024zq2pi61.jpg,248,1613883000.0,,2021-02-21 06:45:14
13304,"IT'S A WAR, LAWSUIT get ready boys.",1,l724iu,https://www.reddit.com/r/wallstreetbets/commen...,1,1611882000.0,HOLD HOLD DON'T SELLL,2021-01-29 02:54:04
21409,NOKIA YOLO Update. For the Holders MOON 💥 BOOM...,79,lai4j5,https://i.redd.it/ny41ojqqfye61.jpg,21,1612253000.0,,2021-02-02 09:57:34
25928,Should I sell NOK and buy GME or AMC?,9,lc7h4a,https://www.reddit.com/r/wallstreetbets/commen...,26,1612442000.0,I feel more supportive on GME and AMC that in NOK,2021-02-04 14:27:55
43721,What it looks like if you were 170% invested i...,653,mfs678,https://www.reddit.com/r/wallstreetbets/commen...,168,1617061000.0,"In my brilliance, I saw high premiums on DISCA...",2021-03-30 02:34:26
25609,"Disclaimers for new diamond-handed, autistic, ...",9,lcc6ri,https://www.reddit.com/r/wallstreetbets/commen...,19,1612460000.0,I'm not entirely new to this sub but I haven't...,2021-02-04 19:36:20
45407,Buy the dip $RKT. 112K RKT long ding dong,104,mph94o,https://i.redd.it/85655y9dprs61.png,23,1618273000.0,,2021-04-13 03:17:11
1974,Boomers are big mad now cuz they cant understa...,4,l6yxbq,https://imgur.com/vwV6eN5.jpg,0,1611875000.0,,2021-01-29 01:02:26
20699,$GME Caution (actually just Gay Bear) reportin...,0,l9yrhs,https://www.reddit.com/r/wallstreetbets/commen...,135,1612198000.0,"**TLDR: ""CoNtrARiaN"" but actually just a\ndick...",2021-02-01 18:47:36


In [3]:
TOKENS = 1000
OUTPUT_LENGTH = 25

sentence to text words

In [22]:
raw_text = train_text['body']
## Remove punctuation
raw_text = raw_text.dropna()
raw_text = raw_text.apply(lambda x: x.replace('[{}]'.format(string.punctuation), ''))
vocab = set()
sentences = []
for sentence in raw_text:
  current_sentence = text_to_word_sequence(sentence)
  sentences.append(current_sentence)
  vocab.update(current_sentence)
#vocab
#sentences
max_length = max([len(sentence) for sentence in sentences])

[['the',
  'ceo',
  'of',
  'nasdaq',
  'pushed',
  'to',
  'halt',
  'trading',
  '“to',
  'give',
  'investors',
  'a',
  'chance',
  'to',
  'recalibrate',
  'their',
  'positions”',
  'https',
  'mobile',
  'twitter',
  'com',
  'mediaite',
  'status',
  '1354504710695362563',
  'https',
  'mobile',
  'twitter',
  'com',
  'mediaite',
  'status',
  '1354504710695362563',
  'now',
  'sec',
  'is',
  'investigating',
  'brokers',
  'are',
  'disallowing',
  'buying',
  'more',
  'calls',
  'this',
  'is',
  'the',
  'institutions',
  'flat',
  'out',
  'admitting',
  'they',
  'will',
  'change',
  'the',
  'rules',
  'to',
  'bail',
  'out',
  'the',
  'rich',
  'but',
  'if',
  'it',
  'happens',
  'to',
  'us',
  'we',
  'get',
  'a',
  '“well',
  'shucks',
  'you',
  'should',
  'have',
  'known',
  'investing',
  'is',
  'risky',
  'have',
  'you',
  'tried',
  'cutting',
  'out',
  'avocados',
  'and',
  'coffee',
  'maybe',
  'doing',
  'uber',
  'on',
  'the',
  'side',
  '”'

#### Tokenize

In [25]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)
tokenizer

{'the': 1,
 'to': 2,
 'and': 3,
 'a': 4,
 'of': 5,
 'is': 6,
 'in': 7,
 'i': 8,
 'this': 9,
 'that': 10,
 'it': 11,
 'for': 12,
 'on': 13,
 'you': 14,
 'are': 15,
 'https': 16,
 'they': 17,
 'with': 18,
 'be': 19,
 'have': 20,
 'as': 21,
 'at': 22,
 'not': 23,
 'but': 24,
 'will': 25,
 'we': 26,
 'com': 27,
 'if': 28,
 'all': 29,
 'their': 30,
 'from': 31,
 'so': 32,
 'or': 33,
 'stock': 34,
 'has': 35,
 'my': 36,
 'was': 37,
 'up': 38,
 'market': 39,
 'can': 40,
 'by': 41,
 'more': 42,
 'gme': 43,
 'what': 44,
 'just': 45,
 'www': 46,
 'shares': 47,
 'price': 48,
 'your': 49,
 'like': 50,
 'an': 51,
 'out': 52,
 'png': 53,
 'short': 54,
 'about': 55,
 '1': 56,
 'now': 57,
 'do': 58,
 'some': 59,
 'there': 60,
 '2': 61,
 'x200b': 62,
 'buy': 63,
 's': 64,
 'company': 65,
 'which': 66,
 'when': 67,
 'money': 68,
 'people': 69,
 'get': 70,
 'been': 71,
 'reddit': 72,
 'one': 73,
 'would': 74,
 '3': 75,
 'them': 76,
 'because': 77,
 'time': 78,
 'no': 79,
 'these': 80,
 'r': 81,
 'us': 82

#### Generate Sequences

In [26]:
## convert data to sequence of tokens 
def make_sequence(sentences):
    input_sequences = []
    for line in sentences:
        for i in range(1, len(line)):
            n_gram_sequence = line[:i+1]
            input_sequences.append(n_gram_sequence)
    return input_sequences

In [None]:
class sequenceGenerator(Sequence):
    def __init__(self, sequences, batch_size):
        self.index = 0
        self.sequences = sequences
        self.batch_size = batch_size

    def __len__(self):
        return len(self.files)

    def __getitem__(self, index):
        for i in range(index, index + self.batch_size):
            sequence = self.sequences[i]
            x = sequence[:-1]
            y = sequence[-1]
            x = tokenizer.texts_to_sequences(x)
            x = pad_sequences(x, maxlen=max_length-1, padding='pre')
            y = ku.to_categorical(y, num_classes=TOKENS)
            yield x, y

In [27]:
sequences = make_sequence(sentences)
sequences

Generator


In [24]:
tokenizer = Tokenizer()

tokenizer.fit_on_texts(train_text['body'].dropna())


{'the': 1,
 'to': 2,
 'and': 3,
 'a': 4,
 'of': 5,
 'is': 6,
 'in': 7,
 'i': 8,
 'this': 9,
 'that': 10,
 'it': 11,
 'for': 12,
 'on': 13,
 'you': 14,
 'are': 15,
 'https': 16,
 'they': 17,
 'with': 18,
 'be': 19,
 'have': 20,
 'as': 21,
 'at': 22,
 'not': 23,
 'but': 24,
 'will': 25,
 'we': 26,
 'com': 27,
 'if': 28,
 'all': 29,
 'their': 30,
 'from': 31,
 'so': 32,
 'or': 33,
 'stock': 34,
 'has': 35,
 'my': 36,
 'was': 37,
 'up': 38,
 'market': 39,
 'can': 40,
 'by': 41,
 'more': 42,
 'gme': 43,
 'what': 44,
 'just': 45,
 'www': 46,
 'shares': 47,
 'price': 48,
 'your': 49,
 'like': 50,
 'an': 51,
 'out': 52,
 'png': 53,
 'short': 54,
 'about': 55,
 '1': 56,
 'now': 57,
 'do': 58,
 'some': 59,
 'there': 60,
 '2': 61,
 'x200b': 62,
 'buy': 63,
 's': 64,
 'company': 65,
 'which': 66,
 'when': 67,
 'money': 68,
 'people': 69,
 'get': 70,
 'been': 71,
 'reddit': 72,
 'one': 73,
 'would': 74,
 '3': 75,
 'them': 76,
 'because': 77,
 'time': 78,
 'no': 79,
 'these': 80,
 'r': 81,
 'us': 82

In [15]:
class CSVGenerator(Sequence):
    def __init__(self, file_path):
        self.file = file_path
        self.index = 0

    def __len__(self):
        return len(self.files)

    def __getitem__(self, index):
        file_path = os.path.join(self.folder_path, self.files[index])
        data = pd.read_csv(file_path)
        X = data.iloc[:, :-1].values
        y = data.iloc[:, -1].values
        print("X: ", X, "y: ", y)
        return X, y

In [16]:
tmp = CSVGenerator(train_text_file)

NotADirectoryError: [Errno 20] Not a directory: '/Users/akeems/.keras/datasets/train_text.txt'

Text Vectorization with Keras

In [9]:
vectorize_layer = tf.keras.layers.TextVectorization(
    max_tokens=TOKENS,
    #output_mode='int',
    output_sequence_length=OUTPUT_LENGTH)


In [10]:
text_dataset = tf.data.Dataset.from_tensor_slices(train_text["body"].dropna())
max_features = 5000  # Maximum vocab size.
max_len = 4  # Sequence length to pad the outputs to.

vectorize_layer.adapt(text_dataset.batch(64))

vect_text = vectorize_layer(text_dataset)
vect_text


ValueError: Exception encountered when calling layer 'text_vectorization_1' (type TextVectorization).

Attempt to convert a value (<TensorSliceDataset element_spec=TensorSpec(shape=(), dtype=tf.string, name=None)>) with an unsupported type (<class 'tensorflow.python.data.ops.from_tensor_slices_op.TensorSliceDataset'>) to a Tensor.

Call arguments received by layer 'text_vectorization_1' (type TextVectorization):
  • inputs=<TensorSliceDataset element_spec=TensorSpec(shape=(), dtype=tf.string, name=None)>

### Clean Text

We can clean and prep our text here. The data cleanup we need is to:
<ul>
<li> Remove punctuation.
<li> Tokenize the text, as we did previously in NLP processing. 
<li> <b>Generate sequences of tokens.</b> This is the key to the LSTM model, we are structuring the data to be a sequence of tokens. Our model will attempt to predict the next token, which in this case is the next word in the sentence.
</ul>

In [4]:
def get_sequence_of_tokens(corpus, tokenizer):
    ## tokenization
    tokenizer.fit_on_texts(corpus)
    total_words = len(tokenizer.word_index) + 1
    
    ## convert data to sequence of tokens 
    input_sequences = []
    for line in corpus:
        token_list = tokenizer.texts_to_sequences([line])[0]
        for i in range(1, len(token_list)):
            n_gram_sequence = token_list[:i+1]
            input_sequences.append(n_gram_sequence)
    return input_sequences, total_words

strip_punct = train_text["body"].dropna().str.replace('[{}]'.format(string.punctuation), '')
inp_seq, total_words = get_sequence_of_tokens(strip_punct, Tokenizer())
inp_seq

  strip_punct = train_text["body"].dropna().str.replace('[{}]'.format(string.punctuation), '')


KeyboardInterrupt: 

#### Dataset Prep - Padding and Targets

We also need to take the sequences and pad them, or make them all the same length. We will also create the targets - the next word in the sequence.

In [None]:
def generate_padded_sequences(input_sequences):
    max_sequence_len = max([len(x) for x in input_sequences])
    input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))
    
    predictors, label = input_sequences[:,:-1],input_sequences[:,-1]
    label = ku.to_categorical(label, num_classes=total_words)
    return predictors, label, max_sequence_len

predictors, label, max_sequence_len = generate_padded_sequences(inp_seq)

: 

: 

### Model

Now we model. The data that we made mirrors the construction of a sentence.
<ul>
<li> X features - the sentence up to this point. 
<li> Y target - the word(s) that should come next. 
</ul>

So, the model is effectively working to generate text just like a time series model works to predict the next value in a sequence of stock prices or hourly temperature. We train the model on, hopefully a large number of senteneces, where is sees many examples of "here are some words" (X values) and "here is the next word" (Y value). If we give it lots and lots of that training data, it should become better and better at determining what should come next, given the existing sentence. 

To do this well, we'd need a lot more data than we have, and much more time to train. We'd want to give the model enough data so that it can see lots and lots of examples of the same word in different contexts, and of similar contexts with different words. The patterns of language are really complex, so we need data that provides enough variation to demonstrate the patterns. 

The model is wrapped in a little function, so we can make a model to output a different number of words with more convenience.

#### Embedding Layer

We also use an embedding layer here, which accepts our enocoded inputs. 

In [None]:
def create_model(max_sequence_len, output_words):
    input_len = max_sequence_len - 1
    model = Sequential()
    
    # Add Input Embedding Layer
    model.add(Embedding(total_words, 10, input_length=input_len))
    
    # Add Hidden Layers - LSTM Layer
    model.add(LSTM(100, return_sequences = True))
    model.add(LSTM(100))
    model.add(Dropout(0.1))
    
    # Add Output Layer
    model.add(Dense(total_words, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam')
    
    return model

model = create_model(max_sequence_len, OUTPUT_LENGTH)
model.summary()

In [None]:
# Train Model
model.fit(predictors, label, epochs=100, verbose=5)

### Predictions

We can create a little function to generate text. We can give it a seed text, and it will generate text based on that. We can also give it a number of words to generate, and it will generate that many words.

In [None]:
def generate_text(seed_text, next_words, model, max_sequence_len):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        predicted = model.predict_classes(token_list, verbose=0)
        
        output_word = ""
        for word,index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break
        seed_text += " "+output_word
    return seed_text.title()

In [None]:


print (generate_text("united states", 5, model, max_sequence_len))
print (generate_text("preident trump", 4, model, max_sequence_len))
print (generate_text("donald trump", 4, model, max_sequence_len))
print (generate_text("india and china", 4, model, max_sequence_len))
print (generate_text("new york", 4, model, max_sequence_len))
print (generate_text("science and technology", 5, model, max_sequence_len))

