# LSTM for Trump text generation

![alt text](https://miro.medium.com/v2/resize:fit:984/1*Mb_L_slY9rjMr8-IADHvwg.png)

Tensorflow will be used for the integration of the LSTM model.

## Dataset:

In [1]:
import os

path = 'Trump Rally Speeches/'
files = os.listdir(path)
files = [path + file for file in files]
 
dates = []
locations = []
years = []
days = []
months = []
speeches_text = []
 
month_ab = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep','Oct', 'Nov', 'Dec']

for file in files:
    for month in month_ab:
        if month in file:
            locations.append(file[file.find('/')+1:file.find(month)])
            break
    for i, mont in enumerate(month_ab):
        if month in file:
            date = file[file.find(month):file.find('.txt')]
            dates.append(date)
            months.append(date[:3])
            days.append(str(date[3]))
            years.append(date[-4:])
            break   
        
for file in files:
    with open(file, 'r') as f:
        speeches_text.append(f.read())     
        
import pandas as pd
 
df = pd.DataFrame({'Speech':files, 'Date':dates, 'Location':locations, 'Year':years, 'Month':months, 'Day':days, 'Speech_Text':speeches_text})

In [2]:
from preprocessing import preprocessing_pipline

preprocessing = preprocessing_pipline(df['Speech_Text'])
df['Speech_Text_prepro'] = preprocessing.preprocess_light()

thank thank thank vice president pence hes good guy weve done great job together merry christmas mic
Thank you. Thank you. Thank you to Vice President Pence. He's a good guy. We've done a great job tog
thank thank thank vice president pence hes good guy weve done great job together merry christmas mic


In [13]:
from sklearn.model_selection import train_test_split

text_corpus = [word for speech in df['Speech_Text'].str.split() for word in speech]
train_corpus, test_corpus = train_test_split(text_corpus, test_size=0.2, random_state=42)
print("text corpus type",type(text_corpus))
print("text corpus type",type(text_corpus[0]))



# Preprocess text 
text_corpus_prepro = [word for speech in df['Speech_Text_prepro'].str.split() for word in speech]
train_corpus_prepro, test_corpus_prepro = train_test_split(text_corpus_prepro, test_size=0.2, random_state=42)

text corpus type <class 'list'>
text corpus type <class 'str'>


In [20]:
from keras.preprocessing.text import Tokenizer

# Create a tokenizer and fit it on your text corpus
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_corpus)

input = []
output = []

lenght_of_sequences = 10

for i in range(lenght_of_sequences, len(train_corpus)):
    input.append(train_corpus[i-lenght_of_sequences:i])
    output.append(train_corpus[i])

input = tokenizer.texts_to_sequences(input)
output = tokenizer.texts_to_sequences(output)

# pad 
from keras.preprocessing.sequence import pad_sequences
input = pad_sequences(input, maxlen=lenght_of_sequences, padding='pre')
output = pad_sequences(output, maxlen=1, padding='pre')
# Convert your lists of input sequences into numpy arrays
import numpy as np
 
input = np.array(input)
output = np.array(output)

from sklearn.model_selection import train_test_split

# Split your data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(input, output, test_size=0.2, random_state=42)
 

In [21]:
print("output:", type(output))
print("X_train Shape:", output.shape)
print("X_train Example Sequence:", output[1])

print("y_train Data Type:", type(y_train))
print("y_train Shape:", y_train.shape)
print("y_train Example Value:", y_train[0])


output: <class 'numpy.ndarray'>
X_train Shape: (296371, 1)
X_train Example Sequence: [46]
y_train Data Type: <class 'numpy.ndarray'>
y_train Shape: (237096, 1)
y_train Example Value: [3]


## The model

In [22]:
import tensorflow as tf
from keras.layers import Embedding, LSTM, Dense

class my_model_LSTM:
    def __init__(self, n, num_unique_words, max_sequence_length):
        self.n = n
        self.num_unique_words = num_unique_words
        self.max_sequence_length = max_sequence_length
        self.model = self.build_model()
        
    def build_model(self):
        model = tf.keras.Sequential()
        model.add(Embedding(self.num_unique_words, self.n))
        model.add(LSTM(units=self.n))
        model.add(Dense(units=self.num_unique_words, activation='softmax'))
        model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
        return model
     
    def train(self, X_train, y_train, epochs=10):
        self.model.fit(X_train, y_train, epochs=epochs)
         
    def predict(self, test_corpus):
        return self.model.predict(test_corpus)
     
    def evaluate(self, X_test, y_test):
        return self.model.evaluate(X_test, y_test)
     
    def save(self, path):
        self.model.save(path)
         
    def load(self, path):
        self.model = tf.keras.models.load_model(path)
         
    def generate_text(self, start_string, num_generate=1000):
        # Vectorize your start string using the tokenizer you built earlier
        input_eval = [tokenizer.word_index[word] for word in start_string.split()]
        input_eval = tf.expand_dims(input_eval, 0)
         
        # Empty string to store your results
        text_generated = []
         
        # Here batch size == 1
        self.model.reset_states()
        for i in range(num_generate):
            predictions = self.model(input_eval)
            predictions = tf.squeeze(predictions, 0)
             
            predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()
             
            input_eval = tf.expand_dims([predicted_id], 0)
             
            text_generated.append(tokenizer.index_word[predicted_id])
             
        return (start_string + ' '.join(text_generated))


In [24]:
num_unique_words = len(tokenizer.word_index) + 1
max_sequence_length = 15
model_LSTM = my_model_LSTM(256, num_unique_words, max_sequence_length)
model_LSTM.train(X_train, y_train, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [25]:
# test 
model_LSTM.evaluate(X_test, y_test)
print(model_LSTM.evaluate(X_test, y_test))
# print metrics
print("LSTM Model Accuracy:", model_LSTM.evaluate(X_test, y_test)[1])
print("LSTM Model Loss:", model_LSTM.evaluate(X_test, y_test)[0])
# perplextiy
print("LSTM Model Perplexity:", np.exp(model_LSTM.evaluate(X_test, y_test)[0]))

LSTM Model Accuracy: 0.038363561034202576
LSTM Model Loss: 6.4545793533325195
LSTM Model Perplexity: 635.606304337891


In [26]:
print(model_LSTM.evaluate(X_test, y_test))

[6.4545793533325195, 0.038363561034202576]


In [27]:
model_LSTM.save('model_LSTM.h5')
model_LSTM.load('model_LSTM.h5')
model_LSTM.generate_text('I', num_generate=1000)

  saving_api.save_model(


AttributeError: 'my_model_LSTM' object has no attribute 'text_corpus'

## Model