# LSTM for Trump text generation

![alt text](https://miro.medium.com/v2/resize:fit:984/1*Mb_L_slY9rjMr8-IADHvwg.png)

TensorFlow (through its Keras API) is used to implement the LSTM model.

## Dataset:

In [1]:
import os
import re

import pandas as pd

path = 'Trump Rally Speeches/'

month_ab = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep','Oct', 'Nov', 'Dec']

# A speech file name is expected to contain "<Mon><day>" (month abbreviation
# immediately followed by the day number); everything before it is the
# location and everything from it up to ".txt" is the date string.
# Requiring a digit after the month keeps a location that merely contains a
# month abbreviation from being mistaken for the date.
_date_re = re.compile(r'(?P<month>' + '|'.join(month_ab) + r')(?P<day>\d{1,2})')

files = []
dates = []
locations = []
years = []
days = []
months = []
speeches_text = []

# os.listdir() returns entries in arbitrary order; sorting makes the row order
# of the DataFrame (and everything derived from it) reproducible.
for name in sorted(os.listdir(path)):
    match = _date_re.search(name)
    # Skip anything that is not a dated .txt file so every column below gets
    # exactly one entry per kept file and the DataFrame columns stay aligned.
    if match is None or not name.endswith('.txt'):
        continue

    date = name[match.start():name.find('.txt')]

    files.append(path + name)
    locations.append(name[:match.start()])
    dates.append(date)
    months.append(match.group('month'))
    # Full day number (one or two digits), not just its first character.
    days.append(match.group('day'))
    years.append(date[-4:])

for file in files:
    # Explicit encoding so the result does not depend on the platform default.
    with open(file, 'r', encoding='utf-8') as f:
        speeches_text.append(f.read())

df = pd.DataFrame({'Speech':files, 'Date':dates, 'Location':locations, 'Year':years, 'Month':months, 'Day':days, 'Speech_Text':speeches_text})

In [2]:
from preprocessing import preprocessing_pipline

# Build the project's preprocessing pipeline over the raw speech column and
# store the output of its `preprocess_light()` step as a new column.
preprocessing = preprocessing_pipline(df['Speech_Text'])
speeches_prepro = preprocessing.preprocess_light()
df['Speech_Text_prepro'] = speeches_prepro

thank thank thank vice president pence hes good guy weve done great job together merry christmas mic
Thank you. Thank you. Thank you to Vice President Pence. He's a good guy. We've done a great job tog
thank thank thank vice president pence hes good guy weve done great job together merry christmas mic


In [3]:
from sklearn.model_selection import train_test_split

# Flatten all speeches into one word stream. The split must NOT shuffle:
# train_test_split shuffles by default, which would randomise the word order
# and leave the later "previous words -> next word" windows without any real
# sentence structure. With shuffle=False the first 80% of the stream is the
# training part and the last 20% the test part.
text_corpus = [word for speech in df['Speech_Text'].str.split() for word in speech]
train_corpus, test_corpus = train_test_split(text_corpus, test_size=0.2, shuffle=False)

# Same split for the preprocessed text
text_corpus_prepro = [word for speech in df['Speech_Text_prepro'].str.split() for word in speech]
train_corpus_prepro, test_corpus_prepro = train_test_split(text_corpus_prepro, test_size=0.2, shuffle=False)

In [5]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np
from sklearn.model_selection import train_test_split

# Create a tokenizer and fit it on the whole text corpus
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_corpus)

lenght_of_sequences = 10

# Encode the training words to integer ids first, then cut windows out of the
# id stream. Passing each word as a string makes the tokenizer apply the same
# lower-casing and punctuation filtering it used while fitting, so words are
# not silently dropped, and both inputs and targets end up as integer ids
# (the targets must be class ids for sparse_categorical_crossentropy).
train_ids = [
    token_id
    for word_ids in tokenizer.texts_to_sequences(train_corpus)
    for token_id in word_ids
]

# NOTE: `input` shadows the builtin of the same name; the name is kept because
# later cells refer to it.
input = [
    train_ids[i - lenght_of_sequences:i]
    for i in range(lenght_of_sequences, len(train_ids))
]
# Target of each window is the id that directly follows it.
output = train_ids[lenght_of_sequences:]

# Every window already has `lenght_of_sequences` ids; pad_sequences is kept to
# guarantee a rectangular integer array.
input = pad_sequences(input, maxlen=lenght_of_sequences, padding='pre')

# Convert the input sequences and targets into numpy arrays
input = np.array(input)
output = np.array(output)

# Split the (window, next id) samples into train and test sets
X_train, X_test, y_train, y_test = train_test_split(input, output, test_size=0.2, random_state=42)
 

In [6]:
import tensorflow as tf
from keras.layers import Embedding, LSTM, Dense

class my_model_LSTM:
    """Next-word LSTM language model built on a Keras ``Sequential``.

    The network is Embedding -> LSTM -> Dense(softmax) over the vocabulary and
    is trained on integer-encoded word windows with integer next-word targets.

    Args:
        n: Size of the embedding vectors and number of LSTM units.
        num_unique_words: Vocabulary size (number of output classes).
        max_sequence_length: Number of word ids in one input window.
        tokenizer: Optional fitted tokenizer exposing ``texts_to_sequences``
            and ``index_word``. Only needed by ``generate_text``; it can also
            be assigned later through the ``tokenizer`` attribute.
    """

    def __init__(self, n, num_unique_words, max_sequence_length, tokenizer=None):
        self.n = n
        self.num_unique_words = num_unique_words
        self.max_sequence_length = max_sequence_length
        self.tokenizer = tokenizer
        self.model = self.build_model()

    def build_model(self):
        """Create and compile the Embedding -> LSTM -> softmax network."""
        model = tf.keras.Sequential()
        model.add(Embedding(self.num_unique_words, self.n))
        model.add(LSTM(units=self.n))
        model.add(Dense(units=self.num_unique_words, activation='softmax'))
        model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
        return model

    def train(self, X_train, y_train, epochs=10):
        """Fit the model on input windows ``X_train`` and targets ``y_train``."""
        self.model.fit(X_train, y_train, epochs=epochs)

    def predict(self, test_corpus):
        """Return the model's predictions for ``test_corpus``."""
        return self.model.predict(test_corpus)

    def evaluate(self, X_test, y_test):
        """Return the result of ``model.evaluate`` on the given data."""
        return self.model.evaluate(X_test, y_test)

    def save(self, path):
        """Save the underlying Keras model to ``path``."""
        self.model.save(path)

    def load(self, path):
        """Replace the underlying Keras model with the one stored at ``path``."""
        self.model = tf.keras.models.load_model(path)

    def generate_text(self, start_string, num_generate=1000, temperature=1.0):
        """Sample ``num_generate`` words that continue ``start_string``.

        Args:
            start_string: Prompt text; it is encoded with ``self.tokenizer``.
            num_generate: Number of words to sample.
            temperature: Values below 1 make sampling more conservative,
                values above 1 more random. Must be positive.

        Returns:
            The prompt followed by the generated words, separated by spaces.

        Raises:
            ValueError: If no tokenizer is attached or ``temperature <= 0``.
        """
        import numpy as np

        if self.tokenizer is None:
            raise ValueError(
                "generate_text needs a fitted tokenizer: pass tokenizer=... "
                "to the constructor or set the `tokenizer` attribute."
            )
        if temperature <= 0:
            raise ValueError("temperature must be positive")

        token_ids = list(self.tokenizer.texts_to_sequences([start_string])[0])
        text_generated = []
        for _ in range(num_generate):
            # Feed the last `max_sequence_length` ids, left-padded with 0,
            # which is the same window layout the model was trained on.
            window = ([0] * self.max_sequence_length + token_ids)[-self.max_sequence_length:]
            probs = np.asarray(self.model(np.array([window])), dtype='float64')[0]
            # The model outputs probabilities, so move to log space before
            # applying the temperature. Index 0 (padding) is excluded from
            # sampling because it does not correspond to a word.
            logits = np.log(probs[1:] + 1e-12) / temperature
            weights = np.exp(logits - logits.max())
            weights /= weights.sum()
            predicted_id = int(np.random.choice(len(weights), p=weights)) + 1
            token_ids.append(predicted_id)
            text_generated.append(self.tokenizer.index_word.get(predicted_id, ''))
        return ' '.join([start_string] + text_generated)


In [None]:
# Window length is read off the first encoded input sequence.
max_sequence_length = len(input[0])
# One extra slot on top of the tokenizer's words, because Keras Tokenizer
# word ids start at 1 and id 0 is reserved.
num_unique_words = 1 + len(tokenizer.word_index)

model_LSTM = my_model_LSTM(
    n=256,
    num_unique_words=num_unique_words,
    max_sequence_length=max_sequence_length,
)
model_LSTM.train(X_train, y_train, epochs=10)

## Model

In [9]:
# import for LSTM 
import tensorflow as tf
import numpy as np
from keras.layers import Embedding, LSTM, Dense

class my_model_LSTM:
    """Next-word LSTM language model that works directly on lists of words.

    The vocabulary is the set of distinct words in ``text_corpus``; id 0 is
    reserved for padding and for words outside the vocabulary. Training data
    is built by sliding a window of ``sequence_length`` words over a word list
    and using the word that follows each window as its target.

    Args:
        n: Size of the embedding vectors and number of LSTM units.
        text_corpus: List of words used to build the vocabulary.
        sequence_length: Number of words in one input window.
    """

    def __init__(self, n, text_corpus, sequence_length=10):
        self.n = n
        self.text_corpus = text_corpus
        self.sequence_length = sequence_length
        # Sorted so the word -> id mapping is the same on every run.
        self.index_word = [''] + sorted(set(text_corpus))
        self.word_index = {word: i for i, word in enumerate(self.index_word) if i}
        self.model = self.build_model()

    def build_model(self):
        """Create and compile the Embedding -> LSTM -> softmax network."""
        # Size the layers by the number of distinct words (plus the reserved
        # id 0), not by the total number of words in the corpus.
        vocab_size = len(self.index_word)
        model = tf.keras.Sequential()
        model.add(Embedding(vocab_size, self.n))
        model.add(LSTM(units=self.n))
        model.add(Dense(units=vocab_size, activation='softmax'))
        model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
        return model

    def _encode(self, words):
        """Map words to integer ids; unknown words become 0."""
        return [self.word_index.get(word, 0) for word in words]

    def _make_windows(self, corpus):
        """Return (windows, next-word ids) arrays for a list of words."""
        if len(corpus) <= self.sequence_length:
            raise ValueError(
                "corpus must contain more than sequence_length "
                f"({self.sequence_length}) words"
            )
        ids = self._encode(corpus)
        length = self.sequence_length
        windows = [ids[i - length:i] for i in range(length, len(ids))]
        return np.array(windows), np.array(ids[length:])

    def train(self, train_corpus, epochs=10):
        """Fit the model on next-word samples cut from ``train_corpus``."""
        X, y = self._make_windows(train_corpus)
        self.model.fit(X, y, epochs=epochs)

    def predict(self, test_corpus):
        """Return next-word probabilities for every window of ``test_corpus``."""
        X, _ = self._make_windows(test_corpus)
        return self.model.predict(X)

    def evaluate(self, test_corpus, test_labels):
        """Evaluate next-word prediction on ``test_corpus``.

        ``test_labels`` must be a word list of the same length as
        ``test_corpus``; ``test_labels[i]`` is the target for the window of
        words that ends just before position ``i``. Passing the corpus itself
        as labels evaluates plain next-word prediction.

        Raises:
            ValueError: If the two lists differ in length or are too short.
        """
        if len(test_labels) != len(test_corpus):
            raise ValueError("test_corpus and test_labels must have the same length")
        X, _ = self._make_windows(test_corpus)
        y = np.array(self._encode(test_labels[self.sequence_length:]))
        return self.model.evaluate(X, y)

    def save(self, path):
        """Save the underlying Keras model to ``path``."""
        self.model.save(path)

    def load(self, path):
        """Replace the underlying Keras model with the one stored at ``path``."""
        self.model = tf.keras.models.load_model(path)

    def generate_text(self, start_string, num_generate=1000, temperature=1.0):
        """Sample ``num_generate`` words that continue ``start_string``.

        Args:
            start_string: Prompt text, split on whitespace into words.
            num_generate: Number of words to sample.
            temperature: Values below 1 make sampling more conservative,
                values above 1 more random. Must be positive.

        Returns:
            The prompt followed by the generated words, separated by spaces.

        Raises:
            ValueError: If ``temperature <= 0``.
        """
        if temperature <= 0:
            raise ValueError("temperature must be positive")

        token_ids = self._encode(start_string.split())
        text_generated = []
        for _ in range(num_generate):
            # Feed the last `sequence_length` ids, left-padded with 0.
            window = ([0] * self.sequence_length + token_ids)[-self.sequence_length:]
            probs = np.asarray(self.model(np.array([window])), dtype='float64')[0]
            # The model outputs probabilities, so move to log space before
            # applying the temperature. Id 0 is excluded from sampling
            # because it does not correspond to a word.
            logits = np.log(probs[1:] + 1e-12) / temperature
            weights = np.exp(logits - logits.max())
            weights /= weights.sum()
            predicted_id = int(np.random.choice(len(weights), p=weights)) + 1
            token_ids.append(predicted_id)
            text_generated.append(self.index_word[predicted_id])
        return ' '.join([start_string] + text_generated)
    
# Build the model over the full word corpus and train it on the training part.
model_LSTM = my_model_LSTM(n=256, text_corpus=text_corpus)
model_LSTM.train(train_corpus, epochs=10)

# The test words are passed both as inputs and as labels.
model_LSTM.evaluate(test_corpus, test_labels=test_corpus)

# Round-trip the trained network through disk before generating text.
model_LSTM.save('model_LSTM.h5')
model_LSTM.load('model_LSTM.h5')

model_LSTM.generate_text('I', num_generate=1000)
  

: 

: 