# LSTM for Trump text generation

![alt text](https://miro.medium.com/v2/resize:fit:984/1*Mb_L_slY9rjMr8-IADHvwg.png)

Tensorflow will be used for the integration of the LSTM model.

## Dataset:

In [3]:
import os

path = 'Trump Rally Speeches/'
files = os.listdir(path)
files = [path + file for file in files]
 
dates = []
locations = []
years = []
days = []
months = []
speeches_text = []
 
month_ab = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep','Oct', 'Nov', 'Dec']

for file in files:
    for month in month_ab:
        if month in file:
            locations.append(file[file.find('/')+1:file.find(month)])
            break
    for i, mont in enumerate(month_ab):
        if month in file:
            date = file[file.find(month):file.find('.txt')]
            dates.append(date)
            months.append(date[:3])
            days.append(str(date[3]))
            years.append(date[-4:])
            break   
        
for file in files:
    with open(file, 'r') as f:
        speeches_text.append(f.read())     
        
import pandas as pd
 
df = pd.DataFrame({'Speech':files, 'Date':dates, 'Location':locations, 'Year':years, 'Month':months, 'Day':days, 'Speech_Text':speeches_text})

In [4]:
from preprocessing import preprocessing_pipline

preprocessing = preprocessing_pipline(df['Speech_Text'])
df['Speech_Text_prepro'] = preprocessing.preprocess_light()

thank thank thank vice president pence hes good guy weve done great job together merry christmas mic
Thank you. Thank you. Thank you to Vice President Pence. He's a good guy. We've done a great job tog
thank thank thank vice president pence hes good guy weve done great job together merry christmas mic


In [5]:
from sklearn.model_selection import train_test_split

text_corpus = [word for speech in df['Speech_Text'].str.split() for word in speech]
train_corpus, test_corpus = train_test_split(text_corpus, test_size=0.2, random_state=42)

# Preprocess text 
text_corpus_prepro = [word for speech in df['Speech_Text_prepro'].str.split() for word in speech]
train_corpus_prepro, test_corpus_prepro = train_test_split(text_corpus_prepro, test_size=0.2, random_state=42)

In [None]:
from keras.preprocessing.text import Tokenizer

# Create a tokenizer and fit it on your text corpus
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_corpus)

input = []
output = []

## Model

In [9]:
# import for LSTM 
import tensorflow as tf
import numpy as np
from keras.layers import Embedding, LSTM, Dense

class my_model_LSTM:
    def __init__(self, n, text_corpus):
        self.n = n 
        self.text_corpus = text_corpus
        self.model = self.build_model()
        
    def build_model(self):
        model = tf.keras.Sequential()
        model.add(Embedding(len(self.text_corpus),self.n))
        model.add(LSTM(units=self.n))
        model.add(Dense(units=len(self.text_corpus), activation='softmax'))
        model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
        return model
     
    def train(self, train_corpus, epochs=10):
        self.model.fit(train_corpus, epochs=epochs)
         
    def predict(self, test_corpus):
        return self.model.predict(test_corpus)
     
    def evaluate(self, test_corpus, test_labels):
        return self.model.evaluate(test_corpus, test_labels)
     
    def save(self, path):
        self.model.save(path)
         
    def load(self, path):
        self.model = tf.keras.models.load_model(path)
         
    def generate_text(self, start_string, num_generate=1000):
        input_eval = [self.text_corpus.index(s) for s in start_string.split()]
        input_eval = tf.expand_dims(input_eval, 0)
        text_generated = []
        temperature = 1.0
        self.model.reset_states()
        for i in range(num_generate):
            predictions = self.model(input_eval)
            predictions = tf.squeeze(predictions, 0)
            predictions = predictions / temperature
            predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()
            input_eval = tf.expand_dims([predicted_id], 0)
            text_generated.append(self.text_corpus[predicted_id])
        return (start_string + ' '.join(text_generated))
    
model_LSTM = my_model_LSTM(256, text_corpus)
model_LSTM.train(train_corpus, epochs=10)
model_LSTM.evaluate(test_corpus, test_corpus)
model_LSTM.save('model_LSTM.h5')
model_LSTM.load('model_LSTM.h5')
model_LSTM.generate_text('I', num_generate=1000)
  

: 

: 

In [7]:
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense

class my_model_LSTM:
    def __init__(self, n, text_corpus):
        self.n = n
        self.text_corpus = text_corpus
        self.tokenizer = Tokenizer()
        self.model = self.build_model()
        
    def build_model(self):
        model = tf.keras.Sequential()
        model.add(Embedding(input_dim=len(self.text_corpus), output_dim=self.n))
        model.add(LSTM(units=self.n))
        model.add(Dense(units=len(self.text_corpus), activation='softmax'))
        model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
        return model
     
    def preprocess_and_tokenize(self, train_corpus):
        self.tokenizer.fit_on_texts(train_corpus)
        sequences = self.tokenizer.texts_to_sequences(train_corpus)
        padded_sequences = pad_sequences(sequences, padding='post')  # You can specify the maximum sequence length here.
        return padded_sequences
     
    def train(self, train_corpus, sequence_length, epochs=10):
        padded_sequences = self.preprocess_and_tokenize(train_corpus)
        
        # Create input and target sequences
        input_sequences = padded_sequences[:, :-1]
        target_sequences = padded_sequences[:, 1:]
        
        self.model.fit(input_sequences, target_sequences, epochs=epochs)
         
    def predict(self, test_corpus):
        padded_sequences = self.preprocess_and_tokenize(test_corpus)
        return self.model.predict(padded_sequences)
     
    def evaluate(self, test_corpus, test_labels):
        padded_sequences = self.preprocess_and_tokenize(test_corpus)
        return self.model.evaluate(padded_sequences, test_labels)
     
    def save(self, path):
        self.model.save(path)
         
    def load(self, path):
        self.model = tf.keras.models.load_model(path)
         
    def generate_text(self, start_string, num_generate=1000):
        input_eval = self.tokenizer.texts_to_sequences([start_string])[0]
        text_generated = []
        temperature = 1.0
        self.model.reset_states()
        for i in range(num_generate):
            predictions = self.model(input_eval)
            predictions = tf.squeeze(predictions, 0)
            predictions = predictions / temperature
            predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()
            input_eval = tf.expand_dims([predicted_id], 0)
            text_generated.append(self.tokenizer.index_word[predicted_id])
        return (start_string + ' '.join(text_generated))
    
    
model_LSTM = my_model_LSTM(256, text_corpus)
model_LSTM.train(train_corpus, epochs=10)
model_LSTM.evaluate(test_corpus, test_corpus)
##model_LSTM.save('model_LSTM.h5')
model_LSTM.load('model_LSTM.h5')
model_LSTM.generate_text('I', num_generate=1000)


TypeError: my_model_LSTM.train() missing 1 required positional argument: 'sequence_length'