# LSTM for Trump text generation

![alt text](https://miro.medium.com/v2/resize:fit:984/1*Mb_L_slY9rjMr8-IADHvwg.png)

Tensorflow will be used for the integration of the LSTM model.

## Dataset:

In [1]:
import os

path = 'Trump Rally Speeches/'
files = os.listdir(path)
files = [path + file for file in files]
 
dates = []
locations = []
years = []
days = []
months = []
speeches_text = []
 
month_ab = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep','Oct', 'Nov', 'Dec']

for file in files:
    for month in month_ab:
        if month in file:
            locations.append(file[file.find('/')+1:file.find(month)])
            break
    for i, mont in enumerate(month_ab):
        if month in file:
            date = file[file.find(month):file.find('.txt')]
            dates.append(date)
            months.append(date[:3])
            days.append(str(date[3]))
            years.append(date[-4:])
            break   
        
for file in files:
    with open(file, 'r') as f:
        speeches_text.append(f.read())     
        
import pandas as pd
 
df = pd.DataFrame({'Speech':files, 'Date':dates, 'Location':locations, 'Year':years, 'Month':months, 'Day':days, 'Speech_Text':speeches_text})

We will use the two preprocessing function defined in preprocessing_pipline.
One only remove the punctuation and do lower case, the other one also remove the stop and rare words.

In [2]:
from preprocessing import preprocessing_pipline

preprocessing = preprocessing_pipline(df['Speech_Text'])
df['Speech_Text_prepro'] = preprocessing.preprocess_light()
df['Speech_Text_prepro2'] = preprocessing.preprocess()


thank thank thank vice president pence hes good guy weve done great job together merry christmas mic
Thank you. Thank you. Thank you to Vice President Pence. He's a good guy. We've done a great job tog
thank thank thank vice president pence hes good guy weve done great job together merry christmas mic


### Data 
1. Without preprocessing -> terrible results... not intresting
2. Light preprocessing 
3. Heavy preprocessing (stop words and rare words)

In [3]:
from sklearn.model_selection import train_test_split

text_corpus = [word for speech in df['Speech_Text'].str.split() for word in speech]

# Preprocess text 
text_corpus_prepro = [word for speech in df['Speech_Text_prepro'].str.split() for word in speech]

text_corpus_prepro2 = [word for speech in df['Speech_Text_prepro2'].str.split() for word in speech]


In [4]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import numpy as np

def preprocess_for_LSTM(text_corpus, lenght_of_sequences=10, test_size=0.2):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(text_corpus)

    input = []
    output = []


    for i in range(lenght_of_sequences, len(text_corpus)):
        input.append(text_corpus[i - lenght_of_sequences:i])
        output.append(text_corpus[i])

    input = tokenizer.texts_to_sequences(input)
    output = tokenizer.texts_to_sequences(output)

    # pad 
    input = pad_sequences(input, maxlen=lenght_of_sequences, padding='pre')
    output = pad_sequences(output, maxlen=1, padding='pre')
    # Convert your lists of input sequences into numpy arrays
    
    input = np.array(input)
    output = np.array(output)

    # Split your data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(input, output, test_size=test_size, random_state=42)
    
    return X_train, X_test, y_train, y_test, tokenizer

 

2023-11-05 20:47:04.517219: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-11-05 20:47:04.550495: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-11-05 20:47:04.550534: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-11-05 20:47:04.550573: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-11-05 20:47:04.557812: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-11-05 20:47:04.558691: I tensorflow/core/platform/cpu_feature_guard.cc:182] This Tens

In [6]:
X_train, X_test, y_train, y_test, tokenizer = preprocess_for_LSTM(text_corpus_prepro, lenght_of_sequences=10, test_size=0.2)
X_train2, X_test2, y_train2, y_test2, tokenizer2 = preprocess_for_LSTM(text_corpus_prepro2, lenght_of_sequences=10, test_size=0.2)


## The model

In [25]:
import tensorflow as tf
from keras.layers import Embedding, LSTM, Dense

class my_model_LSTM:
    def __init__(self, n, num_unique_words, max_sequence_length):
        self.n = n
        self.num_unique_words = num_unique_words
        self.max_sequence_length = max_sequence_length
        self.model = self.build_model()
        
    def build_model(self):
        model = tf.keras.Sequential()
        model.add(Embedding(self.num_unique_words, self.n))
        model.add(LSTM(units=self.n))
        model.add(Dense(units=self.num_unique_words, activation='softmax'))
        model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
        return model
     
    def train(self, X_train, y_train, epochs=10):
        self.model.fit(X_train, y_train, epochs=epochs)
         
    def predict(self, test_corpus):
        return self.model.predict(test_corpus)
     
    def evaluate(self, X_test, y_test):
        return self.model.evaluate(X_test, y_test)
     
    def save(self, path):
        self.model.save(path)
         
    def load(self, path):
        self.model = tf.keras.models.load_model(path)
         
    def generate_text(self, seed_text, max_length=100):
        output_text = seed_text.split()
        prefix = output_text[-(self.n - 1):]

        for _ in range(max_length):
            input_eval = [tokenizer.word_index[word] for word in prefix]
            input_eval = tf.expand_dims(input_eval, 0)

            predictions = self.model(input_eval)
            predictions = tf.squeeze(predictions, 0)

            if len(predictions.shape) == 1:
                predictions = tf.expand_dims(predictions, 0)

            predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()
            next_word = tokenizer.index_word[predicted_id]

            output_text.append(next_word)
            prefix = prefix[1:] + [next_word]

        return ' '.join(output_text)
    



### Light prepro

In [27]:
num_unique_words = len(tokenizer.word_index) + 1
max_sequence_length = 40
model_LSTM = my_model_LSTM(50, num_unique_words, max_sequence_length)
history = model_LSTM.train(X_train, y_train, epochs=30)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30

In [None]:
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend(['Train', 'Validation'], loc='upper right')
plt.show()

In [None]:
model_LSTM.save('model_LSTM.h5')

  saving_api.save_model(


In [None]:
predictions = model_LSTM.evaluate(X_test, y_test)
print("LSTM Model Accuracy:", predictions[1])
print("LSTM Model Loss:", predictions[0])

LSTM Model Accuracy: 0.14131730794906616
LSTM Model Loss: 6.3365678787231445


In [None]:
model_LSTM = my_model_LSTM(256, num_unique_words, max_sequence_length)
model_LSTM.load('model_LSTM.h5')
seed_text = 'france forcing'
generated_text = model_LSTM.generate_text(seed_text, max_length=10)
print(generated_text)

france forcing major… bombing encouragement weirdest bushy witchhunt stiff solely resulting run


In [None]:
print("LSTM Model Perplexity:", np.exp(model_LSTM.evaluate(X_test, y_test)[0]))

LSTM Model Perplexity: 564.8543321620472


dfdf

In [None]:
num_unique_words2 = len(tokenizer2.word_index) + 1
max_sequence_length = 40
model_LSTM2 = my_model_LSTM(50, num_unique_words2, max_sequence_length)
model_LSTM2.train(X_train2, y_train2, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
model_LSTM2.save('model_LSTM2.h5')

In [None]:
predictions = model_LSTM2.evaluate(X_test, y_test)
print("LSTM Model Accuracy:", predictions[1])
print("LSTM Model Loss:", predictions[0])

LSTM Model Accuracy: 0.14261500537395477
LSTM Model Loss: 6.329217910766602


In [None]:
model_LSTM2 = my_model_LSTM(256, num_unique_words, max_sequence_length)
model_LSTM2.load('model_LSTM.h5')
seed_text = 'hello'
generated_text = model_LSTM2.generate_text(seed_text, max_length=30)
print(generated_text)

hello killing tweet franklin messed 200 talib problems 9th close tortured


In [None]:
print("LSTM Model Perplexity:", np.exp(model_LSTM2.evaluate(X_test, y_test)[0]))

LSTM Model Perplexity: 564.8543321620472


## Model