## Import Libraries

In [1]:
import numpy as np
import pandas as pd
import re
import sys
import random
####
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
from keras.models import Model
from keras.layers import Input, Activation, Embedding, LSTM
from keras.optimizers import RMSprop
from keras.optimizers import Adam
from keras.callbacks import LambdaCallback
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical

## Read data

In [3]:
# Load the raw tweet export (path is relative to the notebook's working directory).
df = pd.read_csv("Donald-Tweets.csv")

In [None]:
# Summarise the loaded frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7375 entries, 0 to 7374
Data columns (total 12 columns):
 #   Column                                     Non-Null Count  Dtype  
---  ------                                     --------------  -----  
 0   Date                                       7375 non-null   object 
 1   Time                                       7375 non-null   object 
 2   Tweet_Text                                 7375 non-null   object 
 3   Type                                       7375 non-null   object 
 4   Media_Type                                 1225 non-null   object 
 5   Hashtags                                   2031 non-null   object 
 6   Tweet_Id                                   7375 non-null   float64
 7   Tweet_Url                                  7375 non-null   object 
 8   twt_favourites_IS_THIS_LIKE_QUESTION_MARK  7375 non-null   int64  
 9   Retweets                                   7375 non-null   int64  
 10  Unnamed: 10             

In [4]:
# Preview the first two rows of the raw data.
df.head(2)

Unnamed: 0,Date,Time,Tweet_Text,Type,Media_Type,Hashtags,Tweet_Id,Tweet_Url,twt_favourites_IS_THIS_LIKE_QUESTION_MARK,Retweets,Unnamed: 10,Unnamed: 11
0,16-11-11,15:26:37,Today we express our deepest gratitude to all ...,text,photo,ThankAVet,7.97e+17,https://twitter.com/realDonaldTrump/status/797...,127213,41112,,
1,16-11-11,13:33:35,Busy day planned in New York. Will soon be mak...,text,,,7.97e+17,https://twitter.com/realDonaldTrump/status/797...,141527,28654,,


## Data Preprocessing

### Convert to Lower Case

In [5]:
# Lower-case every tweet so that the vocabulary is case-insensitive.
print("...Before...")
print(df.loc[1, 'Tweet_Text'])

# `text` is the working Series that the following preprocessing cells refine.
text = df['Tweet_Text'].str.lower()

print("...After...")
print(text.loc[1])

...Before...
Busy day planned in New York. Will soon be making some very important decisions on the people who will be running our government!
...After...
busy day planned in new york. will soon be making some very important decisions on the people who will be running our government!


### Remove the URLs

In [6]:
def _drop_url_tokens(tweet):
    """Return `tweet` without any whitespace-separated token containing 'http'."""
    kept = [token for token in tweet.split() if 'http' not in token]
    return ' '.join(kept)


print("...Before...")
print(text[100])

# Re-joining on a single space also collapses runs of whitespace.
text = text.map(_drop_url_tokens)

print("...After...")
print(text[100])

...Before...
hillary advisers wanted her to avoid supporting israel when talking to democrats: https://t.co/y7m8ivu173
...After...
hillary advisers wanted her to avoid supporting israel when talking to democrats:


### Remove short tweets

In [7]:
print("...Before...")
print(len(text))

# Keep only tweets longer than 40 characters.
# The original index labels are retained (the index is not reset).
is_long_enough = text.str.len() > 40
text = text[is_long_enough]

print("...After...")
print(len(text))

...Before...
7375
...After...
6886


### Remove Emojis (and All Other Non-ASCII Characters)

In [8]:
print("...Before...")
print(text[49])

# Round-trip through ASCII, silently dropping every character that cannot be
# encoded. This removes emojis and any other non-ASCII symbol.
text = text.str.encode('ascii', errors='ignore').str.decode('ascii')

print("...After...")
print(text[49])

...Before...
join me tomorrow! minnesota ۢ 2pm michigan ۢ 6pm virginia ۢ 9:30p_
...After...
join me tomorrow! minnesota  2pm michigan  6pm virginia  9:30p_


### Tokenizer

In [9]:
# Learn the word -> integer id mapping from the cleaned tweets.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text)

# One extra slot on top of the number of distinct words: Keras assigns word ids
# starting at 1, and id 0 is the value used for padding.
n_distinct_words = len(tokenizer.word_counts)
vocab_size = n_distinct_words + 1
print("Total Vocabulary: ", vocab_size)

Total Vocabulary:  10760


### Prepare Input Data

#### Divide the data into training and test set

In [10]:
# Split by tweet (row): the first `train` rows feed the training set and the
# remaining `test` rows feed the test set.
prop_train = 0.8
N = len(text)
train = int(N * prop_train)
test = N - train

print("Total Row Count: ", N)
print("Training Data Count: ", train)
print("Test Data Count: ", test)

Total Row Count:  6886
Training Data Count:  5508
Test Data Count:  1378


#### Convert text into integers

In [12]:
# Expand every tweet into its growing prefixes (first 2 tokens, first 3, ...).
# Each prefix is one sample: its last token is the target, the rest the input.
# The position of each prefix inside `sequences` is recorded in `index_train`
# or `index_test` according to the row the prefix came from.
sequences, index_train, index_test = [], [], []

for irow, encoded in enumerate(tokenizer.texts_to_sequences(text)):
    # All prefixes of one tweet land in the same split.
    bucket = index_train if irow < train else index_test
    for end in range(2, len(encoded) + 1):
        bucket.append(len(sequences))
        sequences.append(encoded[:end])

count = len(sequences)
print('Total Sequences: %d' % (len(sequences)))

Total Sequences: 114825


#### Insert padding to make each sequence the same length

In [13]:
from keras.preprocessing.sequence import pad_sequences

# Left-pad every prefix to the length of the longest one so that the samples
# can be stacked into a single rectangular array.
max_length = max(map(len, sequences))
sequences = pad_sequences(sequences, maxlen=max_length, padding='pre')

print('Max Sequence Length: %d' % max_length)

Max Sequence Length: 32


In [14]:
sequences = np.array(sequences)

# The last column is the word to predict; everything before it is the input.
X = sequences[:, :-1]
y = sequences[:, -1]

# NOTE: the one-hot targets form a dense (n_sequences, vocab_size) array,
# which is large for this vocabulary size.
y = to_categorical(y, num_classes=vocab_size)

# Select the rows assigned to each split when the prefixes were generated.
X_train, y_train = X[index_train], y[index_train]
X_test, y_test = X[index_test], y[index_test]

print(X_train.shape)
print(y_train.shape)

(92288, 31)
(92288, 10760)


## Build a Model

In [19]:
def build_model(vocab_size,
                input_length=1,
                dim_dense_embedding=10,
                hidden_unit_LSTM=5):
    """Assemble an (uncompiled) next-word prediction network.

    Architecture: integer token ids -> Embedding -> LSTM -> softmax over the
    vocabulary.

    Parameters
    ----------
    vocab_size : int
        Size of the embedding table and of the softmax output layer.
    input_length : int, default 1
        Number of token ids in each input sequence.
    dim_dense_embedding : int, default 10
        Width of the embedding vectors.
    hidden_unit_LSTM : int, default 5
        Number of units in the LSTM layer.

    Returns
    -------
    keras.models.Model
        The model mapping `main_input` to the softmax output; it still has to
        be compiled by the caller.
    """
    token_ids = Input(shape=(input_length,), dtype='int32', name='main_input')

    embedded = Embedding(
        vocab_size,
        dim_dense_embedding,
        input_length=input_length,
    )(token_ids)

    # Only the final LSTM state is kept; it summarises the whole input prefix.
    encoded = LSTM(hidden_unit_LSTM)(embedded)
    next_word_probs = Dense(vocab_size, activation='softmax')(encoded)

    return Model(inputs=[token_ids], outputs=[next_word_probs])

In [20]:
# Input width follows the prepared data: X has one column per context token.
model = build_model(
    vocab_size,
    input_length=X.shape[1],
    dim_dense_embedding=30,
    hidden_unit_LSTM=64,
)

In [21]:
# compile network
# Categorical cross-entropy matches the one-hot encoded targets in y.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [34]:
# fit network
# The held-out sequences are passed as validation data, so the val_* metrics
# are reported for every epoch.
tf_model = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=50,
    batch_size=128,
    verbose=2,
)

Epoch 1/50
721/721 - 52s - loss: 4.4283 - accuracy: 0.2218 - val_loss: 7.0761 - val_accuracy: 0.1285
Epoch 2/50
721/721 - 52s - loss: 4.3581 - accuracy: 0.2270 - val_loss: 7.1394 - val_accuracy: 0.1287
Epoch 3/50
721/721 - 53s - loss: 4.2918 - accuracy: 0.2338 - val_loss: 7.1865 - val_accuracy: 0.1299
Epoch 4/50
721/721 - 53s - loss: 4.2281 - accuracy: 0.2407 - val_loss: 7.2305 - val_accuracy: 0.1297
Epoch 5/50
721/721 - 52s - loss: 4.1667 - accuracy: 0.2474 - val_loss: 7.2852 - val_accuracy: 0.1290
Epoch 6/50
721/721 - 53s - loss: 4.1086 - accuracy: 0.2541 - val_loss: 7.3237 - val_accuracy: 0.1303
Epoch 7/50
721/721 - 53s - loss: 4.0505 - accuracy: 0.2610 - val_loss: 7.3886 - val_accuracy: 0.1288
Epoch 8/50
721/721 - 52s - loss: 3.9973 - accuracy: 0.2667 - val_loss: 7.4409 - val_accuracy: 0.1250
Epoch 9/50
721/721 - 53s - loss: 3.9456 - accuracy: 0.2737 - val_loss: 7.4760 - val_accuracy: 0.1279
Epoch 10/50
721/721 - 54s - loss: 3.8950 - accuracy: 0.2804 - val_loss: 7.5311 - val_accura

In [None]:
#model.save_weights('/Users/oindrilasen/WORK_AREA/Data Science/Projects/Trump_Tweet_Generation/models/trump_tweets_generator_model.h5')

In [None]:
#model.load_weights('/Users/oindrilasen/WORK_AREA/Data Science/Projects/Trump_Tweet_Generation/models/trump_tweets_generator_model.h5')

### Test the Model

In [35]:
# Generate text: starting from a seed, repeatedly sample the next word from the
# model's predicted distribution and append it to the running text.
in_text = "Donald"
n_words = 100  # how many words to append to the seed

# Reverse lookup (integer id -> word). It does not change between iterations,
# so it is built once instead of on every pass through the loop.
index_word = {idx: token for token, idx in tokenizer.word_index.items()}

for _ in range(n_words):
    # encode the text generated so far as integer ids
    enc = tokenizer.texts_to_sequences([in_text])[0]
    # pre-pad to the fixed input width of the model
    enc_padding = pad_sequences([enc], maxlen=max_length-1, padding='pre')
    # float64 keeps the renormalisation below numerically clean
    probs = model.predict(enc_padding, verbose=0).flatten().astype(np.float64)

    # Id 0 is the padding value and has no entry in `word_index`. The softmax
    # still gives it a small non-zero probability, and sampling it would raise
    # KeyError in the lookup below, so mask it out and renormalise.
    probs[0] = 0.0
    probs /= probs.sum()

    index = np.random.choice(len(probs), p=probs)
    word = index_word[index]
    in_text += ' ' + word
print(in_text)

Donald trump a wednesday iowa see you to see you soon a wonderful evening in massachusetts makeamericagreatagain trump2016 supersaturday rally what was so much for the big and spirited crowd realdonaldtrump trump2016 unifying the fact that is not borders lying her husband they decide better trade deals online 2vote trump in our end number of two poll results pundits in paper pundits about their place seat in history resulted arriving dept story comes now shoulder to talk including 1million debate laugh weu amazing event phyllis great terrific date u s s speech arriving 16 congress stands for new anger and role
