## Import Libraries

In [1]:
import random
import re
import sys

import numpy as np
import pandas as pd
####
from keras.models import Sequential
from keras.models import Model
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
from keras.layers import Input, Activation, Embedding, LSTM
from keras.optimizers import RMSprop
from keras.optimizers import Adam
from keras.callbacks import LambdaCallback
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

Using TensorFlow backend.


## Read data

In [2]:
# Load the tweet export (a CSV expected in the working directory) into a DataFrame.
csv_path = "Donald-Tweets.csv"
df = pd.read_csv(csv_path)

In [3]:
# Print the frame's column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7375 entries, 0 to 7374
Data columns (total 12 columns):
 #   Column                                     Non-Null Count  Dtype  
---  ------                                     --------------  -----  
 0   Date                                       7375 non-null   object 
 1   Time                                       7375 non-null   object 
 2   Tweet_Text                                 7375 non-null   object 
 3   Type                                       7375 non-null   object 
 4   Media_Type                                 1225 non-null   object 
 5   Hashtags                                   2031 non-null   object 
 6   Tweet_Id                                   7375 non-null   float64
 7   Tweet_Url                                  7375 non-null   object 
 8   twt_favourites_IS_THIS_LIKE_QUESTION_MARK  7375 non-null   int64  
 9   Retweets                                   7375 non-null   int64  
 10  Unnamed: 10             

In [4]:
# Preview the first two rows of the loaded frame.
df.head(2)

Unnamed: 0,Date,Time,Tweet_Text,Type,Media_Type,Hashtags,Tweet_Id,Tweet_Url,twt_favourites_IS_THIS_LIKE_QUESTION_MARK,Retweets,Unnamed: 10,Unnamed: 11
0,16-11-11,15:26:37,Today we express our deepest gratitude to all ...,text,photo,ThankAVet,7.97e+17,https://twitter.com/realDonaldTrump/status/797...,127213,41112,,
1,16-11-11,13:33:35,Busy day planned in New York. Will soon be mak...,text,,,7.97e+17,https://twitter.com/realDonaldTrump/status/797...,141527,28654,,


## Data Preprocessing

### Convert to Lower Case

In [5]:
# Fold every tweet to lower case; the row labelled 1 is echoed before and
# after the conversion as a quick visual check.
raw_tweets = df['Tweet_Text']
print("...Before...")
print(raw_tweets[1])
text = raw_tweets.str.lower()
print("...After...")
print(text[1])

...Before...
Busy day planned in New York. Will soon be making some very important decisions on the people who will be running our government!
...After...
busy day planned in new york. will soon be making some very important decisions on the people who will be running our government!


### Remove the URLs

In [6]:
def _drop_url_tokens(tweet):
    """Rebuild `tweet` from its whitespace-separated tokens, omitting every
    token that contains the substring 'http'. Kept tokens are re-joined with
    single spaces."""
    kept = []
    for token in tweet.split():
        if 'http' in token:
            continue
        kept.append(token)
    return ' '.join(kept)


print("...Before...")
print(text[100])
text = text.map(_drop_url_tokens)
print("...After...")
print(text[100])

...Before...
hillary advisers wanted her to avoid supporting israel when talking to democrats: https://t.co/y7m8ivu173
...After...
hillary advisers wanted her to avoid supporting israel when talking to democrats:


### Remove short tweets

In [7]:
# Keep only tweets longer than 40 characters and report the row count
# before and after. The surviving rows keep their original index labels.
print("...Before...")
print(len(text))
is_long_enough = text.map(len) > 40
text = text[is_long_enough]
print("...After...")
print(len(text))

...Before...
7375
...After...
6886


### Remove Emojis

In [8]:
def _ascii_only(tweet):
    """Return `tweet` with every character that cannot be encoded as ASCII
    dropped (emojis, mis-encoded bytes, accented letters, ...)."""
    return tweet.encode('ascii', 'ignore').decode('ascii')


print("...Before...")
print(text[49])
text = text.map(_ascii_only)
print("...After...")
print(text[49])

...Before...
join me tomorrow! minnesota ۢ 2pm michigan ۢ 6pm virginia ۢ 9:30p_
...After...
join me tomorrow! minnesota  2pm michigan  6pm virginia  9:30p_


### Tokenizer

In [9]:
# Fit a tokenizer (default settings) on the cleaned tweets.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text)

# Vocabulary size is the number of distinct words the tokenizer counted plus
# one -- presumably to leave room for id 0, which is never assigned to a word
# (TODO confirm against the Tokenizer docs).
distinct_words = len(tokenizer.word_counts)
vocab_size = distinct_words + 1
print("Total Vocabulary: ", vocab_size )

Total Vocabulary:  10760


### Prepare Input Data

#### Divide the data into training and test set

In [10]:
# Work out how many tweets go to each side of the split:
# `train` is 80% of the rows (rounded down), `test` is whatever remains.
prop_train = 0.8
N = len(text)
print("Total Row Count: ", N)
train = int(prop_train * N)
print("Training Data Count: ", train)
test = N - train
print("Test Data Count: ", test)

Total Row Count:  6886
Training Data Count:  5508
Test Data Count:  1378


#### Convert text into integer sequences

In [22]:
# Expand every tweet into all of its prefixes of length >= 2 (the last token
# of each prefix later becomes the prediction target). While doing so, note
# the position of each prefix in `sequences` under either `index_train` or
# `index_test`, depending on whether its tweet is among the first `train` rows.
sequences, index_train, index_test = [], [], []
count = 0
for irow, line in enumerate(text):
    encoded = tokenizer.texts_to_sequences([line])[0]
    # All prefixes of one tweet land on the same side of the split.
    bucket = index_train if irow < train else index_test
    for end in range(2, len(encoded) + 1):
        sequences.append(encoded[:end])
        bucket.append(count)
        count += 1
print('Total Sequences: %d' % (len(sequences)))

Total Sequences: 114825


#### Pad the sequences so that they all have the same length

In [12]:
# Left-pad every prefix up to the length of the longest one so the whole set
# can be stacked into a single 2-D array.
# (`pad_sequences` is imported in the notebook's top import cell, so later
# cells that call it no longer depend on this cell having been run.)
max_length = max(len(seq) for seq in sequences)

sequences = pad_sequences(sequences, maxlen=max_length, padding='pre')
print('Max Sequence Length: %d' % max_length)

Max Sequence Length: 32


In [13]:
# Split each padded row into model input (all columns but the last) and
# target (the last column), one-hot encode the target over the vocabulary,
# then select the train / test rows through the index lists built earlier.
sequences = np.array(sequences)
X = sequences[:, :-1]
y = sequences[:, -1]
y = to_categorical(y, num_classes=vocab_size)

X_train, X_test = X[index_train], X[index_test]
y_train, y_test = y[index_train], y[index_test]
print(X_train.shape)
print(y_train.shape)

(92288, 31)
(92288, 10760)


## Build a Model

In [14]:
def build_model(vocab_size,
                 input_length=1,
                 dim_dense_embedding=10,
                 hidden_unit_LSTM=5):
    """Build (but do not compile) a next-word prediction network.

    Architecture: integer input -> Embedding -> LSTM -> Dense softmax over
    the vocabulary.

    Parameters
    ----------
    vocab_size : int
        Number of classes; used both as the embedding's input dimension and
        as the width of the softmax output layer.
    input_length : int, default 1
        Number of token ids in each input row.
    dim_dense_embedding : int, default 10
        Size of the embedding vector learned for each token id.
    hidden_unit_LSTM : int, default 5
        Number of units in the LSTM layer.

    Returns
    -------
    keras.models.Model
        The uncompiled functional model.
    """
    main_input = Input(shape=(input_length,), dtype='int32', name='main_input')
    embedding = Embedding(vocab_size, dim_dense_embedding,
                          input_length=input_length)(main_input)
    x = LSTM(hidden_unit_LSTM)(embedding)
    main_output = Dense(vocab_size, activation='softmax')(x)
    # The functional API keyword is `outputs` (plural). The singular `output=`
    # is a legacy spelling that only works through a deprecation shim in old
    # Keras releases and is rejected by newer ones.
    model = Model(inputs=[main_input],
                  outputs=[main_output])
    return model

In [15]:
# Instantiate the network; the input width is taken from the feature matrix.
model = build_model(
    vocab_size,
    input_length=X.shape[1],
    dim_dense_embedding=30,
    hidden_unit_LSTM=64,
)

# Targets are one-hot rows, hence categorical cross-entropy.
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Train, evaluating on the held-out sequences after every epoch.
# `tf_model` receives whatever `fit` returns.
fit_options = dict(epochs=20, batch_size=128, verbose=2)
tf_model = model.fit(X_train, y_train,
                     validation_data=(X_test, y_test),
                     **fit_options)

  del sys.path[0]
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 92288 samples, validate on 22537 samples
Epoch 1/20
 - 331s - loss: 7.0834 - accuracy: 0.0317 - val_loss: 7.0605 - val_accuracy: 0.0357
Epoch 2/20
 - 321s - loss: 6.7485 - accuracy: 0.0417 - val_loss: 7.0808 - val_accuracy: 0.0388
Epoch 3/20
 - 322s - loss: 6.6112 - accuracy: 0.0490 - val_loss: 6.9829 - val_accuracy: 0.0479
Epoch 4/20
 - 323s - loss: 6.4357 - accuracy: 0.0654 - val_loss: 6.9046 - val_accuracy: 0.0691
Epoch 5/20
 - 319s - loss: 6.2467 - accuracy: 0.0857 - val_loss: 6.8251 - val_accuracy: 0.0810
Epoch 6/20
 - 341s - loss: 6.0597 - accuracy: 0.1031 - val_loss: 6.7698 - val_accuracy: 0.0924
Epoch 7/20
 - 339s - loss: 5.8843 - accuracy: 0.1184 - val_loss: 6.7521 - val_accuracy: 0.0988
Epoch 8/20
 - 356s - loss: 5.7255 - accuracy: 0.1314 - val_loss: 6.7482 - val_accuracy: 0.1012
Epoch 9/20
 - 331s - loss: 5.5842 - accuracy: 0.1417 - val_loss: 6.7483 - val_accuracy: 0.1084
Epoch 10/20
 - 327s - loss: 5.4563 - accuracy: 0.1505 - val_loss: 6.7600 - val_accuracy: 0.1101

In [16]:
# Persist the trained weights to disk.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run elsewhere until it is made relative or configurable.
weights_file = '/Users/oindrilasen/WORK_AREA/Data Science/Projects/Trump_Tweet_Generation/models/trump_tweets_generator_model.h5'
model.save_weights(weights_file)

In [17]:
# Restore previously saved weights into the current model.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run elsewhere until it is made relative or configurable.
weights_file = '/Users/oindrilasen/WORK_AREA/Data Science/Projects/Trump_Tweet_Generation/models/trump_tweets_generator_model.h5'
model.load_weights(weights_file)

### Test the Model

In [19]:
# Generate text by repeatedly sampling the next word from the model's
# predicted distribution and appending it to the running prompt.
in_text = "America"
n_words_to_generate = 100

# Reverse lookup (token id -> word). It never changes, so build it once
# instead of on every iteration.
index_word = {v: k for k, v in tokenizer.word_index.items()}

for _ in range(n_words_to_generate):
    # Encode the text generated so far as integer ids.
    enc = tokenizer.texts_to_sequences([in_text])[0]
    # Pre-pad (or truncate) to the input width the model was trained with.
    enc_padding = pad_sequences([enc], maxlen=max_length-1, padding='pre')
    # float64 keeps the renormalisation below numerically tight.
    probs = model.predict(enc_padding, verbose=0).flatten().astype('float64')
    # Id 0 is the padding value and has no entry in `word_index`; sampling it
    # would raise KeyError in the lookup below, so exclude it and renormalise.
    probs[0] = 0.0
    probs /= probs.sum()
    index = np.random.choice(len(probs), p=probs)
    word = index_word[index]
    in_text += ' ' + word
print(in_text)

America was made on the rails yesterday to be tough tonight in math realdonaldtrump while time trump is trumptrain we must make up so horrible talk about them the country paulteutulsr cost of they get this lead for the boy is a success door and anncoulter needed hillary nice words by the s 4 made to vote all in order to make america great again live to have seen your volunteers thanks will make our win country i called it ohio again are unless we unearthed it again got experts up and end but i am far at congress that the
