# Autoencoder to predict the next tweet of Donald Trump

### Preprocess data

In [7]:
import numpy as np
import pandas as pd

def load_file(file_name):
    # convert all elements to string to avoid pandas dtype guessing
    data = pd.read_csv(file_name,  converters={i: str for i in range(35000)})['Text']

    # remove all retweets and replies
    remove = (data.str.contains("RT", case=True, na=False) | data.str.contains("RE", case=True, na=False))
    data = data[~remove] # ~: element-wise NOT operation

    # remove all urls 
    # https://stackoverflow.com/questions/6883049/regex-to-extract-urls-from-href-attribute-in-html-with-python
    data = data.str.replace("https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+(?:(\/\S+)*)", "", regex=True) 
    return data

data= load_file('./data/lessTweets.csv' )
print("Total number of data: ", data.shape)
print(data.head(10))

Total number of data:  (30084,)
0     I have not heard any of the pundits or comment...
1     I would have done even better in the election,...
2     Campaigning to win the Electoral College is mu...
4     especially how to get people, even with an unl...
5     Bill Clinton stated that I called him after th...
6     "@mike_pence: Congratulations to @RealDonaldTr...
7     "@Franklin_Graham: Congratulations to Presiden...
8     We did it! Thank you to all of my great suppor...
9     Today there were terror attacks in Turkey, Swi...
10    If my many supporters acted and threatened peo...
Name: Text, dtype: object


### Load data into tokenizer

In [10]:
#import keras
from keras.preprocessing.text import Tokenizer

# keep the most common 20000 words
tokenizer = Tokenizer(20000)
tokenizer.fit_on_texts(data)

Using TensorFlow backend.


In [26]:
from keras.preprocessing.sequence import pad_sequences

word2index = tokenizer.word_index
index2word = {v: k for k, v in word2index.items()}
print("Unique tokens: {0}".format(len(word2index)))

sequences = tokenizer.texts_to_sequences(data)
# pad sequences to be the same length
seq = pad_sequences(sequences)
print("Shape of data: {0} sentences with {1} words at most".format(seq.shape[0], seq.shape[1]))
maxWords = seq.shape[1]

Unique tokens: 35755
Shape of data: 30084 sentences with 36 words at most


### Doing word embedding layer

In [29]:
import os
embeddingIdx = {}
dims = 25
filePath = "./data/glove.twitter.27B." + str(dims) + "d.txt"
f = open(filePath)
for line in f:
    values = line.split()
    word = values[0]
    embeddingIdx[word] = np.asarray(values[1:], dtype='float32') # Coefficients
f.close()

print('Found {0} word vectors.'.format(len(embeddingIdx)))

embeddingMtx = np.zeros((len(word2index) + 1, dims)) 
for word, i in word2index.items():
    embeddingVec = embeddingIdx.get(word)
    if embeddingVec is not None:
        embedding_matrix[i] = embedding_vector


Found 1193515 word vectors.


### Loading embedding matrix onto the layer

In [28]:
from keras.layers import Embedding

embeddingLayer = Embedding(len(word2index) + 1,dims, weights=[embeddingMtx],input_length=maxWords,trainable=False)