# Autoencoder to predict the next tweet of Donald Trump

### Preprocess data

In [32]:
import numpy as np
import pandas as pd

data_len = 30000
num_words = 15000

def load_file(file_name):
    # convert all elements to string to avoid pandas dtype guessing
    data = pd.read_csv(file_name,  converters={i: str for i in range(35000)})['Text']

    # remove all retweets and replies
    remove = (data.str.contains("RT", case=True, na=False) | data.str.contains("RE", case=True, na=False))
    data = data[~remove] # ~: element-wise NOT operation

    # remove all urls 
    # https://stackoverflow.com/questions/6883049/regex-to-extract-urls-from-href-attribute-in-html-with-python
    data = data.str.replace("https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+(?:(\/\S+)*)", "", regex=True) 
    return data

data = load_file('./data/lessTweets.csv' )[: data_len]
print("Total number of data: ", data.shape)
print(data.head(10))

Total number of data:  (30000,)
0     I have not heard any of the pundits or comment...
1     I would have done even better in the election,...
2     Campaigning to win the Electoral College is mu...
4     especially how to get people, even with an unl...
5     Bill Clinton stated that I called him after th...
6     "@mike_pence: Congratulations to @RealDonaldTr...
7     "@Franklin_Graham: Congratulations to Presiden...
8     We did it! Thank you to all of my great suppor...
9     Today there were terror attacks in Turkey, Swi...
10    If my many supporters acted and threatened peo...
Name: Text, dtype: object


### Load data into tokenizer

In [33]:
#import keras
from keras.preprocessing.text import Tokenizer

# keep the most common 15000 words
tokenizer = Tokenizer(num_words)
tokenizer.fit_on_texts(data)

In [34]:
from keras.preprocessing.sequence import pad_sequences

word2index = tokenizer.word_index
index2word = {v: k for k, v in word2index.items()}
print("Unique tokens: {0}".format(len(word2index)))

sequences = tokenizer.texts_to_sequences(data)
# pad sequences to be the same length
seq = pad_sequences(sequences)
print("Shape of data: {0} sentences with at most {1} words".format(seq.shape[0], seq.shape[1]))

Tx = seq.shape[1]

Unique tokens: 35706
Shape of data: 30000 sentences with at most 36 words


### Doing word embedding layer

In [29]:
import os

embedding_index = {}
embedding_dim = 25

filePath = "./data/glove.twitter.27B." + str(embedding_dim) + "d.txt"
f = open(filePath)
for line in f:
    values = line.split()
    word = values[0]
    embedding_index[word] = np.asarray(values[1:], dtype='float32') # Coefficients
f.close()

print('Found {0} word vectors.'.format(len(embedding_index)))

Found 1193514 word vectors.


In [30]:
glove_embedding_matrix = np.zeros((num_words, embedding_dim))
for word, i in word2index.items():
    if i < num_words:
        embedding_vector = embedding_index.get(word)
        if embedding_vector is not None:
            # words not found in embedding index will be the word embedding of 'unk'.
            glove_embedding_matrix[i] = embedding_vector
        else:
            glove_embedding_matrix[i] = embedding_index.get('unk')
            
print("FInish loading embedding weights from Glove")

FInish loading embedding weights from Glove


### Defining Model layers for Variational Autoencoder

In [49]:
from keras.layers import Bidirectional, Dense, Embedding, Input, Lambda, LSTM, RepeatVector, TimeDistributed, Layer, Activation, Dropout
from keras.layers.advanced_activations import ReLU
from keras import backend as K

n_a = 64 # number of hidden units
n_l = 16 # number of latent units
drop_rate = 0.2 # dropout rate
batch_size = 100

# initialize input layer
x = Input(batch_shape=(None, Tx), name='input') 
# initialize word embedding layer with Glove weights
x_embed = Embedding(num_words, embedding_dim, weights=[glove_embedding_matrix], input_length = Tx, trainable=False, name='embed')(x)

# initialize bidirectional lstm encoder
h = Bidirectional(LSTM(n_a, return_sequences=False, recurrent_dropout=0.2), merge_mode='concat', name='lstm')(x_embed)
# initialize dropout layer
h = Dropout(drop_rate, name='dropout')(h)
# initialize dense layer
h = Dense(n_a, activation='linear', name='dense')(h)
# initialize relu activication layer
h = ReLU(name='relu')(h)
# initialize another dense layer
h = Dropout(drop_rate, name='dropout2')(h)

# variational autoencoder: 
# z to describe the latent state
z_mean = Dense(n_l, name='z_mean')(h)
z_log_var = Dense(n_l, name='z_log_var')(h)

def sampling(args):
    z_mean, z_log_var = args
    # randomize sampling
    epsilon = K.random_normal((K.shape(z_mean)[0], K.int_shape(z_mean)[1]))
    return z_mean + K.exp(0.5 * z_log_var) * epsilon

# z contains two vectors to describe the latent state
z = Lambda(sampling, output_shape=(n_l,))([z_mean, z_log_var])

# repeat sample Tx times
repeator = RepeatVector(Tx)
# decode from latent space
decoder_h = LSTM(n_a, return_sequences=True, recurrent_dropout=drop_rate)
# TimeDistributed layer to keep one-to-one relations on input and output
decoder_mean = TimeDistributed(Dense(num_words, activation='linear'))
# decoded hidden layer
h_decoded = decoder_h(repeator(z))
# decoded output layer
x_decoded_mean = decoder_mean(h_decoded)

In [51]:
import tensorflow as tf
from keras.models import Model
from keras.optimizers import Adam

# placeholder loss
def zero_loss(y_true, y_pred):
    return K.zeros_like(y_pred)

# Custom VAE loss layer
class CustomVariationalLayer(Layer):
    def __init__(self, **kwargs):
        self.is_placeholder = True
        super(CustomVariationalLayer, self).__init__(**kwargs)
        self.target_weights = tf.constant(np.ones((batch_size, Tx)), tf.float32)

    def vae_loss(self, x, x_decoded_mean):
        labels = tf.cast(x, tf.int32)
        xent_loss = K.sum(tf.contrib.seq2seq.sequence_loss(x_decoded_mean, labels, 
                                                     weights=self.target_weights,
                                                     average_across_timesteps=False,
                                                     average_across_batch=False), axis=-1)
        kl_loss = - 0.5 * K.sum(1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=-1)
        return K.mean(xent_loss + kl_loss)

    def call(self, inputs):
        x = inputs[0]
        x_decoded_mean = inputs[1]
        print(x.shape, x_decoded_mean.shape)
        loss = self.vae_loss(x, x_decoded_mean)
        self.add_loss(loss, inputs=inputs)
        # we don't use this output, but it has to have the correct shape:
        return K.ones_like(x)

loss_layer = CustomVariationalLayer()([x, x_decoded_mean])
vae = Model(x, [loss_layer])
opt = Adam(lr=0.01) #SGD(lr=1e-2, decay=1e-6, momentum=0.9, nesterov=True)
vae.compile(optimizer='adam', loss=[zero_loss])
vae.summary()

(?, 36) (?, 36, 15000)
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input (InputLayer)              (None, 36)           0                                            
__________________________________________________________________________________________________
embed (Embedding)               (None, 36, 25)       375000      input[0][0]                      
__________________________________________________________________________________________________
lstm (Bidirectional)            (None, 128)          46080       embed[0][0]                      
__________________________________________________________________________________________________
dropout (Dropout)               (None, 128)          0           lstm[0][0]                       
______________________________________________________________________________________

### Model training

In [59]:
from keras.callbacks import ModelCheckpoint

epoch = 10
seq_valid = seq[25000:] # validation data
n_steps = (data_len - 5000) / batch_size # training data
checkpointer = ModelCheckpoint('./model/vae_seq2seq.h5', verbose=1, save_best_only=False)

def generator(seq, batch_size):
    for i in range(batch_size, len(seq), batch_size):
        yield[seq[i - batch_size: i], seq[i - batch_size: i]]
    
for i in range(epoch):
    print('-------epoch: ', i, '--------')
    vae.fit_generator(generator(seq, batch_size),
                      steps_per_epoch=n_steps, epochs=1, callbacks=[checkpointer],
                      validation_data=(seq_valid, seq_valid))
    
vae.save('./model/vae.h5')

-------epoch:  0 --------
Epoch 1/1

KeyboardInterrupt: 

In [None]:
# build a model to project sentences on the latent space
encoder = Model(x, z_mean)

# build a generator that can sample sentences from the learned distribution
decoder_input = Input(shape=(n_l,))
_h_decoded = decoder_h(repeator(decoder_input))
_x_decoded_mean = decoder_mean(_h_decoded)
_x_decoded_mean = Activation('softmax')(_x_decoded_mean)
generator = Model(decoder_input, _x_decoded_mean)

index2word = {v: k for k, v in word2index.items()}
sent_encoded = encoder.predict(seq_valid, batch_size = 16)
x_test_reconstructed = generator.predict(sent_encoded)
                                         
sent_idx = 672
reconstructed_indexes = np.apply_along_axis(np.argmax, 1, x_test_reconstructed[sent_idx])
#np.apply_along_axis(np.max, 1, x_test_reconstructed[sent_idx])
#np.max(np.apply_along_axis(np.max, 1, x_test_reconstructed[sent_idx]))
word_list = list(np.vectorize(index2word.get)(reconstructed_indexes))
original_sent = list(np.vectorize(index2word.get)(data_1_val[sent_idx]))

print(word_list, original_sent)

# function to parse a sentence
def sent_parse(sentence, mat_shape):
    sequence = tokenizer.texts_to_sequences(sentence)
    padded_sent = pad_sequences(sequence, maxlen=Tx)
    return padded_sent#[padded_sent, sent_one_hot]

# input: encoded sentence vector
# output: encoded sentence vector in dataset with highest cosine similarity
def find_similar_encoding(sent_vect):
    all_cosine = []
    for sent in sent_encoded:
        result = 1 - spatial.distance.cosine(sent_vect, sent)
        all_cosine.append(result)
    data_array = np.array(all_cosine)
    maximum = data_array.argsort()[-3:][::-1][1]
    new_vec = sent_encoded[maximum]
    return new_vec

# input: two points, integer n
# output: n equidistant points on the line between the input points (inclusive)
def shortest_homology(point_one, point_two, num):
    dist_vec = point_two - point_one
    sample = np.linspace(0, 1, num, endpoint = True)
    hom_sample = []
    for s in sample:
        hom_sample.append(point_one + s * dist_vec)
    return hom_sample

# input: original dimension sentence vector
# output: sentence text
def print_latent_sentence(sent_vect):
    sent_vect = np.reshape(sent_vect,[1,n_l])
    sent_reconstructed = generator.predict(sent_vect)
    sent_reconstructed = np.reshape(sent_reconstructed,[Tx,num_words])
    reconstructed_indexes = np.apply_along_axis(np.argmax, 1, sent_reconstructed)
    np.apply_along_axis(np.max, 1, x_test_reconstructed[sent_idx])
    np.max(np.apply_along_axis(np.max, 1, x_test_reconstructed[sent_idx]))
    word_list = list(np.vectorize(index2word.get)(reconstructed_indexes))
    w_list = [w for w in word_list if w]
    print(' '.join(w_list))
    #print(word_list)
        
def new_sents_interp(sent1, sent2, n):
    tok_sent1 = sent_parse(sent1, [15])
    tok_sent2 = sent_parse(sent2, [15])
    enc_sent1 = encoder.predict(tok_sent1, batch_size = 16)
    enc_sent2 = encoder.predict(tok_sent2, batch_size = 16)
    test_hom = shortest_homology(enc_sent1, enc_sent2, n)
    for point in test_hom:
        print_latent_sentence(point)

sentence1=['where can i find a book on machine learning']
mysent = sent_parse(sentence1, [15])
mysent_encoded = encoder.predict(mysent, batch_size = 16)
print_latent_sentence(mysent_encoded)
print_latent_sentence(find_similar_encoding(mysent_encoded))

sentence2=['how can i become a successful entrepreneur']
mysent2 = sent_parse(sentence2, [15])
mysent_encoded2 = encoder.predict(mysent2, batch_size = 16)
print_latent_sentence(mysent_encoded2)
print_latent_sentence(find_similar_encoding(mysent_encoded2))
print('-----------------')

new_sents_interp(sentence1, sentence2, 6)