# Hillary or Donald with LSTMs

In [64]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

wordsList = np.load('wordsList.npy')
print('Loaded the word list!')
wordsList = wordsList.tolist() #Originally loaded as numpy array
wordsList = [word.decode('UTF-8') for word in wordsList] #Encode words as UTF-8
wordVectors = np.load('wordVectors.npy')
print ('Loaded the word vectors!')


Loaded the word list!
Loaded the word vectors!


Just to make sure everything has been loaded in correctly, we can look at the dimensions of the vocabulary list and the embedding matrix. 

In [65]:
print(len(wordsList))
print(wordVectors.shape)
#print(wordVectors)
#print(wordsList)

400000
(400000, 50)


In [105]:
waitIndex = wordsList.index('wait')
wordVectors[waitIndex]

array([ 0.42236999, -0.15824001,  0.32442001, -0.87483001,  0.43257001,
       -0.91895002, -0.67919999,  1.11210001, -0.09794   , -0.23305   ,
       -0.28874001,  0.15379   , -0.10902   ,  0.37869   ,  1.40869999,
        0.84034997, -0.14228   , -0.54035997,  0.065478  , -0.85974002,
        0.2243    ,  0.29426   ,  0.35782   ,  0.57761002,  0.67767   ,
       -1.35169995,  0.013808  ,  0.15085   ,  0.67284   , -0.24376   ,
        2.49760008,  0.76786   , -0.95763999,  0.20433   ,  0.35602999,
       -0.40217999,  0.64647001,  0.031975  ,  0.23342   , -0.74050999,
       -0.62553   , -0.20900001, -0.0046038 ,  0.54782999, -0.035645  ,
       -0.031944  ,  0.50266999, -0.013276  , -0.036279  ,  0.57063001], dtype=float32)

![caption](Images/Pipeline for Data preparation.png)

In [121]:
import tensorflow as tf
from nltk.tokenize import TweetTokenizer
from tensorflow.contrib import rnn
from collections import Counter
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.decomposition import PCA
from nltk.tokenize import TweetTokenizer, word_tokenize
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import log_loss
%matplotlib inline

In [122]:
maxSeqLength = 100

In [123]:
# Removes punctuation, parentheses, question marks, etc., and leaves only alphanumeric characters
import re
strip_special_chars = re.compile("[^A-Za-z0-9 ]+")

def cleanSentences(string):
    string = string.lower().replace("<br , : ! @ # $ %; />", " ")
    return re.sub(strip_special_chars, "", string.lower())

In [124]:
# Read in files from "train"
tweets = pd.read_csv('tweets.csv')
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [125]:
# 
train.loc[train["handle"]=='HillaryClinton','handle']=0
train.loc[train["handle"]=='realDonaldTrump','handle']=1
test.handle = 0

In [126]:
tweets_train = train[0:4000]
tweets_test = test[4001:4742]

In [127]:
tweets_train.tweet

0       The question in this election: Who can put the...
1       Last night, Donald Trump said not paying taxes...
2       If we stand together, there's nothing we can't...
3       Both candidates were asked about how they'd co...
4       Join me for a 3pm rally - tomorrow at the Mid-...
5       When Donald Trump goes low...register to vote:...
6       3) Has Trump offered a single proposal to redu...
7       The election is just weeks away. Check if you'...
8       Hillary Clinton's Campaign Continues To Make F...
9       'CNBC, Time magazine online polls say Donald T...
10      Donald Trump lied to the American people at le...
11      In the last 24 hrs. we have raised over $13M f...
12      “She gained about 55 pounds in...9 months. She...
13      We don’t want to turn against each other.\nWe ...
14      "What we hear from my opponent is dangerously ...
15      One candidate made it clear he wasn’t prepared...
16      "I’m really glad my dad never had a contract w...
17      When y

In [128]:
len(tweets_train)

4000

In [129]:
fileCounter =0
ids = np.zeros((len(tweets_train),100), dtype='int32')
i_train = 0
for i in range (4000):
    indexCounter = 0
    line=tweets_train.tweet[i]
    cleanedLine = cleanSentences(line)
    split = cleanedLine.split()
    for word in split:
        try:
            ids[fileCounter][indexCounter] = wordsList.index(word)
        except ValueError:
            ids[fileCounter][indexCounter] = 399999 #Vector for unkown words
        indexCounter = indexCounter + 1
        if indexCounter >= maxSeqLength:
            break
    fileCounter = fileCounter + 1 
np.save('idsMatrix',ids)

In [130]:
ids = np.load('idsMatrix.npy')

In [131]:
ids.shape

(4000, 100)

In [132]:
ids[300]

array([201534,   9147,    301,      5,    266,     38,     33,    724,
          162,    123,   7763,    439,     73,   3907,  10468, 399999,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0], dtype=int32)

# Helper Functions

Below you can find a couple of helper functions that will be useful when training the network in a later step. 

In [133]:
from random import randint
def getTrainBatch():
    labels = []
    arr = np.zeros([batchSize, maxSeqLength])
    for i in range(batchSize):
        num = randint(1,numFiles - 1)
        if(tweets_train.handle[num] == 0): 
            labels.append([0,1])
        else:
            labels.append([1,0])
        arr[i] = ids[num-1:num]
    return arr, labels

In [134]:
def getTestBatch():
    labels = []
    arr = np.zeros([batchSize, maxSeqLength])
    for i in range(batchSize):
        num = randint(1,742)
        if (tweets_test.handle[num] == 0): 
        
            labels.append([0,1])
        else:
            labels.append([1,0])
        arr[i] = ids[num-1:num]
    return arr, labels

In [135]:
numFiles = 4000

In [136]:
if(tweets_train.handle[2] == 0):
    print(5)

5


In [137]:
ids

array([[201534,    995,      6, ...,      0,      0,      0],
       [    76,    364,   3907, ...,      0,      0,      0],
       [    83,     53,   1346, ...,      0,      0,      0],
       ..., 
       [399999,   1962,  63404, ...,      0,      0,      0],
       [399999,    169,     12, ...,      0,      0,      0],
       [  5551,     81,    453, ...,      0,      0,      0]], dtype=int32)

# LSTM Model

Now, we’re ready to start creating our Tensorflow graph. We’ll first need to define some hyperparameters, such as batch size, number of LSTM units, number of output classes, and number of training iterations. 

In [138]:
batchSize = 24
lstmUnits = 64
numClasses = 2
iterations = 100000

As with most Tensorflow graphs, we’ll now need to specify two placeholders, one for the inputs into the network, and one for the labels. The most important part about defining these placeholders is understanding each of their dimensionalities. 

The labels placeholder represents a set of values, each either [1, 0] or [0, 1], depending on whether each training example is positive or negative. Each row in the integerized input placeholder represents the integerized representation of each training example that we include in our batch.

In [139]:
import tensorflow as tf
tf.reset_default_graph()

labels = tf.placeholder(tf.float32, [batchSize, numClasses])
input_data = tf.placeholder(tf.int32, [batchSize, maxSeqLength])

![caption](Images/Pipeline word to vector.png)

Once we have our input data placeholder, we’re going to call the tf.nn.lookup() function in order to get our word vectors. The call to that function will return a 3-D Tensor of dimensionality batch size by max sequence length by word vector dimensions. In order to visualize this 3-D tensor, you can simply think of each data point in the integerized input tensor as the corresponding D dimensional vector that it refers to. 


![caption](Images/Pipeline Word to Vector with dimension.png)

In [140]:
numDimensions = 300

In [141]:
data = tf.Variable(tf.zeros([batchSize, maxSeqLength, numDimensions]),dtype=tf.float32)
data = tf.nn.embedding_lookup(wordVectors,input_data)

Now that we have the data in the format that we want, let’s look at how we can feed this input into an LSTM network. We’re going to call the tf.nn.rnn_cell.BasicLSTMCell function. This function takes in an integer for the number of LSTM units that we want. This is one of the hyperparameters that will take some tuning to figure out the optimal value. We’ll then wrap that LSTM cell in a dropout layer to help prevent the network from overfitting. 

Finally, we’ll feed both the LSTM cell and the 3-D tensor full of input data into a function called tf.nn.dynamic_rnn. This function is in charge of unrolling the whole network and creating a pathway for the data to flow through the RNN graph.

In [142]:
lstmCell = tf.contrib.rnn.BasicLSTMCell(lstmUnits)
lstmCell = tf.contrib.rnn.DropoutWrapper(cell=lstmCell, output_keep_prob=0.75)
value, _ = tf.nn.dynamic_rnn(lstmCell, data, dtype=tf.float32)

As a side note, another more advanced network architecture choice is to stack multiple LSTM cells on top of each other. This is where the final hidden state vector of the first LSTM feeds into the second. Stacking these cells is a great way to help the model retain more long term dependence information, but also introduces more parameters into the model, thus possibly increasing the training time, the need for additional training examples, and the chance of overfitting. For more information on how you can add stacked LSTMs to your model, check out Tensorflow's excellent [documentation](https://www.tensorflow.org/tutorials/recurrent#stacking_multiple_lstms).

The first output of the dynamic RNN function can be thought of as the last hidden state vector. This vector will be reshaped and then multiplied by a final weight matrix and a bias term to obtain the final output values.

In [143]:
weight = tf.Variable(tf.truncated_normal([lstmUnits, numClasses]))
bias = tf.Variable(tf.constant(0.1, shape=[numClasses]))
value = tf.transpose(value, [1, 0, 2])
last = tf.gather(value, int(value.get_shape()[0]) - 1)
prediction = (tf.matmul(last, weight) + bias)

Next, we’ll define correct prediction and accuracy metrics to track how the network is doing. The correct prediction formulation works by looking at the index of the maximum value of the 2 output values, and then seeing whether it matches with the training labels. 

In [144]:
correctPred = tf.equal(tf.argmax(prediction,1), tf.argmax(labels,1))
accuracy = tf.reduce_mean(tf.cast(correctPred, tf.float32))

We’ll define a standard cross entropy loss with a softmax layer put on top of the final prediction values. For the optimizer, we’ll use Adam and the default learning rate of .001. 

In [145]:
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=labels))
optimizer = tf.train.AdamOptimizer().minimize(loss)

If you’d like to use Tensorboard to visualize the loss and accuracy values, you can also run and the modify the following code. 

In [146]:
import datetime

tf.summary.scalar('Loss', loss)
tf.summary.scalar('Accuracy', accuracy)
merged = tf.summary.merge_all()
logdir = "tensorboard/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + "/"
writer = tf.summary.FileWriter(logdir,sess.graph)

# Hyperparameter Tuning

Choosing the right values for your hyperparameters is a crucial part of training deep neural networks effectively. You'll find that your training loss curves can vary with your choice of optimizer (Adam, Adadelta, SGD, etc), learning rate, and network architecture. With RNNs and LSTMs in particular, some other important factors include the number of LSTM units and the size of the word vectors.

* Learning Rate: RNNs are infamous for being diffult to train because of the large number of time steps they have. Learning rate becomes extremely important since we don't want our weight values to fluctuate wildly as a result of a large learning rate, nor do we want a slow training process due to a low learning rate. The default value of 0.001 is a good place to start. You should increase this value if the training loss is changing very slowly, and decrease if the loss is unstable.  
* Optimizer: There isn't a consensus choice among researchers, but Adam has been widely popular due to having the adaptive learning rate property (Keep in mind that optimal learning rates can differ with the choice of optimizer).
* Number of LSTM units: This value is largely dependent on the average length of your input texts. While a greater number of units provides more expressibility for the model and allows the model to store more information for longer texts, the network will take longer to train and will be computationally expensive. 
* Word Vector Size: Dimensions for word vectors generally range from 50 to 300. A larger size means that the vector is able to encapsulate more information about the word, but you should also expect a more computationally expensive model. 

# Training

The basic idea of the training loop is that we first define a Tensorflow session. Then, we load in a batch of reviews and their associated labels. Next, we call the session’s `run` function. This function has two arguments. The first is called the "fetches" argument. It defines the value we’re interested in computing. We want our optimizer to be computed since that is the component that minimizes our loss function. The second argument is where we input our `feed_dict`. This data structure is where we provide inputs to all of our placeholders. We need to feed our batch of reviews and our batch of labels. This loop is then repeated for a set number of training iterations.

Instead of training the network in this notebook (which will take at least a couple of hours), we’ll load in a pretrained model.

If you decide to train this notebook on your own machine, note that you can track its progress using [TensorBoard](https://www.tensorflow.org/get_started/summaries_and_tensorboard). While the following cell is running, use your terminal to enter the directory that contains this notebook, enter `tensorboard --logdir=tensorboard`, and visit http://localhost:6006/ with a browser to keep an eye on your training progress.

In [163]:
sess = tf.InteractiveSession()
saver = tf.train.Saver()
sess.run(tf.global_variables_initializer())

for i in range(1000):
    #
    
    
   #Next Batch of reviews
    nextBatch, nextBatchLabels = getTrainBatch();
    sess.run(optimizer, {input_data: nextBatch, labels: nextBatchLabels})
   
   #Write summary to Tensorboard
    if (i % 50 == 0):
        summary = sess.run(merged, {input_data: nextBatch, labels: nextBatchLabels})
        writer.add_summary(summary, i)
        print(i)

   #Save the network every 10,000 training iterations
    if (i % 100 == 0 and i != 0):
        save_path = saver.save(sess, "models/pretrained_lstm.ckpt", global_step=i)
        print("saved to %s" % save_path)
        iterations = 10
        average = 0.0
        for j in range(iterations):
            nextBatch, nextBatchLabels = getTrainBatch();
    
            #print("Accuracy for this batch:", (sess.run(accuracy, {input_data: nextBatch, labels: nextBatchLabels})) * 100)
            accuracy_current = sess.run(accuracy, {input_data: nextBatch, labels: nextBatchLabels}) * 100
            average = (average * j + accuracy_current)/(j + 1)
           #print("Accuracy for this batch:", accuracy_current)
        print("i:", i)
        print("average:", average)
writer.close()



sess = tf.InteractiveSession()
saver = tf.train.Saver()
sess.run(tf.global_variables_initializer())

for i in range(iterations):
#Next Batch of reviews
    nextBatch, nextBatchLabels = getTrainBatch();
    sess.run(optimizer, {input_data: nextBatch, labels: nextBatchLabels})
   
    #Write summary to Tensorboard
    if (i % 50 == 0):
        summary = sess.run(merged, {input_data: nextBatch, labels: nextBatchLabels})
        writer.add_summary(summary, i)

    #Save the network every 10,000 training iterations
    if (i % 10000 == 0 and i != 0):
        save_path = saver.save(sess, "models/pretrained_lstm.ckpt", global_step=i)
        print("saved to %s" % save_path)
    writer.close()

0
50
100
saved to models/pretrained_lstm.ckpt-100
i: 100
average: 62.0833328366
150
200
saved to models/pretrained_lstm.ckpt-200
i: 200
average: 59.1666662693
250
300
saved to models/pretrained_lstm.ckpt-300
i: 300
average: 61.6666671634
350
400
saved to models/pretrained_lstm.ckpt-400
i: 400
average: 58.750000596
450
500
saved to models/pretrained_lstm.ckpt-500
i: 500
average: 60.4166659713
550
600
saved to models/pretrained_lstm.ckpt-600
i: 600
average: 59.1666662693
650
700
saved to models/pretrained_lstm.ckpt-700
i: 700
average: 58.3333337307
750
800
saved to models/pretrained_lstm.ckpt-800
i: 800
average: 60.000000298
850
900
saved to models/pretrained_lstm.ckpt-900
i: 900
average: 57.0833334327
950


# Loading a Pretrained Model

Our pretrained model’s accuracy and loss curves during training can be found below. 

Looking at the training curves above, it seems that the model's training is going well. The loss is decreasing steadily, and the accuracy is approaching 100 percent. However, when analyzing training curves, we should also pay special attention to the possibility of our model overfitting the training dataset. Overfitting is a common phenomenon in machine learning where a model becomes so fit to the training data that it loses the ability to generalize to the test set. This means that training a network until you achieve 0 training loss might not be the best way to get an accurate model that performs well on data it has never seen before. Early stopping is an intuitive technique commonly used with LSTM networks to combat this issue. The basic idea is that we train the model on our training set, while also measuring its performance on the test set every now and again. Once the test error stops its steady decrease and begins to increase instead, you'll know to stop training, since this is a sign that the network has begun to overfit. 

Loading a pretrained model involves defining another Tensorflow session, creating a Saver object, and then using that object to call the restore function. This function takes into 2 arguments, one for the current session, and one for the name of the saved model. 

In [164]:
sess = tf.InteractiveSession()
saver = tf.train.Saver()
saver.restore(sess, tf.train.latest_checkpoint('models'))

INFO:tensorflow:Restoring parameters from models/pretrained_lstm.ckpt-900


Then we’ll load some movie reviews from our test set. Remember, these are reviews that the model has not been trained on and has never seen before. The accuracy for each test batch can be seen when you run the following code. 

In [166]:
sess.run(accuracy, {input_data: nextBatch, labels: nextBatchLabels}) * 100

58.333331346511841

# Conclusion

My accuracy is only 58.333. It means that my code is only a little better them ramdam guessing. I will try to make them better when I fully understand the algorithm.