In [1]:
import numpy as np
import tensorflow as tf
import pandas as pd
import math
from matplotlib import pyplot as plt
%matplotlib inline

# Recurrent Neural Networks: Sentiment Analysis

### Importing data

In [3]:
#_dir = "C:\\Users\\talha\\Google Drive\\virginia tech\\Computer Science\\Deep Learning\\assignments\\homework2\\sentiment-data"
_dir = "/home/talha/deeplearning/sentiment-data/"

def load_data(_dir):
    train_file = _dir + "train.csv"
    test_file = _dir + "test.csv"
    word_vec_file = _dir + "word-vectors.txt"

    train_data = np.genfromtxt(train_file, dtype=str, delimiter=',')
    test_data = np.genfromtxt(test_file, dtype=str, delimiter=',')
    word_vec = np.genfromtxt(word_vec_file,dtype=str,delimiter=',')
    return (train_data, test_data, word_vec)

def get_data(_dir):
    # Load the data
    (train_data, test_data, word_vec) = load_data(_dir)
    
    num_word_vec_encodings = word_vec.shape[0]
    word_vec_dim = word_vec.shape[1] - 1
    
    # Get word vectors
    word_vectors = word_vec.reshape(word_vec.shape[0], word_vec_dim + 1)
    word_vec_df = pd.DataFrame(index=word_vec[:, 0], data=list(word_vec[:,np.arange(1,word_vec_dim+1)]))
    
    assert word_vec_df.shape == (num_word_vec_encodings, word_vec_dim)
    
    train_x = train_data[:,1]
    train_y = train_data[:,0]

    test_x = test_data[:,1]
    test_y = test_data[:,0]

    # encode positive class as +1 and negative class as -1
    train_y[train_y=='postive']=1
    train_y[train_y=='negative']=0
    test_y[test_y=='postive']=1
    test_y[test_y=='negative']=0

    # convert datatype of class encodings
    train_y=train_y.astype(int)
    test_y=test_y.astype(int)
    
    return (train_x, train_y, test_x, test_y, word_vec, word_vec_df)

(train_x, train_y, test_x, test_y, word_vec, word_vec_df) = get_data(_dir)


In [4]:
print (train_x.shape)
print (train_y.shape)
print (test_x.shape)
print (test_y.shape)

# assert that data loaded correctly
assert train_x.shape == (20000,)
assert train_y.shape == (20000,)
assert test_x.shape == (5000,)
assert test_y.shape == (5000,)

# map each word to an index
# at each of those idices, the mapping is placed in word_vec
word_indices_list = list(word_vec[:,0])
#print(word_indices_list[-10:-1])
#print(word_indices_list.index('zsombor'))

# word_vectors: Contains vector representations of the words
word_vectors = word_vec[:,1:51]
#word_vectors = word_vectors.astype(dtype=np.float)
print(word_vectors.shape)
word_vectors = word_vectors.astype(dtype=np.float32)

(20000,)
(20000,)
(5000,)
(5000,)
(317934, 50)


In [5]:
zeros = np.zeros((50))
word_vectors = np.vstack((word_vectors,zeros))
print(word_vectors.shape)
print(word_vectors[-1])

(317935, 50)
[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]


In [7]:
trainSentences = train_x.shape[0]
maxSeqLength = 100

ids = np.full((trainSentences, maxSeqLength), word_vectors.shape[0]-1, dtype='int32')
sentence_counter=0
for sentence in train_x:
    split = sentence.split()
    word_counter=0
    for word in split:
        try:
            ids[sentence_counter][word_counter] = word_indices_list.index(word)
        except ValueError:
            pass
        word_counter=word_counter+1    
        if word_counter >= maxSeqLength:
            break
    sentence_counter = sentence_counter+1
np.save('train_matrix', ids)

In [8]:
# For test data
testSentences = test_x.shape[0]
maxSeqLength = 100

test_ids = np.full((testSentences, maxSeqLength), word_vectors.shape[0]-1, dtype='int32')
sentence_counter=0
for sentence in test_x:
    split = sentence.split()
    word_counter=0
    for word in split:
        try:
            test_ids[sentence_counter][word_counter] = word_indices_list.index(word)
        except ValueError:
            pass
        word_counter=word_counter+1    
        if word_counter >= maxSeqLength:
            break
    sentence_counter = sentence_counter+1
np.save('test_matrix', test_ids)

In [9]:
# Also save these, needed in training
np.save('word_vectors', word_vectors)
np.save('sen_train_y', train_y)
np.save('sen_test_y', test_y)

In [2]:
# Load all the data that's needed for training.
# Can do this step instead of loading all the data and performing word-to-vector 
# transformations.
ids = np.load('train_matrix.npy')
test_ids = np.load('test_matrix.npy')
word_vectors = np.load('word_vectors.npy')
train_y = np.load('sen_train_y.npy')
test_y = np.load('sen_test_y.npy')
word_vectors=word_vectors.astype(np.float32)

In [3]:
print(ids.shape)
print(test_ids.shape)
print(train_y.shape)
print(test_y.shape)

(20000, 100)
(5000, 100)
(20000,)
(5000,)


In [4]:
#_dir = "/home/talha/deeplearning/sentiment-data/"
#word_vec_file=_dir+"word-vectors.txt"
#word_vec = np.genfromtxt(word_vec_file,dtype=str,delimiter=',')

Generating and testing encoding here

In [5]:
_ids_test = ids[2].reshape(1,100)
with tf.device('/cpu:0'):
    input_data = tf.placeholder(tf.int32, [1, 100], name='test_placeholder')
    encoded_data = tf.Variable(tf.zeros([1, 100, 50]), dtype=tf.float32)
    encoded_data = tf.nn.embedding_lookup(word_vectors, input_data)

sess = tf.Session()
x = sess.run(encoded_data, feed_dict={input_data: _ids_test})
print(x)
sess.close()

[[[-0.43943     0.29657999  0.44867    ..., -1.06570005  0.43312001
    0.24698   ]
  [ 0.11626     0.53896999 -0.39513999 ..., -0.39061999 -0.10885     0.084513  ]
  [-0.75335997  0.54070002  0.064126   ..., -1.0632     -0.76071
    1.03789997]
  ..., 
  [ 0.          0.          0.         ...,  0.          0.          0.        ]
  [ 0.          0.          0.         ...,  0.          0.          0.        ]
  [ 0.          0.          0.         ...,  0.          0.          0.        ]]]


## Vanilla RNN on sentiment analysis data

In [8]:
# Model Hyperparameters

max_sequence_length = 100
vector_dimensions = 50
batch_size = 16
rnn_units = 100
num_classes = 2
train_examples = train_y.shape[0]
train_iters = 2
learning_rate = 1e-3

with tf.device('/cpu:0'):
    tf.reset_default_graph()

    # Placeholders for model input
    input_data = tf.placeholder(tf.int32, [batch_size, max_sequence_length])
    labels = tf.placeholder(tf.float32, [batch_size, num_classes])

    # Variable for getting encoded data for input vectors
    encoded_data = tf.Variable(tf.zeros([batch_size, max_sequence_length, vector_dimensions]), dtype=tf.float32)
    encoded_data = tf.nn.embedding_lookup(word_vectors, input_data)

    # Hidden Layer of RNN cells
    rnn_cell = tf.contrib.rnn.BasicRNNCell(rnn_units, activation=tf.tanh)    
    
    #The model converges quickly in 2 epochs and gives ~70% accuracy on test data.
    #Adding dropout (without dropout, converged quickly in 2 epochs but gave 70%
    #accuracy on test data. Similar 70.83% accuracy was achieved with 5 epochs).
    #Might be because of overfitting, so dropout seems necessary here
    #With dropout and 5 training iterations, getting 56.27 % accuracy on test data
    #Should try either of two things now.. 1) decrease keep_prob from 0.75 2) train for less number of epochs. Trying 2 first
    #Training for less number of epochs has not helped.
    #So now trying with varying the batch sizes of the input data.
    #Larger batch sizes train quickly but giving me low accuracy (~50 to 59%)
    #Tried with smaller batch sizes: batch_size = 10, gives me accuracy of 65%
    #But for this, the accuracy on training data is 93% only and the cost/loss 
    #I get is 0.41. So, it seems I can try to multiple iterations.
    
    #rnn_cell = tf.contrib.rnn.DropoutWrapper(cell=rnn_cell, output_keep_prob=0.5)
    outputs, state = tf.nn.dynamic_rnn(rnn_cell, encoded_data, dtype=tf.float32)

    # Weights for output layers
    W_o = tf.Variable(tf.truncated_normal([rnn_units, num_classes]), name='W_o')
    b_o = tf.Variable(tf.constant(0.1, shape=[num_classes]), name='b_o')


    #get the output of the final cell
    outputs = tf.transpose(outputs, [1, 0, 2])
    final_output = tf.gather(outputs, int(outputs.get_shape()[0]) - 1) 

    # Logits
    logits = (tf.matmul(final_output, W_o) + b_o)

    correct_prediction = tf.equal(tf.argmax(logits,1), tf.argmax(labels,1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))


    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=labels))
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss)

    init = tf.global_variables_initializer()
    
    rnn_sess = tf.Session()
    rnn_sess.run(init)

for epoch in range(train_iters):
    avg_cost = 0.
    avg_acc = 0.
    total_batch = int(train_examples/batch_size)

    # Loop over all batches
    for i in range(total_batch):
        offset = (i * batch_size) % (batch_size)
        batch_x = ids[offset:offset+batch_size]
        batch_y = tf.one_hot(train_y[offset:offset+batch_size], depth=num_classes).eval(session=rnn_sess)

        # Run training step and cost op 
        t, c, a = rnn_sess.run(
                        [optimizer, loss, accuracy], 
                        feed_dict={input_data: batch_x, labels: batch_y}
                        )
        avg_cost += c
        avg_acc += a

    # Compute average cost
    avg_cost /= total_batch
    avg_acc /= total_batch

    print("Epoch: " + str(epoch+1) + ", cost = " + str(avg_cost)) 
    print("Epoch: " + str(epoch+1) + ", Accuracy = " + str(avg_acc)) 

Epoch: 1, cost = 0.00171604360035
Epoch: 1, Accuracy = 0.9992
Epoch: 2, cost = 7.02255456963e-07
Epoch: 2, Accuracy = 1.0


In [9]:
test_iters = math.floor(test_y.shape[0] / batch_size)

test_accuracy = 0
for i in range(test_iters):
    offset = (i * batch_size) % (batch_size)
    batch_x = test_ids[offset:offset+batch_size]
    batch_y = tf.one_hot(train_y[offset:offset+batch_size], depth=num_classes).eval(session=rnn_sess)
    
    test_accuracy += (rnn_sess.run(accuracy, feed_dict={input_data: batch_x, labels: batch_y}))

test_accuracy /= test_iters
print("Accuracy on test data = " + str(test_accuracy * 100) + " %")

rnn_sess.close()

Accuracy on test data = 68.75 %
