# Sentiment Analysis with RNN



In [1]:
import sklearn
import matplotlib.pyplot as plt
%matplotlib inline
import pandas
from sklearn.cross_validation import train_test_split
import numpy
import re
#import nltk
#from nltk.corpus import stopwords
import numpy as np
import tensorflow as tf



In [2]:
Tweet = pandas.read_csv("Tweets.csv")


## Data preprocessing

In [3]:
def tweet_to_words(raw_tweet):
    letters_only = re.sub("[^a-zA-Z]", " ",raw_tweet) 
    words = letters_only.lower().split()                             
    #stops = set(stopwords.words("english"))                  
    #meaningful_words = [w for w in words if not w in stops] 
    return( " ".join( words )) 



In [4]:
Tweet['clean_tweet']=Tweet['text'].apply(lambda x: tweet_to_words(x))
Tweet['sentiment'] = Tweet['airline_sentiment'].apply(lambda x: 0 if x == 'negative' else 1)


In [5]:
#Join all the words in review to build a corpus

all_text = ' '.join(Tweet['clean_tweet'])
words = all_text.split()

### Encoding the words

The embedding lookup requires that we pass in integers to our network. The easiest way to do this is to create dictionaries that map the words in the vocabulary to integers. Then we can convert each of our reviews into integers so they can be passed into the network.


In [6]:
from collections import Counter
counts = Counter(words)
vocab = sorted(counts, key=counts.get, reverse=True)
vocab_to_int = {word: ii for ii, word in enumerate(vocab, 1)}

#print(vocab_to_int)
tweet_ints = []
for each in Tweet['clean_tweet']:
    tweet_ints.append([vocab_to_int[word] for word in each.split()])

In [7]:
print(tweet_ints[100])
Tweet["text"][100]

[81, 114, 1, 419, 12, 2774, 4060, 1, 12, 10378, 1311, 32, 1463, 81, 38, 252, 39, 11, 36, 6818, 941, 305, 6914]


'@VirginAmerica trying to add my boy Prince to my ressie. SF this Thursday @VirginAmerica from LAX http://t.co/GsB2J3c4gM'

### Encoding the labels

Our labels are "positive" or "negative". To use these labels in our network, we need to convert them to 0 and 1.

> **Exercise:** Convert labels from `positive` and `negative` to 1 and 0, respectively.

In [8]:
labels = np.array([0 if each == 'negative' else 1 for each in Tweet['airline_sentiment'][:]]) #np.array(Tweet['sentiment'])


In [9]:
tweet_len = Counter([len(x) for x in tweet_ints])
print("Zero-length reviews: {}".format(tweet_len[0]))
print("Maximum review length: {}".format(max(tweet_len)))

Zero-length reviews: 0
Maximum review length: 34


Turns out its the final review that has zero length. But that might not always be the case, so let's make it more general.

In [10]:
seq_len = max(tweet_len)
features = np.zeros((len(tweet_ints), seq_len), dtype=int)
for i, row in enumerate(tweet_ints):
    features[i, -len(row):] = np.array(row)[:seq_len]

In [11]:
features[:10,:seq_len]

array([[    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,    81,    51,  7695,   223],
       [    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,    81,   541,     5,
           91,  1137,  2494,     1,     3,   203, 12001],
       [    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,    81,     2,   200,    11,    99,   773,
          569,     2,    78,     1,   155,   149,   191],
       [    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,    81,    18,
           27,   134,  4117,     1,  5673,  5

## Training, Validation, Test



With our data in nice shape, we'll split it into training, validation, and test sets.

> **Exercise:** Create the training, validation, and test sets here. You'll need to create sets for the features and the labels, `train_x` and `train_y` for example. Define a split fraction, `split_frac` as the fraction of data to keep in the training set. Usually this is set to 0.8 or 0.9. The rest of the data will be split in half to create the validation and testing data.

In [12]:
split_frac = 0.8
split_idx = int(len(features)*0.8)
train_x, val_x = features[:split_idx], features[split_idx:]
train_y, val_y = labels[:split_idx], labels[split_idx:]

test_idx = int(len(val_x)*0.5)
val_x, test_x = val_x[:test_idx], val_x[test_idx:]
val_y, test_y = val_y[:test_idx], val_y[test_idx:]

print("\t\t\tFeature Shapes:")
print("Train set: \t\t{}".format(train_x.shape), 
      "\nValidation set: \t{}".format(val_x.shape),
      "\nTest set: \t\t{}".format(test_x.shape))

print("Train set: \t\t{}".format(train_y.shape), 
      "\nValidation set: \t{}".format(val_y.shape),
      "\nTest set: \t\t{}".format(test_y.shape))

			Feature Shapes:
Train set: 		(11712, 34) 
Validation set: 	(1464, 34) 
Test set: 		(1464, 34)
Train set: 		(11712,) 
Validation set: 	(1464,) 
Test set: 		(1464,)


## Build the graph

Here, we'll build the graph. First up, defining the hyperparameters.

* `lstm_size`: Number of units in the hidden layers in the LSTM cells. Usually larger is better performance wise. Common values are 128, 256, 512, etc.
* `lstm_layers`: Number of LSTM layers in the network. I'd start with 1, then add more if I'm underfitting.
* `batch_size`: The number of reviews to feed the network in one training pass. Typically this should be set as high as you can go without running out of memory.
* `learning_rate`: Learning rate

In [13]:
lstm_size = 256
lstm_layers = 1
batch_size = 100
learning_rate = 0.001

For the network itself, we'll be passing in our 200 element long review vectors. Each batch will be `batch_size` vectors. We'll also be using dropout on the LSTM layer, so we'll make a placeholder for the keep probability.

In [3]:
n_words = len(vocab_to_int)
print(n_words)
# Create the graph object
graph = tf.Graph()
# Add nodes to the graph
with graph.as_default():
    inputs_ = tf.placeholder(tf.int32, [None, None], name='inputs')
    labels_ = tf.placeholder(tf.int32, [None, None], name='labels')
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')

NameError: name 'vocab_to_int' is not defined

### Embedding

Now we'll add an embedding layer. We need to do this because there are 74000 words in our vocabulary. It is massively inefficient to one-hot encode our classes here. You should remember dealing with this problem from the word2vec lesson. Instead of one-hot encoding, we can have an embedding layer and use that layer as a lookup table. You could train an embedding layer using word2vec, then load it here. But, it's fine to just make a new layer and let the network learn the weights.




In [15]:
# Size of the embedding vectors (number of units in the embedding layer)
embed_size = 300 

with graph.as_default():
    embedding = tf.Variable(tf.random_uniform((n_words, embed_size), -1, 1))
    embed = tf.nn.embedding_lookup(embedding, inputs_)

In [16]:
with graph.as_default():
    # Your basic LSTM cell
    lstm = tf.contrib.rnn.BasicLSTMCell(lstm_size)
    
    # Add dropout to the cell
    drop = tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=keep_prob)
    
    # Stack up multiple LSTM layers, for deep learning
    cell = tf.contrib.rnn.MultiRNNCell([drop] * lstm_layers)
    
    # Getting an initial state of all zeros
    initial_state = cell.zero_state(batch_size, tf.float32)

### RNN forward pass


In [17]:
with graph.as_default():
    outputs, final_state = tf.nn.dynamic_rnn(cell, embed,
                                             initial_state=initial_state)

### Output

We only care about the final output, we'll be using that as our sentiment prediction. So we need to grab the last output with `outputs[:, -1]`, the calculate the cost from that and `labels_`.

In [18]:
with graph.as_default():
    predictions = tf.contrib.layers.fully_connected(outputs[:, -1], 1, activation_fn=tf.sigmoid)
    cost = tf.losses.mean_squared_error(labels_, predictions)
    
    optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)

### Validation accuracy

Here we can add a few nodes to calculate the accuracy which we'll use in the validation pass.

In [19]:
with graph.as_default():
    correct_pred = tf.equal(tf.cast(tf.round(predictions), tf.int32), labels_)
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

### Batching

This is a simple function for returning batches from our data. First it removes data such that we only have full batches. Then it iterates through the `x` and `y` arrays and returns slices out of those arrays with size `[batch_size]`.

In [20]:
def get_batches(x, y, batch_size=100):
    
    n_batches = len(x)//batch_size
    x, y = x[:n_batches*batch_size], y[:n_batches*batch_size]
    for ii in range(0, len(x), batch_size):
        yield x[ii:ii+batch_size], y[ii:ii+batch_size]

## Training

Below is the typical training code. If you want to do this yourself, feel free to delete all this code and implement it yourself. Before you run this, make sure the `checkpoints` directory exists.

In [21]:
print(train_x)
train_y[:None]

[[    0     0     0 ...,    51  7695   223]
 [    0     0     0 ...,     3   203 12001]
 [    0     0     0 ...,   155   149   191]
 ..., 
 [    0     0     0 ...,  1121  1737    13]
 [    0     0     0 ...,    32    15  1102]
 [    0     0     0 ...,     0    13    41]]


array([1, 1, 1, ..., 0, 0, 1])

In [26]:
epochs = 10

with graph.as_default():
    saver = tf.train.Saver()

with tf.Session(graph=graph) as sess:
    sess.run(tf.global_variables_initializer())
    iteration = 1
    for e in range(epochs):
        state = sess.run(initial_state)
        
        for ii, (x, y) in enumerate(get_batches(train_x, train_y, batch_size), 1):
            feed = {inputs_: x,
                    labels_: y[:, None],
                    keep_prob: 0.5,
                    initial_state: state}
            loss, state, _ = sess.run([cost, final_state, optimizer], feed_dict=feed)
            
            if iteration%5==0:
                print("Epoch: {}/{}".format(e, epochs),
                      "Iteration: {}".format(iteration),
                      "Train loss: {:.3f}".format(loss))

            if iteration%25==0:
                val_acc = []
                val_state = sess.run(cell.zero_state(batch_size, tf.float32))
                for x, y in get_batches(val_x, val_y, batch_size):
                    feed = {inputs_: x,
                            labels_: y[:, None],
                            keep_prob: 1,
                            initial_state: val_state}
                    batch_acc, val_state = sess.run([accuracy, final_state], feed_dict=feed)
                    val_acc.append(batch_acc)
                print("Val acc: {:.3f}".format(np.mean(val_acc)))
            iteration +=1
    saver.save(sess, "checkpoints/twitter_sentiment.ckpt")

Epoch: 0/10 Iteration: 5 Train loss: 0.150
Epoch: 0/10 Iteration: 10 Train loss: 0.223
Epoch: 0/10 Iteration: 15 Train loss: 0.144
Epoch: 0/10 Iteration: 20 Train loss: 0.153
Epoch: 0/10 Iteration: 25 Train loss: 0.177
Val acc: 0.747
Epoch: 0/10 Iteration: 30 Train loss: 0.187
Epoch: 0/10 Iteration: 35 Train loss: 0.181
Epoch: 0/10 Iteration: 40 Train loss: 0.178
Epoch: 0/10 Iteration: 45 Train loss: 0.195
Epoch: 0/10 Iteration: 50 Train loss: 0.234
Val acc: 0.631
Epoch: 0/10 Iteration: 55 Train loss: 0.186
Epoch: 0/10 Iteration: 60 Train loss: 0.197
Epoch: 0/10 Iteration: 65 Train loss: 0.212
Epoch: 0/10 Iteration: 70 Train loss: 0.171
Epoch: 0/10 Iteration: 75 Train loss: 0.173
Val acc: 0.748
Epoch: 0/10 Iteration: 80 Train loss: 0.159
Epoch: 0/10 Iteration: 85 Train loss: 0.174
Epoch: 0/10 Iteration: 90 Train loss: 0.157
Epoch: 0/10 Iteration: 95 Train loss: 0.144
Epoch: 0/10 Iteration: 100 Train loss: 0.136
Val acc: 0.726
Epoch: 0/10 Iteration: 105 Train loss: 0.229
Epoch: 0/10 Ite

## Testing

In [25]:
get_batches(test_x, test_y, batch_size)

<generator object get_batches at 0x7fb01465d2b0>

In [6]:
graph = tf.Graph()
with graph.as_default():
    saver = tf.train.Saver()

ValueError: No variables to save

In [5]:
  
test_acc = []
test_pred = []
with tf.Session(graph=graph) as sess:
    saver.restore(sess, tf.train.latest_checkpoint('checkpoints'))
    test_state = sess.run(cell.zero_state(batch_size, tf.float32))
    for ii, (x, y) in enumerate(get_batches(test_x, test_y, batch_size), 1):
        feed = {inputs_: x,
                labels_: y[:, None],
                keep_prob: 1,
                initial_state: test_state}
        batch_acc, test_state= sess.run([accuracy, final_state], feed_dict=feed)
        test_acc.append(batch_acc)
        prediction = tf.cast(tf.round(predictions),tf.int32)
        prediction = sess.run(prediction,feed_dict=feed)
        test_pred.append(prediction)
        
        
        
    print("Test accuracy: {:.3f}".format(np.mean(test_acc)))
    

NameError: name 'saver' is not defined