In [1]:
import numpy as np
import tensorflow as tf
import matplotlib
from string import punctuation
import json
from collections import Counter

In [2]:
#load the json file
with open('Amazon_Instant_Video_5.json', 'r') as file:
    reviewstext = file.read()

In [3]:
#convert each line text into json and form the json array
reviewsjson = []
for review in reviewstext.split('\n'):
    if(review != ''):
        reviewsjson.append(json.loads(review))

In [4]:
len(reviewsjson)


37126

In [5]:
reviewsjson[1]['reviewText']

'I highly recommend this series. It is a must for anyone who is yearning to watch "grown up" television. Complex characters and plots to keep one totally involved. Thank you Amazin Prime.'

In [6]:
reviewsjson[1]

{'reviewerID': 'A3BC8O2KCL29V2',
 'asin': 'B000H00VBQ',
 'reviewerName': 'Carol T',
 'helpful': [0, 0],
 'reviewText': 'I highly recommend this series. It is a must for anyone who is yearning to watch "grown up" television. Complex characters and plots to keep one totally involved. Thank you Amazin Prime.',
 'overall': 5.0,
 'summary': 'Excellent Grown Up TV',
 'unixReviewTime': 1346630400,
 'reviewTime': '09 3, 2012'}

In [7]:
reviewsjson[0]['overall']

2.0

In [8]:
#a function for replacing characters and converting special form of words
def replaceSpecialChars(reviewtext):
    reviewtext = reviewtext.replace('.', ' ')
    reviewtext = reviewtext.replace(',', ' ')
    reviewtext = reviewtext.replace(';', ' ')
    reviewtext = reviewtext.replace(':', ' ')
    reviewtext = reviewtext.replace(',', ' ')
    reviewtext = reviewtext.replace('?', ' ')
    reviewtext = reviewtext.replace('!', ' ')
    reviewtext = reviewtext.replace('*', ' ')
    reviewtext = reviewtext.replace('&', ' ')
    reviewtext = reviewtext.replace('(', ' ')
    reviewtext = reviewtext.replace(')', ' ')
    reviewtext = reviewtext.replace('[', ' ')
    reviewtext = reviewtext.replace(']', ' ')
    reviewtext = reviewtext.replace('{', ' ')
    reviewtext = reviewtext.replace('}', ' ')
    reviewtext = reviewtext.replace('/', ' ')
    reviewtext = reviewtext.replace('\\', ' ')
    reviewtext = reviewtext.replace('-', ' ')
    reviewtext = reviewtext.replace('+', ' ')
    reviewtext = reviewtext.replace('\'s', ' ')
    return reviewtext

#prepare all text vocabulary and word2int dictionary, etc
allwords = set() # this is the word universe
allwords_counter = Counter() # this will be used to clip most frequently occurring and least frequently occurring words
positivewords_counter = Counter() #to identify words that occur more frequently in positive reviews
negativewords_counter = Counter() #to identify words that occur more frequently in negative reviews
for review in reviewsjson:
    reviewtext = review['reviewText'].lower()
    reviewtext = replaceSpecialChars(reviewtext)
    words = reviewtext.split(' ')
    allwords.update(words)
    allwords_counter.update(words)
    if int(review['overall'])>=3: #consider positive
        positivewords_counter.update(words)
    else:
        negativewords_counter.update(words)
print('finished preprocessing the review texts.')

finished preprocessing the review texts.


In [9]:
positivewords_counter 

Counter({'i': 56260,
         'highly': 1108,
         'recommend': 1739,
         'this': 41192,
         'series': 12089,
         '': 464378,
         'it': 52894,
         'is': 57057,
         'a': 77605,
         'must': 1394,
         'for': 24767,
         'anyone': 1045,
         'who': 9557,
         'yearning': 13,
         'to': 72935,
         'watch': 8028,
         '"grown': 3,
         'up"': 22,
         'television': 1182,
         'complex': 362,
         'characters': 7724,
         'and': 92478,
         'plots': 775,
         'keep': 2350,
         'one': 12932,
         'totally': 581,
         'involved': 609,
         'thank': 354,
         'you': 17995,
         'amazin': 2,
         'prime': 1217,
         'mysteries': 301,
         'are': 19530,
         'interesting': 3949,
         'the': 167603,
         'tension': 344,
         'between': 2031,
         'robson': 7,
         'tall': 52,
         'blond': 24,
         'good': 11459,
         'but': 23442,

In [10]:
#prepare a correlation between positive words and negative words and most common words to both sentiments
most_common_words = allwords_counter.most_common()
pos_neg_ratio = {}
for word, count in most_common_words:
    pos_count = positivewords_counter[word]
    neg_count = negativewords_counter[word]
    pos_neg_ratio[word] = float(pos_count)/float(neg_count+1) #applying log for the values of ratios and subtracting 1 
    pos_neg_ratio[word] = np.log(pos_neg_ratio[word]+0.000001)-1 #shifting the axis centre to 1
    #now most common words (such as the,was,has,what,is,on,for,of, etc) will fall around 0-1. We can eliminate them
print('finished computing pos to neg ratio correlation.')

finished computing pos to neg ratio correlation.


In [11]:
#prepare vocabulary set
vocab = []
for word, count in most_common_words:
    if (count > 50) & ((pos_neg_ratio[word] >= 1.0) | (pos_neg_ratio[word] < 0.0)):
        vocab.append(word)
print('vocab length:{0}'.format(len(vocab)))

vocab length:2197


In [12]:
#prepare vocabulary to int dictionary
vocab_to_int = {word:i for i, word in enumerate(vocab)}
print('finished preparing vocab_to_int. Length:{0}'.format(len(vocab_to_int)))

finished preparing vocab_to_int. Length:2197


In [14]:
#prepare features and labels

# for full dataset validation reviewsjson_part = reviewsjson

reviewsjson_part = reviewsjson[:1280]



features = np.zeros((len(reviewsjson_part), len(vocab)))
labels = np.zeros((len(reviewsjson_part), 1))

pos_label_index = 0
neg_label_index = 1

def convert_review_to_feature(json):
    reviewtext = review['reviewText'].lower()
    reviewtext = replaceSpecialChars(reviewtext)
    words = reviewtext.split(' ')
    vec = np.zeros(len(vocab))
    for word in words:
        if(word in vocab_to_int):
            vec[vocab_to_int[word]] = 1
    return vec

def convert_review_to_label_onehot(json):
    score = int(review['overall'])
    vec = np.zeros(2)
    if(score >= 3):
        vec[pos_label_index] = 1
    else:
        vec[neg_label_index] = 1
    return vec

def convert_review_to_label(json):
    score = int(review['overall'])
    vec = 0
    if(score >= 3):
        vec = 1
    return vec

for i, review in enumerate(reviewsjson_part):
    features[i] = convert_review_to_feature(review)
    labels[i] = convert_review_to_label(review)

print('finished preparing features and labels')

finished preparing features and labels


In [15]:
#prepare training and testing set
train_split_per = 0.8
validation_split_per = 0.1
features_count = len(features)

features_train, features_test = features[: int(train_split_per*features_count)], features[int(train_split_per*features_count):]
labels_train, labels_test = labels[: int(train_split_per*features_count)], labels[int(train_split_per*features_count):]

print('finished splitting training and test sets')

finished splitting training and test sets


In [16]:
#build graph for RNN
#uses LSTM memory cell
lstm_size = 64
lstm_layers = 1
batch_size = 128
learning_rate = 0.001

graph = tf.Graph()

# create the input and label placeholders
with graph.as_default():
    inputs_ = tf.placeholder(tf.int32, [None, None], name='inputs')
    labels_ = tf.placeholder(tf.int32, [None, None], name='labels')
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')

#create the first layer (embedding)
embed_size = 50 #embedding layer size

with graph.as_default():
    embedding = tf.Variable(tf.random_uniform((len(vocab), embed_size), -1, 1))
    embed = tf.nn.embedding_lookup(embedding, inputs_)

print('finished initializing params and creating embedding layers')

finished initializing params and creating embedding layers


In [17]:
#create the lstm layer
with graph.as_default():
    lstm = tf.contrib.rnn.BasicLSTMCell(lstm_size)
    #adding a drop out layer
    drop = tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=keep_prob)
    #stack up the layer
    cell_layers = tf.contrib.rnn.MultiRNNCell([drop]*lstm_layers)
    #get the initial state for assigning     
    init_state = cell_layers.zero_state(batch_size, tf.float32)

print('finished creating lstm layers')

finished creating lstm layers


In [18]:
#do the forward pass of RNN
with graph.as_default():
    outputs, final_state = tf.nn.dynamic_rnn(cell_layers, embed, initial_state=init_state)
    
    #form a regular fully_connected nn layer to obtain predictions
    predictions = tf.contrib.layers.fully_connected(outputs[:, -1], 1, activation_fn=tf.sigmoid) #sigmoid because just two labels
    
    #cost computation
    cost = tf.losses.mean_squared_error(labels_, predictions)
    
    #optimizer
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)
    
    #validation accuracy
    correct_pred = tf.equal(tf.cast(tf.round(predictions), tf.int32), labels_)
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
    
print('finished creating forward pass functions')

finished creating forward pass functions


In [19]:
#creating a batching function
def get_batches(x, y, batch_size=100):
    #get the total number of batches that can be formed
    n = len(x) // batch_size
    x_clipped = x[:n*batch_size]
    y_clipped = y[:n*batch_size]
    
    batches = []
    for i in range(0, len(x), batch_size):
        x_batch = x_clipped[i: i+batch_size]
        y_batch = y_clipped[i: i+batch_size]
        batch = [x_batch, y_batch]
        if(len(x_batch) >0):
            batches.append(batch)
    
    return batches

print('finished creating batching function')

finished creating batching function


In [20]:
#prepare saver
with graph.as_default():
    saver = tf.train.Saver()

In [21]:
#training function
epochs = 2

with tf.Session(graph=graph) as session:
    session.run(tf.global_variables_initializer())
    
    iteration = 1
    
    for e in range(epochs):
        state = session.run(init_state)
        
        #get the batches
        batches = get_batches(features_train, labels_train, batch_size)
        
        val_batch_index = np.random.randint(0, len(batches))
        x_val = batches[val_batch_index][0]
        y_val = batches[val_batch_index][1]
        
        #run for each batch
        for i, (x, y) in enumerate(batches, 1):
            
            if i==val_batch_index:
                continue
            
            #feed dictionary
            feed_dict = {inputs_:x, labels_:y, keep_prob:0.75, init_state: state}
            
            #execute cost, optimizer, and final state
            loss, state, _ = session.run([cost, final_state, optimizer], feed_dict=feed_dict)
            
            print("Epoch: {}/{}".format(e, epochs),
                      "Iteration: {}".format(iteration),
                      "Train loss: {:.3f}".format(loss))
                        
            #execute cross validation process 
            feed_dict_val = {inputs_:x_val, labels_:y_val, keep_prob:1, init_state: state}
            
            #execute val accuracy and final state
            batch_acc, val_state = session.run([accuracy, final_state], feed_dict=feed_dict_val)
            print("Validation accuracy: {:.3f}".format(batch_acc))
            
            iteration+=1
            
    #save the model after every epoch
    saver.save(session, "checkpoints_amazon/sentiment.ckpt")
    
print('finished training and saved model in each epoch')

Epoch: 0/2 Iteration: 1 Train loss: 0.285
Validation accuracy: 0.961
Epoch: 0/2 Iteration: 2 Train loss: 0.190
Validation accuracy: 0.961
Epoch: 0/2 Iteration: 3 Train loss: 0.119
Validation accuracy: 0.961
Epoch: 0/2 Iteration: 4 Train loss: 0.072
Validation accuracy: 0.961
Epoch: 0/2 Iteration: 5 Train loss: 0.062
Validation accuracy: 0.961
Epoch: 0/2 Iteration: 6 Train loss: 0.117
Validation accuracy: 0.961
Epoch: 0/2 Iteration: 7 Train loss: 0.042
Validation accuracy: 0.961
Epoch: 1/2 Iteration: 8 Train loss: 0.042
Validation accuracy: 0.961
Epoch: 1/2 Iteration: 9 Train loss: 0.026
Validation accuracy: 0.961
Epoch: 1/2 Iteration: 10 Train loss: 0.018
Validation accuracy: 0.961
Epoch: 1/2 Iteration: 11 Train loss: 0.051
Validation accuracy: 0.961
Epoch: 1/2 Iteration: 12 Train loss: 0.038
Validation accuracy: 0.961
Epoch: 1/2 Iteration: 13 Train loss: 0.122
Validation accuracy: 0.961
Epoch: 1/2 Iteration: 14 Train loss: 0.038
Validation accuracy: 0.961
finished training and saved m

In [22]:
# testing function

with tf.Session(graph=graph) as session:
    saver.restore(session, tf.train.latest_checkpoint('checkpoints_amazon'))
    
    #execute testing, prepare test state
    
    #get the batches
    test_batches = get_batches(features_test, labels_test, batch_size)
    
    test_acc = []
    
    for i,(x,y) in enumerate(test_batches, 1):
        
        feed_dict_test= {inputs_:x, labels_:y, keep_prob:1, init_state: state}
        
        batch_acc, test_state = session.run([accuracy, final_state], feed_dict=feed_dict_test)
        test_acc.append(batch_acc)
        print("Current batch Test accuracy: {:.3f}".format(batch_acc))
        print("Overall test accuracy: {:.3f}".format(np.mean(test_acc)))

INFO:tensorflow:Restoring parameters from checkpoints_amazon/sentiment.ckpt
Current batch Test accuracy: 0.977
Overall test accuracy: 0.977
Current batch Test accuracy: 0.945
Overall test accuracy: 0.961
