<h1>Classifying Amazon instant video reviews with a RNN using pretrained word embeddings</h1>

In [1]:
import tensorflow as tf
import json
import pandas as pd
import numpy as np
from sklearn import model_selection
from sklearn import metrics
import zipfile
from sklearn.preprocessing import LabelEncoder
import codecs
import gzip
REVIEW_FT = 'reviews_words'
ZIP_FILE = 'data/sentiment labelled sentences.zip'
DIR_TO_EXTRACT = 'data/'
glove_filename = '../../temp/glove.twitter.27B.25d.txt'

json_data = []
with gzip.open('data/reviews_Amazon_Instant_Video_5.json.gz', 'rb') as gzip_file:
    for json_str in gzip_file:
        json_data.append(json.loads(json_str))
reviews_df = pd.DataFrame.from_records(json_data)

  return f(*args, **kwds)
  from ._conv import register_converters as _register_converters


In [2]:
def build_word_vector_matrix(vector_file):
    np_arrays = []
    labels_array = []
    with codecs.open(vector_file, 'r', 'utf-8') as f:
        for i, line in enumerate(f):
            sr = line.split()
            if(len(sr)<26):
                continue
            labels_array.append(sr[0])
            np_arrays.append(np.array([float(j) for j in sr[1:]]))
    return np.array(np_arrays), labels_array
embeddings,vocabulary = build_word_vector_matrix(glove_filename)
voc_size = len(vocabulary)
WD_EMB_SIZE = len(embeddings[0])
embeddings = np.asarray(embeddings)
WINDOW_SIZE = WD_EMB_SIZE
STRIDE_SIZE = int(WD_EMB_SIZE/2)
print(WD_EMB_SIZE)
print(STRIDE_SIZE)
print(embeddings.shape)

25
12
(1193513, 25)


In [4]:
reviews_df = reviews_df.sample(frac=0.5)
print(reviews_df.shape)
print(reviews_df.head(5))

(9282, 9)
             asin  helpful  overall  \
34730  B00I3MNVBW  [7, 10]      1.0   
382    B000IXUOP0   [1, 1]      5.0   
5785   B003OURWUO   [0, 0]      5.0   
13702  B007UMK3SA   [0, 0]      3.0   
32678  B00HNZHPGC   [1, 1]      2.0   

                                              reviewText   reviewTime  \
34730  There is NO way a professional football cheerl...  02 20, 2014   
382    as we all know Season 3 leaves us with Gibbs g...  01 11, 2010   
5785   I love all the Foyle/s War programs.  Michael ...  03 15, 2014   
13702  Reasonably entertaining with decent acting and...  03 20, 2014   
32678  The show is based on a book series called &#34...  07 12, 2014   

           reviewerID                                      reviewerName  \
34730  A39YKEKCLUVL0L                                              Tina   
382    A1U11PS71ML0YJ  Derrick Jenkins "love of Gaming/Movies/Anime/...   
5785   A2UNGG75A2TBEU                                    michael kallal   
13702  A1H5QWR9G

In [5]:
average_review_size = int(sum([len(c) for c in reviews_df.reviewText])/reviews_df.shape[0])

In [6]:
lencoder = LabelEncoder()
voc_processor = tf.contrib.learn.preprocessing.VocabularyProcessor(average_review_size)
voc_processor.fit(vocabulary)
X_transform = voc_processor.transform(reviews_df.reviewText)
X_transform = np.array(list(X_transform))
y = lencoder.fit_transform(reviews_df.overall.values)
X_train, X_test, y_train, y_test = model_selection.train_test_split(X_transform, 
                                    y, test_size=0.2, random_state=42)
n_words = len(voc_processor.vocabulary_)
n_classes = len(lencoder.classes_)

In [7]:
def get_estimator_spec(input_logits, out_lb, train_predict_m,embedding_placeholder):
    preds_cls = tf.argmax(input_logits, 1)
    if train_predict_m == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(
        mode=train_predict_m,
        predictions={
            'pred_class': preds_cls,
            'pred_prob': tf.nn.softmax(input_logits)
        })
    tr_l = tf.losses.sparse_softmax_cross_entropy(labels=out_lb, logits=input_logits)
    if train_predict_m == tf.estimator.ModeKeys.TRAIN:
        adm_opt = tf.train.AdamOptimizer(learning_rate=0.01)
        tr_op = adm_opt.minimize(tr_l, global_step=tf.train.get_global_step())
        return tf.estimator.EstimatorSpec(train_predict_m, loss=tr_l, train_op=tr_op,
                                    scaffold=tf.train.Scaffold(init_feed_dict={embedding_placeholder: embeddings}))
    eval_metric_ops = {'accuracy': tf.metrics.accuracy(labels=labels, predictions=predicted_classes)}
    return tf.estimator.EstimatorSpec(train_predict_m, loss=tr_l, train_op=tr_op)

In [8]:
def rnn_model_fn(features,labels,mode):
    em_plholder = tf.placeholder(tf.float32, [voc_size, WD_EMB_SIZE])
    Wt = tf.Variable(em_plholder,trainable=False, name='Wt')
    comments_word_vec = tf.nn.embedding_lookup(Wt, features[REVIEW_FT])
    comments_wd_l = tf.unstack(comments_word_vec, axis=1)
    rnn_cell = tf.nn.rnn_cell.GRUCell(WD_EMB_SIZE)
    _, comments_encoding = tf.nn.static_rnn(rnn_cell, comments_wd_l, dtype=tf.float32)
    dense = tf.layers.dense(comments_encoding, units=512, activation=tf.nn.relu) 
    dropout = tf.layers.dropout(inputs=dense, rate=0.4,training=(mode==tf.estimator.ModeKeys.TRAIN))
    logits = tf.layers.dense(inputs=dropout, units=n_classes)
    return get_estimator_spec(input_logits=logits, out_lb=labels, train_predict_m=mode, 
                                                     embedding_placeholder=em_plholder)

In [9]:
run_config = tf.contrib.learn.RunConfig()
run_config = run_config.replace(model_dir='/tmp/models/',save_summary_steps=10,log_step_count_steps=10)
classifier = tf.estimator.Estimator(model_fn=rnn_model_fn,config=run_config)
train_input_fn = tf.estimator.inputs.numpy_input_fn(
      x={REVIEW_FT: X_train},
      y=y_train,
      batch_size=256,
      num_epochs=None,
      shuffle=True)
classifier.train(input_fn=train_input_fn, steps=200)

INFO:tensorflow:Using config: {'_task_type': None, '_task_id': 0, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x14549b5f8>, '_master': '', '_num_ps_replicas': 0, '_num_worker_replicas': 0, '_environment': 'local', '_is_chief': True, '_evaluation_master': '', '_tf_config': gpu_options {
  per_process_gpu_memory_fraction: 1
}
, '_tf_random_seed': None, '_save_summary_steps': 10, '_save_checkpoints_secs': 600, '_log_step_count_steps': 10, '_session_config': None, '_save_checkpoints_steps': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_model_dir': '/tmp/models/'}
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into /tmp/models/model.ckpt.
INFO:tensorflow:loss = 1.6175256, step = 1
INFO:tensorflow:global_step/sec: 0.59516
INFO:tensorflow:global_step/sec: 3.63984
INFO:tensorflow:global_step/sec: 3.70885
INFO:tensorflow:global_step/sec: 3.67699
INFO:tensorflow:global_step/sec: 3.63468
INFO:ten

KeyboardInterrupt: 

In [10]:
test_input_fn = tf.estimator.inputs.numpy_input_fn(
      x={REVIEW_FT: X_test},
      y=y_test,
      num_epochs=1,
      shuffle=False)
predictions = classifier.predict(input_fn=test_input_fn)
y_predicted = np.array(list(p['pred_class'] for p in predictions))
y_predicted = y_predicted.reshape(np.array(y_test).shape)

cls_mets = metrics.accuracy_score(y_test, y_predicted)
print('Accuracy: {0:f}'.format(cls_mets))
print(metrics.confusion_matrix(y_test,y_predicted))

INFO:tensorflow:Restoring parameters from /tmp/models/model.ckpt-1
Accuracy: 0.556812
[[   0    0    0    0   94]
 [   0    0    0    0  115]
 [   0    0    0    0  198]
 [   0    0    0    0  416]
 [   0    0    0    0 1034]]
