In [1]:
import numpy as np
import pandas as pd
import re
import tensorflow as tf
import datetime

In [2]:
# Load the pre-trained vocabulary and its embedding matrix.
# The vocabulary file holds byte strings, so each entry is decoded from UTF-8
# into a regular Python str while the array is turned into a plain list.
words = [raw_word.decode('UTF-8') for raw_word in np.load('wordsList.npy').tolist()]
word_vectors = np.load('wordVectors.npy')

In [3]:
# Example of word vector: find the position of "basketball" in the vocabulary
# list, then display the entry of the embedding matrix at that position.
word_index = words.index("basketball")
word_vectors[word_index]

array([ -2.36540008e+00,   1.33039999e+00,  -5.22069991e-01,
         9.64690030e-01,  -8.49120039e-03,  -7.19780028e-01,
        -1.08099997e+00,   2.04430008e-03,  -8.93989980e-01,
        -2.60939986e-01,   6.15069985e-01,  -2.86000013e-01,
        -9.25300002e-01,   1.37659997e-01,   9.63559985e-01,
         9.45629999e-02,   1.89659998e-01,   1.31579995e+00,
        -1.20379996e+00,  -1.15590002e-02,  -1.15310001e+00,
         2.84130007e-01,  -9.76380035e-02,   3.80719990e-01,
        -6.82439983e-01,  -1.31830001e+00,  -4.31840003e-01,
        -1.56760007e-01,  -2.56379992e-01,  -1.03460002e+00,
         1.89440000e+00,   1.21389997e+00,  -9.61180031e-02,
        -9.00200009e-01,   4.26369995e-01,   8.43020022e-01,
        -1.51069999e-01,   4.67790008e-01,   9.30479988e-02,
        -5.97299993e-01,  -7.08199978e-01,  -3.94320011e-01,
        -2.78549999e-01,   6.45139992e-01,  -1.38630003e-01,
         3.29470009e-01,   3.99260014e-01,   3.67320001e-01,
        -7.08760023e-01,

In [4]:
# Load training data from train.csv and preview the first rows.
# NOTE: the model-building cell further down rebinds the name `data` to a
# tensor, so this cell has to be re-run before re-running the cells that
# read the DataFrame.
data = pd.read_csv("train.csv")
data.head()

Unnamed: 0,User_ID,Description,Browser_Used,Device_Used,Is_Response
0,id10326,The room was kind of clean but had a VERY stro...,Edge,Mobile,not happy
1,id10327,I stayed at the Crown Plaza April -- - April -...,Internet Explorer,Mobile,not happy
2,id10328,I booked this hotel through Hotwire at the low...,Mozilla,Tablet,not happy
3,id10329,Stayed here with husband and sons on the way t...,InternetExplorer,Desktop,happy
4,id10330,My girlfriends and I stayed here to celebrate ...,Edge,Tablet,not happy


In [5]:
# One-hot encode the labels.
# pd.get_dummies creates one indicator column per distinct label, ordered by
# sorted label value. Assuming the only labels are 'happy' and 'not happy',
# column 0 is 'happy' and column 1 is 'not happy':
#   [1, 0] -> happy, [0, 1] -> not happy.
m = data['Is_Response'].shape[0] # number of examples
Y_train = pd.get_dummies(data['Is_Response']).values

In [6]:
# Clean the comments: lowercase each one, then delete every character that is
# not an ASCII letter, a digit or a space.
X_data = data['Description']
remove_special_chars = re.compile("[^A-Za-z0-9 ]+")
X_train = X_data.apply(lambda review: remove_special_chars.sub("", review.lower()))

In [7]:
# Count the whitespace-separated words of every cleaned comment.
# The distribution of these counts is inspected below to pick the sequence length.
words_count = X_train.str.split().str.len() # number of words in each comment
words_count.head()

0     46
1    202
2    228
3     93
4    286
Name: Description, dtype: int64

In [8]:
# Summary statistics (mean, quartiles, max, ...) of the per-comment word counts.
words_count.describe()

count    38932.000000
mean       154.069557
std        128.114708
min          4.000000
25%         72.000000
50%        120.000000
75%        195.000000
max       2275.000000
Name: Description, dtype: float64

In [9]:
# Number of word positions per comment; it is the second dimension of the
# integerized comment matrix and of the model's input placeholder.
seq_length = 250 # 250 is a reasonable choice: 75% of the comments have <= 195 words

In [10]:
def integerize_comments(comments, vocabulary, seq_length, unknown_index):
    """Convert cleaned comments into a fixed-width matrix of vocabulary indices.

    Parameters
    ----------
    comments : iterable of str
        Cleaned comments; each one is split on whitespace.
    vocabulary : sequence of str
        Vocabulary list. A word's id is the position of its first occurrence,
        which is what ``vocabulary.index(word)`` would return.
    seq_length : int
        Number of word positions kept per comment. Longer comments are
        truncated; shorter ones are right-padded with 0.
    unknown_index : int
        Id assigned to words that are not in ``vocabulary``.

    Returns
    -------
    numpy.ndarray
        int32 array of shape (number of comments, seq_length).
    """
    # Build the lookup table once; calling list.index() per word would scan
    # the whole vocabulary every time.
    word_to_index = {}
    for index, word in enumerate(vocabulary):
        word_to_index.setdefault(word, index)

    comments = list(comments)
    comment_ids = np.zeros((len(comments), seq_length), dtype='int32')
    for row, comment in enumerate(comments):
        for column, word in enumerate(comment.split()[:seq_length]):
            comment_ids[row, column] = word_to_index.get(word, unknown_index)
    return comment_ids


m = len(X_train) # number of examples
# Id used for out-of-vocabulary words (presumably the last row of the
# embedding matrix -- TODO confirm). The misspelled name is kept as-is.
uknown_vector_index = 399999
try:
    # Reuse the cached integerized comments when they are available.
    X_train_int = np.load('integerized_comments.npy')
except FileNotFoundError:
    # No cache yet: compute the matrix and store it for the next run.
    X_train_int = integerize_comments(X_train, words, seq_length, uknown_vector_index)
    np.save('integerized_comments', X_train_int) # np.save appends the .npy extension

In [None]:
# Building the model
# NOTE: split_batches() must already be defined when this cell runs; its
# definition sits in a later cell of this notebook, so run that cell first.
batch_size = 1216 # examples per minibatch; the placeholders below have this fixed size
num_batches = 32
num_lstm_units = 64
num_classes = 2
iterations = 100000 # number of full passes over all minibatches
word_vec_dimensions = 50 # the loaded word2vec has 50 dimensions

# split_batches() produces batches of len(X) // num_batches rows. The
# placeholders have a fixed first dimension, so fail early with a clear
# message instead of a feed-shape error inside sess.run().
if len(X_train_int) // num_batches != batch_size:
    raise ValueError("batch_size (%d) must equal len(X_train_int) // num_batches (%d)"
                     % (batch_size, len(X_train_int) // num_batches))

tf.reset_default_graph()

Y = tf.placeholder(tf.float32, [batch_size, num_classes]) # labels
X = tf.placeholder(tf.int32, [batch_size, seq_length]) # input_data

# Replace every word id by its embedding vector.
# (This rebinds the name `data`, which earlier held the training DataFrame.)
data = tf.nn.embedding_lookup(word_vectors, X)

lstm_cell = tf.contrib.rnn.BasicLSTMCell(num_lstm_units)
lstm_cell = tf.contrib.rnn.DropoutWrapper(cell=lstm_cell, output_keep_prob=0.75)
value, _ = tf.nn.dynamic_rnn(lstm_cell, data, dtype=tf.float32)

# NOTE: weight and bias rely on TensorFlow's auto-generated variable names, so
# a checkpoint can only be restored into a graph built by this exact cell.
weight = tf.Variable(tf.truncated_normal([num_lstm_units, num_classes]))
bias = tf.Variable(tf.constant(0.1, shape=[num_classes]))

# Move the time axis first, then keep only the output of the last time step.
value = tf.transpose(value, [1, 0, 2])
last_layer = tf.gather(value, int(value.get_shape()[0]) - 1)
prediction = tf.add((tf.matmul(last_layer, weight)), bias)

correct_pred = tf.equal(tf.argmax(prediction, 1), tf.argmax(Y, 1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=Y))
optimizer = tf.train.AdamOptimizer().minimize(cost)

sess = tf.InteractiveSession()
saver = tf.train.Saver()
sess.run(tf.global_variables_initializer())

# Supervising the training with TensorBoard summaries
tf.summary.scalar('Loss', cost)
tf.summary.scalar('Accuracy', accuracy)
merged = tf.summary.merge_all()
logdir = "tensorboard/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + "/"
writer = tf.summary.FileWriter(logdir, sess.graph)

minibatches = split_batches(X_train_int, Y_train, num_batches)
try:
    for i in range(iterations):
        # One pass over every minibatch of reviews
        for minibatch in minibatches:
            X_minibatch, Y_minibatch = minibatch
            _ , minibatch_cost = sess.run([optimizer, cost], {X: X_minibatch, Y: Y_minibatch})

        # Save the network every 100 training iterations
        if (i % 100 == 0 and i != 0):
            save_path = saver.save(sess, "model.ckpt", global_step=i)
            print("saved to %s" % save_path)
        # Write summary to Tensorboard (evaluated on the last minibatch of this pass)
        summary = sess.run(merged, {X: X_minibatch, Y: Y_minibatch})
        writer.add_summary(summary, i)
finally:
    # Close the event file even when the long training run is interrupted.
    writer.close()


saved to model.ckpt-100


In [13]:
def split_batches(X, Y, num_batches):
    """Split the examples into ``num_batches`` equally sized minibatches.

    Every batch holds ``len(X) // num_batches`` consecutive rows. Rows left
    over when the division is not exact are dropped, so that all batches
    have the same number of rows.

    Parameters
    ----------
    X : 2-D array
        Inputs, one example per row.
    Y : 2-D array
        Labels, one example per row, aligned with ``X``.
    num_batches : int
        Number of batches to produce; must be positive.

    Returns
    -------
    list of tuple
        ``num_batches`` pairs ``(X_batch, Y_batch)`` of row slices of ``X`` and ``Y``.

    Raises
    ------
    ValueError
        If ``num_batches`` is not a positive integer.
    """
    if num_batches < 1:
        raise ValueError("num_batches must be a positive integer, got %r" % (num_batches,))

    examples_per_batch = len(X) // num_batches
    minibatches = []
    for batch_index in range(num_batches):
        start = batch_index * examples_per_batch
        stop = start + examples_per_batch
        minibatches.append((X[start:stop, :], Y[start:stop, :]))
    return minibatches