In [None]:
from numpy import array
from numpy import asarray
from numpy import zeros
import numpy as np
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer
import re
import string
import tensorflow as tf
from tensorflow.contrib import rnn
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import gzip

In [None]:
# Read the training data and keep an untouched copy (`dataset1`) that the
# inference cell later fills with predictions and writes to results.csv.
dataset = pd.read_csv('./dataset/train.csv')
dataset1 = dataset.copy(deep=True)

In [None]:
# Preview the first rows of the raw dataset
dataset.head()

In [None]:
# List the column names of the dataset
dataset.columns

In [None]:
# Dataset shape as (rows, columns)
dataset.shape

In [None]:
# Summary statistics from DataFrame.describe(); this summarises the frame as a
# whole, not only the target column.
dataset.describe()

In [None]:
# Box plot of the `score` column, used to eyeball outliers
score_axes = dataset['score'].plot.box()
plt.show()

In [None]:
# Count null values per column
dataset.isnull().sum()

In [None]:
#Remove outliers
# Rows whose last-column value lies more than 2*IQR below the first quartile or
# above the third quartile are dropped.
print(dataset.shape)
columns=dataset.columns
rows_before = dataset.shape[0]
last_column = dataset.iloc[:, -1]
q25, q75 = last_column.quantile(0.25), last_column.quantile(0.75)
iqr = q75 - q25
# Deliberately NOT named `min`/`max`: rebinding those names would shadow the
# builtins for every later cell (the training loop compares losses).
lower_bound = q25 - (iqr * 2.0)
upper_bound = q75 + (iqr * 2.0)
# Missing values compare False on both sides, so such rows are kept.
outlier_mask = (last_column < lower_bound) | (last_column > upper_bound)
dataset = dataset.drop(dataset.index[outlier_mask])
# Report how many rows were actually removed (not how many remain).
print("Outliers removed=", rows_before - dataset.shape[0])

In [None]:
# Summary statistics after outlier removal. These are used for initializing the
# weights of the NN.
dataset.describe()

In [None]:
# Keep at most the first 39900 rows (all columns) for training
dataset = dataset.iloc[:39900]
dataset.shape

# Text transformation

In [None]:

"""Remove stop words from list of tokenized words"""
# Stop-word set, loaded from the NLTK stopwords corpus entry 'english_for_lstm'
stop_words = set(stopwords.words('english_for_lstm'))

# Stemmer instance (the cleaning cell below only uses the lemmatizer)
stemmer = LancasterStemmer()

# Lemmatizer applied to every surviving token in the cleaning cell
lemmatizer = WordNetLemmatizer()


In [None]:
#Clean the text in column 1: lower-case it, strip every character that is not
#a letter, digit or space, drop stop words and lemmatize the remaining tokens.
#(Prepending the parent comment from column 3 was tried earlier and is disabled.)

# `+` instead of `*`: same result, but the pattern no longer matches the empty
# string at every position.
_NON_ALNUM_RE = re.compile('[^a-zA-Z0-9 ]+')


def _clean_comment(text):
    """Return `text` lower-cased, stripped of non-alphanumerics and stop words, lemmatized."""
    stripped = _NON_ALNUM_RE.sub('', text.lower())
    return ' '.join(lemmatizer.lemmatize(word)
                    for word in stripped.split()
                    if word not in stop_words)


# Build the whole cleaned column once and assign it in a single operation
# instead of writing one cell at a time through .iloc inside a Python loop.
dataset.iloc[:, 1] = [_clean_comment(text) for text in dataset.iloc[:, 1]]

In [None]:
# Fit a Keras tokenizer on the cleaned text column, then turn every document
# into a fixed-length sequence of word ids.
t = Tokenizer()
cleaned_texts = dataset.iloc[:,1]
t.fit_on_texts(cleaned_texts)

# One more than the number of indexed words; the embedding matrix built below
# has this many rows.
vocab_size = 1 + len(t.word_index)
print("Vocab size=",vocab_size)

# Every document becomes exactly `max_length` ids, padded at the end.
max_length = 200
encoded_docs = t.texts_to_sequences(cleaned_texts)
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
print(padded_docs.shape)

# Tokenizer.word_docs, used below as the membership filter when reading GloVe
words = t.word_docs

In [None]:
#Create the embedding matrix
# NOTE(review): absolute, machine-specific path; point this at your local copy.
GLOVE_PATH = '/home/sanket/nltk_data/glove.840B.300d.txt'
EMBEDDING_DIM = 300

embeddings_index = dict()
# `with` guarantees the file is closed even if parsing raises; the encoding is
# explicit so the result does not depend on the platform default.
with open(GLOVE_PATH, encoding='utf-8') as f:
    for line in f:
        # Lines are lower-cased before the lookup, so differently-cased entries
        # of the same word map to one key and the last one read wins.
        values = line.strip().lower().split()
        if not values:
            # Blank line: nothing to parse.
            continue
        word = values[0]
        if word not in words:
            continue
        try:
            coefs = asarray(values[1:], dtype='float32')
        except ValueError as e:
            # Non-numeric fields, e.g. when the token itself contains spaces.
            print(e)
            continue
        if coefs.shape[0] != EMBEDDING_DIM:
            # A wrong-length vector would make the matrix fill below raise.
            print('Skipping %r: expected %d values, got %d'
                  % (word, EMBEDDING_DIM, coefs.shape[0]))
            continue
        embeddings_index[word] = coefs

print('Loaded %s word vectors.' % len(embeddings_index))
# create a weight matrix for words in training docs
# Rows for words without a pre-trained vector stay all-zero.
embedding_matrix = zeros((vocab_size, EMBEDDING_DIM))
for word, i in t.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector


In [None]:
#Function to generate the input matrix for NN
def get_input(x):
    """Map word ids to their embedding vectors.

    Args:
        x: integer word ids; reshaped to ``(-1, max_length)`` so both a 2-D
            batch and a flat sequence of ids are accepted.

    Returns:
        A new float64 array of shape ``(batch, max_length, embedding_width)``
        where entry ``[k, j]`` is ``embedding_matrix[x[k, j]]``.
    """
    token_ids = np.reshape(x, [-1, max_length])
    width = embedding_matrix.shape[1]
    if token_ids.size == 0:
        # Empty batch: nothing to look up, return an empty tensor of the right rank.
        return zeros(tuple(token_ids.shape) + (width,))
    # A single integer-array lookup replaces the nested Python loops; indexing
    # with an integer array yields a fresh array, not a view.
    return np.asarray(embedding_matrix[token_ids], dtype=np.float64)

# Defining the NN Graph

In [None]:
# Build the TF1 graph from scratch: placeholders, dense-layer variables, the
# recurrent model, the loss and the optimizer. Later cells restore checkpoints
# into this graph, so keep the construction order and variable names stable.
tf.reset_default_graph()
learning_rate = 0.0001
# Number of outer passes over X_train in the training cell.
training_steps = 50
batch_size = 50
# The training cell logs losses whenever the batch offset i satisfies
# i % display_step == 0.
display_step = 50

# Network Parameters
# Matches the 300-column embedding matrix built above.
num_input =  300
timesteps=max_length
num_hidden_layer_1 = 1024
num_hidden_layer_2 = 512
num_hidden_layer_3=256
# Single regression output.
num_classes =  1

# tf Graph input
X_tensor = tf.placeholder("float", [None, timesteps, num_input])
Y_tensor = tf.placeholder("float", [None, num_classes])

# Define weights
# NOTE(review): the output weights are drawn with mean=2, stddev=3 --
# presumably derived from the dataset statistics shown earlier; confirm.
weights = {
    'layer3': tf.Variable(tf.random.normal([num_hidden_layer_2, num_hidden_layer_3]),name="layer3_weight"),
    'out': tf.Variable(tf.random.normal([num_hidden_layer_3, num_classes],mean=2,stddev=3),name="out_weight")
}
biases = {
    'out': tf.Variable(tf.random.normal([num_classes]),name="out_bias"),
    'layer3':tf.Variable(tf.random.normal([num_hidden_layer_3]),name="layer3_bias")
}


def Model(x, weights, biases):
    """Build the regression network and return its output tensor.

    The input is unstacked along axis 1 into `timesteps` tensors, passed
    through a bidirectional LSTM (`num_hidden_layer_1` units per direction),
    then through a second LSTM (`num_hidden_layer_2` units). Dropout is applied
    to that output, `output[-1]` is fed to the 'layer3' dense layer with a
    leaky ReLU, and the result goes through the linear 'out' layer.

    Args:
        x: input tensor; `X_tensor` is passed here, declared as
            [None, timesteps, num_input].
        weights: dict with 'layer3' and 'out' weight variables.
        biases: dict with 'layer3' and 'out' bias variables.

    Returns:
        The tensor `matmul(hidden, weights['out']) + biases['out']`.

    The print() calls run once, while the graph is being built.
    """

    print(x.shape)
    x=tf.unstack(x,timesteps,1)
    # Forward direction cell
    lstm_fw_cell = rnn.BasicLSTMCell(num_hidden_layer_1, forget_bias=0.2,reuse=tf.AUTO_REUSE)
    # Backward direction cell
    lstm_bw_cell = rnn.BasicLSTMCell(num_hidden_layer_1, forget_bias=0.2,reuse=tf.AUTO_REUSE)
    # Define a lstm cell with tensorflow
    lstm_cell = rnn.BasicLSTMCell(num_hidden_layer_2, forget_bias=0)
    
    # Get lstm cell output
    outputs, states_f, states_b = rnn.static_bidirectional_rnn(cell_fw=lstm_fw_cell,
                                                   cell_bw=lstm_bw_cell,
                                                   inputs=x,
                                                   dtype=tf.float32)
    print(len(outputs), type(outputs)) 
    output, states=rnn.static_rnn(lstm_cell, outputs, dtype=tf.float32)
    # NOTE(review): keep_prob is a literal, so dropout is also active whenever
    # loss_op / prediction are evaluated for validation and inference; there is
    # no train/eval switch.
    output=tf.nn.dropout(output,keep_prob=0.7)
    print(output, type(output))
    # output[-1]: presumably the last time step of the second LSTM -- TODO confirm.
    outputs=tf.matmul(output[-1], weights['layer3']) + biases['layer3']
    print(outputs, type(outputs)) 
    outputs=tf.nn.leaky_relu(outputs,alpha=0.9)
    print(outputs, type(outputs)) 
    return tf.matmul(outputs, weights['out']) + biases['out']

logits = Model(X_tensor, weights, biases)
# Regression: the raw network output is the prediction.
prediction =logits

# Define loss and optimizer
# Square root of the mean squared error, i.e. RMSE.
# NOTE(review): TF1 documents the signature as (labels, predictions); the
# arguments here are in the opposite order, which should be harmless for a
# symmetric loss -- confirm.
loss_op = tf.sqrt(tf.losses.mean_squared_error(prediction,Y_tensor))
print(loss_op)
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate,epsilon=0.1)

train_op = optimizer.minimize(loss_op)

# Created here, but the training cell restores a checkpoint instead of running it.
init = tf.global_variables_initializer()


In [None]:
# Hold out 30% of the padded documents (with their scores) for validation;
# the fixed random_state keeps the split reproducible.
X_train, X_validation, Y_train, Y_Validation = train_test_split(
    padded_docs,
    dataset.score,
    test_size=0.3,
    random_state=50,
)
print(*(np.array(part).shape for part in (X_train, Y_train, X_validation, Y_Validation)))
# Loss histories appended to by the training loop.
train_loss, valid_loss = [], []

# Train the NN

In [None]:
#Function to calculate the training and validation error
def cal_error():
    """Print mean losses over consecutive groups of logged values.

    `train_loss` is averaged in groups of ``len(X_train) // batch_size``
    entries and `valid_loss` in groups of ``len(X_validation) // batch_size``
    entries. One mean is printed per group, under a heading for each list.
    """
    def _print_group_means(losses, group_size):
        # A group size of 0 (fewer samples than one batch) would make range() raise.
        group_size = max(group_size, 1)
        for start in range(0, len(losses), group_size):
            group = losses[start:start + group_size]
            # Divide by the real group length so a trailing partial group is
            # averaged correctly instead of being under-reported.
            print(sum(group) / len(group))

    print('Training Error:')
    _print_group_means(train_loss, len(X_train) // batch_size)
    print('\nValidation Errors:')
    _print_group_means(valid_loss, len(X_validation) // batch_size)

In [None]:
# Start training
config = tf.ConfigProto(intra_op_parallelism_threads=4, inter_op_parallelism_threads=4,
                        allow_soft_placement=True, device_count = {'CPU': 3})
sess= tf.Session(config=config)
# Best (lowest) training batch loss seen so far; starts as a large sentinel.
pre_loss=10000000000
# The initializer is intentionally not run: weights come from a checkpoint.
# sess.run(init)
saver = tf.train.Saver()
saver.restore(sess,'./checkpooint_01_04_19_d.ckpt')

len_validation_set=len(X_validation)

for step in range(training_steps):
    print("steps=",step)
    for i in range(0, len(X_train), batch_size):

        # reshape with -1 so the final, shorter batch of a pass does not raise.
        batch_x = get_input(X_train[i:i+batch_size]).reshape((-1, timesteps, num_input))
        batch_y = np.array(Y_train[i:i+batch_size]).reshape((-1,1))

        # Run optimization op (backprop)
        sess.run(train_op, feed_dict={X_tensor: batch_x,
                                      Y_tensor: batch_y})
        if i % display_step== 0:
            # The validation batch is only needed on logging steps; it cycles
            # through the validation set in step with the training offset.
            j=i%len_validation_set
            batch_x_val = get_input(X_validation[j:j+batch_size]).reshape((-1, timesteps, num_input))
            batch_y_val = np.array(Y_Validation[j:j+batch_size]).reshape((-1,1))

            # Calculate batch loss on the current training and validation batches
            loss = sess.run(loss_op, feed_dict={X_tensor: batch_x,
                                                Y_tensor: batch_y})
            val_loss = sess.run(loss_op, feed_dict={X_tensor: batch_x_val,
                                                    Y_tensor: batch_y_val})
            # Checkpoint when the training loss improves, and also whenever
            # i % 1000 == 0. A plain comparison is used instead of builtin
            # min(), which an earlier cell may have rebound to a number.
            improved = pre_loss > loss
            if improved:
                pre_loss = loss
            if improved or i%1000==0:
                saver.save(sess,'./checkpooint_01_04_19_e.ckpt')
            train_loss.append(loss)
            valid_loss.append(val_loss)

            if i%1500==0:
                print("batch=",i)

                plt.plot(train_loss)
                plt.show()

                plt.plot(valid_loss)
                plt.show()

                cal_error()
print("Optimization Finished!")

In [None]:
# Write both loss histories to plain-text files, then draw them on one figure
for file_name, losses in (('train_loss', train_loss), ('validation_loss', valid_loss)):
    np.savetxt(file_name, np.array(losses))
for losses in (train_loss, valid_loss):
    plt.plot(losses)
plt.show()

In [None]:
# Close the training session; the evaluation section opens a new one
sess.close()

# Evaluate the model

In [None]:
# Open a fresh session with the same CPU/thread settings used for training and
# load the weights saved by the training loop.
config = tf.ConfigProto(
    intra_op_parallelism_threads=4,
    inter_op_parallelism_threads=4,
    allow_soft_placement=True,
    device_count={'CPU': 3},
)
sess = tf.Session(config=config)
saver = tf.train.Saver()
saver.restore(sess, './checkpooint_01_04_19_e.ckpt')

In [None]:
# Predict a score for every encoded document and write the rounded values into
# the first column of `dataset1`, then export the frame.
batch=200
first_column = dataset1.columns[0]
# Iterate over the rows that were actually encoded: `dataset1` is the
# unfiltered copy and can be longer than `padded_docs`. Rows of `dataset1` that
# were filtered out earlier receive no prediction and keep their original value.
for p in range(0, len(padded_docs), batch):
    # reshape with -1 so the final, shorter batch does not raise.
    batch_x = get_input(padded_docs[p:p+batch]).reshape((-1, timesteps, num_input))
    a=sess.run(prediction, feed_dict={X_tensor: batch_x})
    # Round in NumPy; building a tf.math.round op here would add a new node to
    # the graph on every iteration.
    b=np.round(a)
    # `padded_docs` follows the row order of `dataset`, whose index labels are
    # a subset of `dataset1`'s, so write by label to keep rows aligned.
    dataset1.loc[dataset.index[p:p+batch], first_column] = b.ravel()
dataset1.to_csv('results.csv',index=False)
print("File Created!!!")

In [None]:
# Preview the exported results frame
dataset1.head()

# Configurations Used

# Finally, the test loss (RMSE) is 3.41 and the training loss (RMSE) is 3.39, giving a final score of max(0, 100 - 3.41) = 96.59.