### imports

In [None]:
%matplotlib inline

import torch
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from random import randint
import matplotlib.pyplot as plt
from InferSent.models import InferSent

from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier as rfClf
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

In [3]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/runyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### embed words using Infersent

In [4]:
V = 2
MODEL_PATH = 'InferSent/encoder/infersent%s.pkl' % V
params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                'pool_type': 'max', 'dpout_model': 0.0, 'version': V}
infersent = InferSent(params_model)
infersent.load_state_dict(torch.load(MODEL_PATH))

In [5]:
model = infersent.cuda()

In [6]:
W2V_PATH = 'InferSent/crawl-300d-2M.vec'
model.set_w2v_path(W2V_PATH)

In [7]:
data = pd.read_csv('finaldata.csv')
data.head()

Unnamed: 0,question1,question2,is_duplicate,tokenSortRatio,tokenSetRatio,wratio,verb_diff,adj_diff,nn_diff,ne_diff
0,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,93,100,95,0.0,0.0,0.090909,1.0
1,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,63,86,86,1.0,1.0,0.230769,1.0
2,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,63,63,60,0.5,0.333333,0.666667,1.0
3,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0,24,28,27,0.666667,1.0,1.0,1.0
4,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0,47,67,86,1.0,0.333333,0.6,0.0


In [8]:
q1 = data.question1.values
q2 = data.question2.values
labels = data.is_duplicate.values

X = np.stack((q1, q2), axis=1)
y = labels
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)
Q1_train = X_train[:,0]
Q2_train = X_train[:,1]
Q1_test = X_test[:,0]
Q2_test = X_test[:,1]

In [9]:
questions = list(q1) + list(q2)
model.build_vocab(questions, tokenize=True)

Found 102976(/137513) words with w2v vectors
Vocab size : 102976


In [10]:
q1_train_embd = model.encode(Q1_train, tokenize=False, verbose=True)
q2_train_embd = model.encode(Q2_train, tokenize=False, verbose=True)
q1_test_embd = model.encode(Q1_test, tokenize=False, verbose=True)
q2_test_embd = model.encode(Q2_test, tokenize=False, verbose=True)

Nb words kept : 3254587/3664154 (88.8%)
Speed : 1763.9 sentences/s (gpu mode, bsize=64)
Nb words kept : 3316153/3729784 (88.9%)
Speed : 1608.1 sentences/s (gpu mode, bsize=64)
Nb words kept : 1393317/1568232 (88.8%)
Speed : 1751.1 sentences/s (gpu mode, bsize=64)
Nb words kept : 1421941/1599534 (88.9%)
Speed : 1784.6 sentences/s (gpu mode, bsize=64)


In [11]:
X_train = abs(q1_train_embd - q2_train_embd)
X_test = abs(q1_test_embd - q2_test_embd)

### function to plot confusion matrix

In [12]:
def plot_confusion_matrix(test_y, predict_y):
    C = confusion_matrix(test_y, predict_y)
    
    A =(((C.T)/(C.sum(axis=1))).T)
    B =(C/C.sum(axis=0))
    
    plt.figure(figsize=(20,4))
    
    labels = [1,2]
    # representing A in heatmap format
    cmap=sns.light_palette("blue")
    plt.subplot(1, 3, 1)
    sns.heatmap(C, annot=True, cmap=cmap, fmt=".3f", xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted Class')
    plt.ylabel('Original Class')
    plt.title("Confusion matrix")
    
    plt.subplot(1, 3, 2)
    sns.heatmap(B, annot=True, cmap=cmap, fmt=".3f", xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted Class')
    plt.ylabel('Original Class')
    plt.title("Precision matrix")
    
    plt.subplot(1, 3, 3)
    # representing B in heatmap format
    sns.heatmap(A, annot=True, cmap=cmap, fmt=".3f", xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted Class')
    plt.ylabel('Original Class')
    plt.title("Recall matrix")
    
    plt.show()

### train and predict on XGBoost model

In [None]:
%%time
clf = xgb.XGBClassifier(max_depth=10,learning_rate=0.131131,n_estimators=500,
                        min_child_weight=6,
                        reg_alpha=119.704012,reg_lambda=115.715236,
                        gamma=3.768808,colsample_bytree=0.911753,n_jobs=-1)
clf.fit(X_train, y_train)
predict_y = clf.predict_proba(X_test)
y_pred =np.argmax(predict_y,axis=1)
plot_confusion_matrix(y_test, y_pred)

### train and predict on Random Forest model

In [None]:
%%time
clf = rfClf(n_estimators=500, max_depth=10, random_state=0)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
plot_confusion_matrix(y_test, y_pred)

### import keras

In [None]:
import numpy as np
import pandas as pd
import datetime, time, json
from keras.models import Model
from keras.layers import Input, TimeDistributed, Dense, Lambda, concatenate, Dropout, BatchNormalization
from keras.layers.embeddings import Embedding
from keras.regularizers import l2
from keras.callbacks import Callback, ModelCheckpoint
from keras import backend as K
from sklearn.model_selection import train_test_split

from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import TimeDistributed

### create neural network

In [26]:
from keras.callbacks import EarlyStopping

MODEL_WEIGHTS_FILE = "best_weights.h5"
#early_stopping_monitor = EarlyStopping(patience=3)
callbacks = [ModelCheckpoint(MODEL_WEIGHTS_FILE, monitor='val_acc', save_best_only=True)]
model_mc = Sequential()

#add model layers
model_mc.add(Dense(200, activation='relu', input_shape=(4096,)))
model_mc.add(Dense(200, activation='relu'))
model_mc.add(Dense(200, activation='relu'))
model_mc.add(Dense(1))

model_mc.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
#train model
model_mc.fit(X_train, y_train, validation_split=0.1, epochs=10, callbacks=callbacks)

Train on 254700 samples, validate on 28300 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x20a3118ccf8>

In [27]:
model_mc.load_weights(MODEL_WEIGHTS_FILE)
y_pred = model_mc.predict(X_test)
y_pred =(y_pred>0.5)
list(y_pred)
confusion_matrix(y_test, y_pred)

array([[61371, 15137],
       [12730, 32049]], dtype=int64)

### calculate performance metrics

In [28]:
recall_score(y_test, y_pred), precision_score(y_test, y_pred), accuracy_score(y_test, y_pred)

(0.7157149556711851, 0.6792056966049251, 0.7702391847436246)

In [20]:
recall_score(y_test, y_pred)

0.7164742401572165

In [21]:
precision_score(y_test, y_pred)

0.6886389491081586

In [22]:
accuracy_score(y_test, y_pred)

0.7757220477050302