### imports

In [1]:
import pandas as pd
import numpy as np
import scipy as sp
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier as rfClf
from IPython.display import display
from pprint import pprint
from tqdm import tqdm_notebook
from sklearn.preprocessing import normalize

def zprint():
    print('{}{}'.format('*' * 80, '\n'))


### load data

In [2]:
data = pd.read_csv('./data-quora-question-pairs/train.csv', dtype={'question1': str, 'question1': str,},)


In [3]:
print(data.shape)

y = list(data['is_duplicate'])
print(len(y))
display(y[:10])

display(data.head())
zprint()

X = data[['question1','question2']]
X['question1'].value = X['question1'].astype(str)
X['question2'].value = X['question2'].astype(str)
print(X.shape)
display(X.head())


(404290, 6)
404290


[0, 0, 0, 0, 0, 1, 0, 1, 0, 0]

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


********************************************************************************

(404290, 2)


Unnamed: 0,question1,question2
0,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...
1,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...
2,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...
3,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...
4,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?


### split into train and test

In [4]:
_X = X.copy()
_X['questions'] = X['question1'].astype(str) + ' ' + X['question2'].astype(str)
_X['questions'].value = _X['questions'].astype(str)

print(_X.shape)
display(_X.head())

__X = list(_X['questions'])
print(len(__X))
display(__X[0])


(404290, 3)


Unnamed: 0,question1,question2,questions
0,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...
1,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,What is the story of Kohinoor (Koh-i-Noor) Dia...
2,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,How can I increase the speed of my internet co...
3,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,Why am I mentally very lonely? How can I solve...
4,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,"Which one dissolve in water quikly sugar, salt..."


404290


'What is the step by step guide to invest in share market in india? What is the step by step guide to invest in share market?'

In [5]:
X_train, X_test, y_train, y_test = train_test_split(_X, y, train_size=0.7, shuffle=True, stratify=y)
print(X_train.shape)
print(len(y_train))
print(X_test.shape)
print(len(y_test))




(283003, 3)
283003
(121287, 3)
121287


### TF-IDF Random Forest model

In [6]:
vectorizer = TfidfVectorizer(lowercase=True,
                             stop_words='english', 
                             norm='l2', 
                             smooth_idf=True, 
                             use_idf=True, 
                             binary=False, 
                             strip_accents='unicode',
                            )
Xv_train = vectorizer.fit_transform(X_train['questions'])


In [7]:
print(Xv_train.shape)
print(Xv_train[0])


(283003, 73597)
  (0, 71188)	0.06790018537186837
  (0, 59537)	0.1442832554076246
  (0, 46906)	0.1708460544191791
  (0, 20937)	0.09024120931970257
  (0, 66049)	0.08487759551344401
  (0, 51676)	0.23671499058180598
  (0, 26302)	0.10888736032010338
  (0, 35865)	0.34531488120586484
  (0, 15327)	0.2345873980946561
  (0, 33009)	0.08714162210324544
  (0, 59991)	0.2939266541890618
  (0, 31628)	0.08139937082681085
  (0, 12443)	0.10089428705183909
  (0, 68334)	0.26171779135948436
  (0, 71517)	0.242404245177812
  (0, 65333)	0.06762531483548484
  (0, 38156)	0.31697572709689537
  (0, 71238)	0.16043911661559446
  (0, 65509)	0.18270646638233345
  (0, 54135)	0.2581982814876192
  (0, 7422)	0.16483589917272556
  (0, 2927)	0.3606815737694713
  (0, 50346)	0.24844744916058342


### train

In [8]:
clf = rfClf(n_estimators=10,
            criterion='entropy', 
            max_depth=None, 
            min_samples_split=2, 
            min_samples_leaf=1, 
            min_weight_fraction_leaf=0.0,
            max_features=50,
            max_leaf_nodes=None,
            min_impurity_decrease=0.0,
            bootstrap=True,
            n_jobs=2,
            warm_start=False,
            class_weight=None,
           )


In [9]:
Xv1_train = vectorizer.transform(X_train['question1'].astype(str))
Xv2_train = vectorizer.transform(X_train['question2'].astype(str))
Xvc_train = sp.sparse.hstack([Xv1_train,Xv2_train], format='csr')

print(Xv1_train.shape)
print(Xv2_train.shape)
zprint()

print(Xvc_train.shape)
print(Xvc_train)


(283003, 73597)
(283003, 73597)
********************************************************************************

(283003, 147194)
  (0, 15327)	0.3681122733694606
  (0, 20937)	0.1416056317777191
  (0, 26302)	0.17086499136010472
  (0, 33009)	0.13674178953374416
  (0, 35865)	0.5418647675938045
  (0, 46906)	0.2680899741386375
  (0, 51676)	0.3714508708969054
  (0, 59537)	0.22640788716116936
  (0, 59991)	0.4612268593974383
  (0, 66049)	0.13318910093363262
  (0, 71188)	0.10654831334699752
  (0, 76524)	0.46802891608843694
  (0, 81019)	0.21389495009684792
  (0, 86040)	0.13092280627168754
  (0, 105225)	0.10562574322885368
  (0, 111753)	0.4113151787297126
  (0, 123943)	0.3223912691749686
  (0, 127732)	0.335044179156723
  (0, 138930)	0.08775220334050296
  (0, 139106)	0.23708421955019718
  (0, 141931)	0.3396111781671699
  (0, 144835)	0.2081895813612499
  (0, 145114)	0.3145494651698502
  (1, 20006)	0.19147178714930443
  (1, 33009)	0.09438529531038886
  :	:
  (283002, 20006)	0.33166276402617295
  (2

In [10]:
%%time
clf.fit(Xvc_train, y_train)


CPU times: user 5min 49s, sys: 1.44 s, total: 5min 50s
Wall time: 12min 52s


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features=50, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=2,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [11]:
y_pred_train = clf.predict(Xvc_train)


In [12]:
averages = ['binary','micro','macro','weighted']
for average in averages:
    accuracy = metrics.accuracy_score(y_train, y_pred_train)
    precision = metrics.precision_score(y_train, y_pred_train, average=average)
    recall = metrics.recall_score(y_train, y_pred_train, average=average)
    print(average)
    print(accuracy)
    print(precision)
    print(recall)
    zprint()


binary
0.9877351123486323
0.9977137676517832
0.9690000382833736
********************************************************************************

micro
0.9877351123486323
0.9877351123486323
0.9877351123486323
********************************************************************************

macro
0.9877351123486323
0.9899352968355982
0.9838502283631142
********************************************************************************

weighted
0.9877351123486323
0.9879004099726454
0.9877351123486323
********************************************************************************



In [13]:
Xv1_test = vectorizer.transform(X_test['question1'].astype(str))
Xv2_test = vectorizer.transform(X_test['question2'].astype(str))
Xvc_test = sp.sparse.hstack([Xv1_test,Xv2_test], format='csr')

print(Xv1_test.shape)
print(Xv2_test.shape)
zprint()

print(Xvc_test.shape)
print(Xvc_test[0])


(121287, 73597)
(121287, 73597)
********************************************************************************

(121287, 147194)
  (0, 33009)	0.10724563278477903
  (0, 33244)	0.2015294382182401
  (0, 34651)	0.08951341190416477
  (0, 39218)	0.42373622112669074
  (0, 47107)	0.35437336474313336
  (0, 48164)	0.3178029868404363
  (0, 54801)	0.3953985999258331
  (0, 62656)	0.4139928371624464
  (0, 63564)	0.4037346471780976
  (0, 71188)	0.08356509978414339
  (0, 72683)	0.18980276713322777
  (0, 78185)	0.28812116983035174
  (0, 94534)	0.10264765995552425
  (0, 106606)	0.09912193842546141
  (0, 106841)	0.18626388830279103
  (0, 120215)	0.10715333277150692
  (0, 121761)	0.29372989160539786
  (0, 123943)	0.2826042499929678
  (0, 136253)	0.38263350635590765
  (0, 137161)	0.37315235873635055
  (0, 138735)	0.40543316282135805
  (0, 138918)	0.1633114537054273
  (0, 139132)	0.22252138169129151
  (0, 139828)	0.36659092188940506
  (0, 144785)	0.07723516995740076
  (0, 146260)	0.1331852181527636


In [75]:
y_pred = clf.predict(Xvc_test)


In [86]:
averages = ['binary','micro','macro','weighted']
for average in averages:
    accuracy = metrics.accuracy_score(y_test, y_pred)
    precision = metrics.precision_score(y_test, y_pred, average=average)
    recall = metrics.recall_score(y_test, y_pred, average=average)
    print(average)
    print(accuracy)
    print(precision)
    print(recall)
    zprint()


binary
0.7882955304360731
0.7916870266308331
0.5789097568056455
********************************************************************************

micro
0.7882955304360731
0.7882955304360731
0.7882955304360731
********************************************************************************

macro
0.7882955304360731
0.7893641755925023
0.7448778407074184
********************************************************************************

weighted
0.7882955304360731
0.7887565116170128
0.7882955304360731
********************************************************************************



### bag of words Random Forest model

In [61]:
vectorizer_2 = TfidfVectorizer(lowercase=True,
                             stop_words='english', 
                             norm='l2', 
                             smooth_idf=True, 
                             use_idf=False, 
                             binary=False, 
                             strip_accents='unicode',
                            )
Xv_train_2 = vectorizer_2.fit_transform(X_train['questions'])


In [77]:
print(Xv_train_2)


  (0, 1074)	0.1796053020267749
  (0, 24390)	0.1796053020267749
  (0, 57749)	0.1796053020267749
  (0, 12128)	0.1796053020267749
  (0, 4509)	0.1796053020267749
  (0, 6998)	0.1796053020267749
  (0, 35539)	0.1796053020267749
  (0, 9867)	0.3592106040535498
  (0, 65266)	0.3592106040535498
  (0, 33103)	0.3592106040535498
  (0, 41484)	0.1796053020267749
  (0, 4512)	0.1796053020267749
  (0, 33803)	0.1796053020267749
  (0, 21053)	0.1796053020267749
  (0, 34745)	0.3592106040535498
  (0, 71159)	0.3592106040535498
  (1, 46311)	0.2
  (1, 47290)	0.4
  (1, 71877)	0.4
  (1, 45281)	0.4
  (1, 65266)	0.4
  (1, 34745)	0.4
  (1, 71159)	0.4
  (2, 45946)	0.17407765595569785
  (2, 47243)	0.17407765595569785
  :	:
  (283001, 63522)	0.47140452079103173
  (283001, 72654)	0.23570226039551587
  (283001, 20999)	0.23570226039551587
  (283001, 12128)	0.23570226039551587
  (283001, 65266)	0.23570226039551587
  (283001, 41484)	0.23570226039551587
  (283001, 34745)	0.23570226039551587
  (283001, 71159)	0.4714045207910317

In [62]:
clf_2 = rfClf(n_estimators=10,
            criterion='entropy', 
            max_depth=None, 
            min_samples_split=2, 
            min_samples_leaf=1, 
            min_weight_fraction_leaf=0.0,
            max_features=50,
            max_leaf_nodes=None,
            min_impurity_decrease=0.0,
            bootstrap=True,
            n_jobs=2,
            warm_start=False,
            class_weight=None,
           )


In [63]:
%%time
clf_2.fit(Xvc_train, y_train)


CPU times: user 5min 56s, sys: 2.01 s, total: 5min 58s
Wall time: 2min 56s


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features=50, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=2,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [64]:
y_pred_train_2 = clf_2.predict(Xvc_train)


In [90]:
averages = ['binary','micro','macro','weighted']
for average in averages:
    accuracy = metrics.accuracy_score(y_train, y_pred_train_2)
    precision = metrics.precision_score(y_train, y_pred_train_2, average=average)
    recall = metrics.recall_score(y_train, y_pred_train_2, average=average)
    print(average)
    print(accuracy)
    print(precision)
    print(recall)
    zprint()


binary
0.9876821093769323
0.9978115573431123
0.9687607671988056
********************************************************************************

micro
0.9876821093769323
0.9876821093769323
0.9876821093769323
********************************************************************************

macro
0.9876821093769323
0.989917064134844
0.9837586010440444
********************************************************************************

weighted
0.9876821093769323
0.9878518252339343
0.9876821093769323
********************************************************************************



In [67]:
y_pred_2 = clf_2.predict(Xvc_test)


In [87]:
averages = ['binary','micro','macro','weighted']
for average in averages:
    accuracy = metrics.accuracy_score(y_test, y_pred_2)
    precision = metrics.precision_score(y_test, y_pred_2, average=average)
    recall = metrics.recall_score(y_test, y_pred_2, average=average)
    print(average)
    print(accuracy)
    print(precision)
    print(recall)
    zprint()


binary
0.7893261437746832
0.7958455146945684
0.5775251792134706
********************************************************************************

micro
0.7893261437746832
0.7893261437746832
0.7893261437746832
********************************************************************************

macro
0.7893261437746832
0.7913928897916486
0.7454076463328293
********************************************************************************

weighted
0.7893261437746832
0.7902280713400028
0.7893261437746832
********************************************************************************



### load data with new String features

In [119]:
data_fe = pd.read_csv('./data-quora-question-pairs/quora_train.csv')


In [122]:
display(data_fe.head())


(404290, 6)


Unnamed: 0,question1,question2,is_duplicate,tokenSortRatio,tokenSetRatio,wratio
0,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,93,100,95
1,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,63,86,86
2,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,63,63,60
3,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0,24,28,27
4,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0,47,67,86


In [129]:
print(data_fe.shape)

y = list(data_fe['is_duplicate'])
print(len(y))
display(y[:10])

display(data_fe.head())
zprint()

X = data_fe[['question1','question2','tokenSortRatio','tokenSetRatio','wratio']]
X['question1'].value = X['question1'].astype(str)
X['question2'].value = X['question2'].astype(str)
print(X.shape)
display(X.head())


(404290, 6)
404290


[0, 0, 0, 0, 0, 1, 0, 1, 0, 0]

Unnamed: 0,question1,question2,is_duplicate,tokenSortRatio,tokenSetRatio,wratio
0,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,93,100,95
1,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,63,86,86
2,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,63,63,60
3,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0,24,28,27
4,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0,47,67,86


********************************************************************************

(404290, 5)


Unnamed: 0,question1,question2,tokenSortRatio,tokenSetRatio,wratio
0,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,93,100,95
1,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,63,86,86
2,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,63,63,60
3,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,24,28,27
4,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,47,67,86


### split into train and test

In [130]:
_X = X.copy()
_X['questions'] = X['question1'].astype(str) + ' ' + X['question2'].astype(str)
_X['questions'].value = _X['questions'].astype(str)

print(_X.shape)
display(_X.head())

__X = list(_X['questions'])
print(len(__X))
display(__X[0])


(404290, 6)


Unnamed: 0,question1,question2,tokenSortRatio,tokenSetRatio,wratio,questions
0,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,93,100,95,What is the step by step guide to invest in sh...
1,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,63,86,86,What is the story of Kohinoor (Koh-i-Noor) Dia...
2,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,63,63,60,How can I increase the speed of my internet co...
3,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,24,28,27,Why am I mentally very lonely? How can I solve...
4,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,47,67,86,"Which one dissolve in water quikly sugar, salt..."


404290


'What is the step by step guide to invest in share market in india? What is the step by step guide to invest in share market?'

In [131]:
X_train, X_test, y_train, y_test = train_test_split(_X, y, train_size=0.7, shuffle=True, stratify=y)
print(X_train.shape)
print(len(y_train))
print(X_test.shape)
print(len(y_test))




(283003, 6)
283003
(121287, 6)
121287


### TF-IDF with new String features (Random Forest model)

In [132]:
vectorizer = TfidfVectorizer(lowercase=True,
                             stop_words='english', 
                             norm='l2', 
                             smooth_idf=True, 
                             use_idf=True, 
                             binary=False, 
                             strip_accents='unicode',
                            )
Xv_train = vectorizer.fit_transform(X_train['questions'])


In [133]:
print(Xv_train.shape)
print(Xv_train[0])


(283003, 73895)
  (0, 71477)	0.12228275781328729
  (0, 34866)	0.1309927264196222
  (0, 72990)	0.27768385227919223
  (0, 56094)	0.48171920798792267
  (0, 46892)	0.1695506138923916
  (0, 60959)	0.693790411045817
  (0, 1178)	0.38553024586301476


### train

In [134]:
clf = rfClf(n_estimators=10,
            criterion='entropy', 
            max_depth=None, 
            min_samples_split=2, 
            min_samples_leaf=1, 
            min_weight_fraction_leaf=0.0,
            max_features=50,
            max_leaf_nodes=None,
            min_impurity_decrease=0.0,
            bootstrap=True,
            n_jobs=2,
            warm_start=False,
            class_weight=None,
           )


In [137]:
Xv1_train = vectorizer.transform(X_train['question1'].astype(str))
Xv2_train = vectorizer.transform(X_train['question2'].astype(str))
Xvc_train = sp.sparse.hstack([Xv1_train, Xv2_train,
                              X_train[['tokenSortRatio','tokenSetRatio','wratio']]], format='csr')

print(Xv1_train.shape)
print(Xv2_train.shape)
zprint()

print(Xvc_train.shape)
print(Xvc_train)


(283003, 73895)
(283003, 73895)
********************************************************************************

(283003, 147793)
  (0, 1178)	0.38553024586301476
  (0, 34866)	0.1309927264196222
  (0, 46892)	0.1695506138923916
  (0, 56094)	0.48171920798792267
  (0, 60959)	0.693790411045817
  (0, 71477)	0.12228275781328729
  (0, 72990)	0.27768385227919223
  (0, 75073)	0.38553024586301476
  (0, 108761)	0.1309927264196222
  (0, 120787)	0.1695506138923916
  (0, 129989)	0.48171920798792267
  (0, 134854)	0.693790411045817
  (0, 145372)	0.12228275781328729
  (0, 146885)	0.27768385227919223
  (0, 147790)	100.0
  (0, 147791)	100.0
  (0, 147792)	100.0
  (1, 15451)	0.3751367114091314
  (1, 15816)	0.4380453846301769
  (1, 31830)	0.1307011256258669
  (1, 33222)	0.13986327736870136
  (1, 33461)	0.263157114037892
  (1, 34866)	0.11663455677295982
  (1, 36089)	0.5524741634078637
  (1, 60272)	0.47340537958991125
  :	:
  (283002, 62516)	0.25920479561330045
  (283002, 62820)	0.6323732569569827
  (283002, 

In [138]:
%%time
clf.fit(Xvc_train, y_train)


CPU times: user 6min 17s, sys: 3.01 s, total: 6min 20s
Wall time: 3min 10s


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features=50, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=2,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [139]:
y_pred_train = clf.predict(Xvc_train)


In [140]:
averages = ['binary','micro','macro','weighted']
for average in averages:
    accuracy = metrics.accuracy_score(y_train, y_pred_train)
    precision = metrics.precision_score(y_train, y_pred_train, average=average)
    recall = metrics.recall_score(y_train, y_pred_train, average=average)
    print(average)
    print(accuracy)
    print(precision)
    print(recall)
    zprint()


binary
0.9904099956537563
0.9980327291234389
0.9759484705792274
********************************************************************************

micro
0.9904099956537563
0.9904099956537563
0.9904099956537563
********************************************************************************

macro
0.9904099956537563
0.9920678878071807
0.9874112700030055
********************************************************************************

weighted
0.9904099956537563
0.9905074554907417
0.9904099956537563
********************************************************************************



### test

In [141]:
Xv1_test = vectorizer.transform(X_test['question1'].astype(str))
Xv2_test = vectorizer.transform(X_test['question2'].astype(str))
Xvc_test = sp.sparse.hstack([Xv1_test, Xv2_test,
                             X_test[['tokenSortRatio','tokenSetRatio','wratio']]], format='csr')

print(Xv1_test.shape)
print(Xv2_test.shape)
zprint()

print(Xvc_test.shape)
print(Xvc_test)


(121287, 73895)
(121287, 73895)
********************************************************************************

(121287, 147793)
  (0, 5882)	0.16331710976931804
  (0, 23416)	0.3586512363050986
  (0, 31830)	0.13253843137988555
  (0, 33181)	0.3719646103154373
  (0, 44460)	0.2082636627005847
  (0, 59813)	0.23545979270896208
  (0, 60514)	0.42097043850225313
  (0, 61644)	0.4708095100629896
  (0, 72351)	0.43712690742062915
  (0, 80734)	0.13899489446160734
  (0, 97311)	0.3206229692481986
  (0, 106474)	0.40121085384931315
  (0, 107076)	0.3325247085252811
  (0, 118355)	0.18618118986424778
  (0, 121376)	0.20108435426936647
  (0, 134409)	0.37633438364475474
  (0, 137342)	0.4603474223296863
  (0, 140198)	0.12346376350565796
  (0, 145372)	0.09870293902030969
  (0, 146246)	0.3907777606996976
  (0, 147790)	69.0
  (0, 147791)	72.0
  (0, 147792)	68.0
  (1, 3838)	0.3836206530747434
  (1, 5775)	0.18330590985867326
  :	:
  (121285, 105725)	0.1490670365180056
  (121285, 125826)	0.43115857496813836
  (121

In [142]:
y_pred = clf.predict(Xvc_test)


In [143]:
averages = ['binary','micro','macro','weighted']
for average in averages:
    accuracy = metrics.accuracy_score(y_test, y_pred)
    precision = metrics.precision_score(y_test, y_pred, average=average)
    recall = metrics.recall_score(y_test, y_pred, average=average)
    print(average)
    print(accuracy)
    print(precision)
    print(recall)
    zprint()


binary
0.8001599511901523
0.8041218797192857
0.6064449853726077
********************************************************************************

micro
0.8001599511901523
0.8001599511901523
0.8001599511901523
********************************************************************************

macro
0.8001599511901523
0.8013764935869613
0.7599917194338335
********************************************************************************

weighted
0.8001599511901523
0.800658293313292
0.8001599511901523
********************************************************************************



### load data with new String features and normalize

In [176]:
data_fe = pd.read_csv('./data-quora-question-pairs/quora_train.csv')

print(data_fe.shape)
display(data_fe.head())


(404290, 6)


Unnamed: 0,question1,question2,is_duplicate,tokenSortRatio,tokenSetRatio,wratio
0,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,93,100,95
1,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,63,86,86
2,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,63,63,60
3,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0,24,28,27
4,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0,47,67,86


In [177]:
y = list(data_fe['is_duplicate'])
print(len(y))
display(y[:10])
zprint()

X = data_fe[['question1','question2',
             'tokenSortRatio','tokenSetRatio','wratio']]
X['question1'].value = X['question1'].astype(str)
X['question2'].value = X['question2'].astype(str)
print(X.shape)
display(X.head())

for col in ['tokenSortRatio','tokenSetRatio','wratio']:
    X[col+'_norm'] = X[col] / X[col].max()
print(X.shape)
display(X.head())


404290


[0, 0, 0, 0, 0, 1, 0, 1, 0, 0]

********************************************************************************

(404290, 5)


Unnamed: 0,question1,question2,tokenSortRatio,tokenSetRatio,wratio
0,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,93,100,95
1,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,63,86,86
2,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,63,63,60
3,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,24,28,27
4,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,47,67,86


(404290, 8)


Unnamed: 0,question1,question2,tokenSortRatio,tokenSetRatio,wratio,tokenSortRatio_norm,tokenSetRatio_norm,wratio_norm
0,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,93,100,95,0.93,1.0,0.95
1,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,63,86,86,0.63,0.86,0.86
2,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,63,63,60,0.63,0.63,0.6
3,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,24,28,27,0.24,0.28,0.27
4,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,47,67,86,0.47,0.67,0.86


### split into train and test

In [153]:
_X = X.copy()
_X['questions'] = X['question1'].astype(str) + ' ' + X['question2'].astype(str)
_X['questions'].value = _X['questions'].astype(str)

print(_X.shape)
display(_X.head())

__X = list(_X['questions'])
print(len(__X))
display(__X[0])


(404290, 9)


Unnamed: 0,question1,question2,tokenSortRatio,tokenSetRatio,wratio,tokenSortRatio_norm,tokenSetRatio_norm,wratio_norm,questions
0,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,93,100,95,0.93,1.0,0.95,What is the step by step guide to invest in sh...
1,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,63,86,86,0.63,0.86,0.86,What is the story of Kohinoor (Koh-i-Noor) Dia...
2,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,63,63,60,0.63,0.63,0.6,How can I increase the speed of my internet co...
3,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,24,28,27,0.24,0.28,0.27,Why am I mentally very lonely? How can I solve...
4,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,47,67,86,0.47,0.67,0.86,"Which one dissolve in water quikly sugar, salt..."


404290


'What is the step by step guide to invest in share market in india? What is the step by step guide to invest in share market?'

In [154]:
X_train, X_test, y_train, y_test = train_test_split(_X, y, train_size=0.7, shuffle=True, stratify=y)
print(X_train.shape)
print(len(y_train))
print(X_test.shape)
print(len(y_test))




(283003, 9)
283003
(121287, 9)
121287


### TF-IDF with new normalized String features (Random Forest model)

In [155]:
vectorizer = TfidfVectorizer(lowercase=True,
                             stop_words='english', 
                             norm='l2', 
                             smooth_idf=True, 
                             use_idf=True, 
                             binary=False, 
                             strip_accents='unicode',
                            )
Xv_train = vectorizer.fit_transform(X_train['questions'])


In [156]:
print(Xv_train.shape)
print(Xv_train[0])


(283003, 73749)
  (0, 71539)	0.05293985593198728
  (0, 29813)	0.10307837510728696
  (0, 14507)	0.5363473176451466
  (0, 57306)	0.255311202262035
  (0, 34907)	0.40619088429151046
  (0, 66164)	0.07629870724098087
  (0, 40622)	0.22440045103365386
  (0, 34846)	0.0653036849591473
  (0, 38939)	0.07322840432266288
  (0, 53675)	0.1800748075681643
  (0, 21146)	0.12518323757342728
  (0, 18747)	0.18350095607310515
  (0, 20644)	0.15162990020727668
  (0, 53600)	0.20415293881022104
  (0, 47057)	0.058276244649376994
  (0, 65451)	0.030400227272420182
  (0, 24228)	0.21242469459351918
  (0, 46798)	0.04234486036782243
  (0, 28830)	0.1137757181897544
  (0, 67898)	0.12408290823520582
  (0, 6991)	0.3236956611139389
  (0, 72099)	0.07463925754287612
  (0, 71750)	0.18052670576186514
  (0, 9136)	0.06223858318813041
  (0, 32645)	0.06324531392754315
  (0, 30504)	0.09746304012028263
  (0, 35018)	0.055086492730672856
  (0, 66186)	0.11644051222860048


### train

In [157]:
clf = rfClf(n_estimators=10,
            criterion='entropy', 
            max_depth=None, 
            min_samples_split=2, 
            min_samples_leaf=1, 
            min_weight_fraction_leaf=0.0,
            max_features=50,
            max_leaf_nodes=None,
            min_impurity_decrease=0.0,
            bootstrap=True,
            n_jobs=2,
            warm_start=False,
            class_weight=None,
           )


In [158]:
Xv1_train = vectorizer.transform(X_train['question1'].astype(str))
Xv2_train = vectorizer.transform(X_train['question2'].astype(str))
Xvc_train = sp.sparse.hstack([Xv1_train, Xv2_train,
                              X_train[['tokenSortRatio_norm','tokenSetRatio_norm','wratio_norm']]], format='csr')

print(Xv1_train.shape)
print(Xv2_train.shape)
zprint()

print(Xvc_train.shape)
print(Xvc_train)


(283003, 73749)
(283003, 73749)
********************************************************************************

(283003, 147501)
  (0, 14507)	0.34546728048999525
  (0, 18747)	0.3545850282624044
  (0, 20644)	0.2929995221877917
  (0, 21146)	0.24189575238653893
  (0, 29813)	0.19918178811067805
  (0, 34846)	0.12618849226950826
  (0, 34907)	0.5232641444805562
  (0, 38939)	0.1415016922637596
  (0, 40622)	0.21680824442180252
  (0, 53675)	0.3479645670372778
  (0, 57306)	0.2466731830915529
  (0, 66164)	0.14743454117290755
  (0, 71539)	0.10229745235971881
  (0, 80740)	0.47054319318238097
  (0, 82885)	0.09047369239274933
  (0, 88256)	0.5197774943518481
  (0, 97977)	0.30879312302441714
  (0, 102579)	0.16539112560047958
  (0, 104253)	0.14167805017107618
  (0, 106394)	0.09193713584814797
  (0, 108656)	0.19682104591533348
  (0, 108767)	0.08007698991549023
  (0, 114371)	0.16310089609723707
  (0, 120547)	0.06155499812314046
  (0, 120806)	0.08471380230933433
  :	:
  (283002, 72669)	0.23910776531077935

In [159]:
%%time
clf.fit(Xvc_train, y_train)


CPU times: user 5min 51s, sys: 3.29 s, total: 5min 54s
Wall time: 2min 56s


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features=50, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=2,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [160]:
y_pred_train = clf.predict(Xvc_train)


In [161]:
averages = ['binary','micro','macro','weighted']
for average in averages:
    accuracy = metrics.accuracy_score(y_train, y_pred_train)
    precision = metrics.precision_score(y_train, y_pred_train, average=average)
    recall = metrics.recall_score(y_train, y_pred_train, average=average)
    print(average)
    print(accuracy)
    print(precision)
    print(recall)
    zprint()


binary
0.9904629986254563
0.9976628888258705
0.9764557252785115
********************************************************************************

micro
0.9904629986254563
0.9904629986254563
0.9904629986254563
********************************************************************************

macro
0.9904629986254563
0.9920260886532806
0.9875584661044332
********************************************************************************

weighted
0.9904629986254563
0.9905514735404454
0.9904629986254563
********************************************************************************



### test

In [162]:
Xv1_test = vectorizer.transform(X_test['question1'].astype(str))
Xv2_test = vectorizer.transform(X_test['question2'].astype(str))
Xvc_test = sp.sparse.hstack([Xv1_test, Xv2_test,
                             X_test[['tokenSortRatio_norm','tokenSetRatio_norm','wratio_norm']]], format='csr')

print(Xv1_test.shape)
print(Xv2_test.shape)
zprint()

print(Xvc_test.shape)
print(Xvc_test)


(121287, 73749)
(121287, 73749)
********************************************************************************

(121287, 147501)
  (0, 6810)	0.19160459681241124
  (0, 9623)	0.25524890826624314
  (0, 14224)	0.4973582926290971
  (0, 33197)	0.17450977316737196
  (0, 50521)	0.521366595811019
  (0, 65451)	0.13543351120811523
  (0, 66164)	0.16995599621159407
  (0, 70172)	0.5317745620673667
  (0, 71344)	0.1359490803037746
  (0, 83372)	0.23510975930186676
  (0, 87973)	0.45811670365635415
  (0, 106946)	0.1607409451577674
  (0, 108595)	0.13398733956793699
  (0, 124257)	0.4575960900055202
  (0, 139200)	0.12474780181364538
  (0, 139913)	0.1565464613840285
  (0, 143921)	0.48981752807382684
  (0, 144499)	0.3732558218455383
  (0, 145093)	0.12522269248725218
  (0, 146574)	0.21604952213812265
  (0, 147498)	0.81
  (0, 147499)	0.85
  (0, 147500)	0.84
  (1, 8724)	0.5248637394554759
  (1, 69895)	0.7707358615084331
  :	:
  (121285, 147499)	0.65
  (121285, 147500)	0.86
  (121286, 18399)	0.6454485552183719


In [163]:
y_pred = clf.predict(Xvc_test)


In [164]:
averages = ['binary','micro','macro','weighted']
for average in averages:
    accuracy = metrics.accuracy_score(y_test, y_pred)
    precision = metrics.precision_score(y_test, y_pred, average=average)
    recall = metrics.recall_score(y_test, y_pred, average=average)
    print(average)
    print(accuracy)
    print(precision)
    print(recall)
    zprint()


binary
0.8022953820277523
0.8056845570513197
0.6121396190178432
********************************************************************************

micro
0.8022953820277523
0.8022953820277523
0.8022953820277523
********************************************************************************

macro
0.8022953820277523
0.8033293008140916
0.7628651773135956
********************************************************************************

weighted
0.8022953820277523
0.80271315955286
0.8022953820277523
********************************************************************************



### load data with new NLP features and normalize

In [178]:
data_fe_2 = pd.read_csv('./data-quora-question-pairs/finaldata.csv')

print(data_fe_2.shape)
display(data_fe_2.head())


(404287, 10)


Unnamed: 0,question1,question2,is_duplicate,tokenSortRatio,tokenSetRatio,wratio,verb_diff,adj_diff,nn_diff,ne_diff
0,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,93,100,95,0.0,0.0,0.090909,1.0
1,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,63,86,86,1.0,1.0,0.230769,1.0
2,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,63,63,60,0.5,0.333333,0.666667,1.0
3,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0,24,28,27,0.666667,1.0,1.0,1.0
4,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0,47,67,86,1.0,0.333333,0.6,0.0


In [179]:
y = list(data_fe_2['is_duplicate'])
print(len(y))
display(y[:10])
zprint()

X = data_fe_2[['question1','question2',
               'tokenSortRatio','tokenSetRatio','wratio',
               'verb_diff','adj_diff','nn_diff','ne_diff']]
X['question1'].value = X['question1'].astype(str)
X['question2'].value = X['question2'].astype(str)
print(X.shape)
display(X.head())

for col in ['tokenSortRatio','tokenSetRatio','wratio']:
    X[col+'_norm'] = X[col] / X[col].max()
print(X.shape)
display(X.head())


404287


[0, 0, 0, 0, 0, 1, 0, 1, 0, 0]

********************************************************************************

(404287, 9)


Unnamed: 0,question1,question2,tokenSortRatio,tokenSetRatio,wratio,verb_diff,adj_diff,nn_diff,ne_diff
0,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,93,100,95,0.0,0.0,0.090909,1.0
1,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,63,86,86,1.0,1.0,0.230769,1.0
2,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,63,63,60,0.5,0.333333,0.666667,1.0
3,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,24,28,27,0.666667,1.0,1.0,1.0
4,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,47,67,86,1.0,0.333333,0.6,0.0


(404287, 12)


Unnamed: 0,question1,question2,tokenSortRatio,tokenSetRatio,wratio,verb_diff,adj_diff,nn_diff,ne_diff,tokenSortRatio_norm,tokenSetRatio_norm,wratio_norm
0,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,93,100,95,0.0,0.0,0.090909,1.0,0.93,1.0,0.95
1,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,63,86,86,1.0,1.0,0.230769,1.0,0.63,0.86,0.86
2,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,63,63,60,0.5,0.333333,0.666667,1.0,0.63,0.63,0.6
3,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,24,28,27,0.666667,1.0,1.0,1.0,0.24,0.28,0.27
4,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,47,67,86,1.0,0.333333,0.6,0.0,0.47,0.67,0.86


### split into train and test

In [180]:
_X = X.copy()
_X['questions'] = X['question1'].astype(str) + ' ' + X['question2'].astype(str)
_X['questions'].value = _X['questions'].astype(str)

print(_X.shape)
display(_X.head())

__X = list(_X['questions'])
print(len(__X))
display(__X[0])


(404287, 13)


Unnamed: 0,question1,question2,tokenSortRatio,tokenSetRatio,wratio,verb_diff,adj_diff,nn_diff,ne_diff,tokenSortRatio_norm,tokenSetRatio_norm,wratio_norm,questions
0,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,93,100,95,0.0,0.0,0.090909,1.0,0.93,1.0,0.95,What is the step by step guide to invest in sh...
1,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,63,86,86,1.0,1.0,0.230769,1.0,0.63,0.86,0.86,What is the story of Kohinoor (Koh-i-Noor) Dia...
2,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,63,63,60,0.5,0.333333,0.666667,1.0,0.63,0.63,0.6,How can I increase the speed of my internet co...
3,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,24,28,27,0.666667,1.0,1.0,1.0,0.24,0.28,0.27,Why am I mentally very lonely? How can I solve...
4,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,47,67,86,1.0,0.333333,0.6,0.0,0.47,0.67,0.86,"Which one dissolve in water quikly sugar, salt..."


404287


'What is the step by step guide to invest in share market in india? What is the step by step guide to invest in share market?'

In [181]:
X_train, X_test, y_train, y_test = train_test_split(_X, y, train_size=0.7, shuffle=True, stratify=y)
print(X_train.shape)
print(len(y_train))
print(X_test.shape)
print(len(y_test))




(283000, 13)
283000
(121287, 13)
121287


### TF-IDF with new normalized String features + NLP features (Random Forest model)

In [182]:
vectorizer = TfidfVectorizer(lowercase=True,
                             stop_words='english', 
                             norm='l2', 
                             smooth_idf=True, 
                             use_idf=True, 
                             binary=False, 
                             strip_accents='unicode',
                            )
Xv_train = vectorizer.fit_transform(X_train['questions'])


In [183]:
print(Xv_train.shape)
print(Xv_train[0])


(283000, 73618)
  (0, 31813)	0.0839783795368488
  (0, 38927)	0.2946308358646404
  (0, 6826)	0.09871150545660912
  (0, 72726)	0.24144701813486186
  (0, 66117)	0.08767387742011196
  (0, 28154)	0.15286838756900753
  (0, 7010)	0.37872539502966535
  (0, 32649)	0.1453175002081391
  (0, 70612)	0.3439485920047134
  (0, 68765)	0.21571431710517544
  (0, 5874)	0.10366300617696682
  (0, 59574)	0.36050608712334353
  (0, 61167)	0.20095554491292938
  (0, 71262)	0.07009757774480421
  (0, 21067)	0.09323044445120451
  (0, 39609)	0.22728272079991466
  (0, 3684)	0.17893078729234732
  (0, 38839)	0.19598580534182505
  (0, 33178)	0.08998460041255339
  (0, 65409)	0.0697940420993238
  (0, 9964)	0.2617643909075131
  (0, 14689)	0.28175932798254555


### train

In [184]:
clf = rfClf(n_estimators=10,
            criterion='entropy', 
            max_depth=None, 
            min_samples_split=2, 
            min_samples_leaf=1, 
            min_weight_fraction_leaf=0.0,
            max_features=50,
            max_leaf_nodes=None,
            min_impurity_decrease=0.0,
            bootstrap=True,
            n_jobs=2,
            warm_start=False,
            class_weight=None,
           )


In [185]:
Xv1_train = vectorizer.transform(X_train['question1'].astype(str))
Xv2_train = vectorizer.transform(X_train['question2'].astype(str))
Xvc_train = sp.sparse.hstack([Xv1_train, Xv2_train,
                              X_train[['tokenSortRatio_norm','tokenSetRatio_norm','wratio_norm',
                                       'verb_diff','adj_diff','nn_diff','ne_diff']]], format='csr')

print(Xv1_train.shape)
print(Xv2_train.shape)
zprint()

print(Xvc_train.shape)
print(Xvc_train)


(283000, 73618)
(283000, 73618)
********************************************************************************

(283000, 147243)
  (0, 5874)	0.12359828390319588
  (0, 6826)	0.11769456748253866
  (0, 7010)	0.45155750949683915
  (0, 28154)	0.18226627862135206
  (0, 31813)	0.10012813614537105
  (0, 32649)	0.1732632914018184
  (0, 38927)	0.3512908513927033
  (0, 59574)	0.42983447372763667
  (0, 61167)	0.23960100529660747
  (0, 66117)	0.10453430970124375
  (0, 68765)	0.25719801490257754
  (0, 70612)	0.4100928314786688
  (0, 72726)	0.2878793331930549
  (0, 77302)	0.3285667671445781
  (0, 83582)	0.48067233691610545
  (0, 88307)	0.5173886110320226
  (0, 94685)	0.1711970656158542
  (0, 106796)	0.16523679182187245
  (0, 112457)	0.3598845310068343
  (0, 113227)	0.4173543856320225
  (0, 139027)	0.12816130262177736
  (0, 144880)	0.1287186786175902
  (0, 147236)	0.34
  (0, 147237)	0.35
  (0, 147238)	0.38
  :	:
  (282999, 6337)	0.21646756339701811
  (282999, 7494)	0.20406212475772503
  (282999, 810

In [186]:
%%time
clf.fit(Xvc_train, y_train)


CPU times: user 5min 31s, sys: 3.13 s, total: 5min 34s
Wall time: 2min 46s


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features=50, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=2,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [187]:
y_pred_train = clf.predict(Xvc_train)


In [188]:
averages = ['binary','micro','macro','weighted']
for average in averages:
    accuracy = metrics.accuracy_score(y_train, y_pred_train)
    precision = metrics.precision_score(y_train, y_pred_train, average=average)
    recall = metrics.recall_score(y_train, y_pred_train, average=average)
    print(average)
    print(accuracy)
    print(precision)
    print(recall)
    zprint()


binary
0.9910459363957597
0.9978902541462367
0.9778147850388577
********************************************************************************

micro
0.9910459363957597
0.9910459363957597
0.9910459363957597
********************************************************************************

macro
0.9910459363957597
0.992528269395589
0.9883024047312194
********************************************************************************

weighted
0.9910459363957597
0.9911255893423736
0.9910459363957597
********************************************************************************



### test

In [189]:
Xv1_test = vectorizer.transform(X_test['question1'].astype(str))
Xv2_test = vectorizer.transform(X_test['question2'].astype(str))
Xvc_test = sp.sparse.hstack([Xv1_test, Xv2_test,
                             X_test[['tokenSortRatio_norm','tokenSetRatio_norm','wratio_norm',
                                     'verb_diff','adj_diff','nn_diff','ne_diff']]], format='csr')

print(Xv1_test.shape)
print(Xv2_test.shape)
zprint()

print(Xvc_test.shape)
print(Xvc_test)


(121287, 73618)
(121287, 73618)
********************************************************************************

(121287, 147243)
  (0, 20612)	0.7738951365248012
  (0, 21121)	0.26579020909074136
  (0, 24488)	0.5223955627357192
  (0, 71451)	0.2398848858281623
  (0, 94230)	0.7954530386748473
  (0, 94739)	0.2731941570542195
  (0, 103753)	0.4814810784008943
  (0, 145069)	0.24656720576000846
  (0, 147236)	0.85
  (0, 147237)	0.88
  (0, 147238)	0.85
  (0, 147239)	0.5
  (1, 9644)	0.17900245285019517
  (1, 13456)	0.39628691622501233
  (1, 18332)	0.6484643371820412
  (1, 32154)	0.41078033236117295
  (1, 33178)	0.12224347784925492
  (1, 34829)	0.10192207068425796
  (1, 58946)	0.3527815131996224
  (1, 65409)	0.09481473941388317
  (1, 71323)	0.2507830070319717
  (1, 83262)	0.18116222112049815
  (1, 87074)	0.4010683473952057
  (1, 91950)	0.6562884349950522
  (1, 105772)	0.41573663499150976
  :	:
  (121286, 45416)	0.20677586560778147
  (121286, 47375)	0.11059090666406553
  (121286, 50958)	0.23076751

In [190]:
y_pred = clf.predict(Xvc_test)


In [191]:
averages = ['binary','micro','macro','weighted']
for average in averages:
    accuracy = metrics.accuracy_score(y_test, y_pred)
    precision = metrics.precision_score(y_test, y_pred, average=average)
    recall = metrics.recall_score(y_test, y_pred, average=average)
    print(average)
    print(accuracy)
    print(precision)
    print(recall)
    zprint()


binary
0.8062282025278884
0.8074622120747955
0.6239308604479779
********************************************************************************

micro
0.8062282025278884
0.8062282025278884
0.8062282025278884
********************************************************************************

macro
0.8062282025278884
0.8065989278309976
0.7684274995500724
********************************************************************************

weighted
0.8062282025278884
0.8063730903894626
0.8062282025278884
********************************************************************************



### Increase max_features to 100 in Random Forest model

In [194]:
clf_2 = rfClf(n_estimators=10,
            criterion='entropy', 
            max_depth=None, 
            min_samples_split=2, 
            min_samples_leaf=1, 
            min_weight_fraction_leaf=0.0,
            max_features=100,
            max_leaf_nodes=None,
            min_impurity_decrease=0.0,
            bootstrap=True,
            n_jobs=2,
            warm_start=False,
            class_weight=None,
           )


In [195]:
%%time
clf_2.fit(Xvc_train, y_train)


CPU times: user 5min 5s, sys: 3.11 s, total: 5min 8s
Wall time: 2min 32s


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features=100, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=2,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [198]:
y_pred_train = clf_2.predict(Xvc_train)


In [199]:
averages = ['binary','micro','macro','weighted']
for average in averages:
    accuracy = metrics.accuracy_score(y_train, y_pred_train)
    precision = metrics.precision_score(y_train, y_pred_train, average=average)
    recall = metrics.recall_score(y_train, y_pred_train, average=average)
    print(average)
    print(accuracy)
    print(precision)
    print(recall)
    zprint()


binary
0.9910671378091873
0.9977153792079981
0.9780444852800428
********************************************************************************

micro
0.9910671378091873
0.9910671378091873
0.9910671378091873
********************************************************************************

macro
0.9910671378091873
0.9925057934494713
0.9883668392027944
********************************************************************************

weighted
0.9910671378091873
0.9911429805417848
0.9910671378091873
********************************************************************************



In [200]:
y_pred = clf_2.predict(Xvc_test)


In [201]:
averages = ['binary','micro','macro','weighted']
for average in averages:
    accuracy = metrics.accuracy_score(y_test, y_pred)
    precision = metrics.precision_score(y_test, y_pred, average=average)
    recall = metrics.recall_score(y_test, y_pred, average=average)
    print(average)
    print(accuracy)
    print(precision)
    print(recall)
    zprint()


binary
0.8041422411305416
0.8001541887956142
0.6258067397664083
********************************************************************************

micro
0.8041422411305416
0.8041422411305416
0.8041422411305416
********************************************************************************

macro
0.8041422411305416
0.802957752834021
0.7671630551448761
********************************************************************************

weighted
0.8041422411305416
0.8036911726018001
0.8041422411305416
********************************************************************************

