### Preliminaries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('white')
%matplotlib inline

from collections import Counter
import string
import re,math
import warnings
warnings.filterwarnings("ignore")
import wordcloud
from wordcloud import WordCloud, STOPWORDS

In [2]:
import nltk

### Reading Data

In [3]:
train_data = pd.read_csv('train.csv',encoding='utf-8')
train_data = train_data.fillna('')
print('Shape:{}'.format(train_data.shape))
train_data.head()

Shape:(404290, 6)


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [4]:
test_data = pd.read_csv('test.csv',encoding='utf-8')
test_data = test_data.fillna('')
print('Shape:{}'.format(test_data.shape))
test_data.head()

Shape:(3563475, 3)


Unnamed: 0,test_id,question1,question2
0,0,How does the Surface Pro himself 4 compare wit...,Why did Microsoft choose core m3 and not core ...
1,1,Should I have a hair transplant at age 24? How...,How much cost does hair transplant require?
2,2,What but is the best way to send money from Ch...,What you send money to China?
3,3,Which food not emulsifiers?,What foods fibre?
4,4,"How ""aberystwyth"" start reading?",How their can I start reading?


In [5]:
#Since data is huge, will work on first 1k records
train_data = train_data[:1000]
train_data.shape

(1000, 6)

### Data Preprocessing

In [6]:
#tokenisation
from nltk.tokenize import word_tokenize
train_data['que1_token'] = [word_tokenize(w.lower()) for w in train_data['question1']]
train_data['que2_token'] = [word_tokenize(w.lower()) for w in train_data['question2']]

In [7]:
train_data.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,que1_token,que2_token
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,"[what, is, the, step, by, step, guide, to, inv...","[what, is, the, step, by, step, guide, to, inv..."
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,"[what, is, the, story, of, kohinoor, (, koh-i-...","[what, would, happen, if, the, indian, governm..."
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,"[how, can, i, increase, the, speed, of, my, in...","[how, can, internet, speed, be, increased, by,..."
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0,"[why, am, i, mentally, very, lonely, ?, how, c...","[find, the, remainder, when, [, math, ], 23^, ..."
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0,"[which, one, dissolve, in, water, quikly, suga...","[which, fish, would, survive, in, salt, water, ?]"


In [8]:
#Tweaking stopwords
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words1=stop_words
pos_tagger=nltk.pos_tag(stop_words)
for value in pos_tagger:
    if value[1] in ['IN','WDT','WP','WRB']:
        stop_words1.remove(value[0])

In [9]:
#removal of Stop words and punctuations
from string import punctuation
for i in range(0,train_data.shape[0]):
    train_data['que1_token'][i] = [w for w in train_data['que1_token'][i] if w not in set(stop_words1 + list(punctuation))]
    train_data['que2_token'][i] = [w for w in train_data['que2_token'][i] if w not in set(stop_words1 + list(punctuation))]
train_data.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,que1_token,que2_token
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,"[what, step, by, step, guide, invest, in, shar...","[what, step, by, step, guide, invest, in, shar..."
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,"[what, story, of, kohinoor, koh-i-noor, diamond]","[what, would, happen, if, indian, government, ..."
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,"[how, i, increase, speed, of, internet, connec...","[how, internet, speed, increased, by, hacking,..."
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0,"[why, i, mentally, lonely, how, i, solve]","[find, remainder, when, math, 23^, 24, /math, ..."
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0,"[which, one, dissolve, in, water, quikly, suga...","[which, fish, would, survive, in, salt, water]"


In [10]:
#Lemmatization
from nltk.stem import WordNetLemmatizer
lem = WordNetLemmatizer()
for i in range(0,train_data.shape[0]):
    train_data['que1_token'][i]=[lem.lemmatize(w,'v') for w in  train_data['que1_token'][i]]
    train_data['que2_token'][i]=[lem.lemmatize(w,'v') for w in  train_data['que2_token'][i]]
train_data.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,que1_token,que2_token
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,"[what, step, by, step, guide, invest, in, shar...","[what, step, by, step, guide, invest, in, shar..."
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,"[what, story, of, kohinoor, koh-i-noor, diamond]","[what, would, happen, if, indian, government, ..."
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,"[how, i, increase, speed, of, internet, connec...","[how, internet, speed, increase, by, hack, thr..."
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0,"[why, i, mentally, lonely, how, i, solve]","[find, remainder, when, math, 23^, 24, /math, ..."
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0,"[which, one, dissolve, in, water, quikly, suga...","[which, fish, would, survive, in, salt, water]"


### Feature Engineering

In [11]:
#diff in length of questions in square to penalize high diff
train_data['length_difference']=(train_data.question1.str.len()-train_data.question2.str.len())**2
#matching last word
train_data['last_word']=0
for i in range(0,train_data.shape[0]):       
    if train_data['que1_token'][i][-1] == train_data['que2_token'][i][-1]:
        train_data['last_word'][i]=1

### Cosine Similarity

In [12]:
def get_cosine(vec1, vec2):
    intersection = set(vec1.keys()) & set(vec2.keys())
    numerator = sum([vec1[x] * vec2[x] for x in intersection])
    sum1 = sum([vec1[x]**2 for x in vec1.keys()])
    sum2 = sum([vec2[x]**2 for x in vec2.keys()])
    denominator = math.sqrt(sum1) * math.sqrt(sum2)

    if not denominator:
        return 0.0
    else:
        return float(numerator) / denominator
        

In [13]:
train_data['cosine_sim']=None
for i in range(0,train_data.shape[0]):
    vector1=Counter(train_data['que1_token'][i])
    vector2=Counter(train_data['que2_token'][i])
    train_data['cosine_sim'][i]=get_cosine(vector1, vector2)
train_data.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,que1_token,que2_token,length_difference,last_word,cosine_sim
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,"[what, step, by, step, guide, invest, in, shar...","[what, step, by, step, guide, invest, in, shar...",81,0,0.934199
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,"[what, story, of, kohinoor, koh-i-noor, diamond]","[what, would, happen, if, indian, government, ...",1369,0,0.492366
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,"[how, i, increase, speed, of, internet, connec...","[how, internet, speed, increase, by, hack, thr...",196,0,0.447214
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0,"[why, i, mentally, lonely, how, i, solve]","[find, remainder, when, math, 23^, 24, /math, ...",225,0,0.0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0,"[which, one, dissolve, in, water, quikly, suga...","[which, fish, would, survive, in, salt, water]",1369,0,0.436436


### Scaling

In [14]:
from sklearn.preprocessing import RobustScaler
robust_scaler = RobustScaler()
train_data['length_difference'] = robust_scaler.fit_transform(np.array(train_data['length_difference']).reshape(-1,1))
X_train = train_data[['cosine_sim','length_difference','last_word']]
y_train = train_data.is_duplicate

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
lr_model =LogisticRegression(C=0.1, random_state=42)
lr_model.fit(X_train, y_train)
lr_predict_test = lr_model.predict(X_train)

# Accuracy
print("Accuracy: {0:.4f}".format(metrics.accuracy_score(y_train, lr_predict_test)))
print("Log Loss: {0:.4f}".format(metrics.log_loss(y_train, lr_predict_test)))

Accuracy: 0.6920
Log Loss: 10.6380


In [16]:
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(random_state=42)      # Create random forest object
rf_model.fit(X_train, y_train)
rf_predict_test = rf_model.predict(X_train)
# training metrics
print("Accuracy: {0:.4f}".format(metrics.accuracy_score(y_train, rf_predict_test)))
print("Log Loss: {0:.4f}".format(metrics.log_loss(y_train, rf_predict_test)))

Accuracy: 0.9370
Log Loss: 2.1760


In [17]:
from xgboost import XGBClassifier
xgclf_model = XGBClassifier()       # Create xgboost object
X_train['cosine_sim'] = X_train['cosine_sim'].astype('float')

xgclf_model.fit(X_train, y_train)
xgclf_predict_test = xgclf_model.predict(X_train)
# training metrics
print("Accuracy: {0:.4f}".format(metrics.accuracy_score(y_train, xgclf_predict_test)))
print("Log Loss: {0:.4f}".format(metrics.log_loss(y_train, xgclf_predict_test)))

Accuracy: 0.7470
Log Loss: 8.7384


  if diff:


***