In [15]:
import pandas as pd
import numpy as np
from fuzzywuzzy import fuzz
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score as accuracy
from sklearn import svm

In [16]:
# Load only the first N_ROWS rows of the training set. Passing nrows lets the
# parser stop early instead of reading the whole file and slicing afterwards.
N_ROWS = 150000
df = pd.read_csv("./data/train.csv", nrows=N_ROWS)
df.shape

(150000, 6)

### Approach 1 - Feature Extraction

In [17]:
# Fuzzy Wuzzy: string-similarity score between the two questions of each pair.
# Values are coerced with str() before scoring, so a non-string cell (for
# example a missing value, which str() renders as "nan") is still scored
# rather than raising.
# Zipping the two columns avoids the per-row Series that iterrows() constructs.
fuzzy_ratio = [
    fuzz.ratio(str(q1), str(q2))
    for q1, q2 in zip(df["question1"], df["question2"])
]

df["fuzzy_ratio"] = fuzzy_ratio
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,fuzzy_ratio
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,93
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,65
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,45
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0,7
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0,37


In [18]:
# Common Words: number of distinct whitespace-separated tokens that appear in
# both questions. Tokens are compared exactly as split() returns them, so case
# and attached punctuation are NOT normalised ("water" != "water?").
# Values are coerced with str() first, matching the other feature cells.
common = [
    len(set(str(q1).split()) & set(str(q2).split()))
    for q1, q2 in zip(df["question1"], df["question2"])
]

df["common_word_cnt"] = common
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,fuzzy_ratio,common_word_cnt
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,93,10
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,65,4
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,45,3
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0,7,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0,37,2


In [19]:
# Sentiments of individual questions: each question is scored on its own and
# the analyzer's "compound" value is stored as a feature.
SIA = SentimentIntensityAnalyzer()


def _compound_scores(questions):
    """Return the "compound" polarity score for every entry of `questions`.

    Each entry is coerced with str() before being passed to the analyzer, so
    non-string cells are scored instead of raising.
    """
    return [SIA.polarity_scores(str(text))["compound"] for text in questions]


q1_score = _compound_scores(df["question1"])
q2_score = _compound_scores(df["question2"])

df["q1_sent_score"] = q1_score
df["q2_sent_score"] = q2_score
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,fuzzy_ratio,common_word_cnt,q1_sent_score,q2_sent_score
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,93,10,0.296,0.296
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,65,4,0.34,0.34
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,45,3,0.3182,0.2732
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0,7,0,-0.3298,0.0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0,37,2,0.0,0.0


In [20]:
# Classification

# Model inputs: the four hand-crafted feature columns added to df above.
# .copy() keeps df_features independent of df, as the original empty-frame
# construction did.
feature_cols = ["fuzzy_ratio", "common_word_cnt", "q1_sent_score", "q2_sent_score"]
df_features = df[feature_cols].copy()

y = df["is_duplicate"]

# 70/30 train/test split, stratified on the label so both parts keep the same
# duplicate / non-duplicate proportion. random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    df_features, y, test_size=0.3, random_state=2, stratify=y
)
clf = RandomForestClassifier(max_depth=400, random_state=0, n_estimators=300)
model = clf.fit(X_train, y_train)  # fit() returns the estimator, so model is clf
y_pred = model.predict(X_test)

print(accuracy(y_test, y_pred))

0.6907777777777778


In [21]:
# Multi-layer perceptron trained and evaluated on the same split as the random
# forest above. random_state is set so the printed accuracy is reproducible
# across kernel restarts (the network's training is otherwise randomised).
# NOTE(review): the inputs are passed unscaled; fuzzy_ratio and the sentiment
# scores are on visibly different ranges in the df.head() output, so consider
# standardising the features before fitting.
clf1 = MLPClassifier(max_iter=400, random_state=0)
clf1.fit(X_train, y_train)
y_pred = clf1.predict(X_test)
print(accuracy(y_test, y_pred))

0.6979333333333333
