In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 

In [None]:
# Problem Statement
'''Over 100 million people visit Quora every month, so it's no surprise that many people ask similarly worded questions. 
Multiple questions with the same intent can cause seekers to spend more time finding the best answer to their question, 
and make writers feel they need to answer multiple versions of the same question. 
Quora values canonical questions because they provide a better experience to active seekers and writers, 
and offer more value to both of these groups in the long term.

Currently, Quora uses a Random Forest model to identify duplicate questions.
In this competition, Kagglers are challenged to tackle this natural language processing problem by
applying advanced techniques to classify whether question pairs are duplicates or not. 
Doing so will make it easier to find high quality answers to questions resulting in an improved experience for Quora writers, seekers, and readers.'''

In [2]:
df = pd.read_csv('train.csv')

In [3]:
df.shape

(404290, 6)

In [4]:
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [5]:
new_df = df.sample(30000)

In [6]:
new_df.isnull().sum()

id              0
qid1            0
qid2            0
question1       0
question2       0
is_duplicate    0
dtype: int64

In [7]:
new_df.duplicated().sum()

0

In [8]:
ques_df = new_df[['question1','question2']]
ques_df.head()

Unnamed: 0,question1,question2
289616,What are some examples of how to write a short...,What is your favorite diary entry that you hav...
371541,Which offshore company registration service ha...,How credible are Rating companies (eg CRISIL) ...
234187,What is the most common full time job and what...,What is the difference between part time and f...
388101,What is builders' tea?,Is tea a diuretic?
225869,How do the Delhi Metro cards and tokens work?,What makes the metro rail system in Delhi the ...


In [9]:
from sklearn.feature_extraction.text import CountVectorizer
# merge texts 
questions = list(ques_df['question1']) + list(ques_df['question2'])

cv = CountVectorizer(max_features=3000)
q1_arr, q2_arr = np.vsplit(cv.fit_transform(questions).toarray(),2)

In [10]:
temp_df1 = pd.DataFrame(q1_arr, index= ques_df.index)
temp_df2 = pd.DataFrame(q2_arr, index= ques_df.index)
temp_df = pd.concat([temp_df1,temp_df2],axis=1)
temp_df.shape


(30000, 6000)

In [11]:
temp_df['is_duplicate'] = new_df['is_duplicate']

In [12]:
temp_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2991,2992,2993,2994,2995,2996,2997,2998,2999,is_duplicate
289616,0,0,0,0,0,0,0,0,0,0,...,1,0,0,1,0,0,0,0,0,0
371541,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
234187,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
388101,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
225869,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
temp_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2991,2992,2993,2994,2995,2996,2997,2998,2999,is_duplicate
289616,0,0,0,0,0,0,0,0,0,0,...,1,0,0,1,0,0,0,0,0,0
371541,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
234187,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
388101,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
225869,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
291443,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
367697,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
299044,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
213493,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(temp_df.iloc[:,0:-1].values,temp_df.iloc[:,-1].values,test_size=0.2,random_state=1)

In [15]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
rf = RandomForestClassifier()
rf.fit(X_train,y_train)
y_pred = rf.predict(X_test)
accuracy_score(y_test,y_pred)

0.736

In [38]:
from xgboost import XGBClassifier
xgb = XGBClassifier()
xgb.fit(X_train,y_train)
y_pred = xgb.predict(X_test)
accuracy_score(y_test,y_pred)

0.7191666666666666