In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('questions.csv')

In [3]:
df.shape

(404351, 6)

In [4]:
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [5]:
#creating new data frame with 3000 rows from df
new_df = df.sample(3000)

In [6]:
new_df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
203746,203746,401999,402000,What is the benefit of doing MBA after doing B...,What are the benefits of doing an MBA after B....,1
276025,276025,542722,542723,What is the definition of 'levity' and how is ...,What is the definition of 'knead' and how is i...,0
246989,246989,486374,486375,Where could I find someone with a Google devel...,Will copying and pasting from Google make me a...,0
112703,112703,223448,223449,How does Netflix compensate its employees?,Are Netflix employees really that good?,0
355510,355510,696267,696268,What would decrease a material's thermal energy?,Why is Tyrion not able to complete his joke re...,0


In [7]:
new_df.isnull().sum()

id              0
qid1            0
qid2            0
question1       0
question2       0
is_duplicate    0
dtype: int64

In [8]:
new_df.duplicated().sum()

0

In [9]:
ques_df = new_df[['question1','question2']]
ques_df.head()

Unnamed: 0,question1,question2
203746,What is the benefit of doing MBA after doing B...,What are the benefits of doing an MBA after B....
276025,What is the definition of 'levity' and how is ...,What is the definition of 'knead' and how is i...
246989,Where could I find someone with a Google devel...,Will copying and pasting from Google make me a...
112703,How does Netflix compensate its employees?,Are Netflix employees really that good?
355510,What would decrease a material's thermal energy?,Why is Tyrion not able to complete his joke re...


In [10]:
#using TF_IDF vectorizer

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidfvectorizer = TfidfVectorizer(max_features=3000)

questions = list(ques_df['question1']) + list(ques_df['question2'])
q1_arr,q2_arr = np.vsplit(tfidfvectorizer.fit_transform(questions).toarray(),2)

In [12]:
temp_df1 = pd.DataFrame(q1_arr, index = ques_df.index) # data frame having embeddings of questions from question1
temp_df2 = pd.DataFrame(q2_arr, index = ques_df.index) # data frame having embeddings of questions from question2
temp_df = pd.concat([temp_df1,temp_df2], axis = 1)


In [14]:
temp_df.shape

(3000, 6000)

In [15]:
temp_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2990,2991,2992,2993,2994,2995,2996,2997,2998,2999
203746,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
276025,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
246989,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
112703,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
355510,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
temp_df.sample(15)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2990,2991,2992,2993,2994,2995,2996,2997,2998,2999
259260,0.422266,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
162008,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.469833,0.0,0.0,0.0,0.0,0.0,0.0,0.0
320748,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
127895,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.249063,0.0,0.0,0.549257,0.0,0.0,0.0,0.0
221955,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.310204,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
327279,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
173951,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
146182,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
304114,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
62092,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(temp_df.iloc[:,0:-1].values,temp_df.iloc[:,-1].values,test_size=0.2,random_state=1)

In [23]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
rf = RandomForestRegressor()
rf.fit(x_train,y_train)
y_pred = rf.predict(x_test)
mean_squared_error(y_test,y_pred)

2.415355165125867e-05

In [28]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
xr = XGBRegressor()
xr.fit(x_train,y_train)
y_pred = xr.predict(x_test)
mean_squared_error(y_test,y_pred)

2.6499385480835036e-12