In [1]:
import numpy as np
import pandas as pd
import nltk
from nltk import sent_tokenize

In [2]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m73.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0


In [3]:
import gensim

In [4]:
from gensim.utils import simple_preprocess

In [5]:
df = pd.read_csv('/content/train.csv')

In [6]:
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [7]:
new_df= df.sample(30000)

In [8]:
new_df.shape

(30000, 6)

In [22]:
new_df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
48568,48568,86571,86572,Should I service my bike according to the kilo...,Should I have my motorcycle oil changed even i...,0
397413,397413,530535,483863,Where can I learn SAP ABAP in Bangalore with p...,Where can I learn SAP ABAP in Bangalore?,0
112299,112299,183766,183767,How many terrorists are there in the world?,Why are there so many terrorists in the world ?,0
146471,146471,231360,231361,What is the capital of India?,Where is the capital of India?,0
135823,135823,216839,216840,what are some good YouTube video ideas?,What are some Cool YouTube video ideas?,0


In [9]:
new_df= new_df.dropna()

In [10]:
questions= list(new_df['question1']) + list(new_df['question2'])

In [11]:
print(questions)

Output hidden; open in https://colab.research.google.com to view.

In [12]:
len(questions)

60000

In [13]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [14]:
question_tokenize=[]
for ques in questions:
  question_tokenize.append(simple_preprocess(ques))

In [23]:
from gensim.models import Word2Vec

In [24]:
model=Word2Vec(
    window=5,
    vector_size=100,
    min_count=2,
    workers=4
)

In [25]:
model.build_vocab(question_tokenize)

In [26]:
model.train(question_tokenize,total_examples=model.corpus_count,epochs= model.epochs)

(2214439, 3125360)

In [27]:
print(question_tokenize[0])

['should', 'service', 'my', 'bike', 'according', 'to', 'the', 'kilometers', 'run', 'or', 'months', 'duration', 'defined', 'in', 'the', 'manual']


In [28]:
model.wv.get_normed_vectors().shape

(15264, 100)

In [33]:
def question_vectors(question,model):
  words= [w for w in question.split() if w in model.wv.key_to_index]

  if len(words)==0:
        return np.zeros(model.vector_size)
  return np.mean([model.wv[w] for w in words], axis=0)

In [34]:
new_df['q1_vec'] = new_df['question1'].apply(lambda questions: question_vectors(questions, model))
new_df['q2_vec'] = new_df['question2'].apply(lambda questions: question_vectors(questions, model))


In [35]:
#feature engineering

In [42]:
#cosine similarity
from sklearn.metrics.pairwise import cosine_similarity

def cosine_simi(v1,v2):
  return cosine_similarity(v1.reshape(1,-1),v2.reshape(1,-1))[0][0]

In [37]:
#euclidian distance

def euclidian(v1,v2):
  return np.linalg.norm(v1-v2)

In [38]:
#manhatten distance

def manhatten(v1,v2):
  return np.sum(np.abs(v1-v2))

In [39]:
#absolute difference vector

def abs_diff(v1,v2):
  return np.abs(v1-v2)

In [40]:
def features(row):

  v1= row['q1_vec']
  v2= row['q2_vec']

  return pd.Series([
      cosine_simi(v1,v2),
      euclidian(v1,v2),
      manhatten(v1,v2),
      abs_diff(v1,v2)
  ])

In [43]:
new_df[['cosine_similarity','euclidian_distance','manhatten_distance','abs_diff']]= new_df.apply(features,axis=1)

In [44]:
new_df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,q1_vec,q2_vec,cosine_similarity,euclidian_distance,manhatten_distance,abs_diff
48568,48568,86571,86572,Should I service my bike according to the kilo...,Should I have my motorcycle oil changed even i...,0,"[0.012585431, 0.15658927, 0.17162368, 0.224829...","[0.1702399, 0.25221696, 0.4188528, -0.00390309...",0.884569,1.521244,12.853615,"[0.15765446, 0.095627695, 0.24722913, 0.228732..."
397413,397413,530535,483863,Where can I learn SAP ABAP in Bangalore with p...,Where can I learn SAP ABAP in Bangalore?,0,"[-0.5980331, -0.019940075, 0.02279295, -0.1406...","[-0.97301286, -0.07982236, 0.33531916, -0.2617...",0.957022,1.857869,14.335747,"[0.3749798, 0.059882287, 0.31252623, 0.1211225..."
112299,112299,183766,183767,How many terrorists are there in the world?,Why are there so many terrorists in the world ?,0,"[-0.5378502, 0.40226522, 0.32053316, -0.155154...","[-0.6895482, 0.37975767, 0.30443805, -0.173436...",0.957404,1.340793,10.905995,"[0.151698, 0.022507548, 0.016095102, 0.0182812..."
146471,146471,231360,231361,What is the capital of India?,Where is the capital of India?,0,"[0.5970622, 0.4739149, 0.4476208, 0.19010021, ...","[0.5970622, 0.4739149, 0.4476208, 0.19010021, ...",1.0,0.0,0.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
135823,135823,216839,216840,what are some good YouTube video ideas?,What are some Cool YouTube video ideas?,0,"[-1.302948, 0.57271737, 0.7118061, 0.32731614,...","[-1.3302089, 0.94679326, 0.69908446, 0.5580878...",0.921829,2.925819,22.192762,"[0.0272609, 0.3740759, 0.012721658, 0.23077169..."


In [45]:
new_df.shape

(30000, 12)

In [46]:
x= new_df[['cosine_similarity','euclidian_distance','manhatten_distance']].values
y= new_df['is_duplicate']

In [48]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test= train_test_split(x,y,test_size=0.2,stratify=y,random_state=100)

In [52]:
from sklearn.ensemble import RandomForestClassifier
rf= RandomForestClassifier()
from xgboost import XGBClassifier
xgb= XGBClassifier()
from sklearn.metrics import accuracy_score,confusion_matrix

In [53]:
rf.fit(x_train,y_train)
y_pred= rf.predict(x_test)
print(accuracy_score(y_test,y_pred))
print(confusion_matrix(y_pred,y_test))

0.6451666666666667
[[2995 1338]
 [ 791  876]]


In [54]:
xgb.fit(x_train,y_train)
y_pred1= xgb.predict(x_test)
print(accuracy_score(y_test,y_pred))
print(confusion_matrix(y_pred1,y_test))

0.6451666666666667
[[3043 1292]
 [ 743  922]]
