In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

import nltk
import gensim
import re

from nltk.corpus import stopwords
from gensim.models import Word2Vec
from nltk.stem import WordNetLemmatizer
import joblib

from sklearn.metrics import accuracy_score,confusion_matrix, classification_report
from xgboost import XGBClassifier

In [2]:
dataset = pd.read_csv('../Datasets/Train_data_post_EDA.csv')

In [3]:
dataset.head()

Unnamed: 0,question1,question2,is_duplicate
0,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


## Data Preprocessing

In [4]:
sw = stopwords.words('English')

In [5]:
lm = WordNetLemmatizer()

In [6]:
import re

In [7]:
def cleanData(sentence):
  if sentence.__contains__('[ math ]'):
      sentence = sentence.replace('[ math ]', '')
      sentence = sentence.replace('[ /math ]', '')
      sentence = re.sub(' +', " ", sentence)
      return sentence
  else:
    cleaned_sentence = re.sub('[^a-zA-Z]', ' ', sentence)
    cleaned_sentence = re.sub(' +', " ", cleaned_sentence)
    return cleaned_sentence

In [8]:
def removeStopWords(sentence):
  sentence = cleanData(sentence)
  sentence = sentence.lower()
  words = nltk.word_tokenize(sentence)
  words = [word for word in words if not word in sw]
  return " ".join(words)

In [9]:
dataset['Q1_cleaned'] = dataset['question1'].apply(removeStopWords)
dataset['Q2_cleaned'] = dataset['question2'].apply(removeStopWords)
dataset.head()

Unnamed: 0,question1,question2,is_duplicate,Q1_cleaned,Q2_cleaned
0,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,step step guide invest share market india,step step guide invest share market
1,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,story kohinoor koh noor diamond,would happen indian government stole kohinoor ...
2,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,increase speed internet connection using vpn,internet speed increased hacking dns
3,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0,mentally lonely solve,find remainder math math divided
4,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0,one dissolve water quikly sugar salt methane c...,fish would survive salt water


In [10]:
dataset.to_csv('../Datasets/Cleaned-Data.csv', index=False)

In [11]:
dataset = dataset.sample(100000)

## Vectorizing Text Data

In [12]:
# Q1 = dataset['Q1_cleaned'].apply(lambda x : x.split(' '))
# Q2 = dataset['Q2_cleaned'].apply(lambda x : x.split(' '))

In [13]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# # merge texts
questions = list(dataset['Q1_cleaned']) + list(dataset['Q2_cleaned'])

# This code is to vectorize using Word2Vec - START

# model = Word2Vec(window=10, min_count=2, workers=8, vector_size=300)
# model.build_vocab(questions, progress_per=1000)
# model.train(questions, total_examples=model.corpus_count, epochs=model.epochs)

# This code is to vectorize using Word2Vec - END

cv = TfidfVectorizer(max_features=3000)
q1_arr, q2_arr = np.vsplit(cv.fit_transform(questions).toarray(),2)

In [14]:
# def vectorize(arr):
#   return [list(model.wv[word]) for word in arr]

In [15]:
# questions_new = list()

In [16]:
# for question in questions:
#   questions_new.append(vectorize(question))

In [17]:
# model_q1.train(Q1, total_examples= model_q1.corpus_count, epochs=model_q1.epochs)
# model_q2.train(Q2, total_examples= model_q2.corpus_count, epochs=model_q2.epochs)

In [18]:
temp_df1 = pd.DataFrame(q1_arr, index= dataset.index, columns=list(range(0,3000)))
temp_df2 = pd.DataFrame(q2_arr, index= dataset.index, columns=list(range(3001,6001)))
temp_df = pd.concat([temp_df1, temp_df2], axis=1)
temp_df.shape

(100000, 6000)

In [19]:
temp_df1.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2990,2991,2992,2993,2994,2995,2996,2997,2998,2999
274406,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
165134,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
235858,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
188912,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
201214,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
temp_df2.head()

Unnamed: 0,3001,3002,3003,3004,3005,3006,3007,3008,3009,3010,...,5991,5992,5993,5994,5995,5996,5997,5998,5999,6000
274406,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
165134,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
235858,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
188912,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
201214,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
temp_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5991,5992,5993,5994,5995,5996,5997,5998,5999,6000
274406,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
165134,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
235858,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
188912,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
201214,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
temp_df['is_duplicate'] = dataset['is_duplicate']

In [23]:
temp_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5992,5993,5994,5995,5996,5997,5998,5999,6000,is_duplicate
274406,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
165134,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
235858,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
188912,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
201214,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [24]:
temp_df.to_csv('../Datasets/Cleaned-Vectorized-Data.csv', index=False)

## Baseline Modeling

In [25]:
# from sklearn.model_selection import train_test_split
# X_train,X_test,y_train,y_test = train_test_split(temp_df.drop('is_duplicate', axis=1), temp_df['is_duplicate'], test_size=0.2,random_state=1)

In [26]:
# from xgboost import XGBClassifier
# from sklearn.metrics import accuracy_score
# xgb = XGBClassifier()
# xgb.fit(X_train,y_train)
# y_pred = xgb.predict(X_test)
# accuracy_score(y_test,y_pred)

In [27]:
# y_pred_train = xgb.predict(X_train)

# accuracy_score(y_train, y_pred_train)

In [28]:
# joblib.dump(xgb, '../Model/baseline_model_xgb.pkl')

## Adding some more artificial features

In [29]:
dataset.head()

Unnamed: 0,question1,question2,is_duplicate,Q1_cleaned,Q2_cleaned
274406,Does watching a streamed video consume less da...,How can I download full videos?,0,watching streamed video consume less data down...,download full videos
165134,How do I remember things that I read?,How can I remember most of the things I read?,1,remember things read,remember things read
235858,How should I start preparing for the CFA Level...,How can I get started with preparing for CFA l...,1,start preparing cfa level exam,get started preparing cfa level exam
188912,Do Asian people consider themselves white?,Why do Asian people like to call themselves si...,0,asian people consider white,asian people like call simple
201214,What's your opinion on Indian Prime Minister M...,What do you think of abolishing 500 and 1000 R...,1,opinion indian prime minister modi new policy ...,think abolishing rupee currency notes indian g...


In [30]:
temp_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5992,5993,5994,5995,5996,5997,5998,5999,6000,is_duplicate
274406,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
165134,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
235858,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
188912,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
201214,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [31]:
def getUnique(arr):
  return len(set(arr.split(' ')))

In [32]:
def getCommon(arr1, arr2):

  set1 = set(arr1.split(' '))
  set2 = set(arr2.split(' '))
  return len(set1 & set2)

In [33]:
def wordsTotal(arr1, arr2):

  set1 = set(arr1.split(' '))
  set2 = set(arr2.split(' '))
  return len(set1) + len(set2)

In [34]:
# Lengths of each sentences
q1_len = [len(sentence) for sentence in dataset['Q1_cleaned']]
q2_len = [len(sentence) for sentence in dataset['Q2_cleaned']]

# Number of words in each sentences
q1_words = [len(arr.split(' ')) for arr in dataset['Q1_cleaned']]
q2_words = [len(arr.split(' ')) for arr in dataset['Q2_cleaned']]

common_words = [getCommon(arr1, arr2) for arr1, arr2 in zip(dataset['Q1_cleaned'], dataset['Q2_cleaned'])]

total_words = [wordsTotal(arr1, arr2) for arr1, arr2 in zip(dataset['Q1_cleaned'], dataset['Q2_cleaned'])]

In [35]:
temp_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5992,5993,5994,5995,5996,5997,5998,5999,6000,is_duplicate
274406,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
165134,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
235858,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
188912,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
201214,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [36]:
temp_df['q1_len'] = q1_len
temp_df['q2_len'] = q2_len

temp_df['q1_words'] = q1_words
temp_df['q2_words'] = q2_words

temp_df['common_words'] = common_words
temp_df['total_words'] = total_words

temp_df['words_share'] = round(temp_df['common_words']/temp_df['total_words'],2)

In [37]:
temp_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5999,6000,is_duplicate,q1_len,q2_len,q1_words,q2_words,common_words,total_words,words_share
274406,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0,64,20,9,3,1,11,0.09
165134,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1,20,20,3,3,3,6,0.5
235858,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1,30,36,5,6,4,11,0.36
188912,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0,27,29,4,5,2,9,0.22
201214,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1,75,55,10,7,3,17,0.18


## Adding Fuzzy Features

In [38]:
import fuzzywuzzy
from fuzzywuzzy import fuzz



In [39]:
tokenSet = list()
tokenSort = list()
partialTokenSet = list()
partialTokenSort = list()

In [40]:
def getTokenSet(df):
  tokenSet.append(fuzz.token_set_ratio(df['Q1_cleaned'], df['Q2_cleaned']))
  tokenSort.append(fuzz.token_sort_ratio(df['Q1_cleaned'], df['Q2_cleaned']))
  partialTokenSet.append(fuzz.partial_token_set_ratio(df['Q1_cleaned'], df['Q2_cleaned']))
  partialTokenSort.append(fuzz.partial_token_sort_ratio(df['Q1_cleaned'], df['Q2_cleaned']))

In [41]:
dataset.apply(getTokenSet, axis=1)

274406    None
165134    None
235858    None
188912    None
201214    None
          ... 
163587    None
20735     None
189284    None
336722    None
402936    None
Length: 100000, dtype: object

In [42]:
temp_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5999,6000,is_duplicate,q1_len,q2_len,q1_words,q2_words,common_words,total_words,words_share
274406,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0,64,20,9,3,1,11,0.09
165134,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1,20,20,3,3,3,6,0.5
235858,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1,30,36,5,6,4,11,0.36
188912,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0,27,29,4,5,2,9,0.22
201214,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1,75,55,10,7,3,17,0.18


In [43]:
temp_df['token_set'] = tokenSet
temp_df['token_sort'] = tokenSort
temp_df['partial_token_set'] = partialTokenSet
temp_df['partial_token_sort'] = partialTokenSort

In [44]:
temp_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,q2_len,q1_words,q2_words,common_words,total_words,words_share,token_set,token_sort,partial_token_set,partial_token_sort
274406,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,20,9,3,1,11,0.09,49,45,100,80
165134,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,20,3,3,3,6,0.5,100,100,100,100
235858,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,36,5,6,4,11,0.36,91,91,100,87
188912,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,29,4,5,2,9,0.22,62,68,100,67
201214,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,55,10,7,3,17,0.18,55,43,100,47


In [45]:
temp_df.to_csv('../Datasets/Final-Data.csv', index=False)

## Creating advance model

In [46]:
from sklearn.model_selection import train_test_split

In [48]:
X_train_advance, X_test_advance, y_train_advance, y_test_advance = train_test_split(temp_df.drop(['is_duplicate'], axis=1), temp_df['is_duplicate'], test_size=.2, random_state=2)

MemoryError: Unable to allocate 4.47 GiB for an array with shape (6001, 100000) and data type float64

In [None]:
xgb_advance = XGBClassifier()
xgb_advance.fit(X_train_advance, y_train_advance)

pred_train = xgb_advance.predict(X_train_advance)
pred_test = xgb_advance.predict(X_test_advance)





In [None]:
print('Training Accuracy:', accuracy_score(y_train_advance, pred_train))
print('Validation Accuracy:', accuracy_score(y_test_advance, pred_test))

Training Accuracy: 0.831
Validation Accuracy: 0.7601666666666667


In [None]:
print(classification_report(y_train_advance, pred_train))

              precision    recall  f1-score   support

           0       0.85      0.89      0.87     15179
           1       0.79      0.73      0.76      8821

    accuracy                           0.83     24000
   macro avg       0.82      0.81      0.81     24000
weighted avg       0.83      0.83      0.83     24000



In [None]:
print(classification_report(y_test_advance, pred_test))

              precision    recall  f1-score   support

           0       0.79      0.85      0.82      3855
           1       0.69      0.60      0.64      2145

    accuracy                           0.76      6000
   macro avg       0.74      0.73      0.73      6000
weighted avg       0.76      0.76      0.76      6000



In [None]:
joblib.dump(xgb_advance, '../Model/xgb_advance.pkl')

['../Model/xgb_advance.pkl']

## Testing Model's Performance

In [None]:
dataset.head()

Unnamed: 0,question1,question2,is_duplicate,Q1_cleaned,Q2_cleaned
163602,"Do Westerners (Americans, Europeans) perceive ...",How come so many Finnish people look East Asian?,0,westerners americans europeans perceive chines...,come many finnish people look east asian
99113,What are all of Voldemort's Horcruxes?,Which is the most dangerous Horcrux of Voldemort?,0,voldemort horcruxes,dangerous horcrux voldemort
43455,What ever happened to ActiveBuyersGuide.com?,What ever happened to IntellectualWhores.com?,0,ever happened activebuyersguide com,ever happened intellectualwhores com
271012,Why doesn't Amazon operate in Pakistan?,Is homeschooling better than traditional schoo...,0,amazon operate pakistan,homeschooling better traditional schooling
320121,Is Steve Jobs alive?,What if Steve Jobs were still alive?,0,steve jobs alive,steve jobs still alive


In [None]:
temp_df.shape

(30000, 6012)

In [None]:
model = joblib.load('../Model/xgb_advance.pkl')

In [None]:
from prediction_pipeline import *

In [None]:
np.random.choice(dataset.index)

182104

In [None]:
randomNumber = np.random.choice(dataset.index)	#np.random.randint(0, dataset.shape[0]+1)

q1 = pd.Series(dataset['Q1_cleaned'][randomNumber])
q2 = pd.Series(dataset['Q2_cleaned'][randomNumber])

# q1 = pd.Series('Why are you so happy?')
# q2 = pd.Series('Why is he so sad?')
df = predict(q1, q2)

true_value = dataset['is_duplicate'][randomNumber]
pred = model.predict(df)

print('Question1: ', dataset['question1'][randomNumber])
print('Question2: ', dataset['question2'][randomNumber])
print()
print('True Value: ', true_value)
print('Predicted Value: ', pred[0])

Question1:  What are the safety precautions on handling shotguns proposed by the NRA in Maine?
Question2:  What are the safety precautions on handling shotguns proposed by the NRA in Mississippi?

True Value:  1
Predicted Value:  1


In [None]:
def matchIntent(q1, q2):
  q1 = pd.Series(q1)
  q2 = pd.Series(q2)

  df = predict(q1, q2)
  pred = model.predict(df)

  if pred[0] == 1:
    print('The intent of both the question seems to be same!')
  else:
    print('These are two different questions.')

In [None]:
q1 = 'What are the safety precautions on handling shotguns proposed by the NRA in Maine?'
q2 = 'What are the safety precautions on handling shotguns proposed by the NRA in Mississippi?'

matchIntent(q1, q2)

The intent of both the question seems to be same!
