In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import re
from bs4 import BeautifulSoup

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('questions.csv')
df = df.sample(200000,random_state=20)

In [3]:
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
67050,67050,133306,133307,How much black money would be recovered in 201...,How much black money has been recovered after ...,0
156722,156722,309930,309931,Does a magnetic field have mass?,Can we increase the mass of an electron accele...,0
318365,318365,624667,624668,Why has Dhoni left the captaincy from ODI and ...,Why did M.S.Dhoni left captaincy from ODI & T20?,1
225123,225123,443700,443701,Where can I get best assistance in Sydney for ...,Where can I get highest quality service at exc...,1
256584,256584,505029,505030,As a web developer how can I contribute to ope...,How do I contribute on GitHub?,1


In [4]:
df.shape

(200000, 6)

In [5]:
df.isnull().sum()

id              0
qid1            0
qid2            0
question1       1
question2       1
is_duplicate    0
dtype: int64

In [6]:
df.dropna(inplace = True)

In [7]:
df.isnull().sum()

id              0
qid1            0
qid2            0
question1       0
question2       0
is_duplicate    0
dtype: int64

In [8]:
df.shape

(199998, 6)

In [9]:
def preprocess(q):
    
    q = str(q).lower().strip()
    
    # Replace certain special characters with their string equivalents
    q = q.replace('%', ' percent')
    q = q.replace('$', ' dollar ')
    q = q.replace('₹', ' rupee ')
    q = q.replace('€', ' euro ')
    q = q.replace('@', ' at ')
    
    # The pattern '[math]' appears around 900 times in the whole dataset.
    q = q.replace('[math]', '')
    
    # Replacing some numbers with string equivalents (not perfect, can be done better to account for more cases)
    q = q.replace(',000,000,000 ', 'b ')
    q = q.replace(',000,000 ', 'm ')
    q = q.replace(',000 ', 'k ')
    q = re.sub(r'([0-9]+)000000000', r'\1b', q)
    q = re.sub(r'([0-9]+)000000', r'\1m', q)
    q = re.sub(r'([0-9]+)000', r'\1k', q)

     # Decontracting words
    # https://en.wikipedia.org/wiki/Wikipedia%3aList_of_English_contractions
    # https://stackoverflow.com/a/19794953
    contractions = { 
    "ain't": "am not",
    "aren't": "are not",
    "can't": "can not",
    "can't've": "can not have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he'll've": "he will have",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "i would",
    "i'd've": "i would have",
    "i'll": "i will",
    "i'll've": "i will have",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so as",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there would",
    "there'd've": "there would have",
    "there's": "there is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
     "you've": "you have"
    }

    q_decontracted = []

    for word in q.split():
        if word in contractions:
            word = contractions[word]

        q_decontracted.append(word)

    q = ' '.join(q_decontracted)
    q = q.replace("'ve", " have")
    q = q.replace("n't", " not")
    q = q.replace("'re", " are")
    q = q.replace("'ll", " will")
    
    # Removing HTML tags
    q = BeautifulSoup(q)
    q = q.get_text()
    
    # Remove punctuations
    pattern = re.compile('\W')
    q = re.sub(pattern, ' ', q).strip()

    
    return q

In [10]:
df['question1'] = df['question1'].apply(preprocess)
df['question2'] = df['question2'].apply(preprocess)
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
67050,67050,133306,133307,how much black money would be recovered in 201...,how much black money has been recovered after ...,0
156722,156722,309930,309931,does a magnetic field have mass,can we increase the mass of an electron accele...,0
318365,318365,624667,624668,why has dhoni left the captaincy from odi and t20,why did m s dhoni left captaincy from odi t20,1
225123,225123,443700,443701,where can i get best assistance in sydney for ...,where can i get highest quality service at exc...,1
256584,256584,505029,505030,as a web developer how can i contribute to ope...,how do i contribute on github,1


In [11]:
df['q1_len'] = df['question1'].str.len() 
df['q2_len'] = df['question2'].str.len()
df['q1_num_words'] = df['question1'].apply(lambda row: len(row.split(" ")))
df['q2_num_words'] = df['question2'].apply(lambda row: len(row.split(" ")))
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,q1_len,q2_len,q1_num_words,q2_num_words
67050,67050,133306,133307,how much black money would be recovered in 201...,how much black money has been recovered after ...,0,68,100,11,20
156722,156722,309930,309931,does a magnetic field have mass,can we increase the mass of an electron accele...,0,31,119,6,21
318365,318365,624667,624668,why has dhoni left the captaincy from odi and t20,why did m s dhoni left captaincy from odi t20,1,49,47,10,12
225123,225123,443700,443701,where can i get best assistance in sydney for ...,where can i get highest quality service at exc...,1,69,97,12,15
256584,256584,505029,505030,as a web developer how can i contribute to ope...,how do i contribute on github,1,64,29,13,6


In [12]:
def common_words(row):
    w1 = set(map(lambda word: word.lower().strip(), row['question1'].split(" ")))
    w2 = set(map(lambda word: word.lower().strip(), row['question2'].split(" ")))    
    return len(w1 & w2)

df['word_common'] = df.apply(common_words, axis=1)
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,q1_len,q2_len,q1_num_words,q2_num_words,word_common
67050,67050,133306,133307,how much black money would be recovered in 201...,how much black money has been recovered after ...,0,68,100,11,20,6
156722,156722,309930,309931,does a magnetic field have mass,can we increase the mass of an electron accele...,0,31,119,6,21,4
318365,318365,624667,624668,why has dhoni left the captaincy from odi and t20,why did m s dhoni left captaincy from odi t20,1,49,47,10,12,7
225123,225123,443700,443701,where can i get best assistance in sydney for ...,where can i get highest quality service at exc...,1,69,97,12,15,8
256584,256584,505029,505030,as a web developer how can i contribute to ope...,how do i contribute on github,1,64,29,13,6,5


In [13]:
def total_words(row):
    w1 = set(map(lambda word: word.lower().strip(), row['question1'].split(" ")))
    w2 = set(map(lambda word: word.lower().strip(), row['question2'].split(" ")))    
    return (len(w1) + len(w2))


df['word_total'] = df.apply(total_words, axis=1)
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,q1_len,q2_len,q1_num_words,q2_num_words,word_common,word_total
67050,67050,133306,133307,how much black money would be recovered in 201...,how much black money has been recovered after ...,0,68,100,11,20,6,31
156722,156722,309930,309931,does a magnetic field have mass,can we increase the mass of an electron accele...,0,31,119,6,21,4,25
318365,318365,624667,624668,why has dhoni left the captaincy from odi and t20,why did m s dhoni left captaincy from odi t20,1,49,47,10,12,7,21
225123,225123,443700,443701,where can i get best assistance in sydney for ...,where can i get highest quality service at exc...,1,69,97,12,15,8,27
256584,256584,505029,505030,as a web developer how can i contribute to ope...,how do i contribute on github,1,64,29,13,6,5,19


In [14]:
df['word_share'] = round(df['word_common']/df['word_total'],2)
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,q1_len,q2_len,q1_num_words,q2_num_words,word_common,word_total,word_share
67050,67050,133306,133307,how much black money would be recovered in 201...,how much black money has been recovered after ...,0,68,100,11,20,6,31,0.19
156722,156722,309930,309931,does a magnetic field have mass,can we increase the mass of an electron accele...,0,31,119,6,21,4,25,0.16
318365,318365,624667,624668,why has dhoni left the captaincy from odi and t20,why did m s dhoni left captaincy from odi t20,1,49,47,10,12,7,21,0.33
225123,225123,443700,443701,where can i get best assistance in sydney for ...,where can i get highest quality service at exc...,1,69,97,12,15,8,27,0.3
256584,256584,505029,505030,as a web developer how can i contribute to ope...,how do i contribute on github,1,64,29,13,6,5,19,0.26


In [15]:
# Advanced Features
from nltk.corpus import stopwords

def fetch_token_features(row):
    
    q1 = row['question1']
    q2 = row['question2']
    
    SAFE_DIV = 0.0001 

    STOP_WORDS = stopwords.words("english")
    
    token_features = [0.0]*8
    
    # Converting the Sentence into Tokens: 
    q1_tokens = q1.split()
    q2_tokens = q2.split()
    
    if len(q1_tokens) == 0 or len(q2_tokens) == 0:
        return token_features

    # Get the non-stopwords in Questions
    q1_words = set([word for word in q1_tokens if word not in STOP_WORDS])
    q2_words = set([word for word in q2_tokens if word not in STOP_WORDS])
    
    #Get the stopwords in Questions
    q1_stops = set([word for word in q1_tokens if word in STOP_WORDS])
    q2_stops = set([word for word in q2_tokens if word in STOP_WORDS])

     # Get the common non-stopwords from Question pair
    common_word_count = len(q1_words.intersection(q2_words))
    
    # Get the common stopwords from Question pair
    common_stop_count = len(q1_stops.intersection(q2_stops))
    
    # Get the common Tokens from Question pair
    common_token_count = len(set(q1_tokens).intersection(set(q2_tokens)))
    
    
    token_features[0] = common_word_count / (min(len(q1_words), len(q2_words)) + SAFE_DIV)
    token_features[1] = common_word_count / (max(len(q1_words), len(q2_words)) + SAFE_DIV)
    token_features[2] = common_stop_count / (min(len(q1_stops), len(q2_stops)) + SAFE_DIV)
    token_features[3] = common_stop_count / (max(len(q1_stops), len(q2_stops)) + SAFE_DIV)
    token_features[4] = common_token_count / (min(len(q1_tokens), len(q2_tokens)) + SAFE_DIV)
    token_features[5] = common_token_count / (max(len(q1_tokens), len(q2_tokens)) + SAFE_DIV)
    
    # Last word of both question is same or not
    token_features[6] = int(q1_tokens[-1] == q2_tokens[-1])
    
    # First word of both question is same or not
    token_features[7] = int(q1_tokens[0] == q2_tokens[0])
    
    return token_features

In [16]:
token_features = df.apply(fetch_token_features, axis=1)

df["cwc_min"]       = list(map(lambda x: x[0], token_features))
df["cwc_max"]       = list(map(lambda x: x[1], token_features))
df["csc_min"]       = list(map(lambda x: x[2], token_features))
df["csc_max"]       = list(map(lambda x: x[3], token_features))
df["ctc_min"]       = list(map(lambda x: x[4], token_features))
df["ctc_max"]       = list(map(lambda x: x[5], token_features))
df["last_word_eq"]  = list(map(lambda x: x[6], token_features))
df["first_word_eq"] = list(map(lambda x: x[7], token_features))

In [17]:
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,q1_len,q2_len,q1_num_words,q2_num_words,...,word_total,word_share,cwc_min,cwc_max,csc_min,csc_max,ctc_min,ctc_max,last_word_eq,first_word_eq
67050,67050,133306,133307,how much black money would be recovered in 201...,how much black money has been recovered after ...,0,68,100,11,20,...,31,0.19,0.57142,0.363633,0.499988,0.249997,0.54545,0.315788,0.0,1.0
156722,156722,309930,309931,does a magnetic field have mass,can we increase the mass of an electron accele...,0,31,119,6,21,...,25,0.16,0.999967,0.299997,0.333322,0.124998,0.666656,0.199999,0.0,0.0
318365,318365,624667,624668,why has dhoni left the captaincy from odi and t20,why did m s dhoni left captaincy from odi t20,1,49,47,10,12,...,21,0.33,0.99998,0.99998,0.399992,0.399992,0.699993,0.699993,1.0,1.0
225123,225123,443700,443701,where can i get best assistance in sydney for ...,where can i get highest quality service at exc...,1,69,97,12,15,...,27,0.3,0.499992,0.33333,0.833319,0.833319,0.666661,0.53333,0.0,1.0
256584,256584,505029,505030,as a web developer how can i contribute to ope...,how do i contribute on github,1,64,29,13,6,...,19,0.26,0.99995,0.333328,0.749981,0.428565,0.833319,0.384612,1.0,0.0


In [18]:
import distance

def fetch_length_features(row):
    
    q1 = row['question1']
    q2 = row['question2']
    
    length_features = [0.0]*3
    
    # Converting the Sentence into Tokens: 
    q1_tokens = q1.split()
    q2_tokens = q2.split()
    
    if len(q1_tokens) == 0 or len(q2_tokens) == 0:
        return length_features
    
    # Absolute length features
    length_features[0] = abs(len(q1_tokens) - len(q2_tokens))
    
    #Average Token Length of both Questions
    length_features[1] = (len(q1_tokens) + len(q2_tokens))/2
    
    strs = list(distance.lcsubstrings(q1, q2))
    if strs:
        length_features[2] = len(strs[0]) / (min(len(q1), len(q2)) + 1)
    else:
        length_features[2] = length_features
    
    return length_features

In [19]:
length_features = df.apply(fetch_length_features, axis=1)

df['abs_len_diff'] = list(map(lambda x: x[0], length_features))
df['mean_len'] = list(map(lambda x: x[1], length_features))
df['longest_substr_ratio'] = list(map(lambda x: x[2], length_features))

df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,q1_len,q2_len,q1_num_words,q2_num_words,...,cwc_max,csc_min,csc_max,ctc_min,ctc_max,last_word_eq,first_word_eq,abs_len_diff,mean_len,longest_substr_ratio
67050,67050,133306,133307,how much black money would be recovered in 201...,how much black money has been recovered after ...,0,68,100,11,20,...,0.363633,0.499988,0.249997,0.54545,0.315788,0.0,1.0,8.0,15.0,0.304348
156722,156722,309930,309931,does a magnetic field have mass,can we increase the mass of an electron accele...,0,31,119,6,21,...,0.299997,0.333322,0.124998,0.666656,0.199999,0.0,0.0,14.0,13.0,0.46875
318365,318365,624667,624668,why has dhoni left the captaincy from odi and t20,why did m s dhoni left captaincy from odi t20,1,49,47,10,12,...,0.99998,0.399992,0.399992,0.699993,0.699993,1.0,1.0,0.0,10.0,0.416667
225123,225123,443700,443701,where can i get best assistance in sydney for ...,where can i get highest quality service at exc...,1,69,97,12,15,...,0.33333,0.833319,0.833319,0.666661,0.53333,0.0,1.0,3.0,13.5,0.228571
256584,256584,505029,505030,as a web developer how can i contribute to ope...,how do i contribute on github,1,64,29,13,6,...,0.333328,0.749981,0.428565,0.833319,0.384612,1.0,0.0,7.0,9.5,0.466667


In [20]:
# Fuzzy Features
from fuzzywuzzy import fuzz

def fetch_fuzzy_features(row):
    
    q1 = row['question1']
    q2 = row['question2']
    
    fuzzy_features = [0.0]*4
    
    # fuzz_ratio
    fuzzy_features[0] = fuzz.QRatio(q1, q2)

    # fuzz_partial_ratio
    fuzzy_features[1] = fuzz.partial_ratio(q1, q2)

    # token_sort_ratio
    fuzzy_features[2] = fuzz.token_sort_ratio(q1, q2)

    # token_set_ratio
    fuzzy_features[3] = fuzz.token_set_ratio(q1, q2)

    return fuzzy_features

In [21]:
fuzzy_features = df.apply(fetch_fuzzy_features, axis=1)

# Creating new feature columns for fuzzy features
df['fuzz_ratio'] = list(map(lambda x: x[0], fuzzy_features))
df['fuzz_partial_ratio'] = list(map(lambda x: x[1], fuzzy_features))
df['token_sort_ratio'] = list(map(lambda x: x[2], fuzzy_features))
df['token_set_ratio'] = list(map(lambda x: x[3], fuzzy_features))

print(df.shape)
df.head()

(199998, 28)


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,q1_len,q2_len,q1_num_words,q2_num_words,...,ctc_max,last_word_eq,first_word_eq,abs_len_diff,mean_len,longest_substr_ratio,fuzz_ratio,fuzz_partial_ratio,token_sort_ratio,token_set_ratio
67050,67050,133306,133307,how much black money would be recovered in 201...,how much black money has been recovered after ...,0,68,100,11,20,...,0.315788,0.0,1.0,8.0,15.0,0.304348,64,79,72,74
156722,156722,309930,309931,does a magnetic field have mass,can we increase the mass of an electron accele...,0,31,119,6,21,...,0.199999,0.0,0.0,14.0,13.0,0.46875,25,65,38,81
318365,318365,624667,624668,why has dhoni left the captaincy from odi and t20,why did m s dhoni left captaincy from odi t20,1,49,47,10,12,...,0.699993,1.0,1.0,0.0,10.0,0.416667,83,81,79,90
225123,225123,443700,443701,where can i get best assistance in sydney for ...,where can i get highest quality service at exc...,1,69,97,12,15,...,0.53333,0.0,1.0,3.0,13.5,0.228571,65,56,54,71
256584,256584,505029,505030,as a web developer how can i contribute to ope...,how do i contribute on github,1,64,29,13,6,...,0.384612,1.0,0.0,7.0,9.5,0.466667,58,72,62,95


In [22]:
import gensim
from nltk import sent_tokenize
from gensim.utils import simple_preprocess

In [23]:
def preprocess_for_w2v(text):
    text = str(text)
    return text.lower().split()

In [24]:
ques_1 = df['question1'].apply(preprocess_for_w2v)
ques_2 = df['question2'].apply(preprocess_for_w2v)
story = ques_1 + ques_2
story

67050     [how, much, black, money, would, be, recovered...
156722    [does, a, magnetic, field, have, mass, can, we...
318365    [why, has, dhoni, left, the, captaincy, from, ...
225123    [where, can, i, get, best, assistance, in, syd...
256584    [as, a, web, developer, how, can, i, contribut...
                                ...                        
23393     [why, do, some, people, prefer, to, watch, pre...
302753    [which, fruits, or, vegetables, should, be, ea...
59404     [what, is, pallavi, anupallavi, and, charanam,...
79394     [if, trump, was, not, born, wealthy, and, had,...
366117    [i, am, a, minor, my, parents, have, been, fig...
Length: 199998, dtype: object

In [25]:
model = gensim.models.Word2Vec(vector_size=300, window=10, min_count=2, epochs=10)

In [26]:
model.build_vocab(story)

In [27]:
model.train(story, total_examples=model.corpus_count, epochs=model.epochs)

(31775889, 45015550)

In [28]:
len(model.wv.index_to_key)

38446

In [29]:
def document_vector(doc):
    # remove out-of-vocabulary words
    doc = [word for word in doc.split() if word in model.wv.index_to_key]
    if len(doc) == 0:
        return np.zeros(model.vector_size)  # Return a zero vector if no valid words
    return np.mean(model.wv[doc], axis=0)

In [30]:
from tqdm import tqdm
q1 = []
for doc in tqdm(df['question1'].values):
    q1.append(document_vector(doc))

100%|█████████████████████████████████████████████████████████████████████████| 199998/199998 [05:26<00:00, 611.83it/s]


In [31]:
q2 = []
for doc in tqdm(df['question2'].values):
    q2.append(document_vector(doc))

100%|█████████████████████████████████████████████████████████████████████████| 199998/199998 [05:00<00:00, 665.33it/s]


In [32]:
q1 = np.array(q1)
q1

array([[ 0.40825072, -0.09912595, -0.51878774, ...,  0.150473  ,
        -0.47094202, -0.62864369],
       [-0.62509924, -0.61352843, -0.1137698 , ..., -1.31178892,
        -0.90050811,  0.00830886],
       [ 0.0229643 ,  0.68753552, -1.07763171, ..., -0.78560394,
         0.15064242,  0.2418164 ],
       ...,
       [-0.04090406,  0.06441434, -0.17042761, ..., -0.15917854,
        -0.56225914, -0.28561106],
       [-0.44355088,  0.10884364, -0.27806392, ...,  0.15839076,
        -0.51798373,  0.4572868 ],
       [-0.85735393,  0.30801699, -0.96859837, ..., -0.31635484,
        -0.50448382,  0.61216575]])

In [33]:
q2 = np.array(q2)
q2

array([[ 4.27371323e-01,  4.00154829e-01, -5.37906766e-01, ...,
        -9.42157209e-02, -3.48312944e-01,  1.96305275e-01],
       [-4.46544409e-01, -5.86572587e-02,  2.84419835e-01, ...,
        -2.25683764e-01, -7.37317324e-01, -2.48043031e-01],
       [ 8.31904039e-02,  2.91406661e-01, -1.58445811e+00, ...,
        -1.01766801e+00, -1.45258784e-01,  5.03102601e-01],
       ...,
       [ 4.02001977e-01,  7.11710155e-01, -4.33822311e-02, ...,
         1.00149214e-01,  2.01766431e-01, -4.53181744e-01],
       [-6.42033443e-02, -1.05950783e-03, -2.20624432e-01, ...,
         8.13645050e-02, -5.92739999e-01,  2.34878123e-01],
       [-3.99110228e-01,  1.80399552e-01, -7.89935708e-01, ...,
        -3.49011496e-02, -7.67115295e-01,  5.23887575e-01]])

In [34]:
print(q1.shape)
print(q2.shape)

(199998, 300)
(199998, 300)


In [35]:
# Create DataFrames from NumPy arrays
w2v_df_q1 = pd.DataFrame(q1, columns=[f'w2v_q1_{i}' for i in range(q1.shape[1])])
w2v_df_q2 = pd.DataFrame(q2, columns=[f'w2v_q2_{i}' for i in range(q2.shape[1])])

In [37]:
w2v_df_q1

Unnamed: 0,w2v_q1_0,w2v_q1_1,w2v_q1_2,w2v_q1_3,w2v_q1_4,w2v_q1_5,w2v_q1_6,w2v_q1_7,w2v_q1_8,w2v_q1_9,...,w2v_q1_290,w2v_q1_291,w2v_q1_292,w2v_q1_293,w2v_q1_294,w2v_q1_295,w2v_q1_296,w2v_q1_297,w2v_q1_298,w2v_q1_299
0,0.408251,-0.099126,-0.518788,0.807419,-0.630124,-0.436053,-0.557988,0.137854,-0.346078,0.403664,...,-0.110513,0.483008,-0.134914,1.273940,0.541537,0.353931,-0.314391,0.150473,-0.470942,-0.628644
1,-0.625099,-0.613528,-0.113770,1.359711,-0.312611,0.146388,-0.952952,-0.577094,-0.594156,-0.074704,...,-0.188411,0.032448,1.014385,-0.027904,-0.308149,0.264546,0.070570,-1.311789,-0.900508,0.008309
2,0.022964,0.687536,-1.077632,0.167828,-0.387949,-0.371010,-0.059383,0.304711,-0.513844,0.614526,...,0.040280,0.341068,0.531009,-0.144183,0.271762,0.233495,-0.012375,-0.785604,0.150642,0.241816
3,-0.655306,0.704583,0.057471,0.537470,-0.522217,0.168312,-0.719647,-0.290330,-0.508361,-0.234510,...,-0.681928,-0.414441,0.289619,0.278693,0.632768,-0.419662,-0.207049,0.035833,-0.101162,0.400287
4,-0.839511,-0.390187,0.311445,0.598654,-0.137514,0.062045,-0.508460,-0.200370,-0.259981,0.241290,...,-0.480113,-0.917441,0.009183,0.711517,0.483728,-0.513020,-0.293525,0.212670,-0.712542,0.038061
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199993,0.630922,0.423625,-0.982027,0.217450,-0.527610,-0.744059,0.606943,-0.249922,-0.402748,-0.175492,...,-0.570813,-0.346289,-0.257765,0.200424,-0.052809,-0.153775,-0.725219,-0.418886,-0.060348,-0.099253
199994,-0.098605,0.177446,-0.323583,0.796582,-0.537914,-0.511198,0.525983,-0.012113,0.319358,-0.082156,...,-0.069640,-0.322265,-0.233862,-0.035279,0.290924,-0.492234,-0.204081,-0.080349,-0.377371,-0.387732
199995,-0.040904,0.064414,-0.170428,0.620909,-0.336230,-0.010155,-0.203578,-0.256912,-0.172561,0.223064,...,0.012699,-0.197405,-0.097291,0.700352,0.712381,0.101591,-0.296465,-0.159179,-0.562259,-0.285611
199996,-0.443551,0.108844,-0.278064,-0.139325,-0.485336,-0.509246,-0.179617,-0.337552,-0.160974,-0.103044,...,0.017674,0.043523,0.274274,0.345098,0.411571,-0.146832,-0.514933,0.158391,-0.517984,0.457287


In [38]:
w2v_df_q2

Unnamed: 0,w2v_q2_0,w2v_q2_1,w2v_q2_2,w2v_q2_3,w2v_q2_4,w2v_q2_5,w2v_q2_6,w2v_q2_7,w2v_q2_8,w2v_q2_9,...,w2v_q2_290,w2v_q2_291,w2v_q2_292,w2v_q2_293,w2v_q2_294,w2v_q2_295,w2v_q2_296,w2v_q2_297,w2v_q2_298,w2v_q2_299
0,0.427371,0.400155,-0.537907,0.794622,-0.443694,-0.322544,0.007568,-0.020581,-0.493553,0.192701,...,0.187181,0.899955,0.230219,0.901589,0.085242,0.317505,-0.385086,-0.094216,-0.348313,0.196305
1,-0.446544,-0.058657,0.284420,0.831508,0.171402,-0.070425,-0.532165,-0.282180,-0.539858,0.017095,...,-0.380505,0.201228,0.508731,0.313068,-0.035627,0.343254,-0.274848,-0.225684,-0.737317,-0.248043
2,0.083190,0.291407,-1.584458,-0.203334,-0.504688,-0.378328,0.179308,0.328868,-0.326977,0.378099,...,0.079383,0.155246,0.306156,0.294689,0.189529,0.275116,0.080980,-1.017668,-0.145259,0.503103
3,-0.300961,0.527646,0.057936,0.735649,-0.442252,0.816016,-0.684017,-0.340313,-0.630977,-0.137384,...,-0.610371,-0.449583,0.287909,0.179781,0.306805,-0.075594,-0.445308,-0.002530,-0.076152,0.628191
4,-0.660743,-0.281215,0.051541,0.324349,-0.143890,0.276282,-0.244699,-0.615617,-0.320406,0.317098,...,0.085550,-0.662203,0.284297,0.595672,0.283072,-0.621653,0.112158,0.179823,-0.719757,0.523645
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199993,0.344692,0.352510,-1.123874,0.104021,-0.336418,-0.671111,0.604629,0.049842,-0.179438,-0.151306,...,-0.240442,-0.277096,-0.370234,0.006134,-0.297467,-0.271082,-1.064430,-0.359696,0.089268,0.046537
199994,-0.082570,0.154190,-0.273704,0.738369,-0.435710,-0.560033,0.492791,-0.027761,0.292112,-0.081139,...,-0.021913,-0.284136,-0.224410,-0.023680,0.271843,-0.476745,-0.171429,-0.103796,-0.332965,-0.344440
199995,0.402002,0.711710,-0.043382,0.647240,-0.277880,-0.210373,0.384052,0.280111,0.034947,0.229074,...,-0.331516,0.401695,-0.082822,0.492297,0.547155,0.446216,-0.523335,0.100149,0.201766,-0.453182
199996,-0.064203,-0.001060,-0.220624,0.227762,-0.474811,-0.190730,-0.167221,-0.425227,-0.207653,-0.123546,...,-0.170384,-0.409367,-0.206726,0.488282,0.290509,-0.313760,-0.622632,0.081365,-0.592740,0.234878


In [39]:
# Combine the new features back into the original DataFrame
df = pd.concat([
    df.reset_index(drop=True), 
    w2v_df_q1.reset_index(drop=True), 
    w2v_df_q2.reset_index(drop=True)
], axis=1)

In [40]:
df

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,q1_len,q2_len,q1_num_words,q2_num_words,...,w2v_q2_290,w2v_q2_291,w2v_q2_292,w2v_q2_293,w2v_q2_294,w2v_q2_295,w2v_q2_296,w2v_q2_297,w2v_q2_298,w2v_q2_299
0,67050,133306,133307,how much black money would be recovered in 201...,how much black money has been recovered after ...,0,68,100,11,20,...,0.187181,0.899955,0.230219,0.901589,0.085242,0.317505,-0.385086,-0.094216,-0.348313,0.196305
1,156722,309930,309931,does a magnetic field have mass,can we increase the mass of an electron accele...,0,31,119,6,21,...,-0.380505,0.201228,0.508731,0.313068,-0.035627,0.343254,-0.274848,-0.225684,-0.737317,-0.248043
2,318365,624667,624668,why has dhoni left the captaincy from odi and t20,why did m s dhoni left captaincy from odi t20,1,49,47,10,12,...,0.079383,0.155246,0.306156,0.294689,0.189529,0.275116,0.080980,-1.017668,-0.145259,0.503103
3,225123,443700,443701,where can i get best assistance in sydney for ...,where can i get highest quality service at exc...,1,69,97,12,15,...,-0.610371,-0.449583,0.287909,0.179781,0.306805,-0.075594,-0.445308,-0.002530,-0.076152,0.628191
4,256584,505029,505030,as a web developer how can i contribute to ope...,how do i contribute on github,1,64,29,13,6,...,0.085550,-0.662203,0.284297,0.595672,0.283072,-0.621653,0.112158,0.179823,-0.719757,0.523645
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199993,23393,46673,46674,why do some people prefer to watch precure to ...,some people love to watch horror movies but w...,0,63,92,12,17,...,-0.240442,-0.277096,-0.370234,0.006134,-0.297467,-0.271082,-1.064430,-0.359696,0.089268,0.046537
199994,302753,594543,594544,which fruits or vegetables should be eaten to ...,which fruits or vegetables should be eaten reg...,1,58,68,10,11,...,-0.021913,-0.284136,-0.224410,-0.023680,0.271843,-0.476745,-0.171429,-0.103796,-0.332965,-0.344440
199995,59404,118166,118167,what is pallavi anupallavi and charanam in a ...,is the song jotheyale from geetha the most lov...,0,185,136,40,26,...,-0.331516,0.401695,-0.082822,0.492297,0.547155,0.446216,-0.523335,0.100149,0.201766,-0.453182
199996,79394,157720,157721,if trump was not born wealthy and had not inhe...,if a more handsome and wealthy man wanted to h...,0,135,163,28,34,...,-0.170384,-0.409367,-0.206726,0.488282,0.290509,-0.313760,-0.622632,0.081365,-0.592740,0.234878


In [41]:
df.drop(columns=['id','qid1','qid2', 'question1', 'question2'], inplace=True)

In [46]:
df

Unnamed: 0,is_duplicate,q1_len,q2_len,q1_num_words,q2_num_words,word_common,word_total,word_share,cwc_min,cwc_max,...,w2v_q2_290,w2v_q2_291,w2v_q2_292,w2v_q2_293,w2v_q2_294,w2v_q2_295,w2v_q2_296,w2v_q2_297,w2v_q2_298,w2v_q2_299
0,0,68,100,11,20,6,31,0.19,0.571420,0.363633,...,0.187181,0.899955,0.230219,0.901589,0.085242,0.317505,-0.385086,-0.094216,-0.348313,0.196305
1,0,31,119,6,21,4,25,0.16,0.999967,0.299997,...,-0.380505,0.201228,0.508731,0.313068,-0.035627,0.343254,-0.274848,-0.225684,-0.737317,-0.248043
2,1,49,47,10,12,7,21,0.33,0.999980,0.999980,...,0.079383,0.155246,0.306156,0.294689,0.189529,0.275116,0.080980,-1.017668,-0.145259,0.503103
3,1,69,97,12,15,8,27,0.30,0.499992,0.333330,...,-0.610371,-0.449583,0.287909,0.179781,0.306805,-0.075594,-0.445308,-0.002530,-0.076152,0.628191
4,1,64,29,13,6,5,19,0.26,0.999950,0.333328,...,0.085550,-0.662203,0.284297,0.595672,0.283072,-0.621653,0.112158,0.179823,-0.719757,0.523645
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199993,0,63,92,12,17,5,28,0.18,0.333328,0.222220,...,-0.240442,-0.277096,-0.370234,0.006134,-0.297467,-0.271082,-1.064430,-0.359696,0.089268,0.046537
199994,1,58,68,10,11,10,21,0.48,0.999980,0.833319,...,-0.021913,-0.284136,-0.224410,-0.023680,0.271843,-0.476745,-0.171429,-0.103796,-0.332965,-0.344440
199995,0,185,136,40,26,8,47,0.17,0.333330,0.214284,...,-0.331516,0.401695,-0.082822,0.492297,0.547155,0.446216,-0.523335,0.100149,0.201766,-0.453182
199996,0,135,163,28,34,11,58,0.19,0.181817,0.166665,...,-0.170384,-0.409367,-0.206726,0.488282,0.290509,-0.313760,-0.622632,0.081365,-0.592740,0.234878


In [51]:
def has_sequence(cell):
    return isinstance(cell, (list, np.ndarray))

for col in df.columns:
    if df[col].apply(has_sequence).any():
        print(f"Column '{col}' contains sequences!")

Column 'longest_substr_ratio' contains sequences!


In [52]:
df['longest_substr_ratio'] = df['longest_substr_ratio'].apply(lambda x: x[0] if isinstance(x, (list, np.ndarray)) else x)

In [53]:
X = df.drop(columns=['is_duplicate'])
y = df['is_duplicate']

In [54]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [55]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(159998, 622)
(40000, 622)
(159998,)
(40000,)


In [56]:
# Convert to dense array
X_train_dense = X_train.values
X_test_dense = X_test.values

In [57]:
from sklearn.metrics import accuracy_score, classification_report
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, Dropout, Bidirectional, SpatialDropout1D
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [58]:
# Define the RNN model
rnn_model = Sequential()
rnn_model.add(LSTM(128, input_shape=(X_train_dense.shape[1], 1), return_sequences=True))  # First LSTM layer
rnn_model.add(Bidirectional(LSTM(64, return_sequences=False)))  # Second LSTM layer
rnn_model.add(Dense(32, activation='relu'))  # Fully connected layer
rnn_model.add(Dropout(0.5))  # Dropout layer for regularization
rnn_model.add(Dense(1, activation='sigmoid'))  # Output layer for binary classification

# Compile the model
rnn_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [59]:
rnn_model.summary()

In [60]:
rnn_model.fit(X_train_dense, y_train, epochs=10, batch_size=32, validation_split=0.2, verbose=2)

Epoch 1/10
2000/2000 - 1416s - 708ms/step - accuracy: 0.6645 - loss: 0.5534 - val_accuracy: 0.6936 - val_loss: 0.5326
Epoch 2/10
2000/2000 - 2813s - 1s/step - accuracy: 0.7065 - loss: 0.5177 - val_accuracy: 0.7350 - val_loss: 0.4909
Epoch 3/10
2000/2000 - 2117s - 1s/step - accuracy: 0.7338 - loss: 0.4872 - val_accuracy: 0.7374 - val_loss: 0.4795
Epoch 4/10
2000/2000 - 1487s - 743ms/step - accuracy: 0.7465 - loss: 0.4733 - val_accuracy: 0.7594 - val_loss: 0.4548
Epoch 5/10
2000/2000 - 1375s - 687ms/step - accuracy: 0.7554 - loss: 0.4627 - val_accuracy: 0.7589 - val_loss: 0.4541
Epoch 6/10
2000/2000 - 1360s - 680ms/step - accuracy: 0.7610 - loss: 0.4576 - val_accuracy: 0.7602 - val_loss: 0.4575
Epoch 7/10
2000/2000 - 2495s - 1s/step - accuracy: 0.7621 - loss: 0.4544 - val_accuracy: 0.7710 - val_loss: 0.4443
Epoch 8/10
2000/2000 - 1495s - 747ms/step - accuracy: 0.7709 - loss: 0.4443 - val_accuracy: 0.7750 - val_loss: 0.4389
Epoch 9/10
2000/2000 - 1399s - 699ms/step - accuracy: 0.7731 - lo

<keras.src.callbacks.history.History at 0x1ff70fa4fb0>

In [104]:
# Making predictions
y_pred_rnn = rnn_model.predict(X_test_dense)

[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m225s[0m 359ms/step


In [114]:
y_pred_rnn_binary = [1 if pred >= 0.5 else 0 for pred in y_pred_rnn]
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred_rnn_binary)
print("RNN (LSTM) Accuracy:", accuracy)
print("Classification Report:\n", classification_report(y_test, y_pred_rnn_binary))

RNN (LSTM) Accuracy: 0.767
Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.74      0.80     12694
           1       0.64      0.81      0.72      7306

    accuracy                           0.77     20000
   macro avg       0.76      0.78      0.76     20000
weighted avg       0.79      0.77      0.77     20000



In [56]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
rf = RandomForestClassifier()
rf.fit(X_train,y_train)
y_pred = rf.predict(X_test)
accuracy_score(y_test,y_pred)

0.817

In [57]:
from xgboost import XGBClassifier
xgb = XGBClassifier()
xgb.fit(X_train,y_train)
y_pred1 = xgb.predict(X_test)
accuracy_score(y_test,y_pred1)

0.8144

In [58]:
from sklearn.metrics import confusion_matrix

In [59]:
# for random forest model
confusion_matrix(y_test,y_pred)

array([[21937,  3376],
       [ 3944, 10743]], dtype=int64)

In [60]:
# for xgboost model
confusion_matrix(y_test,y_pred1)

array([[21653,  3660],
       [ 3764, 10923]], dtype=int64)

In [61]:
import pickle

pickle.dump(rf,open('rf_model.pkl','wb'))
pickle.dump(model,open('w2v_model.pkl','wb'))

In [62]:
pickle.dump(rf,open('xgb_model.pkl','wb'))

In [63]:
def test_common_words(q1,q2):
    w1 = set(map(lambda word: word.lower().strip(), q1.split(" ")))
    w2 = set(map(lambda word: word.lower().strip(), q2.split(" ")))    
    return len(w1 & w2)

In [64]:
def test_total_words(q1,q2):
    w1 = set(map(lambda word: word.lower().strip(), q1.split(" ")))
    w2 = set(map(lambda word: word.lower().strip(), q2.split(" ")))    
    return (len(w1) + len(w2))

In [65]:
def test_fetch_token_features(q1,q2):
    
    SAFE_DIV = 0.0001 

    STOP_WORDS = stopwords.words("english")
    
    token_features = [0.0]*8
    
    # Converting the Sentence into Tokens: 
    q1_tokens = q1.split()
    q2_tokens = q2.split()
    
    if len(q1_tokens) == 0 or len(q2_tokens) == 0:
        return token_features

    # Get the non-stopwords in Questions
    q1_words = set([word for word in q1_tokens if word not in STOP_WORDS])
    q2_words = set([word for word in q2_tokens if word not in STOP_WORDS])
    
    #Get the stopwords in Questions
    q1_stops = set([word for word in q1_tokens if word in STOP_WORDS])
    q2_stops = set([word for word in q2_tokens if word in STOP_WORDS])
    
    # Get the common non-stopwords from Question pair
    common_word_count = len(q1_words.intersection(q2_words))

      # Get the common stopwords from Question pair
    common_stop_count = len(q1_stops.intersection(q2_stops))
    
    # Get the common Tokens from Question pair
    common_token_count = len(set(q1_tokens).intersection(set(q2_tokens)))
    
    
    token_features[0] = common_word_count / (min(len(q1_words), len(q2_words)) + SAFE_DIV)
    token_features[1] = common_word_count / (max(len(q1_words), len(q2_words)) + SAFE_DIV)
    token_features[2] = common_stop_count / (min(len(q1_stops), len(q2_stops)) + SAFE_DIV)
    token_features[3] = common_stop_count / (max(len(q1_stops), len(q2_stops)) + SAFE_DIV)
    token_features[4] = common_token_count / (min(len(q1_tokens), len(q2_tokens)) + SAFE_DIV)
    token_features[5] = common_token_count / (max(len(q1_tokens), len(q2_tokens)) + SAFE_DIV)
    
    # Last word of both question is same or not
    token_features[6] = int(q1_tokens[-1] == q2_tokens[-1])
    
    # First word of both question is same or not
    token_features[7] = int(q1_tokens[0] == q2_tokens[0])
    
    return token_features

In [66]:
def test_fetch_length_features(q1,q2):
    
    length_features = [0.0]*3
    
    # Converting the Sentence into Tokens: 
    q1_tokens = q1.split()
    q2_tokens = q2.split()
    
    if len(q1_tokens) == 0 or len(q2_tokens) == 0:
        return length_features
    
    # Absolute length features
    length_features[0] = abs(len(q1_tokens) - len(q2_tokens))
    
    #Average Token Length of both Questions
    length_features[1] = (len(q1_tokens) + len(q2_tokens))/2
    
    strs = list(distance.lcsubstrings(q1, q2))
    length_features[2] = len(strs[0]) / (min(len(q1), len(q2)) + 1)
    
    return length_features

In [67]:
def test_fetch_fuzzy_features(q1,q2):
    
    fuzzy_features = [0.0]*4
    
    # fuzz_ratio
    fuzzy_features[0] = fuzz.QRatio(q1, q2)

    # fuzz_partial_ratio
    fuzzy_features[1] = fuzz.partial_ratio(q1, q2)

    # token_sort_ratio
    fuzzy_features[2] = fuzz.token_sort_ratio(q1, q2)

    # token_set_ratio
    fuzzy_features[3] = fuzz.token_set_ratio(q1, q2)

    return fuzzy_features

In [68]:
def query_point_creator(q1,q2):
    
    input_query = []
    
    # preprocess
    q1 = preprocess(q1)
    q2 = preprocess(q2)
    
    # fetch basic features
    input_query.append(len(q1))
    input_query.append(len(q2))
    
    input_query.append(len(q1.split(" ")))
    input_query.append(len(q2.split(" ")))
    
    input_query.append(test_common_words(q1,q2))
    input_query.append(test_total_words(q1,q2))
    input_query.append(round(test_common_words(q1,q2)/test_total_words(q1,q2),2))
    
    # fetch token features
    token_features = test_fetch_token_features(q1,q2)
    input_query.extend(token_features)
    
    # fetch length based features
    length_features = test_fetch_length_features(q1,q2)
    input_query.extend(length_features)

     # fetch fuzzy features
    fuzzy_features = test_fetch_fuzzy_features(q1,q2)
    input_query.extend(fuzzy_features)

    # w2v feature for q1
    q1_w2v = document_vector(q1)
    
    # w2v feature for q2
    q2_w2v = document_vector(q2)
    
    
    return np.hstack((np.array(input_query),q1_w2v,q2_w2v)).reshape(1, -1)
    

In [129]:
q1 = 'How can I improve my English vocabulary?'
q2 = 'What are the best ways to enhance English vocabulary skills?'
q3 = 'Which city serves as the capital of India?'
q4 = 'What is the business capital of India?'
q5 = 'Where is the capital of India?'
q6 = 'What is the current capital of India?'

In [131]:
output = rf.predict(query_point_creator(q5,q6))
output

array([0], dtype=int64)

In [133]:
output = xgb.predict(query_point_creator(q5,q6))
output

array([1])