# Importing Data and Libraries

In [1]:
import warnings
# NOTE(review): with no category/module filter this suppresses every warning for the
# rest of the session, including deprecation and convergence warnings from the
# libraries used below.
warnings.filterwarnings("ignore")

In [76]:
import pandas as pd
import numpy as np
import re
import string
import xgboost as xgb
import nltk
import pickle
from gensim.models import Word2Vec
from fuzzywuzzy import fuzz
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier

In [5]:
# Load the question-pair CSV into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the notebook will not run
# on another machine until it is pointed at a local copy of questions.csv.
df = pd.read_csv('/home/shavak/Prince-Sem3-BScDS/questions.csv')
# Preview the first rows.
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [6]:
# Drop rows where either question is missing.
df = df.dropna(subset=['question1', 'question2'])

In [7]:
# Keep only pairs in which neither question is empty after trimming whitespace.
q1_not_blank = df['question1'].str.strip() != ''
q2_not_blank = df['question2'].str.strip() != ''
df = df[q1_not_blank & q2_not_blank]

In [8]:
# Keep only pairs in which both questions are at most 500 characters long.
q1_short_enough = df['question1'].str.len() <= 500
q2_short_enough = df['question2'].str.len() <= 500
df = df[q1_short_enough & q2_short_enough]

In [9]:
# Discard every pair in which either question contains at least one digit.
q1_has_digit = df['question1'].str.contains(r'\d')
q2_has_digit = df['question2'].str.contains(r'\d')
df = df[~q1_has_digit & ~q2_has_digit]

In [10]:
# Keep only rows where both question values are Python str instances.
# NOTE(review): the preceding cells already applied .str filters to these columns,
# so this check may be redundant at this point — confirm before removing.
df = df[df['question1'].apply(lambda x: isinstance(x, str)) &  df['question2'].apply(lambda x: isinstance(x, str))]

In [11]:
# Renumber rows 0..n-1 after filtering; the old index is discarded (drop=True).
# Later cells concatenate feature frames column-wise, which aligns on this index.
df = df.reset_index(drop=True)

In [12]:
df

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0
4,5,11,12,Astrology: I am a Capricorn Sun Cap moon and c...,"I'm a triple Capricorn (Sun, Moon and ascendan...",1
...,...,...,...,...,...,...
340047,404346,789792,789793,How many keywords are there in the Racket prog...,How many keywords are there in PERL Programmin...,0
340048,404347,789794,789795,Do you believe there is life after death?,Is it true that there is life after death?,1
340049,404348,789796,789797,What is one coin?,What's this coin?,0
340050,404349,789798,789799,What is the approx annual cost of living while...,I am having little hairfall problem but I want...,0


# Check for Missing Values

In [13]:
df.shape

(340052, 6)

In [14]:
df.isnull().sum()

id              0
qid1            0
qid2            0
question1       0
question2       0
is_duplicate    0
dtype: int64

In [15]:
# Drop any row that still has a missing value in any column.
# NOTE(review): the df.isnull().sum() output just above reports zero missing values,
# so this appears to be a no-op on the current data.
df = df.dropna()

In [16]:
df.shape

(340052, 6)

# Inspecting Data Uniqueness and Distribution

In [17]:
# Count rows that are exact duplicates of an earlier row
duplicate_row_count = df.duplicated().sum()
print(f"Number of duplicate rows: {duplicate_row_count}")

# Show how many pairs fall in each `is_duplicate` class
target_counts = df['is_duplicate'].value_counts()
print(target_counts)

Number of duplicate rows: 0
is_duplicate
0    210239
1    129813
Name: count, dtype: int64


# Checking for Class Balance

In [18]:
# The mean of the 0/1 target is the fraction of duplicate pairs.
duplicate_fraction = df['is_duplicate'].mean()
duplicate_percentage = duplicate_fraction * 100
print(f"Percentage of duplicate pairs: {duplicate_percentage:.2f}%")
print(f"Percentage of non-duplicate pairs: {100 - duplicate_percentage:.2f}%")

Percentage of duplicate pairs: 38.17%
Percentage of non-duplicate pairs: 61.83%


# Remove Columns

In [19]:
# Identifier columns are not used as features, so drop them
df = df.drop(columns=['id', 'qid1', 'qid2'])

# Preview the remaining columns
df.head()

Unnamed: 0,question1,question2,is_duplicate
0,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0
4,Astrology: I am a Capricorn Sun Cap moon and c...,"I'm a triple Capricorn (Sun, Moon and ascendan...",1


# Text Preprocessing

In [20]:
# Shared NLTK resources, filled on first use so that defining this function does not
# require the NLTK corpora to be downloaded yet.
_PREPROCESS_CACHE = {}


def _preprocess_resources():
    """Return the shared (stop-word set, lemmatizer) pair, building it on first use."""
    if not _PREPROCESS_CACHE:
        _PREPROCESS_CACHE['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_CACHE['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_CACHE['stop_words'], _PREPROCESS_CACHE['lemmatizer']


def preprocess_text(text):
    """Normalise one question string for feature extraction.

    Steps: lowercase, delete every character that is not an ASCII letter or
    whitespace, tokenize with ``word_tokenize``, drop English stop words,
    lemmatize each remaining token, and re-join with single spaces.

    Parameters
    ----------
    text : str or missing value
        Raw question text. Missing values (``None``/NaN) yield ``''``; other
        non-string scalars are converted with ``str`` first.

    Returns
    -------
    str
        Space-separated cleaned tokens (possibly empty).
    """
    # Missing values become an empty string
    if pd.isna(text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    # Lowercasing
    text = text.lower()

    # One pass removes punctuation, digits and any other special characters: every
    # such character is outside [a-zA-Z\s], so separate passes would be redundant.
    # NOTE: characters are deleted, not replaced by a space, so words separated only
    # by punctuation are glued together (e.g. "rising.what" -> "risingwhat").
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization
    tokens = word_tokenize(text)

    # Stop-word set and lemmatizer are built once and reused across calls
    stop_word_set, lemmatizer = _preprocess_resources()

    # Removing stopwords, then lemmatizing what is left
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_word_set]

    # Joining on a single space also removes extra whitespace
    return ' '.join(tokens)

In [21]:
# Fetch the NLTK resources used by preprocess_text.
# 'punkt_tab' is the tokenizer data that word_tokenize looks up in recent NLTK
# releases; 'punkt' is kept for older releases.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /home/shavak/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/shavak/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/shavak/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [22]:
# Clean both question columns with the same preprocessing routine
for question_col in ('question1', 'question2'):
    df[question_col] = df[question_col].apply(preprocess_text)

# Preview the cleaned text
df[['question1', 'question2']].head()

Unnamed: 0,question1,question2
0,step step guide invest share market india,step step guide invest share market
1,story kohinoor kohinoor diamond,would happen indian government stole kohinoor ...
2,increase speed internet connection using vpn,internet speed increased hacking dns
3,one dissolve water quikly sugar salt methane c...,fish would survive salt water
4,astrology capricorn sun cap moon cap risingwha...,im triple capricorn sun moon ascendant caprico...


In [23]:
df.head()

Unnamed: 0,question1,question2,is_duplicate
0,step step guide invest share market india,step step guide invest share market,0
1,story kohinoor kohinoor diamond,would happen indian government stole kohinoor ...,0
2,increase speed internet connection using vpn,internet speed increased hacking dns,0
3,one dissolve water quikly sugar salt methane c...,fish would survive salt water,0
4,astrology capricorn sun cap moon cap risingwha...,im triple capricorn sun moon ascendant caprico...,1


# Feature Engineering

In [24]:
# Word Overlap Features
def jaccard_similarity(q1, q2):
    """Jaccard similarity between the unique whitespace-separated words of two strings.

    Returns 1.0 when both strings have no words, 0.0 when exactly one has none,
    and otherwise |intersection| / |union| of the two word sets.
    """
    words_q1 = set(q1.split())
    words_q2 = set(q2.split())

    # Guard clauses for empty inputs avoid a zero-sized union.
    if not words_q1 and not words_q2:
        return 1.0
    if not words_q1 or not words_q2:
        return 0.0

    shared = words_q1 & words_q2
    combined = words_q1 | words_q2
    return len(shared) / len(combined)

In [25]:
# Row-wise Jaccard similarity between the two cleaned questions
jaccard_scores = df.apply(
    lambda pair: jaccard_similarity(pair['question1'], pair['question2']),
    axis=1,
)
df['jaccard'] = jaccard_scores

In [26]:
df

Unnamed: 0,question1,question2,is_duplicate,jaccard
0,step step guide invest share market india,step step guide invest share market,0,0.833333
1,story kohinoor kohinoor diamond,would happen indian government stole kohinoor ...,0,0.222222
2,increase speed internet connection using vpn,internet speed increased hacking dns,0,0.222222
3,one dissolve water quikly sugar salt methane c...,fish would survive salt water,0,0.153846
4,astrology capricorn sun cap moon cap risingwha...,im triple capricorn sun moon ascendant caprico...,1,0.400000
...,...,...,...,...
340047,many keywords racket programming language late...,many keywords perl programming language latest...,0,0.750000
340048,believe life death,true life death,1,0.500000
340049,one coin,whats coin,0,0.333333
340050,approx annual cost living studying uic chicago...,little hairfall problem want use hair styling ...,0,0.000000


In [27]:
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0 when the denominator is not positive."""
    return numerator / denominator if denominator > 0 else 0


def compute_token_features(row, stopword_set=None):
    """Compute token-overlap features for one question pair.

    Parameters
    ----------
    row : mapping with 'question1' and 'question2'
        The two question strings; each is split on whitespace.
    stopword_set : collection of str, optional
        Words treated as stop words. When omitted, the notebook-level
        ``stop_words`` variable is used, so that variable must exist before the
        first call without this argument.

    Returns
    -------
    pd.Series
        Eight values, in order: cwc_min, cwc_max, csc_min, csc_max, ctc_min,
        ctc_max, last_word_eq, first_word_eq. Every ratio is 0 when its
        denominator would be 0.
    """
    if stopword_set is None:
        stopword_set = stop_words

    q1 = row['question1'].split()
    q2 = row['question2'].split()

    # Unique tokens of each question
    tokens_q1 = set(q1)
    tokens_q2 = set(q2)
    common_tokens = tokens_q1.intersection(tokens_q2)

    # Common words, and the common words that are stop words
    cwc = len(common_tokens)
    csc = len(common_tokens.intersection(stopword_set))

    # Sizes used as denominators, each computed once
    len_q1, len_q2 = len(tokens_q1), len(tokens_q2)
    stops_q1 = sum(1 for word in tokens_q1 if word in stopword_set)
    stops_q2 = sum(1 for word in tokens_q2 if word in stopword_set)

    cwc_min = _safe_ratio(cwc, min(len_q1, len_q2))
    cwc_max = _safe_ratio(cwc, max(len_q1, len_q2))
    csc_min = _safe_ratio(csc, min(stops_q1, stops_q2))
    csc_max = _safe_ratio(csc, max(stops_q1, stops_q2))

    # ctc_* is computed from the same counts as cwc_*, so the two pairs are always
    # equal; both are kept so the feature columns stay unchanged.
    ctc_min = cwc_min
    ctc_max = cwc_max

    # Last and first word equality (0 when either question has no tokens)
    last_word_eq = int(q1[-1] == q2[-1]) if q1 and q2 else 0
    first_word_eq = int(q1[0] == q2[0]) if q1 and q2 else 0

    return pd.Series([cwc_min, cwc_max, csc_min, csc_max, ctc_min, ctc_max, last_word_eq, first_word_eq])

In [28]:
def _longest_common_substring_ratio(s1, s2):
    """Length of the longest common substring of s1 and s2 divided by the shorter length.

    Works on characters, not tokens. Returns 0 when either string is empty.
    """
    from difflib import SequenceMatcher

    shorter_len = min(len(s1), len(s2))
    if shorter_len == 0:
        return 0
    # autojunk=False disables difflib's popular-element heuristic so the match
    # reported is the true longest common contiguous block.
    matcher = SequenceMatcher(None, s1, s2, autojunk=False)
    longest = matcher.find_longest_match(0, len(s1), 0, len(s2))
    return longest.size / shorter_len


def compute_length_features(row):
    """Compute length-based features for one question pair.

    Parameters
    ----------
    row : mapping with 'question1' and 'question2'
        The two question strings.

    Returns
    -------
    pd.Series
        Three values, in order: mean_len (mean of the two whitespace-token
        counts), abs_len_diff (absolute difference of the token counts), and
        longest_substr_ratio (character-level longest common substring length
        divided by the shorter string's character length, 0 if either is empty).
    """
    len_q1 = len(row['question1'].split())
    len_q2 = len(row['question2'].split())

    mean_len = (len_q1 + len_q2) / 2
    abs_len_diff = abs(len_q1 - len_q2)

    longest_substr_ratio = _longest_common_substring_ratio(row['question1'], row['question2'])

    return pd.Series([mean_len, abs_len_diff, longest_substr_ratio])

In [29]:
def compute_fuzzy_features(row):
    """Compute four fuzzywuzzy similarity scores for one question pair.

    Returns a pd.Series holding, in order: fuzz.ratio, fuzz.partial_ratio,
    fuzz.token_sort_ratio and fuzz.token_set_ratio of the two question strings.
    """
    first, second = row['question1'], row['question2']

    # Scorers are applied in the order the output columns are later named.
    scorers = (
        fuzz.ratio,
        fuzz.partial_ratio,
        fuzz.token_sort_ratio,
        fuzz.token_set_ratio,
    )
    return pd.Series([scorer(first, second) for scorer in scorers])

In [30]:
# English stop-word set read by compute_token_features at call time
english_stopword_list = stopwords.words('english')
stop_words = set(english_stopword_list)

In [31]:
# Token-overlap features for every pair; names follow the order returned by compute_token_features
token_feature_names = ['cwc_min', 'cwc_max', 'csc_min', 'csc_max', 'ctc_min', 'ctc_max', 'last_word_eq', 'first_word_eq']
question_pairs = df[['question1', 'question2']]
token_features = question_pairs.apply(compute_token_features, axis=1)
token_features.columns = token_feature_names

In [32]:
token_features

Unnamed: 0,cwc_min,cwc_max,csc_min,csc_max,ctc_min,ctc_max,last_word_eq,first_word_eq
0,1.000000,0.833333,0.0,0.0,1.000000,0.833333,0.0,1.0
1,0.666667,0.250000,0.0,0.0,0.666667,0.250000,0.0,0.0
2,0.400000,0.333333,0.0,0.0,0.400000,0.333333,0.0,0.0
3,0.400000,0.200000,0.0,0.0,0.400000,0.200000,0.0,0.0
4,0.571429,0.571429,0.0,0.0,0.571429,0.571429,1.0,0.0
...,...,...,...,...,...,...,...,...
340047,0.857143,0.857143,0.0,0.0,0.857143,0.857143,1.0,1.0
340048,0.666667,0.666667,0.0,0.0,0.666667,0.666667,1.0,0.0
340049,0.500000,0.500000,0.0,0.0,0.500000,0.500000,1.0,0.0
340050,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0


In [33]:
# Length features for every pair; names follow the order returned by compute_length_features
length_feature_names = ['mean_len', 'abs_len_diff', 'longest_substr_ratio']
length_features = df.apply(compute_length_features, axis=1)
length_features.columns = length_feature_names

In [34]:
length_features

Unnamed: 0,mean_len,abs_len_diff,longest_substr_ratio
0,6.5,1.0,1.000000
1,6.5,5.0,0.838710
2,5.5,1.0,0.250000
3,7.5,5.0,0.206897
4,8.0,0.0,0.294118
...,...,...,...
340047,7.0,0.0,0.666667
340048,3.0,0.0,0.800000
340049,2.0,0.0,0.625000
340050,11.0,4.0,0.065574


In [35]:
# Fuzzy-matching features for every pair; names follow the order returned by compute_fuzzy_features
fuzzy_feature_names = ['fuzz_ratio', 'fuzz_partial_ratio', 'token_sort_ratio', 'token_set_ratio']
fuzzy_features = df.apply(compute_fuzzy_features, axis=1)
fuzzy_features.columns = fuzzy_feature_names

In [36]:
fuzzy_features

Unnamed: 0,fuzz_ratio,fuzz_partial_ratio,token_sort_ratio,token_set_ratio
0,92,100,92,100
1,59,94,59,84
2,55,46,65,65
3,25,52,40,51
4,64,65,50,69
...,...,...,...,...
340047,93,91,89,95
340048,73,80,61,80
340049,56,62,56,67
340050,35,38,29,29


In [37]:
# Append the three feature frames column-wise (rows are aligned on the index)
handcrafted_frames = [df, token_features, length_features, fuzzy_features]
df = pd.concat(handcrafted_frames, axis=1)

In [38]:
df

Unnamed: 0,question1,question2,is_duplicate,jaccard,cwc_min,cwc_max,csc_min,csc_max,ctc_min,ctc_max,last_word_eq,first_word_eq,mean_len,abs_len_diff,longest_substr_ratio,fuzz_ratio,fuzz_partial_ratio,token_sort_ratio,token_set_ratio
0,step step guide invest share market india,step step guide invest share market,0,0.833333,1.000000,0.833333,0.0,0.0,1.000000,0.833333,0.0,1.0,6.5,1.0,1.000000,92,100,92,100
1,story kohinoor kohinoor diamond,would happen indian government stole kohinoor ...,0,0.222222,0.666667,0.250000,0.0,0.0,0.666667,0.250000,0.0,0.0,6.5,5.0,0.838710,59,94,59,84
2,increase speed internet connection using vpn,internet speed increased hacking dns,0,0.222222,0.400000,0.333333,0.0,0.0,0.400000,0.333333,0.0,0.0,5.5,1.0,0.250000,55,46,65,65
3,one dissolve water quikly sugar salt methane c...,fish would survive salt water,0,0.153846,0.400000,0.200000,0.0,0.0,0.400000,0.200000,0.0,0.0,7.5,5.0,0.206897,25,52,40,51
4,astrology capricorn sun cap moon cap risingwha...,im triple capricorn sun moon ascendant caprico...,1,0.400000,0.571429,0.571429,0.0,0.0,0.571429,0.571429,1.0,0.0,8.0,0.0,0.294118,64,65,50,69
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
340047,many keywords racket programming language late...,many keywords perl programming language latest...,0,0.750000,0.857143,0.857143,0.0,0.0,0.857143,0.857143,1.0,1.0,7.0,0.0,0.666667,93,91,89,95
340048,believe life death,true life death,1,0.500000,0.666667,0.666667,0.0,0.0,0.666667,0.666667,1.0,0.0,3.0,0.0,0.800000,73,80,61,80
340049,one coin,whats coin,0,0.333333,0.500000,0.500000,0.0,0.0,0.500000,0.500000,1.0,0.0,2.0,0.0,0.625000,56,62,56,67
340050,approx annual cost living studying uic chicago...,little hairfall problem want use hair styling ...,0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,11.0,4.0,0.065574,35,38,29,29


In [39]:
# One flat list for TF-IDF: all of question1 first, then all of question2
questions_combined = [*df['question1'].tolist(), *df['question2'].tolist()]

In [40]:
# Show only a small sample; displaying the whole list floods the notebook output
questions_combined[:5]

['step step guide invest share market india',
 'story kohinoor kohinoor diamond',
 'increase speed internet connection using vpn',
 'one dissolve water quikly sugar salt methane carbon di oxide',
 'astrology capricorn sun cap moon cap risingwhat say',
 'buy tiago',
 'good geologist',
 'use instead',
 'method find separation slit using fresnel biprism',
 'read find youtube comment',
 'make physic easy learn',
 'first sexual experience like',
 'law change status student visa green card u compare immigration law canada',
 'manipulation mean',
 'girl want friend guy reject',
 'many quora user posting question readily answered google',
 'best digital marketing institution banglore',
 'rocket look white',
 'whats causing someone jealous',
 'question ask quora',
 'mean every time look clock number',
 'tip making job interview process medicine',
 'web application',
 'society place much importance sport',
 'best way make money online',
 'prepare ca final law',
 'whats one thing would like bette

# Add TF-IDF Features

In [41]:
# TF-IDF vectorizer limited to 3000 features; terms found in more than 95% of the
# documents are ignored, and min_df=1 applies no lower document-frequency cut
tfidf_vectorizer = TfidfVectorizer(
    max_features=3000,
    max_df=0.95,
    min_df=1,
)

In [42]:
# Fit the vectorizer on every question from both columns and transform them in one pass.
# NOTE(review): this runs before the train/test split further down, so the vocabulary
# and IDF weights are learned from rows that later land in the test set — confirm
# that this is acceptable for the reported metrics.
tfidf_matrix = tfidf_vectorizer.fit_transform(questions_combined)

In [43]:
# Persist the fitted vectorizer to the working directory
with open('tfidf_vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(tfidf_vectorizer, vectorizer_file)

In [44]:
tfidf_matrix.shape

(680104, 3000)

In [45]:
# Sparse-backed DataFrame of the full TF-IDF matrix, with the vocabulary terms as column names.
# NOTE(review): tfidf_df is not referenced again in the cells visible here.
tfidf_terms = tfidf_vectorizer.get_feature_names_out()
tfidf_df = pd.DataFrame.sparse.from_spmatrix(tfidf_matrix, columns=tfidf_terms)

In [46]:
# questions_combined holds all question1 rows followed by all question2 rows,
# so the first len(df) matrix rows belong to question1 and the rest to question2.
num_questions = len(df)
tfidf_question1, tfidf_question2 = (
    tfidf_matrix[:num_questions, :],
    tfidf_matrix[num_questions:, :],
)

In [47]:
# Wrap each half of the TF-IDF matrix in a sparse-backed DataFrame
tfidf_question1_df, tfidf_question2_df = (
    pd.DataFrame.sparse.from_spmatrix(half)
    for half in (tfidf_question1, tfidf_question2)
)

In [48]:
# Name the TF-IDF columns by position, with a per-question prefix
q1_tfidf_width = tfidf_question1_df.shape[1]
q2_tfidf_width = tfidf_question2_df.shape[1]
tfidf_question1_df.columns = [f'tfidf_q1_{i}' for i in range(q1_tfidf_width)]
tfidf_question2_df.columns = [f'tfidf_q2_{i}' for i in range(q2_tfidf_width)]

In [49]:
tfidf_question1_df

Unnamed: 0,tfidf_q1_0,tfidf_q1_1,tfidf_q1_2,tfidf_q1_3,tfidf_q1_4,tfidf_q1_5,tfidf_q1_6,tfidf_q1_7,tfidf_q1_8,tfidf_q1_9,...,tfidf_q1_2990,tfidf_q1_2991,tfidf_q1_2992,tfidf_q1_2993,tfidf_q1_2994,tfidf_q1_2995,tfidf_q1_2996,tfidf_q1_2997,tfidf_q1_2998,tfidf_q1_2999
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
340047,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
340048,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
340049,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
340050,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [50]:
tfidf_question2_df

Unnamed: 0,tfidf_q2_0,tfidf_q2_1,tfidf_q2_2,tfidf_q2_3,tfidf_q2_4,tfidf_q2_5,tfidf_q2_6,tfidf_q2_7,tfidf_q2_8,tfidf_q2_9,...,tfidf_q2_2990,tfidf_q2_2991,tfidf_q2_2992,tfidf_q2_2993,tfidf_q2_2994,tfidf_q2_2995,tfidf_q2_2996,tfidf_q2_2997,tfidf_q2_2998,tfidf_q2_2999
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
340047,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
340048,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
340049,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
340050,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [51]:
# Append both TF-IDF frames column-wise (rows are aligned on the index)
tfidf_frames = [df, tfidf_question1_df, tfidf_question2_df]
df = pd.concat(tfidf_frames, axis=1)

In [52]:
df

Unnamed: 0,question1,question2,is_duplicate,jaccard,cwc_min,cwc_max,csc_min,csc_max,ctc_min,ctc_max,...,tfidf_q2_2990,tfidf_q2_2991,tfidf_q2_2992,tfidf_q2_2993,tfidf_q2_2994,tfidf_q2_2995,tfidf_q2_2996,tfidf_q2_2997,tfidf_q2_2998,tfidf_q2_2999
0,step step guide invest share market india,step step guide invest share market,0,0.833333,1.000000,0.833333,0.0,0.0,1.000000,0.833333,...,0,0,0,0,0,0,0,0,0,0
1,story kohinoor kohinoor diamond,would happen indian government stole kohinoor ...,0,0.222222,0.666667,0.250000,0.0,0.0,0.666667,0.250000,...,0,0,0,0,0,0,0,0,0,0
2,increase speed internet connection using vpn,internet speed increased hacking dns,0,0.222222,0.400000,0.333333,0.0,0.0,0.400000,0.333333,...,0,0,0,0,0,0,0,0,0,0
3,one dissolve water quikly sugar salt methane c...,fish would survive salt water,0,0.153846,0.400000,0.200000,0.0,0.0,0.400000,0.200000,...,0,0,0,0,0,0,0,0,0,0
4,astrology capricorn sun cap moon cap risingwha...,im triple capricorn sun moon ascendant caprico...,1,0.400000,0.571429,0.571429,0.0,0.0,0.571429,0.571429,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
340047,many keywords racket programming language late...,many keywords perl programming language latest...,0,0.750000,0.857143,0.857143,0.0,0.0,0.857143,0.857143,...,0,0,0,0,0,0,0,0,0,0
340048,believe life death,true life death,1,0.500000,0.666667,0.666667,0.0,0.0,0.666667,0.666667,...,0,0,0,0,0,0,0,0,0,0
340049,one coin,whats coin,0,0.333333,0.500000,0.500000,0.0,0.0,0.500000,0.500000,...,0,0,0,0,0,0,0,0,0,0
340050,approx annual cost living studying uic chicago...,little hairfall problem want use hair styling ...,0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,...,0,0,0,0,0,0,0,0,0,0


# Word2Vec

In [53]:
# Preprocess text: tokenization and lowercasing
def preprocess(text):
    """Convert the input to str, lowercase it and split it on whitespace."""
    return str(text).lower().split()

In [54]:
# Build a token-list column for each cleaned question column
for source_col, target_col in (('question1', 'tokens_q1'), ('question2', 'tokens_q2')):
    df[target_col] = df[source_col].apply(preprocess)

In [55]:
# Training corpus for Word2Vec: every question1 token list, then every question2 token list
all_tokens = [*df['tokens_q1'].tolist(), *df['tokens_q2'].tolist()]

In [56]:
# Train 300-dimensional Word2Vec embeddings on the question tokens.
# min_count=1 keeps every word, including those seen only once.
# NOTE(review): training uses 5 worker threads; check the gensim documentation on
# reproducibility with multiple workers if identical vectors are needed across runs.
model_word2vec = Word2Vec(
    sentences=all_tokens,
    vector_size=300,
    window=5,
    min_count=1,
    workers=5,
)

In [57]:
# Persist the trained Word2Vec model to the working directory
model_word2vec.save('word2vec_model.model')

In [58]:
# Function to compute average word vectors for a given list of tokens
def get_vector(tokens):
    """Average the Word2Vec vectors of the tokens known to ``model_word2vec``.

    Tokens missing from the model's vocabulary are skipped. If none are known,
    a zero vector of length ``model_word2vec.vector_size`` is returned.
    """
    keyed_vectors = model_word2vec.wv
    known_vectors = []
    for token in tokens:
        if token in keyed_vectors:
            known_vectors.append(keyed_vectors[token])

    # No known token: fall back to an all-zero vector
    if len(known_vectors) == 0:
        return np.zeros(model_word2vec.vector_size)
    return np.mean(known_vectors, axis=0)

In [59]:
# Average word vector per question, stored as one array per row
for source_col, target_col in (('tokens_q1', 'w2v_q1'), ('tokens_q2', 'w2v_q2')):
    df[target_col] = df[source_col].apply(get_vector)

In [60]:
# Stack the per-row vectors of each column into one 2-D array
w2v_q1_arr, w2v_q2_arr = (
    np.array(df[vector_col].tolist()) for vector_col in ('w2v_q1', 'w2v_q2')
)

In [61]:
# One DataFrame per question, with one column per embedding dimension
q1_w2v_columns = [f'w2v_q1_{i}' for i in range(w2v_q1_arr.shape[1])]
q2_w2v_columns = [f'w2v_q2_{i}' for i in range(w2v_q2_arr.shape[1])]
w2v_df_q1 = pd.DataFrame(w2v_q1_arr, columns=q1_w2v_columns)
w2v_df_q2 = pd.DataFrame(w2v_q2_arr, columns=q2_w2v_columns)

In [62]:
w2v_df_q1

Unnamed: 0,w2v_q1_0,w2v_q1_1,w2v_q1_2,w2v_q1_3,w2v_q1_4,w2v_q1_5,w2v_q1_6,w2v_q1_7,w2v_q1_8,w2v_q1_9,...,w2v_q1_290,w2v_q1_291,w2v_q1_292,w2v_q1_293,w2v_q1_294,w2v_q1_295,w2v_q1_296,w2v_q1_297,w2v_q1_298,w2v_q1_299
0,0.313036,0.367587,-0.181376,-0.102361,-0.023513,-0.665600,0.293679,0.038476,-0.206227,-0.132553,...,-0.686913,-0.013943,0.176212,0.149427,-0.028951,-0.300208,-0.186839,0.869027,-0.658669,-0.090026
1,0.297149,0.012492,-0.188932,-0.150577,0.112389,-0.598460,0.154175,0.103354,0.105211,-0.398360,...,0.237753,0.115842,-0.102310,0.149944,0.173658,0.218004,-0.251527,0.369272,0.103645,-0.469481
2,-0.097394,0.133514,0.480107,0.577176,0.076835,-0.097536,-0.007140,0.851002,-0.872461,0.150416,...,-0.491904,-0.400899,0.092065,-0.398652,-0.376707,0.598607,0.314585,-0.096257,-0.027890,0.009703
3,0.028234,0.374388,0.434428,0.478447,0.187263,-0.406812,0.017751,0.480130,0.033057,0.193854,...,-0.125946,-0.044675,0.072361,-0.411841,-0.107308,0.313682,0.049749,-0.439043,-0.040162,0.159128
4,-0.058269,0.051020,0.120310,-0.098182,0.398316,-0.163532,0.299498,0.472440,-0.115831,-0.239967,...,-0.127973,-0.041369,0.069685,0.116065,0.149694,0.081077,0.007711,-0.083025,0.089640,-0.020580
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
340047,-0.290811,0.004022,-0.266233,-0.299383,-0.277659,-0.260481,0.129139,0.138544,0.181792,-0.638111,...,-0.306172,0.702675,0.194870,0.002591,-0.027669,-0.050499,0.071206,0.188461,0.415276,-0.119467
340048,1.007005,-0.507045,-0.097751,0.599582,0.158413,0.077772,0.304462,0.958924,-0.429251,0.303208,...,0.476126,0.303339,0.107556,-0.200640,0.072074,-0.066604,-0.176022,0.131900,0.168595,-0.645968
340049,0.073607,-0.643653,0.504657,-0.184316,-0.324612,-0.762591,-0.055417,-0.335398,-0.601509,-0.256591,...,-0.112189,0.057527,0.258628,-0.102601,0.087516,-0.163709,0.747776,-0.240074,0.102385,-0.384504
340050,0.111367,-0.331650,-0.053652,-0.175558,-0.084375,-0.202719,-0.151699,0.070678,0.303368,0.146257,...,0.080300,0.089442,0.736401,-0.394999,0.526883,-0.245444,-0.361630,-0.105191,-0.541445,0.269721


In [63]:
w2v_df_q2

Unnamed: 0,w2v_q2_0,w2v_q2_1,w2v_q2_2,w2v_q2_3,w2v_q2_4,w2v_q2_5,w2v_q2_6,w2v_q2_7,w2v_q2_8,w2v_q2_9,...,w2v_q2_290,w2v_q2_291,w2v_q2_292,w2v_q2_293,w2v_q2_294,w2v_q2_295,w2v_q2_296,w2v_q2_297,w2v_q2_298,w2v_q2_299
0,0.418497,0.514937,-0.070696,-0.187710,-0.014389,-0.693056,0.276345,0.116600,-0.132035,-0.226985,...,-0.776374,0.050668,0.232525,0.282205,0.001893,-0.007409,-0.072573,0.945223,-0.521711,-0.206186
1,0.076432,0.138408,0.346086,0.340556,0.039827,-0.518194,0.277938,-0.295856,-0.196684,0.293447,...,-0.061363,0.110269,-0.023867,-0.152012,-0.068100,-0.441250,0.179726,0.145481,-0.118104,-0.091690
2,0.048734,0.144543,0.125962,0.483714,-0.041876,-0.217535,-0.145066,0.719534,-0.621070,0.338176,...,-0.353854,-0.058739,0.082026,-0.339601,-0.027975,0.483464,0.275175,-0.211090,0.094044,0.032412
3,0.193953,0.381375,0.765678,0.897743,0.674192,-0.583262,-0.033807,0.486330,0.038661,0.524678,...,0.292884,-0.075270,-0.098897,-0.774043,-0.289239,0.210369,-0.052106,-0.289073,-0.321038,0.377097
4,-0.078210,-0.221360,0.086937,-0.184125,0.484000,0.067842,0.413245,0.288630,-0.153956,-0.159623,...,-0.143759,-0.100399,0.117523,0.073155,0.332787,0.002447,-0.149884,0.007247,0.105602,-0.027703
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
340047,-0.267374,-0.003124,-0.261206,-0.311467,-0.275026,-0.251344,0.136569,0.143295,0.168740,-0.643347,...,-0.314028,0.695321,0.183288,0.000376,-0.055743,-0.039500,0.072761,0.186275,0.434294,-0.122827
340048,0.894806,-0.635648,0.002603,0.546099,0.189192,-0.020710,0.388376,0.658083,-0.580619,-0.045670,...,0.676459,0.385361,0.236602,0.017281,0.139327,0.075328,-0.030631,-0.067471,0.175815,-0.463252
340049,-0.251205,-0.131692,0.170839,-0.078396,-0.546372,-0.669672,0.746647,0.630334,0.057647,-0.740047,...,0.504827,0.604887,-0.041370,-0.718000,0.004090,0.536168,-0.059402,-0.121769,0.260259,-0.196869
340050,0.140337,-0.053855,0.180124,-0.043530,-0.024620,0.116317,0.359726,0.288662,-0.002658,-0.264729,...,-0.349479,-0.086902,0.142003,-0.036317,0.044096,-0.200231,0.220063,0.120130,0.226375,0.073842


In [64]:
# Append both embedding frames column-wise (rows are aligned on the index)
w2v_frames = [df, w2v_df_q1, w2v_df_q2]
df = pd.concat(w2v_frames, axis=1)

In [65]:
# Drop intermediate columns and the original question text, leaving only the target
# and the numeric features. Rebinding (instead of inplace=True) matches how the
# id columns were dropped earlier and does not mutate the existing frame object.
df = df.drop(columns=['tokens_q1', 'tokens_q2', 'w2v_q1', 'w2v_q2', 'question1', 'question2'])

In [66]:
df

Unnamed: 0,is_duplicate,jaccard,cwc_min,cwc_max,csc_min,csc_max,ctc_min,ctc_max,last_word_eq,first_word_eq,...,w2v_q2_290,w2v_q2_291,w2v_q2_292,w2v_q2_293,w2v_q2_294,w2v_q2_295,w2v_q2_296,w2v_q2_297,w2v_q2_298,w2v_q2_299
0,0,0.833333,1.000000,0.833333,0.0,0.0,1.000000,0.833333,0.0,1.0,...,-0.776374,0.050668,0.232525,0.282205,0.001893,-0.007409,-0.072573,0.945223,-0.521711,-0.206186
1,0,0.222222,0.666667,0.250000,0.0,0.0,0.666667,0.250000,0.0,0.0,...,-0.061363,0.110269,-0.023867,-0.152012,-0.068100,-0.441250,0.179726,0.145481,-0.118104,-0.091690
2,0,0.222222,0.400000,0.333333,0.0,0.0,0.400000,0.333333,0.0,0.0,...,-0.353854,-0.058739,0.082026,-0.339601,-0.027975,0.483464,0.275175,-0.211090,0.094044,0.032412
3,0,0.153846,0.400000,0.200000,0.0,0.0,0.400000,0.200000,0.0,0.0,...,0.292884,-0.075270,-0.098897,-0.774043,-0.289239,0.210369,-0.052106,-0.289073,-0.321038,0.377097
4,1,0.400000,0.571429,0.571429,0.0,0.0,0.571429,0.571429,1.0,0.0,...,-0.143759,-0.100399,0.117523,0.073155,0.332787,0.002447,-0.149884,0.007247,0.105602,-0.027703
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
340047,0,0.750000,0.857143,0.857143,0.0,0.0,0.857143,0.857143,1.0,1.0,...,-0.314028,0.695321,0.183288,0.000376,-0.055743,-0.039500,0.072761,0.186275,0.434294,-0.122827
340048,1,0.500000,0.666667,0.666667,0.0,0.0,0.666667,0.666667,1.0,0.0,...,0.676459,0.385361,0.236602,0.017281,0.139327,0.075328,-0.030631,-0.067471,0.175815,-0.463252
340049,0,0.333333,0.500000,0.500000,0.0,0.0,0.500000,0.500000,1.0,0.0,...,0.504827,0.604887,-0.041370,-0.718000,0.004090,0.536168,-0.059402,-0.121769,0.260259,-0.196869
340050,0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,...,-0.349479,-0.086902,0.142003,-0.036317,0.044096,-0.200231,0.220063,0.120130,0.226375,0.073842


# Split Data (80-20)

In [67]:
# Target is the duplicate flag; every other column is a feature
y = df['is_duplicate']
X = df.drop(columns=['is_duplicate'])

In [68]:
# 80/20 split with a fixed random_state so the split is repeatable
split_parts = train_test_split(X, y, random_state=42, test_size=0.2)
X_train, X_test, y_train, y_test = split_parts

In [69]:
X_train.shape

(272041, 6616)

In [70]:
X_test.shape

(68011, 6616)

In [71]:
y_train.shape

(272041,)

In [72]:
y_test.shape

(68011,)

## ML Models

# Model 1: XGBoost

In [74]:
# Wrap the train and test splits in XGBoost's DMatrix container
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Set parameters for XGBoost
params = {
    'objective': 'binary:logistic',
    'max_depth': 6,
    'eta': 0.1,
    'eval_metric': 'logloss'
}

# Train on the whole training matrix in a single call
num_rounds = 100  # number of boosting rounds; adjust as needed
bst = xgb.train(params, dtrain, num_boost_round=num_rounds)

# Score the test set and threshold the scores at 0.5 to get 0/1 labels
y_pred_xgb = bst.predict(dtest)
y_pred_xgb_binary = (y_pred_xgb > 0.5).astype(int)

# Evaluate the model
print("XGBoost Accuracy:", accuracy_score(y_test, y_pred_xgb_binary))
print("Classification Report:\n", classification_report(y_test, y_pred_xgb_binary))

# Save XGBoost model
bst.save_model('xgboost_model.json')

XGBoost Accuracy: 0.7948861213627207
Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.84      0.83     42043
           1       0.73      0.73      0.73     25968

    accuracy                           0.79     68011
   macro avg       0.78      0.78      0.78     68011
weighted avg       0.79      0.79      0.79     68011

