In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

import re
from bs4 import BeautifulSoup
 

import string
exclude = string.punctuation

from textblob import TextBlob

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the raw question-pair table from a CSV file in the working directory.
df = pd.read_csv('questions.csv')

In [None]:
# Work on a random sample of 50,000 rows; the fixed random_state keeps the sample stable.
newdf =  df.sample(50000,random_state=2)

In [None]:
newdf.head()

In [None]:
# Missing questions are replaced by empty strings before the cast so that the per-row
# text cleaners applied later always receive a str (a pd.NA value would make the
# regex-based helpers raise a TypeError).
newdf['question1'] = newdf['question1'].fillna('').astype('string')
newdf['question2'] = newdf['question2'].fillna('').astype('string')

In [None]:
newdf.info()

### Text Preprocessing

In [None]:
# Lower-case both question columns so later comparisons are case-insensitive.
newdf['question1'] = newdf['question1'].str.lower()
newdf['question2'] = newdf['question2'].str.lower()

In [None]:
def rem_html_tags(text):
    """Return ``text`` with every ``<...>`` span removed.

    The match is non-greedy, so each tag is removed on its own and the text
    between two tags is kept.
    """
    return re.sub('<.*?>', r'', text)

# Write the cleaned text back to newdf (the working sample). Assigning to df instead
# would leave newdf's questions with their HTML tags and overwrite the full raw table
# through index alignment.
newdf['question1'] = newdf['question1'].apply(rem_html_tags)
newdf['question2'] = newdf['question2'].apply(rem_html_tags)

In [None]:
def rem_url(text):
    """Strip ``http://`` / ``https://`` links and ``www.`` addresses from ``text``.

    A link runs up to the next whitespace character; everything else is kept.
    """
    url_regex = r'https?://\S+|www\.\S+'
    return re.sub(url_regex, r'', text)

# Remove URLs from both question columns.
newdf['question1'] = newdf['question1'].apply(rem_url)
newdf['question2'] = newdf['question2'].apply(rem_url)

In [None]:
def remove_puncs(text):
    """Delete every character listed in the module-level ``exclude`` string from ``text``."""
    deletion_table = str.maketrans('', '', exclude)
    stripped = text.translate(deletion_table)
    return stripped

# Strip all characters contained in `exclude` (string.punctuation) from both columns.
newdf['question1'] = newdf['question1'].apply(remove_puncs)
newdf['question2'] = newdf['question2'].apply(remove_puncs)

In [None]:
# Some common contractions
# found on stack overflow
# https://stackoverflow.com/questions/19790188/expanding-english-language-contractions-in-python
# Built once at definition time instead of on every preprocess() call.
_CONTRACTIONS = {
    "ain't": "am not / are not / is not / has not / have not",
    "aren't": "are not / am not",
    "can't": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he had / he would",
    "he'd've": "he would have",
    "he'll": "he shall / he will",
    "he'll've": "he shall have / he will have",
    "he's": "he has / he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how has / how is / how does",
    "I'd": "I had / I would",
    "I'd've": "I would have",
    "I'll": "I shall / I will",
    "I'll've": "I shall have / I will have",
    "I'm": "I am",
    "I've": "I have",
    "isn't": "is not",
    "it'd": "it had / it would",
    "it'd've": "it would have",
    "it'll": "it shall / it will",
    "it'll've": "it shall have / it will have",
    "it's": "it has / it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she had / she would",
    "she'd've": "she would have",
    "she'll": "she shall / she will",
    "she'll've": "she shall have / she will have",
    "she's": "she has / she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so as / so is",
    "that'd": "that would / that had",
    "that'd've": "that would have",
    "that's": "that has / that is",
    "there'd": "there had / there would",
    "there'd've": "there would have",
    "there's": "there has / there is",
    "they'd": "they had / they would",
    "they'd've": "they would have",
    "they'll": "they shall / they will",
    "they'll've": "they shall have / they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we had / we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what shall / what will",
    "what'll've": "what shall have / what will have",
    "what're": "what are",
    "what's": "what has / what is",
    "what've": "what have",
    "when's": "when has / when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where has / where is",
    "where've": "where have",
    "who'll": "who shall / who will",
    "who'll've": "who shall have / who will have",
    "who's": "who has / who is",
    "who've": "who have",
    "why's": "why has / why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you had / you would",
    "you'd've": "you would have",
    "you'll": "you shall / you will",
    "you'll've": "you shall have / you will have",
    "you're": "you are",
    "you've": "you have"
}

# Lower-cased copy used as a fallback so words whose casing differs from the table
# (for example "i'm" in already lower-cased text) are still expanded.
_CONTRACTIONS_LOWER = {key.lower(): value.lower() for key, value in _CONTRACTIONS.items()}


def preprocess(text):
    """Spell out currency/percent/at symbols and expand English contractions.

    Steps, in order:
      1. ``%``, ``$``, ``₹``, ``€`` and ``@`` are replaced by the words
         ``percent``, ``dollar``, ``rupee``, ``euro`` and ``at`` (space padded).
      2. The literal marker ``[math]`` is removed.
      3. Each whitespace-separated word found in the contraction table is
         replaced by its expansion. An exact-case match returns the table's
         expansion unchanged; otherwise a case-insensitive match returns the
         expansion in lower case.
      4. Remaining ``'ve``, ``'re``, ``n't`` and ``'ll`` suffixes are expanded
         to separate words.

    The function matches on punctuation characters (``%``, ``$``, ``@``,
    ``'``, ``[``), so it only has an effect on text that still contains them.

    Parameters
    ----------
    text : str
        The text to normalise.

    Returns
    -------
    str
        The normalised text, with runs of whitespace collapsed to single spaces.
    """
    text = text.replace('%', ' percent ')
    text = text.replace('$', ' dollar ')
    text = text.replace('₹', ' rupee ')
    text = text.replace('€', ' euro ')
    text = text.replace('@', ' at ')

    text = text.replace('[math]', '')

    expanded_words = []
    for word in text.split():
        expansion = _CONTRACTIONS.get(word)
        if expansion is None:
            # Case-insensitive fallback; keeps exact-case matches untouched.
            expansion = _CONTRACTIONS_LOWER.get(word.lower())
        expanded_words.append(word if expansion is None else expansion)

    text = ' '.join(expanded_words)

    # Generic suffix fallbacks for words missing from the table. The leading space
    # keeps the expansion a separate word ("that've" -> "that have", not "thathave").
    text = text.replace("'ve", " have")
    text = text.replace("'re", " are")
    text = text.replace("n't", " not")
    text = text.replace("'ll", " will")

    return text

In [None]:
# Apply the symbol / contraction normalisation to both question columns.
# NOTE(review): remove_puncs was applied in an earlier cell, and string.punctuation
# contains %, $, @, the apostrophe and square brackets, so most of preprocess's
# replacements can no longer match here — consider running preprocess before the
# punctuation is stripped.
newdf['question1'] = newdf['question1'].apply(preprocess)
newdf['question2'] = newdf['question2'].apply(preprocess)

In [None]:
import spacy
nlp = spacy.load('en_core_web_sm')

def tokenization(text):
    """Run ``text`` through the loaded spaCy pipeline ``nlp`` and return its result."""
    parsed = nlp(text)
    return parsed


In [None]:
# Replace each question string with the object returned by tokenization().
newdf['question1'] = newdf['question1'].apply(tokenization)
newdf['question2'] = newdf['question2'].apply(tokenization)

In [None]:
def return_actual_question(s):
    """Return ``s.text`` joined with an empty separator.

    When ``s.text`` is already a string the join reproduces it unchanged.
    """
    raw_text = s.text
    return "".join(raw_text)

In [None]:
# Convert the parsed objects back to plain strings via their .text attribute.
# NOTE(review): together with the tokenization cell this looks like a round trip that
# yields the text it started from — confirm whether the spaCy pass is needed at all.
newdf['question1'] = newdf['question1'].apply(return_actual_question)
newdf['question2'] = newdf['question2'].apply(return_actual_question)

In [None]:
newdf

### Feature Engineering

In [None]:
# Character length of each question string.
newdf['q1len'] = newdf['question1'].apply(len)
newdf['q2len'] = newdf['question2'].apply(len)

In [None]:
newdf

In [None]:
def countwords(text):
    """Return how many pieces ``text`` yields when split on single spaces.

    The split uses ``' '`` explicitly, so consecutive spaces produce empty
    pieces that are counted too, and an empty string counts as one piece.
    """
    return len(text.split(' '))

# Space-separated piece count of each question.
newdf['q1words'] = newdf['question1'].apply(countwords)
newdf['q2words'] = newdf['question2'].apply(countwords)

In [None]:
def common_words(row):
    """Count the distinct lower-cased words present in both questions of ``row``."""
    first = {token.lower() for token in row['question1'].split()}
    second = {token.lower() for token in row['question2'].split()}
    # Set intersection keeps only the words the two questions share.
    return len(first & second)
# Row-wise count of the words shared by the two questions.
newdf['common_words'] = newdf.apply(common_words, axis=1)

In [None]:
def total_words(row):
    """Return the distinct lower-cased word count of question1 plus that of question2.

    A word that occurs in both questions is counted once for each of them.
    """
    distinct_counts = [
        len({token.lower() for token in row[column].split()})
        for column in ('question1', 'question2')
    ]
    return sum(distinct_counts)
# Row-wise sum of the distinct word counts of the two questions.
newdf['total_words'] = newdf.apply(total_words, axis=1)

In [None]:
# Shared-word count relative to the combined distinct-word count, rounded to 2 decimals.
newdf['wordshare'] = round(newdf['common_words'] / newdf['total_words'],2)

In [None]:
# Distribution of common_words split by the is_duplicate label; each class is normalised
# on its own (common_norm=False) and a KDE curve is overlaid.
# NOTE(review): displot is a figure-level function that normally draws its own hue
# legend — check whether the extra plt.legend() call is needed.
sns.displot(newdf, x='common_words', hue='is_duplicate', kde=True, common_norm=False)
plt.legend()
plt.show()

In [None]:
# Distribution of total_words split by the is_duplicate label; each class is normalised
# on its own (common_norm=False) and a KDE curve is overlaid.
# NOTE(review): displot is a figure-level function that normally draws its own hue
# legend — check whether the extra plt.legend() call is needed.
sns.displot(newdf, x='total_words', hue='is_duplicate', kde=True, common_norm=False)
plt.legend()
plt.show()

In [None]:
# Distribution of wordshare split by the is_duplicate label; each class is normalised
# on its own (common_norm=False) and a KDE curve is overlaid.
# NOTE(review): displot is a figure-level function that normally draws its own hue
# legend — check whether the extra plt.legend() call is needed.
sns.displot(newdf, x='wordshare', hue='is_duplicate', kde=True, common_norm=False)
plt.legend()
plt.show()

### Advanced Features

- Length Based Features:
1. Mean Length - Avg of the 2 question lengths
2. abs_len_diff - Abs difference b/w the lengths of the 2 qns
3. Longest_substr_ratio - Ratio of len of longest sub string to the question with a smaller length

- Token Features
1. cwc_min - Ratio of Common words to the smaller qn
2. cwc_max - Ratio of Common words to the lengthier qn
3. csc_min - ratio of common stop words to the smaller stop word count among 2 qns
4. csc_max - Ratio of common stop words to the larger stop word count among 2 qns
5. ctc_min - Ratio of common tokens to the smaller token count among 2 qns
6. ctc_max - Ratio of common tokens to the larger token count among 2 qns
7. last_word_eq - 1 if the last word of the qns are the same
8. first_word_eq - 1 if the first word of the qns are the same

- Fuzzy Features
1. fuzz_ratio - Fuzz ratio score from fuzzywuzzy
2. fuzz_partial_ratio - from fuzzywuzzy
3. token_sort_ratio - from fuzzywuzzy
4. token_set_ratio - from fuzzywuzzy

In [None]:
import nltk
from nltk.corpus import stopwords
# English stop words as a frozenset, filled on first use so the NLTK corpus is read
# once rather than once per row.
_STOP_WORDS_CACHE = None


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    global _STOP_WORDS_CACHE
    if _STOP_WORDS_CACHE is None:
        _STOP_WORDS_CACHE = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE


def tokenfeatures(row):
    """Compute eight token-based similarity features for one question pair.

    Parameters
    ----------
    row : mapping
        Must provide ``question1``, ``question2`` (strings), ``q1len``,
        ``q2len`` and ``common_words`` (numbers).

    Returns
    -------
    list
        ``[cwc_min, cwc_max, csc_min, csc_max, ctc_min, ctc_max,
        last_word_eq, first_word_eq]``. The six ratios are rounded to two
        decimals and are ``0.0`` when their denominator is zero. The two
        equality flags are ``0`` when either question has no tokens.
    """
    tkft = [0]*8

    # NOTE(review): q1len/q2len hold character counts in this notebook while
    # common_words counts words — confirm this is the intended denominator.
    min_len = min(row['q1len'], row['q2len'])
    max_len = max(row['q1len'], row['q2len'])

    q1tokens = row['question1'].split()
    q2tokens = row['question2'].split()

    stop_words = _english_stopwords()

    # Distinct tokens that occur in both questions.
    common_tokens = set(q1tokens) & set(q2tokens)

    q1_stopwords = {word for word in q1tokens if word in stop_words}
    q2_stopwords = {word for word in q2tokens if word in stop_words}
    common_stopwords = q1_stopwords & q2_stopwords

    min_stop = min(len(q1_stopwords), len(q2_stopwords))
    max_stop = max(len(q1_stopwords), len(q2_stopwords))
    min_tokens = min(len(q1tokens), len(q2tokens))
    max_tokens = max(len(q1tokens), len(q2tokens))

    tkft[0] = round(row['common_words'] / min_len, 2) if min_len != 0 else 0.0
    tkft[1] = round(row['common_words'] / max_len, 2) if max_len != 0 else 0.0
    tkft[2] = round(len(common_stopwords) / min_stop, 2) if min_stop != 0 else 0.0
    tkft[3] = round(len(common_stopwords) / max_stop, 2) if max_stop != 0 else 0.0
    tkft[4] = round(len(common_tokens) / min_tokens, 2) if min_tokens != 0 else 0.0
    tkft[5] = round(len(common_tokens) / max_tokens, 2) if max_tokens != 0 else 0.0

    # Guard the positional look-ups: a question with no tokens would otherwise
    # raise IndexError and abort the whole DataFrame.apply.
    both_non_empty = bool(q1tokens) and bool(q2tokens)
    tkft[6] = int(both_non_empty and q1tokens[-1] == q2tokens[-1])
    tkft[7] = int(both_non_empty and q1tokens[0] == q2tokens[0])

    return tkft

In [None]:
tokenfeatures = newdf.apply(tokenfeatures,axis=1)

In [None]:
newdf['cwc_min'] = list(map(lambda x: x[0], tokenfeatures))
newdf['cwc_max'] = list(map(lambda x: x[1], tokenfeatures))
newdf['csc_min'] = list(map(lambda x: x[2], tokenfeatures))
newdf['csc_max'] = list(map(lambda x: x[3], tokenfeatures))
newdf['ctc_min'] = list(map(lambda x: x[4], tokenfeatures))
newdf['ctc_max'] = list(map(lambda x: x[5], tokenfeatures))
newdf['last_word_eq'] = list(map(lambda x: x[6], tokenfeatures))
newdf['first_word_eq'] = list(map(lambda x: x[7], tokenfeatures))

In [None]:
import distance

def length_features(row):
    """Compute three length-based features for one question pair.

    Parameters
    ----------
    row : mapping
        Must provide ``question1`` and ``question2`` as strings.

    Returns
    -------
    list
        ``[mean_len, abs_len_diff, longest_substr_ratio]`` where

        * ``mean_len`` is the mean of the two token counts (2 decimals),
        * ``abs_len_diff`` is the absolute difference of the token counts,
        * ``longest_substr_ratio`` is the length of the longest common
          substring divided by the character length of the shorter question
          (2 decimals), or ``0.0`` when either question is empty.
    """
    # Standard-library matcher, imported locally so the function is self-contained.
    from difflib import SequenceMatcher

    question1 = row['question1']
    question2 = row['question2']

    q1tokens = question1.split()
    q2tokens = question2.split()

    mean_len = round((len(q1tokens) + len(q2tokens)) / 2, 2)
    abs_len_diff = abs(len(q1tokens) - len(q2tokens))

    shorter_chars = min(len(question1), len(question2))
    if shorter_chars == 0:
        longest_substr_ratio = 0.0
    else:
        # autojunk=False disables difflib's popularity heuristic so the match is the
        # true longest common contiguous block.
        matcher = SequenceMatcher(None, question1, question2, autojunk=False)
        longest = matcher.find_longest_match(0, len(question1), 0, len(question2))
        # Use the LENGTH of the longest common substring (not how many such
        # substrings exist) relative to the shorter question's character length.
        longest_substr_ratio = round(longest.size / shorter_chars, 2)

    return [mean_len, abs_len_diff, longest_substr_ratio]

In [None]:
# Compute the length-based features for every row (one list per row).
lengthfeatures = newdf.apply(length_features,axis=1)

In [None]:
# Spread the three positional length features into named columns.
newdf['mean_len'] = list(map(lambda x: x[0], lengthfeatures))
newdf['abs_len_diff'] = list(map(lambda x: x[1], lengthfeatures))
newdf['longest_substr_ratio'] = list(map(lambda x: x[2], lengthfeatures))

In [None]:
from fuzzywuzzy import fuzz

def fetch_fuzzyfeatures(row):
    """Return four fuzzywuzzy scores for the question pair in ``row``.

    The list is ordered ``[QRatio, partial_ratio, token_sort_ratio,
    token_set_ratio]``, each computed on ``row['question1']`` versus
    ``row['question2']``.
    """
    first, second = row['question1'], row['question2']
    scorers = (
        fuzz.QRatio,
        fuzz.partial_ratio,
        fuzz.token_sort_ratio,
        fuzz.token_set_ratio,
    )
    return [scorer(first, second) for scorer in scorers]

In [None]:
# Score every row, then spread the four scores into their own columns.
fuzzyfeatures = newdf.apply(fetch_fuzzyfeatures,axis=1)

# NOTE(review): the last three columns use a "full_" prefix although they hold the
# partial / token-sort / token-set ratios — confirm the naming is intended.
newdf['fuzz_ratio'] = list(map(lambda x: x[0], fuzzyfeatures))
newdf['full_partial_ratio'] = list(map(lambda x: x[1], fuzzyfeatures))
newdf['full_sort_ratio'] = list(map(lambda x: x[2], fuzzyfeatures))
newdf['full_set_ratio'] = list(map(lambda x: x[3], fuzzyfeatures))

In [None]:
newdf

In [None]:
# Keep the two cleaned question columns aside for the bag-of-words step.
quesdf = newdf[['question1','question2']]

In [None]:
# Drop the identifier and raw question-text columns; every other column is kept.
final_df = newdf.drop(columns=['id','qid1','qid2','question1','question2'])

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Stack all question1 texts followed by all question2 texts (missing values become '')
# so that a single vocabulary is fitted over both columns.
questions = list(quesdf['question1'].fillna('')) + list(quesdf['question2'].fillna(''))

# Bag-of-words vectorizer limited to 3000 features.
cv = CountVectorizer(max_features=3000)
# Fit on the stacked texts, convert to a dense array, then split the rows back into
# the question1 half and the question2 half.
q1, q2 = np.vsplit(cv.fit_transform(questions).toarray(), 2)


In [None]:
# Wrap both count matrices in DataFrames that share newdf's index so they align on concat.
tdf1 = pd.DataFrame(q1, index = newdf.index)
tdf2 = pd.DataFrame(q2, index = newdf.index)

In [None]:
# Place the question1 and question2 count columns side by side.
bowdf = pd.concat([tdf1,tdf2],axis=1)

In [None]:
# Final modelling table: the final_df columns followed by the bag-of-words columns.
findf = pd.concat([final_df,bowdf],axis=1)

In [None]:
# Select the target by name rather than by position, so the split does not depend on
# is_duplicate happening to be the first column of findf.
X = findf.drop(columns=['is_duplicate'])
y = findf['is_duplicate']

In [None]:
from sklearn.model_selection import train_test_split
# Fixed seed (the same value used for the row sample earlier) so the 80/20 split, and
# therefore the reported scores, are reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X.values,y.values,test_size=0.2,random_state=2)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
# Seed the forest so repeated runs on the same split give the same accuracy.
rf = RandomForestClassifier(random_state=2)
rf.fit(X_train,y_train)
y_pred = rf.predict(X_test)
# Last expression of the cell: test-set accuracy of the random forest.
accuracy_score(y_test,y_pred)

In [None]:
from xgboost import XGBClassifier
# Gradient-boosted trees with default settings, fitted on the same split.
xbg = XGBClassifier()
xbg.fit(X_train,y_train)
y_pred1 = xbg.predict(X_test)
# Last expression of the cell: test-set accuracy of the XGBoost model.
accuracy_score(y_test,y_pred1)

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
confusion_matrix(y_test,y_pred)

In [None]:
confusion_matrix(y_test,y_pred1)

In [None]:
# Import necessary libraries
from sklearn.svm import SVC

# Create an SVM classifier
# NOTE(review): a linear-kernel SVC on the full dense feature matrix may be very slow
# to fit — confirm the runtime is acceptable.
svm_classifier = SVC(kernel='linear')  # You can choose different kernels like 'linear', 'poly', 'rbf', etc.

# Train the classifier on the training data
svm_classifier.fit(X_train, y_train)

# Make predictions on the test data
y_pred2 = svm_classifier.predict(X_test)

# Evaluate the accuracy of the model (printed with two decimals)
accuracy = accuracy_score(y_test, y_pred2)
print(f"Accuracy: {accuracy:.2f}")
