<a href="https://www.kaggle.com/code/siddhantsoam/quora-same-question-pair?scriptVersionId=143041990" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere beneath /kaggle/input.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import IPython.display as ipd
def boom():
    """Return an autoplaying IPython audio widget holding a 400 Hz sine tone.

    The tone has 20000 samples and is played at a rate of 10000 samples per
    second.
    """
    sample_rate = 10000
    sample_index = np.arange(sample_rate * 2)
    tone = np.sin(2 * np.pi * 400 * sample_index / sample_rate)
    return ipd.Audio(tone, rate=sample_rate, autoplay=True)

***Business Objectives and Constraints***

1) Cost of misclassification is very high

2) Use a probability threshold to make the decision so that it can be changed in future

3) No strict latency requirements

4) Interpretability is partially important



Ideally the train/test split should be time-based, because newer questions may differ from older ones: sort the data by timestamp, use the oldest 70% as the train set and the newest 30% as the test set. This dataset has no timestamp column, so we fall back to random splitting.

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
pd.set_option('display.max_colwidth',100)    #to display the whole question

In [None]:
df = pd.read_csv('../input/dataset/train.csv')
df.shape

In [None]:
df.head()

In [None]:
df.sample(10)      # picks up randomly 10 datapoints

In [None]:
df.info()

In [None]:
df.isnull().sum()

In [None]:
# duplicate number of rows
df.duplicated().sum()

In [None]:
# Distribution of duplicate and non-duplicate questions: absolute counts,
# percentage share of each class, and a bar chart of the counts.
label_column = df['is_duplicate']
label_counts = label_column.value_counts()

print(label_counts)
print(round((label_counts / label_column.count()) * 100, 2))
label_counts.plot(kind='bar')

In [None]:
# Repeated questions: pool the ids from both columns, then report how many
# distinct ids exist and how many of them occur more than once.
all_question_ids = df['qid1'].tolist() + df['qid2'].tolist()
qid = pd.Series(all_question_ids)

print("unique questions :", len(np.unique(qid)))
x = qid.value_counts() > 1
print("repeated questions : ", int(x.sum()))

In [None]:
# repeated questions histogram

plt.hist(qid.value_counts().values, bins=160)
plt.xlabel('Number of occurrences of question')
plt.ylabel('Number of questions')
plt.yscale('log')
plt.show()

In [None]:
# Checking for NaN values: collect every row that has at least one missing field.
# `axis` is passed by keyword because pandas >= 2.0 no longer accepts it positionally.
nan_rows = df[df.isnull().any(axis=1)]
print(nan_rows)

In [None]:
# Replace every missing value with a single space, then confirm that no row
# with a missing field is left (the printed frame should be empty).
df = df.fillna(' ')
# `axis` is passed by keyword because pandas >= 2.0 no longer accepts it positionally.
nan_rows = df[df.isnull().any(axis=1)]
print(nan_rows)

Feature Engineering

- freq_qid1 = frequency of qid 1
- freq_qid2
- q1len
- q2len
- q1words
- q2words
- words common : # of common unique words
- words total : total words in q1 + total words in q2 (unique)
- word share : (word common) / (word total)
- freq_q1+q2 : freq_qid1 + freq_qid2
- freq_q1-q2 : abs(freq_qid1 - freq_qid2)

In [None]:
# Basic per-question features, added in this order: how often each question id
# occurs in its own id column, character length, and single-space word count.
frequency_columns = (('freq_qid1', 'qid1'), ('freq_qid2', 'qid2'))
length_columns = (('q1_len', 'question1'), ('q2_len', 'question2'))
word_count_columns = (('q1_n_words', 'question1'), ('q2_n_words', 'question2'))

for feature_name, id_column in frequency_columns:
    df[feature_name] = df.groupby(id_column)[id_column].transform('count')

for feature_name, text_column in length_columns:
    df[feature_name] = df[text_column].str.len()

for feature_name, text_column in word_count_columns:
    df[feature_name] = df[text_column].apply(lambda text: len(text.split(" ")))

def common_words(row):
    """Return, as a float, how many distinct words the two questions share.

    Each question is split on single spaces; every piece is lower-cased and
    stripped before the two sets are intersected.
    """
    first = {piece.lower().strip() for piece in row['question1'].split(" ")}
    second = {piece.lower().strip() for piece in row['question2'].split(" ")}
    return float(len(first.intersection(second)))

df['word_common'] = df.apply(common_words, axis=1)

def total_words(row):
    """Return, as a float, the sum of the distinct-word counts of both questions.

    Each question is split on single spaces; every piece is lower-cased and
    stripped before duplicates are removed. A word present in both questions
    is counted once per question.
    """
    distinct_counts = [
        len({piece.lower().strip() for piece in row[column].split(" ")})
        for column in ('question1', 'question2')
    ]
    return float(sum(distinct_counts))

# Pairwise basic features: combined distinct-word count, share of common
# words (rounded to two decimals), and sum / absolute difference of the
# question-id frequencies.
df['word_total'] = df.apply(total_words, axis=1)

share = df['word_common'] / df['word_total']
df['word_share'] = share.round(2)

frequency_1, frequency_2 = df['freq_qid1'], df['freq_qid2']
df['freq_q1+q2'] = frequency_1 + frequency_2
df['freq_q1-q2'] = (frequency_1 - frequency_2).abs()

# Save the frame with the basic features, without the index column.
df.to_csv("basic_feature_engineering_train.csv", index=False)

df.head()


In [None]:
df.describe()

In [None]:
print("Number of quesitons with minimum length [quesiton 1] : ",df[df['q1_n_words']==1].shape[0])

In [None]:
# Feature: word_share. Left panel is a violin plot per class; right panel
# overlays the per-class density histograms.

plt.figure(figsize=(12,8))
plt.subplot(1,2,1)
sns.violinplot(x='is_duplicate', y='word_share', data=df)

plt.subplot(1,2,2)
# histplot(kde=True, stat='density') replaces the deprecated sns.distplot.
sns.histplot(df.loc[df['is_duplicate']==0, 'word_share'], kde=True, stat='density', label='non_duplicate')
sns.histplot(df.loc[df['is_duplicate']==1, 'word_share'], kde=True, stat='density', label='duplicate')
# Without a legend the labels above are never shown on the figure.
plt.legend()
plt.show()


Important feature, as the distributions of the two classes are different.

In [None]:
# Feature: word_common. Left panel is a violin plot per class; right panel
# overlays the per-class density histograms.

plt.figure(figsize=(12,8))
plt.subplot(1,2,1)
sns.violinplot(x='is_duplicate', y='word_common', data=df)

plt.subplot(1,2,2)
# histplot(kde=True, stat='density') replaces the deprecated sns.distplot.
sns.histplot(df.loc[df['is_duplicate']==0, 'word_common'], kde=True, stat='density', label='non_duplicate')
sns.histplot(df.loc[df['is_duplicate']==1, 'word_common'], kde=True, stat='density', label='duplicate')
# Without a legend the labels above are never shown on the figure.
plt.legend()
plt.show()

Not so important feature

***Preprocessing of Text***

- Remove HTML tags (Quora questions contain HTML tags as well)
- Remove punctuation
- Perform stemming
- remove stop words
- expanding contractions

In [None]:
import re
from nltk.stem import PorterStemmer
from bs4 import BeautifulSoup

def preprocess(x):
    """Normalise one question for feature extraction and return the cleaned string.

    Steps, in order:
      1. coerce to ``str``, lower-case and strip;
      2. spell out ``%`` / ``$``, shorten ``,000`` groups and expand common
         English contractions;
      3. shorten runs of trailing zeros in numbers to ``k`` / ``m`` / ``b``;
      4. run the Porter stemmer and strip HTML markup;
      5. replace every non-word character with a space and strip again.

    Args:
        x: the raw question; any value is accepted because it is passed
            through ``str()`` first.

    Returns:
        The cleaned, lower-case string.
    """
    x = str(x).lower().strip()

    # Replace certain special chars and contractions with their string
    # equivalents. Order matters: the specific forms ("won't", "can't") must
    # be rewritten before the generic "n't" rule.
    replacements = (
        ('%', ' percent'),
        ('$', ' dollar'),
        (',000,000', 'm'),
        (',000', 'k'),
        ("won't", 'will not'),
        ("can't", 'can not'),  # otherwise the "n't" rule turns it into "ca not"
        ('cannot', 'can not'),
        ("n't", ' not'),
        ("what's", 'what is'),
        ("'ve", ' have'),
        ("he's", 'he is'),
        ("she's", 'she is'),
        ("'ll", ' will'),
    )
    for old, new in replacements:
        x = x.replace(old, new)

    x = re.sub(r'([0-9]+)000000000', r'\1b', x)
    x = re.sub(r'([0-9]+)000000', r'\1m', x)
    x = re.sub(r'([0-9]+)000', r'\1k', x)

    # NOTE(review): the stemmer is handed the whole question as one string,
    # not word by word. Kept as-is so existing features stay unchanged -
    # confirm whether per-word stemming was intended.
    x = PorterStemmer().stem(x)

    # Name the parser explicitly so the result does not depend on which
    # optional parsers are installed (and no "guessed parser" warning is raised).
    x = BeautifulSoup(x, 'html.parser').get_text()

    # Raw string: '\W' in a normal literal is an invalid escape sequence.
    return re.sub(r'\W', ' ', x).strip()
    

In [None]:
preprocess("I've already! wasn't <b>done</b>?")

***Advanced Feature Extraction***

- tokens = set of unique words in a sentence
- stop words = nlp stop words
- word = token which is not a stop word


Token Based
- cwc_min = common_word_count/min(len(q1_words),len(q2_words))
- cwc_max = common_word_count/max(len(q1_words),len(q2_words))
- csc_min = common_stop_count/min(len(q1_stops),len(q2_stops))
- csc_max = common_stop_count/max(len(q1_stops),len(q2_stops))
- ctc_min = common_token_count/min(len(q1_tokens),len(q2_tokens))
- ctc_max = common_token_count/max(len(q1_tokens),len(q2_tokens))
- last_word_eq = int(q1_token[-1] == q2_token[-1])
- first_word_eq = int(q1_token[0] == q2_token[0])

Length Based

- mean_len = (len(q1_tokens) + len(q2_tokens)) / 2
- abs_len_diff = abs(len(q1_tokens) - len(q2_tokens))
- longest_substr_ratio : len(longest_common_substr) / min(len(q1_tokens),len(q2_tokens))

***fuzzywuzzy***

gives value b/w 0-100 
0 - dissimilar
100 - similar

- fuzz_ratio : checks the edit distance between 2 strings [minimum no. of add/delete/insert operations required to make strings equals]
    
    issue 1) yankees, newyork yankees = 60 ["newyork" has to be added, so the score is low even though both are the same team]
          2) Newyork mets, Newyork Yankees = 75 [the edit distance is small, so the score is high even though they are different teams]
          
- fuzz_partial_ratio : checks if any partial substring matches completely or not [longer the substring higher the value]
    
    issue : newyork vs atlanta , atlanta vs newyork will give low score
    
- token_sort_ratio : take all the tokens, sort them and then compare
    
    issue : s1 = mariners vs angels
            s2 = los angeles angels of anaheim seattle mariners 
            It will not give very high score as s2 is long with extra tokens
         
- token_set_ratio : apply token sort , so s1 = angels mariners vs , s2 = anaheim angeles angels los mariners of seattle
    
    now t0 = [sorted intersection]
        t1 = [sorted_intersection] + [sorted rest of s1]
        t2 = [sorted_intersection] + [sorted rest of s2]
        
    and then compare each pair and take the max fuzz value among all 

    issue : s1 = sirhan, sirhan   s2 = sirhan   : value is 100, so repetitive words have issues




In [None]:
df.head()

In [None]:
df['question1'] = df['question1'].apply(preprocess)
df['question2'] = df['question2'].apply(preprocess)
df.head()

In [None]:
import nltk
nltk.download('stopwords')

In [None]:
from nltk.corpus import stopwords
stop = stopwords.words('english')
print(stop)

In [None]:
from nltk.corpus import stopwords

_STOP_WORD_CACHE = {}


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    if 'english' not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORD_CACHE['english']


def fetch_token_features(row):
    """Compute the eight token-overlap features for one question pair.

    Args:
        row: mapping with string entries ``'question1'`` and ``'question2'``.

    Returns:
        A list ``[cwc_min, cwc_max, csc_min, csc_max, ctc_min, ctc_max,
        last_word_eq, first_word_eq]``. The ``c?c`` entries are the number of
        common non-stop words / stop words / tokens divided by the smaller
        (``_min``) or larger (``_max``) count of the two questions; the last
        two are 1 when the final / first tokens are equal, else 0. All eight
        entries are 0.0 when either question has no tokens.
    """
    # Added to every denominator so that a zero count cannot divide by zero.
    SAFE_DIV = 0.0001

    token_features = [0.0]*8

    # split() without an argument drops empty strings, so a blank question
    # really yields an empty list (split(" ") never does) and repeated spaces
    # are not counted as '' tokens.
    q1_tokens = row['question1'].split()
    q2_tokens = row['question2'].split()

    if not q1_tokens or not q2_tokens:
        return token_features

    stop_words = _english_stop_words()

    # Distinct stop words of each question.
    q1_stops = {word for word in q1_tokens if word in stop_words}
    q2_stops = {word for word in q2_tokens if word in stop_words}

    # Distinct non-stop words of each question.
    q1_words = {word for word in q1_tokens if word not in stop_words}
    q2_words = {word for word in q2_tokens if word not in stop_words}

    common_word_count = len(q1_words & q2_words)
    common_stop_count = len(q1_stops & q2_stops)
    common_token_count = len(set(q1_tokens) & set(q2_tokens))

    token_features[0] = common_word_count/(min(len(q1_words), len(q2_words)) + SAFE_DIV)
    token_features[1] = common_word_count/(max(len(q1_words), len(q2_words)) + SAFE_DIV)

    token_features[2] = common_stop_count/(min(len(q1_stops), len(q2_stops)) + SAFE_DIV)
    token_features[3] = common_stop_count/(max(len(q1_stops), len(q2_stops)) + SAFE_DIV)

    # The token ratios divide by the full token-list lengths (repeats included).
    token_features[4] = common_token_count/(min(len(q1_tokens), len(q2_tokens)) + SAFE_DIV)
    token_features[5] = common_token_count/(max(len(q1_tokens), len(q2_tokens)) + SAFE_DIV)

    token_features[6] = int(q1_tokens[-1] == q2_tokens[-1])
    token_features[7] = int(q1_tokens[0] == q2_tokens[0])

    return token_features

In [None]:
# Compute the token feature list for every row, then spread its eight
# positions into one column each.
token_features = df.apply(fetch_token_features , axis=1)

token_feature_columns = [
    'cwc_min', 'cwc_max',
    'csc_min', 'csc_max',
    'ctc_min', 'ctc_max',
    'last_word_eq', 'first_word_eq',
]
for position, column in enumerate(token_feature_columns):
    df[column] = [features[position] for features in token_features]

In [None]:
df.head()

In [None]:
!pip install distance

In [None]:
import distance
def fetch_length_features(row):
    """Compute the three length-based features for one question pair.

    Args:
        row: mapping with string entries ``'question1'`` and ``'question2'``.

    Returns:
        A list ``[abs_len_diff, mean_len, longest_substr_ratio]``: the
        absolute difference of the token counts, their mean, and the length of
        the first common substring reported by ``distance.lcsubstrings``
        divided by the smaller token count. All three entries are 0.0 when
        either question has no tokens.
    """
    q1 = row['question1']
    q2 = row['question2']

    length_features = [0.0]*3

    # split() without an argument drops empty strings, so a blank question
    # really yields an empty list (split(" ") never does) and repeated spaces
    # are not counted as tokens.
    q1_tokens = q1.split()
    q2_tokens = q2.split()

    if not q1_tokens or not q2_tokens:
        return length_features

    length_features[0] = abs(len(q1_tokens) - len(q2_tokens))

    length_features[1] = (len(q1_tokens) + len(q2_tokens))/2

    # NOTE(review): lcsubstrings is called on the raw strings, so the
    # numerator is a character count while the denominator is a token count -
    # confirm this unit mix is intended.
    strs = list(distance.lcsubstrings(q1,q2))
    if strs:
        length_features[2] = len(strs[0]) / min(len(q1_tokens), len(q2_tokens))

    return length_features

In [None]:
# Compute the length feature list for every row, then spread its three
# positions into one column each.
length_features = df.apply(fetch_length_features, axis=1)

length_feature_columns = ['abs_len_diff', 'mean_len', 'longest_substr_ratio']
for position, column in enumerate(length_feature_columns):
    df[column] = [features[position] for features in length_features]

In [None]:
df.head()

In [None]:
!pip install fuzzywuzzy

In [None]:
from fuzzywuzzy import fuzz

def fetch_fuzzy_features(row):
    """Return the four fuzzywuzzy similarity scores for one question pair.

    The result is the list ``[QRatio, partial_ratio, token_sort_ratio,
    token_set_ratio]``, each computed on ``row['question1']`` and
    ``row['question2']``.
    """
    first, second = row['question1'], row['question2']
    scorers = (
        fuzz.QRatio,
        fuzz.partial_ratio,
        fuzz.token_sort_ratio,
        fuzz.token_set_ratio,
    )
    return [scorer(first, second) for scorer in scorers]

In [None]:
# Compute the fuzzy score list for every row, then spread its four positions
# into one column each.
fuzzy_features = df.apply(fetch_fuzzy_features, axis=1)

fuzzy_feature_columns = [
    'fuzz_ratio',
    'fuzz_partial_ratio',
    'token_sort_ratio',
    'token_set_ratio',
]
for position, column in enumerate(fuzzy_feature_columns):
    df[column] = [scores[position] for scores in fuzzy_features]

In [None]:
df.head()

***Checking Commit***

In [None]:
df.info()

***Analysis on advanced features***

In [None]:
df = pd.read_csv('../input/quora-processed-data/processed_data_wo_index.csv')
df.head()

In [None]:
sns.pairplot(df[['ctc_min', 'cwc_min', 'csc_min', 'is_duplicate']], hue = 'is_duplicate')

In [None]:
sns.pairplot(df[['ctc_max', 'cwc_max', 'csc_max', 'is_duplicate']], hue = 'is_duplicate')

In [None]:
sns.pairplot(df[['last_word_eq', 'first_word_eq', 'is_duplicate']], hue = 'is_duplicate')

In [None]:
sns.pairplot(df[['abs_len_diff', 'mean_len', 'longest_substr_ratio', 'is_duplicate']], hue = 'is_duplicate')

In [None]:
sns.pairplot(df[['fuzz_ratio', 'fuzz_partial_ratio', 'token_sort_ratio', 'token_set_ratio', 'is_duplicate']], hue = 'is_duplicate')

***IDF-weighted word vectors using spaCy (`en_core_web_sm`)***

In [None]:
df = pd.read_csv('../input/quora-processed-data/processed_data_wo_index.csv')

In [None]:
df.head()

In [None]:
!pip install tqdm

In [None]:
# Make sure every entry of both question columns is a Python string.
for text_column in ('question1', 'question2'):
    df[text_column] = df[text_column].apply(str)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Merge the texts of both question columns into one corpus.

questions = list(df['question1']) + list(df['question2'])

tfidf = TfidfVectorizer(lowercase=False)
# Only the learned idf weights are used below, so fit() is enough;
# fit_transform() would also build a document-term matrix that is thrown away.
tfidf.fit(questions)

# dict key -> word and value -> idf value
word2tfidf = dict(zip(tfidf.get_feature_names_out(), tfidf.idf_))

In [None]:
# pip update spacy

In [None]:
import spacy

***From here on, `tdf` refers to a random sample of the data***

In [None]:
# Draw the working sample. A fixed seed makes the sample (and everything
# derived from it) reproducible, and capping n at len(df) avoids the
# ValueError pandas raises when asked for more rows than exist.
tdf = df.sample(n=min(100000, len(df)), random_state=42)
tdf.head()

In [None]:
import spacy
from tqdm import tqdm

nlp = spacy.load('en_core_web_sm')

vecs1 = []

#tqdm is used to display progress
for qu1 in tqdm(list(tdf['question1'])):
    doc1 = nlp(qu1)
    # Running sum of the idf-weighted token vectors (96 components each).
    # This equals, up to rounding, what adding each weighted vector to every
    # row of a (len(doc), 96) matrix and averaging the identical rows gives,
    # but a document without tokens yields a zero vector instead of NaN.
    mean_vec1 = np.zeros(96)
    for word1 in doc1:
        # Tokens that are not in the tf-idf vocabulary get weight 0.
        idf = word2tfidf.get(str(word1), 0)
        mean_vec1 += word1.vector * idf
    vecs1.append(mean_vec1)

tdf['q1_feats_m'] = vecs1

In [None]:
vecs2 = []

#tqdm is used to display progress
for qu2 in tqdm(list(tdf['question2'])):
    doc2 = nlp(qu2)
    # Running sum of the idf-weighted token vectors (96 components each).
    # This equals, up to rounding, what adding each weighted vector to every
    # row of a (len(doc), 96) matrix and averaging the identical rows gives,
    # but a document without tokens yields a zero vector instead of NaN.
    mean_vec2 = np.zeros(96)
    for word2 in doc2:
        # Tokens that are not in the tf-idf vocabulary get weight 0.
        idf = word2tfidf.get(str(word2), 0)
        mean_vec2 += word2.vector * idf
    vecs2.append(mean_vec2)

tdf['q2_feats_m'] = vecs2

In [None]:
tdf.head()

In [None]:
# tdf.to_csv('/kaggle/working/word2vec.csv' , index = False)

In [None]:
df_q1 = pd.DataFrame(tdf.q1_feats_m.values.tolist(), index = tdf.index)
df_q1.head()

In [None]:
new_col_names = {col : str(col) + '_q1' for col in df_q1.columns}
df_q1.rename(columns = new_col_names, inplace=True)
df_q1.head()

In [None]:
df_q2 = pd.DataFrame(tdf.q2_feats_m.values.tolist(), index = tdf.index)
df_q2.head()

In [None]:
new_col_names = {col : str(col) + '_q2' for col in df_q2.columns}
df_q2.rename(columns = new_col_names, inplace=True)
df_q2.head()

***Total number of features***

- 11 basic features
- 15 advanced features
- 96 features for q1
- 96 features for q2
- total : 218

In [None]:
df_basic_adv = tdf.drop(['id','qid1','qid2','question1','question2','is_duplicate', 'q1_feats_m','q2_feats_m'], axis=1)
df_basic_adv.head()

In [None]:
f_df = pd.concat([df_q1 , df_q2], axis=1)
f_df.head()

In [None]:
f_df = pd.concat([df_basic_adv, f_df], axis=1)
f_df.head()

In [None]:
checkpoint_dataset = pd.concat([f_df, pd.DataFrame(tdf['is_duplicate'])], axis=1)
checkpoint_dataset.head()

In [None]:
checkpoint_dataset.to_csv('/kaggle/working/quora_idf_wieghted_word2vec_sample.csv')

In [None]:
y_true = tdf['is_duplicate']
y_true.head()

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(f_df, y_true, random_state=42, stratify=y_true, test_size=0.3)

In [None]:
print("Number of training points :", X_train.shape)
print("Number of testing  points :", X_test.shape)

In [None]:
from collections import Counter
# Print the share of class 0 and class 1 in the train and the test labels.
train_distr, test_distr = Counter(y_train), Counter(y_test)
train_len, test_len = len(y_train), len(y_test)

split_reports = (
    ("Distribution of output variable in train data", train_distr, train_len),
    ("Distribution of output variable in test data", test_distr, test_len),
)
for position, (title, distribution, size) in enumerate(split_reports):
    if position:
        print()
    print("-"*10, title, "-"*10)
    share_0 = round(int(distribution[0])/size, 3)
    share_1 = round(int(distribution[1])/size, 3)
    print("Class 0: ", share_0, "Class 1: ", share_1)

***Building a random model***

In [None]:
from sklearn.metrics import log_loss
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

In [None]:
# Baseline: for every test point draw two uniform random numbers and
# normalise them to sum to 1, then score these "probabilities" with log loss.
predicted_y = np.zeros((test_len, 2))
for row_index in range(test_len):
    draw = np.random.rand(1, 2)[0]
    predicted_y[row_index] = draw / draw.sum()

print("Log Loss on Test Data using Random model :", log_loss(y_test, predicted_y))

***Logistic Regression with hyperparameter tuning***

In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.calibration import CalibratedClassifierCV

In [None]:
# Candidate regularisation strengths: 1e-5 ... 10.
alpha = [10 ** x for x in range(-5,2)]

# Choose alpha on a validation split carved out of the training data.
# Selecting it on X_test would leak the test set into model selection and
# make the reported test loss optimistic.
X_fit, X_cv, y_fit, y_cv = train_test_split(
    X_train, y_train, test_size=0.2, stratify=y_train, random_state=42
)

log_error_array = []
for i in alpha:
    clf = SGDClassifier(alpha=i, penalty='l2', loss='log_loss', random_state=42)
    clf.fit(X_fit, y_fit)
    # Sigmoid calibration so that predict_proba returns calibrated probabilities.
    sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
    sig_clf.fit(X_fit, y_fit)
    predict_y = sig_clf.predict_proba(X_cv)
    cv_loss = log_loss(y_cv, predict_y, labels = clf.classes_)
    log_error_array.append(cv_loss)
    print('For value of alpha = ',i ,"the validation log loss is: ", cv_loss)

# Position in `alpha` (not the value) of the candidate with the lowest loss.
best_alpha = np.argmin(log_error_array)


In [None]:
# Refit the calibrated logistic-regression model with the selected alpha and
# report its log loss on the train and the test data.
chosen_alpha = alpha[best_alpha]

clf = SGDClassifier(alpha=chosen_alpha, penalty='l2', loss='log_loss', random_state=42)
clf.fit(X_train, y_train)
sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
sig_clf.fit(X_train, y_train)

evaluation_sets = (
    ("Train Log Loss: ", X_train, y_train),
    ("Test Log Loss: ", X_test, y_test),
)
for loss_label, features, targets in evaluation_sets:
    predict_y = sig_clf.predict_proba(features)
    print("For alpha value: ", chosen_alpha, loss_label, log_loss(targets, predict_y, labels = clf.classes_))

***Linear SVM***

In [None]:
# Candidate regularisation strengths: 1e-5 ... 10.
alpha = [10 ** x for x in range(-5,2)]

# Choose alpha on a validation split carved out of the training data.
# Selecting it on X_test would leak the test set into model selection and
# make the reported test loss optimistic.
X_fit, X_cv, y_fit, y_cv = train_test_split(
    X_train, y_train, test_size=0.2, stratify=y_train, random_state=42
)

log_error_array = []
for i in alpha:
    clf = SGDClassifier(alpha=i, penalty='l1', loss='hinge', random_state=42)
    clf.fit(X_fit, y_fit)
    # Hinge loss gives no probabilities; the sigmoid calibrator supplies them.
    sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
    sig_clf.fit(X_fit, y_fit)
    predict_y = sig_clf.predict_proba(X_cv)
    cv_loss = log_loss(y_cv, predict_y, labels = clf.classes_)
    log_error_array.append(cv_loss)
    print('For value of alpha = ',i ,"the validation log loss is: ", cv_loss)

# Position in `alpha` (not the value) of the candidate with the lowest loss.
best_alpha = np.argmin(log_error_array)

# Refit on the full training data with the selected alpha; the test set is
# used only for the final report.
clf = SGDClassifier(alpha=alpha[best_alpha], penalty='l1', loss='hinge', random_state=42)
clf.fit(X_train, y_train)
sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
sig_clf.fit(X_train, y_train)

predict_y = sig_clf.predict_proba(X_train)
print("For alpha value: ",alpha[best_alpha], "Train Log Loss: ", log_loss(y_train, predict_y, labels = clf.classes_))
predict_y = sig_clf.predict_proba(X_test)
print("For alpha value: ",alpha[best_alpha], "Test Log Loss: ", log_loss(y_test, predict_y, labels = clf.classes_))

***XGBoost***

In [None]:
import xgboost as xgb
params = {}
params['objective'] = 'binary:logistic'
params['eval_metric'] = 'logloss'
params['eta'] = 0.02
params['max_depth'] = 4

# Early stopping needs a monitoring set. Carve it out of the training data so
# that the test set plays no part in deciding when training stops.
X_fit, X_valid, y_fit, y_valid = train_test_split(
    X_train, y_train, test_size=0.2, stratify=y_train, random_state=42
)

# DMatrix objects are the data containers xgb.train and Booster.predict work on.
d_train = xgb.DMatrix(X_fit, label = y_fit)
d_valid = xgb.DMatrix(X_valid, label = y_valid)
d_test = xgb.DMatrix(X_test, label = y_test)

# The last entry of the watchlist is the one early stopping monitors.
watchlist = [(d_train, 'train'), (d_valid, 'valid')]

# verbose_eval = print the evaluation metric every 20 iterations
# early_stopping_rounds = stop if 'valid' has not improved for 20 iterations
bst = xgb.train(params, d_train, 400, watchlist, early_stopping_rounds=20, verbose_eval=20)

# Not used in this cell; kept so that any later cell referring to it still runs.
xgdmat = xgb.DMatrix(X_train,y_train)

# Predict with the trees up to the best validation iteration.
predict_y = bst.predict(d_test, iteration_range=(0, bst.best_iteration + 1))

# Spell the labels out instead of borrowing `classes_` from a classifier
# fitted in a different cell.
print("The test log loss is: ", log_loss(y_test, predict_y, labels=[0, 1]))