In [None]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt 
%matplotlib inline 

In [None]:
!pip install dask_ml

### 1. Vectorization: Bag of Words with Basic Features

In [None]:
## Read the training table that carries the basic hand-crafted features
df_basic = pd.read_csv(
    "/content/drive/MyDrive/Deep_Learning/NLP/Duplicate_question/basic_train.csv"
)
# Show the first two rows as a quick sanity check.
df_basic.head(n=2)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,freq_qid1,freq_qid2,q1len,q2len,q1_n_words,q2_n_words,word_common,word_total,word_share,freq_q1+q2,freq_q1-q2
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,1,1,66,57,14,12,10.0,23.0,0.434783,2,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,4,2,51,88,8,13,4.0,20.0,0.2,6,2


In [None]:
# (rows, columns) of the loaded table, before any sub-sampling.
df_basic.shape

(404287, 17)

In [None]:
### We will use only 40k rows for sake of computation (fixed seed => same rows every run)
df_basic = df_basic.sample(n=40000,random_state=12)

In [None]:
## Applying Bag of Words
from sklearn.feature_extraction.text import CountVectorizer

# Stack question1 on top of question2 so both are encoded with one shared
# vocabulary. Empty CSV fields are read back by pandas as NaN, which
# CountVectorizer rejects as a document, so missing questions become "".
questions = (
    pd.concat([df_basic['question1'], df_basic['question2']])
    .fillna('')
    .astype(str)
    .tolist()
)

cv = CountVectorizer(max_features=3000)
# Rows [0, n) belong to question1 and rows [n, 2n) to question2, so an even
# vertical split recovers one count matrix per question column.
q1_arr, q2_arr = np.vsplit(cv.fit_transform(questions).toarray(), 2)

In [None]:
# Wrap the count matrices in frames aligned on df_basic's index. Each side gets
# its own column prefix: without it both frames use the labels 0..N-1 and the
# concatenated table ends up with duplicate column labels.
temp_df1 = pd.DataFrame(q1_arr, index=df_basic.index).add_prefix('q1_bow_')
temp_df2 = pd.DataFrame(q2_arr, index=df_basic.index).add_prefix('q2_bow_')


In [None]:
# Place the question1 and question2 count columns side by side (row-aligned on the index).
temp_df = pd.concat([temp_df1, temp_df2], axis='columns')
temp_df.shape

(40000, 6000)

In [None]:
## Remove the identifier and raw-text columns
final_df = df_basic.drop(['id', 'qid1', 'qid2', 'question1', 'question2'], axis=1)



In [None]:
# Assemble the model table from df_basic itself rather than from the previous
# value of final_df, so re-running this cell never appends the Bag-of-Words
# columns a second time.
final_df = pd.concat(
    [df_basic.drop(columns=['id','qid1','qid2','question1','question2']), temp_df],
    axis=1,
)
print(final_df.shape)


(40000, 6012)


In [None]:
# Keep a separate copy of the basic-feature table under its own name.
final_df_basic = final_df.copy()

In [None]:
# Persist the vectorized table. to_csv is called with its default index handling,
# so the row index is written too, as a first column with an empty header.
final_df_basic.to_csv("/content/drive/MyDrive/Deep_Learning/NLP/Duplicate_question/basic_train_vectorized.csv")

In [None]:
# Spot-check three random rows (no random_state is passed, so the rows differ between runs).
final_df.sample(3)

Unnamed: 0,is_duplicate,freq_qid1,freq_qid2,q1len,q2len,q1_n_words,q2_n_words,word_common,word_total,word_share,...,2990,2991,2992,2993,2994,2995,2996,2997,2998,2999
285194,0,1,1,108,121,15,17,5.0,31.0,0.16129,...,0,0,0,0,0,0,0,0,0,0
151303,0,1,2,35,32,7,7,6.0,14.0,0.428571,...,0,0,0,0,0,0,0,0,0,0
259561,0,3,2,51,60,9,11,5.0,20.0,0.25,...,0,0,0,0,0,0,0,0,0,0


### 2. Training - with Basic Features

In [None]:
## Load preprocessed dataset with basic features
final_df = pd.read_csv("/content/drive/MyDrive/Deep_Learning/NLP/Duplicate_question/basic_train_vectorized.csv")

In [None]:
# 'Unnamed: 0' is the label pandas gives to a header-less CSV column; here it is
# presumably the row index saved by to_csv and is not a feature.
# errors='ignore' keeps the cell re-runnable once the column is already gone.
final_df = final_df.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, matthews_corrcoef
## creating  metrics scores 
def metrics(y_true, y_pred, y_score=None):
    """Compute a set of binary-classification metrics.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Hard class predictions (e.g. the output of ``model.predict``).
    y_score : array-like, optional
        Continuous scores for the positive class (e.g.
        ``model.predict_proba(X)[:, 1]``). When supplied, ROC AUC and the ROC
        curve are computed from these scores. When omitted, the hard
        predictions are used instead, which yields a ROC curve with a single
        operating point between (0, 0) and (1, 1).

    Returns
    -------
    dict
        Keys: 'Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC',
        'True Positive Rate', 'False Positive Rate' and 'MCC'. The two rate
        entries are the arrays returned by ``roc_curve``; the rest are scalars.
    """
    # Threshold-free metrics need a ranking; fall back to the labels so that
    # existing two-argument calls keep returning exactly what they did before.
    ranking = y_pred if y_score is None else y_score
    fpr, tpr, _ = roc_curve(y_true, ranking)

    return {'Accuracy': accuracy_score(y_true, y_pred),
            'Precision': precision_score(y_true, y_pred),
            'Recall': recall_score(y_true, y_pred),
            'F1 Score': f1_score(y_true, y_pred),
            'ROC AUC': roc_auc_score(y_true, ranking),
            'True Positive Rate': tpr,
            'False Positive Rate': fpr,
            'MCC': matthews_corrcoef(y_true, y_pred)}

In [None]:
# Target: the label column; features: every other column. Both as NumPy arrays.
y = final_df['is_duplicate'].values
x = final_df.drop(columns=['is_duplicate']).values

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the seed fixes the partition.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=1,
)

In [None]:
# Peek at the training feature matrix.
X_train

array([[ 1.,  1., 38., ...,  0.,  0.,  0.],
       [ 3.,  3., 52., ...,  0.,  0.,  0.],
       [ 1.,  1., 34., ...,  0.,  0.,  0.],
       ...,
       [ 8.,  7., 33., ...,  0.,  0.,  0.],
       [ 1.,  1., 70., ...,  0.,  0.,  0.],
       [ 1.,  1., 89., ...,  0.,  0.,  0.]])

In [None]:
## Using Random Forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# A random forest is stochastic (bootstrap samples, feature sub-sampling):
# without a fixed random_state the scores change on every run.
# n_jobs=-1 only parallelises the work over all cores; it does not alter the model.
rf = RandomForestClassifier(random_state=1, n_jobs=-1)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
accuracy_score(y_test, y_pred)

0.8275

In [None]:
# Full metric report for the latest predictions on the held-out split.
metrics(y_test, y_pred)

{'Accuracy': 0.8275,
 'Precision': 0.8505747126436781,
 'Recall': 0.6486850977747809,
 'F1 Score': 0.7360367253251722,
 'ROC AUC': 0.7907708365314111,
 'True Positive Rate': array([0.       , 0.6486851, 1.       ]),
 'False Positive Rate': array([0.        , 0.06714342, 1.        ]),
 'MCC': 0.6237297344930357}

In [None]:
## Using XGBoost
from xgboost import XGBClassifier

# Default-configured booster: fit on the training split, predict the held-out split.
xgb = XGBClassifier().fit(X_train, y_train)
y_pred = xgb.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.837625

In [None]:
# Full metric report for the latest predictions on the held-out split.
metrics(y_test, y_pred)


{'Accuracy': 0.837625,
 'Precision': 0.802540834845735,
 'Recall': 0.7454484153742414,
 'F1 Score': 0.7729417933927635,
 'ROC AUC': 0.8186916292206924,
 'True Positive Rate': array([0.        , 0.74544842, 1.        ]),
 'False Positive Rate': array([0.        , 0.10806516, 1.        ]),
 'MCC': 0.6479019442094254}

### 3. Vectorization: Bag of Words with Advanced Features

In [None]:
## Read the training table that carries the advanced features
df_advanced = pd.read_csv(
    "/content/drive/MyDrive/Deep_Learning/NLP/Duplicate_question/advanced_train.csv"
)
# Show two random rows as a quick sanity check.
df_advanced.sample(n=2)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,freq_qid1,freq_qid2,q1len,q2len,...,ctc_min,ctc_max,last_word_eq,first_word_eq,abs_len_diff,mean_len,fuzz_ratio,fuzz_partial_ratio,token_sort_ratio,token_set_ratio
400313,400316,90278,533665,How can I trace my family line to make sure I'...,Today is my cousin sister's birthday. What sho...,0,2,1,75,77,...,0.153845,0.117646,0,0,4,15.0,20,23,46,44
123909,123910,525,180415,What are your New Year's resolutions for 2017?,What are some of your New Year's resolutions f...,0,32,6,46,54,...,0.874989,0.699993,0,1,2,9.0,90,85,90,94


In [None]:
# Work on a reproducible 20k-row sample of the table.
df_advanced = df_advanced.sample(n=20000,random_state=12)


from sklearn.feature_extraction.text import CountVectorizer

# Stack question1 on top of question2 so both are encoded with one shared
# vocabulary. Empty CSV fields are read back by pandas as NaN, which
# CountVectorizer rejects as a document, so missing questions become "".
questions = (
    pd.concat([df_advanced['question1'], df_advanced['question2']])
    .fillna('')
    .astype(str)
    .tolist()
)

## Making max_features = 5k
cv = CountVectorizer(max_features=5000)
# Rows [0, n) belong to question1 and rows [n, 2n) to question2, so an even
# vertical split recovers one count matrix per question column.
q1_arr, q2_arr = np.vsplit(cv.fit_transform(questions).toarray(), 2)





In [None]:
# Wrap the count matrices in frames aligned on df_advanced's index. Each side
# gets its own column prefix: without it both frames use the labels 0..N-1 and
# the concatenated table ends up with duplicate column labels.
temp_df1 = pd.DataFrame(q1_arr, index=df_advanced.index).add_prefix('q1_bow_')
temp_df2 = pd.DataFrame(q2_arr, index=df_advanced.index).add_prefix('q2_bow_')

temp_df = pd.concat([temp_df1, temp_df2], axis=1)
temp_df.shape

(20000, 10000)

In [None]:
# Remove the identifier and raw-text columns.
final_df = df_advanced.drop(['id', 'qid1', 'qid2', 'question1', 'question2'], axis=1)

In [None]:
# Assemble the model table from df_advanced itself rather than from the
# previous value of final_df, so re-running this cell never appends the
# Bag-of-Words columns a second time.
final_df = pd.concat(
    [df_advanced.drop(columns=['id','qid1','qid2','question1','question2']), temp_df],
    axis=1,
)
print(final_df.shape)

(20000, 10026)


In [None]:
# Preview the first two rows of the combined table.
final_df.head(2)

Unnamed: 0,is_duplicate,freq_qid1,freq_qid2,q1len,q2len,q1_n_words,q2_n_words,word_common,word_total,word_share,...,4990,4991,4992,4993,4994,4995,4996,4997,4998,4999
91592,1,22,18,48,61,10,12,6.0,22.0,0.272727,...,0,0,0,0,0,0,0,0,0,0
274912,0,1,1,34,19,5,4,2.0,9.0,0.222222,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Target: the label column; features: every other column (kept as pandas objects).
y = final_df['is_duplicate']
x = final_df.drop(columns=['is_duplicate'])

### 4. Training - with Advanced Features

In [None]:
!pip install dask_ml

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the seed fixes the partition.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=1,
)

In [None]:
import pandas as pd
# NOTE(review): `dd` is not referenced in any of the visible cells.
import dask.dataframe as dd
# NOTE(review): this rebinds `train_test_split` to dask_ml's implementation,
# replacing the scikit-learn one imported in the previous cell.
from dask_ml.model_selection import train_test_split

# Split the dataset into training and testing sets
# NOTE(review): same arguments as the scikit-learn split in the previous cell,
# and x / y are pandas objects here rather than dask collections, so this second
# split looks redundant; confirm whether dask is actually needed.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)


In [None]:
# Convert all four split pieces to plain NumPy arrays before model fitting.
X_train, X_test, y_train, y_test = (
    np.array(part) for part in (X_train, X_test, y_train, y_test)
)

In [None]:
## Using Random Forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# A random forest is stochastic (bootstrap samples, feature sub-sampling):
# without a fixed random_state the scores change on every run.
# n_jobs=-1 only parallelises the work over all cores; it does not alter the model.
rf = RandomForestClassifier(random_state=1, n_jobs=-1)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
accuracy_score(y_test, y_pred)

0.8265

In [None]:
# Full metric report for the latest predictions on the held-out split.
metrics(y_test, y_pred)

{'Accuracy': 0.8265,
 'Precision': 0.8122605363984674,
 'Recall': 0.7024519549370444,
 'F1 Score': 0.7533759772565742,
 'ROC AUC': 0.8020489401341183,
 'True Positive Rate': array([0.        , 0.70245195, 1.        ]),
 'False Positive Rate': array([0.        , 0.09835407, 1.        ]),
 'MCC': 0.6245312221141693}

In [None]:
## Using XGBoost
from xgboost import XGBClassifier

xgb = XGBClassifier()
xgb.fit(X_train, y_train)
y_pred = xgb.predict(X_test)
# Only the last expression of a cell is displayed, so a bare accuracy_score(...)
# placed before it would be computed and thrown away; metrics() reports the
# accuracy together with the other scores.
metrics(y_true=y_test, y_pred=y_pred)

{'Accuracy': 0.83575,
 'Precision': 0.8034188034188035,
 'Recall': 0.7475149105367793,
 'F1 Score': 0.7744593202883625,
 'ROC AUC': 0.8183580172916735,
 'True Positive Rate': array([0.        , 0.74751491, 1.        ]),
 'False Positive Rate': array([0.        , 0.11079888, 1.        ]),
 'MCC': 0.6466084453725007}