In [2]:
#import pandas for file handling and reading
import pandas as pd

In [3]:
#import regular expression for data pre-processing
import re
from string import punctuation
from nltk.stem import SnowballStemmer

In [4]:
#read the file
df=pd.read_csv('../input/traincsv/train.csv')

In [5]:
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [6]:
#check data type for column for further manipulation
df['question1'].dtype

dtype('O')

In [7]:
#convert data type of column to string
df['question1']=df['question1'].astype("string")

In [8]:
df['question1'].dtype

StringDtype

In [9]:
#convert data type of column to string
df['question2']=df['question2'].astype("string")

In [10]:
#Check missing values
df['question1'].isna().sum()

1

In [11]:
#replace missing value with other string
df['question1'].fillna('missing', inplace=True)

In [12]:
df['question1'].isna().sum()

0

In [13]:
#Check missing values
df['question2'].isna().sum()

2

In [14]:
#replace missing value with other string
df['question2'].fillna('missing', inplace=True)

In [15]:
df['question1'].isna().sum()

0

In [16]:
#create a method for preprocessing data
def preprocess(text):
    text = text.lower()
    text = re.sub("\'s", " ", text) 
    text = re.sub(" whats ", " what is ", text)
    text = re.sub("\'ve", " have ", text)
    text = re.sub("can't", "can not", text)
    text = re.sub("n't", " not ", text)
    text = re.sub("i'm", "i am", text)
    text = re.sub("\'re", " are ", text)
    text = re.sub("\'d", " would ", text)
    text = re.sub("\'ll", " will ", text)
    text = re.sub("e\.g\.", " eg ", text)
    text = re.sub("b\.g\.", " bg ", text)
    text = re.sub("(\d+)(kK)", " \g<1>000 ", text)
    text = re.sub("e-mail", " email ", text)
    text = re.sub("(the[\s]+|The[\s]+)?U\.S\.A\.", " America ", text)
    text = re.sub("(the[\s]+|The[\s]+)?United State(s)?", " America ", text)
    text = re.sub("\(s\)", " ", text)
    text = re.sub("[c-fC-F]\:\/", " disk ", text)
    text = re.sub('\$', " dollar ", text)
    text = re.sub('\%', " percent ", text)
    text = re.sub('\&', " and ", text)
    text = re.sub('\$', " dollar ", text)
    text = re.sub('\%', " percent ", text)
    text = re.sub('\&', " and ", text)
    return text

In [17]:
#preproces both the columns
df['question1']=df['question1'].apply(preprocess)

In [18]:
df['question2']=df['question2'].apply(preprocess)

In [19]:
#import nltk for tokenization
from nltk.tokenize import word_tokenize 

In [20]:
from sklearn.feature_extraction.text import CountVectorizer

In [20]:
#concat both the column and vector
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(pd.concat((df['question1'],df['question2'])).unique())

CountVectorizer(token_pattern='\\w{1,}')

In [21]:
#vectorize both columns
trainq1_trans = count_vect.transform(df['question1'].values)
trainq2_trans = count_vect.transform(df['question2'].values)

In [22]:
labels = df['is_duplicate'].values

In [23]:
#split data for train test
from sklearn.model_selection import train_test_split

In [24]:
from scipy.sparse import hstack
import scipy

In [25]:
X = scipy.sparse.hstack((trainq1_trans,trainq2_trans))
y = labels
X_train,X_valid,y_train,y_valid = train_test_split(X,y, test_size = 0.33, random_state = 42)

In [26]:
#train the model and import xgboast
import xgboost as xgb

In [27]:
xgb_model = xgb.XGBClassifier(max_depth=50, n_estimators=80, learning_rate=0.1, colsample_bytree=.7, gamma=0, reg_alpha=4, objective='binary:logistic', eta=0.3, silent=1, subsample=0.8).fit(X_train, y_train) 
xgb_prediction = xgb_model.predict(X_valid)



Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




In [28]:
from sklearn.metrics import f1_score, classification_report, accuracy_score

In [29]:
#check the model for train and test accuracy
print('training score:', f1_score(y_train, xgb_model.predict(X_train), average='macro'))

training score: 0.8287045701243634


In [30]:
print('validation score:', f1_score(y_valid, xgb_model.predict(X_valid), average='macro'))

validation score: 0.7587244544350897


In [31]:
print(classification_report(y_valid, xgb_prediction))

              precision    recall  f1-score   support

           0       0.79      0.90      0.84     83975
           1       0.78      0.60      0.68     49441

    accuracy                           0.79    133416
   macro avg       0.78      0.75      0.76    133416
weighted avg       0.79      0.79      0.78    133416



In [22]:
#import tfidf
from sklearn.feature_extraction.text import TfidfVectorizer

In [23]:
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)


In [24]:
#concat both column and create a word vector tfidf
tfidf_vect.fit(pd.concat((df['question1'],df['question2'])).unique())

TfidfVectorizer(max_features=5000, token_pattern='\\w{1,}')

In [25]:
# create vector as per tfidf for both columns
trainq1_trans = tfidf_vect.transform(df['question1'].values)

In [26]:
trainq2_trans = tfidf_vect.transform(df['question2'].values)

In [27]:
labels = df['is_duplicate'].values

In [29]:
#import packages for train and spliting the data for training and testing
from scipy.sparse import hstack
import scipy

In [30]:
X = scipy.sparse.hstack((trainq1_trans,trainq2_trans))
y = labels

In [33]:
from sklearn.model_selection import train_test_split

In [34]:
X_train,X_valid,y_train,y_valid = train_test_split(X,y, test_size = 0.33, random_state = 42)

In [36]:
import xgboost as xgb

In [37]:
#train the mdel
xgb_model = xgb.XGBClassifier(max_depth=50, n_estimators=80, learning_rate=0.1, colsample_bytree=.7, gamma=0, reg_alpha=4, objective='binary:logistic', eta=0.3, silent=1, subsample=0.8).fit(X_train, y_train) 




Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




In [38]:
#test the model
xgb_prediction = xgb_model.predict(X_valid)

In [40]:
#test the model for the accuracy
from sklearn.metrics import f1_score, classification_report, accuracy_score

In [41]:
print('word level tf-idf training score:', f1_score(y_train, xgb_model.predict(X_train), average='macro'))

word level tf-idf training score: 0.8880485807630081


In [42]:
print('word level tf-idf validation score:', f1_score(y_valid, xgb_model.predict(X_valid), average='macro'))

word level tf-idf validation score: 0.7621387620660536


In [43]:
print(classification_report(y_valid, xgb_prediction))

              precision    recall  f1-score   support

           0       0.79      0.90      0.84     83975
           1       0.78      0.60      0.68     49441

    accuracy                           0.79    133416
   macro avg       0.79      0.75      0.76    133416
weighted avg       0.79      0.79      0.78    133416

