In [25]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings
warnings.filterwarnings('ignore')
import timeit

In [26]:
# Load train.csv and keep a random 1000-row sample for the experiments below.
# NOTE(review): sample() is called without random_state, so a different subset is
# drawn on every run. Later cells look up hard-coded row labels (71293, 372138)
# that a fresh sample is not guaranteed to contain.
df=pd.read_csv('train.csv').sample(1000)

In [27]:
df.is_duplicate.value_counts()

0    592
1    408
Name: is_duplicate, dtype: int64

In [28]:
df.isna().sum()

id              0
qid1            0
qid2            0
question1       0
question2       0
is_duplicate    0
dtype: int64

## Without Feature Engineering

In [29]:
question_df = df[['question1','question2']]

In [30]:
from sklearn.feature_extraction.text import CountVectorizer

In [31]:
from sklearn.feature_extraction.text import CountVectorizer
# Stack every question1 text followed by every question2 text so one shared
# vocabulary is learned across both columns.
questions = [*question_df['question1'], *question_df['question2']]

# Vectorize with the vocabulary capped at 3000 terms, then cut the dense count
# matrix back into its question1 half (top rows) and question2 half (bottom rows).
cv = CountVectorizer(max_features=3000)
bow_counts = cv.fit_transform(questions).toarray()
q1_arr, q2_arr = np.vsplit(bow_counts, 2)

In [32]:
# Wrap each half of the count matrix in a DataFrame that reuses the row labels of
# question_df, then put the question1 and question2 columns side by side.
temp_df1, temp_df2 = (
    pd.DataFrame(half, index=question_df.index) for half in (q1_arr, q2_arr)
)
temp_df = pd.concat([temp_df1, temp_df2], axis=1)
temp_df.shape

(1000, 6000)

In [33]:
temp_df['is_duplicate'] = df['is_duplicate']

In [34]:
from sklearn.model_selection import train_test_split

# Every column except the last is a feature; the last column is the is_duplicate
# label appended in the previous cell. 20% of the rows are held out for testing.
features = temp_df.iloc[:, :-1].values
labels = temp_df.iloc[:, -1].values
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=1
)

In [35]:
from sklearn.ensemble import     RandomForestClassifier
from sklearn.tree import         DecisionTreeClassifier
from sklearn.tree import         ExtraTreeClassifier
from sklearn.ensemble import     AdaBoostClassifier
from sklearn.ensemble import     BaggingClassifier
from sklearn.ensemble import     ExtraTreesClassifier
from sklearn.ensemble import     GradientBoostingClassifier
from sklearn.ensemble import     VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import          SVC
from sklearn.metrics import accuracy_score,mean_absolute_error,mean_absolute_error,mean_squared_error

In [36]:
# One instance of each classifier to compare. The tree-based and ensemble
# estimators involve randomness, so each gets a fixed random_state (the same value
# the train/test split uses); without it the scores change from run to run.
# LogisticRegression is left at its defaults.
LR  = LogisticRegression()
RF  = RandomForestClassifier(random_state=1)
DT  = DecisionTreeClassifier(random_state=1)
ET  = ExtraTreeClassifier(random_state=1)
ABC = AdaBoostClassifier(random_state=1)
BC  = BaggingClassifier(random_state=1)
GBC = GradientBoostingClassifier(random_state=1)

In [62]:
# Short display name -> estimator. Both parallel lists are derived from this one
# mapping, so their order always matches.
estimators_by_name = {
    'LR': LR,
    'RF': RF,
    'DT': DT,
    'ET': ET,
    'ABC': ABC,
    'BC': BC,
    'GBC': GBC,
}
ml_algo = list(estimators_by_name.values())
ml_name = list(estimators_by_name)

In [52]:
import time

# Fit each estimator on the training split, predict the held-out split, and
# record the accuracy together with the wall-clock time of fit + predict.
score, time_taken = [], []
for clf in ml_algo:
    start = timeit.default_timer()
    y_pred = clf.fit(X_train, y_train).predict(X_test)
    stop = timeit.default_timer()
    time_taken.append(stop - start)
    score.append(accuracy_score(y_test, y_pred))

In [82]:
score_chart = pd.DataFrame({'ml_name':ml_name, 'score':[x*100 for x in score], 'time_taken':time_taken})

In [83]:
score_chart

Unnamed: 0,ml_name,score,time_taken
0,LR,60.5,0.368664
1,RF,64.0,2.292988
2,DT,58.5,0.765625
3,ET,58.0,0.079584
4,ABC,57.5,4.938496
5,BC,62.0,4.765967
6,GBC,61.0,16.298272


In [101]:
import plotly.express as px
import plotly.io as pio

# Use plotly's 'iframe' renderer for fig.show().
pio.renderers.default = 'iframe'

# One bar per model: bar height is the accuracy (%), bar colour is the measured
# time. The labels mapping renames the columns score_chart really has; the
# previous mapping was keyed on 'pop', a column this frame does not contain, so it
# had no effect on the figure.
fig = px.bar(
    score_chart, x="ml_name", y="score",
    hover_data=["score", "time_taken"], color="time_taken",
    labels={'ml_name': 'model', 'score': 'accuracy (%)', 'time_taken': 'time taken (s)'},
    title='Without feature engineering (bag of words): accuracy by model',
    height=400, text_auto=True,
)
fig.show()

## Feature Engineering

In [102]:
df

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
71293,71293,33778,122704,Can a British citizen work in the US if he doe...,I am a US citizen and live in the US but I wor...,0
5172,5172,10190,10191,Why do people adopt savannah cats?,Why is my adopted cat hiding from me?,0
127014,127014,204568,204569,Does Palantir sponsor H1B visas?,How do engineers get promoted at Palantir?,0
372138,372138,60122,320508,What hotel in Maibong Hill-station would be sa...,What hotel in Peermade Hill-station would be s...,0
376821,376821,370628,53426,What is the difference between a USB charging ...,Do all USB-C cables support all USB-C modes?,0
...,...,...,...,...,...,...
61249,61249,106977,106978,Why is my body so hot when I first wake up fro...,Why is it nearly impossible to sleep soon afte...,0
20892,20892,39378,39379,What will happen if postpaid bill is not paid?,What happens if you refuse to pay a prepaid ph...,0
344595,344595,472841,472842,Are there some differences between internation...,Is it true that honey is used in the manufactu...,0
93272,93272,155995,155996,Can anyone briefly explain the differences bet...,Could a Moore machine be converted into a DFA?,0


### Remove Tags

In [104]:
import re
re.findall('<.*?>',df['question1'][71293])

[]

In [105]:
def remove_tag(raw_text):
    """Return ``raw_text`` with every ``<...>`` span deleted.

    The match is non-greedy, so each tag is removed on its own and the text
    between two tags is kept.
    """
    tag_pattern = re.compile('<.*?>')
    return tag_pattern.sub('', raw_text)

### LowerCase

In [106]:
# Flag each character of the sample question as upper-case or not, count the ones
# that are not, and report the remainder (the upper-case characters).
sample_question = df['question1'][71293]
val = [ch.isupper() for ch in sample_question]
counter = val.count(False)
a = len(sample_question) - counter
print('total number of upper text is ', a)

total number of upper text is  8


In [107]:
# Lower-case the text of both question columns.
for question_col in ('question1', 'question2'):
    df[question_col] = df[question_col].apply(lambda text: text.lower())

In [108]:
# Repeat the upper-case count on the sample question after lower-casing: flag
# each character, count the ones that are not upper-case, report the remainder.
sample_question = df['question1'][71293]
val = [ch.isupper() for ch in sample_question]
counter = val.count(False)
a = len(sample_question) - counter
print('total number of upper text is ', a)

total number of upper text is  0


### Stopwords

In [109]:
from nltk.corpus import stopwords

# English stop-word list from NLTK. It stays a list under the name `enquiry`
# because later cells test membership against `enquiry` directly.
enquiry = stopwords.words('english')

# Report every token of the sample question that is a stop word. A set makes each
# membership test a single lookup instead of a scan over the whole list.
stopword_lookup = set(enquiry)
found_any = False
for i in df['question1'][71293].split():
    if i in stopword_lookup:
        found_any = True
        print('Found ------ ',i)
# Say "Not Found" only when nothing matched; the message used to be printed
# unconditionally, even directly after a list of matches.
if not found_any:
    print("Not Found -------")

Found ------  can
Found ------  a
Found ------  in
Found ------  the
Found ------  if
Found ------  he
Found ------  have
Found ------  a
Found ------  from
Found ------  his
Found ------  or
Found ------  can
Found ------  a
Found ------  for
Found ------  in
Found ------  the
Found ------  if
Found ------  he
Found ------  has
Not Found -------


In [110]:
# Drop stop words from both question columns. The stop-word list is converted to a
# set once, so each token costs one lookup rather than a scan of the whole list,
# and filtering and re-joining happen in a single pass per question. The resulting
# text is the same as before.
stopword_lookup = set(enquiry)


def _drop_stopwords(text):
    """Return ``text`` without the whitespace-separated tokens found in ``enquiry``."""
    return ' '.join(token for token in text.split() if token not in stopword_lookup)


df['question1'] = df['question1'].apply(_drop_stopwords)
df['question2'] = df['question2'].apply(_drop_stopwords)

In [111]:
# Re-run the stop-word check on the cleaned sample question. "Not Found" is
# printed only when no token matched, instead of unconditionally.
stopword_lookup = set(enquiry)
found_any = False
for i in df['question1'][71293].split():
    if i in stopword_lookup:
        found_any = True
        print('Found ------ ',i)
if not found_any:
    print("Not Found -------")

Not Found -------


### Remove URL

In [116]:
def remove_url(text):
    """Return ``text`` with ``http://``/``https://`` links and ``www.`` addresses removed.

    A match runs up to the next whitespace character; the whitespace around it
    is left in place.
    """
    return re.sub(r'https?://\S+|www\.\S+', r'', text)

In [118]:
a = 'british citizen work us doesn’t visa employer? british citizen work free us family there? please watch www.forme.com'
remove_url(a)

'british citizen work us doesn’t visa employer? british citizen work free us family there? please watch '

In [120]:
# Strip URLs from both question columns.
for question_col in ('question1', 'question2'):
    df[question_col] = df[question_col].apply(remove_url)

### Chat word treatment

### Spelling treatment

In [122]:
from textblob import TextBlob

In [131]:
# Time TextBlob's spelling correction on one sentence, then scale that single
# measurement by the number of rows as a rough estimate for the whole frame.
incorr = 'british citizen work us doesn’t visa employer? british citizen work free us family there?'
start = timeit.default_timer()
textb = TextBlob(incorr)
corrected = textb.correct().string
print(corrected)
stop = timeit.default_timer()
elapsed = stop - start
print("time-taken :", elapsed)
print("whole time-taken :", elapsed * len(df))

british citizen work us doesn’t isa employer? british citizen work free us family there?
time-taken : 0.001225699999849894
whole time-taken : 1.225699999849894


In [137]:
def correction(text):
    """Return ``text`` after TextBlob's spelling correction, as a plain string."""
    return TextBlob(text).correct().string

In [139]:
# df['question1'].apply(correction)

### Stemming 

In [154]:
df['question1'][372138]

'hotel maibong hill-station would safe unmarried couples, without harassment police, hotel staff, moral police?'

In [164]:
# `nltk` itself is imported here: the other cells only import names from nltk's
# submodules, which does not bind the name `nltk` needed for nltk.word_tokenize.
import nltk
from nltk.stem.porter import PorterStemmer

# Demonstrate Porter stemming on one sample question. The timer is started in
# this cell; previously the timing printed below reused start/stop values left
# over from an earlier cell.
start = timeit.default_timer()
ps =  PorterStemmer()
punct = '?:!.,;'
# Build a filtered list instead of calling remove() on the list being iterated:
# removing during iteration skips the token that follows each removal.
sent_word = [word for word in nltk.word_tokenize(df['question1'][372138]) if word not in punct]
# The second column shows stems, so it is headed 'stem' rather than 'lemma'.
print("{0:20}{1:20}".format('word','stem'))
for word in sent_word:
    print("{0:20}{1:20}".format(word, ps.stem(word) ))
stop = timeit.default_timer()
print('')
print('*-'*50)
print('')
print("time-taken :", (stop-start))
# Rough estimate: one-sentence timing scaled by the number of rows.
print("whole time-taken :",(stop-start)*len(df))
print('')
print('*-'*50)

word                lemma               
hotel               hotel               
maibong             maibong             
hill-station        hill-stat           
would               would               
safe                safe                
unmarried           unmarri             
couples             coupl               
without             without             
harassment          harass              
police              polic               
hotel               hotel               
staff               staff               
moral               moral               
police              polic               

*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-

time-taken : 0.0016998000000967295
whole time-taken : 1.6998000000967295

*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-


### Lemmatization

In [165]:
# `nltk` itself is imported here: the other cells only import names from nltk's
# submodules, which does not bind the name `nltk` needed for nltk.word_tokenize.
import nltk
from nltk.stem import WordNetLemmatizer

# Demonstrate WordNet lemmatization (tokens treated as verbs) on one sample
# question and time it. `wntlem` stays a module-level name because lemme_word
# uses it.
start = timeit.default_timer()
wntlem =  WordNetLemmatizer()
punct = '?:!.,;'
# Build a filtered list instead of calling remove() on the list being iterated:
# removing during iteration skips the token that follows each removal.
sent_word = [word for word in nltk.word_tokenize(df['question1'][372138]) if word not in punct]
print("{0:20}{1:20}".format('word','lemma'))
for word in sent_word:
    print("{0:20}{1:20}".format(word, wntlem.lemmatize(word,pos='v')))
stop = timeit.default_timer()
print('')
print('*-'*50)
print('')
print("time-taken :", (stop-start))
# Rough estimate: one-sentence timing scaled by the number of rows.
print("whole time-taken :",(stop-start)*len(df))
print('')
print('*-'*50)
    

word                lemma               
hotel               hotel               
maibong             maibong             
hill-station        hill-station        
would               would               
safe                safe                
unmarried           unmarried           
couples             couple              
without             without             
harassment          harassment          
police              police              
hotel               hotel               
staff               staff               
moral               moral               
police              police              

*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-

time-taken : 0.001430399999662768
whole time-taken : 1.430399999662768

*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-


In [166]:
def lemme_word(text):
    """Lemmatize each whitespace-separated token of ``text`` as a verb and re-join with spaces."""
    lemmas = []
    for token in text.split():
        lemmas.append(wntlem.lemmatize(token, pos='v'))
    return ' '.join(lemmas)

In [169]:
# Lemmatize the text of both question columns.
for question_col in ('question1', 'question2'):
    df[question_col] = df[question_col].apply(lemme_word)

## ML Model

In [170]:
question_df_pre = df[['question1','question2']]

### Bag of words

In [171]:
# Stack the cleaned question1 texts followed by the cleaned question2 texts so one
# shared vocabulary (capped at 3000 terms) is learned, then cut the dense count
# matrix back into its question1 half and question2 half.
questions_pre = [*question_df_pre['question1'], *question_df_pre['question2']]
cv = CountVectorizer(max_features=3000)
bow_counts_pre = cv.fit_transform(questions_pre).toarray()
q1_arr_pre, q2_arr_pre = np.vsplit(bow_counts_pre, 2)

In [172]:
# Wrap each half of the count matrix in a DataFrame that reuses the row labels of
# question_df_pre, then put the question1 and question2 columns side by side.
temp_df1_pre, temp_df2_pre = (
    pd.DataFrame(half, index=question_df_pre.index) for half in (q1_arr_pre, q2_arr_pre)
)
temp_df_pre = pd.concat([temp_df1_pre, temp_df2_pre], axis=1)
temp_df_pre.shape

(1000, 6000)

In [173]:
temp_df_pre['is_duplicate'] = df['is_duplicate']

In [174]:
# {k: v for k, v in sorted(cv.vocabulary_.items(),reverse=True, key=lambda item: item[1])}

In [175]:
from sklearn.model_selection import train_test_split

# Every column except the last is a feature; the last column is the is_duplicate
# label appended to temp_df_pre. 20% of the rows are held out for testing.
features = temp_df_pre.iloc[:, :-1].values
labels = temp_df_pre.iloc[:, -1].values
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=1
)

In [180]:
import time

# Fit each estimator on the cleaned-text training split, predict the held-out
# split, and record the accuracy together with the wall-clock time of fit + predict.
fe_score, fe_time_taken = [], []
for clf in ml_algo:
    start = timeit.default_timer()
    y_pred = clf.fit(X_train, y_train).predict(X_test)
    stop = timeit.default_timer()
    fe_time_taken.append(stop - start)
    fe_score.append(accuracy_score(y_test, y_pred))

In [181]:
fe_score_chart = pd.DataFrame({'ml_name':ml_name, 'score':[x*100 for x in fe_score], 'time_taken':fe_time_taken})

In [182]:
import plotly.express as px
import plotly.io as pio

# Use plotly's 'iframe' renderer for fig.show().
pio.renderers.default = 'iframe'

# One bar per model: bar height is the accuracy (%), bar colour is the measured
# time. The labels mapping renames the columns fe_score_chart really has; the
# previous mapping was keyed on 'pop', a column this frame does not contain, so it
# had no effect on the figure.
fig = px.bar(
    fe_score_chart, x="ml_name", y="score",
    hover_data=["score", "time_taken"], color="time_taken",
    labels={'ml_name': 'model', 'score': 'accuracy (%)', 'time_taken': 'time taken (s)'},
    title='Cleaned text, bag of words: accuracy by model',
    height=400, text_auto=True,
)
fig.show()

### N-grams

In [183]:
# Count bigrams only (ngram_range=(2,2)) with the vocabulary capped at 3000
# entries, then cut the dense matrix into its question1 and question2 halves.
cv_ngrams = CountVectorizer(ngram_range=(2,2),max_features=3000)
bigram_counts = cv_ngrams.fit_transform(questions_pre).toarray()
q1_arr_pre_ngrams, q2_arr_pre_ngrams = np.vsplit(bigram_counts, 2)

In [184]:
# Wrap each half of the bigram matrix in a DataFrame that reuses the row labels of
# question_df_pre, then put the question1 and question2 columns side by side.
temp_df1_pre_ngrams, temp_df2_pre_ngrams = (
    pd.DataFrame(half, index=question_df_pre.index)
    for half in (q1_arr_pre_ngrams, q2_arr_pre_ngrams)
)
temp_df_pre_ngrams = pd.concat([temp_df1_pre_ngrams, temp_df2_pre_ngrams], axis=1)
temp_df_pre_ngrams.shape

(1000, 6000)

In [185]:
temp_df_pre_ngrams['is_duplicate'] = df['is_duplicate']

In [186]:
from sklearn.model_selection import train_test_split

# Every column except the last is a feature; the last column is the is_duplicate
# label appended to temp_df_pre_ngrams. 20% of the rows are held out for testing.
features = temp_df_pre_ngrams.iloc[:, :-1].values
labels = temp_df_pre_ngrams.iloc[:, -1].values
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=1
)

In [187]:
# Fit each estimator on the bigram training split, predict the held-out split,
# and record the accuracy together with the wall-clock time of fit + predict.
ngram_score = []
ngram_time_taken = []
for i in ml_algo:
    # Restart the timer for every model. Without this, the elapsed time was
    # measured from a `start` left over from an earlier cell, so every entry
    # included the time of all preceding work.
    start = timeit.default_timer()
    i.fit(X_train,y_train)
    y_pred = i.predict(X_test)
    stop = timeit.default_timer()
    ngram_score.append(accuracy_score(y_test,y_pred))
    ngram_time_taken.append(stop - start)

In [188]:
ngram_score_chart = pd.DataFrame({'ml_name':ml_name, 'score':[x*100 for x in ngram_score], 'time_taken':ngram_time_taken})

In [189]:
# Use plotly's 'iframe' renderer for fig.show().
pio.renderers.default = 'iframe'

# One bar per model: bar height is the accuracy (%), bar colour is the recorded
# time. The labels mapping renames the columns ngram_score_chart really has; the
# previous mapping was keyed on 'pop', a column this frame does not contain, so it
# had no effect on the figure.
fig = px.bar(
    ngram_score_chart, x="ml_name", y="score",
    hover_data=["score", "time_taken"], color="time_taken",
    labels={'ml_name': 'model', 'score': 'accuracy (%)', 'time_taken': 'time taken (s)'},
    title='Cleaned text, bigram counts: accuracy by model',
    height=400, text_auto=True,
)
fig.show()

In [190]:
# {k: v for k, v in sorted(cv.vocabulary_.items(),reverse=True, key=lambda item: item[1])}

### TF-IDF Vectorizers

In [191]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()

In [192]:
q1_arr_pre_tfidf, q2_arr_pre_tfidf = np.vsplit(tfidf.fit_transform(questions_pre).toarray(),2)

In [193]:
# Wrap each half of the TF-IDF matrix in a DataFrame that reuses the row labels of
# question_df_pre, then put the question1 and question2 columns side by side.
temp_df1_pre_tfidf, temp_df2_pre_tfidf = (
    pd.DataFrame(half, index=question_df_pre.index)
    for half in (q1_arr_pre_tfidf, q2_arr_pre_tfidf)
)
temp_df_pre_tfidf = pd.concat([temp_df1_pre_tfidf, temp_df2_pre_tfidf], axis=1)
temp_df_pre_tfidf.shape

(1000, 7138)

In [194]:
temp_df_pre_tfidf['is_duplicate'] = df['is_duplicate']

In [195]:
from sklearn.model_selection import train_test_split

# Every column except the last is a feature; the last column is the is_duplicate
# label appended to temp_df_pre_tfidf. 20% of the rows are held out for testing.
features = temp_df_pre_tfidf.iloc[:, :-1].values
labels = temp_df_pre_tfidf.iloc[:, -1].values
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=1
)

In [196]:
# Fit each estimator on the TF-IDF training split, predict the held-out split,
# and record the accuracy together with the wall-clock time of fit + predict.
tfidf_score = []
tfidf_time_taken = []
for i in ml_algo:
    # Restart the timer for every model. Without this, the elapsed time was
    # measured from a `start` left over from an earlier cell, so every entry
    # included the time of all preceding work.
    start = timeit.default_timer()
    i.fit(X_train,y_train)
    y_pred = i.predict(X_test)
    stop = timeit.default_timer()
    tfidf_score.append(accuracy_score(y_test,y_pred))
    tfidf_time_taken.append(stop - start)

In [197]:
tfidf_score_chart = pd.DataFrame({'ml_name':ml_name, 'score':[x*100 for x in tfidf_score], 'time_taken':tfidf_time_taken})

In [198]:
# One bar per model: bar height is the accuracy (%), bar colour is the recorded
# time. The labels mapping renames the columns tfidf_score_chart really has; the
# previous mapping was keyed on 'pop', a column this frame does not contain, so it
# had no effect on the figure.
fig = px.bar(
    tfidf_score_chart, x="ml_name", y="score",
    hover_data=["score", "time_taken"], color="time_taken",
    labels={'ml_name': 'model', 'score': 'accuracy (%)', 'time_taken': 'time taken (s)'},
    title='Cleaned text, TF-IDF: accuracy by model',
    height=400, text_auto=True,
)
fig.show()

### Word2vec

In [199]:
import gensim
from nltk import sent_tokenize
from gensim.utils import simple_preprocess
def process_term(corpus):
    """Split every text in ``corpus`` into sentences and tokenize each sentence.

    Returns a flat list holding one ``simple_preprocess`` token list per
    sentence, in the order the sentences appear.
    """
    return [
        simple_preprocess(sentence)
        for paragraph in corpus
        for sentence in sent_tokenize(paragraph)
    ]

In [200]:
question_df_pre_w2c = df[['question1','question2']]

In [201]:
x = process_term(question_df_pre_w2c['question1'])

In [202]:
model = gensim.models.Word2Vec(
    window=10,
    min_count=2
)

In [203]:
model.build_vocab(x)

In [204]:
model.train(x, total_examples=model.corpus_count, epochs=model.epochs)

(17388, 28165)

In [211]:
model.wv.most_similar('uk')

[('mail', 0.32273921370506287),
 ('man', 0.31433743238449097),
 ('existence', 0.25792840123176575),
 ('non', 0.24913446605205536),
 ('post', 0.2418147325515747),
 ('throw', 0.23306265473365784),
 ('gas', 0.22379234433174133),
 ('recruit', 0.22159899771213531),
 ('data', 0.21756024658679962),
 ('amount', 0.21368533372879028)]

In [212]:
model.wv.similarity('modi','india')

0.024443489

In [213]:
from sklearn.decomposition import PCA

In [214]:
pca = PCA(n_components=3)

In [215]:
X = pca.fit_transform(model.wv.get_normed_vectors())

In [219]:
fig = px.scatter_3d(X[200:300],x=0,y=1,z=2)
fig.show()