Data and task description: Kaggle "Quora Question Pairs" competition — https://www.kaggle.com/c/quora-question-pairs

# Import Libraries

In [1]:
import numpy as np
import pandas as pd

In [2]:
import pickle

In [3]:
# Show full question text in DataFrame output. `None` means "no truncation";
# the old `-1` sentinel is deprecated and rejected by current pandas releases.
pd.options.display.max_colwidth = None

# Import Data

In [4]:
train = pd.read_csv('../[Data]-Quora-Question-Pairs/train.csv')

In [5]:
train.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in share market in india?,What is the step by step guide to invest in share market?,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Diamond?,What would happen if the Indian government stole the Kohinoor (Koh-i-Noor) diamond back?,0
2,2,5,6,How can I increase the speed of my internet connection while using a VPN?,How can Internet speed be increased by hacking through DNS?,0
3,3,7,8,Why am I mentally very lonely? How can I solve it?,"Find the remainder when [math]23^{24}[/math] is divided by 24,23?",0
4,4,9,10,"Which one dissolve in water quikly sugar, salt, methane and carbon di oxide?",Which fish would survive in salt water?,0


In [6]:
train.shape

(404290, 6)

In [7]:
train_X = train[['question1', 'question2']]

In [8]:
train_y = train['is_duplicate']

In [9]:
# Count of positive labels and their share of all training pairs.
duplicate_count = train_y.sum()
duplicate_pct = duplicate_count*100/len(train_y)
print('There are {} ({:.2f}%) pair of same questions'.format(duplicate_count, duplicate_pct))

There are 149263 (36.92%) pair of same questions


In [10]:
test = pd.read_csv('../[Data]-Quora-Question-Pairs/test.csv')

In [11]:
test.head()

Unnamed: 0,test_id,question1,question2
0,0,How does the Surface Pro himself 4 compare with iPad Pro?,Why did Microsoft choose core m3 and not core i3 home Surface Pro 4?
1,1,Should I have a hair transplant at age 24? How much would it cost?,How much cost does hair transplant require?
2,2,What but is the best way to send money from China to the US?,What you send money to China?
3,3,Which food not emulsifiers?,What foods fibre?
4,4,"How ""aberystwyth"" start reading?",How their can I start reading?


In [12]:
test.shape

(2345796, 3)

In [13]:
test_X = test[['question1', 'question2']]

In [14]:
# Stack train rows first, then test rows. `ignore_index=True` gives the combined
# frame a fresh 0..N-1 index instead of repeating the labels of both inputs,
# so label-based operations on X_All cannot match two rows at once.
X_All = pd.concat([train_X, test_X], axis=0, ignore_index=True)

In [15]:
X_All.shape

(2750086, 2)

# Preprocessing

In [16]:
import nltk, re
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

def remove_punc(x):
    """Return ``x`` with every non-word character replaced by a single space.

    Each matching character is replaced individually, so runs of punctuation
    yield runs of spaces (see ``remove_extra_spaces``).
    """
    return re.sub(r"\W", ' ', x)


def remove_extra_spaces(x):
    """Return ``x`` with each run of whitespace collapsed to one space.

    Leading/trailing whitespace is collapsed to a single space, not stripped.
    """
    return re.sub(r"\s+", ' ', x)


def lower_case(x):
    """Return ``x`` converted to lower case."""
    return x.lower()

# English stop-word list from NLTK, held as a set for fast membership tests.
# NOTE(review): presumably needs the NLTK 'stopwords' corpus downloaded — confirm.
stop_words = set(nltk.corpus.stopwords.words('english'))


def remove_stopwords(x):
    """Return ``x`` without the whitespace-separated tokens found in ``stop_words``.

    Matching is exact, so callers should lower-case the text first.
    """
    return ' '.join(word for word in x.split() if word not in stop_words)


ps = PorterStemmer()


def ps_stem(x):
    """Return ``x`` with every whitespace-separated token Porter-stemmed."""
    return ' '.join(ps.stem(word) for word in x.split())


wnl = WordNetLemmatizer()


def wnl_lemmatize(x):
    """Return ``x`` with every whitespace-separated token lemmatized by WordNet."""
    return ' '.join(wnl.lemmatize(word) for word in x.split())

def tag_pos(x):
    """Tokenize ``x``, POS-tag it and render the result as ``token(TAG) `` chunks.

    Every chunk, including the last one, ends with a space; an input that
    yields no tokens produces the empty string.
    """
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(x))
    chunks = [token + '(' + tag + ')' + ' ' for token, tag in tagged_tokens]
    return "".join(chunks)

def cleanText(x, rsw, stm, lem, tgps):
    """Normalize one question string.

    Punctuation removal, whitespace collapsing and lower-casing always run.
    The remaining stages are switched on by truthy flags and applied in this
    fixed order:

    rsw  -- remove stop words
    stm  -- Porter-stem each token
    lem  -- WordNet-lemmatize each token
    tgps -- append a POS tag to each token
    """
    text = lower_case(remove_extra_spaces(remove_punc(x)))

    optional_stages = (
        (rsw, remove_stopwords),
        (stm, ps_stem),
        (lem, wnl_lemmatize),
        (tgps, tag_pos),
    )
    for enabled, stage in optional_stages:
        if enabled:
            text = stage(text)
    return text

In [17]:
X_All.head()

Unnamed: 0,question1,question2
0,What is the step by step guide to invest in share market in india?,What is the step by step guide to invest in share market?
1,What is the story of Kohinoor (Koh-i-Noor) Diamond?,What would happen if the Indian government stole the Kohinoor (Koh-i-Noor) diamond back?
2,How can I increase the speed of my internet connection while using a VPN?,How can Internet speed be increased by hacking through DNS?
3,Why am I mentally very lonely? How can I solve it?,"Find the remainder when [math]23^{24}[/math] is divided by 24,23?"
4,"Which one dissolve in water quikly sugar, salt, methane and carbon di oxide?",Which fish would survive in salt water?


In [18]:
# Missing questions are NaN floats. Plain str() would turn them into the literal
# word 'nan', which then survives cleaning and becomes a TF-IDF feature, so
# replace them with an empty string before coercing everything to str.
X_All['question1'] = X_All['question1'].fillna('').astype(str)
X_All['question2'] = X_All['question2'].fillna('').astype(str)
X_All.head()

Unnamed: 0,question1,question2
0,What is the step by step guide to invest in share market in india?,What is the step by step guide to invest in share market?
1,What is the story of Kohinoor (Koh-i-Noor) Diamond?,What would happen if the Indian government stole the Kohinoor (Koh-i-Noor) diamond back?
2,How can I increase the speed of my internet connection while using a VPN?,How can Internet speed be increased by hacking through DNS?
3,Why am I mentally very lonely? How can I solve it?,"Find the remainder when [math]23^{24}[/math] is divided by 24,23?"
4,"Which one dissolve in water quikly sugar, salt, methane and carbon di oxide?",Which fish would survive in salt water?


In [19]:
# Clean both question columns with stop-word removal + Porter stemming
# (lemmatization and POS tagging disabled), then cache the result on disk.
for question_col in ('question1', 'question2'):
    X_All[question_col] = X_All[question_col].apply(
        cleanText, args=(True, True, False, False)
    )
X_All.to_csv('../[Data]-Quora-Question-Pairs/x_all.csv', index=False)
# Kept as the last expression so the preview is actually rendered.
X_All.head()

In [20]:
# Reload the cached, cleaned questions as plain strings.
# keep_default_na=False: questions that cleaned down to '' (or to text such as
# 'na' / 'null') must stay as-is; default NA parsing would turn them into NaN,
# and a later str() would make them the bogus token 'nan'.
# dtype=str: all-digit questions must not be parsed as numbers.
X_All = pd.read_csv(
    '../[Data]-Quora-Question-Pairs/x_all.csv',
    dtype=str,
    keep_default_na=False,
)

In [21]:
X_All.head()

Unnamed: 0,question1,question2
0,step step guid invest share market india,step step guid invest share market
1,stori kohinoor koh noor diamond,would happen indian govern stole kohinoor koh noor diamond back
2,increas speed internet connect use vpn,internet speed increas hack dn
3,mental lone solv,find remaind math 23 24 math divid 24 23
4,one dissolv water quikli sugar salt methan carbon di oxid,fish would surviv salt water


In [22]:
# X_All holds the training rows first, then the test rows; split at the number
# of training labels instead of a hard-coded row count.
n_train = len(train_y)
X_clean_train = X_All.iloc[:n_train]
X_clean_test = X_All.iloc[n_train:]

In [23]:
X_clean_test.head()

Unnamed: 0,question1,question2
404290,surfac pro 4 compar ipad pro,microsoft choos core m3 core i3 home surfac pro 4
404291,hair transplant age 24 much would cost,much cost hair transplant requir
404292,best way send money china us,send money china
404293,food emulsifi,food fibr
404294,aberystwyth start read,start read


In [24]:
X_clean_train.head()

Unnamed: 0,question1,question2
0,step step guid invest share market india,step step guid invest share market
1,stori kohinoor koh noor diamond,would happen indian govern stole kohinoor koh noor diamond back
2,increas speed internet connect use vpn,internet speed increas hack dn
3,mental lone solv,find remaind math 23 24 math divid 24 23
4,one dissolv water quikli sugar salt methan carbon di oxid,fish would surviv salt water


#### Vectorizing 

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [26]:
# One TF-IDF vocabulary (uni- to tri-grams) per question column, fitted on the
# combined train+test text. Each fitted vectorizer is pickled for reuse; the
# files are opened via `with` so the handles are flushed and closed.
vectorizer1 = TfidfVectorizer(ngram_range=(1,3))
vectorizer1.fit(X_All['question1'])
with open('q1vect.pickle', 'wb') as q1_file:
    pickle.dump(vectorizer1, q1_file)

vectorizer2 = TfidfVectorizer(ngram_range=(1,3))
vectorizer2.fit(X_All['question2'])
with open('q2vect.pickle', 'wb') as q2_file:
    pickle.dump(vectorizer2, q2_file)

In [27]:
# Encode each question column with the vectorizer that was fitted on that
# column: vectorizer1 for question1, vectorizer2 for question2.
x_q1_train = vectorizer1.transform(X_clean_train['question1'])
x_q2_train = vectorizer2.transform(X_clean_train['question2'])

x_q1_test = vectorizer1.transform(X_clean_test['question1'])
x_q2_test = vectorizer2.transform(X_clean_test['question2'])

In [28]:
# Report (rows, features) for each encoded matrix, train first, then test.
for feature_matrix in (x_q1_train, x_q2_train, x_q1_test, x_q2_test):
    print(feature_matrix.shape)

(404290, 6980053)
(404290, 7049388)
(2345796, 6980053)
(2345796, 7049388)


In [29]:
from scipy.sparse import hstack

In [30]:
# Put the question1 and question2 feature blocks side by side. hstack returns
# COO by default, which cannot be row-indexed; ask for CSR so the matrices can
# be sliced by row for cross-validation folds.
X_train_vect = hstack((x_q1_train, x_q2_train), format='csr')
X_test_vect = hstack((x_q1_test, x_q2_test), format='csr')

In [31]:
print(X_train_vect.shape)
print(X_test_vect.shape)

(404290, 14029441)
(2345796, 14029441)


# Machine Learning

In [32]:
from sklearn.model_selection import cross_val_score

### LogisticRegression

In [33]:
from sklearn.linear_model import LogisticRegression

In [34]:
# 10-fold cross-validation of a default-configured logistic regression;
# prints the mean and standard deviation of the per-fold scores.
model = LogisticRegression()
scores = cross_val_score(estimator=model, X=X_train_vect, y=train_y, cv=10)
fold_mean, fold_std = scores.mean(), scores.std()
print('Accuracy: %0.2f (+/- %0.2f)' % (fold_mean, fold_std))

Accuracy: 0.81 (+/- 0.00)


### Multinomial Naive Bayes

In [35]:
from sklearn.naive_bayes import MultinomialNB

In [36]:
# 10-fold cross-validation of a default-configured multinomial naive Bayes;
# prints the mean and standard deviation of the per-fold scores.
model = MultinomialNB()
scores = cross_val_score(estimator=model, X=X_train_vect, y=train_y, cv=10)
fold_mean, fold_std = scores.mean(), scores.std()
print('Accuracy: %0.2f (+/- %0.2f)' % (fold_mean, fold_std))

Accuracy: 0.79 (+/- 0.00)


### SGDClassifier

In [37]:
from sklearn.linear_model import SGDClassifier

In [38]:
# 10-fold cross-validation of an SGD-trained linear classifier. SGD shuffles
# the training data, so the seed is pinned to make the score reproducible
# across kernel restarts.
model = SGDClassifier(random_state=42)
scores = cross_val_score(model, X_train_vect, train_y, cv=10)
print('Accuracy: %0.2f (+/- %0.2f)' %(scores.mean(), scores.std()))



Accuracy: 0.73 (+/- 0.00)


In [39]:
# Fit the final model on the full training set.
model = LogisticRegression()
model.fit(X_train_vect, train_y)
# The competition is scored with log loss on predicted probabilities, so use
# P(is_duplicate = 1) instead of hard 0/1 labels. Column 1 of predict_proba
# corresponds to class 1 because `classes_` is sorted ([0, 1]).
y_pred = model.predict_proba(X_test_vect)[:, 1]

In [40]:
submission = pd.read_csv('../[Data]-Quora-Question-Pairs/sample_submission.csv')

In [41]:
# Fill the sample submission with our predictions and write it out.
# index=False keeps the DataFrame index from being written as an extra,
# unnamed first column that is not part of the submission format.
submission['is_duplicate'] = y_pred
submission.to_csv('T13LR.csv', index=False)