# Kaggle - Quora Question Pairs

https://www.kaggle.com/c/quora-question-pairs/data

In [45]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# General libraries.
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.grid_search import GridSearchCV

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

# SK-learn library for train test split
from sklearn.model_selection import train_test_split

# SK-learn libraries for feature extraction from text.
from sklearn.feature_extraction.text import *

In [38]:
# Import train and test datasets
train_all = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')
print train.shape
print test.shape

(404290, 6)
(2345796, 3)


In [39]:
train_all.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [40]:
train_all.loc[train['is_duplicate']==1].head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
5,5,11,12,Astrology: I am a Capricorn Sun Cap moon and c...,"I'm a triple Capricorn (Sun, Moon and ascendan...",1
7,7,15,16,How can I be a good geologist?,What should I do to be a great geologist?,1
11,11,23,24,How do I read and find my YouTube comments?,How can I see all my Youtube comments?,1
12,12,25,26,What can make Physics easy to learn?,How can you make physics easy to learn?,1
13,13,27,28,What was your first sexual experience like?,What was your first sexual experience?,1


In [42]:
# Percent of train that is duplicate
print train_all.loc[train_all['is_duplicate']==1].shape[0]
print train_all.shape[0]
149263.0 / 404290.0 * 100

149263
404290


36.9197853026293

In [43]:
test.head()

Unnamed: 0,test_id,question1,question2
0,0,How does the Surface Pro himself 4 compare wit...,Why did Microsoft choose core m3 and not core ...
1,1,Should I have a hair transplant at age 24? How...,How much cost does hair transplant require?
2,2,What but is the best way to send money from Ch...,What you send money to China?
3,3,Which food not emulsifiers?,What foods fibre?
4,4,"How ""aberystwyth"" start reading?",How their can I start reading?


In [90]:
X = train_all.ix[:,:5]
y = train_all.ix[:,5:]
print X.shape
print y.shape

(404290, 5)
(404290, 1)


In [100]:
# Split train into true train and dev set
X_train, X_dev, y_train, y_dev = train_test_split(X, y, test_size=0.1)
print X_train.shape
print X_dev.shape
print y_train.shape
print y_dev.shape

(363861, 5)
(40429, 5)
(363861, 1)
(40429, 1)


In [141]:
X_train['question1'].head()

214595    Is it possible to visit all countries in the w...
284404                   What is Pakistan occupied Kashmir?
401977    How does short-term disability insurance work ...
328067    If I was bitten by a rat and a deep but small ...
118112                               How can I kill myself?
Name: question1, dtype: object

# First Try using simple similarity function

In [143]:
from difflib import SequenceMatcher

def similar(a, b):
    return SequenceMatcher(None, a, b).ratio()

Test similar function between examples to see what scores they get. I don't think this is going to work

In [158]:
print X_train.iloc[2][3]
print X_train.iloc[2][4]
print y_train.iloc[2][0]

How does short-term disability insurance work in Florida?
How does short-term disability insurance work in Texas?
0


In [159]:
similar(X_train.iloc[2][3],X_train.iloc[2][4])

0.9107142857142857

In [160]:
print X_train.iloc[5][3]
print X_train.iloc[5][4]
print y_train.iloc[5][0]

How could we measure pole strength of a magnet?
How can we estimate magnetic strength for the poles of a bar magnet?
1


In [161]:
similar(X_train.iloc[5][3],X_train.iloc[5][4])

0.6260869565217392

We can see above that the first set of observations have a 91% similarity while they aren't duplicates, and the second has 62% similarity but is different

In [187]:
train_similartest = train.copy()
train_similartest.head()

#train_similartest.apply(lambda question1, question2: similar(question1 ,question2), axis=1)

train_similartest.apply(lambda row: similar(row['question1'], row['question2']), axis=1)

TypeError: ("'float' object is not iterable", u'occurred at index 201841')

# Other stuff

In [140]:
train_vector = CountVectorizer()
x = train_vector.fit_transform(X_train['question1'].values.astype('U'))

In [133]:
# Train a logistic regression model using a "l1" penalty. 
train_vector = CountVectorizer()
X_train_matrix = train_vector.fit_transform(X_train['question1'].values.astype('U'))

In [106]:
# Setup TFIDF vector and transformed data
tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(X_train)
X_dev_tfidf = tfidf.transform(X_dev)

print X_tfidf.shape
print X_tfidf
print y_train.shape
print X_dev_tfidf

# Create logistic Regression model with C=100
lr = LogisticRegression() 
lr.fit(X_tfidf, y_train)
tfidf_preds = lr.predict(X_dev_tfidf)

# Find and print score
f1 = metrics.f1_score(y_true = y_dev, y_pred = tfidf_preds, average='weighted')
print "The f1 score using TfidfVectorizer: " + str(f1)


(5, 5)
  (0, 0)	1.0
  (1, 1)	1.0
  (2, 2)	1.0
  (3, 3)	1.0
  (4, 4)	1.0
(363861, 1)
  (0, 0)	1.0
  (1, 1)	1.0
  (2, 2)	1.0
  (3, 3)	1.0
  (4, 4)	1.0


ValueError: Found input variables with inconsistent numbers of samples: [5, 363861]