### imports

In [1]:
import numpy as np
import pandas as pd
import itertools
from fuzzywuzzy import fuzz
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

### load, clean and explore data

In [5]:
train =  pd.read_csv('../data-quora-question-pairs/train.csv')
#test =  pd.read_csv('../data-quora-question-pairs/train.csv')

In [6]:
train.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [7]:
## We don't need NaNs
train['question1'].fillna('', inplace=True)
train['question2'].fillna('', inplace=True)
train.drop(columns=['id', 'qid1', 'qid2'], inplace=True)

In [8]:
counts_vectorizer = CountVectorizer(max_features=10000).fit(itertools.chain(train['question1'], train['question2']))

In [9]:
len(counts_vectorizer.vocabulary_)

10000

In [10]:
counts_vectorizer.vocabulary_

{'what': 9793,
 'is': 4809,
 'the': 8979,
 'step': 8547,
 'by': 1465,
 'guide': 4035,
 'to': 9089,
 'invest': 4756,
 'in': 4528,
 'share': 8088,
 'market': 5531,
 'india': 4560,
 'story': 8577,
 'of': 6243,
 'diamond': 2667,
 'how': 4346,
 'can': 1517,
 'increase': 4549,
 'speed': 8423,
 'my': 5971,
 'internet': 4713,
 'connection': 2084,
 'while': 9808,
 'using': 9469,
 'vpn': 9667,
 'why': 9822,
 'am': 541,
 'mentally': 5672,
 'very': 9566,
 'lonely': 5356,
 'solve': 8349,
 'it': 4836,
 'which': 9807,
 'one': 6284,
 'dissolve': 2785,
 'water': 9728,
 'sugar': 8673,
 'salt': 7816,
 'and': 585,
 'carbon': 1558,
 'oxide': 6422,
 'astrology': 820,
 'capricorn': 1549,
 'sun': 8692,
 'cap': 1536,
 'moon': 5876,
 'rising': 7661,
 'does': 2832,
 'that': 8977,
 'say': 7862,
 'about': 253,
 'me': 5607,
 'should': 8148,
 'buy': 1460,
 'be': 1049,
 'good': 3940,
 'when': 9801,
 'do': 2820,
 'you': 9971,
 'use': 9461,
 'instead': 4663,
 'motorola': 5908,
 'company': 1989,
 'hack': 4064,
 'charter

### explore NLP features

In [11]:
fuzz.WRatio(train.question1[1], train.question2[1])

86

In [12]:
train.question1[1], train.question2[1]

('What is the story of Kohinoor (Koh-i-Noor) Diamond?',
 'What would happen if the Indian government stole the Kohinoor (Koh-i-Noor) diamond back?')

### create NLP features

In [13]:
def fuzzyAnalysis(row):
    tokenSortRatio = fuzz.token_sort_ratio(row['question1'], row['question2'])
    tokenSetRatio = fuzz.token_set_ratio(row['question1'], row['question2'])
    wratio = fuzz.WRatio(row['question1'], row['question2'])
    return wratio, tokenSetRatio, tokenSortRatio
def tokenSortRatio(row):
    return fuzz.token_sort_ratio(row['question1'], row['question2'])
def tokenSetRatio(row):
    return fuzz.token_set_ratio(row['question1'], row['question2'])
def wratio(row):
    return fuzz.WRatio(row['question1'], row['question2'])

In [14]:
train['tokenSortRatio'] = train.apply(tokenSortRatio, axis=1)

In [15]:
train['tokenSetRatio'] = train.apply(tokenSetRatio, axis=1)

In [16]:
train['wratio'] = train.apply(wratio, axis=1)

In [17]:
train.head(10)

Unnamed: 0,question1,question2,is_duplicate,tokenSortRatio,tokenSetRatio,wratio
0,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,93,100,95
1,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,63,86,86
2,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,66,66,63
3,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0,36,36,35
4,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0,47,67,86
5,Astrology: I am a Capricorn Sun Cap moon and c...,"I'm a triple Capricorn (Sun, Moon and ascendan...",1,74,79,75
6,Should I buy tiago?,What keeps childern active and far from phone ...,0,23,24,35
7,How can I be a good geologist?,What should I do to be a great geologist?,1,64,71,67
8,When do you use シ instead of し?,"When do you use ""&"" instead of ""and""?",0,87,93,88
9,Motorola (company): Can I hack my Charter Moto...,How do I hack Motorola DCX3400 for free internet?,0,50,65,62


In [18]:
train.tokenSetRatio.isna().value_counts()

False    404290
Name: tokenSetRatio, dtype: int64

In [111]:
train.to_csv('quora_train.csv', index=False)