https://www.kaggle.com/c/quora-question-pairs

# Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
pd.options.display.max_colwidth = -1
pd.options.display.max_columns = 15

import pickle

In [3]:
from tqdm.auto import tqdm
from IPython.display import clear_output
tqdm.pandas()

  from pandas import Panel


# Text Cleaning

In [4]:
import nltk, re
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

def spell_correct(text):
    text = re.sub(r"can\'t", "can not", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"\0s", "0", text)    
    return text

def remove_url(text):
    URL_REGEX = re.compile(r'''((http[s]?://)[^ <>'"{}|\\^`[\]]*)''')
    return URL_REGEX.sub(r' ', text)

def remove_handles(text):
    HANDLES_REGEX = re.compile(r'@\S+')
    return HANDLES_REGEX.sub(r' ', text)

def remove_incomplete_last_word(text):
    INCOMPLETE_LAST_WORD_REGEX = re.compile(r'\S+…')
    return INCOMPLETE_LAST_WORD_REGEX.sub(r' ', text )
    
def remove_hashtags(text):
    HASHTAGS_REGEX = re.compile(r'#\S+')
    return HASHTAGS_REGEX.sub(r' ', text)

remove_punc = lambda x : re.sub(r"\W", ' ', x)

remove_num = lambda x : re.sub(r"\d", ' ', x)

remove_extra_spaces = lambda x : re.sub(r"\s+", ' ', x)

lower_case = lambda x : x.lower()

remove_shortwords = lambda x: ' '.join(word for word in x.split() if len(word) > 2)

with open('stopwords.txt') as f:
    sw = map(lambda x : x.strip(), f.readlines())
stop_words = set(nltk.corpus.stopwords.words('english'))|set(sw)
remove_stopwords = lambda x: ' '.join(word for word in x.split() if word not in stop_words)

ps = PorterStemmer()
ps_stem = lambda x: ' '.join(ps.stem(word) for word in x.split())

wnl = WordNetLemmatizer()
wnl_lemmatize = lambda x: ' '.join(wnl.lemmatize(word) for word in x.split())

def tag_pos(x):
    tag_list =  nltk.pos_tag(nltk.word_tokenize(x))
    pos = ""
    for t in tag_list:
        pos += t[0] +'(' + t[1] +')' + ' '
    return pos

def cleanText(x, rsw, stm, lem, tgps):
    x = str(x)
    x = remove_url(x)
    x = remove_handles(x)
    x = remove_incomplete_last_word(x)
    x = remove_hashtags(x)
    x = lower_case(x)
    x = spell_correct(x)
    x = remove_punc(x)
    x = remove_num(x)
    x = remove_extra_spaces(x)
    x = remove_shortwords(x)
    
    if rsw:
        x = remove_stopwords(x)
    if stm:
        x = ps_stem(x)
    if lem:
        x = wnl_lemmatize(x)
    if tgps:
        x = tag_pos(x)
    return x

# Data

In [5]:
train = pd.read_csv('../[Data]-Quora-Question-Pairs/train.csv')

In [6]:
train.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in share market in india?,What is the step by step guide to invest in share market?,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Diamond?,What would happen if the Indian government stole the Kohinoor (Koh-i-Noor) diamond back?,0
2,2,5,6,How can I increase the speed of my internet connection while using a VPN?,How can Internet speed be increased by hacking through DNS?,0
3,3,7,8,Why am I mentally very lonely? How can I solve it?,"Find the remainder when [math]23^{24}[/math] is divided by 24,23?",0
4,4,9,10,"Which one dissolve in water quikly sugar, salt, methane and carbon di oxide?",Which fish would survive in salt water?,0


In [8]:
train = train.fillna(' ')

            id    qid1    qid2                         question1  \
105780  105780  174363  174364  How can I develop android app?     
201841  201841  303951  174364  How can I create an Android app?   
363362  363362  493340  493341  NaN                                

                                                                                                                          question2  \
105780  NaN                                                                                                                           
201841  NaN                                                                                                                           
363362  My Chinese name is Haichao Yu. What English name is most suitable for me considering the pronounciation of my Chinese name?   

        is_duplicate  
105780  0             
201841  0             
363362  0             


In [12]:
test = pd.read_csv('../[Data]-Quora-Question-Pairs/test.csv')

In [13]:
test.head()

Unnamed: 0,test_id,question1,question2
0,0,How does the Surface Pro himself 4 compare with iPad Pro?,Why did Microsoft choose core m3 and not core i3 home Surface Pro 4?
1,1,Should I have a hair transplant at age 24? How much would it cost?,How much cost does hair transplant require?
2,2,What but is the best way to send money from China to the US?,What you send money to China?
3,3,Which food not emulsifiers?,What foods fibre?
4,4,"How ""aberystwyth"" start reading?",How their can I start reading?


# Feature Extraction

In [20]:
train.drop(['id', 'qid1', 'qid2'], axis=1, inplace=True)
test.drop(['test_id'], axis=1, inplace=True)

In [21]:
print('Merging all questions')
all_question = pd.concat([train['question1'], train['question2'], test['question1'], test['question2']], axis=0)
all_question = all_question.reset_index(drop=True)
all_question.head()

Merging all questions


0    What is the step by step guide to invest in share market in india?          
1    What is the story of Kohinoor (Koh-i-Noor) Diamond?                         
2    How can I increase the speed of my internet connection while using a VPN?   
3    Why am I mentally very lonely? How can I solve it?                          
4    Which one dissolve in water quikly sugar, salt, methane and carbon di oxide?
dtype: object

* text cleaning

In [22]:
tqdm.pandas()
all_question = all_question.progress_apply(lambda x : cleanText(x, True, False, False, False))

  from pandas import Panel


HBox(children=(IntProgress(value=0, max=5500172), HTML(value='')))




In [23]:
train['question1'] = all_question[:404290]
train['question2'] = all_question[404290:2*404290].reset_index(drop=True)
test['question1'] = all_question[2*404290:2*404290+2345796].reset_index(drop=True)
test['question2'] = all_question[2*404290+2345796:].reset_index(drop=True)

* TFIDF Vectors
* Cosine Similarity of TFIDF Vectors

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [25]:
tfidfVectorizer = TfidfVectorizer()

tfidfVectorizer.fit(all_question)
del all_question

In [26]:
vect_train_q1 = tfidfVectorizer.transform(train['question1'])
vect_train_q2 = tfidfVectorizer.transform(train['question2'])
train['Cosine'] = cosine_similarity(vect_train_q1,vect_train_q2)

del vect_train_q1, vect_train_q2

MemoryError: 

In [None]:
vect_test_q1 = tfidfVectorizer.transform(test['question1'])
vect_test_q2 = tfidfVectorizer.transform(test['question2'])
test['Cosine'] = cosine_similarity(vect_test_q1,vect_test_q2)

del vect_test_q1, vect_test_q2

In [None]:
pickle.dump(tfidfVectorizer, open('../[Data]-Quora-Question-Pairs/tfidfVectorizer.pkl', 'wb'))

In [None]:
sns.heatmap(train[['is_duplicate', 'Cosine']].corr(), annot=True)

In [None]:
train.to_csv('../[Data]-Quora-Question-Pairs/train_feats.csv', index=False)
test.to_csv('../[Data]-Quora-Question-Pairs/test_feats.csv', index=False)