In [1]:
import numpy as np
import pandas as pd
import itertools as it
import pickle
import glob
import os
import string
import gc

import nltk
import spacy
import en_core_web_md
import sematch
from tqdm import tqdm
from nltk.corpus import wordnet as wn

from tqdm import tqdm, tqdm_notebook
from scipy import sparse
from scipy.optimize import minimize

import re

In [2]:
def concat_words(df2):
    """Return a copy of ``df2`` with every object-dtype cell joined into one string.

    Each cell of an object-dtype column is treated as an iterable of strings
    and collapsed with ``' '.join``. Columns of other dtypes are left as they
    are, and ``df2`` itself is not modified because the work is done on a copy.
    """
    joined = df2.copy()
    object_columns = joined.select_dtypes(include=['object']).columns.values
    for column in object_columns:
        joined[column] = joined[column].apply(lambda tokens: ' '.join(tokens))
    return joined

def text_to_wordlist(text):
    """Normalise one question string: expand contractions, strip symbols, fix spellings.

    The substitutions run in three ordered phases:

    1. contraction / hyphen rules, which must see the raw apostrophes and
       hyphens (previously these ran after the symbol strip and never matched);
    2. every character outside ``[A-Za-z0-9]`` is replaced by a space;
    3. space-delimited abbreviations, spelling fixes and case normalisation,
       in their original relative order.

    Parameters
    ----------
    text : object
        Raw text. Anything that is not already a string is converted with
        ``str`` first.

    Returns
    -------
    str
        The cleaned text. It contains only ASCII letters, digits and spaces,
        so no separate punctuation filter is needed afterwards.
    """
    text = str(text)

    # Phase 1: rules that depend on apostrophes or hyphens still being present.
    contraction_rules = (
        (r"what's", ""),
        (r"What's", ""),
        (r"\'s", " "),
        (r"\'ve", " have "),
        (r"can't", "cannot "),
        (r"n't", " not "),
        (r"I'm", "I am"),
        (r"\'re", " are "),
        (r"\'d", " would "),
        (r"\'ll", " will "),
        (r"e-mail", "email"),
    )
    for pattern, replacement in contraction_rules:
        text = re.sub(pattern, replacement, text)

    # Phase 2: turn every remaining symbol into a space so that the
    # space-delimited rules below (" e g ", " u s ", " 9 11 ", ...) can match.
    text = re.sub(r"[^A-Za-z0-9]", " ", text)

    # Phase 3: order matters (e.g. whitespace is collapsed mid-way, and later
    # replacements may introduce new double spaces, exactly as before).
    word_rules = (
        (r" m ", " am "),
        (r"60k", " 60000 "),
        (r" e g ", " eg "),
        (r" b g ", " bg "),
        # Was r"\0s": in a regex "\0" is the NUL character, not the digit 0,
        # so "1990s" was never reduced to "1990".
        (r"0s\b", "0"),
        (r" 9 11 ", "911"),
        (r"\s{2,}", " "),
        (r"quikly", "quickly"),
        (r" usa ", " America "),
        (r" USA ", " America "),
        (r" u s ", " America "),
        (r" uk ", " England "),
        (r" UK ", " England "),
        (r"india", "India"),
        (r"switzerland", "Switzerland"),
        (r"china", "China"),
        (r"chinese", "Chinese"),
        (r"imrovement", "improvement"),
        (r"intially", "initially"),
        (r"quora", "Quora"),
        (r" dms ", "direct messages "),
        (r"demonitization", "demonetization"),
        (r"actived", "active"),
        (r"kms", " kilometers "),
        (r"KMs", " kilometers "),
        (r" cs ", " computer science "),
        (r" upvotes ", " up votes "),
        (r" iPhone ", " phone "),
        # Was r"\0rs " (NUL again). NOTE(review): the intent appears to be
        # separating a trailing "rs" unit from an amount ("500rs" -> "500 rs");
        # the zero is kept so the amount is not altered.
        (r"0rs ", "0 rs "),
        (r"calender", "calendar"),
        (r"ios", "operating system"),
        (r"gps", "GPS"),
        (r"gst", "GST"),
        (r"programing", "programming"),
        (r"bestfriend", "best friend"),
        (r"dna", "DNA"),
        (r"III", "3"),
        (r"the US", "America"),
        (r"Astrology", "astrology"),
        (r"Method", "method"),
        (r"Find", "find"),
        (r"banglore", "Banglore"),
        (r" J K ", " JK "),
    )
    for pattern, replacement in word_rules:
        text = re.sub(pattern, replacement, text)

    return text

def process_data(data):
    """Apply the ``abbr_dict`` regex replacements to ``data`` in place.

    ``data`` is modified directly (``inplace=True``) and the very same object
    is returned. The keys of ``abbr_dict`` are interpreted as regular
    expressions because ``regex=True`` is passed.
    """
    data.replace(to_replace=abbr_dict, regex=True, inplace=True)
    return data

def basic_cleaning2(string):
    """Lower-case ``string`` and blank out digits and common punctuation.

    The value is converted with ``str`` first, so non-string input such as
    ``None`` is cleaned as its textual form. Every digit and each of
    ``( ) ! ^ % $ ' " . ; , - / \\ : < = > ? { } [ ]`` becomes a space, then
    runs of spaces are collapsed to one. Leading/trailing spaces are not
    stripped and stop words are not removed.

    Note: the parameter name shadows the ``string`` module inside this
    function; it is kept because it is part of the call signature.
    """
    string = str(string).lower()
    # Raw string: the former non-raw literal relied on invalid escape
    # sequences and its "\\/" collapsed to an escaped slash, so backslashes
    # were never removed. ":", "<", "=" and ">" are listed explicitly because
    # the old ",-?" range matched them implicitly.
    string = re.sub(r"""[0-9()!^%$'".;,\-/\\:<=>?{}\[\]]""", ' ', string)
    string = re.sub(' +', ' ', string)
    return string

def clean_part1(df2):
    """Return a copy of ``df2`` with ``text_to_wordlist`` applied to each object-dtype cell.

    Only columns selected by ``select_dtypes(include=['object'])`` are
    rewritten; ``df2`` itself is left untouched.
    """
    cleaned = df2.copy()
    for column in cleaned.select_dtypes(include=['object']).columns.values:
        cleaned[column] = cleaned[column].apply(text_to_wordlist)
    return cleaned

def clean_part2(df2):
    """Return a copy of ``df2`` with each object-dtype cell cleaned and re-spaced.

    Every cell goes through ``basic_cleaning2``, is split on whitespace and
    joined back with single spaces, which also removes leading and trailing
    whitespace. ``df2`` itself is left untouched.
    """
    cleaned = df2.copy()
    object_columns = cleaned.select_dtypes(include=['object']).columns.values
    for column in object_columns:
        cleaned[column] = cleaned[column].apply(
            lambda cell: ' '.join(basic_cleaning2(cell).split())
        )
    return cleaned

In [3]:

# Regex character class for the punctuation to delete: " ' ? , and .
# The backslash is written as "\\" so the literal holds no invalid escape
# sequence; the runtime value is unchanged.
punctuation = '["\'?,\\.]'

# Regex pattern -> replacement table, meant for a regex-enabled replace.
# The punctuation and whitespace entries are listed last.
# NOTE(review): expanding contractions relies on those two entries being
# applied after the ones above - confirm the replacement order of the consumer.
abbr_dict = {
    "what's": "what is",
    "what're": "what are",
    "who's": "who is",
    "who're": "who are",
    "where's": "where is",
    "where're": "where are",
    "when's": "when is",
    "when're": "when are",
    "how's": "how is",
    "how're": "how are",

    "i'm": "i am",
    "we're": "we are",
    "you're": "you are",
    "they're": "they are",
    "it's": "it is",
    "he's": "he is",
    "she's": "she is",
    "that's": "that is",
    "there's": "there is",
    "there're": "there are",

    "i've": "i have",
    "we've": "we have",
    "you've": "you have",
    "they've": "they have",
    "who've": "who have",
    "would've": "would have",
    "not've": "not have",

    "i'll": "i will",
    "we'll": "we will",
    "you'll": "you will",
    "he'll": "he will",
    "she'll": "she will",
    "it'll": "it will",
    "they'll": "they will",

    "isn't": "is not",
    "wasn't": "was not",
    "aren't": "are not",
    "weren't": "were not",
    "can't": "can not",
    "couldn't": "could not",
    "don't": "do not",
    "didn't": "did not",
    "shouldn't": "should not",
    "wouldn't": "would not",
    "doesn't": "does not",
    "haven't": "have not",
    "hasn't": "has not",
    "hadn't": "had not",
    "won't": "will not",
    punctuation: '',  # delete the punctuation matched by the class above
    r'\s+': ' ',  # collapse any run of whitespace into one space
}

In [4]:
# Directory holding train.csv and test.csv. The QUORA_DATA_DIR environment
# variable overrides the original machine-specific default, so the notebook can
# run elsewhere without editing this cell. os.path.join(..., '') guarantees a
# trailing separator, because file names are appended to `src` with plain
# string concatenation.
src = os.path.join(
    os.environ.get(
        'QUORA_DATA_DIR',
        '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/data/',
    ),
    '',
)

df_train = pd.read_csv(src + 'train.csv')
# -1 is the placeholder for columns a frame does not have on disk, so that
# both frames expose the same seven columns before they are stacked.
df_train['test_id'] = -1

df_test = pd.read_csv(src + 'test.csv')
df_test['id'] = -1
df_test['qid1'] = -1
df_test['qid2'] = -1
df_test['is_duplicate'] = -1

# Stack train on top of test; columns are matched by name.
df = pd.concat([df_train, df_test])
df['question1'] = df['question1'].fillna('')
df['question2'] = df['question2'].fillna('')
# concat keeps each file's own row labels, so build one running integer index.
df['uid'] = np.arange(df.shape[0])
df = df.set_index(['uid'])
print(df_train.shape, df_test.shape)

(404290, 7) (2345796, 7)


In [5]:
# Size of the combined frame before any cleaning.
print(df.shape)
# Replace any value that is still missing with the literal 'empty', then run
# the two cleaning passes in order: regex normalisation first, followed by
# lower-casing and digit/punctuation removal.
df = df.fillna('empty')
df = clean_part2(clean_part1(df))

(2750086, 7)


In [6]:
# Load the spaCy pipeline shipped in the en_core_web_md package.
nlp = en_core_web_md.load()
# Preview: lemmatise the first five question1 entries, dropping '?' lemmas.
df['question1'].head().apply(
    lambda question: ' '.join(
        token.lemma_ for token in nlp(str(question)) if token.lemma_ != '?'
    )
)

uid
0    what be the step by step guide to invest in sh...
1     what be the story of kohinoor koh i noor diamond
2    how can i increase the speed of -PRON- interne...
3    why be i mentally very lonely how can i solve ...
4    which one dissolve in water quickly sugar salt...
Name: question1, dtype: object

In [7]:
# Lemmas to discard: every ASCII punctuation character plus a few extra tokens.
SYMBOLS = set(string.punctuation).union(['...', '“', '”', '\'ve'])


def _lemmatize_questions(texts):
    """Return one space-joined lemma string per text, skipping lemmas in SYMBOLS."""
    lemmatized = []
    # NOTE(review): n_threads is passed exactly as before; confirm that the
    # installed spaCy version still accepts this argument.
    for parsed in nlp.pipe(texts, n_threads=8, batch_size=10000):
        kept = [token.lemma_ for token in parsed if token.lemma_ not in SYMBOLS]
        lemmatized.append(' '.join(kept))
    return lemmatized


q1 = _lemmatize_questions(df['question1'])
q2 = _lemmatize_questions(df['question2'])


In [8]:
# Wrap the lemmatised strings in single-column frames (column label 0).
q1 = pd.DataFrame(q1)
q2 = pd.DataFrame(q2)

# Assign by position rather than by index alignment: a length mismatch now
# raises immediately instead of silently producing missing values.
df['question1'] = q1[0].values
df['question2'] = q2[0].values

# Split the combined frame back into its two parts by position. df_train still
# holds the number of training rows, so the CSV files are not read again just
# to recover that count.
n_train = df_train.shape[0]
df_train = df.iloc[:n_train, :]
df_test = df.iloc[n_train:, :]

df_train.to_csv('df_train_spacy_lemmat.csv', index = False)
df_test.to_csv('df_test_spacy_lemmat.csv', index = False)