In [1]:
import numpy as np
import pandas as pd
import itertools as it
import pickle
import glob
import os
import string
import gc
import re
import time
import nltk
import spacy
import textacy
import en_core_web_md
import sematch
import gensim

from nltk import word_tokenize
from nltk.corpus import wordnet as wn
from tqdm import tqdm, tqdm_notebook
from scipy import sparse
from scipy.optimize import minimize

from cleaning_utils import *

In [5]:
def _quora_csv_path(src, filename):
    """Build the path of ``filename`` for the location ``src``.

    If ``src`` is an existing directory the two are joined with the platform
    separator, so a missing trailing slash no longer yields a bogus name such
    as ``datatrain.csv``. Otherwise ``src`` is treated as a raw prefix and
    simply concatenated, which is the original behaviour.
    """
    if os.path.isdir(src):
        return os.path.join(src, filename)
    return src + filename


def load_quora(src):
    """Load the Quora train and test CSVs into one combined DataFrame.

    Parameters
    ----------
    src : str
        Directory containing ``train.csv`` and ``test.csv`` (with or without
        a trailing separator), or a raw path prefix to which the file names
        are appended.

    Returns
    -------
    df : pandas.DataFrame
        Train rows followed by test rows, indexed by ``uid`` running from 0
        to ``len(df) - 1``. Columns that exist in only one of the two files
        are filled with ``-1`` in the other (``test_id`` for train rows;
        ``id``, ``qid1``, ``qid2``, ``is_duplicate`` for test rows). Missing
        ``question1`` / ``question2`` values are replaced by empty strings.
    shapes : tuple of int
        ``(number of train rows, number of test rows)``.
    """
    print('Loading Quora dataset.')
    df_train = pd.read_csv(_quora_csv_path(src, 'train.csv'))
    df_train['test_id'] = -1
    df_test = pd.read_csv(_quora_csv_path(src, 'test.csv'))
    # Give the test frame the train-only columns so both frames share a schema.
    for column in ('id', 'qid1', 'qid2', 'is_duplicate'):
        df_test[column] = -1
    df = pd.concat([df_train, df_test])
    for column in ('question1', 'question2'):
        df[column] = df[column].fillna('')
    # The concatenated frame carries duplicate row labels (each CSV starts at
    # 0), so replace them with one unique running id.
    df['uid'] = np.arange(df.shape[0])
    df = df.set_index(['uid'])
    shapes = (df_train.shape[0], df_test.shape[0])
    print('Dataset loaded,', df_train.shape, df_test.shape)
    return df, shapes

def _normalize_text(text):
    """Run the textacy normalisation applied to every question string."""
    return textacy.preprocess.preprocess_text(text,
                                              fix_unicode=True,
                                              lowercase=True,
                                              no_contractions=True,
                                              transliterate=True)


def _lemmatize_texts(texts, pipeline, symbols):
    """Return one space-joined lemma string per text, skipping ``symbols``."""
    lemmatized = []
    # nlp.pipe yields a generator, so tqdm needs an explicit total to show
    # progress against the real number of documents.
    docs = pipeline.pipe(texts, n_threads=8, batch_size=10000)
    for doc in tqdm(docs, total=len(texts)):
        lemmatized.append(' '.join(tok.lemma_ for tok in doc
                                   if tok.lemma_ not in symbols))
    return lemmatized


def _correct_text(text):
    """Tokenize ``text`` and replace each token with ``correction(token)``."""
    return ' '.join(correction(token) for token in word_tokenize(text))


def clean_lemmat_quora(src, nlp_model=None):
    """Load, clean, lemmatize and spell-correct the Quora question pairs.

    Steps, applied to both ``question1`` and ``question2``:

    1. ``load_quora(src)`` builds the combined train+test frame.
    2. ``clean_part1`` and ``clean_part2`` are applied to the frame
       (presumably provided by the star import from ``cleaning_utils`` --
       confirm).
    3. textacy normalisation (unicode fix, lowercase, contraction expansion,
       transliteration).
    4. spaCy lemmatisation, dropping punctuation-like lemmas.
    5. Per-token ``correction`` (defined outside this block).

    Parameters
    ----------
    src : str
        Data location forwarded to ``load_quora``.
    nlp_model : optional
        spaCy pipeline used for lemmatisation. When omitted, the module-level
        ``nlp`` is used, which is how the function originally worked.

    Returns
    -------
    (df_train, df_test) : tuple of pandas.DataFrame
        The cleaned frame split positionally at the number of train rows
        reported by ``load_quora``.
    """
    t = time.time()
    pipeline = nlp if nlp_model is None else nlp_model
    question_columns = ('question1', 'question2')

    df, shapes = load_quora(src)
    print('Cleaning based on forums functions.')
    df = clean_part1(df)
    df = clean_part2(df)

    print('Cleaning using textacy.')
    for column in question_columns:
        df[column] = df[column].apply(_normalize_text)

    print('Lemmatizing text.')
    SYMBOLS = set(string.punctuation) | {'...', '“', '”', '\'ve'}
    for column in question_columns:
        # Assign the plain list so values are written back positionally, in
        # the same order the pipeline consumed them. Wrapping the list in a
        # DataFrame first would align on index labels instead and could
        # introduce NaN if the cleaning helpers changed the index.
        df[column] = _lemmatize_texts(df[column], pipeline, SYMBOLS)

    print('Correcting words. Using most probable substitutes.')
    for column in question_columns:
        df[column] = df[column].apply(_correct_text)

    # NOTE(review): this positional split assumes clean_part1/clean_part2 keep
    # the row count and order produced by load_quora -- confirm.
    df_train = df.iloc[:shapes[0], :]
    df_test = df.iloc[shapes[0]:, :]
    print('Text cleaning done, time it took:', time.time() - t)
    return df_train, df_test

In [3]:
# Load the GoogleNews word2vec vectors from their binary file.
model = gensim.models.KeyedVectors.load_word2vec_format(
    '../../data/embeddings/GoogleNews-vectors-negative300.bin',
    binary=True,
)
# Map each vocabulary word to its position in the model's index2word list.
words = model.index2word
w_rank = {word: i for i, word in enumerate(words)}
# WORDS is another name for the same dict object.
WORDS = w_rank

In [6]:
# Location of the Quora CSVs. Set the QUORA_DATA_DIR environment variable to
# run on another machine; the default is the original machine-specific path.
src = os.environ.get(
    'QUORA_DATA_DIR',
    '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/data/')
# clean_lemmat_quora reads the spaCy pipeline from the module-level name `nlp`,
# so it must be defined before the call.
nlp = en_core_web_md.load()
df_train, df_test = clean_lemmat_quora(src)

# index=False drops the `uid` index when the cleaned frames are written out.
df_train.to_csv('df_train_spacylemmat_fullclean.csv', index = False)
df_test.to_csv('df_test_spacylemmat_fullclean.csv', index = False)

Loading Quora dataset.
Dataset loaded, (404290, 7) (2345796, 7)
Cleaning based on forums functions.
Cleaning using textacy.


0it [00:00, ?it/s]

Lemmatizing text.


2750086it [07:31, 6086.26it/s]
2750086it [07:34, 6055.79it/s]


Correcting words. Using most probable substitutes.
Text cleaning done, time it took: 2688.0724029541016
