In [None]:
import os
import warnings
warnings.filterwarnings('ignore')

import spacy
import ujson as json
import numpy
import numpy as np
import pandas as pd
import en_core_web_md
from tqdm import tqdm

import numpy
import numpy.random
import json
from spacy.tokens.span import Span

In [None]:
def get_word_ids(docs, rnn_encode=False, tree_truncate=False, max_length=100, nr_unk=100):
    """Encode parsed documents as a fixed-width matrix of integer word ids.

    Each row of the result corresponds to one entry of ``docs``. The id layout
    written by this function is:

    * ``0``  - padding (cells that are never written),
    * ``1``  - written directly after the last kept token when the row has
      room left,
    * ``token.rank + 1`` - for tokens whose ``has_vector`` is true,
    * ``(token.shape % (nr_unk - 1)) + 2`` - for tokens without a vector.

    Parameters
    ----------
    docs : sequence
        Parsed documents. Each item must be iterable over tokens; when
        ``tree_truncate`` is true it must also be a ``Span`` (uses ``.root``)
        or expose ``.sents`` whose items have ``.root``.
    rnn_encode : bool
        When true, punctuation and whitespace tokens are kept; otherwise
        they are skipped.
    tree_truncate : bool
        When true, tokens are collected breadth-first starting from the
        root(s) and following ``lefts`` / ``rights``, instead of in document
        order.
    max_length : int
        Width of the output matrix; at most this many tokens are encoded
        per document.
    nr_unk : int
        Controls the modulus used for vector-less tokens. Must not be 1,
        otherwise the modulus is zero.

    Returns
    -------
    numpy.ndarray
        ``int32`` array of shape ``(len(docs), max_length)``.
    """
    # Local import: deque gives O(1) pops from the left, whereas list.pop(0)
    # shifts the whole list on every iteration.
    from collections import deque

    Xs = numpy.zeros((len(docs), max_length), dtype='int32')
    for i, doc in enumerate(docs):
        if tree_truncate:
            if isinstance(doc, Span):
                queue = deque([doc.root])
            else:
                queue = deque(sent.root for sent in doc.sents)
        else:
            queue = deque(doc)

        # NOTE: the bound is deliberately `<=` (collects up to max_length + 1
        # tokens), as in the original code. Tightening it would change which
        # tokens survive the sort-then-truncate step when tree_truncate=True.
        words = []
        while len(words) <= max_length and queue:
            word = queue.popleft()
            if rnn_encode or (not word.is_punct and not word.is_space):
                words.append(word)
            if tree_truncate:
                queue.extend(word.lefts)
                queue.extend(word.rights)

        # Restore the tokens' natural ordering after breadth-first collection.
        words.sort()

        # Slicing makes the truncation explicit and is safe for max_length == 0,
        # where indexing column 0 would raise IndexError.
        for j, token in enumerate(words[:max_length]):
            if token.has_vector:
                Xs[i, j] = token.rank + 1
            else:
                Xs[i, j] = (token.shape % (nr_unk - 1)) + 2

        # Only mark the end of the sequence when the row was not filled.
        if len(words) < max_length:
            Xs[i, len(words)] = 1
    return Xs

def spacy_encode(df, df_lens, settings, savename, save = True):
    """Run spaCy over both question columns and encode them with get_word_ids.

    Parameters
    ----------
    df : DataFrame-like
        Must provide ``'question1'`` and ``'question2'`` columns.
    df_lens : sequence
        Only ``df_lens[0]`` is read; it is used as the row index at which
        the encoded matrices are split into a train part and a test part.
    settings : dict
        Must contain ``'sentence_length'``, ``'gru_encode'`` and
        ``'tree_truncate'``; they are forwarded to ``get_word_ids`` as
        ``max_length``, ``rnn_encode`` and ``tree_truncate``.
    savename : str
        Suffix inserted into the four output file names.
    save : bool
        When true, the four split matrices are written with ``np.save`` and
        nothing is returned. When false, ``(q1, q2)`` is returned instead
        and nothing is written.
    """
    print('Encoding data according to following settings:', settings, '\n')
    question_columns = (df['question1'], df['question2'])

    print("Loading spaCy")
    nlp = en_core_web_md.load()
    assert nlp.path is not None

    print("Processing texts...")
    encoded_data = []
    for texts in tqdm(question_columns):
        parsed = list(nlp.pipe(texts, n_threads=6, batch_size=5000))
        word_ids = get_word_ids(
            parsed,
            max_length=settings['sentence_length'],
            rnn_encode=settings['gru_encode'],
            tree_truncate=settings['tree_truncate'],
        )
        encoded_data.append(word_ids)
    q1, q2 = encoded_data

    if not save:
        return q1, q2

    # Rows before `split_at` go to the train files, the remainder to the test files.
    split_at = df_lens[0]
    outputs = (
        ('q1train_{}', q1[:split_at]),
        ('q2train_{}', q2[:split_at]),
        ('q1test_{}', q1[split_at:]),
        ('q2test_{}', q2[split_at:]),
    )
    for name_pattern, matrix in outputs:
        np.save(name_pattern.format(savename), matrix)
    
def load_quora_data(src_train, src_test):
    """Read the train and test CSV files and stack them into one frame.

    Missing values in either file are replaced with the string ``'NULL'``
    before stacking. The frames are concatenated train-first without
    resetting the index, so index labels from both files are kept as read.

    Returns
    -------
    tuple
        ``(df, df_lens)`` where ``df`` is the concatenated frame and
        ``df_lens`` is ``(number_of_train_rows, number_of_test_rows)``.
    """
    frames = [pd.read_csv(path) for path in (src_train, src_test)]
    for frame in frames:
        frame.fillna('NULL', inplace=True)
    n_train, n_test = (len(frame) for frame in frames)
    return pd.concat(frames), (n_train, n_test)

In [None]:
# Input files: pre-processed (lemmatized / cleaned / stemmed) Quora question pairs.
src_train = 'df_train_lemmatfullcleanSTEMMED.csv'
src_test = 'df_test_lemmatfullcleanSTEMMED.csv'

# Encoding options forwarded to spacy_encode / get_word_ids.
settings = {
    'tree_truncate': False,
    'gru_encode': False,
    'sentence_length': 48,
    }

df, lengths = load_quora_data(src_train, src_test)

# Values read from CSV are plain strings, and ' '.join(<str>) inserts a space
# between every character ("hello" -> "h e l l o"), which would make spaCy
# tokenize single letters. Only genuine token sequences are joined, and both
# question columns are treated the same way.
# NOTE(review): assumes the join was meant for list-of-token cells - confirm.
for column in ('question1', 'question2'):
    df[column] = df[column].apply(
        lambda value: ' '.join(value) if isinstance(value, (list, tuple)) else value
    )

spacy_encode(df, lengths, settings, 'lemmatfullcleanSTEMMED_48len')