In [None]:
import os
# Pin GPU selection through environment variables before the deep-learning
# libraries below are imported.
# NOTE(review): presumably PCI_BUS_ID makes CUDA device indices follow PCI bus
# order and "1" exposes only that single device to this kernel -- confirm
# against the machine's GPU layout.
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="1"

from __future__ import division, unicode_literals, print_function
import warnings
warnings.filterwarnings('ignore')

import spacy
import plac
import ujson as json
import numpy
import pandas as pd
import en_core_web_md
#import en_vectors_glove_md


from pathlib import Path
from keras.utils.np_utils import to_categorical
from keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.model_selection import train_test_split
try:
    import cPickle as pickle
except ImportError:
    import pickle

from spacy_hook import get_embeddings, get_word_ids
from spacy_hook import create_similarity_pipeline
from keras_decomposable_attention import build_model

In [None]:
def get_quora_data(src_train, src_test, test_size=0.15, random_state=111):
    """Load the training CSV and split it into train and validation frames.

    Parameters
    ----------
    src_train : str or path-like
        CSV file read with ``pd.read_csv``. Rows containing any missing
        value are dropped before the split.
    src_test : str or path-like
        Accepted only to keep the existing call signature; this function
        never reads it.
    test_size : float or int, optional
        Forwarded to ``train_test_split`` as the validation share. The
        default (0.15) is the value that used to be hard-coded.
    random_state : int or None, optional
        Forwarded to ``train_test_split`` as the shuffling seed. The default
        (111) is the value that used to be hard-coded.

    Returns
    -------
    tuple
        ``(df_tr, df_val)`` -- the training and validation DataFrames.
    """
    # Chain dropna() instead of mutating in place so the lineage is explicit.
    df_train = pd.read_csv(src_train).dropna()
    df_tr, df_val = train_test_split(
        df_train, test_size=test_size, random_state=random_state)
    return df_tr, df_val

def evaluate(dev_loc):
    """Count how often the pipeline's similarity output matches the labels.

    Text pairs and labels are obtained from ``read_snli(dev_loc)``.
    NOTE(review): ``read_snli`` is neither defined nor imported in the
    visible part of this notebook -- it must be provided elsewhere before
    this function can run.

    For every pair, both texts are run through a spaCy pipeline loaded with
    ``create_similarity_pipeline`` and the ``argmax`` of the similarity
    result is compared with the ``argmax`` of the label.

    Returns
    -------
    tuple of float
        ``(correct, total)``: the number of matching pairs and the number of
        pairs examined.
    """
    texts_a, texts_b, gold = read_snli(dev_loc)
    pipeline = spacy.load('en', create_pipeline=create_similarity_pipeline)
    # One boolean per pair: did the predicted class equal the labelled class?
    outcomes = [
        bool(pipeline(first).similarity(pipeline(second)).argmax()
             == target.argmax())
        for first, second, target in zip(texts_a, texts_b, gold)
    ]
    return float(sum(outcomes)), float(len(outcomes))

def train_mine(shape, settings, savename):
    """Build the network from ``build_model`` and fit it on the question pairs.

    The data is NOT passed in: the function reads the module-level frames
    ``df_tr`` and ``df_val`` (columns ``question1``, ``question2`` and
    ``is_duplicate``), so both must be bound before this is called.

    Parameters
    ----------
    shape : sequence
        Forwarded to ``build_model``; ``shape[0]`` is also used as
        ``max_length`` when converting documents with ``get_word_ids``.
    settings : dict
        Forwarded to ``build_model`` as a whole. This function itself reads
        the keys ``'gru_encode'``, ``'tree_truncate'``, ``'nr_epoch'`` and
        ``'batch_size'``.
    savename : str
        Prefix of the checkpoint file; the checkpoint callback writes to
        ``'<savename>.h5'``.

    Returns
    -------
    The model object after ``fit`` has been called on it.
    """
    q1_train = df_tr['question1']
    q2_train = df_tr['question2']
    y_train = to_categorical(df_tr['is_duplicate'])
    q1_dev = df_val['question1']
    q2_dev = df_val['question2']
    y_dev = to_categorical(df_val['is_duplicate'])

    print("Loading spaCy")
    nlp = en_core_web_md.load()
    assert nlp.path is not None

    print("Compiling network")
    embeddings = get_embeddings(nlp.vocab)
    model = build_model(embeddings, shape, settings)

    print("Processing texts...")

    def encode(texts):
        # Parse one column of texts with spaCy, then map the docs to id arrays.
        docs = list(nlp.pipe(texts, n_threads=20, batch_size=20000))
        return get_word_ids(docs,
                            max_length=shape[0],
                            rnn_encode=settings['gru_encode'],
                            tree_truncate=settings['tree_truncate'])

    train_X1, train_X2, dev_X1, dev_X2 = [
        encode(column) for column in (q1_train, q2_train, q1_dev, q2_dev)
    ]
    print(settings)

    # Keep only the checkpoint with the best monitored val_loss, and stop
    # once val_loss has not improved for 10 epochs.
    checkpoint = ModelCheckpoint('{}.h5'.format(savename),
                                 monitor='val_loss',
                                 verbose=0,
                                 save_best_only=True)
    early_stop = EarlyStopping(monitor='val_loss', patience=10, verbose=1)

    model.fit(
        [train_X1, train_X2],
        y_train,
        validation_data=([dev_X1, dev_X2], y_dev),
        nb_epoch=settings['nr_epoch'],
        batch_size=settings['batch_size'],
        callbacks=[checkpoint, early_stop])

    return model

In [None]:
# Input locations, relative to the notebook's working directory.
# NOTE(review): only src_train and src_test are used by the visible cells;
# the *_raw paths appear to be kept for reference.
src_train_raw = '../../../data/train.csv'
src_test_raw = '../../../data/test.csv'

src_train = '../../../features/df_train_spacy_lemmat.csv'
src_test = '../../../features/df_test_spacy_lemmat.csv'

# Hyper-parameters handed to build_model / train_mine.
settings = dict(
    lr=0.0005,
    dropout=0.2,
    batch_size=128,
    nr_epoch=100,
    tree_truncate=True,
    gru_encode=False,
)

# shape = (max_length, nr_hidden, 2); shape[0] is used as max_length in
# train_mine.
max_length, nr_hidden = 128, 256
shape = (max_length, nr_hidden, 2)
print(shape)

In [None]:
# Bind the train/validation frames at module level: train_mine reads `df_tr`
# and `df_val` from the global namespace instead of taking them as arguments.
df_tr, df_val = get_quora_data(src_train, src_test)
# Train with the shape/settings configured earlier; the checkpoint callback in
# train_mine writes to '<savename>.h5'.
# NOTE(review): the save name mentions "BiRNN" while settings['gru_encode'] is
# False -- confirm the name matches the configuration actually being trained.
train_mine(shape, settings, 'decomposable_encoreweb_0.0005LR_treetrunc_BiRNN')