In [1]:
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="1"

from __future__ import division, unicode_literals, print_function
import warnings
warnings.filterwarnings('ignore')

import spacy
import plac
import ujson as json
import numpy as np
import pandas as pd
import en_core_web_md
import en_vectors_glove_md
from tqdm import tqdm

from pathlib import Path
from keras.utils.np_utils import to_categorical
from keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.model_selection import train_test_split
try:
    import cPickle as pickle
except ImportError:
    import pickle

from spacy_hook import get_embeddings, get_word_ids
from spacy_hook import create_similarity_pipeline
from decomposable_merge import build_model

Using TensorFlow backend.


In [2]:
def spacy_encode(df, settings, shape):
    """Encode the two question columns of ``df`` with spaCy and ``get_word_ids``.

    Parameters
    ----------
    df : mapping/DataFrame indexable by ``'question1'`` and ``'question2'``;
        each entry is an iterable of texts fed to ``nlp.pipe``.
    settings : dict providing the ``'gru_encode'`` and ``'tree_truncate'`` keys,
        forwarded to ``get_word_ids`` as ``rnn_encode`` / ``tree_truncate``.
    shape : sequence whose first element is passed on as ``max_length``.

    Returns
    -------
    tuple
        ``(q1, q2)`` -- the ``get_word_ids`` result for each question column,
        in that order.
    """
    print('Encoding data according to following settings:', settings, '\n', shape)
    question_columns = (df['question1'], df['question2'])

    print("Loading spaCy")
    pipeline = en_core_web_md.load()
    # Sanity check that the loaded model exposes a path.
    assert pipeline.path is not None

    print("Processing texts...")

    def encode_column(texts):
        # Materialise all parsed docs first, then convert them to id arrays.
        docs = list(pipeline.pipe(texts, n_threads=10, batch_size=5000))
        return get_word_ids(docs,
                            max_length=shape[0],
                            rnn_encode=settings['gru_encode'],
                            tree_truncate=settings['tree_truncate'])

    # tqdm reports progress over the two columns (question1, then question2).
    q1, q2 = [encode_column(texts) for texts in tqdm(question_columns)]
    return q1, q2

def create_mergevalidset(data_1, data_2, datafeats, labels, validation_split=None, seed=1234):
    """Shuffle-split paired samples and mirror every pair inside each partition.

    The rows are permuted with a fixed seed and cut into a training and a
    validation part. Within each part every pair appears twice: once as
    ``(data_1, data_2)`` and once swapped as ``(data_2, data_1)``, with the
    matching feature rows and labels repeated accordingly.

    Parameters
    ----------
    data_1, data_2 : np.ndarray
        Row-aligned 2-D arrays for the first and second element of each pair.
    datafeats : np.ndarray
        Row-aligned 2-D array of extra features.
    labels : np.ndarray
        Row-aligned label array.
    validation_split : float, optional
        Fraction of rows assigned to the validation part, in ``[0, 1]``.
        When omitted, the module-level ``VALIDATION_SPLIT`` is used (the
        original behaviour).
    seed : int, optional
        Seed for the permutation; defaults to the previously hard-coded 1234.

    Returns
    -------
    tuple
        ``(data_1_train, data_2_train, dataf_train, labels_train,
        data_1_val, data_2_val, dataf_val, labels_val)``.

    Raises
    ------
    NameError
        If ``validation_split`` is omitted and ``VALIDATION_SPLIT`` is undefined.
    ValueError
        If the split fraction lies outside ``[0, 1]``.

    Notes
    -----
    This reseeds NumPy's *global* random generator, exactly as before.
    """
    if validation_split is None:
        try:
            validation_split = VALIDATION_SPLIT
        except NameError:
            raise NameError("VALIDATION_SPLIT is not defined; pass validation_split=... "
                            "or define VALIDATION_SPLIT before calling create_mergevalidset")
    if not 0 <= validation_split <= 1:
        raise ValueError("validation_split must be within [0, 1], got %r" % (validation_split,))

    n_samples = len(data_1)
    np.random.seed(seed)
    perm = np.random.permutation(n_samples)

    # Compute the cut point once so both partitions are guaranteed to agree on it.
    n_train = int(n_samples * (1 - validation_split))
    idx_train, idx_val = perm[:n_train], perm[n_train:]

    def _mirrored(idx):
        # Stack the pairs in original order followed by the swapped order.
        return (np.vstack((data_1[idx], data_2[idx])),
                np.vstack((data_2[idx], data_1[idx])),
                np.vstack((datafeats[idx], datafeats[idx])),
                np.concatenate((labels[idx], labels[idx])))

    return _mirrored(idx_train) + _mirrored(idx_val)

def create_stratified_split(data_1, data_2, datafeats, labels, test_size=0.2, random_state=111):
    """Split the three row-aligned inputs and their labels into train/validation parts.

    All arrays are passed to a single ``train_test_split`` call, so one set of
    stratified indices is drawn and applied to every array. This keeps the
    rows aligned by construction instead of relying on three independent calls
    happening to share the same seed.

    Parameters
    ----------
    data_1, data_2, datafeats : array-like
        Row-aligned inputs with the same number of rows as ``labels``.
    labels : array-like
        Targets; also used as the stratification variable.
    test_size : float, optional
        Fraction held out for validation (default 0.2, the previous constant).
    random_state : int, optional
        Seed for the shuffle (default 111, the previous constant).

    Returns
    -------
    tuple
        ``(data1_tr, data2_tr, dataf_train, y_tr,
        data1_val, data2_val, dataf_val, y_val)``.
    """
    (data1_tr, data1_val,
     data2_tr, data2_val,
     dataf_train, dataf_val,
     y_tr, y_val) = train_test_split(data_1, data_2, datafeats, labels,
                                     stratify=labels,
                                     test_size=test_size,
                                     random_state=random_state)
    return data1_tr, data2_tr, dataf_train, y_tr, data1_val, data2_val, dataf_val, y_val

In [3]:
# Directory containing the pre-encoded question arrays.
# NOTE(review): absolute, machine-specific path -- adjust when running elsewhere.
qsrc = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/data/features/lemmatized_fullclean/'

# Pre-encoded question arrays (presumably spaCy word ids, judging by the file names -- confirm).
q1 = np.load(os.path.join(qsrc, 'q1train_spacylemmat_fullclean_170len_treetrunc.npy'))
q2 = np.load(os.path.join(qsrc, 'q2train_spacylemmat_fullclean_170len_treetrunc.npy'))

# Extra feature table and the is_duplicate target column.
X = pd.read_pickle('../../X_train_618cols_02065.pkl')
y = pd.read_pickle('../../y_train.pkl').is_duplicate.values

# spaCy model; its vocab is used later to build the embedding matrix.
nlp = en_core_web_md.load()
ncols = X.shape[1]

# Stratified train/validation split of both question arrays, the features and the labels.
(tr_q1, tr_q2, tr_feats, y_tr,
 val_q1, val_q2, val_feats, y_val) = create_stratified_split(q1, q2, X.values, y)

In [4]:
# Training / encoding options read by build_model, spacy_encode and the fit call.
settings = dict(
    lr=0.0005,
    dropout=0.2,
    batch_size=128,
    nr_epoch=100,
    tree_truncate=True,
    gru_encode=False,
)

# Shape tuple handed to build_model: (max_length, nr_hidden, 2, ncols).
max_length, nr_hidden = 170, 256
ncols = X.shape[1]
shape = (max_length, nr_hidden, 2, ncols)

# Optional per-class loss weights for model.fit; None disables re-weighting.
re_weight = True
class_weight = {0: 1.309028344, 1: 0.472001959} if re_weight else None

In [5]:
# Build the network on top of the spaCy embedding matrix.
model = build_model(get_embeddings(nlp.vocab), shape, settings)

# File that receives the best-by-val_loss snapshot during training.
checkpoint_path = 'decomposable_merged.h5'
callbacks = [ModelCheckpoint(checkpoint_path,
                             monitor='val_loss',
                             verbose=0, save_best_only=True),
             EarlyStopping(monitor='val_loss', patience=10, verbose=1)]

# NOTE(review): `nb_epoch` is the Keras 1 spelling (newer releases call it
# `epochs`) -- confirm the installed Keras version before renaming.
history = model.fit([tr_q1, tr_q2, tr_feats], y_tr,
                    validation_data=([val_q1, val_q2, val_feats], y_val),
                    class_weight=class_weight,
                    nb_epoch=settings['nr_epoch'],
                    batch_size=settings['batch_size'],
                    callbacks=callbacks)

# When early stopping fires, the in-memory model holds the weights of the last
# epoch run (`patience` epochs past the best one). Restore the checkpointed
# best weights so later predictions come from the best model.
model.load_weights(checkpoint_path)

# Keep the History object as the cell's displayed result.
history

Train on 323432 samples, validate on 80858 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 00018: early stopping


<keras.callbacks.History at 0x7f586c040080>

In [6]:
# Score both partitions with the trained model; inputs are ordered
# [question 1, question 2, extra features], matching the fit call.
train_inputs = [tr_q1, tr_q2, tr_feats]
val_inputs = [val_q1, val_q2, val_feats]

tr_preds = model.predict(train_inputs, batch_size=16)
val_preds = model.predict(val_inputs, batch_size=16)

In [8]:
# Persist the predictions; np.save appends the '.npy' extension.
# NOTE(review): the 0.3787 suffix is hard-coded (presumably the validation
# loss of this run -- confirm and update when retraining).
np.save(file='train_decomposable_merged_0.3787', arr=tr_preds)
np.save(file='val_decomposable_merged_0.3787', arr=val_preds)