In [1]:
import os
# Ask CUDA to enumerate GPUs by PCI bus id and expose only device "0" to
# this process.
# NOTE(review): presumably these must be set before the deep-learning backend
# is imported further down in this cell - confirm before reordering imports.
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="0"

from __future__ import division, unicode_literals, print_function
import warnings
warnings.filterwarnings('ignore')

import spacy
import plac
import ujson as json
import numpy as np
import pandas as pd
import en_core_web_md
import en_vectors_glove_md
from tqdm import tqdm

from pathlib import Path
from keras.utils.np_utils import to_categorical
from keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.model_selection import train_test_split
try:
    import cPickle as pickle
except ImportError:
    import pickle

from spacy_hook import get_embeddings, get_word_ids
from spacy_hook import create_similarity_pipeline
from decomposable_merge import build_model

Using TensorFlow backend.


In [2]:
from keras.layers import InputSpec, Layer, Input, Dense, merge
from keras.layers import Lambda, Activation, Dropout, Embedding, TimeDistributed
from keras.layers import Bidirectional, GRU, LSTM
from keras.layers.noise import GaussianNoise
from keras.layers.advanced_activations import ELU
import keras.backend as K
from keras.models import Sequential, Model, model_from_json
from keras.regularizers import l2
from keras.optimizers import Adam
from keras.layers.normalization import BatchNormalization
from keras.layers.pooling import GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.layers import Merge
from keras.layers.advanced_activations import PReLU
from keras.layers.merge import concatenate

In [3]:
def spacy_encode(df, settings, shape):
    """Encode the two question columns of ``df`` with spaCy and ``get_word_ids``.

    Parameters
    ----------
    df : mapping / DataFrame
        Must provide the ``'question1'`` and ``'question2'`` entries.
    settings : dict
        Reads ``settings['gru_encode']`` and ``settings['tree_truncate']``,
        which are forwarded to ``get_word_ids``.
    shape : sequence
        Only ``shape[0]`` is used, as the ``max_length`` passed to
        ``get_word_ids``.

    Returns
    -------
    tuple
        ``(q1, q2)``: the ``get_word_ids`` result for ``question1`` and
        ``question2`` respectively.
    """
    print('Encoding data according to following settings:', settings, '\n', shape)
    question_columns = (df['question1'], df['question2'])

    print("Loading spaCy")
    nlp = en_core_web_md.load()
    # NOTE(review): presumably a sanity check that the model was loaded from
    # disk - confirm what ``nlp.path`` means for the installed spaCy version.
    assert nlp.path is not None

    print("Processing texts...")

    def _encode(texts):
        # Parse the whole column first, then convert the docs to word ids.
        docs = list(nlp.pipe(texts, n_threads=10, batch_size=5000))
        return get_word_ids(docs,
                            max_length=shape[0],
                            rnn_encode=settings['gru_encode'],
                            tree_truncate=settings['tree_truncate'])

    q1, q2 = [_encode(texts) for texts in tqdm(question_columns)]
    return q1, q2

def get_train(features_root='../../../../data/features'):
    """Load the pre-computed training feature tables and build ``(X, y)``.

    Parameters
    ----------
    features_root : str, optional
        Directory holding the ``abhishek`` and ``spacylemmat_fullclean``
        feature folders. The default reproduces the paths that were
        previously hard-coded.

    Returns
    -------
    X : pandas.DataFrame
        Feature matrix: the merged table without constant-duplicate columns,
        label columns, the six leading columns and the two ``question*_y``
        text columns. Missing values are replaced by -999.
    y : pandas.Series
        The ``is_duplicate_y`` column of the merged table.

    Raises
    ------
    KeyError
        If an expected column (``id``, ``is_duplicate_y``, ``is_duplicate_x``,
        ``question1_y``, ``question2_y``, ``q1_freq``, ``q2_freq``) is missing.
    """
    clean_dir = '{}/spacylemmat_fullclean'.format(features_root)

    # The first two columns of the abhishek table are skipped.
    abhishek_feats = pd.read_csv('{}/abhishek/train_features.csv'.format(features_root),
                                 encoding='ISO-8859-1').iloc[:, 2:]
    text_feats = pd.read_csv('{}/train_whq_with_jaccard_feats.csv'.format(clean_dir))
    eda_feats = pd.read_csv('{}/train_eda_features.csv'.format(clean_dir))
    mephisto_feats = pd.read_csv('{}/train_mephistopeheles_features.csv'.format(clean_dir))
    turkewitz_feats = pd.read_csv('{}/train_turkewitz_features.csv'.format(clean_dir))
    srk_feats = pd.read_csv('{}/train_SRKgrams_features.csv'.format(clean_dir))
    turkewitz_feats = turkewitz_feats[['q1_freq', 'q2_freq']]

    # Tables are aligned row-wise inside each group, then the groups are
    # joined on 'id'; overlapping column names receive the _x / _y suffixes.
    left = pd.concat([mephisto_feats, abhishek_feats, turkewitz_feats], axis=1)
    right = pd.concat([eda_feats, text_feats, srk_feats], axis=1)
    df = left.merge(right, on='id', how='left')
    print('Original shape:', df.shape)
    df = df.fillna(-999)

    # Grab the target before the duplicate-column pass, which may remove it.
    y = df['is_duplicate_y']

    # Duplicate columns are detected on the first 1000 rows only (cheap
    # heuristic): a column identical to an earlier one on that sample is
    # dropped from the full frame.
    sample = df.iloc[0:1000, :]
    sample = sample.T.drop_duplicates().T
    duplicate_cols = sorted(list(set(df.columns).difference(set(sample.columns))))
    print('Dropping duplicate columns:', duplicate_cols)
    df = df.drop(duplicate_cols, axis=1)
    print('Final shape:', df.shape)

    df = df.drop(['is_duplicate_x'], axis=1)
    if 'is_duplicate_y' in df.columns:
        # Guard against target leakage when the heuristic above did not
        # recognise the second label column as a duplicate.
        df = df.drop(['is_duplicate_y'], axis=1)

    # NOTE(review): the six leading columns are removed positionally;
    # presumably they are ids / raw question text - confirm against the CSVs.
    # Dropping on a fresh frame (not in place on a slice of ``df``) avoids the
    # chained-assignment pattern the original relied on.
    X = df.iloc[:, 6:].drop(['question1_y', 'question2_y'], axis=1)
    print('Train data loaded.', '\n', 'Training data shape:', X.shape)
    return X, y

def create_mergevalidset(data_1, data_2, datafeats, labels, validation_split=None):
    """Random train/validation split with symmetric (q1, q2) <-> (q2, q1) doubling.

    Each selected pair appears twice in the output: once as ``(data_1, data_2)``
    and once swapped, with its features and label repeated. This is applied to
    both the training and the validation part.

    Parameters
    ----------
    data_1, data_2 : array-like
        Row-aligned encodings of the first and second question.
    datafeats : array-like
        Row-aligned extra features.
    labels : array-like
        Row-aligned targets. A pandas Series is accepted and is indexed by
        position, regardless of its index.
    validation_split : float, optional
        Fraction of rows assigned to validation. When omitted, the
        notebook-level ``VALIDATION_SPLIT`` constant is used, as before.

    Returns
    -------
    tuple
        ``(data_1_train, data_2_train, dataf_train, labels_train,
        data_1_val, data_2_val, dataf_val, labels_val)``.

    Raises
    ------
    NameError
        If ``validation_split`` is omitted and ``VALIDATION_SPLIT`` is not
        defined in the notebook.
    """
    if validation_split is None:
        # Backward-compatible fallback to the global the original relied on.
        validation_split = VALIDATION_SPLIT

    # Positional indexing for every input, whatever container was passed in.
    data_1 = np.asarray(data_1)
    data_2 = np.asarray(data_2)
    datafeats = np.asarray(datafeats)
    labels = np.asarray(labels)

    # Note: this re-seeds NumPy's global RNG, exactly like the original.
    np.random.seed(1234)
    perm = np.random.permutation(len(data_1))
    n_train = int(len(data_1) * (1 - validation_split))
    idx_train, idx_val = perm[:n_train], perm[n_train:]

    def _mirrored(idx):
        # Stack the selected rows with their swapped counterparts.
        first = np.vstack((data_1[idx], data_2[idx]))
        second = np.vstack((data_2[idx], data_1[idx]))
        feats = np.vstack((datafeats[idx], datafeats[idx]))
        targets = np.concatenate((labels[idx], labels[idx]))
        return first, second, feats, targets

    return _mirrored(idx_train) + _mirrored(idx_val)

def create_stratified_split(data_1, data_2, datafeats, labels, test_size=0.2, random_state=111):
    """Stratified train/validation split applied jointly to all inputs.

    All four row-aligned inputs are partitioned in a single
    ``train_test_split`` call, so they are guaranteed to share the same
    row selection (the original obtained this by repeating the call three
    times with the same seed and stratification vector).

    Parameters
    ----------
    data_1, data_2 : array-like
        Row-aligned encodings of the first and second question.
    datafeats : array-like
        Row-aligned extra features.
    labels : array-like
        Targets; also used as the stratification vector.
    test_size : float, optional
        Fraction of rows held out for validation (default 0.2).
    random_state : int, optional
        Seed for the shuffle (default 111).

    Returns
    -------
    tuple
        ``(data1_tr, data2_tr, dataf_train, y_tr,
        data1_val, data2_val, dataf_val, y_val)``.
    """
    (data1_tr, data1_val,
     data2_tr, data2_val,
     dataf_train, dataf_val,
     y_tr, y_val) = train_test_split(data_1, data_2, datafeats, labels,
                                     stratify=labels,
                                     test_size=test_size,
                                     random_state=random_state)
    return data1_tr, data2_tr, dataf_train, y_tr, data1_val, data2_val, dataf_val, y_val

In [4]:
# Locations of the raw and cleaned train/test CSVs. These four names are not
# referenced by any other cell visible in this notebook.
src_train_raw = '../../../data/train.csv'
src_test_raw = '../../../data/test.csv'

src_train = '../../../features/df_train_spacylemmat_fullclean.csv'
src_test = '../../../features/df_test_spacylemmat_fullclean.csv'


# Question arrays are loaded from disk instead of being recomputed here.
# NOTE(review): presumably these were produced by spacy_encode with
# max_length=170 and tree_truncate=True (going by the file names) - confirm.
q1 = np.load('../../../features/q1train_spacylemmat_fullclean_170len_treetrunc.npy')
q2 = np.load('../../../features/q2train_spacylemmat_fullclean_170len_treetrunc.npy')
X, y = get_train()

# The spaCy pipeline is kept around because its vocab is passed to
# get_embeddings when the model is built.
nlp = en_core_web_md.load()
# Number of hand-crafted feature columns (recomputed in the settings cell).
ncols = X.shape[1]

#y = to_categorical(y)
# Stratified 80/20 split; all inputs share the same row partition.
tr_q1, tr_q2, tr_feats, y_tr, val_q1, val_q2, val_feats, y_val = create_stratified_split(q1, q2, X.values, y)

Original shape: (404290, 119)
Dropping duplicate columns: ['common_unigrams_len', 'common_unigrams_ratio', 'is_duplicate_y', 'len_q1', 'len_q2', 'm_q1_q2_tf_svd1', 'qid1_y', 'qid2_y', 'test_id_y']
Final shape: (404290, 110)
Train data loaded. 
 Training data shape: (404290, 101)


In [7]:
# Hyper-parameters. 'nr_epoch' and 'batch_size' are read by the training cell,
# 'tree_truncate' and 'gru_encode' by spacy_encode; the whole dict is also
# handed to build_model.
# NOTE(review): 'lr' and 'dropout' are presumably consumed inside build_model
# - confirm in decomposable_merge.
settings = {
    'lr': 0.0005,
    'dropout': 0.2,
    'batch_size': 128,
    'nr_epoch': 100,
    'tree_truncate': True,
    'gru_encode': False,
    }

max_length = 170
nr_hidden = 256
ncols = X.shape[1]
# Passed to build_model; spacy_encode uses shape[0] as the maximum length.
# NOTE(review): the meaning of the constant 2 is not visible here - presumably
# the number of output classes; confirm in build_model.
shape = (max_length, nr_hidden, 2, ncols)

# Per-class loss weights forwarded to model.fit when re_weight is True.
# NOTE(review): the two constants are unexplained in this notebook - presumably
# they rebalance the class prior of the training set; document their origin.
re_weight = True
if re_weight:
    class_weight = {0: 1.309028344, 1: 0.472001959}
else:
    class_weight = None

In [8]:
# Build the network from the spaCy vocabulary embeddings, the shape tuple and
# the hyper-parameter dict defined in the settings cell.
model = build_model(get_embeddings(nlp.vocab), shape, settings)

In [None]:
# Checkpoint only the weights with the lowest validation loss seen so far and
# stop once validation loss has not improved for 10 consecutive epochs.
callbacks = [
    ModelCheckpoint('decomposable_merged.h5',
                    monitor='val_loss',
                    verbose=0, save_best_only=True),
    EarlyStopping(monitor='val_loss', patience=10, verbose=1),
]

# `epochs` replaces the deprecated Keras 1 keyword `nb_epoch`; the notebook
# already depends on Keras 2-only imports (keras.layers.merge.concatenate).
model.fit([tr_q1, tr_q2, tr_feats], y_tr,
          validation_data=([val_q1, val_q2, val_feats], y_val),
          class_weight=class_weight,
          epochs=settings['nr_epoch'],
          batch_size=settings['batch_size'],
          callbacks=callbacks)

Train on 323432 samples, validate on 80858 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100