In [None]:
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="0"

import re
import csv
import codecs
import numpy as np
import pandas as pd
import gc
import sys
import gensim
import time

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from string import punctuation

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
from gensim.models import KeyedVectors
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers.merge import concatenate
from keras.models import Model
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.layers.advanced_activations import PReLU
from sklearn.model_selection import train_test_split

In [None]:
def merged_lstm():
    """Build and compile the three-input merged LSTM classifier.

    Two integer token sequences are passed through one shared, frozen
    embedding layer and one shared LSTM layer. A third, dense feature input is
    passed through a three-stage Dense/PReLU/BatchNorm/Dropout tower. The three
    resulting vectors are concatenated and mapped to a single sigmoid output.

    The function takes no arguments; it reads these module-level names at call
    time: ``nb_words``, ``embedding_dim``, ``word_embedding_matrix``,
    ``seq_length`` and ``ncols``.

    Returns:
        A Keras ``Model`` with inputs ``[sequence_1, sequence_2, dense]``,
        compiled with binary cross-entropy loss, the Adam optimizer and
        accuracy as the reported metric.
    """
    # Layers shared by both sequence branches (same weights for each input).
    shared_embedding = Embedding(
        nb_words,
        embedding_dim,
        weights=[word_embedding_matrix],
        input_length=seq_length,
        trainable=False,
    )
    shared_lstm = LSTM(
        128,
        dropout=0.25,
        recurrent_dropout=0.2,
        go_backwards=False,
        implementation=2,
    )

    def encode_sequence():
        # One sequence branch: int32 token ids -> embedding -> LSTM vector.
        tokens = Input(shape=(seq_length,), dtype='int32')
        embedded = shared_embedding(tokens)
        return tokens, shared_lstm(embedded)

    sequence_1_input, first_encoding = encode_sequence()
    sequence_2_input, second_encoding = encode_sequence()

    # Dense feature tower: (units, dropout rate) for each successive stage.
    dense_input = Input(shape=(ncols,))
    tower = dense_input
    for units, drop_rate in ((256, 0.4), (512, 0.2), (512, 0.2)):
        tower = Dense(units, kernel_initializer='he_normal')(tower)
        tower = PReLU()(tower)
        tower = BatchNormalization()(tower)
        tower = Dropout(drop_rate)(tower)

    # Join the two sequence encodings with the dense-feature representation.
    joint = concatenate([first_encoding, second_encoding, tower])
    joint = Dropout(0.25)(joint)
    joint = BatchNormalization()(joint)

    joint = Dense(256)(joint)
    joint = PReLU()(joint)
    joint = Dropout(0.25)(joint)
    joint = BatchNormalization()(joint)

    preds = Dense(1, activation='sigmoid')(joint)

    classifier = Model(
        inputs=[sequence_1_input, sequence_2_input, dense_input],
        outputs=preds,
    )
    classifier.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
    return classifier

In [None]:
# --- Model hyper-parameters (read as module-level globals by merged_lstm) ---
seq_length = 128
embedding_dim = 300
nb_words = 120594

# --- Data locations ---
# Root of the data tree. Set the QUORA_DATA_ROOT environment variable to run
# on another machine; the default is the original hard-coded location.
data_root = os.environ.get(
    'QUORA_DATA_ROOT',
    '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/data/')
if not data_root.endswith('/'):
    data_root += '/'

# These keep their trailing slash because paths are built by concatenation.
data_src = data_root + 'transformed/keras_tokenizer/'
q_src = data_root + 'features/NER/'
feats_src = data_root + 'features/uncleaned/'

# --- Training data ---
word_embedding_matrix = np.load(data_src + 'embedding_matrix.npy')

q1 = np.load(q_src + 'q1train_NER_128len.npy')
q2 = np.load(q_src + 'q2train_NER_128len.npy')
# Dense feature table is read from the current working directory.
X_train = pd.read_pickle('Xtrain_814colsBest.pkl', compression = 'bz2').astype('float32')
y = pd.read_csv(feats_src + 'the_1owl/owl_train.csv')['is_duplicate'].values

# Fail fast if the constants above disagree with what was actually loaded;
# otherwise the mismatch only shows up later, while building or fitting the model.
assert word_embedding_matrix.shape == (nb_words, embedding_dim), (
    'embedding matrix shape {} != (nb_words, embedding_dim) = {}'.format(
        word_embedding_matrix.shape, (nb_words, embedding_dim)))
assert q1.shape[1:] == (seq_length,) and q2.shape[1:] == (seq_length,), (
    'token arrays must have {} columns, got {} and {}'.format(
        seq_length, q1.shape, q2.shape))
assert len(q1) == len(q2) == len(X_train) == len(y), (
    'row counts differ: q1={}, q2={}, X_train={}, y={}'.format(
        len(q1), len(q2), len(X_train), len(y)))

# --- Optional test data (switch `test` to True to load it) ---
test = False
if test:
    q1_te = np.load(q_src + 'q1test_NER_128len.npy')
    q2_te = np.load(q_src + 'q2test_NER_128len.npy')
    X_test = pd.read_pickle('Xtest_814colsBest.pkl', compression = 'bz2').astype('float32')
    assert len(q1_te) == len(q2_te) == len(X_test), (
        'row counts differ: q1_te={}, q2_te={}, X_test={}'.format(
            len(q1_te), len(q2_te), len(X_test)))

In [None]:
def lstm_foldrun(X, q1, q2, y, X_test = None, q1_test = None, q2_test = None, start_fold = 0):
    """Train ``merged_lstm`` with 10-fold stratified CV and save its predictions.

    For every fold from ``start_fold`` to 9 a fresh model is built, fitted with
    early stopping on the validation fold, reset to the best checkpointed
    weights, and used to predict the validation fold (and the test set when
    one is supplied).

    Args:
        X: Dense feature matrix (DataFrame or array); rows align with
            ``q1``, ``q2`` and ``y``.
        q1, q2: Token-id arrays for the two sequences; must support indexing
            with integer index arrays.
        y: Binary labels. A DataFrame is reduced to its ``is_duplicate``
            column, a Series to its values.
        X_test: Optional dense features of the test set (DataFrame or array).
        q1_test, q2_test: Token-id arrays of the test set; required whenever
            ``X_test`` is given.
        start_fold: Index of the first fold to train (0-9). Earlier folds are
            skipped and keep zeros in the out-of-fold train predictions.

    Returns:
        None. Side effects:
        * best model per fold in ``checks/mergedlstm_10SKF_fold{i}.h5``;
        * out-of-fold train predictions in
          ``OOF_preds/train_MergedLSTMpreds_fold10.pkl``;
        * test predictions averaged over the trained folds in
          ``OOF_preds/test_MergedLSTMpreds_fold10.pkl`` (only with test data).
        Both pickles hold a single column named ``merged_lstm_prob``.

    Raises:
        ValueError: if ``start_fold`` is outside ``[0, 10)`` or ``X_test`` is
            given without ``q1_test``/``q2_test``.
    """
    n_splits = 10

    if isinstance(X, pd.DataFrame):
        X = X.values
    if isinstance(X_test, pd.DataFrame):
        X_test = X_test.values
    if isinstance(y, pd.DataFrame):
        y = y.is_duplicate.values
    if isinstance(y, pd.Series):
        y = y.values

    if not 0 <= start_fold < n_splits:
        raise ValueError('start_fold must be in [0, {}), got {}'.format(n_splits, start_fold))

    predict_test = X_test is not None
    if predict_test and (q1_test is None or q2_test is None):
        raise ValueError('q1_test and q2_test must be provided together with X_test.')

    # Materialise every split up front so that resuming from `start_fold`
    # reproduces exactly the same partition as a full run.
    skf = StratifiedKFold(n_splits = n_splits, random_state = 111, shuffle = True)
    splits = list(skf.split(X, y))

    losses = []
    # Buffers are sized from the data; the test buffer is only allocated when needed.
    oof_train = np.zeros(X.shape[0])
    oof_test = np.zeros((n_splits, X_test.shape[0])) if predict_test else None

    for fold in range(start_fold, n_splits):
        tr_index, val_index = splits[fold]
        X_tr, X_val = X[tr_index], X[val_index]
        q1_tr, q1_val = q1[tr_index], q1[val_index]
        q2_tr, q2_val = q2[tr_index], q2[val_index]
        y_tr, y_val = y[tr_index], y[val_index]

        t = time.time()
        print('Start training on fold: {}'.format(fold))
        checkpoint_path = 'checks/mergedlstm_10SKF_fold{}.h5'.format(fold)
        callbacks = [ModelCheckpoint(checkpoint_path,
                                     monitor='val_loss',
                                     verbose = 0, save_best_only = True),
                     EarlyStopping(monitor='val_loss', patience = 4, verbose = 1)]

        model = merged_lstm()
        model.fit([q1_tr, q2_tr, X_tr], y_tr, validation_data=([q1_val, q2_val, X_val], y_val),
                  epochs=200, batch_size=512, callbacks = callbacks)

        # Early stopping leaves the model `patience` epochs past its best
        # validation loss; predict with the checkpointed best weights instead.
        model.load_weights(checkpoint_path)

        print('Predicting training set.')
        # predict() returns shape (n, 1); flatten so it fits the 1-D buffers.
        val_pred = model.predict([q1_val, q2_val, X_val], batch_size = 64).ravel()
        oof_train[val_index] = val_pred
        score = log_loss(y_val, val_pred)
        losses.append(score)

        if predict_test:
            print('Predicting test set.')
            test_preds = model.predict([q1_test, q2_test, X_test], batch_size = 64)
            oof_test[fold, :] = test_preds.ravel()

        print('Final score for fold {} :'.format(fold), score, '\n',
              'Time it took to train and predict on fold:', time.time() - t, '\n')
        del X_tr, X_val, q1_tr, q1_val, q2_tr, q2_val, model
        gc.collect()

    # File names keep the historical "fold10" suffix (the number of folds).
    oof_train = pd.DataFrame(oof_train, columns = ['merged_lstm_prob'])
    oof_train.to_pickle('OOF_preds/train_MergedLSTMpreds_fold{}.pkl'.format(n_splits))
    if predict_test:
        # Average only the folds trained in this run; skipped rows are all zero.
        test_mean = oof_test[start_fold:].mean(axis = 0)
        oof_test = pd.DataFrame(test_mean, columns = ['merged_lstm_prob'])
        oof_test.to_pickle('OOF_preds/test_MergedLSTMpreds_fold{}.pkl'.format(n_splits))
    print('Mean logloss for model in 10-folds SKF:', np.array(losses).mean(axis = 0))
    return

In [None]:
# Width of the dense feature input. merged_lstm reads `ncols` as a global, so
# it has to be defined before the first model is built.
ncols = X_train.shape[1]

# When the loading cell ran with `test = True`, hand the test arrays to the
# fold runner so they are actually predicted; otherwise train-only OOF run.
if test:
    lstm_foldrun(X_train, q1, q2, y, X_test = X_test, q1_test = q1_te, q2_test = q2_te)
else:
    lstm_foldrun(X_train, q1, q2, y)