In [None]:
import gc
import os
import time

# These are set before the Keras imports further down in this cell.
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="0"

import numpy as np
import pandas as pd

from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import RobustScaler
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers.merge import concatenate
from keras.models import Model, Sequential, load_model
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.layers.advanced_activations import PReLU
from keras.optimizers import Adam, Nadam, SGD


from models_utils_fe import *
from models_utils_skf import *

In [None]:
def dense_model(ncols):
    """Build and compile the fully-connected binary classifier.

    The network is a stack of four hidden blocks, each
    Dense(he_normal) -> PReLU -> [BatchNormalization] -> Dropout(0.4),
    with widths 256, 512, 512, 512. The last hidden block has no
    BatchNormalization. A single sigmoid unit produces the output.

    Parameters
    ----------
    ncols : int
        Number of input features (width of the input layer).

    Returns
    -------
    Model
        Keras model compiled with binary cross-entropy, Adam(lr=1e-4) and
        the 'acc' metric.
    """
    # (units, batch-norm after activation?) for every hidden block, in order.
    hidden_blocks = (
        (256, True),
        (512, True),
        (512, True),
        (512, False),
    )

    features_in = Input(shape = (ncols,))
    x = features_in
    for n_units, with_batchnorm in hidden_blocks:
        x = Dense(n_units, kernel_initializer = 'he_normal')(x)
        x = PReLU()(x)
        if with_batchnorm:
            x = BatchNormalization()(x)
        x = Dropout(0.4)(x)
    probability = Dense(1, activation='sigmoid')(x)

    net = Model(inputs=[features_in], outputs=probability)
    net.compile(loss='binary_crossentropy', optimizer=Adam(lr = 1e-4), metrics=['acc'])
    return net

def train_mlp(X, y, X_test = None, name = 'MLP_1sttry', save = True):
    """Train `dense_model` with 10-fold stratified CV and collect out-of-fold predictions.

    For every fold a fresh network is built and fitted for one epoch
    (batch size 128) with ModelCheckpoint / EarlyStopping callbacks on
    `val_loss`. Predictions are taken from the in-memory model right after
    `fit`; the checkpoint written to disk is not reloaded here.

    Parameters
    ----------
    X : pandas.DataFrame or numpy.ndarray
        Training features, one row per sample.
    y : pandas.DataFrame, pandas.Series or numpy.ndarray
        Binary labels. A DataFrame must have an `is_duplicate` column.
    X_test : pandas.DataFrame or numpy.ndarray, optional
        Test features. When given, every fold model predicts on them and
        the fold predictions are averaged.
    name : str
        Used in the checkpoint directory/file names, the prediction column
        name (`<name>_prob`) and the pickle file names.
    save : bool
        When true, the out-of-fold train predictions are pickled under
        `OOF_preds/train/`, and the averaged test predictions under
        `OOF_preds/test/` if `X_test` was supplied.

    Returns
    -------
    Model
        The model fitted on the last fold.
    """
    n_folds = 10

    if isinstance(X, pd.DataFrame):
        X = X.values
    if isinstance(X_test, pd.DataFrame):
        X_test = X_test.values
    if isinstance(y, pd.DataFrame):
        y = y.is_duplicate.values
    if isinstance(y, pd.Series):
        y = y.values
    print('Running MLP model.')

    skf = StratifiedKFold(n_splits = n_folds, random_state = 111, shuffle = True)
    os.makedirs('saved_models/MLP/SKF/{}'.format(name), exist_ok = True)

    # Derive sizes from the arguments rather than from notebook globals or a
    # hard-coded test-set length.
    ncols = X.shape[1]
    n_test = 0 if X_test is None else X_test.shape[0]

    losses = []
    oof_train = np.zeros(X.shape[0])
    oof_test = np.zeros((n_folds, n_test))
    model = None
    for i, (tr_index, val_index) in enumerate(skf.split(X, y)):
        X_tr, X_val = X[tr_index], X[val_index]
        y_tr, y_val = y[tr_index], y[val_index]
        t = time.time()

        callbacks = [ModelCheckpoint('saved_models/MLP/SKF/{}/{}_fold{}.h5'.format(name, name, i),
                                     monitor='val_loss',
                                     verbose = 0, save_best_only = True),
                     EarlyStopping(monitor='val_loss', patience = 7, verbose = 1)]

        model = dense_model(ncols)
        model.fit(X_tr, y_tr, validation_data=(X_val, y_val),
                  epochs=1, batch_size=128,
                  verbose=1, shuffle=True, callbacks=callbacks)

        # predict() returns shape (n, 1); take the single column so it can be
        # written into the 1-D out-of-fold vector.
        val_pred = model.predict(X_val)[:, 0]
        oof_train[val_index] = val_pred
        score = log_loss(y_val, val_pred)
        losses.append(score)
        if X_test is not None:
            test_preds = model.predict(X_test, batch_size = 128)
            oof_test[i, :] = test_preds[:, 0]
        print('Final score:', score, '\n', 'Time it took to train and predict:', time.time() - t)

    print('Mean logloss for model in 10-folds SKF:', np.array(losses).mean(axis = 0), '\n')

    prob_col = '{}_prob'.format(name)
    oof_train = pd.DataFrame({prob_col: oof_train})
    oof_test = pd.DataFrame({prob_col: oof_test.mean(axis = 0)})
    if save:
        os.makedirs('OOF_preds/train', exist_ok = True)
        oof_train.to_pickle('OOF_preds/train/train_preds_{}.pkl'.format(name))
        # Without a test set there is nothing meaningful to store; skipping the
        # write avoids clobbering an existing prediction file with placeholders.
        if X_test is not None:
            os.makedirs('OOF_preds/test', exist_ok = True)
            oof_test.to_pickle('OOF_preds/test/test_preds_{}.pkl'.format(name))
    return model

In [None]:
def train_mlp_predict(X, q1, q2, y, X_test = None, q1_test = None, q2_test = None,
                      name = 'MLP_1sttry', save = True):
    """Rebuild out-of-fold and test predictions from previously saved fold models.

    The same 10-fold stratified split (seed 111, shuffled) is regenerated and,
    for each fold `i`, the model stored at
    `saved_models/LSTM/SKF/<name>/<name>_fold<i>.h5` is loaded and asked to
    predict on that fold's validation rows and, optionally, on the test set.
    Each model is fed three inputs in the order `[q1, q2, X]`.

    Parameters
    ----------
    X : pandas.DataFrame or numpy.ndarray
        Dense training features.
    q1, q2 : numpy.ndarray
        Per-question training inputs, indexed by row like `X`.
    y : pandas.DataFrame, pandas.Series or numpy.ndarray
        Binary labels. A DataFrame must have an `is_duplicate` column.
    X_test, q1_test, q2_test : optional
        Test-set counterparts of `X`, `q1`, `q2`. If `X_test` is given the
        other two are required.
    name : str
        Identifies the saved models, the prediction column (`<name>_prob`)
        and the output pickle names.
    save : bool
        When true, train predictions are pickled under `OOF_preds/train/` and,
        if a test set was supplied, the averaged test predictions under
        `OOF_preds/test/` (file name suffixed `_transformed`).

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Out-of-fold train predictions and fold-averaged test predictions.
        The test frame has zero rows when no test set was supplied.

    Raises
    ------
    ValueError
        If `X_test` is given without `q1_test` / `q2_test`.

    Notes
    -----
    Test predictions are passed through `transform`, which is not defined in
    this notebook; presumably it comes from the `models_utils_*` wildcard
    imports -- TODO confirm.
    """
    n_folds = 10

    if isinstance(X_test, pd.DataFrame):
        X_test = X_test.values
    if isinstance(X, pd.DataFrame):
        X = X.values
    if isinstance(y, pd.DataFrame):
        y = y.is_duplicate.values
    if isinstance(y, pd.Series):
        y = y.values
    if X_test is not None and (q1_test is None or q2_test is None):
        raise ValueError('q1_test and q2_test are required when X_test is given.')
    print('Running predictions on NN model.')

    skf = StratifiedKFold(n_splits = n_folds, random_state = 111, shuffle = True)

    # Size the test buffer from the data actually passed in.
    n_test = 0 if X_test is None else X_test.shape[0]
    losses = []
    oof_train = np.zeros(X.shape[0])
    oof_test = np.zeros((n_folds, n_test))

    for i, (_, val_index) in enumerate(skf.split(X, y)):
        X_val, q1_val, q2_val = X[val_index], q1[val_index], q2[val_index]
        y_val = y[val_index]

        print(i)
        t = time.time()
        model = load_model('saved_models/LSTM/SKF/{}/{}_fold{}.h5'.format(name, name, i))

        # predict() returns shape (n, 1); keep the single column so it fits the
        # 1-D out-of-fold vector, and write it to THIS fold's validation rows.
        val_pred = model.predict([q1_val, q2_val, X_val], batch_size = 256)[:, 0]
        print('Validation predictions done.')
        oof_train[val_index] = val_pred
        losses.append(log_loss(y_val, val_pred))

        if X_test is not None:
            # Use the test inputs passed as arguments, not notebook globals.
            test_preds = model.predict([q1_test, q2_test, X_test], batch_size = 256)[:, 0]
            print(test_preds.shape)
            test_preds = np.apply_along_axis(transform, 0, test_preds)
            oof_test[i, :] = test_preds
        print('Time it took to train and predict:', time.time() - t)

    print('Mean logloss for model in 10-folds SKF:', np.array(losses).mean(axis = 0), '\n')

    prob_col = '{}_prob'.format(name)
    oof_test = pd.DataFrame({prob_col: oof_test.mean(axis = 0)})
    oof_train = pd.DataFrame({prob_col: oof_train})
    if save:
        os.makedirs('OOF_preds/train', exist_ok = True)
        oof_train.to_pickle('OOF_preds/train/train_preds_{}.pkl'.format(name))
        # Nothing meaningful to store without a test set; skipping the write
        # avoids replacing an existing prediction file with placeholders.
        if X_test is not None:
            os.makedirs('OOF_preds/test', exist_ok = True)
            oof_test.to_pickle('OOF_preds/test/test_preds_{}_transformed.pkl'.format(name))
    return oof_train, oof_test

In [None]:
# Inputs for re-scoring the saved fold models: labels, per-question arrays
# and the dense feature matrices. The source directories are machine-specific
# absolute paths.
q_src = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/data/features/NER/'
feats_src = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/data/features/uncleaned/'

# Only the label column of this CSV is used; the full frame stays bound to
# `xgb_feats`.
xgb_feats = pd.read_csv(feats_src + '/the_1owl/owl_train.csv')
y_train = xgb_feats[['is_duplicate']]

# Per-question arrays for train (q1, q2) and test (q1_te, q2_te).
q1, q2, q1_te, q2_te = (
    np.load(q_src + npy_name)
    for npy_name in ('q1train_NER_128len.npy',
                     'q2train_NER_128len.npy',
                     'q1test_NER_128len.npy',
                     'q2test_NER_128len.npy')
)

# Dense features: cast to float32, then NaN -> -999 and +inf -> 999
# (-inf is not touched by these replacements).
X_train = (
    pd.read_pickle('Xtrain_866BestColsDropped.pkl')
    .astype('float32')
    .replace(np.nan, -999)
    .replace(np.inf, 999)
)
print(X_train.shape)

# Switch for loading the test features with the same clean-up.
test = True
if test:
    X_test = (
        pd.read_pickle('Xtest_866BestColsDropped.pkl')
        .astype('float32')
        .replace(np.nan, -999)
        .replace(np.inf, 999)
    )
    print(X_test.shape)

In [None]:
# Re-score the saved fold models named 'LSTM_merged866cols' on train and test.
tr, te = train_mlp_predict(
    X = X_train, q1 = q1, q2 = q2, y = y_train,
    X_test = X_test, q1_test = q1_te, q2_test = q2_te,
    name = 'LSTM_merged866cols',
)

In [None]:
# Inputs for training the dense network: feature matrix and labels.
# The source directories are machine-specific absolute paths.
src = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/scripts/features/'
feats_src = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/data/features/uncleaned/'

# Dense features: cast to float32, then NaN -> -999 and +inf -> 999
# (-inf is not touched by these replacements).
X_train = (
    pd.read_pickle('Xtrain_866BestColsDropped.pkl')
    .astype('float32')
    .replace(np.nan, -999)
    .replace(np.inf, 999)
)
print(X_train.shape)

# Test features are not loaded for this run; flip the switch to load them.
test = False
if test:
    X_test = (
        pd.read_pickle('Xtest_866BestColsDropped.pkl')
        .astype('float32')
        .replace(np.nan, -999)
        .replace(np.inf, 999)
    )
    print(X_test.shape)

# Only the label column of this CSV is needed.
xgb_feats = pd.read_csv(feats_src + '/the_1owl/owl_train.csv')
y_train = xgb_feats[['is_duplicate']]

# Drop the full CSV frame and ask the collector to run. `gc` must be imported
# explicitly in the notebook's import cell rather than relied on through a
# wildcard import.
del xgb_feats
gc.collect()

In [None]:
# Train with the defaults name='MLP_1sttry', save=True; no test set is passed.
train_mlp(X = X_train, y = y_train)