In [18]:
import os
import time

import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import log_loss

In [19]:
def _load_split_preds(tag, src_dir=None):
    """Stack every saved prediction array for one data split into a matrix.

    Parameters
    ----------
    tag : str
        Substring a filename must contain to be picked up
        ('train', 'val' or 'test').
    src_dir : str, optional
        Directory to scan. When omitted, the notebook-level ``src`` variable
        is used (looked up at call time, so it may be defined in a later cell).

    Returns
    -------
    numpy.ndarray
        ``np.array(loaded_arrays).T``. For 1-D prediction files this yields
        one row per sample and one column per file.
    """
    if src_dir is None:
        src_dir = src

    # os.listdir() returns names in arbitrary order. Sorting on the filename
    # with the split tag removed puts files that differ only by their split
    # tag in the same column position for every split; the full name is a
    # tie-breaker so the order is always deterministic.
    names = sorted(
        (name for name in os.listdir(src_dir) if tag in name),
        key=lambda name: (name.replace(tag, ''), name),
    )
    # os.path.join works whether or not src_dir ends with a path separator.
    return np.array([np.load(os.path.join(src_dir, name)) for name in names]).T


def load_train_pred(src_dir=None):
    """Return the stacked predictions of all files whose name contains 'train'."""
    return _load_split_preds('train', src_dir)


def load_val_pred(src_dir=None):
    """Return the stacked predictions of all files whose name contains 'val'."""
    return _load_split_preds('val', src_dir)


def load_test_pred(src_dir=None):
    """Return the stacked predictions of all files whose name contains 'test'."""
    return _load_split_preds('test', src_dir)

In [20]:
# Directory that the load_*_pred helpers scan for saved model predictions.
src = 'model_predictions/'
# Machine-specific absolute path to the feature files; it is not referenced
# by any of the cells visible here.
feats_src = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/data/features/uncleaned/'

# Stacked model predictions for the training and validation splits.
X_tr, X_val = load_train_pred(), load_val_pred()

# Matching label arrays, read from the current working directory.
y_tr, y_val = np.load('y_tr.npy'), np.load('y_val.npy')

In [21]:
# Booster parameters handed to xgb.train for the binary log-loss model
# fitted on the stacked predictions.
params = {
    'seed': 1337,
    'colsample_bytree': 0.48,
    'silent': 1,
    'subsample': 0.74,
    'eta': 0.01,
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'max_depth': 4,
    'min_child_weight': 20,
    'nthread': 8,
    'tree_method': 'hist',
    }

# Wall-clock timer covering DMatrix construction, training and prediction.
t = time.time()
dtrain = xgb.DMatrix(X_tr, label = y_tr)
dval = xgb.DMatrix(X_val, label = y_val)
# The last watchlist entry ('valid') is the one early stopping monitors.
watchlist = [(dtrain, 'train'), (dval, 'valid')]

print('Start training...')
# 100000 is only an upper bound on boosting rounds; training stops once the
# validation log-loss has not improved for 100 consecutive rounds.
gbm = xgb.train(params, dtrain, 100000, watchlist, 
                early_stopping_rounds = 100, verbose_eval = 100)

print('Start predicting...')
# Restrict prediction to the trees up to the best early-stopping iteration.
# Older XGBoost exposes this as `best_ntree_limit` / `ntree_limit=`; releases
# that dropped those names use `best_iteration` with `iteration_range=`.
if hasattr(gbm, 'best_ntree_limit'):
    best_iter_kwargs = {'ntree_limit': gbm.best_ntree_limit}
else:
    best_iter_kwargs = {'iteration_range': (0, gbm.best_iteration + 1)}

#train_pred = gbm.predict(dtrain, **best_iter_kwargs)
# Reuse dval rather than building a second DMatrix from the same X_val.
val_pred = gbm.predict(dval, **best_iter_kwargs)
score = log_loss(y_val, val_pred)
print('Final score:', score, '\n', 'Time it took to train and predict:', time.time() - t)
#gbm.save_model('saved_models/XGB/XGB_500cols_furtherExperiments.txt')

Start training...
[0]	train-logloss:0.689324	valid-logloss:0.689947
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 100 rounds.
[100]	train-logloss:0.310504	valid-logloss:0.375833
[200]	train-logloss:0.181208	valid-logloss:0.288919
[300]	train-logloss:0.118577	valid-logloss:0.261873
[400]	train-logloss:0.092022	valid-logloss:0.258012
Stopping. Best iteration:
[393]	train-logloss:0.093565	valid-logloss:0.257841

Start predicting...
Final score: 0.257840969482 
 Time it took to train and predict: 9.972392559051514
