In [1]:
import os
import time
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import log_loss
from bayes_opt import BayesianOptimization

from models_utils_skf import *

In [2]:
def load_oof(mode = 'train', src = None):
    """Load all out-of-fold prediction files for `mode` and join them column-wise.

    Every file in `<src>/<mode>/` whose name contains '.pkl' or '.csv' and does
    not contain 'stack' is read (pickle or CSV respectively), its 'id' /
    'test_id' columns are dropped when present, and the remaining columns are
    concatenated side by side in sorted-filename order.

    Parameters
    ----------
    mode : str
        Name of the sub-directory of `src` to read from (default 'train').
    src : str or None
        Directory holding the per-mode sub-directories. When None, the
        original hard-coded project path is used.
        NOTE(review): that default is an absolute, machine-specific path.

    Returns
    -------
    pandas.DataFrame
        One column block per loaded file; an empty DataFrame when no file
        matches.
    """
    if src is None:
        src = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/scripts/models/OOF_preds/'
    mode_dir = os.path.join(src, mode)

    # The parentheses matter: `and` binds tighter than `or`, so without them
    # the 'stack' exclusion would only apply to CSV files.
    files = sorted(x for x in os.listdir(mode_dir)
                   if ('.pkl' in x or '.csv' in x) and 'stack' not in x)
    print('\n', 'Loading OOF preds:', files, '\n', 'Numer of files to load:', len(files), '\n')

    frames = []
    for fname in files:
        path = os.path.join(mode_dir, fname)
        if '.csv' in fname:
            df_preds = pd.read_csv(path)
        else:
            # NOTE(review): unpickling runs arbitrary code; only load files
            # that were produced by trusted runs.
            df_preds = pd.read_pickle(path)
        # errors='ignore' makes the drop a no-op for identifier columns that
        # are absent, and avoids mutating the loaded frame in place.
        frames.append(df_preds.drop(['id', 'test_id'], axis = 1, errors = 'ignore'))

    if not frames:
        # pd.concat raises on an empty list; keep the "nothing found" result
        # an empty frame.
        return pd.DataFrame()
    # Single concat instead of growing the frame once per file.
    return pd.concat(frames, axis = 1)

def transform(x):
    """Rescale a probability by re-weighting its positive and negative parts.

    The positive part `x` is weighted by 0.165 / 0.37 and the negative part
    `1 - x` by (1 - 0.165) / (1 - 0.37); the result is the weighted positive
    share. Works on scalars and, being plain arithmetic, on NumPy arrays.

    NOTE(review): 0.165 and 0.37 look like target / source positive-class
    rates -- confirm where these numbers come from.
    """
    pos_weight = 0.165 / 0.37
    neg_weight = (1 - 0.165) / (1 - 0.37)
    weighted_pos = pos_weight * x
    weighted_neg = neg_weight * (1 - x)
    return weighted_pos / (weighted_pos + weighted_neg)

def inv_pred_transform(preds):
    """Undo the re-weighting that `transform` applies.

    Uses the same two weights as `transform` but swapped between the positive
    and negative parts, which makes it the algebraic inverse of that mapping.
    Works on scalars and on NumPy arrays.
    """
    pos_weight = 0.165 / 0.37
    neg_weight = (1 - 0.165) / (1 - 0.37)
    # The weights trade places relative to `transform`.
    scaled_pos = neg_weight * preds
    scaled_neg = pos_weight * (1 - preds)
    return scaled_pos / (scaled_pos + scaled_neg)

def testOOF_transform(X_test2):
    """Return a copy of `X_test2` with `inv_pred_transform` applied to every value.

    The input frame is not modified; each column of the copy is replaced, by
    position, with its element-wise transformed version.
    """
    converted = X_test2.copy()
    for col_pos in range(len(converted.columns)):
        column = converted.iloc[:, col_pos]
        converted.iloc[:, col_pos] = column.apply(inv_pred_transform)
    return converted

def predict_test_lgbm(test_preds, model_name, transform_preds = True):
    """Write a submission CSV containing `test_preds`.

    Reads `sample_submission.csv` from the submissions directory, overwrites
    its `is_duplicate` column with `test_preds`, optionally passes every value
    through `transform`, and saves the result as `<model_name>.csv` in the
    same directory.

    Parameters
    ----------
    test_preds : array-like
        Values assigned to the `is_duplicate` column.
    model_name : str
        Base name (without extension) of the CSV that is written.
    transform_preds : bool
        When true, apply `transform` element-wise before saving.

    Returns
    -------
    None
    """
    print('Predicting on test set with LightGBM.')
    # NOTE(review): absolute, machine-specific location.
    sub_src = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/submissions/'
    submission = pd.read_csv(sub_src + 'sample_submission.csv')
    submission['is_duplicate'] = test_preds
    if transform_preds:
        submission['is_duplicate'] = submission['is_duplicate'].apply(transform)
    out_path = sub_src + '{}.csv'.format(model_name)
    submission.to_csv(out_path, index = False)

In [3]:
def lgb_foldrun_BO(max_depth, min_data_in_leaf, subsample, colsample_bytree, feature_fraction):
    """Bayesian-optimisation objective: negated mean 10-fold log-loss of LightGBM.

    Reads the module-level arrays `X` and `y`, trains one LightGBM booster per
    stratified fold with early stopping on that fold's validation split, and
    scores the validation predictions with `log_loss`.

    Parameters
    ----------
    max_depth, min_data_in_leaf : float
        Truncated to int before being handed to LightGBM.
    subsample, colsample_bytree, feature_fraction : float
        Clamped to the interval [0, 1] before being handed to LightGBM.

    Returns
    -------
    float
        Minus the mean of the per-fold log-losses (the optimiser maximises).
    """
    skf = StratifiedKFold(n_splits = 10, random_state = 111, shuffle = True)
    # NOTE(review): the LightGBM parameter reference lists `colsample_bytree`
    # as an alias of `feature_fraction` and `subsample` as an alias of
    # `bagging_fraction`. Both spellings of each pair are passed below, so two
    # of the searched dimensions may be ignored -- confirm against the
    # installed LightGBM version before trusting the search results.
    params = {
        'task' : 'train',
        'boosting_type' : 'gbdt',
        'objective' : 'binary',
        'metric' : {'binary_logloss'},
        'learning_rate' : 0.01,
        'feature_fraction' : max(min(feature_fraction, 1), 0),
        'bagging_fraction': 0.9,
        'bagging_freq': 100,
        'num_leaves' : 255,
        'max_depth': int(max_depth),
        'min_data_in_leaf': int(min_data_in_leaf),
        'subsample': max(min(subsample, 1), 0),
        'colsample_bytree': max(min(colsample_bytree, 1), 0),
        'silent': 1,
        'random_state': 1337,
        'verbose': 1,
        'nthread': 4,
    }
    losses = []
    for fold, (tr_index, val_index) in enumerate(skf.split(X, y), start = 1):
        X_tr, X_val = X[tr_index], X[val_index]
        y_tr, y_val = y[tr_index], y[val_index]

        lgb_train = lgb.Dataset(X_tr, y_tr)
        lgb_val = lgb.Dataset(X_val, y_val)
        print('Start training on fold: {}'.format(fold))
        # The same split drives early stopping and produces the reported
        # score for the fold.
        gbm = lgb.train(params, lgb_train, num_boost_round = 100000, valid_sets = lgb_val,
                        early_stopping_rounds = 200, verbose_eval = False)
        print('Start predicting...')
        val_pred = gbm.predict(X_val, num_iteration=gbm.best_iteration)
        score = log_loss(y_val, val_pred)
        losses.append(score)
        print('Final score for fold {} :'.format(fold), score, '\n')

    mean_loss = np.array(losses).mean(axis = 0)
    print('Mean logloss for model in 10-folds SKF:', mean_loss)
    return -mean_loss



In [9]:
# Build one averaged, transformed prediction column from the H2O GBM test
# predictions and store it alongside the other OOF prediction files.
agbm = np.load('OOF_preds/h2o_gbm_test.npy')
# After the transpose, axis 1 is the axis that gets averaged below.
# NOTE(review): presumably rows are test samples and columns are individual
# model runs at this point -- confirm against the producer of the .npy file.
agbm = agbm.T
# `transform` is plain element-wise arithmetic, so it can be applied to the
# whole array at once; np.apply_along_axis would call it once per row in a
# Python loop and yield the same values.
agbm2 = transform(agbm)
agbm3 = agbm2.mean(axis = 1)
agbm3 = pd.DataFrame(agbm3)
agbm3.columns = ['is_duplicate_AlexGBM']
agbm3.to_pickle('OOF_preds/test_preds_GBMAlex.pkl')

Step |   Time |      Value |   colsample_bytree |   feature_fraction |   max_depth |   min_data_in_leaf |   subsample | 

1 | 06m13s |   -0.17474 |             0.6913 |             0.6128 |      3.7855 |            10.9742 |      0.7600 | 

In [20]:
# Stacking inputs: first-level OOF predictions as features, training labels
# as target. `lgb_foldrun_BO` reads both through the module-level names X / y.
X = load_oof()
y = pd.read_pickle('y_train.pkl')

# The fold loop indexes X and y with integer position arrays, so both are
# reduced to plain NumPy arrays here.
if isinstance(X, pd.DataFrame):
    X = X.values
if isinstance(y, pd.DataFrame):
    y = y.is_duplicate.values
elif isinstance(y, pd.Series):
    y = y.values

# Budget: random initial probes followed by the same number of guided steps.
num_iter = 15
init_points = 15

# Search bounds; the keys must match the keyword arguments of lgb_foldrun_BO.
lgbBO = BayesianOptimization(
    lgb_foldrun_BO,
    {
        'max_depth': (2, 5),
        'min_data_in_leaf': (5, 25),
        'subsample': (0.6, 0.88),
        'colsample_bytree': (0.4, 0.75),
        'feature_fraction': (0.4, 0.85),
    },
)

lgbBO.maximize(init_points=init_points, n_iter=num_iter)
# NOTE(review): `res['max']['max_val']` depends on the installed bayes_opt
# version's result layout -- verify before upgrading the package.
print('lgb: %f' % lgbBO.res['max']['max_val'])


 Loading OOF preds: ['train_preds_866cols_xgbparams1.pkl', 'train_preds_866cols_xgbparams2.pkl', 'train_preds_866cols_xgbparams3.pkl', 'train_preds_866cols_xgbparams4.pkl', 'train_preds_AttentionClean_preds.pkl', 'train_preds_AttentionNER_preds.pkl', 'train_preds_MLP_1sttry.pkl', 'train_preds_lgb_0.1807_20170603_1943.csv', 'train_preds_lgb_0.1842_20170527_1246.csv', 'train_preds_newNetworks_currentBest.pkl', 'train_preds_xgb_0.1800_20170601_1113.csv', 'train_preds_xgb_0.1808_20170602_0246.csv', 'train_preds_xgb_0.1812_20170603_0506.csv', 'train_preds_xgb_0.1813_20170530_0249.csv'] 
 Numer of files to load: 14 

[31mInitialization[0m
[94m------------------------------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   colsample_bytree |   feature_fraction |   max_depth |   min_data_in_leaf |   subsample | 
Start training on fold: 1
Start predicting...
Final score for fold 1 : 0.171137294402 

Start training o

Start predicting...
Final score for fold 1 : 0.171426312256 

Start training on fold: 2
Start predicting...
Final score for fold 2 : 0.1789686155 

Start training on fold: 3
Start predicting...
Final score for fold 3 : 0.173524076397 

Start training on fold: 4
Start predicting...
Final score for fold 4 : 0.172513515947 

Start training on fold: 5
Start predicting...
Final score for fold 5 : 0.177672025578 

Start training on fold: 6
Start predicting...
Final score for fold 6 : 0.175631515301 

Start training on fold: 7
Start predicting...
Final score for fold 7 : 0.174751446517 

Start training on fold: 8
Start predicting...
Final score for fold 8 : 0.171603730858 

Start training on fold: 9
Start predicting...
Final score for fold 9 : 0.175562407082 

Start training on fold: 10
Start predicting...
Final score for fold 10 : 0.176133759698 

Mean logloss for model in 10-folds SKF: 0.174778740513
    8 | 04m54s |   -0.17478 |             0.5895 |             0.7823 |      4.5504 |      

Start predicting...
Final score for fold 10 : 0.176159352256 

Mean logloss for model in 10-folds SKF: 0.174863955801
   15 | 04m57s |   -0.17486 |             0.5702 |             0.5194 |      2.1807 |            15.6990 |      0.6125 | 
[31mBayesian Optimization[0m
[94m------------------------------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   colsample_bytree |   feature_fraction |   max_depth |   min_data_in_leaf |   subsample | 
Start training on fold: 1
Start predicting...
Final score for fold 1 : 0.171348305473 

Start training on fold: 2
Start predicting...
Final score for fold 2 : 0.178960657172 

Start training on fold: 3
Start predicting...
Final score for fold 3 : 0.17353557041 

Start training on fold: 4
Start predicting...
Final score for fold 4 : 0.172448259658 

Start training on fold: 5
Start predicting...
Final score for fold 5 : 0.177711695705 

Start training on fold: 6
Start predic

Start predicting...
Final score for fold 4 : 0.172555599431 

Start training on fold: 5
Start predicting...
Final score for fold 5 : 0.177872869835 

Start training on fold: 6
Start predicting...
Final score for fold 6 : 0.175848432284 

Start training on fold: 7
Start predicting...
Final score for fold 7 : 0.174998676306 

Start training on fold: 8
Start predicting...
Final score for fold 8 : 0.171684288619 

Start training on fold: 9
Start predicting...
Final score for fold 9 : 0.175647007319 

Start training on fold: 10
Start predicting...
Final score for fold 10 : 0.176345368805 

Mean logloss for model in 10-folds SKF: 0.174901122372
   23 | 04m50s |   -0.17490 |             0.5535 |             0.4374 |      4.8727 |            10.8365 |      0.7353 | 
Start training on fold: 1
Start predicting...
Final score for fold 1 : 0.171260826187 

Start training on fold: 2
Start predicting...
Final score for fold 2 : 0.179104763763 

Start training on fold: 3
Start predicting...
Final sco