In [1]:
import gc
import time
import pandas as pd
import numpy as np
import xgboost as xgb
import lightgbm as lgb

from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from bayes_opt import BayesianOptimization

In [16]:
def xgb_bo(max_depth, min_child_weight, subsample, colsample_bytree):
    
    t = time.time()
    params = {
    'seed': 1337,
    'colsample_bytree': max(min(colsample_bytree, 1), 0),
    'silent': 1,
    'subsample': max(min(subsample, 1), 0),
    'eta': 0.05,
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'max_depth': int(max_depth),
    'min_child_weight': int(min_child_weight),
    'nthread': 4,
    'tree_method': 'hist',
    }
    
    dtrain = xgb.DMatrix(X_tr, label = y_tr)
    dval = xgb.DMatrix(X_val, label = y_val)
    watchlist = [(dtrain, 'train'), (dval, 'valid')]

    print('Start training...')
    gbm = xgb.train(params, dtrain, 100000, watchlist, 
                    early_stopping_rounds = 100, verbose_eval = 100)
    val_pred = gbm.predict(xgb.DMatrix(X_val), ntree_limit=gbm.best_ntree_limit)
    score = log_loss(y_val, val_pred)
    print('Final score:', score, '\n', 'Time it took to train and predict:', time.time() - t)
    return -score


def lgb_bo(max_depth, min_data_in_leaf, subsample, colsample_bytree, feature_fraction):
    params = {
        'task' : 'train',
        'boosting_type' : 'gbdt',
        'objective' : 'binary',
        'metric' : {'binary_logloss'},
        'learning_rate' : 0.05,
        'feature_fraction' : max(min(feature_fraction, 1), 0),
        'bagging_fraction': 0.9,
        'bagging_freq': 100,
        'num_leaves' : 255,
        'max_depth': int(max_depth),
        'min_data_in_leaf': int(min_data_in_leaf),
        'subsample': max(min(subsample, 1), 0),
        'colsample_bytree': max(min(colsample_bytree, 1), 0),
        'silent': 1,
        'random_state': 1337,
        'verbose': 1,
        'nthread': 9,
    }

    lgb_train = lgb.Dataset(X_tr, y_tr.is_duplicate.values)
    lgb_val = lgb.Dataset(X_val, y_val.is_duplicate.values)
    t = time.time()
    print('Start training...')
    gbm = lgb.train(params, lgb_train, num_boost_round = 100000, valid_sets = lgb_val,
                    early_stopping_rounds = 100, verbose_eval = 100)
    val_pred = gbm.predict(X_val, num_iteration=gbm.best_iteration)
    score = log_loss(y_val, val_pred)
    print('Final score:', score, '\n', 'Time it took to train and predict:', time.time() - t)
    return -score

In [6]:
src = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/scripts/features/'
feats_src = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/data/features/uncleaned/'
trans_src = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/data/features/lemmatized_fullclean/transformations/'

#X_train = pd.read_pickle('Xtrain_814colsBest.pkl', compression = 'bz2')
xgb_feats = pd.read_csv(feats_src + '/the_1owl/owl_train.csv')
y_train = xgb_feats[['is_duplicate']]

X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, stratify = y_train,
                                                    test_size = 0.2, random_state = 111)

del xgb_feats, X_train
gc.collect()

1547

 Step |   Time |      Value |   colsample_bytree |   feature_fraction |   max_depth |   min_data_in_leaf |   subsample | 

  20 | 05m16s |   -0.20453 |             0.4089 |             0.5045 |      8.0778 |            23.0794 |      0.6507 
  
   23 | 06m22s |   -0.20448 |             0.4017 |             0.5290 |      8.0352 |            27.5745 |      0.8080 | 
   
  19 | 05m29s |   -0.20479 |             0.6450 |             0.5302 |      8.0956 |            21.4192 |      0.8488 | 

In [15]:
lgbBO = BayesianOptimization(lgb_bo, {
        'max_depth': (8, 15),
        'min_data_in_leaf': (8, 28),
        'subsample': (0.65, 0.85),
        'colsample_bytree': (0.4, 0.65),
        'feature_fraction': (0.5, 0.95),
    })

num_iter = 15
init_points = 10
lgbBO.maximize(init_points=init_points, n_iter=num_iter)
print('lgb: %f' % lgbBO.res['max']['max_val'])

[31mInitialization[0m
[94m------------------------------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   colsample_bytree |   feature_fraction |   max_depth |   min_data_in_leaf |   subsample | 
Start training...
Train until valid scores didn't improve in 100 rounds.
[100]	valid_0's binary_logloss: 0.229931
[200]	valid_0's binary_logloss: 0.216166
[300]	valid_0's binary_logloss: 0.21122
[400]	valid_0's binary_logloss: 0.208599
[500]	valid_0's binary_logloss: 0.207171
[600]	valid_0's binary_logloss: 0.206397
[700]	valid_0's binary_logloss: 0.205852
[800]	valid_0's binary_logloss: 0.205352
[900]	valid_0's binary_logloss: 0.205371
Early stopping, best iteration is:
[865]	valid_0's binary_logloss: 0.205265
Final score: 0.205276358316 
 Time it took to train and predict: 373.49435925483704
    1 | 06m13s | [35m  -0.20528[0m | [32m            0.4130[0m | [32m            0.9248[0m | [32m     8.9347[0m | 

   12 | 06m47s | [35m  -0.20499[0m | [32m            0.6318[0m | [32m            0.8781[0m | [32m     8.0205[0m | [32m           27.9379[0m | [32m     0.6543[0m | 
Start training...
Train until valid scores didn't improve in 100 rounds.
[100]	valid_0's binary_logloss: 0.230284
[200]	valid_0's binary_logloss: 0.216782
[300]	valid_0's binary_logloss: 0.211977
[400]	valid_0's binary_logloss: 0.209541
[500]	valid_0's binary_logloss: 0.207918
[600]	valid_0's binary_logloss: 0.206994
[700]	valid_0's binary_logloss: 0.206479
[800]	valid_0's binary_logloss: 0.206049
[900]	valid_0's binary_logloss: 0.205952
[1000]	valid_0's binary_logloss: 0.205934
Early stopping, best iteration is:
[947]	valid_0's binary_logloss: 0.205746
Final score: 0.205752076008 
 Time it took to train and predict: 388.84168434143066




   13 | 06m52s |   -0.20575 |             0.6447 |             0.8808 |      8.0796 |            12.3303 |      0.6784 | 
Start training...
Train until valid scores didn't improve in 100 rounds.
[100]	valid_0's binary_logloss: 0.220779
[200]	valid_0's binary_logloss: 0.210712
[300]	valid_0's binary_logloss: 0.207313
[400]	valid_0's binary_logloss: 0.206145
[500]	valid_0's binary_logloss: 0.206329
Early stopping, best iteration is:
[407]	valid_0's binary_logloss: 0.206071
Final score: 0.206078519844 
 Time it took to train and predict: 438.6605553627014


  " state: %s" % convergence_dict)


   14 | 07m51s |   -0.20608 |             0.4026 |             0.9438 |     14.9293 |            21.9516 |      0.7436 | 
Start training...
Train until valid scores didn't improve in 100 rounds.
[100]	valid_0's binary_logloss: 0.230162
[200]	valid_0's binary_logloss: 0.216445
[300]	valid_0's binary_logloss: 0.21125
[400]	valid_0's binary_logloss: 0.20871
[500]	valid_0's binary_logloss: 0.207218
[600]	valid_0's binary_logloss: 0.206401
[700]	valid_0's binary_logloss: 0.205761
[800]	valid_0's binary_logloss: 0.20531
Early stopping, best iteration is:
[796]	valid_0's binary_logloss: 0.205263
Final score: 0.205268229263 
 Time it took to train and predict: 371.3395538330078


  " state: %s" % convergence_dict)


   15 | 06m34s |   -0.20527 |             0.6336 |             0.9284 |      8.1254 |            24.1174 |      0.6608 | 
Start training...
Train until valid scores didn't improve in 100 rounds.
[100]	valid_0's binary_logloss: 0.226724
[200]	valid_0's binary_logloss: 0.21449
[300]	valid_0's binary_logloss: 0.210238
[400]	valid_0's binary_logloss: 0.208015
[500]	valid_0's binary_logloss: 0.207127
[600]	valid_0's binary_logloss: 0.206717
[700]	valid_0's binary_logloss: 0.206927
Early stopping, best iteration is:
[627]	valid_0's binary_logloss: 0.206661
Final score: 0.20666404826 
 Time it took to train and predict: 461.2034044265747
   16 | 08m16s |   -0.20666 |             0.6300 |             0.9267 |      9.9942 |             8.2104 |      0.6513 | 
Start training...
Train until valid scores didn't improve in 100 rounds.
[100]	valid_0's binary_logloss: 0.222806
[200]	valid_0's binary_logloss: 0.211541
[300]	valid_0's binary_logloss: 0.20788
[400]	valid_0's binary_logloss: 0.206449
[50

  " state: %s" % convergence_dict)


   18 | 07m09s |   -0.20560 |             0.4284 |             0.9248 |      8.0646 |            15.9829 |      0.8443 | 
Start training...
Train until valid scores didn't improve in 100 rounds.
[100]	valid_0's binary_logloss: 0.229888
[200]	valid_0's binary_logloss: 0.216594
[300]	valid_0's binary_logloss: 0.211609
[400]	valid_0's binary_logloss: 0.209338
[500]	valid_0's binary_logloss: 0.207726
[600]	valid_0's binary_logloss: 0.206492
[700]	valid_0's binary_logloss: 0.205664
[800]	valid_0's binary_logloss: 0.205135
[900]	valid_0's binary_logloss: 0.20493
[1000]	valid_0's binary_logloss: 0.204946
Early stopping, best iteration is:
[923]	valid_0's binary_logloss: 0.204787
Final score: 0.204794163891 
 Time it took to train and predict: 299.51048970222473
   19 | 05m29s | [35m  -0.20479[0m | [32m            0.6450[0m | [32m            0.5302[0m | [32m     8.0956[0m | [32m           21.4192[0m | [32m     0.8488[0m | 
Start training...
Train until valid scores didn't improve i

  " state: %s" % convergence_dict)


   21 | 05m32s |   -0.20505 |             0.4133 |             0.5195 |      8.0410 |            10.2447 |      0.8379 | 
Start training...
Train until valid scores didn't improve in 100 rounds.
[100]	valid_0's binary_logloss: 0.224017
[200]	valid_0's binary_logloss: 0.212515
[300]	valid_0's binary_logloss: 0.208185
[400]	valid_0's binary_logloss: 0.20654
[500]	valid_0's binary_logloss: 0.205621
[600]	valid_0's binary_logloss: 0.205788
Early stopping, best iteration is:
[539]	valid_0's binary_logloss: 0.205497
Final score: 0.205511489123 
 Time it took to train and predict: 262.5262417793274
   22 | 04m55s |   -0.20551 |             0.4192 |             0.5076 |     10.8824 |            12.4620 |      0.6522 | 
Start training...
Train until valid scores didn't improve in 100 rounds.
[100]	valid_0's binary_logloss: 0.230374
[200]	valid_0's binary_logloss: 0.216548
[300]	valid_0's binary_logloss: 0.21135
[400]	valid_0's binary_logloss: 0.208854
[500]	valid_0's binary_logloss: 0.206996
[6

  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


   25 | 06m34s |   -0.20523 |             0.4407 |             0.5080 |     14.9286 |            12.5507 |      0.6762 | 
lgb: -0.204484


 Step |   Time |      Value |   colsample_bytree |   max_depth |   min_child_weight |   subsample | 

* 13 | 37m14s |   -0.20249 |             0.4124 |      8.1496 |            17.5251 |      0.8456 | 
  
* 15 | 27m07s |   -0.20227 |             0.4243 |      8.4558 |            19.5977 |      0.8486 | 

* 2 | 26m23s |   -0.20295 |             0.5589 |     10.5015 |            21.8423 |      0.8243 | 

In [None]:
xgbBO = BayesianOptimization(xgb_bo, {
        'max_depth': (8, 15),
        'min_child_weight': (8, 28),
        'subsample': (0.65, 0.85),
        'colsample_bytree': (0.4, 0.65),
    })

num_iter = 15
init_points = 10
xgbBO.maximize(init_points=init_points, n_iter=num_iter)
print('XGB: %f' % xgbBO.res['max']['max_val'])

[31mInitialization[0m
[94m---------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   colsample_bytree |   max_depth |   min_child_weight |   subsample | 
Start training...
[0]	train-logloss:0.661081	valid-logloss:0.661342
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 100 rounds.
[100]	train-logloss:0.21886	valid-logloss:0.23045
[200]	train-logloss:0.192722	valid-logloss:0.217202
[300]	train-logloss:0.176578	valid-logloss:0.212179
[400]	train-logloss:0.163423	valid-logloss:0.209578
[500]	train-logloss:0.15271	valid-logloss:0.207983
[600]	train-logloss:0.143403	valid-logloss:0.2069
[700]	train-logloss:0.135151	valid-logloss:0.206017
[800]	train-logloss:0.127028	valid-logloss:0.205272
[900]	train-logloss:0.119983	valid-logloss:0.204507
[1000]	train-logloss:0.113577	valid-logloss:0.204169
[1100]	train-logloss:0.107496	

[0]	train-logloss:0.659927	valid-logloss:0.660615
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 100 rounds.
[100]	train-logloss:0.186458	valid-logloss:0.221379
[200]	train-logloss:0.147287	valid-logloss:0.210854
[300]	train-logloss:0.122315	valid-logloss:0.207414
[400]	train-logloss:0.103355	valid-logloss:0.205813
[500]	train-logloss:0.088769	valid-logloss:0.20486
[600]	train-logloss:0.077031	valid-logloss:0.204677
[700]	train-logloss:0.067276	valid-logloss:0.204414
[800]	train-logloss:0.058638	valid-logloss:0.204595
Stopping. Best iteration:
[726]	train-logloss:0.064943	valid-logloss:0.204322

Final score: 0.204322113779 
 Time it took to train and predict: 1450.4375236034393
    8 | 24m10s |   -0.20432 |             0.4145 |     12.8761 |            16.7429 |      0.7786 | 
Start training...
[0]	train-logloss:0.660308	valid-logloss:0.660758
Multiple eval metrics have been passed: 'valid-logl

  " state: %s" % convergence_dict)


[31mBayesian Optimization[0m
[94m---------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   colsample_bytree |   max_depth |   min_child_weight |   subsample | 
Start training...
[0]	train-logloss:0.659843	valid-logloss:0.660573
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 100 rounds.
[100]	train-logloss:0.18088	valid-logloss:0.220866
[200]	train-logloss:0.144119	valid-logloss:0.210647
[300]	train-logloss:0.120296	valid-logloss:0.207447
[400]	train-logloss:0.102178	valid-logloss:0.205689
[500]	train-logloss:0.087944	valid-logloss:0.204855
[600]	train-logloss:0.07673	valid-logloss:0.204424
[700]	train-logloss:0.067008	valid-logloss:0.204355
Stopping. Best iteration:
[656]	train-logloss:0.070956	valid-logloss:0.204323

Final score: 0.204322676716 
 Time it took to train and predict: 1420.6970834732056


  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


   11 | 24m41s |   -0.20432 |             0.4147 |     14.9599 |            27.8336 |      0.8382 | 
Start training...
[0]	train-logloss:0.661114	valid-logloss:0.661371
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 100 rounds.
[100]	train-logloss:0.220102	valid-logloss:0.231212
[200]	train-logloss:0.194147	valid-logloss:0.217458
[300]	train-logloss:0.178744	valid-logloss:0.212602
[400]	train-logloss:0.166809	valid-logloss:0.209942
[500]	train-logloss:0.156701	valid-logloss:0.208421
[600]	train-logloss:0.147399	valid-logloss:0.207188
[700]	train-logloss:0.139803	valid-logloss:0.206165
[800]	train-logloss:0.132441	valid-logloss:0.205419
[900]	train-logloss:0.125735	valid-logloss:0.204877
[1000]	train-logloss:0.119592	valid-logloss:0.204497
[1100]	train-logloss:0.113856	valid-logloss:0.204256
[1200]	train-logloss:0.10826	valid-logloss:0.203876
[1300]	train-logloss:0.103049	valid-logloss:0.203662


  " state: %s" % convergence_dict)


   12 | 36m42s |   -0.20309 |             0.4898 |      8.1480 |            27.9895 |      0.8380 | 
Start training...
[0]	train-logloss:0.661022	valid-logloss:0.661304
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 100 rounds.
[100]	train-logloss:0.218338	valid-logloss:0.23044
[200]	train-logloss:0.191604	valid-logloss:0.217231
[300]	train-logloss:0.174564	valid-logloss:0.212039
[400]	train-logloss:0.161073	valid-logloss:0.209227
[500]	train-logloss:0.150979	valid-logloss:0.207639
[600]	train-logloss:0.141455	valid-logloss:0.206445
[700]	train-logloss:0.13267	valid-logloss:0.205392
[800]	train-logloss:0.124825	valid-logloss:0.204763
[900]	train-logloss:0.11747	valid-logloss:0.204221
[1000]	train-logloss:0.111072	valid-logloss:0.20378
[1100]	train-logloss:0.105033	valid-logloss:0.203459
[1200]	train-logloss:0.099152	valid-logloss:0.203194
[1300]	train-logloss:0.093767	valid-logloss:0.203005
[14

  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


   13 | 37m14s | [35m  -0.20249[0m | [32m            0.4124[0m | [32m     8.1496[0m | [32m           17.5251[0m | [32m     0.8456[0m | 
Start training...
[0]	train-logloss:0.65905	valid-logloss:0.660389
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 100 rounds.
[100]	train-logloss:0.152349	valid-logloss:0.218612
[200]	train-logloss:0.10397	valid-logloss:0.209209
[300]	train-logloss:0.075934	valid-logloss:0.206724
[400]	train-logloss:0.056969	valid-logloss:0.20667
Stopping. Best iteration:
[355]	train-logloss:0.064327	valid-logloss:0.206468

Final score: 0.206467570969 
 Time it took to train and predict: 1180.0786819458008


  " state: %s" % convergence_dict)


   14 | 20m04s |   -0.20647 |             0.4110 |     14.7764 |             8.0311 |      0.8391 | 
Start training...
[0]	train-logloss:0.66107	valid-logloss:0.661323
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 100 rounds.
[100]	train-logloss:0.218422	valid-logloss:0.230321
[200]	train-logloss:0.191995	valid-logloss:0.217002
[300]	train-logloss:0.175714	valid-logloss:0.212014
[400]	train-logloss:0.162301	valid-logloss:0.209233
[500]	train-logloss:0.152167	valid-logloss:0.207414
[600]	train-logloss:0.142129	valid-logloss:0.206144
[700]	train-logloss:0.133907	valid-logloss:0.205265
[800]	train-logloss:0.126027	valid-logloss:0.204575
[900]	train-logloss:0.118711	valid-logloss:0.203844
[1000]	train-logloss:0.111884	valid-logloss:0.203376
[1100]	train-logloss:0.105505	valid-logloss:0.203004
[1200]	train-logloss:0.099742	valid-logloss:0.202735
[1300]	train-logloss:0.094174	valid-logloss:0.20253
[

  " state: %s" % convergence_dict)


   15 | 27m07s | [35m  -0.20227[0m | [32m            0.4243[0m | [32m     8.4558[0m | [32m           19.5977[0m | [32m     0.8486[0m | 
Start training...
[0]	train-logloss:0.660707	valid-logloss:0.661036
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 100 rounds.
[100]	train-logloss:0.211434	valid-logloss:0.22772
[200]	train-logloss:0.182043	valid-logloss:0.215035
[300]	train-logloss:0.162723	valid-logloss:0.210134
[400]	train-logloss:0.148663	valid-logloss:0.207879
[500]	train-logloss:0.136971	valid-logloss:0.206427
[600]	train-logloss:0.126185	valid-logloss:0.205165
[700]	train-logloss:0.116983	valid-logloss:0.20446
[800]	train-logloss:0.108245	valid-logloss:0.203843
[900]	train-logloss:0.100276	valid-logloss:0.203442
[1000]	train-logloss:0.093177	valid-logloss:0.202941
[1100]	train-logloss:0.08684	valid-logloss:0.202684
[1200]	train-logloss:0.081207	valid-logloss:0.20269
[1300]	trai

  " state: %s" % convergence_dict)


   16 | 25m08s |   -0.20265 |             0.4001 |      9.4882 |            20.5449 |      0.8479 | 
Start training...
[0]	train-logloss:0.660908	valid-logloss:0.66118
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 100 rounds.
[100]	train-logloss:0.218937	valid-logloss:0.230349
[200]	train-logloss:0.192589	valid-logloss:0.216864
[300]	train-logloss:0.175922	valid-logloss:0.211887
[400]	train-logloss:0.162866	valid-logloss:0.20914
[500]	train-logloss:0.152061	valid-logloss:0.207336
[600]	train-logloss:0.142926	valid-logloss:0.20624
[700]	train-logloss:0.134463	valid-logloss:0.205354
[800]	train-logloss:0.126287	valid-logloss:0.20473
[900]	train-logloss:0.118922	valid-logloss:0.204164
