In [1]:
import numpy as np
import pandas as pd
import lightgbm as lgb

from sklearn.metrics import roc_auc_score
from lightgbm.sklearn import LGBMClassifier
from sklearn.model_selection import StratifiedKFold

import warnings
warnings.filterwarnings('ignore')

In [2]:
train = pd.read_csv('./train.csv', sep='\t')
test = pd.read_csv('./test.csv', sep='\t')

In [3]:
features = list(train.columns)

#remove index and target
features.remove('Unnamed: 0')
features.remove('0')
features.remove('160')
features.remove('164')

print('Length of features {}'.format(len(features)))

Length of features 343


In [4]:
X = train[features] 
y = train['0']

skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

In [473]:
parameters = {
    'objective': 'binary',
    'learning_rate': 0.01,
    'max_depth': 8,
    'num_threads': 4,
    'metric': 'auc',
    'seed': 42,
#     'n_estimators': 580,
    
        #regularization
    'colsample_bytree': 0.65,
    'subsample': 0.8,
    'subsample_freq': 2,
    'min_data_in_leaf': 20,
}

n_rounds = 10000

In [7]:
def check_train_score(params, lgb_data, data, target, kf, num_rounds):
    roc_auc = []
    for train, val in kf.split(data,target):
        temp_lgb_train = lgb_data.subset(train)
        temp_lgb_val = lgb_data.subset(val)
        temp_model = lgb.train(params, temp_lgb_train, num_rounds, verbose_eval=num_rounds)
        roc_auc.append([roc_auc_score(target[train],temp_model.predict(data.loc[train])), roc_auc_score(target[val],temp_model.predict(data.loc[val]))])
    return np.mean(roc_auc, axis=0)

## Baseline model

In [30]:
lgb_train = lgb.Dataset(X, label=y, free_raw_data=False)

In [31]:
result = lgb.cv(params=parameters, 
                train_set=lgb_train, 
                num_boost_round=n_rounds, 
                folds=skf.split(X, y), 
                early_stopping_rounds=10, 
                verbose_eval=1, 
)

[1]	cv_agg's auc: 0.665037 + 0.00282258
[2]	cv_agg's auc: 0.685472 + 0.00285262
[3]	cv_agg's auc: 0.693012 + 0.00372857
[4]	cv_agg's auc: 0.698374 + 0.00320271
[5]	cv_agg's auc: 0.700174 + 0.00161312
[6]	cv_agg's auc: 0.700999 + 0.00180219
[7]	cv_agg's auc: 0.703862 + 0.00240576
[8]	cv_agg's auc: 0.704951 + 0.00238761
[9]	cv_agg's auc: 0.705607 + 0.00231823
[10]	cv_agg's auc: 0.706994 + 0.00198517
[11]	cv_agg's auc: 0.70776 + 0.00190211
[12]	cv_agg's auc: 0.70805 + 0.0022182
[13]	cv_agg's auc: 0.708389 + 0.0019957
[14]	cv_agg's auc: 0.708562 + 0.00250676
[15]	cv_agg's auc: 0.708433 + 0.00260187
[16]	cv_agg's auc: 0.708516 + 0.00206687
[17]	cv_agg's auc: 0.708889 + 0.00228626
[18]	cv_agg's auc: 0.708709 + 0.00229953
[19]	cv_agg's auc: 0.708756 + 0.00201591
[20]	cv_agg's auc: 0.709384 + 0.00165509
[21]	cv_agg's auc: 0.709511 + 0.00179418
[22]	cv_agg's auc: 0.709703 + 0.00177565
[23]	cv_agg's auc: 0.709855 + 0.00182883
[24]	cv_agg's auc: 0.709645 + 0.00193918
[25]	cv_agg's auc: 0.709465 +

[203]	cv_agg's auc: 0.728794 + 0.00228227
[204]	cv_agg's auc: 0.728862 + 0.00222988
[205]	cv_agg's auc: 0.72891 + 0.00223831
[206]	cv_agg's auc: 0.728935 + 0.00222905
[207]	cv_agg's auc: 0.728977 + 0.00226757
[208]	cv_agg's auc: 0.729064 + 0.00226107
[209]	cv_agg's auc: 0.729093 + 0.00228634
[210]	cv_agg's auc: 0.729175 + 0.00229719
[211]	cv_agg's auc: 0.729305 + 0.00232444
[212]	cv_agg's auc: 0.729401 + 0.00236799
[213]	cv_agg's auc: 0.729496 + 0.00238659
[214]	cv_agg's auc: 0.729596 + 0.00237966
[215]	cv_agg's auc: 0.7297 + 0.00237083
[216]	cv_agg's auc: 0.729761 + 0.00236181
[217]	cv_agg's auc: 0.729784 + 0.00236683
[218]	cv_agg's auc: 0.729839 + 0.00238197
[219]	cv_agg's auc: 0.729922 + 0.00244099
[220]	cv_agg's auc: 0.73006 + 0.0024868
[221]	cv_agg's auc: 0.730184 + 0.00246676
[222]	cv_agg's auc: 0.730227 + 0.00242779
[223]	cv_agg's auc: 0.730304 + 0.00245834
[224]	cv_agg's auc: 0.730339 + 0.00247429
[225]	cv_agg's auc: 0.73043 + 0.00239719
[226]	cv_agg's auc: 0.730473 + 0.0024232

[400]	cv_agg's auc: 0.738253 + 0.00244655
[401]	cv_agg's auc: 0.738257 + 0.00245084
[402]	cv_agg's auc: 0.73827 + 0.00247539
[403]	cv_agg's auc: 0.738283 + 0.00246931
[404]	cv_agg's auc: 0.738297 + 0.00247949
[405]	cv_agg's auc: 0.738341 + 0.0024512
[406]	cv_agg's auc: 0.738354 + 0.00246975
[407]	cv_agg's auc: 0.738389 + 0.0025024
[408]	cv_agg's auc: 0.738404 + 0.00252582
[409]	cv_agg's auc: 0.738439 + 0.0024803
[410]	cv_agg's auc: 0.738465 + 0.00245057
[411]	cv_agg's auc: 0.738511 + 0.00243002
[412]	cv_agg's auc: 0.738499 + 0.00243784
[413]	cv_agg's auc: 0.738518 + 0.00245521
[414]	cv_agg's auc: 0.738557 + 0.00248716
[415]	cv_agg's auc: 0.738583 + 0.00248246
[416]	cv_agg's auc: 0.738597 + 0.00244526
[417]	cv_agg's auc: 0.738604 + 0.00243482
[418]	cv_agg's auc: 0.738633 + 0.00243253
[419]	cv_agg's auc: 0.738653 + 0.00243975
[420]	cv_agg's auc: 0.73871 + 0.00242918
[421]	cv_agg's auc: 0.738742 + 0.00244421
[422]	cv_agg's auc: 0.738809 + 0.00242977
[423]	cv_agg's auc: 0.738867 + 0.002436

In [36]:
lgb_train = lgb.Dataset(X, label=y, free_raw_data=False)
check_train_score(parameters, lgb_train, X, y, skf, 537)

array([0.8519005 , 0.74065943])

Baseline Validation : 0.74065943

## 1. Dimensionality Reduction

### 1.1 PCA

In [27]:
from sklearn import decomposition

In [46]:
pca = decomposition.PCA(n_components=340)
pca.fit(X)
X_pca = pca.transform(X)

In [48]:
lgb_train_pca = lgb.Dataset(X_pca, label=y, free_raw_data=False)

In [49]:
result = lgb.cv(params=parameters, 
                train_set=lgb_train_pca, 
                num_boost_round=n_rounds, 
                folds=skf.split(X_pca, y), 
                early_stopping_rounds=10, 
                verbose_eval=1, 
)

[1]	cv_agg's auc: 0.6191 + 0.000874852
[2]	cv_agg's auc: 0.668757 + 0.00281048
[3]	cv_agg's auc: 0.67604 + 0.0021754
[4]	cv_agg's auc: 0.686449 + 0.000595102
[5]	cv_agg's auc: 0.689497 + 0.00305158
[6]	cv_agg's auc: 0.692032 + 0.00457807
[7]	cv_agg's auc: 0.693639 + 0.00385564
[8]	cv_agg's auc: 0.694771 + 0.00417544
[9]	cv_agg's auc: 0.69525 + 0.00480137
[10]	cv_agg's auc: 0.696317 + 0.00475217
[11]	cv_agg's auc: 0.696785 + 0.00377184
[12]	cv_agg's auc: 0.69685 + 0.00285936
[13]	cv_agg's auc: 0.69752 + 0.00341855
[14]	cv_agg's auc: 0.697723 + 0.00332013
[15]	cv_agg's auc: 0.698172 + 0.00271627
[16]	cv_agg's auc: 0.698339 + 0.00258628
[17]	cv_agg's auc: 0.698853 + 0.00247024
[18]	cv_agg's auc: 0.698967 + 0.00267528
[19]	cv_agg's auc: 0.69903 + 0.00273964
[20]	cv_agg's auc: 0.699731 + 0.00251415
[21]	cv_agg's auc: 0.700329 + 0.00227855
[22]	cv_agg's auc: 0.700587 + 0.00184844
[23]	cv_agg's auc: 0.701158 + 0.00163204
[24]	cv_agg's auc: 0.701259 + 0.0012086
[25]	cv_agg's auc: 0.701378 + 0.

[197]	cv_agg's auc: 0.717867 + 0.000942459
[198]	cv_agg's auc: 0.717905 + 0.000944155
[199]	cv_agg's auc: 0.718044 + 0.000949633
[200]	cv_agg's auc: 0.718094 + 0.00100605
[201]	cv_agg's auc: 0.718216 + 0.00100391
[202]	cv_agg's auc: 0.718259 + 0.000968342
[203]	cv_agg's auc: 0.718319 + 0.000979175
[204]	cv_agg's auc: 0.718418 + 0.00100232
[205]	cv_agg's auc: 0.718454 + 0.00107099
[206]	cv_agg's auc: 0.718548 + 0.00103986
[207]	cv_agg's auc: 0.71864 + 0.00100193
[208]	cv_agg's auc: 0.718658 + 0.000945394
[209]	cv_agg's auc: 0.718753 + 0.000905397
[210]	cv_agg's auc: 0.718827 + 0.000874035
[211]	cv_agg's auc: 0.71894 + 0.000865619
[212]	cv_agg's auc: 0.71903 + 0.000893513
[213]	cv_agg's auc: 0.719153 + 0.000866087
[214]	cv_agg's auc: 0.719276 + 0.000880853
[215]	cv_agg's auc: 0.719388 + 0.000841429
[216]	cv_agg's auc: 0.719478 + 0.0008597
[217]	cv_agg's auc: 0.719562 + 0.000866226
[218]	cv_agg's auc: 0.719591 + 0.000845237
[219]	cv_agg's auc: 0.719682 + 0.000887806
[220]	cv_agg's auc: 0.

[389]	cv_agg's auc: 0.728217 + 0.000989894
[390]	cv_agg's auc: 0.72823 + 0.000988849
[391]	cv_agg's auc: 0.728266 + 0.00104672
[392]	cv_agg's auc: 0.728337 + 0.00103475
[393]	cv_agg's auc: 0.728367 + 0.00104813
[394]	cv_agg's auc: 0.728372 + 0.00105929
[395]	cv_agg's auc: 0.728408 + 0.00102483
[396]	cv_agg's auc: 0.728409 + 0.00104329
[397]	cv_agg's auc: 0.728431 + 0.000983446
[398]	cv_agg's auc: 0.728471 + 0.00102032
[399]	cv_agg's auc: 0.728557 + 0.00099339
[400]	cv_agg's auc: 0.728631 + 0.000942613
[401]	cv_agg's auc: 0.72865 + 0.000983689
[402]	cv_agg's auc: 0.728715 + 0.000950187
[403]	cv_agg's auc: 0.728804 + 0.000973077
[404]	cv_agg's auc: 0.728818 + 0.000973612
[405]	cv_agg's auc: 0.728833 + 0.000970089
[406]	cv_agg's auc: 0.728838 + 0.000975678
[407]	cv_agg's auc: 0.728844 + 0.000993503
[408]	cv_agg's auc: 0.728843 + 0.00104267
[409]	cv_agg's auc: 0.72886 + 0.00104322
[410]	cv_agg's auc: 0.72885 + 0.00107385
[411]	cv_agg's auc: 0.72892 + 0.00108998
[412]	cv_agg's auc: 0.728906

In [50]:
lgb_train_pca = lgb.Dataset(pd.DataFrame(X_pca), label=y, free_raw_data=False)
check_train_score(parameters, lgb_train_pca, pd.DataFrame(X_pca), y, skf, 665)

array([0.9315783 , 0.73238324])

PCA witn n_components=342  
Validation : 0.73336679

PCA with n_components=340  
Validation : 0.73238324

### 1.2 ICA

In [51]:
ica = decomposition.FastICA(n_components=342)

In [52]:
ica.fit(X)
X_ica = ica.transform(X)

In [53]:
lgb_train_ica = lgb.Dataset(X_ica, label=y, free_raw_data=False)

result = lgb.cv(params=parameters, 
                train_set=lgb_train_ica, 
                num_boost_round=n_rounds, 
                folds=skf.split(X_ica, y), 
                early_stopping_rounds=10, 
                verbose_eval=1, 
)

[1]	cv_agg's auc: 0.631125 + 0.00162304
[2]	cv_agg's auc: 0.64098 + 0.000325672
[3]	cv_agg's auc: 0.64759 + 0.00147267
[4]	cv_agg's auc: 0.650468 + 0.00122736
[5]	cv_agg's auc: 0.653421 + 0.00158582
[6]	cv_agg's auc: 0.657637 + 0.00102235
[7]	cv_agg's auc: 0.660096 + 0.00090699
[8]	cv_agg's auc: 0.660394 + 0.00109727
[9]	cv_agg's auc: 0.660858 + 0.00109953
[10]	cv_agg's auc: 0.661115 + 0.000984674
[11]	cv_agg's auc: 0.661635 + 0.000601108
[12]	cv_agg's auc: 0.662269 + 0.00138228
[13]	cv_agg's auc: 0.664778 + 0.00247679
[14]	cv_agg's auc: 0.664592 + 0.00283766
[15]	cv_agg's auc: 0.665412 + 0.00331063
[16]	cv_agg's auc: 0.66674 + 0.00336794
[17]	cv_agg's auc: 0.667789 + 0.00316678
[18]	cv_agg's auc: 0.66845 + 0.00351745
[19]	cv_agg's auc: 0.668769 + 0.00344363
[20]	cv_agg's auc: 0.668947 + 0.00332755
[21]	cv_agg's auc: 0.669332 + 0.00357377
[22]	cv_agg's auc: 0.669411 + 0.0035758
[23]	cv_agg's auc: 0.669921 + 0.00317009
[24]	cv_agg's auc: 0.671037 + 0.0032158
[25]	cv_agg's auc: 0.671273 

[201]	cv_agg's auc: 0.694843 + 0.00338627
[202]	cv_agg's auc: 0.694944 + 0.00340333
[203]	cv_agg's auc: 0.695047 + 0.00338977
[204]	cv_agg's auc: 0.695139 + 0.00346322
[205]	cv_agg's auc: 0.695244 + 0.00343146
[206]	cv_agg's auc: 0.695384 + 0.00350016
[207]	cv_agg's auc: 0.695452 + 0.00352207
[208]	cv_agg's auc: 0.695513 + 0.00353236
[209]	cv_agg's auc: 0.695676 + 0.00344448
[210]	cv_agg's auc: 0.695837 + 0.00344401
[211]	cv_agg's auc: 0.695879 + 0.00345653
[212]	cv_agg's auc: 0.696051 + 0.00346369
[213]	cv_agg's auc: 0.696122 + 0.00347829
[214]	cv_agg's auc: 0.69622 + 0.00339078
[215]	cv_agg's auc: 0.69629 + 0.00341043
[216]	cv_agg's auc: 0.696393 + 0.00342488
[217]	cv_agg's auc: 0.696517 + 0.00350292
[218]	cv_agg's auc: 0.69662 + 0.00347461
[219]	cv_agg's auc: 0.696665 + 0.0034405
[220]	cv_agg's auc: 0.696727 + 0.00352284
[221]	cv_agg's auc: 0.696821 + 0.00348539
[222]	cv_agg's auc: 0.69692 + 0.00352077
[223]	cv_agg's auc: 0.697011 + 0.00359079
[224]	cv_agg's auc: 0.697087 + 0.003572

[399]	cv_agg's auc: 0.709384 + 0.00286746
[400]	cv_agg's auc: 0.709423 + 0.00281887
[401]	cv_agg's auc: 0.70944 + 0.00282361
[402]	cv_agg's auc: 0.709486 + 0.00279121
[403]	cv_agg's auc: 0.709497 + 0.00277128
[404]	cv_agg's auc: 0.709532 + 0.00278507
[405]	cv_agg's auc: 0.709569 + 0.00275603
[406]	cv_agg's auc: 0.709642 + 0.0027452
[407]	cv_agg's auc: 0.709713 + 0.00273405
[408]	cv_agg's auc: 0.709759 + 0.00273431
[409]	cv_agg's auc: 0.709896 + 0.00275301
[410]	cv_agg's auc: 0.709976 + 0.00275356
[411]	cv_agg's auc: 0.710035 + 0.00272291
[412]	cv_agg's auc: 0.710112 + 0.00269829
[413]	cv_agg's auc: 0.710129 + 0.0026975
[414]	cv_agg's auc: 0.710171 + 0.00264983
[415]	cv_agg's auc: 0.710189 + 0.00265097
[416]	cv_agg's auc: 0.710256 + 0.00263974
[417]	cv_agg's auc: 0.710254 + 0.00266109
[418]	cv_agg's auc: 0.710315 + 0.00263499
[419]	cv_agg's auc: 0.710374 + 0.00262069
[420]	cv_agg's auc: 0.7104 + 0.00257921
[421]	cv_agg's auc: 0.710418 + 0.00262659
[422]	cv_agg's auc: 0.710426 + 0.002614

[596]	cv_agg's auc: 0.715821 + 0.00188845
[597]	cv_agg's auc: 0.715884 + 0.00190471
[598]	cv_agg's auc: 0.715853 + 0.00190852
[599]	cv_agg's auc: 0.715904 + 0.00196017
[600]	cv_agg's auc: 0.71595 + 0.00193223
[601]	cv_agg's auc: 0.71595 + 0.00193495
[602]	cv_agg's auc: 0.715987 + 0.00190174
[603]	cv_agg's auc: 0.715965 + 0.00188922
[604]	cv_agg's auc: 0.715959 + 0.00190141
[605]	cv_agg's auc: 0.715982 + 0.00190387
[606]	cv_agg's auc: 0.715975 + 0.00192265
[607]	cv_agg's auc: 0.715963 + 0.0018921
[608]	cv_agg's auc: 0.715985 + 0.00189391
[609]	cv_agg's auc: 0.716013 + 0.00190718
[610]	cv_agg's auc: 0.71604 + 0.00195387
[611]	cv_agg's auc: 0.716031 + 0.0019436
[612]	cv_agg's auc: 0.716023 + 0.00196975
[613]	cv_agg's auc: 0.716031 + 0.0019435
[614]	cv_agg's auc: 0.716093 + 0.00191179
[615]	cv_agg's auc: 0.716151 + 0.0018878
[616]	cv_agg's auc: 0.716178 + 0.00188225
[617]	cv_agg's auc: 0.716181 + 0.00188256
[618]	cv_agg's auc: 0.716182 + 0.0018842
[619]	cv_agg's auc: 0.71618 + 0.00192147
[

In [55]:
lgb_train_ica = lgb.Dataset(pd.DataFrame(X_ica), label=y, free_raw_data=False)
check_train_score(parameters, lgb_train_ica, pd.DataFrame(X_ica), y, skf, 756)

array([0.92460382, 0.71837447])

ICA witn n_components=342  
Validation : 0.71837447

## 2. Clustering

### 2.1 KMeans

In [56]:
from sklearn.cluster import KMeans

In [72]:
kmeans = KMeans(n_clusters=4, random_state=0)
X_kmeans = X.copy()

In [73]:
X_kmeans['kmean_feature'] = kmeans.fit_predict(X_kmeans)

In [74]:
lgb_train_kmeans = lgb.Dataset(X_kmeans, label=y, free_raw_data=False)

result = lgb.cv(params=parameters, 
                train_set=lgb_train_kmeans, 
                num_boost_round=n_rounds, 
                folds=skf.split(X_kmeans, y), 
                early_stopping_rounds=10, 
                verbose_eval=1, 
)

[1]	cv_agg's auc: 0.669438 + 0.0071077
[2]	cv_agg's auc: 0.685631 + 0.0110085
[3]	cv_agg's auc: 0.692958 + 0.00660302
[4]	cv_agg's auc: 0.69891 + 0.00680699
[5]	cv_agg's auc: 0.699979 + 0.00723841
[6]	cv_agg's auc: 0.701762 + 0.00637306
[7]	cv_agg's auc: 0.702789 + 0.00625639
[8]	cv_agg's auc: 0.704877 + 0.00578682
[9]	cv_agg's auc: 0.706038 + 0.00541678
[10]	cv_agg's auc: 0.706708 + 0.00523751
[11]	cv_agg's auc: 0.706187 + 0.00530664
[12]	cv_agg's auc: 0.706265 + 0.00508548
[13]	cv_agg's auc: 0.707356 + 0.00501091
[14]	cv_agg's auc: 0.708059 + 0.00530133
[15]	cv_agg's auc: 0.708088 + 0.00532168
[16]	cv_agg's auc: 0.707885 + 0.00517052
[17]	cv_agg's auc: 0.707877 + 0.00555832
[18]	cv_agg's auc: 0.707824 + 0.00531904
[19]	cv_agg's auc: 0.708109 + 0.00550402
[20]	cv_agg's auc: 0.708149 + 0.00534412
[21]	cv_agg's auc: 0.708519 + 0.00487692
[22]	cv_agg's auc: 0.708494 + 0.00483829
[23]	cv_agg's auc: 0.709008 + 0.00489014
[24]	cv_agg's auc: 0.708987 + 0.0047563
[25]	cv_agg's auc: 0.708866 +

[200]	cv_agg's auc: 0.728354 + 0.00212737
[201]	cv_agg's auc: 0.728398 + 0.00214644
[202]	cv_agg's auc: 0.728523 + 0.00209128
[203]	cv_agg's auc: 0.728667 + 0.0021864
[204]	cv_agg's auc: 0.728804 + 0.0021554
[205]	cv_agg's auc: 0.728892 + 0.00217136
[206]	cv_agg's auc: 0.728947 + 0.00215787
[207]	cv_agg's auc: 0.729035 + 0.00216084
[208]	cv_agg's auc: 0.729052 + 0.00215424
[209]	cv_agg's auc: 0.729051 + 0.00217587
[210]	cv_agg's auc: 0.729101 + 0.00214318
[211]	cv_agg's auc: 0.729185 + 0.0021314
[212]	cv_agg's auc: 0.729258 + 0.00221623
[213]	cv_agg's auc: 0.729318 + 0.00226008
[214]	cv_agg's auc: 0.729436 + 0.00225238
[215]	cv_agg's auc: 0.729506 + 0.00221908
[216]	cv_agg's auc: 0.729507 + 0.0021786
[217]	cv_agg's auc: 0.729575 + 0.00217741
[218]	cv_agg's auc: 0.729589 + 0.00220231
[219]	cv_agg's auc: 0.729678 + 0.00224008
[220]	cv_agg's auc: 0.729767 + 0.00227661
[221]	cv_agg's auc: 0.729839 + 0.00220881
[222]	cv_agg's auc: 0.729952 + 0.00220218
[223]	cv_agg's auc: 0.729969 + 0.00223

[398]	cv_agg's auc: 0.737473 + 0.00177985
[399]	cv_agg's auc: 0.737445 + 0.00174414
[400]	cv_agg's auc: 0.737444 + 0.00170078
[401]	cv_agg's auc: 0.737452 + 0.00170886
[402]	cv_agg's auc: 0.737461 + 0.00175175
[403]	cv_agg's auc: 0.737516 + 0.00175488
[404]	cv_agg's auc: 0.737524 + 0.0017766
[405]	cv_agg's auc: 0.737522 + 0.00179929
[406]	cv_agg's auc: 0.737592 + 0.00179285
[407]	cv_agg's auc: 0.737577 + 0.00180646
[408]	cv_agg's auc: 0.737602 + 0.00178864
[409]	cv_agg's auc: 0.737632 + 0.00178835
[410]	cv_agg's auc: 0.737663 + 0.00176102
[411]	cv_agg's auc: 0.737728 + 0.00172235
[412]	cv_agg's auc: 0.737743 + 0.00173267
[413]	cv_agg's auc: 0.737764 + 0.00172851
[414]	cv_agg's auc: 0.73778 + 0.00171979
[415]	cv_agg's auc: 0.737766 + 0.00175908
[416]	cv_agg's auc: 0.737816 + 0.00176461
[417]	cv_agg's auc: 0.737815 + 0.00178663
[418]	cv_agg's auc: 0.737829 + 0.00179885
[419]	cv_agg's auc: 0.737873 + 0.00183606
[420]	cv_agg's auc: 0.737887 + 0.00183671
[421]	cv_agg's auc: 0.737881 + 0.001

[596]	cv_agg's auc: 0.740639 + 0.00181284
[597]	cv_agg's auc: 0.740649 + 0.00182731
[598]	cv_agg's auc: 0.740673 + 0.00183787
[599]	cv_agg's auc: 0.740659 + 0.0018476


In [75]:
lgb_train_kmeans = lgb.Dataset(pd.DataFrame(X_kmeans), label=y, free_raw_data=False)
check_train_score(parameters, lgb_train_kmeans, pd.DataFrame(X_kmeans), y, skf, 599)

array([0.86141731, 0.74065918])

KMeans with n_clusters=2  
Validation : 0.74185447  
KMeans with n_clusters=3  
Validation : 0.74210237  
KMeans with n_clusters=4  
Validation : 0.74065918

### 2.2 hdbscan

In [76]:
import hdbscan

In [77]:
hdb = hdbscan.HDBSCAN(min_cluster_size=10)
X_hdb = X.copy()

X_hdb['kmean_feature'] = hdb.fit_predict(X_hdb)

In [78]:
lgb_train_hdb = lgb.Dataset(X_hdb, label=y, free_raw_data=False)

result = lgb.cv(params=parameters, 
                train_set=lgb_train_hdb, 
                num_boost_round=n_rounds, 
                folds=skf.split(X_hdb, y), 
                early_stopping_rounds=10, 
                verbose_eval=1, 
)

[1]	cv_agg's auc: 0.664995 + 0.00287782
[2]	cv_agg's auc: 0.68874 + 0.00276704
[3]	cv_agg's auc: 0.693585 + 0.00292665
[4]	cv_agg's auc: 0.698297 + 0.00443025
[5]	cv_agg's auc: 0.699403 + 0.00510607
[6]	cv_agg's auc: 0.701459 + 0.00427963
[7]	cv_agg's auc: 0.70264 + 0.00515113
[8]	cv_agg's auc: 0.703533 + 0.00541551
[9]	cv_agg's auc: 0.705491 + 0.00605187
[10]	cv_agg's auc: 0.706978 + 0.00533739
[11]	cv_agg's auc: 0.707321 + 0.00449444
[12]	cv_agg's auc: 0.707828 + 0.00417777
[13]	cv_agg's auc: 0.708909 + 0.00408884
[14]	cv_agg's auc: 0.709661 + 0.00408055
[15]	cv_agg's auc: 0.709761 + 0.00400287
[16]	cv_agg's auc: 0.710243 + 0.00411002
[17]	cv_agg's auc: 0.710745 + 0.00408414
[18]	cv_agg's auc: 0.710719 + 0.00364471
[19]	cv_agg's auc: 0.710991 + 0.00379982
[20]	cv_agg's auc: 0.711234 + 0.00350267
[21]	cv_agg's auc: 0.711445 + 0.00339028
[22]	cv_agg's auc: 0.711037 + 0.00354335
[23]	cv_agg's auc: 0.711395 + 0.00345935
[24]	cv_agg's auc: 0.711194 + 0.00347994
[25]	cv_agg's auc: 0.711169

[200]	cv_agg's auc: 0.728959 + 0.00132938
[201]	cv_agg's auc: 0.729019 + 0.00135783
[202]	cv_agg's auc: 0.729132 + 0.0013124
[203]	cv_agg's auc: 0.729256 + 0.00136818
[204]	cv_agg's auc: 0.729369 + 0.001347
[205]	cv_agg's auc: 0.729436 + 0.00133838
[206]	cv_agg's auc: 0.729503 + 0.00130946
[207]	cv_agg's auc: 0.729565 + 0.00131361
[208]	cv_agg's auc: 0.729606 + 0.0013262
[209]	cv_agg's auc: 0.729664 + 0.00134552
[210]	cv_agg's auc: 0.729717 + 0.00131332
[211]	cv_agg's auc: 0.729783 + 0.00133794
[212]	cv_agg's auc: 0.729907 + 0.00139592
[213]	cv_agg's auc: 0.729936 + 0.00138072
[214]	cv_agg's auc: 0.730071 + 0.0013648
[215]	cv_agg's auc: 0.73016 + 0.00135263
[216]	cv_agg's auc: 0.730165 + 0.00131205
[217]	cv_agg's auc: 0.730201 + 0.00131646
[218]	cv_agg's auc: 0.730221 + 0.00135595
[219]	cv_agg's auc: 0.730317 + 0.00138825
[220]	cv_agg's auc: 0.730392 + 0.00141133
[221]	cv_agg's auc: 0.730523 + 0.00133522
[222]	cv_agg's auc: 0.730603 + 0.00129033
[223]	cv_agg's auc: 0.730701 + 0.0013071

[396]	cv_agg's auc: 0.73824 + 0.000785534
[397]	cv_agg's auc: 0.738277 + 0.000780072
[398]	cv_agg's auc: 0.738304 + 0.000807387
[399]	cv_agg's auc: 0.738293 + 0.000786494
[400]	cv_agg's auc: 0.7383 + 0.000758477
[401]	cv_agg's auc: 0.738306 + 0.000765939
[402]	cv_agg's auc: 0.738335 + 0.000801501
[403]	cv_agg's auc: 0.738381 + 0.000816856
[404]	cv_agg's auc: 0.738407 + 0.000833783
[405]	cv_agg's auc: 0.73841 + 0.00088298
[406]	cv_agg's auc: 0.738474 + 0.000871282
[407]	cv_agg's auc: 0.738491 + 0.000897878
[408]	cv_agg's auc: 0.738506 + 0.000880568
[409]	cv_agg's auc: 0.738525 + 0.000880185
[410]	cv_agg's auc: 0.738556 + 0.000891978
[411]	cv_agg's auc: 0.738616 + 0.000866889
[412]	cv_agg's auc: 0.738595 + 0.000866619
[413]	cv_agg's auc: 0.738626 + 0.000858962
[414]	cv_agg's auc: 0.738673 + 0.000887207
[415]	cv_agg's auc: 0.738668 + 0.000911645
[416]	cv_agg's auc: 0.738672 + 0.000874283
[417]	cv_agg's auc: 0.738676 + 0.000907965
[418]	cv_agg's auc: 0.738694 + 0.000882487
[419]	cv_agg's a

In [79]:
lgb_train_hdb = lgb.Dataset(pd.DataFrame(X_hdb), label=y, free_raw_data=False)
check_train_score(parameters, lgb_train_hdb, pd.DataFrame(X_hdb), y, skf, 540)

array([0.85214939, 0.7408506 ])

Validation Score : 0.7408506

### 2.3 Spectral Clustering

In [5]:
from sklearn.cluster import SpectralClustering

In [10]:
spectral = SpectralClustering(n_clusters=10, random_state=42, n_jobs=-1, affinity='nearest_neighbors')
X_spectral = X.copy()

X_spectral['spectral_feature'] = spectral.fit_predict(X_spectral)

In [11]:
lgb_train_spectral = lgb.Dataset(X_spectral, label=y, free_raw_data=False)

result = lgb.cv(params=parameters, 
                train_set=lgb_train_spectral, 
                num_boost_round=n_rounds, 
                folds=skf.split(X_spectral, y), 
                early_stopping_rounds=10, 
                verbose_eval=1, 
)

[1]	cv_agg's auc: 0.664997 + 0.00287903
[2]	cv_agg's auc: 0.688801 + 0.00285282
[3]	cv_agg's auc: 0.693589 + 0.00293203
[4]	cv_agg's auc: 0.698144 + 0.00454719
[5]	cv_agg's auc: 0.699323 + 0.0051323
[6]	cv_agg's auc: 0.701427 + 0.00427373
[7]	cv_agg's auc: 0.702733 + 0.00514339
[8]	cv_agg's auc: 0.703616 + 0.00543116
[9]	cv_agg's auc: 0.705543 + 0.00607216
[10]	cv_agg's auc: 0.707066 + 0.00538292
[11]	cv_agg's auc: 0.70739 + 0.0045292
[12]	cv_agg's auc: 0.707871 + 0.00425695
[13]	cv_agg's auc: 0.708971 + 0.0041753
[14]	cv_agg's auc: 0.709679 + 0.00415819
[15]	cv_agg's auc: 0.709693 + 0.00410291
[16]	cv_agg's auc: 0.710227 + 0.00419599
[17]	cv_agg's auc: 0.71071 + 0.00416793
[18]	cv_agg's auc: 0.710763 + 0.00388942
[19]	cv_agg's auc: 0.711095 + 0.00399733
[20]	cv_agg's auc: 0.711269 + 0.00372107
[21]	cv_agg's auc: 0.711434 + 0.00361017
[22]	cv_agg's auc: 0.711012 + 0.00378624
[23]	cv_agg's auc: 0.711324 + 0.00369445
[24]	cv_agg's auc: 0.711182 + 0.00364241
[25]	cv_agg's auc: 0.711116 + 

[200]	cv_agg's auc: 0.728854 + 0.00136447
[201]	cv_agg's auc: 0.72893 + 0.00139341
[202]	cv_agg's auc: 0.729062 + 0.00134943
[203]	cv_agg's auc: 0.729195 + 0.00141664
[204]	cv_agg's auc: 0.729322 + 0.00137558
[205]	cv_agg's auc: 0.729414 + 0.00136992
[206]	cv_agg's auc: 0.729495 + 0.0013417
[207]	cv_agg's auc: 0.729549 + 0.00135999
[208]	cv_agg's auc: 0.7296 + 0.00137023
[209]	cv_agg's auc: 0.729662 + 0.0014124
[210]	cv_agg's auc: 0.72972 + 0.00141218
[211]	cv_agg's auc: 0.729754 + 0.00143355
[212]	cv_agg's auc: 0.729835 + 0.00151742
[213]	cv_agg's auc: 0.729921 + 0.00153654
[214]	cv_agg's auc: 0.730035 + 0.00151989
[215]	cv_agg's auc: 0.730086 + 0.00151669
[216]	cv_agg's auc: 0.730128 + 0.00145555
[217]	cv_agg's auc: 0.730158 + 0.00145588
[218]	cv_agg's auc: 0.73019 + 0.00149778
[219]	cv_agg's auc: 0.730322 + 0.00152549
[220]	cv_agg's auc: 0.730387 + 0.00157279
[221]	cv_agg's auc: 0.730478 + 0.00151046
[222]	cv_agg's auc: 0.730573 + 0.00145759
[223]	cv_agg's auc: 0.730642 + 0.00145629

[400]	cv_agg's auc: 0.738432 + 0.00123934
[401]	cv_agg's auc: 0.738438 + 0.00124347
[402]	cv_agg's auc: 0.738462 + 0.00127884
[403]	cv_agg's auc: 0.738508 + 0.00130341
[404]	cv_agg's auc: 0.738495 + 0.00132113
[405]	cv_agg's auc: 0.738494 + 0.00135151
[406]	cv_agg's auc: 0.738571 + 0.00133473
[407]	cv_agg's auc: 0.73858 + 0.001344
[408]	cv_agg's auc: 0.738595 + 0.00133812
[409]	cv_agg's auc: 0.738638 + 0.0013579
[410]	cv_agg's auc: 0.738664 + 0.00135238
[411]	cv_agg's auc: 0.738736 + 0.00133412
[412]	cv_agg's auc: 0.738722 + 0.00134073
[413]	cv_agg's auc: 0.738704 + 0.00138661
[414]	cv_agg's auc: 0.738728 + 0.00139762
[415]	cv_agg's auc: 0.738733 + 0.00142565
[416]	cv_agg's auc: 0.738721 + 0.00141875
[417]	cv_agg's auc: 0.738717 + 0.00144178
[418]	cv_agg's auc: 0.738751 + 0.00144016
[419]	cv_agg's auc: 0.73881 + 0.00146807
[420]	cv_agg's auc: 0.738849 + 0.00146547
[421]	cv_agg's auc: 0.738879 + 0.00147435
[422]	cv_agg's auc: 0.738919 + 0.00148721
[423]	cv_agg's auc: 0.738929 + 0.001499

SpectralClustering with n_clusters=2  
Validation : 0.741314  
SpectralClustering with n_clusters=3  
Validation : 0.740755  
SpectralClustering with n_clusters=4 
Validation : 0.740825  
SpectralClustering with n_clusters=10 
Validation : 0.741192


### 2.4 Agglomerative

In [6]:
from sklearn.cluster import AgglomerativeClustering

In [None]:
agglomerative = AgglomerativeClustering(n_clusters=2)
X_agglomerative = X.iloc[:, 0:100]

X_agglomerative['agglomerative_feature'] = agglomerative.fit_predict(X_agglomerative)

In [None]:
lgb_train_agglomerative = lgb.Dataset(X_agglomerative, label=y, free_raw_data=False)

result = lgb.cv(params=parameters, 
                train_set=lgb_train_agglomerative, 
                num_boost_round=n_rounds, 
                folds=skf.split(X_agglomerative, y), 
                early_stopping_rounds=10, 
                verbose_eval=1, 
)

### 2.5 Affinity

In [93]:
from sklearn.cluster import AffinityPropagation

In [94]:
affinity = AffinityPropagation()
X_affinity = X.copy()

X_affinity['affinity_feature'] = affinity.fit_predict(X_affinity)

MemoryError: 

In [None]:
lgb_train_affinity = lgb.Dataset(X_affinity, label=y, free_raw_data=False)

result = lgb.cv(params=parameters, 
                train_set=lgb_train_affinity, 
                num_boost_round=n_rounds, 
                folds=skf.split(X_affinity, y), 
                early_stopping_rounds=10, 
                verbose_eval=1, 
)

# 3. Features Selection

### 3.1 VarianceThreshold

In [6]:
from sklearn.feature_selection import VarianceThreshold

In [140]:
variance = VarianceThreshold(0.010)
X_variance = X.copy()

X_variance = variance.fit_transform(X_variance)

X_variance.shape

(30500, 184)

In [141]:
lgb_train_variance = lgb.Dataset(X_variance, label=y, free_raw_data=False)

result = lgb.cv(params=parameters, 
                train_set=lgb_train_variance, 
                num_boost_round=n_rounds, 
                folds=skf.split(pd.DataFrame(X_variance), y), 
                early_stopping_rounds=10, 
                verbose_eval=1, 
)

[1]	cv_agg's auc: 0.662964 + 0.0069295
[2]	cv_agg's auc: 0.694186 + 0.00144507
[3]	cv_agg's auc: 0.699875 + 0.00173324
[4]	cv_agg's auc: 0.70093 + 0.00174648
[5]	cv_agg's auc: 0.704214 + 0.000618131
[6]	cv_agg's auc: 0.705534 + 0.00108824
[7]	cv_agg's auc: 0.705808 + 0.000994496
[8]	cv_agg's auc: 0.70657 + 0.000870181
[9]	cv_agg's auc: 0.707 + 0.000958192
[10]	cv_agg's auc: 0.707588 + 0.000648508
[11]	cv_agg's auc: 0.707659 + 0.000149942
[12]	cv_agg's auc: 0.707591 + 0.000216653
[13]	cv_agg's auc: 0.70883 + 0.00119952
[14]	cv_agg's auc: 0.70864 + 0.00143827
[15]	cv_agg's auc: 0.709109 + 0.00164416
[16]	cv_agg's auc: 0.709149 + 0.00204762
[17]	cv_agg's auc: 0.709264 + 0.00246031
[18]	cv_agg's auc: 0.709143 + 0.00221229
[19]	cv_agg's auc: 0.709158 + 0.00203656
[20]	cv_agg's auc: 0.709472 + 0.00265598
[21]	cv_agg's auc: 0.709338 + 0.00258947
[22]	cv_agg's auc: 0.709273 + 0.00266718
[23]	cv_agg's auc: 0.709924 + 0.00253615
[24]	cv_agg's auc: 0.710375 + 0.00256015
[25]	cv_agg's auc: 0.71050

[201]	cv_agg's auc: 0.728557 + 0.00187315
[202]	cv_agg's auc: 0.728624 + 0.00189099
[203]	cv_agg's auc: 0.728649 + 0.00188591
[204]	cv_agg's auc: 0.728719 + 0.00183002
[205]	cv_agg's auc: 0.728769 + 0.00190156
[206]	cv_agg's auc: 0.72884 + 0.00188614
[207]	cv_agg's auc: 0.728865 + 0.00191132
[208]	cv_agg's auc: 0.72891 + 0.00188093
[209]	cv_agg's auc: 0.728999 + 0.00187275
[210]	cv_agg's auc: 0.729062 + 0.0018574
[211]	cv_agg's auc: 0.729161 + 0.00184704
[212]	cv_agg's auc: 0.729273 + 0.00181956
[213]	cv_agg's auc: 0.729372 + 0.00179752
[214]	cv_agg's auc: 0.72946 + 0.00179413
[215]	cv_agg's auc: 0.729529 + 0.00174681
[216]	cv_agg's auc: 0.729589 + 0.00179048
[217]	cv_agg's auc: 0.729721 + 0.00170162
[218]	cv_agg's auc: 0.729841 + 0.00167868
[219]	cv_agg's auc: 0.729886 + 0.00171702
[220]	cv_agg's auc: 0.729988 + 0.00172722
[221]	cv_agg's auc: 0.730005 + 0.00171155
[222]	cv_agg's auc: 0.730035 + 0.00166743
[223]	cv_agg's auc: 0.730133 + 0.00165106
[224]	cv_agg's auc: 0.730166 + 0.00166

[398]	cv_agg's auc: 0.738009 + 0.00254169
[399]	cv_agg's auc: 0.738013 + 0.00250265
[400]	cv_agg's auc: 0.738027 + 0.00249947
[401]	cv_agg's auc: 0.738017 + 0.00250534
[402]	cv_agg's auc: 0.738032 + 0.00252879
[403]	cv_agg's auc: 0.738051 + 0.00250159
[404]	cv_agg's auc: 0.73807 + 0.00245968
[405]	cv_agg's auc: 0.73805 + 0.00248831
[406]	cv_agg's auc: 0.738033 + 0.0024767
[407]	cv_agg's auc: 0.738019 + 0.00250779
[408]	cv_agg's auc: 0.738057 + 0.00248939
[409]	cv_agg's auc: 0.738044 + 0.00250445
[410]	cv_agg's auc: 0.738081 + 0.00251943
[411]	cv_agg's auc: 0.738099 + 0.00247387
[412]	cv_agg's auc: 0.738119 + 0.00249145
[413]	cv_agg's auc: 0.738148 + 0.00248971
[414]	cv_agg's auc: 0.738173 + 0.00249623
[415]	cv_agg's auc: 0.738235 + 0.00246004
[416]	cv_agg's auc: 0.738266 + 0.0024657
[417]	cv_agg's auc: 0.738282 + 0.00245996
[418]	cv_agg's auc: 0.738275 + 0.002455
[419]	cv_agg's auc: 0.738347 + 0.00242289
[420]	cv_agg's auc: 0.738347 + 0.00243028
[421]	cv_agg's auc: 0.738394 + 0.0024413

[599]	cv_agg's auc: 0.74112 + 0.00283207
[600]	cv_agg's auc: 0.741127 + 0.00284964
[601]	cv_agg's auc: 0.741126 + 0.00286016
[602]	cv_agg's auc: 0.741153 + 0.00288303
[603]	cv_agg's auc: 0.741159 + 0.00288885
[604]	cv_agg's auc: 0.741162 + 0.00288505
[605]	cv_agg's auc: 0.741193 + 0.0029187
[606]	cv_agg's auc: 0.741228 + 0.00289379
[607]	cv_agg's auc: 0.741241 + 0.00290418
[608]	cv_agg's auc: 0.741272 + 0.00291238
[609]	cv_agg's auc: 0.741287 + 0.00291268
[610]	cv_agg's auc: 0.7413 + 0.00290998
[611]	cv_agg's auc: 0.741303 + 0.00293244
[612]	cv_agg's auc: 0.741327 + 0.00294455
[613]	cv_agg's auc: 0.741344 + 0.00297447
[614]	cv_agg's auc: 0.741317 + 0.00298116
[615]	cv_agg's auc: 0.74135 + 0.00300115
[616]	cv_agg's auc: 0.741357 + 0.002994
[617]	cv_agg's auc: 0.741376 + 0.00297934
[618]	cv_agg's auc: 0.741361 + 0.00298044
[619]	cv_agg's auc: 0.741379 + 0.00296178
[620]	cv_agg's auc: 0.741415 + 0.00295158
[621]	cv_agg's auc: 0.741427 + 0.00292717
[622]	cv_agg's auc: 0.741411 + 0.00293358

In [142]:
lgb_train_variance = lgb.Dataset(pd.DataFrame(X_variance), label=y, free_raw_data=False)
check_train_score(parameters, lgb_train_variance, pd.DataFrame(X_variance), y, skf, 690)

array([0.86684548, 0.74191795])

In [145]:
X_test_variance = variance.transform(test[features])

parameters_var = {
    'objective': 'binary',
    'learning_rate': 0.01,
    'max_depth': 8,
    'num_threads': 4,
    'metric': 'auc',
    'seed': 42,
    'n_estimators': 690,
    
        #regularization
    'colsample_bytree': 0.65,
    'subsample': 0.8,
    'subsample_freq': 2,
    'min_data_in_leaf': 20,
}

model = LGBMClassifier(**parameters_var)

model.fit(X_variance, y)

y_pred_lgb_var = model.predict_proba(X_test_variance)[:, 1]

test_df = test[['Unnamed: 0', '0']]

test_df['0'] = y_pred_lgb_var

test_df.to_csv('Variance.csv', index=False)

VarianceTreshold(0.005) -> Validation : 0.74154313  
VarianceTreshold(0.010) -> Validation : 0.74191795 -> LB : 0.75942757

### 3.2 SelectKBest

In [61]:
from sklearn.feature_selection import SelectKBest, f_classif

In [146]:
kbest = SelectKBest(f_classif, k=220)
X_kbest = X.copy()

X_kbest = kbest.fit_transform(X_kbest, y)

X_kbest.shape

(30500, 220)

In [147]:
lgb_train_kbest = lgb.Dataset(X_kbest, label=y, free_raw_data=False)

result = lgb.cv(params=parameters, 
                train_set=lgb_train_kbest, 
                num_boost_round=n_rounds, 
                folds=skf.split(pd.DataFrame(X_kbest), y), 
                early_stopping_rounds=10, 
                verbose_eval=1, 
)

[1]	cv_agg's auc: 0.666385 + 0.00277014
[2]	cv_agg's auc: 0.694398 + 0.00232483
[3]	cv_agg's auc: 0.701161 + 0.00196066
[4]	cv_agg's auc: 0.700968 + 0.00166277
[5]	cv_agg's auc: 0.704632 + 0.00212273
[6]	cv_agg's auc: 0.705637 + 0.00379021
[7]	cv_agg's auc: 0.705662 + 0.00268601
[8]	cv_agg's auc: 0.706991 + 0.00382434
[9]	cv_agg's auc: 0.7086 + 0.00426126
[10]	cv_agg's auc: 0.708957 + 0.00416108
[11]	cv_agg's auc: 0.708088 + 0.0040817
[12]	cv_agg's auc: 0.708766 + 0.00435704
[13]	cv_agg's auc: 0.708394 + 0.00469359
[14]	cv_agg's auc: 0.708446 + 0.00465594
[15]	cv_agg's auc: 0.708392 + 0.00469136
[16]	cv_agg's auc: 0.709031 + 0.00441739
[17]	cv_agg's auc: 0.709064 + 0.00394834
[18]	cv_agg's auc: 0.709278 + 0.00398895
[19]	cv_agg's auc: 0.709492 + 0.004011
[20]	cv_agg's auc: 0.709586 + 0.0038562
[21]	cv_agg's auc: 0.709646 + 0.0036281
[22]	cv_agg's auc: 0.709676 + 0.00357004
[23]	cv_agg's auc: 0.710283 + 0.00371446
[24]	cv_agg's auc: 0.710274 + 0.00363539
[25]	cv_agg's auc: 0.710544 + 0.

[200]	cv_agg's auc: 0.728051 + 0.00266065
[201]	cv_agg's auc: 0.728159 + 0.00262299
[202]	cv_agg's auc: 0.728188 + 0.00258278
[203]	cv_agg's auc: 0.728245 + 0.00263781
[204]	cv_agg's auc: 0.728367 + 0.00267428
[205]	cv_agg's auc: 0.728351 + 0.0027264
[206]	cv_agg's auc: 0.728405 + 0.00267734
[207]	cv_agg's auc: 0.728454 + 0.00268064
[208]	cv_agg's auc: 0.728509 + 0.00268009
[209]	cv_agg's auc: 0.728549 + 0.00267944
[210]	cv_agg's auc: 0.728628 + 0.00270992
[211]	cv_agg's auc: 0.728748 + 0.00269655
[212]	cv_agg's auc: 0.728916 + 0.00270717
[213]	cv_agg's auc: 0.729021 + 0.00270319
[214]	cv_agg's auc: 0.729153 + 0.00271837
[215]	cv_agg's auc: 0.729242 + 0.00270537
[216]	cv_agg's auc: 0.72929 + 0.00269893
[217]	cv_agg's auc: 0.729376 + 0.00266828
[218]	cv_agg's auc: 0.729453 + 0.0026489
[219]	cv_agg's auc: 0.729567 + 0.00265524
[220]	cv_agg's auc: 0.729615 + 0.00263692
[221]	cv_agg's auc: 0.729705 + 0.00260775
[222]	cv_agg's auc: 0.729752 + 0.00256998
[223]	cv_agg's auc: 0.729791 + 0.0025

[398]	cv_agg's auc: 0.737843 + 0.00193967
[399]	cv_agg's auc: 0.73788 + 0.00198737
[400]	cv_agg's auc: 0.737883 + 0.00196785
[401]	cv_agg's auc: 0.737891 + 0.0019891
[402]	cv_agg's auc: 0.737932 + 0.00200034
[403]	cv_agg's auc: 0.737941 + 0.00197638
[404]	cv_agg's auc: 0.737946 + 0.00194199
[405]	cv_agg's auc: 0.737964 + 0.00195489
[406]	cv_agg's auc: 0.737983 + 0.00196253
[407]	cv_agg's auc: 0.738011 + 0.0019505
[408]	cv_agg's auc: 0.738024 + 0.0019477
[409]	cv_agg's auc: 0.738062 + 0.00195057
[410]	cv_agg's auc: 0.738101 + 0.00194384
[411]	cv_agg's auc: 0.738098 + 0.00193556
[412]	cv_agg's auc: 0.738126 + 0.00192716
[413]	cv_agg's auc: 0.738142 + 0.00193903
[414]	cv_agg's auc: 0.738185 + 0.00195187
[415]	cv_agg's auc: 0.738218 + 0.00195633
[416]	cv_agg's auc: 0.73822 + 0.00195917
[417]	cv_agg's auc: 0.73823 + 0.00195897
[418]	cv_agg's auc: 0.738228 + 0.00199497
[419]	cv_agg's auc: 0.738235 + 0.00195651
[420]	cv_agg's auc: 0.738244 + 0.00195384
[421]	cv_agg's auc: 0.738257 + 0.0019395

[595]	cv_agg's auc: 0.741241 + 0.00140948
[596]	cv_agg's auc: 0.741254 + 0.00141878
[597]	cv_agg's auc: 0.74127 + 0.00143974
[598]	cv_agg's auc: 0.741273 + 0.0014513
[599]	cv_agg's auc: 0.741295 + 0.00144653
[600]	cv_agg's auc: 0.741266 + 0.00143065
[601]	cv_agg's auc: 0.741249 + 0.00142331
[602]	cv_agg's auc: 0.741223 + 0.0014271
[603]	cv_agg's auc: 0.741261 + 0.00144041
[604]	cv_agg's auc: 0.74128 + 0.0014239
[605]	cv_agg's auc: 0.741294 + 0.0013991
[606]	cv_agg's auc: 0.741327 + 0.00138394
[607]	cv_agg's auc: 0.741334 + 0.001397
[608]	cv_agg's auc: 0.741345 + 0.00139804
[609]	cv_agg's auc: 0.741371 + 0.00138007
[610]	cv_agg's auc: 0.741397 + 0.00138057
[611]	cv_agg's auc: 0.741423 + 0.00142592
[612]	cv_agg's auc: 0.741434 + 0.00143653
[613]	cv_agg's auc: 0.741453 + 0.0014475
[614]	cv_agg's auc: 0.741458 + 0.00144078
[615]	cv_agg's auc: 0.741464 + 0.00148443
[616]	cv_agg's auc: 0.741465 + 0.00151951
[617]	cv_agg's auc: 0.741472 + 0.00152781
[618]	cv_agg's auc: 0.741465 + 0.00154088
[

[795]	cv_agg's auc: 0.742845 + 0.0012647
[796]	cv_agg's auc: 0.742873 + 0.00129216
[797]	cv_agg's auc: 0.742873 + 0.00132165
[798]	cv_agg's auc: 0.742866 + 0.00133134
[799]	cv_agg's auc: 0.742874 + 0.00133588
[800]	cv_agg's auc: 0.742873 + 0.00132722
[801]	cv_agg's auc: 0.742859 + 0.00133944
[802]	cv_agg's auc: 0.742897 + 0.00133769
[803]	cv_agg's auc: 0.742887 + 0.00132748
[804]	cv_agg's auc: 0.742912 + 0.00134766
[805]	cv_agg's auc: 0.74292 + 0.00136881
[806]	cv_agg's auc: 0.742919 + 0.00139583
[807]	cv_agg's auc: 0.742931 + 0.00138992
[808]	cv_agg's auc: 0.74295 + 0.00137671
[809]	cv_agg's auc: 0.742951 + 0.00138555
[810]	cv_agg's auc: 0.742948 + 0.00137231
[811]	cv_agg's auc: 0.742953 + 0.00136794
[812]	cv_agg's auc: 0.742955 + 0.00138886
[813]	cv_agg's auc: 0.742943 + 0.00136787
[814]	cv_agg's auc: 0.742953 + 0.00136089
[815]	cv_agg's auc: 0.742943 + 0.00136157
[816]	cv_agg's auc: 0.742942 + 0.00135412
[817]	cv_agg's auc: 0.742963 + 0.00134617
[818]	cv_agg's auc: 0.742956 + 0.0013

In [148]:
lgb_train_kbest = lgb.Dataset(pd.DataFrame(X_kbest), label=y, free_raw_data=False)
check_train_score(parameters, lgb_train_kbest, pd.DataFrame(X_kbest), y, skf, 827)

array([0.88339355, 0.74296296])

In [149]:
parameters_kbest = {
    'objective': 'binary',
    'learning_rate': 0.01,
    'max_depth': 8,
    'num_threads': 4,
    'metric': 'auc',
    'seed': 42,
    'n_estimators': 827,
    
        #regularization
    'colsample_bytree': 0.65,
    'subsample': 0.8,
    'subsample_freq': 2,
    'min_data_in_leaf': 20,
}

model = LGBMClassifier(**parameters_kbest)

In [151]:
model.fit(X_kbest, y)

X_test_kbest = kbest.transform(test[features])

y_pred_lgb_kbest = model.predict_proba(X_test_kbest)[:, 1]

test_df = test[['Unnamed: 0', '0']]

test_df['0'] = y_pred_lgb_kbest

test_df.to_csv('KBest.csv', index=False)

SelectKBest(f_classif, k=200) -> Validation: 0.74243861  
SelectKBest(f_classif, k=220) -> Validation: 0.74296296 -> LB : 0.75098050

### 3.3 SequentialFeatureSelector

In [5]:
from mlxtend.feature_selection import SequentialFeatureSelector
from lightgbm.sklearn import LGBMClassifier

from sklearn.linear_model import LogisticRegression

In [8]:
from sklearn.model_selection import cross_val_score

score = cross_val_score(
    estimator=LogisticRegression(),
    X=X,
    y=y,
    cv=skf,
    scoring='roc_auc',
    n_jobs=-1
)

np.mean(score)

0.7245851565835882

In [7]:
%%time

selector = SequentialFeatureSelector(
    LogisticRegression(), 
    scoring='roc_auc', 
    verbose=1, 
    k_features=342, 
    forward=False, 
    n_jobs=-1,
    cv=skf,
)

selector.fit(X.values, y.values)
print(selector.k_score_)

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 11.2min
[Parallel(n_jobs=-1)]: Done 343 out of 343 | elapsed: 20.5min finished
Features: 342/342

0.7250287069970364
CPU times: user 1min 41s, sys: 1.6 s, total: 1min 43s
Wall time: 20min 39s


# 4. Target encoding of categorical features

1:3 -> first feature  
4:6 -> second feature     !
14:39 -> third feature    !
128:193 -> fourth feature !  
216:220 -> fifth feature  !
224:310 -> sixth feature  !

In [385]:
X.loc[:, '1':'3'].apply(np.sum, axis=1).value_counts()

1    30500
dtype: int64

In [386]:
temp = pd.DataFrame([x for x in np.where(X.loc[:, '1':'3'] == 1, X.loc[:, '1':'3'].columns, '').flatten().tolist() if len(x) >0],columns= (["new"]) )


In [389]:
X_new = X.copy()
print(X_new.shape)

X_new.drop(['1', '2', '3'], axis=1, inplace=True)

X_new['OHE_inverse'] = temp['new']

print(X_new.shape)

(30500, 345)
(30500, 343)


In [390]:
import time


class SemenovEncoding:
    def __init__(self, C=10):
        self.C = C
        self.cpu_k = 3
        self.global_mean = 0
        self.features = 'all'
        self.cat_columns = []
        self.y = 0
        self.values = dict()

    def fit(self, data, y, features='all'):

        self.y = y
        
        if features == 'all':
            self.cat_columns = sorted([i for i in data.columns if data[i].dtype == 'O'])
            self.features = self.cat_columns
        else:
            self.features = features

        self.global_mean = np.mean(y)

        f = {'y': ['size', 'mean']}

        for col in self.features:
            self.values[col] = dict()
            temp = pd.DataFrame({'y': y, col: data[col]}).groupby([col]).agg(f)

            self.values[col] = (
                (temp['y']['mean'] * temp['y']['size'] + self.global_mean * self.C) / 
                (temp['y']['size'] + self.C)
            ).to_dict()
            
        return self.values

    def fit_transform(self, data, y, features='all', inplace=True):

        self.fit(data, y, features)
        return self.transform(data, inplace=inplace)

    def transform(self, data, inplace=True):
        import warnings

        if inplace:
            for col in self.values:
                if col in data.columns:
                    temp = pd.DataFrame.from_dict(
                        self.values[col], orient='index').reset_index()
                    temp.columns = [col, 'value']
                    data = pd.merge(data, temp, how='left').fillna(self.global_mean)
                    data[col] = data['value']
                    del data['value']
                    data[col] = data[col].astype('float32')

                else:
                    warnings.warn('Column ' + col + ' is missed in this dataset.')
        else:
            new_data = data.copy()
            for col in self.values:
                if col in new_data.columns:
                    temp = pd.DataFrame.from_dict(
                        self.values[col], orient='index').reset_index()
                    temp.columns = [col, 'value']
                    new_data = pd.merge(
                        new_data, temp, how='left').fillna(self.global_mean)
                    new_data[col] = new_data['value']
                    del new_data['value']
                    new_data[col] = new_data[col].astype('float32')

                else:
                    warnings.warn('Column ' + col + ' is missed in this dataset.')
            return new_data

In [394]:
def create_new_df_with_categorical_encodings(new_train, new_train_y, new_val, cols):
    se = SemenovEncoding()
    new_skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    val_dfs = []
    # 2)
    for new_train_split, new_val_split in new_skf.split(new_train, new_train_y):
        # 3)
        se.fit(
            new_train.iloc[new_train_split], 
            new_train_y.iloc[new_train_split], 
            features=cols
        )
        val_dfs.append(
            se.transform(new_train.iloc[new_val_split], inplace=False)
        )
    # 4)
    se.fit(new_train, new_train_y, features=cols)
    main_val = se.transform(new_val, inplace=False)
    return val_dfs, main_val

In [395]:
%%time

new_train_dfs = []
new_val_dfs = []
main_train_dfs = []

for train_split, val_split in skf.split(X_new, y): 
    # 5)
    temp_train_dfs, temp_val_df = create_new_df_with_categorical_encodings(
        X_new.iloc[train_split], 
        y.iloc[train_split], 
        X_new.iloc[val_split], 
        ['OHE_inverse']
    )
    # 6)
    new_train_dfs.append(temp_train_dfs)
    new_val_dfs.append(temp_val_df)
    # 7)
    se = SemenovEncoding()
    se.fit(X_new.iloc[train_split], y.iloc[train_split], ['OHE_inverse'])
    main_train_dfs.append(
        se.transform(X_new.iloc[val_split], inplace=False)
    )
    
se.fit(X_new, y, features=['OHE_inverse'])

CPU times: user 1.34 s, sys: 76 ms, total: 1.42 s
Wall time: 1.56 s


In [434]:
new_val_dfs[2]["OHE_inverse"].value_counts()

0.179992    9953
0.087320     212
0.127022       1
Name: OHE_inverse, dtype: int64

In [435]:
temp = pd.concat([X_new.iloc[train_split], y.iloc[train_split]], axis=1)
temp.groupby(["OHE_inverse"])["0"].mean()

OHE_inverse
1    0.179993
2    0.085339
3    0.000000
Name: 0, dtype: float64

In [436]:
temp.groupby(["OHE_inverse"])["0"].size()

OHE_inverse
1    19873
2      457
3        4
Name: 0, dtype: int64

In [437]:
temp["0"].mean()

0.17783023507425985

In [438]:
(
    (0.179992 * 19873 + 10*0.17783023507425985) / (10 + 19873),
    (0.087320 * 457 + 10*0.17783023507425985) / (10 + 457),
    (0.127022 * 4 + 10*0.17783023507425985) / (10 + 4)
)

(0.17999091275716658, 0.08925812066540172, 0.16331359648161417)

# 5. xgbfir

In [468]:
# !pip install xgbfir

Collecting xgbfir
  Downloading xgbfir-0.3.1-py2.py3-none-any.whl
Collecting xlsxwriter>=0.9.3 (from xgbfir)
  Downloading XlsxWriter-1.0.2-py2.py3-none-any.whl (139kB)
[K    100% |████████████████████████████████| 143kB 1.4MB/s ta 0:00:01
[?25hInstalling collected packages: xlsxwriter, xgbfir
Successfully installed xgbfir-0.3.1 xlsxwriter-1.0.2


In [469]:
import xgbfir
import xgboost as xgb

In [480]:
xgb_train = xgb.DMatrix(X, label=y, feature_names=X.columns)

In [490]:
params_xgb = {
   
    'objective': 'binary:logistic',
    'eta': 0.01,
    'silent': 1,
    "nthread": 4,
    "random_seed": 17,
    "eval_metric": 'auc',
   
    
    'max_depth':  8,
    'max_leaves': 75,
    'subsample': 0.85, 
    'colsample_bytree': 0.66,

    'tree_method': 'hist',
    'grow_policy': 'lossguide'
   
}

In [491]:
results = xgb.cv(
    parameters_xgb, 
    xgb_train, 
    n_rounds, 
    early_stopping_rounds=10,
    folds=skf, 
    verbose_eval=1
)

[0]	train-auc:0.713341+0.000717048	test-auc:0.682254+0.00315133
[1]	train-auc:0.736725+0.0039925	test-auc:0.698479+0.00566366
[2]	train-auc:0.744438+0.00559581	test-auc:0.705604+0.00311531
[3]	train-auc:0.749229+0.00542606	test-auc:0.706829+0.00405604
[4]	train-auc:0.754909+0.00491976	test-auc:0.70976+0.00371571
[5]	train-auc:0.757913+0.00579971	test-auc:0.711154+0.00279469
[6]	train-auc:0.760146+0.00630635	test-auc:0.713131+0.00262315
[7]	train-auc:0.761047+0.00636517	test-auc:0.713129+0.00293988
[8]	train-auc:0.762486+0.00619109	test-auc:0.713865+0.00342701
[9]	train-auc:0.763857+0.00642995	test-auc:0.714989+0.00322021
[10]	train-auc:0.764929+0.00692233	test-auc:0.715175+0.00332381
[11]	train-auc:0.766037+0.00614633	test-auc:0.71548+0.0034476
[12]	train-auc:0.767531+0.00543102	test-auc:0.71599+0.00357236
[13]	train-auc:0.768271+0.00511832	test-auc:0.716127+0.0036519
[14]	train-auc:0.768363+0.00485874	test-auc:0.716327+0.00386521
[15]	train-auc:0.76906+0.00550445	test-auc:0.716575+0.0

[129]	train-auc:0.811636+0.00219773	test-auc:0.727255+0.00335052
[130]	train-auc:0.811942+0.00215236	test-auc:0.727346+0.00331241
[131]	train-auc:0.812203+0.00219203	test-auc:0.727444+0.00325793
[132]	train-auc:0.812501+0.00204983	test-auc:0.727518+0.00331004
[133]	train-auc:0.812896+0.00191474	test-auc:0.727656+0.00335158
[134]	train-auc:0.813314+0.00199438	test-auc:0.727663+0.00329686
[135]	train-auc:0.813782+0.00198992	test-auc:0.727736+0.0032049
[136]	train-auc:0.814097+0.00195161	test-auc:0.727802+0.00318511
[137]	train-auc:0.814359+0.00191976	test-auc:0.727879+0.00323984
[138]	train-auc:0.814778+0.00188369	test-auc:0.727848+0.00326314
[139]	train-auc:0.815085+0.00188314	test-auc:0.727889+0.00326055
[140]	train-auc:0.815392+0.0019821	test-auc:0.728019+0.0032074
[141]	train-auc:0.815788+0.0019644	test-auc:0.728001+0.00319549
[142]	train-auc:0.816157+0.00206371	test-auc:0.728064+0.00316945
[143]	train-auc:0.816734+0.00222821	test-auc:0.728163+0.00318489
[144]	train-auc:0.817073+0.00

[256]	train-auc:0.856667+0.00113131	test-auc:0.735516+0.00177748
[257]	train-auc:0.856968+0.00111959	test-auc:0.735567+0.00174989
[258]	train-auc:0.85723+0.00110392	test-auc:0.735584+0.00175776
[259]	train-auc:0.857467+0.00113275	test-auc:0.735617+0.00174202
[260]	train-auc:0.857752+0.00112578	test-auc:0.73561+0.00175136
[261]	train-auc:0.857985+0.00113307	test-auc:0.735728+0.00174608
[262]	train-auc:0.858318+0.00115444	test-auc:0.73578+0.00179927
[263]	train-auc:0.858677+0.00113166	test-auc:0.735769+0.00181939
[264]	train-auc:0.859034+0.00105166	test-auc:0.735801+0.00181245
[265]	train-auc:0.8594+0.00105017	test-auc:0.735811+0.00180909
[266]	train-auc:0.85977+0.0010485	test-auc:0.735851+0.00178083
[267]	train-auc:0.860126+0.00101828	test-auc:0.735916+0.00176477
[268]	train-auc:0.860461+0.000970427	test-auc:0.735945+0.00174534
[269]	train-auc:0.860768+0.000972861	test-auc:0.735956+0.00169108
[270]	train-auc:0.861066+0.000997198	test-auc:0.736004+0.00165798
[271]	train-auc:0.861331+0.00

[382]	train-auc:0.890724+0.00104605	test-auc:0.739629+0.00101089
[383]	train-auc:0.890948+0.00101269	test-auc:0.739658+0.000983483
[384]	train-auc:0.891194+0.00105418	test-auc:0.739654+0.000974658
[385]	train-auc:0.891403+0.00104464	test-auc:0.739671+0.00099917
[386]	train-auc:0.891656+0.00108743	test-auc:0.739671+0.00103309
[387]	train-auc:0.891866+0.00106367	test-auc:0.739701+0.00106636
[388]	train-auc:0.892064+0.0010842	test-auc:0.739676+0.00107556
[389]	train-auc:0.892275+0.0010534	test-auc:0.739671+0.0010844
[390]	train-auc:0.892536+0.00108669	test-auc:0.739667+0.00107606
[391]	train-auc:0.892688+0.00110041	test-auc:0.739707+0.00107771
[392]	train-auc:0.892935+0.00112664	test-auc:0.739763+0.00111166
[393]	train-auc:0.893173+0.00119841	test-auc:0.739744+0.0011038
[394]	train-auc:0.893458+0.00125019	test-auc:0.739797+0.00110852
[395]	train-auc:0.893682+0.00123135	test-auc:0.739848+0.00109433
[396]	train-auc:0.893878+0.00121219	test-auc:0.739845+0.00115808
[397]	train-auc:0.894075+0.

[509]	train-auc:0.914388+0.00137673	test-auc:0.741176+0.000810866
[510]	train-auc:0.914471+0.0013939	test-auc:0.741162+0.00081417
[511]	train-auc:0.914614+0.00143606	test-auc:0.741221+0.000785359
[512]	train-auc:0.914747+0.00145332	test-auc:0.741219+0.000806117
[513]	train-auc:0.914909+0.00138997	test-auc:0.741238+0.000800875
[514]	train-auc:0.915028+0.001398	test-auc:0.74124+0.00081928
[515]	train-auc:0.915198+0.00136786	test-auc:0.741243+0.000813909
[516]	train-auc:0.915367+0.00137247	test-auc:0.741249+0.000824002
[517]	train-auc:0.91555+0.00140696	test-auc:0.741224+0.00085286
[518]	train-auc:0.915811+0.00138857	test-auc:0.741225+0.000843616
[519]	train-auc:0.915993+0.00131699	test-auc:0.741232+0.000834851
[520]	train-auc:0.916176+0.00135917	test-auc:0.741232+0.000819613
[521]	train-auc:0.916308+0.00140536	test-auc:0.741236+0.000830325
[522]	train-auc:0.916482+0.0013968	test-auc:0.741253+0.000806578
[523]	train-auc:0.916651+0.00136795	test-auc:0.741239+0.000823984
[524]	train-auc:0.9

In [492]:
model = xgb.train(parameters, xgb_train, num_boost_round=561)

In [494]:
xgbfir.saveXgbFI(
    model, 
    feature_names=X.columns, 
    OutputXlsxFile="xgbfir_importance.xlsx"
)

In [560]:
X_train = X.copy()
X_train['329|91'] = X['329'] / X['91'] 

X_test = test[features]
X_test['329|91'] = X_test['329'] / X_test['91'] 

In [561]:
xgb_train_new = xgb.DMatrix(X_train, label=y, feature_names=X_train.columns)

results = xgb.cv(
    parameters_xgb, 
    xgb_train_new, 
    n_rounds, 
    early_stopping_rounds=10,
    folds=skf, 
    verbose_eval=1
)

[0]	train-auc:0.712914+0.00132856	test-auc:0.680349+0.00485402
[1]	train-auc:0.737217+0.00199767	test-auc:0.701095+0.003564
[2]	train-auc:0.744956+0.00161182	test-auc:0.704852+0.00285755
[3]	train-auc:0.751331+0.0018358	test-auc:0.706665+0.00306401
[4]	train-auc:0.754728+0.00231316	test-auc:0.708707+0.00317447
[5]	train-auc:0.756182+0.00289708	test-auc:0.710391+0.00340363
[6]	train-auc:0.758946+0.00312612	test-auc:0.711858+0.0034207
[7]	train-auc:0.760359+0.00325374	test-auc:0.712394+0.00305912
[8]	train-auc:0.761411+0.00320015	test-auc:0.712657+0.00271049
[9]	train-auc:0.761966+0.00356929	test-auc:0.712598+0.00260304
[10]	train-auc:0.762262+0.00309739	test-auc:0.713127+0.00304597
[11]	train-auc:0.763376+0.00252342	test-auc:0.714138+0.00371196
[12]	train-auc:0.764628+0.00237462	test-auc:0.714907+0.00418039
[13]	train-auc:0.765719+0.00288866	test-auc:0.715553+0.00440156
[14]	train-auc:0.766465+0.00198094	test-auc:0.716072+0.00480872
[15]	train-auc:0.76705+0.00179956	test-auc:0.715991+0.

[129]	train-auc:0.810724+0.000935005	test-auc:0.72677+0.00330603
[130]	train-auc:0.811076+0.000814993	test-auc:0.726767+0.00329093
[131]	train-auc:0.811274+0.000865278	test-auc:0.726839+0.00326745
[132]	train-auc:0.811615+0.000918884	test-auc:0.726921+0.00328103
[133]	train-auc:0.812067+0.00103816	test-auc:0.727041+0.00326131
[134]	train-auc:0.812413+0.0009712	test-auc:0.727163+0.00329254
[135]	train-auc:0.812817+0.00111545	test-auc:0.727208+0.00334795
[136]	train-auc:0.813092+0.00117679	test-auc:0.727257+0.00336719
[137]	train-auc:0.813544+0.00101806	test-auc:0.727297+0.0032679
[138]	train-auc:0.81385+0.00105149	test-auc:0.727324+0.00324553
[139]	train-auc:0.814134+0.00105185	test-auc:0.727321+0.00322518
[140]	train-auc:0.814435+0.00103882	test-auc:0.727328+0.00320184
[141]	train-auc:0.814814+0.00106445	test-auc:0.727471+0.00321922
[142]	train-auc:0.815302+0.000898774	test-auc:0.727597+0.00325459
[143]	train-auc:0.815717+0.000917032	test-auc:0.72766+0.0031769
[144]	train-auc:0.816288+

[255]	train-auc:0.855832+0.00121312	test-auc:0.735012+0.00162442
[256]	train-auc:0.856145+0.00122243	test-auc:0.735056+0.00157496
[257]	train-auc:0.856439+0.00126803	test-auc:0.735135+0.00162473
[258]	train-auc:0.856697+0.00127092	test-auc:0.735148+0.00161302
[259]	train-auc:0.857079+0.00126752	test-auc:0.735213+0.00160701
[260]	train-auc:0.857433+0.00121973	test-auc:0.735218+0.00165646
[261]	train-auc:0.857752+0.00122016	test-auc:0.735194+0.00169829
[262]	train-auc:0.858015+0.00119555	test-auc:0.735258+0.00167097
[263]	train-auc:0.858271+0.00114959	test-auc:0.735265+0.00166516
[264]	train-auc:0.858638+0.00114173	test-auc:0.735294+0.00168489
[265]	train-auc:0.85895+0.00115177	test-auc:0.735353+0.00169737
[266]	train-auc:0.859279+0.00112349	test-auc:0.735379+0.00168979
[267]	train-auc:0.859556+0.00114674	test-auc:0.73539+0.0017009
[268]	train-auc:0.859947+0.00112307	test-auc:0.735424+0.00164993
[269]	train-auc:0.860228+0.00111802	test-auc:0.735427+0.00167758
[270]	train-auc:0.860521+0.0

[381]	train-auc:0.890309+0.000538414	test-auc:0.739135+0.00129266
[382]	train-auc:0.890569+0.000520875	test-auc:0.73912+0.00133044
[383]	train-auc:0.890856+0.000584355	test-auc:0.739097+0.00132544
[384]	train-auc:0.891106+0.00063822	test-auc:0.73913+0.00133482
[385]	train-auc:0.891365+0.000655591	test-auc:0.739157+0.00136876
[386]	train-auc:0.891603+0.000678147	test-auc:0.739204+0.00136453
[387]	train-auc:0.89181+0.000689923	test-auc:0.73921+0.00137889
[388]	train-auc:0.892096+0.000690884	test-auc:0.739237+0.00139695
[389]	train-auc:0.892363+0.000719608	test-auc:0.739255+0.00137378
[390]	train-auc:0.892577+0.000727093	test-auc:0.739241+0.00133648
[391]	train-auc:0.892782+0.000688375	test-auc:0.739244+0.00133391
[392]	train-auc:0.892991+0.000611687	test-auc:0.73928+0.00136591
[393]	train-auc:0.893178+0.000586254	test-auc:0.739295+0.00138398
[394]	train-auc:0.89339+0.000546307	test-auc:0.739326+0.00141881
[395]	train-auc:0.893659+0.000539315	test-auc:0.739373+0.00145896
[396]	train-auc:0

Validation Score for Baseline XGB model better then for model fitted on selected features via xgbfi :(

LB on baseline XGB : 0.76017604  
LB with xgbfi feature : 0.75646361

In [562]:
xgb_model = xgb.train(parameters_xgb, xgb_train_new, 475)

In [563]:
xgb_test = xgb.DMatrix(X_test)

xgb_predictions = xgb_model.predict(xgb_test)

In [564]:
test_df = test[['Unnamed: 0', '0']]

test_df['0'] = xgb_predictions

test_df.to_csv('XGB_baseline.csv', index=False)