In [None]:
import matplotlib.pyplot as plt

import lightgbm as lgb
import pandas as pd
import numpy as np

from datetime import datetime
from pathlib import Path

from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.semi_supervised import LabelPropagation, LabelSpreading
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder

from bayes_opt import BayesianOptimization

import warnings

%matplotlib inline

In [8]:
train__df = pd.read_csv('data/train.csv').drop("ID_code",axis=1)
test = pd.read_csv('data/test.csv').drop("ID_code",axis=1)

In [3]:
f = [x for x  in train.columns if x != 'target']
columns_cnt = []
cat_features = []
for column in f:
    columns_cnt.append(len(train[column].unique()))
    if (len(train[column].unique()) / len(train) < .05):
        cat_features.append(column)
cat_features

['var_12', 'var_68', 'var_91', 'var_103', 'var_108']

In [4]:
X = train.iloc[:, 1:]
y = train.iloc[:, 0]

In [5]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.20)
X_val, X_test, y_val, y_test = train_test_split(X_val, y_val, test_size=0.50)

In [6]:
param = {
   "objective" : "binary",
    "metric" : "auc",
    "boosting": 'gbdt',
    "max_depth" : -1,
    "num_leaves" : 13,
    "learning_rate" : 0.01,
    "bagging_freq": 5,
    "bagging_fraction" : 0.4,
    "feature_fraction" : 0.05,
    "min_data_in_leaf": 80,
    "min_sum_heassian_in_leaf": 10,
    "tree_learner": "serial",
    "boost_from_average": "false",
    "bagging_seed" : 10,
    "verbosity" : 1,
}

In [7]:
bounds_LGB = {
    'num_leaves': (5, 20), 
    'min_data_in_leaf': (5, 20),  
    'learning_rate': (0.01, 0.3),
    'min_sum_hessian_in_leaf': (0.00001, 0.01),    
    'feature_fraction': (0.05, 0.5),
    'lambda_l1': (0, 5.0), 
    'lambda_l2': (0, 5.0), 
    'min_gain_to_split': (0, 1.0),
    'max_depth':(3,15),
}

In [29]:
def LGB_bayesian(
    num_leaves:int,
    min_data_in_leaf:int,  
    learning_rate:int,
    min_sum_hessian_in_leaf:int,    
    feature_fraction:int,
    lambda_l1:int,
    lambda_l2:int,
    min_gain_to_split:int,
    max_depth:int):
    
    # LightGBM expects next three parameters need to be integer. So we make them integer
    num_leaves = int(num_leaves)
    min_data_in_leaf = int(min_data_in_leaf)
    max_depth = int(max_depth)

    assert type(num_leaves) == int
    assert type(min_data_in_leaf) == int
    assert type(max_depth) == int

    param = {
        'num_leaves': num_leaves,
        'max_bin': 63,
        'min_data_in_leaf': min_data_in_leaf,
        'learning_rate': learning_rate,
        'min_sum_hessian_in_leaf': min_sum_hessian_in_leaf,
        'bagging_fraction': 1.0,
        'bagging_freq': 5,
        'feature_fraction': feature_fraction,
        'lambda_l1': lambda_l1,
        'lambda_l2': lambda_l2,
        'min_gain_to_split': min_gain_to_split,
        'max_depth': -1,
        'save_binary': True, 
        'seed': 1337,
        'feature_fraction_seed': 1337,
        'bagging_seed': 1337,
        'drop_seed': 1337,
         "metric" : "auc",
        'data_random_seed': 1337,
        'objective': 'binary',
        'boosting_type': 'gbdt',
        'verbose': 1,
        'metric': 'auc',
        'is_unbalance': True,
        'boost_from_average': False,   
        'categorical_feature': [68],
        "tree_learner": "serial",
        "boost_from_average": "false",
        "bagging_seed" : 10,
        "verbosity" : 1,

    }    
    
    
    xg_train = lgb.Dataset(X_train.values,
                           label=y_train.values,
                           free_raw_data = False
                           )
    xg_valid = lgb.Dataset(X_test.values,
                           label=y_test.values,
                           free_raw_data = False
                           )   

    num_round = 5000
    clf = lgb.train(param, xg_train, num_round, valid_sets = [xg_valid], verbose_eval=250, 
                    early_stopping_rounds = 100)
    
    predictions = clf.predict(X_test, num_iteration=clf.best_iteration)   
    
    score = roc_auc_score(y_test, predictions)
    
    return score

In [30]:
LGB_BO = BayesianOptimization(LGB_bayesian, bounds_LGB, random_state=13)

In [31]:
init_points = 5
n_iter = 5

In [32]:
with warnings.catch_warnings():
    warnings.filterwarnings('ignore')
    LGB_BO.maximize(init_points=init_points, n_iter=n_iter, acq='ucb', xi=0.0, alpha=1e-6)

|   iter    |  target   | featur... | lambda_l1 | lambda_l2 | learni... | max_depth | min_da... | min_ga... | min_su... | num_le... |
-------------------------------------------------------------------------------------------------------------------------------------
Training until validation scores don't improve for 100 rounds.
[250]	valid_0's auc: 0.886966
Early stopping, best iteration is:
[201]	valid_0's auc: 0.887332
| [0m 1       [0m | [0m 0.8873  [0m | [0m 0.4     [0m | [0m 1.188   [0m | [0m 4.121   [0m | [0m 0.2901  [0m | [0m 14.67   [0m | [0m 11.8    [0m | [0m 0.609   [0m | [0m 0.007758[0m | [0m 14.62   [0m |
Training until validation scores don't improve for 100 rounds.
[250]	valid_0's auc: 0.840194
[500]	valid_0's auc: 0.867028
[750]	valid_0's auc: 0.879277
[1000]	valid_0's auc: 0.885724
[1250]	valid_0's auc: 0.88994
[1500]	valid_0's auc: 0.892467
[1750]	valid_0's auc: 0.894272
[2000]	valid_0's auc: 0.895262
[2250]	valid_0's auc: 0.896124
[2500]	valid_0

In [33]:
LGB_BO.max['target']

0.8980763693473784

In [35]:
LGB_BO.max['params']

{'feature_fraction': 0.05423574653643624,
 'lambda_l1': 1.7916689135248487,
 'lambda_l2': 4.745470908391052,
 'learning_rate': 0.07319071264818977,
 'max_depth': 6.8326963965643746,
 'min_data_in_leaf': 18.76658579000881,
 'min_gain_to_split': 0.03190366643989473,
 'min_sum_hessian_in_leaf': 0.0006601945250547198,
 'num_leaves': 14.447434986617345}