# LightGBM parameter tuning using Baysian optimization

In [19]:
from fastai import *
import numpy as np
import pandas as pd
import pandas_profiling
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from skopt.space import Real, Integer
from skopt.utils import use_named_args
import itertools
from sklearn.metrics import roc_auc_score
from skopt import gp_minimize
from sklearn.model_selection import train_test_split, ParameterGrid
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

In [9]:
TRAIN = 'data/train.csv'
TEST = 'data/test.csv'
SAMPLE = 'data/sample_submission.csv'
train = pd.read_csv(TRAIN)
test = pd.read_csv(TEST)
X = train.drop(['ID_code', 'target'], axis=1)
y = train.target
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [4]:
??gp_minimize

In [5]:
baysian_space  = [
          Integer(5, 20, name='num_leaves'),
          Integer(50, 200, name='min_child_samples'),
          Integer(20, 50,  name='min_data_in_leaf'),
          Integer(1, 5, name='bagging_freq'),
          Real(0.6, 0.9, name='subsample'),
          Real(0.01, 0.1, name='feature_fraction'),
          Real(0.001, 0.01, name='learning_rate'),
          Real(0.1, 0.5, name='bagging_fraction'),
         ]

In [10]:
trn_data = lgb.Dataset(X_train, label=y_train)
val_data = lgb.Dataset(X_test, label=y_test)

In [17]:
def lgb_optim(values):
    params = {'num_leaves': values[0], 
          'min_child_samples': values[1], 
          'min_data_in_leaf': values[2], 
          'bagging_freq': values[3],
            'subsample': values[4],
            'feature_fraction': values[5],
             'learning_rate':values[6],
             'bagging_fraction': values[7],
             'boosting_type': 'gbdt',
             'objective': 'binary',
              'max_depth':-1,
              'metric':'auc',
              'boost_from_average':'false',
               'verbosity': -1,
              'objective': 'binary',
              'tree_learner': 'serial',
               'gpu_platform_id': -1,
             }
    print('\nNext set of params.....',params)
    trn_data = lgb.Dataset(X_train, label=y_train)
    val_data = lgb.Dataset(X_test, label=y_test)
    evals_results = {}
    model_lgb     = lgb.train(params, trn_data, 1000000, valid_sets = [trn_data, val_data], verbose_eval=5000, early_stopping_rounds = 4000)
    auc = -roc_auc_score(y_train, model_lgb.predict(X_train))
    return  auc

In [18]:
res_gp = gp_minimize(lgb_optim, baysian_space, n_calls=20,
                     random_state=0,n_random_starts=10)


Next set of params..... {'num_leaves': 14, 'min_child_samples': 177, 'min_data_in_leaf': 46, 'bagging_freq': 4, 'subsample': 0.7870691090357917, 'feature_fraction': 0.044594353656343, 'learning_rate': 0.003677811458900251, 'bagging_fraction': 0.12268519092697729, 'boosting_type': 'gbdt', 'objective': 'binary', 'max_depth': -1, 'metric': 'auc', 'boost_from_average': 'false', 'verbosity': -1, 'tree_learner': 'serial', 'gpu_platform_id': -1}
Training until validation scores don't improve for 4000 rounds.
[5000]	training's auc: 0.920242	valid_1's auc: 0.894864
[10000]	training's auc: 0.93113	valid_1's auc: 0.89807
[15000]	training's auc: 0.940321	valid_1's auc: 0.898608
[20000]	training's auc: 0.9484	valid_1's auc: 0.898751
Early stopping, best iteration is:
[17179]	training's auc: 0.943988	valid_1's auc: 0.898935

Next set of params..... {'num_leaves': 9, 'min_child_samples': 122, 'min_data_in_leaf': 44, 'bagging_freq': 3, 'subsample': 0.7178354388302489, 'feature_fraction': 0.0852470887

[15000]	training's auc: 0.922202	valid_1's auc: 0.898322
[20000]	training's auc: 0.928414	valid_1's auc: 0.899286
[25000]	training's auc: 0.934128	valid_1's auc: 0.899549
[30000]	training's auc: 0.939421	valid_1's auc: 0.899561
Early stopping, best iteration is:
[29583]	training's auc: 0.938981	valid_1's auc: 0.899591

Next set of params..... {'num_leaves': 20, 'min_child_samples': 50, 'min_data_in_leaf': 50, 'bagging_freq': 1, 'subsample': 0.9, 'feature_fraction': 0.1, 'learning_rate': 0.001, 'bagging_fraction': 0.5, 'boosting_type': 'gbdt', 'objective': 'binary', 'max_depth': -1, 'metric': 'auc', 'boost_from_average': 'false', 'verbosity': -1, 'tree_learner': 'serial', 'gpu_platform_id': -1}
Training until validation scores don't improve for 4000 rounds.
[5000]	training's auc: 0.890409	valid_1's auc: 0.865891
[10000]	training's auc: 0.906373	valid_1's auc: 0.87754
[15000]	training's auc: 0.918606	valid_1's auc: 0.885474
[20000]	training's auc: 0.927773	valid_1's auc: 0.890648
[25000]

KeyboardInterrupt: 

In [None]:
clf = lgb.train(param, trn_data, 1000000, valid_sets = [trn_data, val_data], verbose_eval=5000, early_stopping_rounds = 4000)

# RandomForest parameter tuning using Baysian optimization

In [23]:
rf_baysian_space  = [
          Integer(15, 200, name='min_samples_leaf'),
          Integer(100, 250, name='n_estimators'),
          Real(0.1, 0.7, name='max_features'),
         ]

In [29]:
??RandomForestClassifier

In [24]:
def rf_optim(values):
    params = {'min_samples_leaf': values[0], 
          'n_estimators': values[1], 
          'max_features': values[2], 
           'n_jobs' : -1,
           'random_state' : 42,
             }
    print('\nNext set of params.....',params)
    rf = RandomForestClassifier(**params)
    rf.fit(X_train, y_train)
    auc = -roc_auc_score(y_train, rf.predict(X_train))
    return  auc

In [25]:
res_gp = gp_minimize(rf_optim, rf_baysian_space, n_calls=10,
                     random_state=0,n_random_starts=10)


Next set of params..... {'min_samples_leaf': 125, 'n_estimators': 227, 'max_features': 0.6147673705736542, 'n_jobs': -1, 'random_state': 42, 'metric': 'auc'}


ValueError: n_estimators must be an integer, got <class 'dict'>.