# Description
This is the final notebook I used for my final submission to the competition. It achieved a public AMEX score of 0.805, whereas the first-place contender scored 0.809. The steps are ordered as follows:


*   Load and Preprocess
*   Feature Engineering
*   Ensemble Models
*   Autotuning

Note that the models have been built based on the tuning results, regardless of the order of the steps in the notebook.

For further initial models and analysis, please refer to the other notebooks.

# Load and preprocess

## Load packages and raw data

In [None]:
import warnings
import gzip, pickle

warnings.filterwarnings('ignore')
from tqdm import tqdm
import pandas as pd
import numpy as np
# mount drive
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Load the gzip-compressed train/test pickles and the label CSV from the mounted Drive.
# NOTE(review): read_pickle executes arbitrary code embedded in the file; only load pickles you produced yourself.
df_train=pd.read_pickle('/content/drive/MyDrive/int_train.pkl', compression='gzip')
train_labels=pd.read_csv('/content/drive/MyDrive/train_labels.csv')
df_test=pd.read_pickle('/content/drive/MyDrive/int_test.pkl',compression='gzip')

In [None]:
# Separate numeric and categorical variables
# Function for describing both numeric and categorical variables (by Uri Smashnov)
def describe_more(df, normalize_ind=False, weight_column=None, skip_columns=None, dropna=True):
    """Build a one-row-per-column profile of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to profile.
    normalize_ind : bool
        Passed to ``value_counts(normalize=...)`` when no ``weight_column`` is given.
    weight_column : str or None
        If given, level values are the per-level sums of this column instead of counts.
    skip_columns : list or None
        Columns to leave out of the profile (``None`` means skip nothing).
    dropna : bool
        Passed to ``value_counts(dropna=...)``.

    Returns
    -------
    pandas.DataFrame
        Columns ``A_Variable``, ``Levels``, ``Datatype``, ``Min Length``,
        ``Max Length`` and ``Level_Values`` (dict of at most 100 levels).

    Raises
    ------
    AssertionError
        If ``skip_columns`` is not a list or ``weight_column`` is not a column of ``df``.
    """
    if skip_columns is None:
        skip_columns = []
    assert isinstance(skip_columns, list), "Argument skip_columns should be list"
    if weight_column is not None and weight_column not in list(df.columns):
        raise AssertionError('weight_column is not a valid column name in the input DataFrame')

    var = []; l = []; t = []; unq = []; min_l = []; max_l = []
    for x in df:
        if x in skip_columns:
            continue
        var.append(x)
        # keep only levels that actually occur (categorical dtypes also list zero-count categories)
        counts = df[x].value_counts(dropna=dropna)
        l.append(int((counts > 0).sum()))
        t.append(df[x].dtypes)
        # length of the string representation of every value
        str_len = df[x].apply(str).str.len()
        min_l.append(str_len.min())
        max_l.append(str_len.max())
        if weight_column is not None:
            df2 = df.groupby(x).agg({weight_column: 'sum'}).sort_values(weight_column, ascending=False)
            unq.append(df2.head(n=100).to_dict()[weight_column])
        else:
            df_cat_d = df[x].value_counts(normalize=normalize_ind, dropna=dropna).round(decimals=2)
            df_cat_d = df_cat_d[df_cat_d > 0]
            unq.append(df_cat_d.iloc[0:100].to_dict())

    levels = pd.DataFrame({'A_Variable': var, 'Levels': l, 'Datatype': t,
                           'Min Length': min_l,
                           'Max Length': max_l,
                           'Level_Values': unq})
    return levels


In [None]:
# Lift pandas' row-display limit so long frames are shown in full.
pd.set_option('display.max_rows', None)
# Profile only the first 50,000 rows of the training data.
described=describe_more(df_train.iloc[:50000,:])

## Drop redundant features

In [None]:
# Count rows for every observed (D_103, D_107) combination.
df_train.groupby(['D_103','D_107'])['customer_ID'].agg('count').reset_index(name='n')

Unnamed: 0,D_103,D_107,n
0,-1,-1,101548
1,0,0,2919859
2,1,1,1875253
3,1,2,520153
4,1,3,93165
5,1,4,16661
6,1,5,3344
7,1,6,827
8,1,7,369
9,1,8,101


In [None]:
# Count rows for every observed (D_139, D_145) combination.
df_train.groupby(['D_139','D_145'])['customer_ID'].agg('count').reset_index(name='n')

Unnamed: 0,D_139,D_145,n
0,-1,-1,101548
1,0,0,4485483
2,1,1,370084
3,1,2,177905
4,1,3,82127
5,1,4,73007
6,1,5,46174
7,1,6,40625
8,1,7,28340
9,1,8,26476


D_139 and D_103 are redundant because their values can be explained by other variables (D_145 and D_107 respectively, as the tables above show). S_2 is also dropped because it is only the timestamp of the entry and carries little information.

In [None]:
# # suppose features with less than 10 levels are categorical
# categorical=[]
# numeric=[]
# for [col,level] in described[['A_Variable','Levels']].values:
#   if col not in ['S_2','D_139','D_103','customer_ID']:
#     if level<10:
#       categorical.append(col)
#     else: numeric.append(col)
# features=categorical+numeric
# print(categorical)

In [None]:
# Model features: every column except the date (S_2), the two redundant flags and the customer key.
features=[col for col in df_train.columns if col not in ['S_2','D_139','D_103','customer_ID']]

## Preprocessing and feature engineering

In [None]:
# Shift every feature so that its minimum over the TRAIN data becomes 1.
# The same train-derived offset is applied to the test data, so test values may still fall below 1.
for col in features:
  min_val=np.nanmin(df_train[col])
  df_train[col]=df_train[col]-min_val+1
  df_test[col]=df_test[col]-min_val+1

In [None]:
# Map every feature to the summary statistics computed per customer.
col_map={col:[np.nanmean, np.nanstd, np.nanmin, np.nanmax, 'last'] for col in features}

# Summarize the train data for each customer.
# Columns come back as (feature, statistic) pairs: the 'last' value keeps the bare
# feature name, every other statistic becomes '<feature>_<statistic>'.
train_num_agg = df_train.groupby('customer_ID').agg(col_map)
train_num_agg.columns = [
    feat if stat=='last' else feat+'_'+stat
    for feat, stat in train_num_agg.columns
]

# Summarize the test data for each customer in exactly the same way.
test_num_agg = df_test.groupby('customer_ID').agg(col_map)
test_num_agg.columns = [
    feat if stat=='last' else feat+'_'+stat
    for feat, stat in test_num_agg.columns
]

In [None]:
# generate trend variables (percent change):  (mean(last_3_months)-mean(first_3_months))/mean(first_3_months)
col_map={col:[np.nanmean] for col in features}

def _first_last_trend(df, agg_map):
  """Return (first, last, trend) per-customer frames for ``df``.

  ``first`` / ``last`` hold the mean of each feature over a customer's first /
  last three rows; ``trend`` is the relative change between them. The 0.001 in
  the denominator guards against division by zero.
  """
  last = df.groupby('customer_ID').tail(3).groupby('customer_ID').agg(agg_map)
  last.columns = ['_'.join(x) for x in last.columns]

  first = df.groupby('customer_ID').head(3).groupby('customer_ID').agg(agg_map)
  first.columns = ['_'.join(x) for x in first.columns]

  # one vectorized expression instead of inserting the columns one at a time
  trend = (last-first)/(first+0.001)
  trend = trend.reindex(index=first.index, columns=first.columns)
  trend.columns = [col+'trend' for col in trend.columns]
  return first, last, trend

# train data
train_first, train_last, train_trend = _first_last_trend(df_train, col_map)
#---------------------------------------------------------------------------------------#
# repeat the same calculations for test data
test_first, test_last, test_trend = _first_last_trend(df_test, col_map)

In [None]:
# remove the trend features that have too many missing values (more than 20000 customers)
missing_per_feature=train_trend.isna().sum()
sparse_trend_cols=missing_per_feature[missing_per_feature>20000].index
train_trend.drop(columns=sparse_trend_cols,inplace=True)
train_trend.shape

(458913, 152)

In [None]:
# Join aggregates, trends and the target on customer_ID, then free the intermediates.
labels = train_labels.set_index('customer_ID', drop=True)
train_combined = pd.concat([train_num_agg, train_trend, labels], axis=1)
del train_num_agg, train_first, train_last, train_labels, train_trend

In [None]:
# Join test aggregates and trends on customer_ID, then free the intermediates.
test_combined = pd.concat([test_num_agg, test_trend], axis=1)
del test_num_agg, test_first, test_last, test_trend
test_combined.shape

(924621, 1116)

In [None]:
# create new features
def feature_eng(a):
  """Add the hand-crafted interaction features to ``a`` in place and return it.

  ``a`` must contain the columns B_1, B_2, B_3, B_4, P_2, P_3, R_1, R_2, R_3,
  S_3, D_39 and D_41. Ten numeric columns and four 0/1 indicator columns are added.
  """
  # numeric features based on the eda analysis
  a['bp12'] = a['B_1']-a['P_2']
  a['pb34'] = a['P_3']/a['B_4']
  a['rs']=a['R_1']*a['S_3'] # risk is weighted by the spending
  a['rd']=a['S_3']*a['D_39']
  a['br']=a['B_1']*a['R_1']
  a['bd']=a['B_1']*a['D_41']
  a['p/bsr']=a['P_2']/((a['B_3']+a['S_3'])*a['R_1'])

  # some additional features
  a['ps']=a['P_2']/a['S_3'] # payment compared to spending
  a['rp']=a['R_1']/a['P_2'] # current risk scaled by the last payment
  a['bps']=a['B_3']+a['S_3']-a['P_2'] #spending and balance excess
  print('new numerical features created...')

  # boolean comparisons
  a['r12']=a['R_1']>a['R_2'] # change in risk (assuming it is time series)
  a['r23']=a['R_2']>a['R_3']
  a['p23']=a['P_2']>a['P_3'] # change in payment
  a['b12']=a['B_1']>a['B_2'] # change in balance
  print('new categorical features created...')

  # store the boolean indicators as integers
  for col in ['r12','r23','p23','b12']:
    a[col]=a[col].astype('int')
  return a
# Engineer the features on BOTH sets so train and test share one feature schema
# when the notebook is run top to bottom.
train_combined=feature_eng(train_combined)
test_combined=feature_eng(test_combined)

new numerical features created...
new categorical features created...


In [None]:
# function for getting lag 1
def get_lag_1(data, num_features):
    """Return, per customer, the feature values of the first row in ``data``.

    ``data`` is expected to hold at most the last two statements of every
    customer, so the first row of a two-row customer is the lag-1 observation.
    Customers with a single row get NaN for every feature.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows keyed by ``customer_ID``, either as the index or as a column.
    num_features : list of str
        Feature columns to lag.

    Returns
    -------
    pandas.DataFrame
        One row per customer (sorted by customer id) with float32 columns
        ``<feature>_lag1`` followed by a ``customer_ID`` column.
    """
    num_features = list(num_features)
    if 'customer_ID' in data.index.names:
        raw_ids = data.index.get_level_values('customer_ID').to_numpy()
    else:
        raw_ids = data['customer_ID'].to_numpy()
    ids = pd.Series(raw_ids)

    # positional masks: `data` may carry duplicate index labels
    is_first = ~ids.duplicated(keep='first').to_numpy()
    rows_per_customer = ids.map(ids.value_counts()).to_numpy()

    lag = data.loc[is_first, num_features].to_numpy(dtype=np.float32, copy=True)
    # a customer with only one row has no previous observation
    lag[rows_per_customer[is_first] == 1] = np.nan

    first_ids = ids[is_first].to_numpy()
    order = np.argsort(first_ids, kind='stable')

    # create lag_frame
    df1 = pd.DataFrame(lag[order], columns=[col + '_lag1' for col in num_features])
    # Add customer id
    df1['customer_ID'] = first_ids[order]
    return df1

# Keep only the last two rows of every customer, keyed and sorted by customer_ID.
data=df_train.groupby('customer_ID').tail(2).set_index('customer_ID',drop=True).sort_index()
# the raw train frame is deleted here, before the lag computation
del df_train
# get lag_1
lag_train=get_lag_1(data, features)
lag_train=lag_train.set_index('customer_ID',drop=True).sort_index()


# append lags to the train_combined (concat aligns on the customer_ID index)
train_combined=pd.concat([train_combined,lag_train],axis=1)
del data
del lag_train

In [None]:
# Keep only the last two rows of every test customer, keyed and sorted by customer_ID.
data=df_test.groupby('customer_ID').tail(2).set_index('customer_ID',drop=True).sort_index()
# the raw test frame is deleted here, before the lag computation
del df_test
# get lag_1
lag_test=get_lag_1(data, features)
lag_test=lag_test.set_index('customer_ID',drop=True).sort_index()

# append lags to the test_combined (concat aligns on the customer_ID index)
test_combined=pd.concat([test_combined,lag_test],axis=1)
del data
del lag_test

100%|██████████| 924621/924621 [12:10<00:00, 1265.39it/s]


In [None]:
#create last-mean difference columns for train data
for col in features:
  train_combined[col+'_diff_mean']=train_combined[col]-train_combined[col+'_nanmean']
# Save to Drive and read back from the SAME path, so the frame loaded below is
# the one just built rather than a stale file from an earlier session.
train_pickle_path='/content/drive/MyDrive/train_amex.pkl'
train_combined.to_pickle(train_pickle_path,compression='gzip')
train=pd.read_pickle(train_pickle_path, compression='gzip')

In [None]:
# create last-mean difference columns for test data
for col in features:
  test_combined[col+'_diff_mean']=test_combined[col]-test_combined[col+'_nanmean']

# keep exactly the train columns (minus the target), in the same order
columns=[col for col in train.columns if col!='target']

test_combined=test_combined[columns]
print(train.shape, test_combined.shape)

# Save to Drive: the prediction cell reads test_amex.pkl from this Drive path,
# so writing to the local working directory would leave that read stale.
test_combined.to_pickle('/content/drive/MyDrive/test_amex.pkl',compression='gzip')


(458913, 1469) (924621, 1468)


# Default prediction models

In [None]:
## Train test split
from sklearn.model_selection import train_test_split
# every column except the target is a predictor
predictors=[col for col in train.columns if col!='target']
# 80/20 random split with a fixed seed; the split is not stratified on the target
x_train, x_val, y_train, y_val=train_test_split(train[predictors], train[['target']], test_size=0.2, random_state=1221)
print(x_train.shape,x_val.shape)

(367130, 1468) (91783, 1468)


In [None]:
#create the custom metric for the lgb model
def amex(preds, train_data) -> tuple:
    """Custom evaluation metric in the ``feval`` format expected by ``lgb.train``.

    Parameters
    ----------
    preds : numpy.ndarray
        Model outputs; a sigmoid is applied before ranking. The metric only
        depends on the ordering of the scores, and the sigmoid is monotone.
    train_data : object
        Anything exposing ``get_label()`` that returns the 0/1 targets.

    Returns
    -------
    tuple
        ``('amex', score, True)`` where ``True`` means higher is better.
    """
    y_true=train_data.get_label()
    y_pred = 1. / (1. + np.exp(-preds))

    # column 0: true label, column 1: predicted score
    pairs = np.transpose(np.array([y_true, y_pred]))

    def _weighted_gini(sort_col):
        # rank rows by the given column (descending); negatives weigh 20x
        ranked         = pairs[pairs[:, sort_col].argsort()[::-1]]
        weight         = np.where(ranked[:,0]==0, 20, 1)
        weight_random  = np.cumsum(weight / np.sum(weight))
        total_pos      = np.sum(ranked[:, 0] *  weight)
        cum_pos_found  = np.cumsum(ranked[:, 0] * weight)
        lorentz        = cum_pos_found / total_pos
        return np.sum((lorentz - weight_random) * weight)

    # share of positives captured within the top 4% of the weighted ranking
    by_pred    = pairs[pairs[:, 1].argsort()[::-1]]
    weights    = np.where(by_pred[:,0]==0, 20, 1)
    cut_vals   = by_pred[np.cumsum(weights) <= int(0.04 * np.sum(weights))]
    top_four   = np.sum(cut_vals[:,0]) / np.sum(by_pred[:,0])

    # gini of the model ranking, normalised by the gini of the perfect ranking
    score = 0.5 * (_weighted_gini(1)/_weighted_gini(0) + top_four)
    return 'amex', score, True

In [None]:
# First DART model: built-in metrics are disabled ("metric": "None") and the custom
# `amex` feval is the only score reported for the validation and training sets.
# NOTE(review): LightGBM's early-stopping callback is reported to be unavailable in
# dart mode; confirm whether early_stopping_rounds has any effect here.
import lightgbm as lgb
lgb_train = lgb.Dataset(x_train, label=y_train,
                         free_raw_data=False)
lgb_eval = lgb.Dataset(x_val, label=y_val, reference=lgb_train,
                        free_raw_data=False)

params = {'bagging_fraction': 0.65,
          'objective':'binary',
          # 'metric':'binary_logloss',
          # 'pos_bagging_fraction':0.33,
          # 'neg_bagging_fraction':1,
          # 'feature_fraction_bynode':0.5,
          'boosting_type':'dart',
          # 'drop_rate':0.2,
          # 'bagging_freq': 10,
          "metric": "None",
          "first_metric_only": True,
          'feature_fraction': 0.25,
          # 'lambda_l1': 0.0001,
          # 'lambda_l2': 5,
          'learning_rate': 0.01,
          #  'max_depth': 30,
          #'min_child_samples': 190,
          'min_data_in_leaf': 80,
          # 'min_split_gain': 7e-05,
          'num_leaves': 80,
          'seed':55,
          'n_jobs':-1}
my_model = lgb.train(   
                    params, 
                    num_boost_round=7000,
                    train_set=lgb_train,
                    valid_sets=[lgb_eval,lgb_train],
                    feval=amex,
                    early_stopping_rounds=250, verbose_eval=50
                    )


[50]	training's amex: 0.767773	valid_0's amex: 0.759148
[100]	training's amex: 0.770962	valid_0's amex: 0.76273
[150]	training's amex: 0.773067	valid_0's amex: 0.763997
[200]	training's amex: 0.774634	valid_0's amex: 0.764884
[250]	training's amex: 0.775792	valid_0's amex: 0.766422
[300]	training's amex: 0.777234	valid_0's amex: 0.767799
[350]	training's amex: 0.77837	valid_0's amex: 0.768245
[400]	training's amex: 0.779773	valid_0's amex: 0.769297
[450]	training's amex: 0.780439	valid_0's amex: 0.76974
[500]	training's amex: 0.781191	valid_0's amex: 0.76968
[550]	training's amex: 0.782318	valid_0's amex: 0.77095
[600]	training's amex: 0.783453	valid_0's amex: 0.77095
[650]	training's amex: 0.784842	valid_0's amex: 0.772259
[700]	training's amex: 0.786388	valid_0's amex: 0.772907
[750]	training's amex: 0.787985	valid_0's amex: 0.773222
[800]	training's amex: 0.789016	valid_0's amex: 0.774346
[850]	training's amex: 0.789991	valid_0's amex: 0.775108
[900]	training's amex: 0.791506	valid_

In [None]:
import joblib
# save model to the local working directory
joblib.dump(my_model, 'lgb.pkl')
# load model back (joblib uses pickle: only load files you created yourself)
gbm_pickle = joblib.load('lgb.pkl')

In [None]:
# Load the engineered test features from Drive.
test=pd.read_pickle('/content/drive/MyDrive/test_amex.pkl', compression='gzip')
import lightgbm as lgb
# NOTE(review): lgb_holdout is not referenced by any later cell in this notebook; predictions use `test` directly.
lgb_holdout=lgb.Dataset(test,free_raw_data=False)


In [None]:
# Downcast float columns to float16.
# NOTE(review): presumably done to save memory; float16 has limited precision and range,
# so values may differ from the float data the model was trained on. Confirm this is acceptable.
for col in tqdm(test.columns):
  if test[col].dtype in ['float32','float64']:
    test[col]=test[col].astype('float16')

100%|██████████| 1468/1468 [10:07<00:00,  2.42it/s]


In [None]:
# the notebook crashes when predicting on the full test dataset at once,
# hence we predict in batches of 100,000 rows (10 batches for this test set)
batch_size=10**5
preds=[]
for start in tqdm(range(0,len(test),batch_size)):
  # iloc clips the final slice, so the last batch simply holds the remaining rows
  preds.extend(gbm_pickle.predict(test.iloc[start:start+batch_size,:]))

In [None]:
# create the submission file: one prediction per customer, indexed by customer_ID
submission=pd.DataFrame({'prediction':preds},index=test.index)
submission.head()

Unnamed: 0_level_0,prediction
customer_ID,Unnamed: 1_level_1
00000469ba478561f23a92a868bd366de6f6527a684c9a2e78fb826dcac3b9b7,0.038573
00001bf2e77ff879fab36aa4fac689b9ba411dae63ae397d4263dafa1daedef5,0.001761
0000210045da4f81e5f122c6bde5c2a617d03eef67f82c5e400fc98e7bd43ce8,0.050199
00003b41e58ede33b8daf61ab56d9952f17c9ad1c3976cf6e56734528702d694,0.268645
00004b22eaeeeb0ec976890c1d9bfc14fd9427e98c4ee9a4693dd914fca22557,0.853875


In [None]:
# Write customer_ID (the index) and the prediction column to the submission CSV.
submission["prediction"].to_csv("submission_first.csv",index=True)

## Models with different parameters

In [None]:
# DART variant with a smaller feature fraction (0.15), a lower learning rate (0.008)
# and larger leaves' minimum size (150). The result is stored in `gbm2`, which the
# following cells overwrite.
import lightgbm as lgb
lgb_train = lgb.Dataset(x_train, label=y_train,
                         free_raw_data=False)
lgb_eval = lgb.Dataset(x_val, label=y_val, reference=lgb_train,
                        free_raw_data=False)

params = {'bagging_fraction': 0.65,
          'objective':'binary',
          # 'metric':'binary_logloss',
          # 'pos_bagging_fraction':0.33,
          # 'neg_bagging_fraction':1,
          # 'feature_fraction_bynode':0.5,
          'boosting_type':'dart',
          # 'drop_rate':0.2,
          # 'bagging_freq': 10,
          "metric": "None",
          "first_metric_only": True,
          'feature_fraction': 0.15,
          # 'lambda_l1': 0.0001,
          # 'lambda_l2': 5,
          'learning_rate': 0.008,
          #  'max_depth': 30,
          #'min_child_samples': 190,
          'min_data_in_leaf': 150,
          # 'min_split_gain': 7e-05,
          'num_leaves': 60,
          'seed':55,
          'n_jobs':-1}
gbm2 = lgb.train(   
                    params, 
                    num_boost_round=16000,
                    train_set=lgb_train,
                    valid_sets=[lgb_eval,lgb_train],
                    feval=amex,
                    early_stopping_rounds=250, verbose_eval=0
                    )
#callbacks=[lgb.reset_parameter(learning_rate = [0.03]*1500 + [0.02]*1500+[0.01]*2000+[0.0085]*3000+[0.007]*4000+[0.005]*4000) ]

In [None]:
# DART variant with explicit drop settings and a stepwise learning-rate schedule.
# The schedule list has 1500+1500+3000+4000+4000+4000 = 18000 entries, one per
# boosting round (num_boost_round=18000). The result overwrites `gbm2`.
import lightgbm as lgb
lgb_train = lgb.Dataset(x_train, label=y_train,
                         free_raw_data=False)
lgb_eval = lgb.Dataset(x_val, label=y_val, reference=lgb_train,
                        free_raw_data=False)

params = {'bagging_fraction': 0.65,
          'objective':'binary',
          # 'metric':'binary_logloss',
          # 'pos_bagging_fraction':0.33,
          # 'neg_bagging_fraction':1,
          # 'feature_fraction_bynode':0.5,
          'boosting_type':'dart',
          'drop_rate':0.25,
          'skip_drop':0.2,
          'drop_seed':123,
          # 'bagging_freq': 10,
          "metric": "None",
          "first_metric_only": True,
          'feature_fraction': 0.1,
          # 'lambda_l1': 0.0001,
          # 'lambda_l2': 5,
          # 'learning_rate': 0.008,
          #  'max_depth': 30,
          #'min_child_samples': 190,
          'min_data_in_leaf': 200,
          # 'min_split_gain': 7e-05,
          'num_leaves': 150,
          'seed':55,
          'n_jobs':-1}
gbm2 = lgb.train(   
                    params, 
                    num_boost_round=18000,
                    train_set=lgb_train,
                    valid_sets=[lgb_eval,lgb_train],
                    feval=amex,
                    early_stopping_rounds=250, verbose_eval=0,
                 callbacks=[lgb.reset_parameter(learning_rate = [0.03]*1500 + [0.02]*1500+[0.01]*3000+[0.0085]*4000+[0.0065]*4000+[0.004]*4000) ]
                    )

In [None]:
# DART variant with bagging every 20 iterations, a very small feature fraction (0.075)
# and a higher learning rate (0.017). The result overwrites `gbm2`.
import lightgbm as lgb
lgb_train = lgb.Dataset(x_train, label=y_train,
                         free_raw_data=False)
lgb_eval = lgb.Dataset(x_val, label=y_val, reference=lgb_train,
                        free_raw_data=False)

params = {'bagging_fraction': 0.65,
          'objective':'binary',
          # 'metric':'binary_logloss',
          # 'pos_bagging_fraction':0.33,
          # 'neg_bagging_fraction':1,
          # 'feature_fraction_bynode':0.5,
          'boosting_type':'dart',
          # 'drop_rate':0.2,
          'bagging_freq': 20,
          "metric": "None",
          "first_metric_only": True,
          'feature_fraction': 0.075,
          # 'lambda_l1': 0.0001,
          # 'lambda_l2': 5,
          'learning_rate': 0.017,
          #  'max_depth': 30,
          #'min_child_samples': 190,
          'min_data_in_leaf': 100,
          # 'min_split_gain': 7e-05,
          'num_leaves': 100,
          'seed':55,
          'n_jobs':-1}
gbm2 = lgb.train(   
                    params, 
                    num_boost_round=16000,
                    train_set=lgb_train,
                    valid_sets=[lgb_eval,lgb_train],
                    feval=amex,
                    early_stopping_rounds=250, verbose_eval=50
                    )
#callbacks=[lgb.reset_parameter(learning_rate = [0.03]*1500 + [0.02]*1500+[0.01]*2000+[0.0085]*3000+[0.007]*4000+[0.005]*4000) ]

[50]	training's amex: 0.768756	valid_0's amex: 0.761944
[100]	training's amex: 0.773956	valid_0's amex: 0.764768
[150]	training's amex: 0.777368	valid_0's amex: 0.767208
[200]	training's amex: 0.780546	valid_0's amex: 0.768791
[250]	training's amex: 0.783333	valid_0's amex: 0.771723
[300]	training's amex: 0.786087	valid_0's amex: 0.773301
[350]	training's amex: 0.78694	valid_0's amex: 0.773943
[400]	training's amex: 0.788637	valid_0's amex: 0.775385
[450]	training's amex: 0.790191	valid_0's amex: 0.77521
[500]	training's amex: 0.791747	valid_0's amex: 0.775775
[550]	training's amex: 0.79309	valid_0's amex: 0.776817
[600]	training's amex: 0.795042	valid_0's amex: 0.777887
[650]	training's amex: 0.796977	valid_0's amex: 0.778971
[700]	training's amex: 0.798839	valid_0's amex: 0.780063
[750]	training's amex: 0.800573	valid_0's amex: 0.780756
[800]	training's amex: 0.80262	valid_0's amex: 0.782099
[850]	training's amex: 0.804049	valid_0's amex: 0.782532
[900]	training's amex: 0.805725	vali

# AutoTUNE with OPTUNA

In [None]:
!pip install optuna
import lightgbm as lgb
import optuna.integration.lightgbm as lgb
import optuna


Installing collected packages: pyperclip, pbr, stevedore, Mako, cmd2, autopage, colorlog, cmaes, cliff, alembic, optuna
Successfully installed Mako-1.2.2 alembic-1.8.1 autopage-0.5.1 cliff-3.10.1 cmaes-0.8.2 cmd2-2.4.2 colorlog-6.7.0 optuna-3.0.0 pbr-5.10.0 pyperclip-1.8.2 stevedore-3.5.0


In [None]:
def objective(trial):
    """Optuna objective for the first tuning round.

    Samples LightGBM hyper-parameters from the trial, trains on the
    x_train/y_train split and returns the best validation `amex` score.

    NOTE(review): the import cell above rebinds `lgb` to
    `optuna.integration.lightgbm`, so `lgb.train` here may be Optuna's tuner
    wrapper rather than plain LightGBM; confirm this is intended.
    """
    # define search intervals for the parameters
    param={
          'objective':'binary',
          # 'metric':'binary_logloss',
          # 'verbosity':-1,
          # 'boosting_type':'dart',
          "bagging_fraction": trial.suggest_loguniform("bagging_fraction", 0.4, 0.8),
          # "max_depth": trial.suggest_int("max_depth", 25, 40,4),
          "num_leaves": trial.suggest_int("num_leaves", 60, 120,10),
          "learning_rate": trial.suggest_loguniform("learning_rate", 0.01, 0.15),
          # "bagging_freq": trial.suggest_categorical("bagging_freq", [10]),
          "feature_fraction": trial.suggest_loguniform("feature_fraction", 0.05, 0.4), 
          # 'feature_fraction_bynode':trial.suggest_loguniform("feature_fraction_bynode", 0.3, 0.90), 
          "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 60, 120,10),
          "metric": "None",
          # "first_metric_only": True,
          "seed": 50
            }
    # run model     

    lgb_train = lgb.Dataset(x_train, label=y_train,
                         free_raw_data=False)
    lgb_eval = lgb.Dataset(x_val, label=y_val, reference=lgb_train,
                        free_raw_data=False)
    
    gbm = lgb.train(param,
                    num_boost_round=10000,
                    train_set=lgb_train,
                    valid_sets=[lgb_eval],
                    feval=amex,
                    early_stopping_rounds=4500,
                    verbose_eval=200
                    )

    # best `amex` score reached on the validation set
    cv_score=gbm.best_score['valid_0']['amex']
    return cv_score

# Suppress information only outputs
optuna.logging.set_verbosity(optuna.logging.WARNING)
study_name = "example-study-long"  # Unique identifier of the study.
# trials are persisted in a local SQLite file named after the study
storage_name = "sqlite:///{}.db".format(study_name)
# load_if_exists=True resumes a stored study with this name instead of raising
study = optuna.create_study(direction='maximize',study_name=study_name, storage=storage_name,load_if_exists=True)
# keep sampling trials until the 24000-second budget is used up
study.optimize(objective,timeout=24000)

Training until validation scores don't improve for 4500 rounds.
[200]	valid_0's amex: 0.781134
[400]	valid_0's amex: 0.78965
[600]	valid_0's amex: 0.791275
[800]	valid_0's amex: 0.79288
[1000]	valid_0's amex: 0.793546
[1200]	valid_0's amex: 0.794084
[1400]	valid_0's amex: 0.794102
[1600]	valid_0's amex: 0.794537
[1800]	valid_0's amex: 0.794578
[2000]	valid_0's amex: 0.79381
[2200]	valid_0's amex: 0.794038
[2400]	valid_0's amex: 0.793757
[2600]	valid_0's amex: 0.794025
[2800]	valid_0's amex: 0.793659
[3000]	valid_0's amex: 0.794051
[3200]	valid_0's amex: 0.793764
[3400]	valid_0's amex: 0.794056
[3600]	valid_0's amex: 0.7944
[3800]	valid_0's amex: 0.794269
[4000]	valid_0's amex: 0.794086
[4200]	valid_0's amex: 0.793708
[4400]	valid_0's amex: 0.794052
[4600]	valid_0's amex: 0.794443
[4800]	valid_0's amex: 0.794152
[5000]	valid_0's amex: 0.794453
[5200]	valid_0's amex: 0.794516
[5400]	valid_0's amex: 0.794207
[5600]	valid_0's amex: 0.794571
[5800]	valid_0's amex: 0.794054
[6000]	valid_0's 

In [None]:
# display the parameters of the best trial found so far
study.best_params

{'bagging_fraction': 0.4591871995031453,
 'feature_fraction': 0.05944201683712933,
 'learning_rate': 0.022770370427204846,
 'min_data_in_leaf': 120,
 'num_leaves': 60}

In [None]:
# Visualise the objective value of every trial in order
optuna.visualization.plot_optimization_history(study)

In [None]:
# Visualise the objective value as a function of each tuned parameter
optuna.visualization.plot_slice(study)


In [None]:
# create the summary dataframe for trial results and parameter values
df = study.trials_dataframe()
# objective value of every stored trial
values=[trial.value for trial in study.get_trials()]
cols=[ 'params_bagging_fraction', 'params_feature_fraction',
       'params_learning_rate', 'params_min_data_in_leaf', 'params_num_leaves','value']
# keep only the trials that scored above 0.795 and give the columns short names
result=df[df.value>0.795][cols]
result.columns=['bag','feat','l_rate','min_d_leaf','num_leaves','value']
result

Unnamed: 0,bag,feat,l_rate,min_d_leaf,num_leaves,value
2,0.787915,0.056001,0.024191,120,70,0.795191
4,0.675756,0.341617,0.035538,70,110,0.795228
7,0.541996,0.050263,0.024133,120,120,0.795759
8,0.413862,0.171354,0.015639,100,110,0.795703
10,0.584066,0.320293,0.025501,90,70,0.795979
11,0.465851,0.053703,0.051108,100,100,0.79636
14,0.589056,0.121504,0.01042,90,80,0.795317
17,0.530753,0.070603,0.047006,110,100,0.795283
20,0.531459,0.180739,0.033013,90,70,0.795889
26,0.437348,0.184457,0.019414,80,80,0.795359


In [None]:
# SECOND TUNING
def objective(trial):
    """Optuna objective for the second tuning round (replaces the earlier `objective`).

    Same procedure as the first round with different search ranges (for example
    lower learning rates and larger `min_data_in_leaf`); returns the best
    validation `amex` score.

    NOTE(review): `lgb` may still refer to `optuna.integration.lightgbm` from the
    import cell above; confirm this is intended.
    """
    # define search intervals for the parameters
    param={
          'objective':'binary',
          # 'metric':'binary_logloss',
          # 'verbosity':-1,
          # 'boosting_type':'dart',
          "bagging_fraction": trial.suggest_loguniform("bagging_fraction", 0.4, 0.6),
          # "max_depth": trial.suggest_int("max_depth", 15, 40,5),
          "num_leaves": trial.suggest_int("num_leaves", 30, 70,10),
          "learning_rate": trial.suggest_loguniform("learning_rate", 0.005, 0.025),
          # "bagging_freq": trial.suggest_categorical("bagging_freq", [10]),
          "feature_fraction": trial.suggest_loguniform("feature_fraction", 0.03, 0.07), # increase the upper bound for the next round of
          # 'feature_fraction_bynode':trial.suggest_loguniform("feature_fraction_bynode", 0.3, 0.90), 
          "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 100, 400, 25),
          "metric": "None",
          "first_metric_only": True,
          "seed": 50
            }
    # run model     

    lgb_train = lgb.Dataset(x_train, label=y_train,
                         free_raw_data=False)
    lgb_eval = lgb.Dataset(x_val, label=y_val, reference=lgb_train,
                        free_raw_data=False)
    
    gbm = lgb.train(param,
                    num_boost_round=15000,
                    train_set=lgb_train,
                    valid_sets=[lgb_eval,lgb_train],
                    feval=amex,
                    early_stopping_rounds=5000,
                    verbose_eval=200
                    )

    # best `amex` score reached on the validation set
    cv_score=gbm.best_score['valid_0']['amex']
    return cv_score

# Suppress information only outputs
optuna.logging.set_verbosity(optuna.logging.WARNING)
# NOTE(review): this reuses the study name and SQLite file of the first tuning round with
# load_if_exists=True, so trials from both search spaces end up in one study. Confirm this is intended.
study_name = "example-study-long"  # Unique identifier of the study.
storage_name = "sqlite:///{}.db".format(study_name)
study = optuna.create_study(direction='maximize',study_name=study_name, storage=storage_name,load_if_exists=True)
# keep sampling trials until the 7200-second budget is used up
study.optimize(objective,timeout=7200)

In [None]:
# GOSS model built from the tuned parameters (rounded values of study.best_params).
import lightgbm as lgb

lgb_train = lgb.Dataset(x_train, label=y_train,
                         free_raw_data=False)
lgb_eval = lgb.Dataset(x_val, label=y_val, reference=lgb_train,
                        free_raw_data=False)

params={
  'objective':'binary',
  # LightGBM's metric name is 'cross_entropy' (the previous spelling was not a valid metric)
  'metric':'cross_entropy',
  'tree_learner':'voting',
  'is_unbalance':True,
  'boosting':'goss',
  'bagging_fraction': 0.46,
 'feature_fraction': 0.06,
  'learning_rate': 0.0215,
 'min_data_in_leaf': 120,
 'num_leaves': 60,
 'seed': 123}

# the built-in metric and the custom `amex` feval are both reported
gbm2 = lgb.train(   params, 
                    num_boost_round=9999,
                    train_set=lgb_train,
                    valid_sets=[lgb_eval,lgb_train],
                    feval=amex,
                    early_stopping_rounds=4500, verbose_eval=50 #,callbacks=[lgb.reset_parameter(learning_rate=learning_rate_decay)]
                    )

Training until validation scores don't improve for 4500 rounds.
[50]	training's binary_logloss: 0.355721	training's amex: 0.765501	valid_0's binary_logloss: 0.35924	valid_0's amex: 0.760253
[100]	training's binary_logloss: 0.282091	training's amex: 0.773606	valid_0's binary_logloss: 0.286527	valid_0's amex: 0.766438
[150]	training's binary_logloss: 0.25036	training's amex: 0.779694	valid_0's binary_logloss: 0.255785	valid_0's amex: 0.770665
[200]	training's binary_logloss: 0.234775	training's amex: 0.784783	valid_0's binary_logloss: 0.241282	valid_0's amex: 0.775187
[250]	training's binary_logloss: 0.226227	training's amex: 0.789345	valid_0's binary_logloss: 0.233861	valid_0's amex: 0.778086
[300]	training's binary_logloss: 0.220735	training's amex: 0.793768	valid_0's binary_logloss: 0.229588	valid_0's amex: 0.779958
[350]	training's binary_logloss: 0.216852	training's amex: 0.797312	valid_0's binary_logloss: 0.226928	valid_0's amex: 0.783194
[400]	training's binary_logloss: 0.213842	t

<!-- <!-- # def learning_rate_decay(current_iter):
#     base_learning_rate = 0.04
#     lr = base_learning_rate  * np.power(.9999, current_iter)
#     return lr if lr > 0.005 else 0.005 -->
<!-- 
tree_learner=voting

is_unbalance=True

sigmoid=1 #default

metric=cross_entropy --> -->
