In [1]:
import pandas as pd
import numpy as np
import optuna
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import average_precision_score

In [2]:
# Read the credit-card transactions CSV into a pandas DataFrame and show it
# as the cell output.
data = pd.read_csv(filepath_or_buffer='./data/creditcard.csv')
data

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.108300,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.50,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.206010,0.502292,0.219422,0.215153,69.99,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284802,172786.0,-11.881118,10.071785,-9.834783,-2.066656,-5.364473,-2.606837,-4.918215,7.305334,1.914428,...,0.213454,0.111864,1.014480,-0.509348,1.436807,0.250034,0.943651,0.823731,0.77,0
284803,172787.0,-0.732789,-0.055080,2.035030,-0.738589,0.868229,1.058415,0.024330,0.294869,0.584800,...,0.214205,0.924384,0.012463,-1.016226,-0.606624,-0.395255,0.068472,-0.053527,24.79,0
284804,172788.0,1.919565,-0.301254,-3.249640,-0.557828,2.630515,3.031260,-0.296827,0.708417,0.432454,...,0.232045,0.578229,-0.037501,0.640134,0.265745,-0.087371,0.004455,-0.026561,67.88,0
284805,172788.0,-0.240440,0.530483,0.702510,0.689799,-0.377961,0.623708,-0.686180,0.679145,0.392087,...,0.265245,0.800049,-0.163298,0.123205,-0.569159,0.546668,0.108821,0.104533,10.00,0


In [3]:
# Remove the 'Time' column from the frame.
# The frame is rebound instead of mutated in place, and a missing column is
# ignored, so re-running this cell no longer fails with a KeyError once
# 'Time' has already been dropped.
data = data.drop(columns='Time', errors='ignore')
data

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,...,-0.225775,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724,2.69,0
2,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,...,-0.108300,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.50,0
4,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,...,-0.009431,0.798278,-0.137458,0.141267,-0.206010,0.502292,0.219422,0.215153,69.99,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284802,-11.881118,10.071785,-9.834783,-2.066656,-5.364473,-2.606837,-4.918215,7.305334,1.914428,4.356170,...,0.213454,0.111864,1.014480,-0.509348,1.436807,0.250034,0.943651,0.823731,0.77,0
284803,-0.732789,-0.055080,2.035030,-0.738589,0.868229,1.058415,0.024330,0.294869,0.584800,-0.975926,...,0.214205,0.924384,0.012463,-1.016226,-0.606624,-0.395255,0.068472,-0.053527,24.79,0
284804,1.919565,-0.301254,-3.249640,-0.557828,2.630515,3.031260,-0.296827,0.708417,0.432454,-0.484782,...,0.232045,0.578229,-0.037501,0.640134,0.265745,-0.087371,0.004455,-0.026561,67.88,0
284805,-0.240440,0.530483,0.702510,0.689799,-0.377961,0.623708,-0.686180,0.679145,0.392087,-0.399126,...,0.265245,0.800049,-0.163298,0.123205,-0.569159,0.546668,0.108821,0.104533,10.00,0


In [4]:
def binary_class_average_precision_score(y_pred, data):
    """Custom evaluation function that reports average precision.

    It is passed as ``feval`` to ``lgb.train`` in this notebook.

    Parameters
    ----------
    y_pred
        Predicted scores for the evaluated samples; forwarded to
        ``average_precision_score`` as the score argument.
    data
        Evaluation dataset object; only its ``get_label()`` method is used
        to obtain the true labels.

    Returns
    -------
    tuple
        ``('average-precision', score, True)``: the metric name, its value,
        and a flag marking the metric as "higher is better".
    """
    labels = data.get_label()
    score = average_precision_score(labels, y_pred)
    return 'average-precision', score, True

In [5]:
# Separate the frame into the feature matrix (every column except 'Class')
# and the target vector (the 'Class' column), then report their shapes.
x = data.drop(columns='Class')
y = data['Class']

print('x.shape', x.shape)
print('y.shape', y.shape)

x.shape (284807, 29)
y.shape (284807,)


In [6]:
def objective(trial):
    """Optuna objective: cross-validated average precision of a LightGBM model.

    Hyper-parameters are sampled from ``trial`` and one booster is trained
    per fold of a shuffled 5-fold stratified split of the module-level
    ``x`` / ``y``. The per-round validation scores of all folds are averaged
    and the maximum of that mean curve is returned.

    The first fold trains for up to 1000 rounds with early stopping
    (patience 35). Every later fold trains, without early stopping, for
    exactly as many rounds as the previous fold recorded, so that all
    per-fold curves have equal length and can be stacked.

    Side effects: prints the sampled parameters and a short summary, and
    stores ``'mean_losses'`` (the mean curve) and
    ``'best_avg_boosting_round'`` (0-based index of its maximum) as user
    attributes on the trial.

    Note: the "loss" names are historical; the values are average-precision
    scores, where higher is better.
    """
    # Fixed settings plus always-sampled hyper-parameters. The order of the
    # suggest_* calls is kept as-is.
    param = {
        'objective': 'binary',
        'verbose': -1,
        'metric': 'average-precision',
        'num_leaves': trial.suggest_int('num_leaves', 100, 700),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 10, 300),
        'max_bin': trial.suggest_int('max_bin', 200, 4000),
        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.01, 1.0),
        'lambda_l1': trial.suggest_uniform('lambda_l1', 0.0, 80.0),
        'lambda_l2': trial.suggest_uniform('lambda_l2', 0.0, 80.0),
        'min_gain_to_split': trial.suggest_uniform('min_gain_to_split', 0.0, 1.0),
        'scale_pos_weight': trial.suggest_uniform('scale_pos_weight', 1.0, 10.0),
    }

    # Bagging parameters are only sampled when bagging is switched on.
    use_bagging = trial.suggest_categorical('do_bagging', [True, False])
    if use_bagging:
        param['bagging_fraction'] = trial.suggest_uniform('bagging_fraction', 0.01, 0.99)
        param['bagging_freq'] = trial.suggest_int('bagging_freq', 1, 40)

    print('param', param)

    splitter = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
    fold_curves = []

    for fold_no, (train_index, test_index) in enumerate(splitter.split(x, y)):
        if fold_no == 0:
            # Only the first fold decides the number of rounds, via early stopping.
            num_boost_round = 1000
            early_stopping_rounds = 35
        else:
            # Later folds replay the round count recorded by the previous fold.
            num_boost_round = len(fold_curves[-1])
            early_stopping_rounds = None

        train_data = lgb.Dataset(x.iloc[train_index], label=y.iloc[train_index])
        val_data = lgb.Dataset(x.iloc[test_index], label=y.iloc[test_index])

        evals_result = {}
        lgb.train(
            param,
            train_data,
            num_boost_round=num_boost_round,
            early_stopping_rounds=early_stopping_rounds,
            valid_sets=[val_data],
            feval=binary_class_average_precision_score,
            evals_result=evals_result,
            verbose_eval=False,
            keep_training_booster=True,
        )

        # Per-round validation score of this fold.
        fold_curves.append(np.asarray(evals_result['valid_0']['average-precision']))

    # Average the curves over the folds, then pick the best round of the mean.
    mean_losses = np.mean(np.stack(fold_curves), axis=0)
    max_mean_loss = max(mean_losses)
    best_avg_boosting_round = np.argmax(mean_losses)

    print('best_avg_boosting_round', best_avg_boosting_round)
    print('num_boost_round', num_boost_round)
    print('max_mean_loss', max_mean_loss)

    trial.set_user_attr('mean_losses', mean_losses.tolist())
    trial.set_user_attr('best_avg_boosting_round', int(best_avg_boosting_round))

    return max_mean_loss
    
                

In [8]:
# Create the Optuna study backed by a local SQLite file. The objective
# returns an average-precision score, hence direction='maximize'.
# load_if_exists=True is passed so that a study named 'test_01' already
# present in the storage is reused instead of raising an error.
study = optuna.create_study(
    study_name='test_01',
    storage='sqlite:///data/training_03.db',
    direction='maximize',
    load_if_exists=True,
)

[I 2020-03-26 20:53:29,551] A new study created with name: test_01


In [9]:
# Start the hyper-parameter optimization.
# With neither n_trials nor timeout given, optimize() keeps running until it
# is interrupted, so the notebook could never complete a "Restart & Run All".
# Bound the search here; set N_TRIALS to None to restore the previous
# run-until-interrupted behaviour.
N_TRIALS = 100
study.optimize(objective, n_trials=N_TRIALS)

param {'objective': 'binary', 'verbose': -1, 'metric': 'average-precision', 'num_leaves': 399, 'min_data_in_leaf': 79, 'max_bin': 2207, 'feature_fraction': 0.09412096600394598, 'lambda_l1': 27.128184898608758, 'lambda_l2': 10.102459018658152, 'min_gain_to_split': 0.025410657231685563, 'scale_pos_weight': 7.647905702472766}
best_avg_boosting_round 337
num_boost_round 373
max_mean_loss 0.7968850133753123


[I 2020-03-26 20:54:02,767] Finished trial#0 resulted in value: 0.7968850133753123. Current best value is 0.7968850133753123 with parameters: {'do_bagging': False, 'feature_fraction': 0.09412096600394598, 'lambda_l1': 27.128184898608758, 'lambda_l2': 10.102459018658152, 'max_bin': 2207, 'min_data_in_leaf': 79, 'min_gain_to_split': 0.025410657231685563, 'num_leaves': 399, 'scale_pos_weight': 7.647905702472766}.


param {'objective': 'binary', 'verbose': -1, 'metric': 'average-precision', 'num_leaves': 227, 'min_data_in_leaf': 280, 'max_bin': 636, 'feature_fraction': 0.0496948347987394, 'lambda_l1': 78.07539654361119, 'lambda_l2': 71.40330397684829, 'min_gain_to_split': 0.6945004573280377, 'scale_pos_weight': 1.0751860149751769}
best_avg_boosting_round 38
num_boost_round 81
max_mean_loss 0.6854292999524422


[I 2020-03-26 20:54:09,211] Finished trial#1 resulted in value: 0.6854292999524422. Current best value is 0.7968850133753123 with parameters: {'do_bagging': False, 'feature_fraction': 0.09412096600394598, 'lambda_l1': 27.128184898608758, 'lambda_l2': 10.102459018658152, 'max_bin': 2207, 'min_data_in_leaf': 79, 'min_gain_to_split': 0.025410657231685563, 'num_leaves': 399, 'scale_pos_weight': 7.647905702472766}.


param {'objective': 'binary', 'verbose': -1, 'metric': 'average-precision', 'num_leaves': 375, 'min_data_in_leaf': 139, 'max_bin': 2402, 'feature_fraction': 0.8711245514283712, 'lambda_l1': 57.037922100954816, 'lambda_l2': 10.944027472859537, 'min_gain_to_split': 0.002575708505765162, 'scale_pos_weight': 7.661446649420739}


KeyboardInterrupt: 