Training LightGBM on the fraud dataset.
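- SMOTE for upsampling (optional; the run recorded below trains on the original, un-resampled data)
- F1 score as the evaluation metric
- Early stopping during training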
- Hyperopt for hyperparameter optimization

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE, ADASYN, BorderlineSMOTE, SVMSMOTE 
from imblearn.combine import SMOTETomek, SMOTEENN
import lightgbm as lgb
import sklearn
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
from sklearn.metrics import f1_score


In [2]:
# Feature matrices: every column of the CSV, as a 2-D NumPy array.
x_train = pd.read_csv('./data/x_train.csv').values
x_test = pd.read_csv('./data/x_test.csv').values

# Label vectors: only the first column of each label file is kept.
y_train = pd.read_csv('./data/y_train.csv').values[:, 0]
y_test = pd.read_csv('./data/y_test.csv').values[:, 0]


In [3]:
# No resampling is applied in this cell: the "resampled" names simply alias
# the original training arrays so the downstream cells stay unchanged.
x_train_resampled = x_train
y_train_resampled = y_train

# Class balance before and after the (currently no-op) resampling step.
print('#pos labels unsampled:', (y_train == 1).sum())
print('#neg labels unsampled::', (y_train == 0).sum())
print('#pos labels resampled:', (y_train_resampled == 1).sum())
print('#neg labels resampled::', (y_train_resampled == 0).sum())

#pos labels unsampled: 394
#neg labels unsampled:: 227451
#pos labels resampled: 394
#neg labels resampled:: 227451


In [4]:
# Wrap the held-out split and the (possibly resampled) training split in
# LightGBM Dataset objects; both are consumed by lgb.train in `objective`.
test_data = lgb.Dataset(x_test, label=y_test)
train_data = lgb.Dataset(x_train_resampled, label=y_train_resampled)


In [5]:
def lgb_f1_score(y_hat, data):
    """Custom LightGBM evaluation function that reports the F1 score.

    Parameters
    ----------
    y_hat : array-like
        Predicted scores for the rows of ``data``. They are rounded to the
        nearest integer before scoring.
    data : object exposing ``get_label()``
        The dataset being evaluated; its labels are the ground truth.

    Returns
    -------
    tuple
        ``('f1', score, True)`` -- metric name, metric value, and a flag that
        is passed back to LightGBM as "higher is better".
    """
    labels = data.get_label()
    # f1_score expects hard class labels, not probabilities.
    hard_predictions = np.round(y_hat)
    score = f1_score(labels, hard_predictions)
    return 'f1', score, True


In [6]:

def _build_lgb_params(params):
    """Translate one hyperopt sample into a LightGBM parameter dict.

    hyperopt's ``quniform`` returns floats, so the integer-valued LightGBM
    parameters are cast with ``int`` here. Everything else is passed through
    as a plain scalar.
    """
    return {
        'num_leaves': int(params['num_leaves']),
        'min_data_in_leaf': int(params['min_data_in_leaf']),
        'max_bin': int(params['max_bin']),
        'learning_rate': 0.1,
        'num_trees': 1000,
        'objective': 'binary',
        'bagging_fraction': params['bagging_fraction'],
        'bagging_freq': int(params['bagging_freq']),
        'feature_fraction': params['feature_fraction'],
        'verbose': -1,
        # Must be a scalar: a stray trailing comma previously turned this
        # value into a 1-tuple.
        'lambda_l2': params['lambda_l2'],
        'min_gain_to_split': params['min_gain_to_split'],
        # Cannot set is_unbalance and scale_pos_weight at the same time,
        # so only scale_pos_weight is tuned here.
        'scale_pos_weight': params['scale_pos_weight'],
    }


def objective(params):
    """Hyperopt objective: train LightGBM once and return the negated best F1.

    Parameters
    ----------
    params : dict
        One sample from the search space. Must contain the keys
        ``num_leaves``, ``min_data_in_leaf``, ``max_bin``,
        ``bagging_fraction``, ``bagging_freq``, ``feature_fraction``,
        ``lambda_l2``, ``min_gain_to_split`` and ``scale_pos_weight``.

    Returns
    -------
    float
        ``-max(F1)`` over the recorded boosting rounds on ``test_data``
        (negated because hyperopt minimizes).

    Notes
    -----
    * Reads the module-level ``train_data``, ``test_data`` and
      ``lgb_f1_score``.
    * NOTE(review): the same ``test_data`` is used for early stopping and for
      scoring every trial, so the reported F1 is optimistic -- consider a
      separate validation split.
    * NOTE(review): no ``metric`` is set, so LightGBM presumably also tracks
      its default metric for the ``binary`` objective and early stopping may
      trigger on that rather than on F1 -- confirm, and set
      ``'metric': 'None'`` if F1-only stopping is intended.
    * NOTE(review): ``early_stopping_rounds``, ``verbose_eval`` and
      ``evals_result`` were removed from ``lgb.train`` in LightGBM 4.0 in
      favour of callbacks; the call is left as-is to match the version this
      notebook was run with.
    """
    evals_result = {}

    lgb.train(_build_lgb_params(params),
              train_data,
              valid_sets=[test_data],
              early_stopping_rounds=15,
              verbose_eval=False,
              feval=lgb_f1_score,
              evals_result=evals_result,
             )

    # Best F1 seen on the validation set across all evaluated rounds.
    f1 = max(evals_result['valid_0']['f1'])

    return -f1


In [7]:
# Search space handed to hyperopt. quniform samples are floats on a grid
# (low, high, step); `objective` casts the integer-valued ones.
space = {
    'num_leaves': hp.quniform('num_leaves', 100, 700, 10),
    'min_data_in_leaf': hp.quniform('min_data_in_leaf', 10, 300, 1),
    'max_bin': hp.quniform('max_bin', 200, 4000, 10),
    # 0.0 < bagging_fraction <= 1.0
    'bagging_fraction': hp.uniform('bagging_fraction', 0.01, 1.0),
    'bagging_freq': hp.quniform('bagging_freq', 0, 20, 1),
    # 0.0 < feature_fraction <= 1.0
    'feature_fraction': hp.uniform('feature_fraction', 0.01, 1.0),
    'lambda_l2': hp.uniform('lambda_l2', 0.0, 80.0),
    'min_gain_to_split': hp.uniform('min_gain_to_split', 0.0, 1.0),
    'scale_pos_weight': hp.uniform('scale_pos_weight', 1.0, 10.0),
}

# Trials records every evaluation so the losses can be inspected afterwards.
trials = Trials()

# Run 800 TPE-guided evaluations of `objective`.
best = fmin(
    objective,
    space=space,
    algo=tpe.suggest,
    trials=trials,
    max_evals=800,
)

print('#best', best)

print('#min(trials.losses())', min(trials.losses()))


  0%|          | 0/800 [00:00<?, ?it/s, best loss: ?]


  'precision', 'predicted', average, warn_for)



100%|██████████| 800/800 [21:20<00:00,  1.60s/it, best loss: -0.8961748633879782]
#best {'bagging_fraction': 0.6716929834546987, 'bagging_freq': 14.0, 'feature_fraction': 0.9577004284877448, 'lambda_l2': 1.7961190036503814, 'max_bin': 3060.0, 'min_data_in_leaf': 120.0, 'min_gain_to_split': 0.06565291104633288, 'num_leaves': 170.0, 'scale_pos_weight': 3.278613861991695}
#min(trials.losses()) -0.8961748633879782


In [8]:
#no oversampling, is_unbalance=False (default), scale_pos_weight tuned
# 63%|██████▎   | 507/800 [12:54<07:27,  1.53s/it, best loss: -0.8961748633879782]
#100%|██████████| 800/800 [21:20<00:00,  1.60s/it, best loss: -0.8961748633879782]
#best {'bagging_fraction': 0.6716929834546987, 'bagging_freq': 14.0, 'feature_fraction': 0.9577004284877448, 
#'lambda_l2': 1.7961190036503814, 'max_bin': 3060.0, 'min_data_in_leaf': 120.0, 
#'min_gain_to_split': 0.06565291104633288, 'num_leaves': 170.0, 'scale_pos_weight': 3.278613861991695}
#min(trials.losses()) -0.8961748633879782

#no oversampling, is_unbalance=True
# 88%|████████▊ | 704/800 [47:37<06:29,  4.06s/it, best loss: -0.8118811881188118]
#100%|██████████| 800/800 [53:48<00:00,  4.04s/it, best loss: -0.8118811881188118]
#best {'bagging_fraction': 0.672585863484594, 'bagging_freq': 10.0, 'feature_fraction': 0.9710798696355661, 
#'lambda_l2': 29.495437260844124, 'max_bin': 410.0, 'min_data_in_leaf': 22.0, 
#'min_gain_to_split': 0.15127703222007383, 'num_leaves': 300.0}
#min(trials.losses()) -0.8118811881188118
    
#no oversampling, is_unbalance=False, scale_pos_weight not tuned
# 22%|██▏       | 173/800 [02:15<08:12,  1.27it/s, best loss: -0.8839779005524863]
# 84%|████████▍ | 673/800 [10:29<01:58,  1.07it/s, best loss: -0.888888888888889] 
#100%|██████████| 800/800 [12:41<00:00,  1.05it/s, best loss: -0.888888888888889] 
#best {'bagging_fraction': 0.5809891002103391, 'bagging_freq': 7.0, 'feature_fraction': 0.5843200851328278, 
#'lambda_l2': 5.2016757442137145, 'max_bin': 1300.0, 'min_data_in_leaf': 12.0, 
#'min_gain_to_split': 0.32646303021539036, 'num_leaves': 370.0}
#min(trials.losses()) -0.888888888888889

#With BorderlineSMOTE
#best {'bagging_fraction': 0.9496268535571462, 'bagging_freq': 3.0, 'feature_fraction': 0.5258984347286072, 
#'lambda_l2': 4.108394768587879, 'max_bin': 1710.0, 'min_data_in_leaf': 12.0, 
#'min_gain_to_split': 0.029228723501028592, 'num_leaves': 390.0}
#min(trials.losses()) -0.8677248677248677

#With SMOTE
#best {'bagging_fraction': 0.7589063217431224, 'bagging_freq': 3.0, 'feature_fraction': 0.9971560184326493, 
#'lambda_l2': 0.03681947685513025, 'max_bin': 1970.0, 'min_data_in_leaf': 30.0, 
#'min_gain_to_split': 0.00047339850140067086, 'num_leaves': 500.0}
#min(trials.losses()) -0.8659793814432989