Training LightGBM on the fraud dataset.
- SMOTE to oversample the minority (positive) class in the training set
- F1 score as the evaluation metric
- Early stopping during training
- Hyperopt for hyperparameter optimization

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE, ADASYN, BorderlineSMOTE, SVMSMOTE 
from imblearn.combine import SMOTETomek, SMOTEENN
import lightgbm as lgb
import sklearn
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
from sklearn.metrics import f1_score


In [2]:
# Feature matrices: each CSV is read in full and converted to a NumPy array.
x_train = pd.read_csv('./data/x_train.csv').values
x_test = pd.read_csv('./data/x_test.csv').values

# Label vectors: only the first column of each label file is kept, giving a
# one-dimensional array.
y_train = pd.read_csv('./data/y_train.csv').values[:, 0]
y_test = pd.read_csv('./data/y_test.csv').values[:, 0]


In [3]:
# Oversample with SMOTE. Only the training split is resampled; the test split
# is left untouched. The fixed random_state keeps the resampling repeatable.
smote = SMOTE(random_state=42)
x_train_resampled, y_train_resampled = smote.fit_resample(x_train, y_train)

# Class counts before resampling ...
print('#pos labels unsampled:', np.count_nonzero(y_train == 1))
print('#neg labels unsampled::', np.count_nonzero(y_train == 0))
# ... and after.
print('#pos labels resampled:', np.count_nonzero(y_train_resampled == 1))
print('#neg labels resampled::', np.count_nonzero(y_train_resampled == 0))

#pos labels unsampled: 394
#neg labels unsampled:: 227451
#pos labels resampled: 227451
#neg labels resampled:: 227451


In [4]:
# Wrap the arrays in LightGBM Dataset objects.
# Training uses the SMOTE-resampled split; the evaluation set uses the
# original, un-resampled test split.
train_data = lgb.Dataset(data=x_train_resampled, label=y_train_resampled)
test_data = lgb.Dataset(data=x_test, label=y_test)


In [5]:
def lgb_f1_score(y_hat, data):
    """Custom evaluation function that reports the F1 score to LightGBM.

    Parameters
    ----------
    y_hat : array-like
        Raw predictions handed over by LightGBM (presumably probabilities in
        [0, 1] for the binary objective -- confirm for the installed version).
    data : object
        Dataset being evaluated; only its ``get_label()`` method is used.

    Returns
    -------
    tuple
        ``('f1', score, True)``: the metric name, its value, and a flag that
        marks the metric as higher-is-better.
    """
    labels = data.get_label()
    # f1_score needs hard class labels, so round the predictions to 0/1.
    hard_predictions = np.round(y_hat)
    score = f1_score(labels, hard_predictions)
    return 'f1', score, True


In [6]:

def objective(params):
    """Hyperopt objective: train one LightGBM model and return its negated best F1.

    Parameters
    ----------
    params : dict
        One sample from the search space. Must provide the keys
        'num_leaves', 'min_data_in_leaf', 'max_bin', 'bagging_fraction',
        'bagging_freq', 'feature_fraction', 'lambda_l2' and
        'min_gain_to_split'.

    Returns
    -------
    float
        The negative of the highest F1 score recorded on the validation set
        during training. It is negated because hyperopt minimizes the loss.

    Notes
    -----
    NOTE(review): the model is early-stopped and scored on ``test_data``, so
    the tuned F1 is selected on the same data it is reported on and is likely
    optimistic. Consider carving out a separate validation split.
    """
    evals_result = {}

    param = {
        # Integer-valued parameters are cast explicitly: the search space
        # delivers them as floats (e.g. 500.0).
        'num_leaves': int(params['num_leaves']),
        'min_data_in_leaf': int(params['min_data_in_leaf']),
        'max_bin': int(params['max_bin']),
        'learning_rate': 0.1,
        # Upper bound on boosting rounds; early stopping usually ends sooner.
        'num_trees': 1000,
        'objective': 'binary',
        'bagging_fraction': params['bagging_fraction'],
        'bagging_freq': int(params['bagging_freq']),
        'feature_fraction': params['feature_fraction'],
        'verbose': -1,
        # Passed as a plain scalar. A trailing comma here would wrap the
        # value in a 1-tuple.
        'lambda_l2': params['lambda_l2'],
        'min_gain_to_split': params['min_gain_to_split'],
    }

    # NOTE(review): newer LightGBM releases (4.x) dropped the
    # early_stopping_rounds / verbose_eval / evals_result keyword arguments in
    # favour of callbacks -- confirm against the installed version.
    lgb.train(param,
              train_data,
              valid_sets=[test_data],
              early_stopping_rounds=15,
              verbose_eval=False,
              feval=lgb_f1_score,
              evals_result=evals_result,
              )

    # Best F1 seen on the (single) validation set across all boosting rounds.
    best_f1 = max(evals_result['valid_0']['f1'])

    return -best_f1


In [7]:
# Search space handed to hyperopt. The quniform entries are sampled on a grid
# with the given step and arrive as floats; `objective` casts them to int.
space = {
    'num_leaves': hp.quniform('num_leaves', 100, 600, 10),
    'min_data_in_leaf': hp.quniform('min_data_in_leaf', 10, 30, 1),
    'max_bin': hp.quniform('max_bin', 200, 2000, 10),
    'bagging_fraction': hp.uniform('bagging_fraction', 0.01, 1.0),
    'bagging_freq': hp.quniform('bagging_freq', 0, 10, 1),
    'feature_fraction': hp.uniform('feature_fraction', 0.5, 1.0),
    'lambda_l2': hp.uniform('lambda_l2', 0.0, 80.0),
    'min_gain_to_split': hp.uniform('min_gain_to_split', 0.0, 1.0),
}

# `trials` keeps the full evaluation history so the losses can be inspected
# after the search finishes.
trials = Trials()

# Run 800 evaluations of `objective` with the TPE suggestion algorithm.
best = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=800,
    trials=trials,
)

print('#best', best)

print('#min(trials.losses())', min(trials.losses()))


  0%|          | 0/800 [00:00<?, ?it/s, best loss: ?]



100%|██████████| 800/800 [1:22:28<00:00,  6.72s/it, best loss: -0.8659793814432989]
#best {'bagging_fraction': 0.7589063217431224, 'bagging_freq': 3.0, 'feature_fraction': 0.9971560184326493, 'lambda_l2': 0.03681947685513025, 'max_bin': 1970.0, 'min_data_in_leaf': 30.0, 'min_gain_to_split': 0.00047339850140067086, 'num_leaves': 500.0}
#min(trials.losses()) -0.8659793814432989


In [None]:
#16%|█▌        | 125/800 [11:27<1:14:11,  6.59s/it, best loss: -0.8469387755102041]
#19%|█▉        | 153/800 [14:15<1:11:37,  6.64s/it, best loss: -0.8514851485148514]
#20%|██        | 160/800 [15:07<1:18:03,  7.32s/it, best loss: -0.8571428571428571]
#40%|████      | 321/800 [31:41<52:51,  6.62s/it, best loss: -0.8615384615384615]  
#60%|█████▉    | 479/800 [47:53<34:38,  6.48s/it, best loss: -0.864321608040201]  
#90%|█████████ | 724/800 [1:13:52<09:38,  7.61s/it, best loss: -0.8659793814432989]
#100%|██████████| 800/800 [1:22:28<00:00,  6.72s/it, best loss: -0.8659793814432989]
#best {'bagging_fraction': 0.7589063217431224, 'bagging_freq': 3.0, 'feature_fraction': 0.9971560184326493, 'lambda_l2': 0.03681947685513025, 'max_bin': 1970.0, 'min_data_in_leaf': 30.0, 'min_gain_to_split': 0.00047339850140067086, 'num_leaves': 500.0}
#min(trials.losses()) -0.8659793814432989