Training LightGBM on the fraud dataset.
- SMOTE for upsampling the minority (fraud) class
- F1 score as the evaluation metric
- Early stopping on the validation metric

In [1]:
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
import lightgbm as lgb
from sklearn.metrics import f1_score
import warnings


In [2]:
# Read the train/test inputs from CSV and keep them as plain NumPy arrays.
x_train = pd.read_csv('./data/x_train.csv').values
x_test = pd.read_csv('./data/x_test.csv').values

# For the label files only the first column is kept, which yields 1-D vectors.
y_train = pd.read_csv('./data/y_train.csv').values[:, 0]
y_test = pd.read_csv('./data/y_test.csv').values[:, 0]


In [3]:
def lgb_f1_score(y_hat, data):
    """Custom evaluation function reporting the F1 score.

    Parameters
    ----------
    y_hat : array-like
        Raw model outputs for the rows of ``data``; rounded to the nearest
        integer before scoring because ``f1_score`` needs class labels.
    data : object exposing ``get_label()``
        Dataset whose true labels are compared against ``y_hat``.

    Returns
    -------
    tuple
        ``('f1', score, True)`` -- metric name, metric value, and a flag that
        (per the LightGBM ``feval`` convention) marks higher values as better.
    """
    labels = data.get_label()
    # Turn the continuous scores into hard class predictions.
    hard_predictions = np.round(y_hat)
    score = f1_score(labels, hard_predictions)
    return 'f1', score, True


In [4]:

def objective(params):
    """Train a LightGBM binary classifier on a SMOTE-resampled training set.

    Reads the module-level arrays ``x_train``, ``y_train``, ``x_test`` and
    ``y_test``; the training split is oversampled with SMOTE and the test
    split is used as the validation set for early stopping.

    Parameters
    ----------
    params : dict
        Hyperparameters. Required keys: ``num_leaves``, ``min_data_in_leaf``,
        ``max_bin``, ``bagging_fraction``, ``bagging_freq``,
        ``feature_fraction``, ``lambda_l2``, ``min_gain_to_split``,
        ``scale_pos_weight`` and ``num_fraud``. Integer-valued settings may
        be given as floats; they are cast with ``int()``.

    Returns
    -------
    The trained booster returned by ``lgb.train``.

    Raises
    ------
    KeyError
        If any required key is missing from ``params``.
    """
    print(params)

    # Integer-valued settings may arrive as floats, so cast them explicitly.
    num_leaves = int(params['num_leaves'])
    min_data_in_leaf = int(params['min_data_in_leaf'])
    max_bin = int(params['max_bin'])
    bagging_freq = int(params['bagging_freq'])

    bagging_fraction = params['bagging_fraction']
    feature_fraction = params['feature_fraction']
    lambda_l2 = params['lambda_l2']
    min_gain_to_split = params['min_gain_to_split']
    scale_pos_weight = params['scale_pos_weight']
    num_fraud = params['num_fraud']

    param = {'num_leaves': num_leaves,
             'min_data_in_leaf': min_data_in_leaf,
             'max_bin': max_bin,
             'learning_rate': 0.1,
             'num_trees': 1000,
             'objective': 'binary',
             'bagging_fraction': bagging_fraction,
             'bagging_freq': bagging_freq,
             'feature_fraction': feature_fraction,
             #'verbose':-1,
             'lambda_l2': lambda_l2,
             'min_gain_to_split': min_gain_to_split,
             'scale_pos_weight': scale_pos_weight,
            }

    # sampling_strategy is passed as a float ratio.
    # NOTE(review): 227451 is presumably the number of majority-class
    # (non-fraud) rows in x_train, so that num_fraud is the target number of
    # fraud rows after resampling -- TODO confirm against the data.
    x_train_resampled, y_train_resampled = SMOTE(
        random_state=42,
        sampling_strategy=num_fraud / 227451,
    ).fit_resample(x_train, y_train)

    train_data = lgb.Dataset(x_train_resampled, label=y_train_resampled)
    # NOTE(review): early stopping is monitored on the test split, so any
    # score later computed on x_test/y_test is optimistically biased.
    test_data = lgb.Dataset(x_test, label=y_test)

    # NOTE(review): the `early_stopping_rounds` keyword was removed from
    # lgb.train in LightGBM 4.0; on newer versions pass
    # callbacks=[lgb.early_stopping(15)] instead.
    bst = lgb.train(param,
                    train_data,
                    valid_sets=[test_data],
                    early_stopping_rounds=15,
                    #verbose_eval=False,
                    feval=lgb_f1_score,
                   )

    return bst
    
   


In [5]:
# Hyperparameter set passed to objective() for the final training run.
best = {
    'bagging_fraction': 0.9997070861877592,
    'bagging_freq': 20.0,
    'feature_fraction': 0.6950759609275808,
    'lambda_l2': 5.3205080171148165,
    'max_bin': 2470.0,
    'min_data_in_leaf': 289.0,
    'min_gain_to_split': 0.6120818152340506,
    'num_fraud': 460.0,
    'num_leaves': 700.0,
    'scale_pos_weight': 8.828004877536069,
}

# Train with these settings and keep the resulting booster for later cells.
booster = objective(best)

{'bagging_fraction': 0.9997070861877592, 'bagging_freq': 20.0, 'feature_fraction': 0.6950759609275808, 'lambda_l2': 5.3205080171148165, 'max_bin': 2470.0, 'min_data_in_leaf': 289.0, 'min_gain_to_split': 0.6120818152340506, 'num_fraud': 460.0, 'num_leaves': 700.0, 'scale_pos_weight': 8.828004877536069}




[1]	valid_0's binary_logloss: 0.0232832	valid_0's f1: 0.681034
Training until validation scores don't improve for 15 rounds.
[2]	valid_0's binary_logloss: 0.0199374	valid_0's f1: 0.681034
[3]	valid_0's binary_logloss: 0.0168572	valid_0's f1: 0.681034
[4]	valid_0's binary_logloss: 0.0135264	valid_0's f1: 0.681034
[5]	valid_0's binary_logloss: 0.0102002	valid_0's f1: 0.681034
[6]	valid_0's binary_logloss: 0.00962852	valid_0's f1: 0.681034
[7]	valid_0's binary_logloss: 0.00764848	valid_0's f1: 0.681034
[8]	valid_0's binary_logloss: 0.00661451	valid_0's f1: 0.692308
[9]	valid_0's binary_logloss: 0.00610566	valid_0's f1: 0.692308
[10]	valid_0's binary_logloss: 0.00573049	valid_0's f1: 0.697872
[11]	valid_0's binary_logloss: 0.00548064	valid_0's f1: 0.694915
[12]	valid_0's binary_logloss: 0.00523716	valid_0's f1: 0.694915
[13]	valid_0's binary_logloss: 0.00505158	valid_0's f1: 0.713043
[14]	valid_0's binary_logloss: 0.00492209	valid_0's f1: 0.719298
[15]	valid_0's binary_logloss: 0.00473386	

In [6]:
# Score the test inputs with the trained booster.
predictions = booster.predict(x_test)

# Show the raw outputs and confirm there is one prediction per test row.
print('predictions:', predictions)
print('predictions.shape:', predictions.shape)

predictions: [4.63248435e-05 9.81981277e-05 2.61699537e-03 ... 5.95966967e-05
 9.23186404e-05 4.81079780e-04]
predictions.shape: (56962,)


In [7]:
# f1_score needs hard class labels, so round the predicted scores first.
y_hat = np.round(predictions)

# Store the result under its own name: assigning it to `f1_score` would
# replace the imported function with a float and break every later call
# (including lgb_f1_score and a re-run of this cell).
test_f1 = f1_score(y_test, y_hat)

print('f1_score:', test_f1)

f1_score: 0.8936170212765957
