In [5]:
import time
import joblib
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report, roc_auc_score, log_loss
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform

In [2]:
# actually training the LightBGM model
xtrain_path = 'models/X_train.npy'
xtest_path = 'models/X_test.npy'
ytrain_path = 'models/y_train.npy'
ytest_path = 'models/y_test.npy'
cat_ind_path = 'models/cat_ind.npy'

light_model_path = 'models/light_trained.pkl'

X_train = np.load(xtrain_path)
X_test = np.load(xtest_path)
y_train = np.load(ytrain_path)
y_test = np.load(ytest_path)
categorical_features_indices = list(np.load(cat_ind_path))
categorical_features_indices = [int(i) for i in categorical_features_indices]

print(f'X_train shape : {X_train.shape}')
print(f'X_test shape : {X_test.shape}')
print(f'y_train shape : {y_train.shape}')
print(f'y_test shape : {y_test.shape}')

verbose = True

X_train shape : (503656, 346)
X_test shape : (125914, 346)
y_train shape : (503656,)
y_test shape : (125914,)


In [3]:
early_stopping_callback = lgb.early_stopping(stopping_rounds = 50, verbose = True)
lgbm = lgb.LGBMClassifier(objective = 'binary',
                          metric = 'binary_logloss',
                          n_estimators = 1000, # tuneable
                          learning_rate = 0.05, # tuneable
                          num_leaves = 31, # tuneable
                          max_depth = -1, # no limit on tree depth (hmmmm....)
                          random_state = 42,
                          n_jobs = -1) # use all cores

param_dist = {
    'n_estimators': sp_randint(100, 1500), # Sample number of estimators
    'learning_rate': sp_uniform(0.01, 0.1), # Sample learning rate
    'num_leaves': sp_randint(20, 60), # Sample number of leaves
    'max_depth': [-1, 10, 15, 20], # List of specific values to try
    'feature_fraction': sp_uniform(0.6, 0.4), # Sample between 0.6 and 1.0 (0.6 + 0.4)
    'bagging_fraction': sp_uniform(0.6, 0.4), # Sample between 0.6 and 1.0
    'bagging_freq': sp_randint(1, 10),
    'min_child_samples': sp_randint(10, 50), # Replaces min_data_in_leaf for sklearn API
    'lambda_l1': sp_uniform(0, 1.0), # L1 regularization
    'lambda_l2': sp_uniform(0, 1.0)  # L2 regularization
}

n_iter_search = 5 # Number of parameter settings that are sampled. 
random_search = RandomizedSearchCV(
    estimator = lgbm, # the model we training
    param_distributions = param_dist,
    n_iter = n_iter_search,
    scoring = 'neg_log_loss', # Use neg_log_loss since lower logloss is better
    cv = 2, # Number of cross-validation folds 
    refit = True, # Refit the best model on the whole training data
    random_state=42,
    verbose=1,
    n_jobs=1
)

fit_params = {
    'categorical_feature' : categorical_features_indices,
}

print('Starting training')
t0 = time.time()

random_search.fit (X_train, y_train, **fit_params)


Starting training
Fitting 2 folds for each of 5 candidates, totalling 10 fits
[LightGBM] [Info] Number of positive: 104290, number of negative: 147538
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.308371 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 77455
[LightGBM] [Info] Number of data points in the train set: 251828, number of used features: 346
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.414132 -> initscore=-0.346910
[LightGBM] [Info] Start training from score -0.346910




[LightGBM] [Info] Number of positive: 104290, number of negative: 147538
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.307938 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 77455
[LightGBM] [Info] Number of data points in the train set: 251828, number of used features: 346
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.414132 -> initscore=-0.346910
[LightGBM] [Info] Start training from score -0.346910




[LightGBM] [Info] Number of positive: 104290, number of negative: 147538
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.300270 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 77455
[LightGBM] [Info] Number of data points in the train set: 251828, number of used features: 346
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.414132 -> initscore=-0.346910
[LightGBM] [Info] Start training from score -0.346910




[LightGBM] [Info] Number of positive: 104290, number of negative: 147538
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.302288 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 77455
[LightGBM] [Info] Number of data points in the train set: 251828, number of used features: 346
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.414132 -> initscore=-0.346910
[LightGBM] [Info] Start training from score -0.346910




[LightGBM] [Info] Number of positive: 104290, number of negative: 147538
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.319859 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 77455
[LightGBM] [Info] Number of data points in the train set: 251828, number of used features: 346
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.414132 -> initscore=-0.346910
[LightGBM] [Info] Start training from score -0.346910




[LightGBM] [Info] Number of positive: 104290, number of negative: 147538
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.299070 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 77455
[LightGBM] [Info] Number of data points in the train set: 251828, number of used features: 346
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.414132 -> initscore=-0.346910
[LightGBM] [Info] Start training from score -0.346910




[LightGBM] [Info] Number of positive: 104290, number of negative: 147538
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.295193 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 77455
[LightGBM] [Info] Number of data points in the train set: 251828, number of used features: 346
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.414132 -> initscore=-0.346910
[LightGBM] [Info] Start training from score -0.346910




[LightGBM] [Info] Number of positive: 104290, number of negative: 147538
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.293950 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 77455
[LightGBM] [Info] Number of data points in the train set: 251828, number of used features: 346
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.414132 -> initscore=-0.346910
[LightGBM] [Info] Start training from score -0.346910




[LightGBM] [Info] Number of positive: 104290, number of negative: 147538
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.303687 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 77455
[LightGBM] [Info] Number of data points in the train set: 251828, number of used features: 346
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.414132 -> initscore=-0.346910
[LightGBM] [Info] Start training from score -0.346910




[LightGBM] [Info] Number of positive: 104290, number of negative: 147538
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.364756 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 77455
[LightGBM] [Info] Number of data points in the train set: 251828, number of used features: 346
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.414132 -> initscore=-0.346910
[LightGBM] [Info] Start training from score -0.346910




[LightGBM] [Info] Number of positive: 208580, number of negative: 295076
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.627623 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 77455
[LightGBM] [Info] Number of data points in the train set: 503656, number of used features: 346
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.414132 -> initscore=-0.346910
[LightGBM] [Info] Start training from score -0.346910
Finished training after 634.9907155036926 seconds.
Best parameters found are:

{'bagging_fraction': np.float64(0.9439761626945282), 'bagging_freq': 7, 'feature_fraction': np.float64(0.6682096494749166), 'lambda_l1': np.float64(0.06505159298527952), 'lambda_l2': np.float64(0.9488855372533332), 'learning_rate': np.float64(0.10656320330745593), 'max_depth': 10, 'min_child_samples': 18, 'n_estimators': 1469, 'num_leaves': 21}
Best LightGBM model saved at models/light_trained.pkl.


In [8]:
if verbose:
    print(f'Finished training after {time.time() - t0} seconds.\nBest parameters found are:\n')
    print(random_search.best_params_)

best_lgbm = random_search.best_estimator_
joblib.dump(best_lgbm, light_model_path)
print(f'Best LightGBM model saved at {light_model_path}.')

y_pred = best_lgbm.predict(X_test)

print(f'Classification report\n{classification_report(y_test, y_pred)}')
print(f'AUC SCORES\n{roc_auc_score(y_test, y_pred)}')
print(f'LogLoss score\n{log_loss(y_test, y_pred)}')

Finished training after 992.949738740921 seconds.
Best parameters found are:

{'bagging_fraction': np.float64(0.9439761626945282), 'bagging_freq': 7, 'feature_fraction': np.float64(0.6682096494749166), 'lambda_l1': np.float64(0.06505159298527952), 'lambda_l2': np.float64(0.9488855372533332), 'learning_rate': np.float64(0.10656320330745593), 'max_depth': 10, 'min_child_samples': 18, 'n_estimators': 1469, 'num_leaves': 21}
Best LightGBM model saved at models/light_trained.pkl.




Classification report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     73937
           1       1.00      1.00      1.00     51977

    accuracy                           1.00    125914
   macro avg       1.00      1.00      1.00    125914
weighted avg       1.00      1.00      1.00    125914

AUC SCORES
0.9998603713958238
LogLoss score
0.0045800979575416735
