### Initialization

In [15]:
# Standard library
import pprint
from time import time

# Core scientific stack
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Distributions for hyperparameter sampling
from scipy.stats import loguniform
from scipy.stats import randint
from scipy.stats import uniform

# Model Selection
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split

# Metrics
from sklearn.metrics import get_scorer
from sklearn.metrics import roc_auc_score
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score

# Classifiers
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
import lightgbm as lgb
import xgboost as xgb

# Skopt functions
from skopt import BayesSearchCV
from skopt import gp_minimize # Bayesian optimization using Gaussian Processes

# Notebook-wide parallelism setting: -1 asks joblib-backed utilities to use all available CPU cores.
n_jobs=-1

### Loading the Data

In [16]:
# Load the aggregated table; the first CSV column is used as the row index.
data = pd.read_csv('agg_table.csv', index_col=0)

# Separate the 'label' target column from the features, then switch to plain NumPy arrays.
X, y = data.drop('label', axis=1), data['label']
X, y = X.to_numpy(), y.to_numpy()

# Hold out 10% of the rows as a test set.
# random_state makes the split reproducible across kernel restarts (same seed as
# the CV splitter and the search), and stratify=y keeps the label proportions of
# the full data in both parts, matching the stratified folds used for CV.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=0, stratify=y
)

In [17]:
# Cross-validation splitter and scoring used by the hyperparameter search.

# Seeded, shuffled 5-fold stratified splitter, so every candidate is scored on the same folds.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

# ROC AUC has to be computed from continuous scores (decision_function /
# predict_proba), not from hard class predictions. scikit-learn's built-in
# 'roc_auc' scorer does exactly that and works across library versions, whereas
# make_scorer(..., needs_threshold=True) is deprecated/removed in recent releases.
roc_auc = get_scorer('roc_auc')

In [18]:
# Reporting util for different optimizers

def report_perf(optimizer, X, y, title, callbacks=None):
    """
    Fit a hyperparameter search, print a short summary and return the best parameters.

    optimizer = an unfitted sklearn or skopt search object; after ``fit`` it must
                expose ``cv_results_``, ``best_score_``, ``best_index_`` and
                ``best_params_``
    X = the training set
    y = our target
    title = a label for the experiment, printed verbatim at the start of the summary
    callbacks = optional; when truthy it is forwarded to ``fit`` as the
                ``callback`` keyword, otherwise ``fit`` is called without it

    Returns the ``best_params_`` of the fitted optimizer.
    """
    start = time()
    if callbacks:
        optimizer.fit(X, y, callback=callbacks)
    else:
        optimizer.fit(X, y)
    # Stop the clock right after fitting so the reported time covers the search only.
    elapsed = time() - start

    cv_results = optimizer.cv_results_
    best_score = optimizer.best_score_
    # Standard deviation of the test score across folds for the winning candidate.
    best_score_std = cv_results['std_test_score'][optimizer.best_index_]
    best_params = optimizer.best_params_

    # The title is passed as a format argument (not concatenated into the format
    # string), so a '%' inside the title is printed literally instead of breaking
    # the formatting.
    print("%s took %.2f seconds,  candidates checked: %d, best CV score: %.3f \u00B1 %.3f"
          % (title,
             elapsed,
             len(cv_results['params']),
             best_score,
             best_score_std))
    print('Best parameters:')
    pprint.pprint(best_params)
    print()
    return best_params


In [None]:
# LightGBM binary classifier trained on the GPU.
# The former `tree_method='gpu_hist'` argument was dropped: it is an XGBoost
# option that LightGBM does not recognise; `device_type='gpu'` is the setting
# that selects GPU training here.
clf = lgb.LGBMClassifier(boosting_type='gbdt',
                         objective='binary',
                         device_type='gpu',
                         gpu_use_dp=True,
                         verbose=100)

# Search space for RandomizedSearchCV. Each entry is either a list of choices or
# a frozen scipy.stats distribution (anything exposing `rvs`); skopt `Real`
# dimensions are meant for BayesSearchCV and were never imported in this notebook.
param_distributions = {
    "learning_rate": loguniform(0.001, 1),
    "scale_pos_weight": [0.3, 0.4, 0.5],
    "n_estimators": randint(10, 10000),
    "num_leaves": randint(2, 500),
    # scipy's uniform(loc, scale) spans [loc, loc + scale]; a scale of 0.99 keeps
    # the sampled fraction within (0, 1], which LightGBM requires.
    "subsample": uniform(0.01, 0.99),
    "min_child_weight": randint(0, 10),
    # L2 / L1 regularisation under the names the LightGBM sklearn wrapper uses.
    # In LightGBM, `alpha` is the huber/quantile loss parameter, not L1
    # regularisation, and `lambda` is only an alias that would clash with the
    # estimator's own `reg_lambda`.
    "reg_lambda": loguniform(1e-5, 1000),
    "reg_alpha": loguniform(1e-3, 100),
    "max_depth": randint(0, 500),
    "min_child_samples": randint(0, 200),
    "subsample_freq": randint(0, 10),
    "colsample_bytree": uniform(0.01, 0.99),
}

# 40 random candidates, each scored by ROC AUC on the shared stratified folds.
# `iid` is no longer passed: the argument was removed from scikit-learn, and the
# plain average across folds (what iid=False requested) is now the only behaviour.
# NOTE(review): with n_jobs=-1 every parallel worker trains on the same GPU;
# confirm the device can handle that, or lower `n_jobs`.
random_search = RandomizedSearchCV(clf,
                                   param_distributions=param_distributions,
                                   n_iter=40,
                                   n_jobs=n_jobs,  # notebook-wide setting defined in the initialization cell
                                   cv=skf,
                                   scoring=roc_auc,
                                   return_train_score=False,
                                   random_state=0,
                                   verbose=2)

report_perf(random_search, X_train, y_train, "RandomizedsearchCV")

Fitting 5 folds for each of 40 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
