This notebook trains the SGBT model.

In [None]:
import csv
import os
import pickle

import glob2
import numpy as np
from matplotlib import pyplot as plt
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn import metrics
from sklearn.utils import shuffle
from hyperopt import hp, Trials
from hyperopt import fmin, tpe, space_eval

## Parameters

In [None]:
ROOT_DIR = "/path/to/the/dataset"  # path where we expect to find directories named "positives" and "negatives" and the file "dataset.csv"
CHECKPOINTS_DIR = "../../../../data/model_saves/SGBT"  # directory where the hyperopt trials history and the trained model are pickled
FOLDS = 5  # number of folds for the cross-validation
# search space for SGBT hyperparameters, given as (min, max) pairs
lr_limits = (0.01, 1.0)  # learning rate; sampled log-uniformly by the tuning cell
nb_limits = (1, 10_000)  # number of boosting iterations (max_iter); sampled uniformly by the tuning cell
np.random.seed(seed=0)  # seed NumPy's global RNG

## Load data

In [None]:
# Read the dataset index. Each row is used as (path, sample index, label):
#   column 0: a path whose last component is the name of a .npy file,
#   column 1: the index of the sample inside that file,
#   column 2: "positive" for positive samples, anything else means negative.
# newline="" is the way the csv module asks files to be opened.
with open(ROOT_DIR + "/dataset.csv", "r", newline="") as f:
    lines = list(csv.reader(f, delimiter=","))

# Load every .npy file, keyed by its file name. os.path.basename is used rather
# than splitting on "/" so that the OS-native separators glob may return
# (e.g. on Windows) still yield the bare file name.
pos_f = glob2.glob(f"{ROOT_DIR}/positives/*.npy")
pos_data = {os.path.basename(p): np.load(p) for p in pos_f}
neg_f = glob2.glob(f"{ROOT_DIR}/negatives/*.npy")
neg_data = {os.path.basename(n): np.load(n) for n in neg_f}

# Pick, for each row of the index, the referenced sample out of the loaded arrays.
posX, negX = [], []
for line in lines:
    is_positive = line[2] == "positive"
    _X_list, data = (posX, pos_data) if is_positive else (negX, neg_data)
    station = os.path.basename(line[0])
    idx = int(line[1])
    _X_list.append(data[station][idx])
posX, negX = np.array(posX), np.array(negX)

print(f"{len(posX)} positive samples and {len(negX)} negative samples found")

# Labels: 1 for positives, 0 for negatives. Each class is shuffled on its own
# (samples and labels together) before the folds are cut.
posY, negY = np.ones(len(posX)), np.zeros(len(negX))
posX, posY = shuffle(posX, posY)
negX, negY = shuffle(negX, negY)

#### Make the datasets for a cross-validation approach

In [None]:
# Build one (training, validation) split per fold.
# The fold boundaries are computed from the number of positive samples and are
# applied to both classes: the validation fold therefore takes the same index
# window from the negatives as from the positives, and every sample outside
# that window (for both classes) goes to the training set.
X_trains, Y_trains, X_valids, Y_valids = [], [], [], []
for i in range(FOLDS):
    start_valid_idx = int(len(posX) * i / FOLDS)
    end_valid_idx = int(len(posX) * (i + 1) / FOLDS)
    before = slice(None, start_valid_idx)
    held_out = slice(start_valid_idx, end_valid_idx)
    after = slice(end_valid_idx, None)

    # unbalanced training set: everything outside the held-out window
    fold_X_train = np.concatenate((posX[before], posX[after], negX[before], negX[after]))
    fold_Y_train = np.concatenate((posY[before], posY[after], negY[before], negY[after]))
    # balanced validation set: the held-out window of each class
    fold_X_valid = np.concatenate((posX[held_out], negX[held_out]))
    fold_Y_valid = np.concatenate((posY[held_out], negY[held_out]))

    # mix the two classes; training is shuffled first, then validation
    fold_X_train, fold_Y_train = shuffle(fold_X_train, fold_Y_train)
    fold_X_valid, fold_Y_valid = shuffle(fold_X_valid, fold_Y_valid)

    X_trains.append(fold_X_train)
    Y_trains.append(fold_Y_train)
    X_valids.append(fold_X_valid)
    Y_valids.append(fold_Y_valid)

# full dataset, used for the final training
X_train_all = np.concatenate((posX, negX))
Y_train_all = np.concatenate((posY, negY))

## Training: hyperparameter tuning

In [None]:
# loss function used by hyperopt to evaluate a set of hyperparameters
def objective(args):
    """Return 1 minus the ROC AUC averaged over the FOLDS validation folds.

    args is a (nb, lr) pair: nb is converted to an int and used as max_iter,
    lr is used as the learning rate. One classifier is trained per fold on
    X_trains[i] / Y_trains[i] and scored on X_valids[i] / Y_valids[i].
    """
    raw_nb, learning_rate = args
    n_iter = int(raw_nb)
    fold_aucs = []
    for fold in range(FOLDS):
        model = HistGradientBoostingClassifier(learning_rate=learning_rate, max_iter=n_iter, max_depth=4, random_state=0)
        model.fit(X_trains[fold], Y_trains[fold])
        # second column: predicted probability of the class labelled 1
        positive_proba = model.predict_proba(X_valids[fold])[:, 1]
        fold_aucs.append(metrics.roc_auc_score(Y_valids[fold], positive_proba))
    # hyperopt minimizes, hence 1 - mean AUC
    return 1 - sum(fold_aucs) / FOLDS

# Make sure the checkpoint directory exists *before* the search starts, so that
# a missing directory cannot make the save below fail once the search is done.
os.makedirs(CHECKPOINTS_DIR, exist_ok=True)

# object to record the values tried by hyperopt
trials = Trials()

# a priori distributions: uniform over the number of iterations,
# log-uniform over the learning rate
space = [
    hp.uniform('nb', nb_limits[0], nb_limits[1]),
    hp.loguniform('lr', np.log(lr_limits[0]), np.log(lr_limits[1])),
]

# minimize the objective over the space
# NOTE(review): no rstate is passed to fmin, so the sequence of points tried may
# differ from run to run despite the NumPy seed -- confirm against the installed
# hyperopt version.
best = fmin(objective, space, algo=tpe.suggest, max_evals=10, trials=trials)
best_params = space_eval(space, best)

print(best)
print(best_params)  # (nb, lr); 2421, 0.10524 were obtained
# re-evaluate the loss at the best point found
print(objective((best_params[0], best_params[1])))

# save the history
with open(f"{CHECKPOINTS_DIR}/save_trials_20s", 'wb') as f:
    pickle.dump(trials, f)

## Analysis of the explored space

In [None]:
# load tried hyperparameters history
# NOTE: unpickling can execute arbitrary code -- only load trial files you produced yourself.
with open(f"{CHECKPOINTS_DIR}/save_trials_20s", 'rb') as f:
    trials = pickle.load(f)
# each entry of "vals" is indexed with [0] to get the value: unwrap them all
best = {name: values[0] for name, values in trials.best_trial["misc"]["vals"].items()}

# get tried points: learning rates, numbers of iterations and losses
xy_HO = [np.array([x['misc']['vals']['lr'] for x in trials.trials]),
         np.array([x['misc']['vals']['nb'] for x in trials.trials]),
         np.array([x['result']['loss'] for x in trials.trials])]

# get best point; space_eval follows the order of `space`, i.e. (nb, lr),
# and is evaluated once instead of once per coordinate
best_nb, best_lr = space_eval(space, best)
best_HO = (-trials.best_trial['result']['loss'], (best_lr, best_nb))

fig0, ax = plt.subplots(figsize=(8, 8))

# the color depends on the loss: 1 - loss is the mean AUC returned by the objective
colors = 1 - xy_HO[2]
plot = ax.scatter(xy_HO[0][:, 0], xy_HO[1][:, 0], linewidth=0, marker='.', c=colors)

# mark the best result with a cross (which means two lines, one vertical, one horizontal)
ax.plot(lr_limits, [best_HO[1][1]]*2, linewidth=1, linestyle='--', color='red')
ax.plot([best_HO[1][0]]*2, nb_limits, linewidth=1, linestyle='--', color='red')

ax.set_xlim(lr_limits)
ax.set_ylim(nb_limits)

_ = fig0.colorbar(plot, ax=ax, fraction=0.05, pad=0.07, aspect=18)
ax.set_xlabel('learning rate', fontsize=12)
ax.set_ylabel('number of trees', fontsize=12)

ax.grid(True)
# rescale so that both axis ranges take a comparable extent on the figure
ax.set_aspect(lr_limits[1]/nb_limits[1])
fig0.savefig('../../../figures/SGBT_parameters_space.png')

## Final training on all data

In [None]:
# Final model, trained on the whole dataset with the hyperparameters reported
# by the hyperparameter search (2421 iterations, learning rate 0.10524).
classifier = HistGradientBoostingClassifier(learning_rate=0.10524, max_iter=2421, max_depth=4, random_state=0)
# A single fit is enough: the estimator is not warm-started and its random_state
# is fixed, so fitting it again on the same data only rebuilds the same model.
classifier.fit(X_train_all, Y_train_all)

# save the trained model
with open(f"{CHECKPOINTS_DIR}/save_model_20s", 'wb') as f:
    pickle.dump(classifier, f)