In [16]:
import pandas as pd
import numpy as np

# Models to use
#import lightgbm as lgb
import catboost as cb

# Importing the metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
#from sklearn.metrics import confusion_matrix
#from sklearn.metrics import plot_confusion_matrix

# For measuring the training time taken during the fit process
from sklearn.model_selection import cross_val_score
import time

from hyperopt import hp
from hyperopt import fmin, tpe, STATUS_OK, STATUS_FAIL, Trials

In [29]:
from hyperopt import space_eval

In [17]:
# Read the dataset from disk and preview its first rows.
higgs_csv = 'higgs_cleaned.csv'
df = pd.read_csv(higgs_csv)
df.head()

Unnamed: 0,class,lepton_pT,lepton_eta,lepton_phi,missing_energy_magnitude,missing_energy_phi,jet1pt,jet1eta,jet1phi,jet1b-tag,...,jet4eta,jet4phi,jet4b-tag,m_jj,m_jjj,m_lv,m_jlv,m_bb,m_wbb,m_wwbb
0,1,0.907542,0.329147,0.359412,1.49797,-0.31301,1.095531,-0.557525,-1.58823,2.173076,...,-1.13893,-0.000819,0.0,0.30222,0.833048,0.9857,0.978098,0.779732,0.992356,0.798343
1,1,0.798835,1.470639,-1.635975,0.453773,0.425629,1.104875,1.282322,1.381664,0.0,...,1.128848,0.900461,0.0,0.909753,1.10833,0.985692,0.951331,0.803252,0.865924,0.780118
2,0,1.344385,-0.876626,0.935913,1.99205,0.882454,1.786066,-1.646778,-0.942383,0.0,...,-0.678379,-1.360356,0.0,0.946652,1.028704,0.998656,0.728281,0.8692,1.026736,0.957904
3,1,1.105009,0.321356,1.522401,0.882808,-1.205349,0.681466,-1.070464,-0.921871,0.0,...,-0.373566,0.113041,0.0,0.755856,1.361057,0.98661,0.838085,1.133295,0.872245,0.808487
4,0,1.595839,-0.607811,0.007075,1.81845,-0.111906,0.84755,-0.566437,1.581239,2.173076,...,-0.654227,-1.274345,3.101961,0.823761,0.938191,0.971758,0.789176,0.430553,0.961357,0.957818


In [18]:
# Separate the target from the features.
# 'Unnamed: 0' (visible in the df.head() preview) is a leftover row index stored
# in the CSV, not a measured quantity, so it must not be fed to the model.
# errors='ignore' keeps this cell working when that column is absent
# (e.g. if the CSV is loaded with index_col=0).
X = df.drop(columns=['class', 'Unnamed: 0'], errors='ignore')
y = df['class']

# Hold out 20% of the rows for the final evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1864)

In [19]:
# Report how many rows landed on each side of the split.
split_summary = f'Train / test split size: train set size: {X_train.shape[0]}, test set size: {X_test.shape[0]}'
print(split_summary)

Train / test split size: train set size: 78439, test set size: 19610


## The run that found the first optimal parameters:

In [115]:
def objective_fn(params):
    """Hyperopt objective: negated mean cross-validated accuracy of a CatBoost model.

    Only ``params['clf_params']`` is read here; any other entry of ``params``
    (such as ``'fit_params'``) is not used by this function.

    Returns a dict holding the ``loss`` to minimise and the ``STATUS_OK`` flag.
    """
    model = cb.CatBoostClassifier(task_type="GPU", verbose=False, **params['clf_params'])
    fold_scores = cross_val_score(model, X_train, y_train, scoring='accuracy')
    # hyperopt minimises, so the accuracy is negated.
    return {"loss": -fold_scores.mean(), "status": STATUS_OK}

We tried several search spaces in different configurations before arriving at the one below, which yielded parameters that outperform the baseline. Some hyperparameters that we also considered, but that are not present in the current search space, are: colsample_bylevel (not supported on GPU), min_data_in_leaf, num_leaves, num_trees.

In [116]:
# Dimensions explored by hyperopt; each hp.* label repeats its dictionary key.
search_dimensions = {
    'l2_leaf_reg': hp.choice('l2_leaf_reg', [3, 1, 5, 10, 100]),
    'learning_rate': hp.uniform('learning_rate', 1e-3, 5e-1),
    'depth': hp.choice('depth', [6, 7, 8, 9, 10]),
    'random_strength': hp.uniform('random_strength', 0.0, 100),
    'border_count': hp.choice('border_count', [128, 254]),
    'bagging_temperature': hp.uniform('bagging_temperature', 0.0, 100),
}

# Settings that stay the same for every trial.
fixed_settings = {
    'eval_metric': 'Accuracy',
    'loss_function': 'Logloss',
    'random_seed': 1864,
}

classifier_parameters = {**search_dimensions, **fixed_settings}

fit_parameters = dict(early_stopping_rounds=10, verbose=True)

# The full space handed to fmin: classifier settings plus the fit settings.
ctb_para = {
    'clf_params': classifier_parameters,
    'fit_params': fit_parameters,
}

In [117]:
# Run the TPE search for 200 evaluations; `trials` records every evaluation.
trials = Trials()
best = fmin(fn=objective_fn, space=ctb_para, algo=tpe.suggest, max_evals=200, trials=trials)

# Map the raw fmin result back onto the concrete values of the search space.
best_params = space_eval(ctb_para, best)
print(best_params)

# Refit on the whole training split with the selected classifier settings,
# then score once on the held-out test split.
selected_clf_params = best_params['clf_params']
clf = cb.CatBoostClassifier(**selected_clf_params)
clf.fit(X_train, y_train, verbose=False)
preds = clf.predict(X_test)
print(accuracy_score(y_test, preds))

  0%|                                                                          | 0/200 [00:00<?, ?trial/s, best loss=?]

Custom logger is already specified. Specify more than one logger at same time is not thread safe.

100%|███████████████████████████████████████████| 200/200 [2:25:32<00:00, 43.66s/trial, best loss: -0.7297772498897755]
{'clf_params': {'bagging_temperature': 1.4426651823376004, 'border_count': 128, 'depth': 7, 'eval_metric': 'Accuracy', 'l2_leaf_reg': 10, 'learning_rate': 0.05212984419824801, 'loss_function': 'Logloss', 'random_seed': 1864, 'random_strength': 62.078606789809335}, 'fit_params': {'early_stopping_rounds': 10, 'verbose': True}}
0.7308516063233045


**!** We observe that, since we pass the dictionary `ctb_para` as the search space, printing `best_params` also prints `fit_params`; however, these were never passed to the `cross_val_score` function during training.

Since the cells above were copied from another notebook, we explicitly specify the parameters below when training:

In [26]:
# Classifier settings written out explicitly instead of being taken from a search result.
explicit_params = dict(
    bagging_temperature=1.4426651823376004,
    border_count=128,
    depth=7,
    eval_metric='Accuracy',
    l2_leaf_reg=10,
    learning_rate=0.05212984419824801,
    loss_function='Logloss',
    random_seed=1864,
    random_strength=62.078606789809335,
)
clf = cb.CatBoostClassifier(**explicit_params)

# Train on the training split and evaluate on the held-out test split.
clf.fit(X_train, y_train, verbose=False)
preds = clf.predict(X_test)
print(f'Test accuracy of the current optimal catboost model: {accuracy_score(y_test, preds)}')

Test accuracy of the current optimal catboost model: 0.7308516063233045


### Modified class definition (TODO: pass `fit_params` to the `cross_val_score` function and re-run)

In [20]:
class objective_fn(object):
    """Hyperopt search over CatBoost hyperparameters, scored by cross-validation.

    Parameters
    ----------
    X_train, y_train
        Training features and labels passed to ``cross_val_score`` on every trial.
    task_type : str, default "GPU"
        Value forwarded to ``CatBoostClassifier(task_type=...)``. Pass "CPU" to
        run the search on a machine without a GPU.
    cv : default None
        Forwarded to ``cross_val_score``; None keeps its default splitting.
    scoring : str, default 'accuracy'
        Forwarded to ``cross_val_score``.
    """

    def __init__(self, X_train, y_train, task_type="GPU", cv=None, scoring='accuracy'):
        self.X_train = X_train
        self.y_train = y_train
        self.task_type = task_type
        self.cv = cv
        self.scoring = scoring

    def process(self, fn_name, space, trials, algo, max_evals):
        """Run ``fmin`` with the objective method called ``fn_name``.

        Returns
        -------
        tuple
            ``(result, trials)`` when the search completes.
        dict
            ``{'status': STATUS_FAIL, 'exception': ..., 'traceback': ...}`` when
            ``fmin`` raises. Callers must check for this before indexing the result.

        Raises
        ------
        AttributeError
            If this object has no attribute named ``fn_name``.
        """
        # Local imports keep this definition self-contained.
        import traceback
        import warnings

        fn = getattr(self, fn_name)
        try:
            result = fmin(fn=fn, space=space, algo=algo, max_evals=max_evals, trials=trials)
        except Exception as e:
            # Keep the full traceback and make the failure visible right away:
            # a silently returned dict is easy to miss after a multi-hour search.
            details = traceback.format_exc()
            warnings.warn(f'hyperopt search failed: {e!r}', RuntimeWarning)
            return {'status': STATUS_FAIL,
                    'exception': str(e),
                    'traceback': details}
        return result, trials

    def ctb_clf(self, params):
        """Objective for one trial: negated mean cross-validated score of CatBoost.

        ``params`` is unpacked directly into ``CatBoostClassifier``, so it must not
        contain ``verbose`` or ``task_type`` (both are set here).
        """
        clf = cb.CatBoostClassifier(**params, verbose=False, task_type=self.task_type)
        acc = cross_val_score(
            clf, self.X_train, self.y_train, scoring=self.scoring, cv=self.cv
        ).mean()
        # hyperopt minimises the loss, hence the sign flip.
        return {"loss": -acc, "status": STATUS_OK}

In [33]:
# Search space for the class-based run: six searched dimensions followed by
# three settings that are the same for every trial.
classifier_parameters = dict(
    # searched by hyperopt
    l2_leaf_reg=hp.choice('l2_leaf_reg', [3, 1, 5, 10, 100]),
    learning_rate=hp.uniform('learning_rate', 1e-3, 5e-1),
    depth=hp.choice('depth', [6, 7, 8, 9, 10]),
    random_strength=hp.uniform('random_strength', 0.0, 100),
    border_count=hp.choice('border_count', [128, 254]),
    bagging_temperature=hp.uniform('bagging_temperature', 0.0, 100),
    # fixed
    eval_metric='Accuracy',
    loss_function='Logloss',
    random_seed=1864,
)

In [22]:
obj = objective_fn(X_train, y_train)

# Time the whole 300-evaluation search.
trials = Trials()
start = time.time()
best = obj.process(
    fn_name='ctb_clf',
    space=classifier_parameters,
    trials=trials,
    algo=tpe.suggest,
    max_evals=300,
)
end = time.time()

total_time = end - start

# process() returns (result, trials) on success but a status dict when the
# search raised. Stop here with the real cause rather than letting a later
# `best[0]` fail with an unrelated KeyError.
if isinstance(best, dict):
    raise RuntimeError(f"Hyperparameter search failed: {best.get('exception')}")

100%|███████████████████████████████████████████| 300/300 [4:48:17<00:00, 57.66s/trial, best loss: -0.7291015774648418]


In [37]:
# best[0] is the raw fmin result; space_eval maps it onto the concrete values of the search space.
best_assignment = best[0]
opt_params = space_eval(classifier_parameters, best_assignment)
opt_params

{'bagging_temperature': 0.2879213699517955,
 'border_count': 128,
 'depth': 10,
 'eval_metric': 'Accuracy',
 'l2_leaf_reg': 5,
 'learning_rate': 0.031731849925222905,
 'loss_function': 'Logloss',
 'random_seed': 1864,
 'random_strength': 59.7185747097968}

In [38]:
# Refit on the training split with the parameters selected by the second search.
clf = cb.CatBoostClassifier(**opt_params)
clf.fit(X_train, y_train, verbose=False)

# Score once on the held-out test split.
preds = clf.predict(X_test)
print(f'Test accuracy of the current optimal catboost model: {accuracy_score(y_test, preds)}')

Test accuracy of the current optimal catboost model: 0.7316165221825599


This is consistent with cross-validation giving a more pessimistic estimate of the generalization error: for the previous optimal parameters the cross-validated accuracy was `0.7297772498897755`, while the actual test accuracy was `0.7308516063233045`; likewise, the cross-validated accuracy of our current optimal model was `0.7291015774648418` (which is lower), while its actual test accuracy is `0.7316165221825599`.