In [102]:
import pandas as pd
import numpy as np

# Models to use
import lightgbm as lgb
import catboost as cb

# Importing the metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
#from sklearn.metrics import confusion_matrix
#from sklearn.metrics import plot_confusion_matrix

# For measuring the training time taken during the fit process
import time

from hyperopt import hp
from hyperopt import fmin, tpe, STATUS_OK, STATUS_FAIL, Trials

In [103]:
df = pd.read_csv('higgs_cleaned.csv')
df.head()

Unnamed: 0,class,lepton_pT,lepton_eta,lepton_phi,missing_energy_magnitude,missing_energy_phi,jet1pt,jet1eta,jet1phi,jet1b-tag,...,jet4eta,jet4phi,jet4b-tag,m_jj,m_jjj,m_lv,m_jlv,m_bb,m_wbb,m_wwbb
0,1,0.907542,0.329147,0.359412,1.49797,-0.31301,1.095531,-0.557525,-1.58823,2.173076,...,-1.13893,-0.000819,0.0,0.30222,0.833048,0.9857,0.978098,0.779732,0.992356,0.798343
1,1,0.798835,1.470639,-1.635975,0.453773,0.425629,1.104875,1.282322,1.381664,0.0,...,1.128848,0.900461,0.0,0.909753,1.10833,0.985692,0.951331,0.803252,0.865924,0.780118
2,0,1.344385,-0.876626,0.935913,1.99205,0.882454,1.786066,-1.646778,-0.942383,0.0,...,-0.678379,-1.360356,0.0,0.946652,1.028704,0.998656,0.728281,0.8692,1.026736,0.957904
3,1,1.105009,0.321356,1.522401,0.882808,-1.205349,0.681466,-1.070464,-0.921871,0.0,...,-0.373566,0.113041,0.0,0.755856,1.361057,0.98661,0.838085,1.133295,0.872245,0.808487
4,0,1.595839,-0.607811,0.007075,1.81845,-0.111906,0.84755,-0.566437,1.581239,2.173076,...,-0.654227,-1.274345,3.101961,0.823761,0.938191,0.971758,0.789176,0.430553,0.961357,0.957818


In [104]:
X, y = df.drop('class', axis=1), df['class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1864)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.15, random_state=1864)

Scaling is surely an important part of the pipeline but I realized that I forgot doing it in the end. I will add several choices for scalers later on, but for now lets move to designing the search space.

For the meaning of search functions such as hp.uniform, you can take a look at here: http://hyperopt.github.io/hyperopt/getting-started/search_spaces/.

In [105]:
classifier_parameters = {
    'l2_leaf_reg': hp.qloguniform('l2_leaf_reg', 0, 2, 1), # Coefficient of the regularizer for the cost function.
    'learning_rate': hp.uniform('learning_rate', 1e-3, 5e-1), # Self-explanatory. I think this will be quite important since it affects overfitting
    'max_depth': hp.choice('max_depth', [2, 3, 6, 8, 10]), # Tree depths
    'num_trees': hp.choice('num_trees', [25, 50, 100]), # Number of trees to be grown
    'colsample_bylevel': hp.uniform('colsample_bylevel', 0.5, 1.0), # Fraction of randomly selected features to use at each split
    'min_data_in_leaf': hp.choice('min_data_in_leaf', [2, 3, 4, 6, 8]), # Minimum data required in leaves for a split to occur
    #'num_leaves': hp.choice('num_leaves', [4, 8, 16, 32]), # max number of leaves
    'eval_metric': 'Accuracy',
    'loss_function':'Logloss',
    'random_seed': 1864
}

fit_parameters = {
    'early_stopping_rounds': 10,
    'verbose': False
}

ctb_para = dict()
ctb_para['clf_params'] = classifier_parameters
ctb_para['fit_params'] = fit_parameters

Objective function to optimize. It is a bit wordy for now but defining it as a class rather than a function may come in handy later on, in case we add more boosting models.

In [106]:
class objective_fn(object):
    
    def __init__(self, X_train, X_val, X_test, y_train, y_val, y_test):
        self.X_train = X_train
        self.X_val   = X_val
        self.X_test  = X_test
        
        self.y_train = y_train
        self.y_val   = y_val
        self.y_test  = y_test
        
    def process(self, fn_name, space, trials, algo, max_evals):
        fn = getattr(self, fn_name)
        try:
            result = fmin(fn=fn, space=space, algo=algo, max_evals=max_evals, trials=trials)
        except Exception as e:
            return {'status': STATUS_FAIL,
                    'exception': str(e)}
        return result, trials
    
    def ctb_clf(self, para):
        clf = cb.CatBoostClassifier(**para['clf_params'])
        return self.train(clf, para)
    
    def train(self, clf, para):
        clf.fit(self.X_train, self.y_train,
                eval_set=[(self.X_train, self.y_train), (self.X_val, self.y_val)],
                **para['fit_params'])
        
        preds = clf.predict(self.X_test)
        acc = accuracy_score(self.y_test, preds)
        return {'loss': -acc, 'status': STATUS_OK}

We can now start the trials. Hyperopt tries to minimize the objective function value so the more negative the accuracy, the better. Playing with clf parameters or the value of maximum number of evaluations may give better results, but I'll keep the searching procedure not too expensive.

In [107]:
obj = objective_fn(X_train, X_val, X_test, y_train, y_val, y_test)

trials = Trials()
start = time.time()
best = obj.process(fn_name='ctb_clf', space=ctb_para, trials=trials, algo=tpe.suggest, max_evals=100)
end = time.time()

total_time = end - start

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [06:44<00:00,  4.04s/trial, best loss: -0.7218765935747068]


In [108]:
print('Parameters for the best CatBoost Model:')
print()
for k,v in ctb_opt[0].items():
    print(k, ':', v)
print()
print('Time taken for HyperParam Search:', round(total_time/60, 2), 'mins')

Parameters for the best CatBoost Model:

colsample_bylevel : 0.6477787840875192
depth : 5
l2_leaf_reg : 6.0
learning_rate : 0.1479219353409454
min_data_in_leaf : 4
num_trees : 2

Time taken for HyperParam Search: 6.74 mins


We have used the hp.choice function which returns the **index** of the best hyperparameter inside the given list of hyperparameters. So of course, the best hyperparameters for the number of trees is not 2, but 100.

The list below returns the correct model hyperparameters.

In [121]:
best_params = space_eval(ctb_para['clf_params'], trials.argmin)
best_params

{'colsample_bylevel': 0.5740694476873252,
 'eval_metric': 'Accuracy',
 'l2_leaf_reg': 1.0,
 'learning_rate': 0.2727180907413367,
 'loss_function': 'Logloss',
 'max_depth': 6,
 'min_data_in_leaf': 8,
 'num_trees': 100,
 'random_seed': 1864}

In [124]:
clf = cb.CatBoostClassifier(**best_params)

clf.fit(X_train, y_train, verbose=False, early_stopping_rounds=10)

preds = clf.predict(X_test)

print(accuracy_score(y_test, preds))

0.722233554309026


In [126]:
clf.save_model(
    "model.json",
    format="json",
    # pool=pool  # this parameter is required only for models with categorical features.
)