In [1]:
import pandas as pd
import numpy as np

# Models to use
#import lightgbm as lgb
import catboost as cb

# Importing the metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
#from sklearn.metrics import confusion_matrix
#from sklearn.metrics import plot_confusion_matrix

# For measuring the training time taken during the fit process
from sklearn.model_selection import cross_val_score
import time

from hyperopt import hp
from hyperopt import fmin, tpe, STATUS_OK, STATUS_FAIL, Trials

In [2]:
from hyperopt import space_eval

In [3]:
df = pd.read_csv('higgs_cleaned.csv')
df.head()

Unnamed: 0,class,lepton_pT,lepton_eta,lepton_phi,missing_energy_magnitude,missing_energy_phi,jet1pt,jet1eta,jet1phi,jet1b-tag,...,jet4eta,jet4phi,jet4b-tag,m_jj,m_jjj,m_lv,m_jlv,m_bb,m_wbb,m_wwbb
0,1,0.907542,0.329147,0.359412,1.49797,-0.31301,1.095531,-0.557525,-1.58823,2.173076,...,-1.13893,-0.000819,0.0,0.30222,0.833048,0.9857,0.978098,0.779732,0.992356,0.798343
1,1,0.798835,1.470639,-1.635975,0.453773,0.425629,1.104875,1.282322,1.381664,0.0,...,1.128848,0.900461,0.0,0.909753,1.10833,0.985692,0.951331,0.803252,0.865924,0.780118
2,0,1.344385,-0.876626,0.935913,1.99205,0.882454,1.786066,-1.646778,-0.942383,0.0,...,-0.678379,-1.360356,0.0,0.946652,1.028704,0.998656,0.728281,0.8692,1.026736,0.957904
3,1,1.105009,0.321356,1.522401,0.882808,-1.205349,0.681466,-1.070464,-0.921871,0.0,...,-0.373566,0.113041,0.0,0.755856,1.361057,0.98661,0.838085,1.133295,0.872245,0.808487
4,0,1.595839,-0.607811,0.007075,1.81845,-0.111906,0.84755,-0.566437,1.581239,2.173076,...,-0.654227,-1.274345,3.101961,0.823761,0.938191,0.971758,0.789176,0.430553,0.961357,0.957818


In [4]:
X, y = df.drop('class', axis=1), df['class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1864)

In [5]:
print(f'Train / test split size: train set size: {X_train.shape[0]}, test set size: {X_test.shape[0]}')

Train / test split size: train set size: 78439, test set size: 19610


## The run that have found first optimal params:

In [115]:
def objective_fn(params):
    clf = cb.CatBoostClassifier(**params['clf_params'], verbose=False, task_type="GPU")
    acc = cross_val_score(clf, X_train, y_train, scoring='accuracy').mean()
    return {"loss": -acc, "status": STATUS_OK}

We have considered multiple search spaces in different configurations before finding the one, where we have found the parameters that have outperformed baseline. Some of the hyperparameters that we have also considered, but that are not present in the current search space: colsample_bylevel (not supported on GPU), min_data_in_leaf, num_leaves, num_trees.

In [116]:
classifier_parameters = {
    'l2_leaf_reg': hp.choice('l2_leaf_reg', [3,1,5,10,100]), 
    'learning_rate': hp.uniform('learning_rate', 1e-3, 5e-1),
    'depth': hp.choice('depth', [6, 7, 8, 9, 10]),
    'random_strength': hp.uniform('random_strength', 0.0, 100),
    'border_count': hp.choice('border_count', [128, 254]),
    'bagging_temperature': hp.uniform('bagging_temperature', 0.0, 100),
    'eval_metric': 'Accuracy',
    'loss_function':'Logloss',
    'random_seed': 1864
}

fit_parameters = {
    'early_stopping_rounds': 10,
    'verbose': True
}

ctb_para = dict()
ctb_para['clf_params'] = classifier_parameters
ctb_para['fit_params'] = fit_parameters

In [117]:
trials = Trials()

best = fmin(
    fn=objective_fn,
    space = ctb_para, 
    algo=tpe.suggest, 
    max_evals=200, 
    trials=trials
)

best_params = space_eval(ctb_para, best)
print(best_params)

clf = cb.CatBoostClassifier(**best_params['clf_params'])

clf.fit(X_train, y_train, verbose=False)

preds = clf.predict(X_test)

print(accuracy_score(y_test, preds))

  0%|                                                                          | 0/200 [00:00<?, ?trial/s, best loss=?]

Custom logger is already specified. Specify more than one logger at same time is not thread safe.

100%|███████████████████████████████████████████| 200/200 [2:25:32<00:00, 43.66s/trial, best loss: -0.7297772498897755]
{'clf_params': {'bagging_temperature': 1.4426651823376004, 'border_count': 128, 'depth': 7, 'eval_metric': 'Accuracy', 'l2_leaf_reg': 10, 'learning_rate': 0.05212984419824801, 'loss_function': 'Logloss', 'random_seed': 1864, 'random_strength': 62.078606789809335}, 'fit_params': {'early_stopping_rounds': 10, 'verbose': True}}
0.7308516063233045


** ! We observe that sinc we pass dictionrary `ctb_para`, when we print `best_params` we also print `fit_params`, but we have not passed them to `cross_val_score` function during training. 

Since the cells above have been copied from other notebook, we will explicitly specify the parameters during training:

In [26]:
clf = cb.CatBoostClassifier(
    bagging_temperature = 1.4426651823376004,
    border_count = 128,
    depth = 7,
    eval_metric = 'Accuracy',
    l2_leaf_reg = 10,
    learning_rate = 0.05212984419824801,
    loss_function = 'Logloss',
    random_seed = 1864,
    random_strength = 62.078606789809335
)

clf.fit(X_train, y_train, verbose=False)
preds = clf.predict(X_test)
print(f'Test accuracy of the current optimal catboost model: {accuracy_score(y_test, preds)}')

Test accuracy of the current optimal catboost model: 0.7308516063233045


### Modified class definition (TO DO: pass `fit_params` to `cross_val_score` function and re-run)

In [20]:
class objective_fn(object):
    
    def __init__(self, X_train, y_train):
        self.X_train = X_train
        self.y_train = y_train

    def process(self, fn_name, space, trials, algo, max_evals):
        fn = getattr(self, fn_name)
        try:
            result = fmin(fn=fn, space=space, algo=algo, max_evals=max_evals, trials=trials)
        except Exception as e:
            return {'status': STATUS_FAIL,
                    'exception': str(e)}
        return result, trials
    
    def ctb_clf(self, params):
        clf = cb.CatBoostClassifier(**params, verbose=False, task_type="GPU")
        acc = cross_val_score(clf, self.X_train, self.y_train, scoring='accuracy').mean()
        return {"loss": -acc, "status": STATUS_OK}   

In [33]:
classifier_parameters = {
    'l2_leaf_reg': hp.choice('l2_leaf_reg', [5, 6, 7, 8, 9, 10, 50, 60, 70, 100]), 
    'learning_rate': hp.choice('learning_rate', 0.01, 0.02, 0.03, 0.04, 0.05),
    'depth': hp.choice('depth', [6, 7, 8, 9, 10]),
    'random_strength': hp.uniform('random_strength', 60, 150),
    'border_count': hp.choice('border_count', [128, 254]),
    'bagging_temperature': hp.uniform('bagging_temperature', 0.0, 100),
    'eval_metric': 'Accuracy',
    'loss_function':'Logloss',
    'random_seed': 1864
}

In [22]:
obj = objective_fn(X_train, y_train)

trials = Trials()
start = time.time()
best = obj.process(fn_name='ctb_clf', space=classifier_parameters, trials=trials, algo=tpe.suggest, max_evals=300)
end = time.time()

total_time = end - start

100%|███████████████████████████████████████████| 300/300 [4:48:17<00:00, 57.66s/trial, best loss: -0.7291015774648418]


In [37]:
opt_params = space_eval(classifier_parameters, best[0])
opt_params

{'bagging_temperature': 0.2879213699517955,
 'border_count': 128,
 'depth': 10,
 'eval_metric': 'Accuracy',
 'l2_leaf_reg': 5,
 'learning_rate': 0.031731849925222905,
 'loss_function': 'Logloss',
 'random_seed': 1864,
 'random_strength': 59.7185747097968}

In [38]:
clf = cb.CatBoostClassifier(
    **opt_params
)

clf.fit(X_train, y_train, verbose=False)
preds = clf.predict(X_test)
print(f'Test accuracy of the current optimal catboost model: {accuracy_score(y_test, preds)}')

Test accuracy of the current optimal catboost model: 0.7316165221825599


We observe the confirmation that `cross validation` in a more pessimistics estimation of the generalization error: for the previous optimal parameters the cross validated accuracy was `0.7297772498897755`, while the actual test accuracy: `0.7308516063233045`, at the same time the cross validated accuracy of our current optimal model was `0.7291015774648418` (which is lower), while the actual test accuracy is `0.7316165221825599`.

# With early stopping

In [8]:
classifier_parameters = {
    'l2_leaf_reg': hp.choice('l2_leaf_reg', [3,1,5,10,100]), 
    'learning_rate': hp.uniform('learning_rate', 1e-3, 5e-1),
    'depth': hp.choice('depth', [6, 7, 8, 9, 10]),
    'random_strength': hp.uniform('random_strength', 0.0, 100),
    'border_count': hp.choice('border_count', [128, 254]),
    'bagging_temperature': hp.uniform('bagging_temperature', 0.0, 100),
    'eval_metric': 'Accuracy',
    'loss_function':'Logloss',
    'random_seed': 1864
}

fit_parameters = {
    'early_stopping_rounds': 30,
    'verbose': False
}

ctb_para = dict()
ctb_para['clf_params'] = classifier_parameters
ctb_para['fit_params'] = fit_parameters

In [15]:
class objective_fn(object):
    
    def __init__(self, X_train, y_train):
        self.X_train = X_train
        self.y_train = y_train

    def process(self, fn_name, space, trials, algo, max_evals):
        fn = getattr(self, fn_name)
        try:
            result = fmin(fn=fn, space=space, algo=algo, max_evals=max_evals, trials=trials)
        except Exception as e:
            return {'status': STATUS_FAIL,
                    'exception': str(e)}
        return result, trials
    
    def ctb_clf(self, params):
        clf = cb.CatBoostClassifier(**params['clf_params'], verbose=False, task_type="GPU")
        acc = cross_val_score(clf, self.X_train, self.y_train, scoring='accuracy', fit_params=params['fit_params']).mean()
        return {"loss": -acc, "status": STATUS_OK}   

In [16]:
obj = objective_fn(X_train, y_train)

trials = Trials()
start = time.time()
best = obj.process(fn_name='ctb_clf', space=ctb_para, trials=trials, algo=tpe.suggest, max_evals=300)
end = time.time()

total_time = end - start

  0%|                                                                          | 0/300 [00:00<?, ?trial/s, best loss=?]

Custom logger is already specified. Specify more than one logger at same time is not thread safe.

100%|███████████████████████████████████████████| 300/300 [4:01:42<00:00, 48.34s/trial, best loss: -0.7288848504920339]


In [17]:
opt_params = space_eval(classifier_parameters, best[0])
opt_params

{'bagging_temperature': 0.4669272871123269,
 'border_count': 128,
 'depth': 10,
 'eval_metric': 'Accuracy',
 'l2_leaf_reg': 5,
 'learning_rate': 0.033961530095772206,
 'loss_function': 'Logloss',
 'random_seed': 1864,
 'random_strength': 47.24693835569566}

In [21]:
clf = cb.CatBoostClassifier(
    **opt_params
)

clf.fit(X_train, y_train, **fit_parameters)
preds = clf.predict(X_test)
print(f'Test accuracy of the current optimal catboost model: {accuracy_score(y_test, preds)}')

Test accuracy of the current optimal catboost model: 0.7306986231514533


In [24]:
clf = cb.CatBoostClassifier(
    **opt_params
)

clf.fit(X_train, y_train)
preds = clf.predict(X_test)
print(f'Test accuracy of the current optimal catboost model: {accuracy_score(y_test, preds)}')

0:	learn: 0.5816239	total: 59.3ms	remaining: 59.2s
1:	learn: 0.6118130	total: 116ms	remaining: 58.1s
2:	learn: 0.6234016	total: 172ms	remaining: 57s
3:	learn: 0.6355894	total: 231ms	remaining: 57.5s
4:	learn: 0.6504035	total: 297ms	remaining: 59.1s
5:	learn: 0.6494856	total: 355ms	remaining: 58.7s
6:	learn: 0.6510792	total: 403ms	remaining: 57.1s
7:	learn: 0.6512449	total: 458ms	remaining: 56.8s
8:	learn: 0.6470888	total: 518ms	remaining: 57s
9:	learn: 0.6568034	total: 575ms	remaining: 56.9s
10:	learn: 0.6531190	total: 633ms	remaining: 56.9s
11:	learn: 0.6531445	total: 691ms	remaining: 56.9s
12:	learn: 0.6557197	total: 749ms	remaining: 56.8s
13:	learn: 0.6618519	total: 806ms	remaining: 56.7s
14:	learn: 0.6605898	total: 864ms	remaining: 56.7s
15:	learn: 0.6631268	total: 925ms	remaining: 56.9s
16:	learn: 0.6660462	total: 993ms	remaining: 57.4s
17:	learn: 0.6657658	total: 1.05s	remaining: 57.4s
18:	learn: 0.6662120	total: 1.11s	remaining: 57.4s
19:	learn: 0.6670406	total: 1.17s	remaining:

In [6]:
current_opt = {'bagging_temperature': 0.2879213699517955,
             'border_count': 128,
             'depth': 10,
             'eval_metric': 'Accuracy',
             'l2_leaf_reg': 5,
             'learning_rate': 0.031731849925222905,
             'loss_function': 'Logloss',
             'random_seed': 1864,
             'random_strength': 59.7185747097968}

In [11]:
clf = cb.CatBoostClassifier(
    **current_opt, od_type='Iter', od_wait=20
)

clf.fit(X_train, y_train, **fit_parameters)
preds = clf.predict(X_test)
print(f'Test accuracy of the current optimal catboost model: {accuracy_score(y_test, preds)}')

Test accuracy of the current optimal catboost model: 0.7316165221825599


In [None]:
{'bagging_temperature': 0.2879213699517955,
 'border_count': 128,
 'depth': 10,
 'eval_metric': 'Accuracy',
 'l2_leaf_reg': 5,
 'learning_rate': 0.031731849925222905,
 'loss_function': 'Logloss',
 'random_seed': 1864,
 'random_strength': 59.7185747097968}

# New search space

In [27]:
classifier_parameters = {
    'l2_leaf_reg': hp.quniform('l2_leaf_reg', 3, 30, 1), 
    'learning_rate': hp.loguniform('learning_rate', -5.0, -0.7),
    'depth': hp.choice('depth', [6, 7, 8, 9, 10]),
    'random_strength': hp.quniform('random_strength', 1, 100, 1),
    'border_count': hp.choice('border_count', [128, 254]),
    'bagging_temperature': hp.uniform('bagging_temperature', 0.0, 1.),
    'iterations': hp.choice('iterations', [500, 600, 700, 800, 900, 1000]),
    'eval_metric': 'Accuracy',
    'loss_function':'Logloss',
    'random_seed': 1864
}

fit_parameters = {
    'early_stopping_rounds': 30,
    'verbose': False
}

ctb_para = dict()
ctb_para['clf_params'] = classifier_parameters
ctb_para['fit_params'] = fit_parameters

In [28]:
class objective_fn(object):
    
    def __init__(self, X_train, y_train):
        self.X_train = X_train
        self.y_train = y_train

    def process(self, fn_name, space, trials, algo, max_evals):
        fn = getattr(self, fn_name)
        try:
            result = fmin(fn=fn, space=space, algo=algo, max_evals=max_evals, trials=trials)
        except Exception as e:
            return {'status': STATUS_FAIL,
                    'exception': str(e)}
        return result, trials
    
    def ctb_clf(self, params):
        clf = cb.CatBoostClassifier(**params['clf_params'], verbose=False, task_type="GPU")
        acc = cross_val_score(clf, self.X_train, self.y_train, scoring='accuracy', fit_params=params['fit_params']).mean()
        return {"loss": -acc, "status": STATUS_OK} 

In [29]:
classifier_parameters['learning_rate']

<hyperopt.pyll.base.Apply at 0x2b74c4da8760>

In [30]:
obj = objective_fn(X_train, y_train)

trials = Trials()
start = time.time()
best = obj.process(fn_name='ctb_clf', space=ctb_para, trials=trials, algo=tpe.suggest, max_evals=300)
end = time.time()

total_time = end - start

  0%|                                                                                                                                                | 0/300 [00:00<?, ?trial/s, best loss=?]

Custom logger is already specified. Specify more than one logger at same time is not thread safe.

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 300/300 [3:20:43<00:00, 40.14s/trial, best loss: -0.730937403971048]


In [33]:
opt_params = space_eval(classifier_parameters, best[0])
opt_params

{'bagging_temperature': 0.16704402473991137,
 'border_count': 254,
 'depth': 9,
 'eval_metric': 'Accuracy',
 'iterations': 900,
 'l2_leaf_reg': 19.0,
 'learning_rate': 0.05167031190494173,
 'loss_function': 'Logloss',
 'random_seed': 1864,
 'random_strength': 43.0}

In [38]:
clf = cb.CatBoostClassifier(
    **opt_params
)

clf.fit(X_train, y_train, early_stopping_rounds=30, verbose=True)
preds = clf.predict(X_test)
print(f'Test accuracy of the current optimal catboost model: {accuracy_score(y_test, preds)}')

Custom logger is already specified. Specify more than one logger at same time is not thread safe.

0:	learn: 0.5897959	total: 1.16s	remaining: 17m 19s
1:	learn: 0.6212981	total: 2.56s	remaining: 19m 9s
2:	learn: 0.6229554	total: 4.28s	remaining: 21m 18s
3:	learn: 0.6244725	total: 5.67s	remaining: 21m 9s
4:	learn: 0.6286031	total: 7.2s	remaining: 21m 29s
5:	learn: 0.6332182	total: 8.81s	remaining: 21m 52s
6:	learn: 0.6500083	total: 9.85s	remaining: 20m 56s
7:	learn: 0.6511174	total: 11.5s	remaining: 21m 19s
8:	learn: 0.6519971	total: 13s	remaining: 21m 22s
9:	learn: 0.6502633	total: 14.8s	remaining: 21m 58s
10:	learn: 0.6547508	total: 16.5s	remaining: 22m 11s
11:	learn: 0.6539222	total: 17.7s	remaining: 21m 53s
12:	learn: 0.6549166	total: 19.2s	remaining: 21m 49s
13:	learn: 0.6603475	total: 20.9s	remaining: 22m 4s
14:	learn: 0.6637387	total: 22.4s	remaining: 22m 1s
15:	learn: 0.6664797	total: 24.1s	remaining: 22m 11s
16:	learn: 0.6674996	total: 25.6s	remaining: 22m 7s
17:	learn: 0.6668876	total: 27.2s	remaining: 22m 10s
18:	learn: 0.6668112	total: 28.5s	remaining: 22m
19:	learn: 0.66

# On original data

## Default parameters

In [4]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="5"

In [5]:
df = pd.read_csv('HIGGS.csv')

In [62]:
print(f'Original dataaset contains: {df.shape[0]} samples')

Original dataaset contains: 10999999 samples


In [23]:
df_small = pd.read_csv('higgs_cleaned.csv')

In [25]:
df.columns = df_small.columns

In [29]:
df.head()

Unnamed: 0,class,lepton_pT,lepton_eta,lepton_phi,missing_energy_magnitude,missing_energy_phi,jet1pt,jet1eta,jet1phi,jet1b-tag,...,jet4eta,jet4phi,jet4b-tag,m_jj,m_jjj,m_lv,m_jlv,m_bb,m_wbb,m_wwbb
0,1.0,0.907542,0.329147,0.359412,1.49797,-0.31301,1.095531,-0.557525,-1.58823,2.173076,...,-1.13893,-0.000819,0.0,0.30222,0.833048,0.9857,0.978098,0.779732,0.992356,0.798343
1,1.0,0.798835,1.470639,-1.635975,0.453773,0.425629,1.104875,1.282322,1.381664,0.0,...,1.128848,0.900461,0.0,0.909753,1.10833,0.985692,0.951331,0.803252,0.865924,0.780118
2,0.0,1.344385,-0.876626,0.935913,1.99205,0.882454,1.786066,-1.646778,-0.942383,0.0,...,-0.678379,-1.360356,0.0,0.946652,1.028704,0.998656,0.728281,0.8692,1.026736,0.957904
3,1.0,1.105009,0.321356,1.522401,0.882808,-1.205349,0.681466,-1.070464,-0.921871,0.0,...,-0.373566,0.113041,0.0,0.755856,1.361057,0.98661,0.838085,1.133295,0.872245,0.808487
4,0.0,1.595839,-0.607811,0.007075,1.81845,-0.111906,0.84755,-0.566437,1.581239,2.173076,...,-0.654227,-1.274345,3.101961,0.823761,0.938191,0.971758,0.789176,0.430553,0.961357,0.957818


In [51]:
test_original_x, test_original_y = df.tail(500000).iloc[:, 1:], df.tail(500000)['class']

In [52]:
train_original_x, train_original_y  = df.iloc[0:(df.shape[0] - 500000), 1:], df.iloc[0:(df.shape[0] - 500000), 0]

In [66]:
clf_default = cb.CatBoostClassifier(task_type="GPU", random_state=1864)
start = time.time()
clf_default.fit(train_original_x, train_original_y, verbose= False)
end = time.time()
cb_default_preds = clf_default.predict(test_original_x)

In [68]:
np.savetxt("default_catboost_preds.csv", cb_default_preds, delimiter=",")

In [55]:
print('Vanilla Catboost Scores:')
print('Accuracy:', accuracy_score(test_original_y, cb_preds))
print('ROC-AUC:', roc_auc_score(test_original_y, clf.predict_proba(test_original_x)[:, 1]))
print('*'*60)
print('Detailed Report:')
print(classification_report(test_original_y, cb_preds))

Vanilla Catboost Scores:
Accuracy: 0.730336
ROC-AUC: 0.8105816787962593
************************************************************
Detailed Report:
              precision    recall  f1-score   support

         0.0       0.72      0.70      0.71    235493
         1.0       0.74      0.75      0.75    264507

    accuracy                           0.73    500000
   macro avg       0.73      0.73      0.73    500000
weighted avg       0.73      0.73      0.73    500000



## Best catboost model optimized on smaller dataset

In [69]:
catboost_opt = {'bagging_temperature': 0.2879213699517955,
             'border_count': 128,
             'depth': 10,
             'eval_metric': 'Accuracy',
             'l2_leaf_reg': 5,
             'learning_rate': 0.031731849925222905,
             'loss_function': 'Logloss',
             'random_seed': 1864,
             'random_strength': 59.7185747097968}

In [70]:
clf_opt = cb.CatBoostClassifier(
    task_type="GPU",
    **catboost_opt
)

start = time.time()
clf_opt.fit(train_original_x, train_original_y, verbose= False)
end = time.time()
cb_optimized_preds = clf_opt.predict(test_original_x)

In [71]:
np.savetxt("optimal_catboost_preds.csv", cb_optimized_preds, delimiter=",")

In [60]:
print(f'Training time: {end-start} sec')

Training time: 81.10947608947754 sec


In [61]:
print('Optimized Catboost Scores:')
print('Accuracy:', accuracy_score(test_original_y, cb_preds))
print('ROC-AUC:', roc_auc_score(test_original_y, clf.predict_proba(test_original_x)[:, 1]))
print('*'*60)
print('Detailed Report:')
print(classification_report(test_original_y, cb_preds))

Optimized Catboost Scores:
Accuracy: 0.74673
ROC-AUC: 0.8293187529464071
************************************************************
Detailed Report:
              precision    recall  f1-score   support

         0.0       0.74      0.72      0.73    235493
         1.0       0.76      0.77      0.76    264507

    accuracy                           0.75    500000
   macro avg       0.75      0.75      0.75    500000
weighted avg       0.75      0.75      0.75    500000



## Evaluation of other catboost configurations we have obtained as suboptimal:

In [64]:
clf2 = cb.CatBoostClassifier(
    task_type="GPU",
    bagging_temperature = 1.4426651823376004,
    border_count = 128,
    depth = 7,
    eval_metric = 'Accuracy',
    l2_leaf_reg = 10,
    learning_rate = 0.05212984419824801,
    loss_function = 'Logloss',
    random_seed = 1864,
    random_strength = 62.078606789809335
)

start = time.time()
clf2.fit(train_original_x, train_original_y, verbose= False)
end = time.time()
cb2_preds = clf2.predict(test_original_x)

Custom logger is already specified. Specify more than one logger at same time is not thread safe.

In [65]:
print('Optimized Catboost Scores:')
print('Accuracy:', accuracy_score(test_original_y, cb2_preds))
print('ROC-AUC:', roc_auc_score(test_original_y, clf.predict_proba(test_original_x)[:, 1]))
print('*'*60)
print('Detailed Report:')
print(classification_report(test_original_y, cb2_preds))

Optimized Catboost Scores:
Accuracy: 0.744238
ROC-AUC: 0.8293187529464071
************************************************************
Detailed Report:
              precision    recall  f1-score   support

         0.0       0.73      0.72      0.73    235493
         1.0       0.75      0.77      0.76    264507

    accuracy                           0.74    500000
   macro avg       0.74      0.74      0.74    500000
weighted avg       0.74      0.74      0.74    500000

