In [19]:
import pandas as pd
import numpy as np
import os

from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from skopt.plots import plot_objective, plot_histogram

from skopt import BayesSearchCV

from utils import ModelSuplier, DataLoader, DataSaver, get_best_params_overall

In [9]:
# Instantiate the project's model supplier and keep a handle on its pipelines.
# Later cells iterate `pipelines` as (name, pipeline) pairs and index it
# positionally, so it is used as an ordered sequence with one entry per model.
ms = ModelSuplier()

pipelines = ms.pipelines

In [10]:
# Load the data through the project's DataLoader (no arguments passed here).
# `transformed_data` is iterated later as (X, y) pairs, one pair per dataset.
dl = DataLoader()

data_as_X_and_y = dl.transformed_data

Using default path


In [11]:
# Saver configured with the relative directory ../history_bayes.
# NOTE(review): presumably this is the output folder used by ds.save() — confirm in utils.DataSaver.
ds = DataSaver(os.path.join("..", "history_bayes"))

# Bayes Search (for default params)

In [13]:
# Search spaces handed to BayesSearchCV, one per pipeline. The training loop
# picks an entry by the pipeline's position, so the order here must match the
# order of `pipelines`. Every key carries the "model__" prefix, i.e. it targets
# the pipeline step named "model".
param_distributions = [
    # Entry 0: decision-tree parameters.
    dict(
        model__max_depth=Integer(1, 30),
        model__min_samples_split=Integer(2, 60),
        model__criterion=Categorical(["gini", "entropy"]),
        model__min_samples_leaf=Integer(1, 60),
    ),
    # Entry 1: random-forest parameters.
    dict(
        model__n_estimators=Integer(100, 500),
        model__min_samples_leaf=Integer(1, 250),
        model__max_samples=Real(0.5, 1),
        model__max_features=Real(1e-6, 1),
    ),
    # Entry 2: XGBoost parameters; alpha is sampled on a log scale.
    dict(
        model__max_depth=Integer(1, 19),
        model__min_child_weight=Integer(0, 19),
        model__eta=Real(0.01, 0.101),
        model__alpha=Real(1e-4, 10, prior="log-uniform"),
    ),
]

In [None]:
# Run one Bayesian hyper-parameter search for every (model, dataset) pair.
#
# Result containers are sized from the inputs instead of being hard-coded to
# "4 datasets / 3 models", so adding or removing a dataset or a pipeline no
# longer raises IndexError or leaves stray empty slots.
#   best_params[dataset_idx]  -> list of best parameter sets, one per model
#   history[model_idx]        -> list of cv_results_ dicts, one per dataset
#   pipe_best_scores / pipe_best_models -> flat lists in (model, dataset) order
# `data_as_X_and_y` must be re-iterable; the nested loop below already walks it
# once per pipeline.
best_params = [[] for _ in data_as_X_and_y]
pipe_best_models = []
pipe_best_scores = []
history = [[] for _ in pipelines]

for model_idx, (name, pipe) in enumerate(pipelines):
    print("Training:", name)
    # The search space is matched to the pipeline by position.
    search_space = param_distributions[model_idx]
    for dataset_idx, (X, y) in enumerate(data_as_X_and_y):
        # Fixed random_state keeps the sequence of sampled candidates reproducible;
        # n_jobs=-1 asks for all available cores.
        bayes = BayesSearchCV(
            pipe,
            search_spaces=search_space,
            cv=5,
            random_state=42,
            n_jobs=-1,
        )
        bayes.fit(X, y)
        pipe_best_scores.append(bayes.best_score_)
        pipe_best_models.append(bayes.best_estimator_)
        best_params[dataset_idx].append(bayes.best_params_)
        history[model_idx].append(bayes.cv_results_)

In [15]:
# Collapse each model's per-dataset search histories into one frame per model.
# The dataset position becomes a regular "dataset" column; the original
# per-search row index (surfaced by reset_index() as "level_1") is discarded,
# leaving a fresh 0..n-1 index.
history_datasets = []
for model_history in history:
    combined = (
        pd.concat(
            [pd.DataFrame(cv_results) for cv_results in model_history],
            keys=range(len(model_history)),
            names=['dataset'],
        )
        .reset_index()
        .drop(columns='level_1')  # returns a new frame; no in-place mutation
    )
    history_datasets.append(combined)

# Report one shape per model rather than assuming exactly three pipelines.
for (model_name, _), model_frame in zip(pipelines, history_datasets):
    print(f"{model_name} shape: {model_frame.shape}")

<class 'sklearn.tree._classes.DecisionTreeClassifier'> shape: (200, 18)
<class 'sklearn.ensemble._forest.RandomForestClassifier'> shape: (200, 18)
<class 'xgboost.sklearn.XGBClassifier'> shape: (200, 18)


In [16]:
# Persist the per-model history frames, passing a base name and one label per frame.
# NOTE(review): the following cell reads ../history_bayes/history_bayes_all_<label>.csv,
# so save() presumably writes one CSV per label — confirm in utils.DataSaver.
ds.save(history_datasets,"history_bayes_all", ['DecisionTree','RandomForest','XGBoost'])

In [17]:
# Reload the saved search histories from disk, one frame per model,
# in the order DecisionTree, RandomForest, XGBoost.
history_DecisionTree, history_RandomForest, history_XGBoost = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../history_bayes/history_bayes_all_DecisionTree.csv',
        '../history_bayes/history_bayes_all_RandomForest.csv',
        '../history_bayes/history_bayes_all_XGBoost.csv',
    )
)

In [18]:
# Preview the first rows of every reloaded history, in model order.
histories = [history_DecisionTree, history_RandomForest, history_XGBoost]
display(*(model_history.head() for model_history in histories))

Unnamed: 0,dataset,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__criterion,param_model__max_depth,param_model__min_samples_leaf,param_model__min_samples_split,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0,0.066453,0.002581,0.008902,0.000697,gini,22,56,20,"OrderedDict([('model__criterion', 'gini'), ('m...",0.801,0.809,0.819,0.83,0.835,0.8188,0.012655,34
1,0,0.072213,0.003454,0.008244,0.001082,entropy,27,19,57,"OrderedDict([('model__criterion', 'entropy'), ...",0.821,0.824,0.822,0.825,0.82,0.8224,0.001855,27
2,0,0.102844,0.007273,0.008777,0.001034,gini,28,7,27,"OrderedDict([('model__criterion', 'gini'), ('m...",0.794,0.796,0.792,0.806,0.811,0.7998,0.007386,47
3,0,0.056093,0.00377,0.011241,0.002541,entropy,6,36,49,"OrderedDict([('model__criterion', 'entropy'), ...",0.823,0.825,0.823,0.821,0.828,0.824,0.002366,16
4,0,0.082433,0.007965,0.009537,0.002427,entropy,14,32,44,"OrderedDict([('model__criterion', 'entropy'), ...",0.819,0.821,0.826,0.824,0.819,0.8218,0.002786,31


Unnamed: 0,dataset,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__max_features,param_model__max_samples,param_model__min_samples_leaf,param_model__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0,1.424231,0.018051,0.029302,0.001117,0.410105,0.863863,233,226,"OrderedDict([('model__max_features', 0.4101045...",0.812,0.817,0.806,0.812,0.828,0.815,0.007376,45
1,0,9.917461,0.075968,0.058881,0.006625,0.837389,0.941658,77,480,"OrderedDict([('model__max_features', 0.8373885...",0.821,0.83,0.819,0.842,0.84,0.8304,0.009436,33
2,0,5.692528,0.091459,0.037279,0.003391,0.444833,0.959361,27,273,"OrderedDict([('model__max_features', 0.4448330...",0.834,0.835,0.82,0.848,0.845,0.8364,0.009851,31
3,0,4.135328,0.01872,0.048913,0.003446,0.812396,0.585936,150,421,"OrderedDict([('model__max_features', 0.8123961...",0.811,0.818,0.812,0.834,0.837,0.8224,0.011002,39
4,0,4.753339,0.046417,0.043432,0.002413,0.799554,0.719015,132,386,"OrderedDict([('model__max_features', 0.7995536...",0.813,0.819,0.815,0.834,0.839,0.824,0.010507,38


Unnamed: 0,dataset,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__alpha,param_model__eta,param_model__max_depth,param_model__min_child_weight,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0,1.20345,0.030098,0.027251,0.002533,0.011234,0.076223,18,6,"OrderedDict([('model__alpha', 0.01123362169089...",0.835,0.826,0.82,0.848,0.841,0.834,0.01006,37
1,0,0.31294,0.008661,0.016681,0.001483,1.537948,0.090382,6,18,"OrderedDict([('model__alpha', 1.53794844658007...",0.834,0.839,0.829,0.84,0.846,0.8376,0.005748,21
2,0,0.15815,0.009617,0.014049,0.001618,0.016756,0.093604,3,8,"OrderedDict([('model__alpha', 0.01675569944093...",0.828,0.84,0.825,0.847,0.847,0.8374,0.009308,25
3,0,0.72495,0.011451,0.021517,0.001934,1.1534,0.02564,12,15,"OrderedDict([('model__alpha', 1.15339998595595...",0.832,0.837,0.822,0.851,0.839,0.8362,0.009453,30
4,0,0.57504,0.014834,0.022393,0.003007,0.994872,0.049861,10,14,"OrderedDict([('model__alpha', 0.99487199982341...",0.839,0.843,0.828,0.851,0.844,0.841,0.007563,5


# New defaults

In [20]:
# Pick the overall best decision-tree configuration from the reloaded history.
# How the overall score is aggregated is defined in utils.get_best_params_overall.
# NOTE(review): in the recorded run every numeric winner lies on a bound of the
# search space (max_depth=30, min_samples_leaf=1, min_samples_split=2) —
# consider widening the space.
(best_params_DecisionTree,
 best_params_DecisionTree_score) = get_best_params_overall(history_DecisionTree)
print(
    f"Best params for DecisionTree: {best_params_DecisionTree}",
    f"with score: {best_params_DecisionTree_score}",
    sep="\n",
)

Best params for DecisionTree: OrderedDict([('model__criterion', 'entropy'), ('model__max_depth', 30), ('model__min_samples_leaf', 1), ('model__min_samples_split', 2)])
with score: 0.9846


In [21]:
# Pick the overall best random-forest configuration from the reloaded history.
# How the overall score is aggregated is defined in utils.get_best_params_overall.
# NOTE(review): in the recorded run all four winners lie on a bound of the
# search space (max_features=1e-06, max_samples=1.0, min_samples_leaf=1,
# n_estimators=500) and the reported score is 1.0 — worth double-checking.
(best_params_RandomForest,
 best_params_RandomForest_score) = get_best_params_overall(history_RandomForest)
print(
    f"Best params for RandomForest: {best_params_RandomForest}",
    f"with score: {best_params_RandomForest_score}",
    sep="\n",
)

Best params for RandomForest: OrderedDict([('model__max_features', 1e-06), ('model__max_samples', 1.0), ('model__min_samples_leaf', 1), ('model__n_estimators', 500)])
with score: 1.0


In [22]:
# Pick the overall best XGBoost configuration from the reloaded history.
# How the overall score is aggregated is defined in utils.get_best_params_overall.
# NOTE(review): in the recorded run alpha, eta and min_child_weight all lie on a
# bound of the search space (0.0001, 0.101 and 0) — consider widening the space.
(best_params_XGBoost,
 best_params_XGBoost_score) = get_best_params_overall(history_XGBoost)
print(
    f"Best params for XGBoost: {best_params_XGBoost}",
    f"With score: {best_params_XGBoost_score}",
    sep="\n",
)

Best params for XGBoost: OrderedDict([('model__alpha', 0.0001), ('model__eta', 0.101), ('model__max_depth', 11), ('model__min_child_weight', 0)])
With score: 0.999
