In [17]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer, make_column_selector

from sklearn.model_selection import cross_validate
from sklearn.calibration import LabelEncoder
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform


from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

import numpy as np
import os
import pandas as pd

import json

# Data


In [23]:
data_path = "../data"
DATA = []
for file in os.listdir(data_path):
    DATA.append(pd.read_csv(os.path.join(data_path,file)))

In [24]:
for data in DATA:
    print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 23 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Date           5000 non-null   object 
 1   Location       5000 non-null   object 
 2   MinTemp        4975 non-null   float64
 3   MaxTemp        4993 non-null   float64
 4   Rainfall       4955 non-null   float64
 5   Evaporation    2853 non-null   float64
 6   Sunshine       2612 non-null   float64
 7   WindGustDir    4665 non-null   object 
 8   WindGustSpeed  4667 non-null   float64
 9   WindDir9am     4608 non-null   object 
 10  WindDir3pm     4868 non-null   object 
 11  WindSpeed9am   4961 non-null   float64
 12  WindSpeed3pm   4914 non-null   float64
 13  Humidity9am    4943 non-null   float64
 14  Humidity3pm    4882 non-null   float64
 15  Pressure9am    4493 non-null   float64
 16  Pressure3pm    4496 non-null   float64
 17  Cloud9am       3103 non-null   float64
 18  Cloud3pm

# Pipeline

In [25]:
num_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer()),
    ('scale', MinMaxScaler())
])

cat_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy="constant", fill_value="missing")),
    ('one-hot', OneHotEncoder(handle_unknown='ignore'))
])


col_trans = ColumnTransformer([
    ('num_pipeline', num_pipeline, make_column_selector(dtype_include = np.number)),
    ('cat_pipeline', cat_pipeline, make_column_selector(dtype_include = np.object_))
])

In [26]:
classifiers = [
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    XGBClassifier(),
]

pipelines = []

In [27]:
for classifier in classifiers:
    pipelines.append((type(classifier), Pipeline([("transformer", col_trans), ("model", classifier)])))

In [62]:
pipelines

[(sklearn.tree._classes.DecisionTreeClassifier,
  Pipeline(steps=[('transformer',
                   ColumnTransformer(transformers=[('num_pipeline',
                                                    Pipeline(steps=[('impute',
                                                                     SimpleImputer()),
                                                                    ('scale',
                                                                     MinMaxScaler())]),
                                                    <sklearn.compose._column_transformer.make_column_selector object at 0x00000227F24DB190>),
                                                   ('cat_pipeline',
                                                    Pipeline(steps=[('impute',
                                                                     SimpleImputer(fill_value='missing',
                                                                                   strategy='constant')),
                  

In [63]:
scores = []

for pipe in pipelines:
    pipe_score = []
    for data in DATA:
        score = cross_validate(pipe[1], data.iloc[:, :-1], LabelEncoder().fit_transform(data.iloc[:,-1]), cv = 5, scoring="roc_auc")
        pipe_score.append(score["test_score"].mean())

    scores.append((pipe[0], pipe_score))

In [29]:
os.listdir("../data")

['banking_final.csv',
 'flights_final.csv',
 'mushrooms_final.csv',
 'weather_final.csv']

In [64]:
scores

[(sklearn.tree._classes.DecisionTreeClassifier,
  [0.6856017268248299,
   0.9118589079864705,
   0.9854175523838447,
   0.6788933190280879]),
 (sklearn.ensemble._forest.RandomForestClassifier,
  [0.9175702490538278, 0.9863045570267867, 1.0, 0.8444766611174457]),
 (xgboost.sklearn.XGBClassifier,
  [0.9103290065917472,
   0.9891123481904669,
   0.9999846138273105,
   0.8337667489284263])]

# Random Searching - searching for new defaults

In [4]:
param_distributions = [
    {
        "model__max_depth": randint(1,31),
        "model__min_samples_split":randint(2,61),
        "model__criterion":["gini","entropy"],
        "model__min_samples_leaf":randint(1,61)
    },
    {
        "model__n_estimators":randint(1,2001),
        "model__min_samples_leaf":randint(1,5001),
        "model__max_samples":uniform(0.1,0.9),
        "model__max_features":uniform(0,1)
    },
    {
        "model__max_depth": randint(1,16),
        "model__min_child_weight": randint(1,129),
        "model__eta": uniform(2**(-10),1-2**(-10)),
        "model__alpha": uniform(2**(-10),2**(10))
    }
]

In [9]:
best_params = [[],[],[],[]]
pipe_best_models = []
pipe_best_scores = []
history = [[],[],[]]
for i,pipe in enumerate(pipelines):    
    for j,data in enumerate(DATA):
        rs = RandomizedSearchCV(pipe[1], 
                                param_distributions= param_distributions[i],
                                verbose=766751,
                                random_state=42,
                                cv=5,
                                n_iter=2000,
                                n_jobs=-1,
                                scoring="roc_auc"
                                )
        rs.fit(data.iloc[:, :-1],LabelEncoder().fit_transform(data.iloc[:,-1]))
        pipe_best_scores.append(rs.best_score_)
        pipe_best_models.append(rs.best_estimator_)
        best_params[j].append(rs.best_params_)
        history[i].append(rs.cv_results_)    


Fitting 5 folds for each of 2000 candidates, totalling 10000 fits
Fitting 5 folds for each of 2000 candidates, totalling 10000 fits
Fitting 5 folds for each of 2000 candidates, totalling 10000 fits
Fitting 5 folds for each of 2000 candidates, totalling 10000 fits
Fitting 5 folds for each of 2000 candidates, totalling 10000 fits
Fitting 5 folds for each of 2000 candidates, totalling 10000 fits
Fitting 5 folds for each of 2000 candidates, totalling 10000 fits
Fitting 5 folds for each of 2000 candidates, totalling 10000 fits
Fitting 5 folds for each of 2000 candidates, totalling 10000 fits
Fitting 5 folds for each of 2000 candidates, totalling 10000 fits
Fitting 5 folds for each of 2000 candidates, totalling 10000 fits
Fitting 5 folds for each of 2000 candidates, totalling 10000 fits


In [10]:
history_datasets = []
for h in history:
    df = pd.concat([pd.DataFrame(h[i]) for i in range(len(h))], keys=range(len(h)), names=['dataset'])
    df = df.reset_index()
    df.drop(columns='level_1', inplace=True)
    history_datasets.append(df)
print(f"{pipelines[0][0]} shape: {history_datasets[0].shape}")
print(f"{pipelines[1][0]} shape: {history_datasets[1].shape}")
print(f"{pipelines[2][0]} shape: {history_datasets[2].shape}")

<class 'sklearn.tree._classes.DecisionTreeClassifier'> shape: (8000, 18)
<class 'sklearn.ensemble._forest.RandomForestClassifier'> shape: (8000, 18)
<class 'xgboost.sklearn.XGBClassifier'> shape: (8000, 18)


In [11]:
model_names = ['DecisionTree','RandomForest','XGBoost']

# Saving history to csv files

In [12]:
for i, df in enumerate(history_datasets):
    df.to_csv(f'../history/history_dataset_{model_names[i]}.csv', index=False)

# Reading history from csv

In [6]:
history_DecisionTree = pd.read_csv('../history/history_dataset_DecisionTree.csv')
history_RandomForest = pd.read_csv('../history/history_dataset_RandomForest.csv')
history_XGBoost = pd.read_csv('../history/history_dataset_XGBoost.csv')

In [7]:
pd.set_option('display.max_colwidth', None)

history_XGBoost[['params', 'mean_test_score']].head(20)

Unnamed: 0,params,mean_test_score
0,"{'model__alpha': 383.5300582621992, 'model__eta': 0.9507624369700627, 'model__max_depth': 11, 'model__min_child_weight': 72}",0.5
1,"{'model__alpha': 613.0272643802655, 'model__eta': 0.15684284098887946, 'model__max_depth': 3, 'model__min_child_weight': 87}",0.5
2,"{'model__alpha': 59.47859542273625, 'model__eta': 0.8663068331325768, 'model__max_depth': 4, 'model__min_child_weight': 104}",0.831022
3,"{'model__alpha': 725.0672962256506, 'model__eta': 0.0215409547505917, 'model__max_depth': 2, 'model__min_child_weight': 88}",0.5
4,"{'model__alpha': 852.4222407421319, 'model__eta': 0.2131083107655044, 'model__max_depth': 12, 'model__min_child_weight': 21}",0.5
5,"{'model__alpha': 632.3020424212817, 'model__eta': 0.6120324054487416, 'model__max_depth': 13, 'model__min_child_weight': 108}",0.5
6,"{'model__alpha': 23.616899804909735, 'model__eta': 0.5252387475042306, 'model__max_depth': 15, 'model__min_child_weight': 42}",0.877764
7,"{'model__alpha': 47.7866156932422, 'model__eta': 0.9737811482175905, 'model__max_depth': 15, 'model__min_child_weight': 62}",0.854456
8,"{'model__alpha': 92.7819655241085, 'model__eta': 0.6187586792458479, 'model__max_depth': 12, 'model__min_child_weight': 55}",0.828152
9,"{'model__alpha': 1006.8294036286511, 'model__eta': 0.46728363261004247, 'model__max_depth': 5, 'model__min_child_weight': 51}",0.5


In [8]:
history_DecisionTree.shape

(8000, 18)

In [9]:
history_RandomForest.head()

Unnamed: 0,dataset,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__max_features,param_model__max_samples,param_model__min_samples_leaf,param_model__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0,3.088375,0.034357,0.123892,0.004394,0.37454,0.955643,3773,1045,"{'model__max_features': 0.3745401188473625, 'model__max_samples': 0.9556428757689246, 'model__min_samples_leaf': 3773, 'model__n_estimators': 1045}",0.5,0.5,0.5,0.5,0.5,0.5,0.0,310
1,0,4.22493,0.038158,0.159984,0.007611,0.156019,0.240395,4427,1483,"{'model__max_features': 0.15601864044243652, 'model__max_samples': 0.2403950683025824, 'model__min_samples_leaf': 4427, 'model__n_estimators': 1483}",0.5,0.5,0.5,0.5,0.5,0.5,0.0,310
2,0,17.30488,0.262087,0.283086,0.018685,0.866176,0.641004,131,1686,"{'model__max_features': 0.8661761457749352, 'model__max_samples': 0.6410035105688879, 'model__min_samples_leaf': 131, 'model__n_estimators': 1686}",0.880235,0.867923,0.873697,0.833919,0.858916,0.862938,0.016113,43
3,0,3.379081,0.009503,0.126846,0.004132,0.056412,0.749799,2434,1216,"{'model__max_features': 0.056411579027100256, 'model__max_samples': 0.7497988950401423, 'model__min_samples_leaf': 2434, 'model__n_estimators': 1216}",0.5,0.5,0.5,0.5,0.5,0.5,0.0,310
4,0,0.115851,0.019847,0.015705,0.00469,0.992212,0.655733,3386,22,"{'model__max_features': 0.9922115592912175, 'model__max_samples': 0.6557333586649449, 'model__min_samples_leaf': 3386, 'model__n_estimators': 22}",0.5,0.5,0.5,0.5,0.5,0.5,0.0,310


In [10]:
history_XGBoost.head()

Unnamed: 0,dataset,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__alpha,param_model__eta,param_model__max_depth,param_model__min_child_weight,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0,0.165986,0.013078,0.015945,0.005451,383.530058,0.950762,11,72,"{'model__alpha': 383.5300582621992, 'model__eta': 0.9507624369700627, 'model__max_depth': 11, 'model__min_child_weight': 72}",0.5,0.5,0.5,0.5,0.5,0.5,0.0,714
1,0,0.1605,0.013494,0.011362,0.004381,613.027264,0.156843,3,87,"{'model__alpha': 613.0272643802655, 'model__eta': 0.15684284098887946, 'model__max_depth': 3, 'model__min_child_weight': 87}",0.5,0.5,0.5,0.5,0.5,0.5,0.0,714
2,0,0.134376,0.005874,0.01431,0.003767,59.478595,0.866307,4,104,"{'model__alpha': 59.47859542273625, 'model__eta': 0.8663068331325768, 'model__max_depth': 4, 'model__min_child_weight': 104}",0.840699,0.839393,0.830057,0.802664,0.842297,0.831022,0.014805,160
3,0,0.140599,0.005001,0.016829,0.010731,725.067296,0.021541,2,88,"{'model__alpha': 725.0672962256506, 'model__eta': 0.0215409547505917, 'model__max_depth': 2, 'model__min_child_weight': 88}",0.5,0.5,0.5,0.5,0.5,0.5,0.0,714
4,0,0.145276,0.009771,0.017556,0.002008,852.422241,0.213108,12,21,"{'model__alpha': 852.4222407421319, 'model__eta': 0.2131083107655044, 'model__max_depth': 12, 'model__min_child_weight': 21}",0.5,0.5,0.5,0.5,0.5,0.5,0.0,714


# New defaults below

In [11]:
def get_best_params_overall(df):
    df['params_str'] = df['params'].apply(lambda x: str(x))
    grouped_mean = df.groupby(['params_str'])['mean_test_score'].mean().reset_index()
    grouped_mean.sort_values(by='mean_test_score', ascending=False, inplace=True)
    return grouped_mean.iloc[0, 0], grouped_mean.iloc[0, 1]

In [12]:
best_params_DecisionTree, best_params_DecisionTree_score = get_best_params_overall(history_DecisionTree)
print(f"Best params for DecisionTree: {best_params_DecisionTree}") 
print(f"with score: {best_params_DecisionTree_score}")

Best params for DecisionTree: {'model__criterion': 'gini', 'model__max_depth': 17, 'model__min_samples_leaf': 10, 'model__min_samples_split': 58}
with score: 0.9035694410593584


In [13]:
best_params_RandomForest, best_params_RandomForest_score = get_best_params_overall(history_RandomForest)
print(f"Best params for RandomForest: {best_params_RandomForest}") 
print(f"with score: {best_params_RandomForest_score}")

Best params for RandomForest: {'model__max_features': 0.6325405332263061, 'model__max_samples': 0.43556377347004105, 'model__min_samples_leaf': 2, 'model__n_estimators': 375}
with score: 0.9398295218158871


In [14]:
best_params_XGBoost, best_params_XGBoost_score = get_best_params_overall(history_XGBoost)
print(f"Best params for XGBoost: {best_params_XGBoost}")
print(f"With score: {best_params_XGBoost_score}")

Best params for XGBoost: {'model__alpha': 5.148166434938389, 'model__eta': 0.06101428497741022, 'model__max_depth': 9, 'model__min_child_weight': 26}
With score: 0.9312130131165086


# Tunability

##### Now let's compute tunability of each of the ML algorithms. We'll start with looking for the optimal configuration of the hyperparameters for each of the datasets

In [69]:
def get_best_params_per_dataset(df):
    df['params_str'] = df['params'].apply(lambda x: str(x))
    best_params_per_dataset = df.sort_values(['dataset', 'rank_test_score'], ascending=[True, True]).groupby('dataset').first().reset_index()
    best_params_per_dataset.rename(columns={'params_str': 'best_params', 'mean_test_score': 'best_score'}, inplace=True)
    best_params_per_dataset = best_params_per_dataset[['dataset', 'best_params', 'best_score']]
    default_params, _ = get_best_params_overall(df)
    score_for_default_params = df[df['params_str'] == default_params][['dataset', 'mean_test_score']].rename(columns={'mean_test_score': 'default_score'})
    best_params_per_dataset = best_params_per_dataset.merge(score_for_default_params, on='dataset', how='left')
    best_params_per_dataset['abs_tunability'] = best_params_per_dataset['best_score'] - best_params_per_dataset['default_score']
    best_params_per_dataset['rel_tunability (%)'] = best_params_per_dataset['abs_tunability'] / best_params_per_dataset['default_score'] * 100 
    return best_params_per_dataset

In [24]:
best_params_per_dataset_DecisionTree = get_best_params_per_dataset(history_DecisionTree)
best_params_per_dataset_DecisionTree

Unnamed: 0,dataset,best_params,best_score,default_score,abs_tunability,rel_tunability (%)
0,0,"{'model__criterion': 'gini', 'model__max_depth': 16, 'model__min_samples_leaf': 1, 'model__min_samples_split': 55}",0.875537,0.86955,0.005987,0.688491
1,1,"{'model__criterion': 'entropy', 'model__max_depth': 10, 'model__min_samples_leaf': 1, 'model__min_samples_split': 58}",0.97676,0.971298,0.005462,0.56235
2,2,"{'model__criterion': 'gini', 'model__max_depth': 20, 'model__min_samples_leaf': 6, 'model__min_samples_split': 23}",0.987566,0.976419,0.011146,1.141524
3,3,"{'model__criterion': 'entropy', 'model__max_depth': 7, 'model__min_samples_leaf': 45, 'model__min_samples_split': 47}",0.81752,0.79701,0.02051,2.573347


In [25]:
best_params_per_dataset_RandomForest = get_best_params_per_dataset(history_RandomForest)
best_params_per_dataset_RandomForest

Unnamed: 0,dataset,best_params,best_score,default_score,abs_tunability,rel_tunability (%)
0,0,"{'model__max_features': 0.6325405332263061, 'model__max_samples': 0.43556377347004105, 'model__min_samples_leaf': 2, 'model__n_estimators': 375}",0.924874,0.924874,0.0,0.0
1,1,"{'model__max_features': 0.7758047357929561, 'model__max_samples': 0.9471137538091479, 'model__min_samples_leaf': 2, 'model__n_estimators': 225}",0.985769,0.985332,0.000437,0.044341
2,2,"{'model__max_features': 0.7758047357929561, 'model__max_samples': 0.9471137538091479, 'model__min_samples_leaf': 2, 'model__n_estimators': 225}",0.999979,0.99996,1.9e-05,0.001863
3,3,"{'model__max_features': 0.8791183075621647, 'model__max_samples': 0.9502588200622692, 'model__min_samples_leaf': 11, 'model__n_estimators': 1086}",0.849182,0.849151,3.1e-05,0.00364


In [26]:
best_params_per_dataset_XGBoost = get_best_params_per_dataset(history_XGBoost)
best_params_per_dataset_XGBoost

Unnamed: 0,dataset,best_params,best_score,default_score,abs_tunability,rel_tunability (%)
0,0,"{'model__alpha': 5.148166434938389, 'model__eta': 0.06101428497741022, 'model__max_depth': 9, 'model__min_child_weight': 26}",0.900224,0.900224,0.0,0.0
1,1,"{'model__alpha': 13.600928904086004, 'model__eta': 0.5424625491648044, 'model__max_depth': 13, 'model__min_child_weight': 20}",0.986025,0.985244,0.00078,0.07921
2,2,"{'model__alpha': 0.9526994373401294, 'model__eta': 0.9033181355211285, 'model__max_depth': 10, 'model__min_child_weight': 13}",0.998642,0.991926,0.006716,0.677032
3,3,"{'model__alpha': 5.148166434938389, 'model__eta': 0.06101428497741022, 'model__max_depth': 9, 'model__min_child_weight': 26}",0.847458,0.847458,0.0,0.0


# Badanie tunowalności poszczególnych parametrów

In [3]:
history_DecisionTree = pd.read_csv('../history/history_dataset_DecisionTree.csv')
history_RandomForest = pd.read_csv('../history/history_dataset_RandomForest.csv')
history_XGBoost = pd.read_csv('../history/history_dataset_XGBoost.csv')

In [41]:
def do_random_search(clf, param_distributions):
    global DATA 
    history = []
    for j,data in enumerate(DATA):
        rs = RandomizedSearchCV(clf, 
                                param_distributions= param_distributions,
                                #verbose=766751,
                                random_state=42,
                                cv=5,
                                n_iter=200,
                                n_jobs=-1,
                                scoring="roc_auc"
                                )
        rs.fit(data.iloc[:, :-1],LabelEncoder().fit_transform(data.iloc[:,-1]))
        history.append(rs.cv_results_)
    return history

## Decision tree

In [47]:
param_distributions_Decision_Tree = param_distributions[0]
tunable_parameters = param_distributions_Decision_Tree.keys()
tunable_parameters

dict_keys(['model__max_depth', 'model__min_samples_split', 'model__criterion', 'model__min_samples_leaf'])

In [48]:
param_distributions_Decision_Tree

{'model__max_depth': <scipy.stats._distn_infrastructure.rv_discrete_frozen at 0x133824f10>,
 'model__min_samples_split': <scipy.stats._distn_infrastructure.rv_discrete_frozen at 0x132188e90>,
 'model__criterion': ['gini', 'entropy'],
 'model__min_samples_leaf': <scipy.stats._distn_infrastructure.rv_discrete_frozen at 0x133826a10>}

In [49]:
best_params_DT_dict = json.loads(best_params_DecisionTree.replace("'", "\""))
best_params_DT_dict = {key: [value] for key, value in best_params_DT_dict.items()}
best_params_DT_dict

{'model__criterion': ['gini'],
 'model__max_depth': [17],
 'model__min_samples_leaf': [10],
 'model__min_samples_split': [58]}

In [None]:
param_history = {}
for param in tunable_parameters:
    temp_param_grid = {**best_params_DT_dict} 
    temp_param_grid.update({param:param_distributions_Decision_Tree[param]})
    print(temp_param_grid)
    print("Testing param", param)
    history = do_random_search(pipelines[0][1], temp_param_grid)
    param_history.update({param: history})

In [62]:
len(param_history["model__max_depth"][])

17

In [63]:
params_history_frames = {}
for param, history in param_history.items():
    df = pd.concat([pd.DataFrame(history[i]) for i in range(len(history))], keys=range(len(history)), names=['dataset'])
    df = df.reset_index()
    df.drop(columns='level_1', inplace=True)
    params_history_frames.update({param:df})

In [64]:
params_history_frames["model__criterion"]

Unnamed: 0,dataset,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__min_samples_split,param_model__min_samples_leaf,param_model__max_depth,param_model__criterion,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0,0.105099,0.00878,0.015429,0.003671,58,10,17,gini,"{'model__min_samples_split': 58, 'model__min_samples_leaf': 10, 'model__max_depth': 17, 'model__criterion': 'gini'}",0.787883,0.800885,0.789166,0.804345,0.807687,0.797993,0.008035,1
1,0,0.114604,0.012346,0.01294,0.002294,58,10,17,entropy,"{'model__min_samples_split': 58, 'model__min_samples_leaf': 10, 'model__max_depth': 17, 'model__criterion': 'entropy'}",0.771594,0.789281,0.775129,0.783114,0.793746,0.782573,0.00832,2
2,1,0.040952,0.007055,0.006948,0.00158,58,10,17,gini,"{'model__min_samples_split': 58, 'model__min_samples_leaf': 10, 'model__max_depth': 17, 'model__criterion': 'gini'}",0.975798,0.971953,0.96436,0.975191,0.968272,0.971114,0.004312,2
3,1,0.04423,0.005163,0.005204,0.000452,58,10,17,entropy,"{'model__min_samples_split': 58, 'model__min_samples_leaf': 10, 'model__max_depth': 17, 'model__criterion': 'entropy'}",0.981627,0.97352,0.967011,0.976724,0.973755,0.974527,0.004761,1
4,2,0.046389,0.003247,0.011586,0.004546,58,10,17,gini,"{'model__min_samples_split': 58, 'model__min_samples_leaf': 10, 'model__max_depth': 17, 'model__criterion': 'gini'}",0.861568,0.902895,0.847828,0.873421,0.863093,0.869761,0.01846,1
5,2,0.045041,0.002042,0.00671,0.001326,58,10,17,entropy,"{'model__min_samples_split': 58, 'model__min_samples_leaf': 10, 'model__max_depth': 17, 'model__criterion': 'entropy'}",0.855224,0.894338,0.847901,0.851003,0.86877,0.863447,0.01701,2
6,3,0.077533,0.007429,0.011376,0.002776,58,10,17,gini,"{'model__min_samples_split': 58, 'model__min_samples_leaf': 10, 'model__max_depth': 17, 'model__criterion': 'gini'}",0.97886,0.988254,0.971594,0.969296,0.973894,0.97638,0.006729,2
7,3,0.080284,0.009185,0.00884,0.000378,58,10,17,entropy,"{'model__min_samples_split': 58, 'model__min_samples_leaf': 10, 'model__max_depth': 17, 'model__criterion': 'entropy'}",0.975028,0.980824,0.973653,0.986539,0.975463,0.978301,0.004788,1


In [65]:
params_history_frames['model__max_depth']

Unnamed: 0,dataset,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__criterion,param_model__max_depth,param_model__min_samples_leaf,param_model__min_samples_split,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0,0.107094,0.016469,0.012695,0.002983,gini,7,10,58,"{'model__criterion': 'gini', 'model__max_depth': 7, 'model__min_samples_leaf': 10, 'model__min_samples_split': 58}",0.787423,0.812503,0.797988,0.813216,0.821115,0.806449,0.012092,14
1,0,0.122068,0.007226,0.017022,0.004310,gini,20,10,58,"{'model__criterion': 'gini', 'model__max_depth': 20, 'model__min_samples_leaf': 10, 'model__min_samples_split': 58}",0.788119,0.795348,0.789710,0.801270,0.799576,0.794804,0.005206,125
2,0,0.122834,0.010963,0.014593,0.004015,gini,29,10,58,"{'model__criterion': 'gini', 'model__max_depth': 29, 'model__min_samples_leaf': 10, 'model__min_samples_split': 58}",0.787998,0.798676,0.789166,0.797537,0.796837,0.794043,0.004512,147
3,0,0.102597,0.011336,0.012477,0.003648,gini,15,10,58,"{'model__criterion': 'gini', 'model__max_depth': 15, 'model__min_samples_leaf': 10, 'model__min_samples_split': 58}",0.783157,0.794033,0.788769,0.808174,0.801999,0.795226,0.008970,107
4,0,0.094956,0.013095,0.009644,0.001571,gini,11,10,58,"{'model__criterion': 'gini', 'model__max_depth': 11, 'model__min_samples_leaf': 10, 'model__min_samples_split': 58}",0.791269,0.798647,0.799395,0.804118,0.793543,0.797394,0.004542,56
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
795,3,0.083718,0.008882,0.012290,0.002154,gini,29,10,58,"{'model__criterion': 'gini', 'model__max_depth': 29, 'model__min_samples_leaf': 10, 'model__min_samples_split': 58}",0.983842,0.989135,0.977992,0.969296,0.980356,0.980124,0.006586,57
796,3,0.052561,0.008390,0.012371,0.003251,gini,4,10,58,"{'model__criterion': 'gini', 'model__max_depth': 4, 'model__min_samples_leaf': 10, 'model__min_samples_split': 58}",0.777016,0.772864,0.771021,0.777688,0.753694,0.770457,0.008746,172
797,3,0.093595,0.002104,0.009494,0.001203,gini,30,10,58,"{'model__criterion': 'gini', 'model__max_depth': 30, 'model__min_samples_leaf': 10, 'model__min_samples_split': 58}",0.983842,0.989135,0.977992,0.969430,0.980170,0.980114,0.006541,65
798,3,0.050612,0.004125,0.010582,0.002098,gini,5,10,58,"{'model__criterion': 'gini', 'model__max_depth': 5, 'model__min_samples_leaf': 10, 'model__min_samples_split': 58}",0.816716,0.815860,0.822976,0.829819,0.790297,0.815134,0.013395,166


In [66]:
params_history_frames['model__min_samples_leaf']

Unnamed: 0,dataset,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__criterion,param_model__max_depth,param_model__min_samples_leaf,param_model__min_samples_split,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0,0.084143,0.006159,0.015894,0.003470,gini,17,39,58,"{'model__criterion': 'gini', 'model__max_depth': 17, 'model__min_samples_leaf': 39, 'model__min_samples_split': 58}",0.797292,0.810846,0.798794,0.826523,0.826962,0.812083,0.012859,18
1,0,0.082901,0.002799,0.019535,0.004657,gini,17,52,58,"{'model__criterion': 'gini', 'model__max_depth': 17, 'model__min_samples_leaf': 52, 'model__min_samples_split': 58}",0.797548,0.806310,0.795543,0.828683,0.820422,0.809701,0.012922,59
2,0,0.118521,0.006652,0.016944,0.003846,gini,17,29,58,"{'model__criterion': 'gini', 'model__max_depth': 17, 'model__min_samples_leaf': 29, 'model__min_samples_split': 58}",0.793556,0.798264,0.800928,0.813783,0.825703,0.806447,0.011732,119
3,0,0.111362,0.003045,0.012652,0.002128,gini,17,15,58,"{'model__criterion': 'gini', 'model__max_depth': 17, 'model__min_samples_leaf': 15, 'model__min_samples_split': 58}",0.784932,0.801552,0.798437,0.825930,0.808774,0.803925,0.013448,147
4,0,0.102354,0.007219,0.014424,0.003492,gini,17,43,58,"{'model__criterion': 'gini', 'model__max_depth': 17, 'model__min_samples_leaf': 43, 'model__min_samples_split': 58}",0.801661,0.810440,0.796803,0.829411,0.824846,0.812632,0.012700,13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
795,3,0.087806,0.010458,0.013088,0.002083,gini,17,43,58,"{'model__criterion': 'gini', 'model__max_depth': 17, 'model__min_samples_leaf': 43, 'model__min_samples_split': 58}",0.927517,0.936200,0.944952,0.922405,0.952967,0.936808,0.011150,141
796,3,0.089958,0.005880,0.014912,0.003691,gini,17,29,58,"{'model__criterion': 'gini', 'model__max_depth': 17, 'model__min_samples_leaf': 29, 'model__min_samples_split': 58}",0.951572,0.962508,0.950054,0.954153,0.957318,0.955121,0.004441,98
797,3,0.090991,0.012452,0.013759,0.002187,gini,17,36,58,"{'model__criterion': 'gini', 'model__max_depth': 17, 'model__min_samples_leaf': 36, 'model__min_samples_split': 58}",0.948896,0.947442,0.935759,0.938638,0.939536,0.942054,0.005167,121
798,3,0.091313,0.007543,0.014628,0.003508,gini,17,13,58,"{'model__criterion': 'gini', 'model__max_depth': 17, 'model__min_samples_leaf': 13, 'model__min_samples_split': 58}",0.973505,0.987007,0.963525,0.970254,0.972125,0.973283,0.007670,41


In [67]:
params_history_frames['model__min_samples_split']

Unnamed: 0,dataset,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__criterion,param_model__max_depth,param_model__min_samples_leaf,param_model__min_samples_split,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0,0.104002,0.008128,0.015526,0.002978,gini,17,10,40,"{'model__criterion': 'gini', 'model__max_depth': 17, 'model__min_samples_leaf': 10, 'model__min_samples_split': 40}",0.764943,0.796521,0.765789,0.787509,0.794672,0.781887,0.013824,57
1,0,0.125067,0.012540,0.013333,0.003034,gini,17,10,53,"{'model__criterion': 'gini', 'model__max_depth': 17, 'model__min_samples_leaf': 10, 'model__min_samples_split': 53}",0.775253,0.789669,0.781699,0.801932,0.797884,0.789287,0.009884,31
2,0,0.160459,0.020617,0.014780,0.004172,gini,17,10,30,"{'model__criterion': 'gini', 'model__max_depth': 17, 'model__min_samples_leaf': 10, 'model__min_samples_split': 30}",0.760258,0.780376,0.753331,0.773156,0.780169,0.769458,0.010884,103
3,0,0.141070,0.010011,0.015746,0.002139,gini,17,10,16,"{'model__criterion': 'gini', 'model__max_depth': 17, 'model__min_samples_leaf': 10, 'model__min_samples_split': 16}",0.752134,0.776199,0.745130,0.760574,0.780668,0.762941,0.013636,143
4,0,0.119967,0.012509,0.015516,0.003821,gini,17,10,44,"{'model__criterion': 'gini', 'model__max_depth': 17, 'model__min_samples_leaf': 10, 'model__min_samples_split': 44}",0.770317,0.795258,0.768850,0.789057,0.797720,0.784240,0.012305,49
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
795,3,0.115553,0.017846,0.029131,0.022153,gini,17,10,29,"{'model__criterion': 'gini', 'model__max_depth': 17, 'model__min_samples_leaf': 10, 'model__min_samples_split': 29}",0.982429,0.990588,0.979028,0.972740,0.982205,0.981398,0.005775,98
796,3,0.130162,0.012242,0.016124,0.006800,gini,17,10,3,"{'model__criterion': 'gini', 'model__max_depth': 17, 'model__min_samples_leaf': 10, 'model__min_samples_split': 3}",0.983239,0.990349,0.980472,0.975890,0.982976,0.982585,0.004694,65
797,3,0.099085,0.004707,0.015498,0.004152,gini,17,10,43,"{'model__criterion': 'gini', 'model__max_depth': 17, 'model__min_samples_leaf': 10, 'model__min_samples_split': 43}",0.978965,0.990485,0.975483,0.971258,0.977482,0.978735,0.006421,144
798,3,0.093473,0.007794,0.012530,0.002267,gini,17,10,46,"{'model__criterion': 'gini', 'model__max_depth': 17, 'model__min_samples_leaf': 10, 'model__min_samples_split': 46}",0.978767,0.990485,0.975483,0.971254,0.976722,0.978542,0.006457,161


In [74]:
from IPython.display import display

for param in tunable_parameters:
    print("Results for param:", param)
    display(get_best_params_per_dataset(params_history_frames[param]).head())

Results for param: model__max_depth


Unnamed: 0,dataset,best_params,best_score,default_score,abs_tunability,rel_tunability (%)
0,0,"{'model__criterion': 'gini', 'model__max_depth': 8, 'model__min_samples_leaf': 10, 'model__min_samples_split': 58}",0.809599,0.796915,0.012684,1.591678
1,0,"{'model__criterion': 'gini', 'model__max_depth': 8, 'model__min_samples_leaf': 10, 'model__min_samples_split': 58}",0.809599,0.797666,0.011933,1.495946
2,0,"{'model__criterion': 'gini', 'model__max_depth': 8, 'model__min_samples_leaf': 10, 'model__min_samples_split': 58}",0.809599,0.795739,0.013861,1.741845
3,0,"{'model__criterion': 'gini', 'model__max_depth': 8, 'model__min_samples_leaf': 10, 'model__min_samples_split': 58}",0.809599,0.79666,0.012939,1.624192
4,0,"{'model__criterion': 'gini', 'model__max_depth': 8, 'model__min_samples_leaf': 10, 'model__min_samples_split': 58}",0.809599,0.79617,0.013429,1.686681


Results for param: model__min_samples_split


Unnamed: 0,dataset,best_params,best_score,default_score,abs_tunability,rel_tunability (%)
0,0,"{'model__criterion': 'gini', 'model__max_depth': 17, 'model__min_samples_leaf': 10, 'model__min_samples_split': 59}",0.797495,0.796544,0.000951,0.119357
1,0,"{'model__criterion': 'gini', 'model__max_depth': 17, 'model__min_samples_leaf': 10, 'model__min_samples_split': 59}",0.797495,0.794831,0.002664,0.335138
2,0,"{'model__criterion': 'gini', 'model__max_depth': 17, 'model__min_samples_leaf': 10, 'model__min_samples_split': 59}",0.797495,0.795853,0.001642,0.206282
3,1,"{'model__criterion': 'gini', 'model__max_depth': 17, 'model__min_samples_leaf': 10, 'model__min_samples_split': 59}",0.971501,0.971401,0.0001,0.010325
4,1,"{'model__criterion': 'gini', 'model__max_depth': 17, 'model__min_samples_leaf': 10, 'model__min_samples_split': 59}",0.971501,0.971208,0.000293,0.030144


Results for param: model__criterion


Unnamed: 0,dataset,best_params,best_score,default_score,abs_tunability,rel_tunability (%)
0,0,"{'model__min_samples_split': 58, 'model__min_samples_leaf': 10, 'model__max_depth': 17, 'model__criterion': 'gini'}",0.797993,0.797993,0.0,0.0
1,1,"{'model__min_samples_split': 58, 'model__min_samples_leaf': 10, 'model__max_depth': 17, 'model__criterion': 'entropy'}",0.974527,0.971114,0.003413,0.351431
2,2,"{'model__min_samples_split': 58, 'model__min_samples_leaf': 10, 'model__max_depth': 17, 'model__criterion': 'gini'}",0.869761,0.869761,0.0,0.0
3,3,"{'model__min_samples_split': 58, 'model__min_samples_leaf': 10, 'model__max_depth': 17, 'model__criterion': 'entropy'}",0.978301,0.97638,0.001922,0.196814


Results for param: model__min_samples_leaf


Unnamed: 0,dataset,best_params,best_score,default_score,abs_tunability,rel_tunability (%)
0,0,"{'model__criterion': 'gini', 'model__max_depth': 17, 'model__min_samples_leaf': 38, 'model__min_samples_split': 58}",0.814608,0.794246,0.020362,2.563726
1,0,"{'model__criterion': 'gini', 'model__max_depth': 17, 'model__min_samples_leaf': 38, 'model__min_samples_split': 58}",0.814608,0.794201,0.020407,2.569543
2,0,"{'model__criterion': 'gini', 'model__max_depth': 17, 'model__min_samples_leaf': 38, 'model__min_samples_split': 58}",0.814608,0.79278,0.021829,2.753422
3,0,"{'model__criterion': 'gini', 'model__max_depth': 17, 'model__min_samples_leaf': 38, 'model__min_samples_split': 58}",0.814608,0.794175,0.020434,2.572933
4,0,"{'model__criterion': 'gini', 'model__max_depth': 17, 'model__min_samples_leaf': 38, 'model__min_samples_split': 58}",0.814608,0.794554,0.020055,2.524043


### Save results to file

In [78]:
for param, history in params_history_frames.items():
    print(history.shape)
    history.to_csv(f'../history/history_hyperparameter_tuning_DT_{param}.csv', index=False)

(800, 19)
(800, 19)
(8, 19)
(800, 19)
