In [2]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer, make_column_selector

from sklearn.model_selection import cross_validate
from sklearn.calibration import LabelEncoder
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform


from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

import numpy as np
import os
import pandas as pd

# Data


In [3]:
data_path = "../data"
DATA = []
for file in os.listdir(data_path):
    DATA.append(pd.read_csv(os.path.join(data_path,file)))

In [11]:
for data in DATA:
    print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        5000 non-null   int64 
 1   job        5000 non-null   object
 2   marital    5000 non-null   object
 3   education  5000 non-null   object
 4   default    5000 non-null   object
 5   balance    5000 non-null   int64 
 6   housing    5000 non-null   object
 7   loan       5000 non-null   object
 8   contact    5000 non-null   object
 9   day        5000 non-null   int64 
 10  month      5000 non-null   object
 11  duration   5000 non-null   int64 
 12  campaign   5000 non-null   int64 
 13  pdays      5000 non-null   int64 
 14  previous   5000 non-null   int64 
 15  poutcome   5000 non-null   object
 16  y          5000 non-null   object
dtypes: int64(7), object(10)
memory usage: 664.2+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 

# Pipeline

In [4]:
num_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer()),
    ('scale', MinMaxScaler())
])

cat_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy="constant", fill_value="missing")),
    ('one-hot', OneHotEncoder(handle_unknown='ignore'))
])


col_trans = ColumnTransformer([
    ('num_pipeline', num_pipeline, make_column_selector(dtype_include = np.number)),
    ('cat_pipeline', cat_pipeline, make_column_selector(dtype_include = np.object_))
])

In [5]:
classifiers = [
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    XGBClassifier(),
]

pipelines = []

In [6]:
for classifier in classifiers:
    pipelines.append((type(classifier), Pipeline([("transformer", col_trans), ("model", classifier)])))

In [62]:
pipelines

[(sklearn.tree._classes.DecisionTreeClassifier,
  Pipeline(steps=[('transformer',
                   ColumnTransformer(transformers=[('num_pipeline',
                                                    Pipeline(steps=[('impute',
                                                                     SimpleImputer()),
                                                                    ('scale',
                                                                     MinMaxScaler())]),
                                                    <sklearn.compose._column_transformer.make_column_selector object at 0x00000227F24DB190>),
                                                   ('cat_pipeline',
                                                    Pipeline(steps=[('impute',
                                                                     SimpleImputer(fill_value='missing',
                                                                                   strategy='constant')),
                  

In [63]:
scores = []

for pipe in pipelines:
    pipe_score = []
    for data in DATA:
        score = cross_validate(pipe[1], data.iloc[:, :-1], LabelEncoder().fit_transform(data.iloc[:,-1]), cv = 5, scoring="roc_auc")
        pipe_score.append(score["test_score"].mean())

    scores.append((pipe[0], pipe_score))

In [29]:
os.listdir("../data")

['banking_final.csv',
 'flights_final.csv',
 'mushrooms_final.csv',
 'weather_final.csv']

In [64]:
scores

[(sklearn.tree._classes.DecisionTreeClassifier,
  [0.6856017268248299,
   0.9118589079864705,
   0.9854175523838447,
   0.6788933190280879]),
 (sklearn.ensemble._forest.RandomForestClassifier,
  [0.9175702490538278, 0.9863045570267867, 1.0, 0.8444766611174457]),
 (xgboost.sklearn.XGBClassifier,
  [0.9103290065917472,
   0.9891123481904669,
   0.9999846138273105,
   0.8337667489284263])]

# Random Searching - searching for new defaults

In [9]:
param_distributions = [
    {
        "model__max_depth": randint(1,31),
        "model__min_samples_split":randint(2,61),
        "model__criterion":["gini","entropy"],
        "model__min_samples_leaf":randint(1,61)
    },
    {
        "model__n_estimators":randint(1,2001),
        "model__min_samples_leaf":randint(1,5001),
        "model__max_samples":uniform(0.1,0.9),
        "model__max_features":uniform(0,1)
    },
    {
        "model__max_depth": randint(1,16),
        "model__min_child_weight": randint(1,129),
        "model__eta": uniform(2**(-10),1-2**(-10)),
        "model__alpha": uniform(2**(-10),2**(10))
    }
]

best_params = [[],[],[],[]]
pipe_best_models = []
pipe_best_scores = []
history = [[],[],[]]
for i,pipe in enumerate(pipelines):    
    for j,data in enumerate(DATA):
        rs = RandomizedSearchCV(pipe[1], 
                                param_distributions= param_distributions[i],
                                verbose=766751,
                                random_state=42,
                                cv=5,
                                n_iter=2000,
                                n_jobs=-1,
                                scoring="roc_auc"
                                )
        rs.fit(data.iloc[:, :-1],LabelEncoder().fit_transform(data.iloc[:,-1]))
        pipe_best_scores.append(rs.best_score_)
        pipe_best_models.append(rs.best_estimator_)
        best_params[j].append(rs.best_params_)
        history[i].append(rs.cv_results_)    


Fitting 5 folds for each of 2000 candidates, totalling 10000 fits
Fitting 5 folds for each of 2000 candidates, totalling 10000 fits
Fitting 5 folds for each of 2000 candidates, totalling 10000 fits
Fitting 5 folds for each of 2000 candidates, totalling 10000 fits
Fitting 5 folds for each of 2000 candidates, totalling 10000 fits
Fitting 5 folds for each of 2000 candidates, totalling 10000 fits
Fitting 5 folds for each of 2000 candidates, totalling 10000 fits
Fitting 5 folds for each of 2000 candidates, totalling 10000 fits
Fitting 5 folds for each of 2000 candidates, totalling 10000 fits
Fitting 5 folds for each of 2000 candidates, totalling 10000 fits
Fitting 5 folds for each of 2000 candidates, totalling 10000 fits
Fitting 5 folds for each of 2000 candidates, totalling 10000 fits


In [10]:
history_datasets = []
for h in history:
    df = pd.concat([pd.DataFrame(h[i]) for i in range(len(h))], keys=range(len(h)), names=['dataset'])
    df = df.reset_index()
    df.drop(columns='level_1', inplace=True)
    history_datasets.append(df)
print(f"{pipelines[0][0]} shape: {history_datasets[0].shape}")
print(f"{pipelines[1][0]} shape: {history_datasets[1].shape}")
print(f"{pipelines[2][0]} shape: {history_datasets[2].shape}")

<class 'sklearn.tree._classes.DecisionTreeClassifier'> shape: (8000, 18)
<class 'sklearn.ensemble._forest.RandomForestClassifier'> shape: (8000, 18)
<class 'xgboost.sklearn.XGBClassifier'> shape: (8000, 18)


In [11]:
model_names = ['DecisionTree','RandomForest','XGBoost']

# Saving history to csv files

In [12]:
for i, df in enumerate(history_datasets):
    df.to_csv(f'../history/history_dataset_{model_names[i]}.csv', index=False)

# Reading history from csv

In [3]:
history_DecisionTree = pd.read_csv('../history/history_dataset_DecisionTree.csv')
history_RandomForest = pd.read_csv('../history/history_dataset_RandomForest.csv')
history_XGBoost = pd.read_csv('../history/history_dataset_XGBoost.csv')

In [4]:
pd.set_option('display.max_colwidth', None)

history_XGBoost[['params', 'mean_test_score']].head(20)

Unnamed: 0,params,mean_test_score
0,"{'model__alpha': 383.5300582621992, 'model__eta': 0.9507624369700627, 'model__max_depth': 11, 'model__min_child_weight': 72}",0.5
1,"{'model__alpha': 613.0272643802655, 'model__eta': 0.15684284098887946, 'model__max_depth': 3, 'model__min_child_weight': 87}",0.5
2,"{'model__alpha': 59.47859542273625, 'model__eta': 0.8663068331325768, 'model__max_depth': 4, 'model__min_child_weight': 104}",0.831022
3,"{'model__alpha': 725.0672962256506, 'model__eta': 0.0215409547505917, 'model__max_depth': 2, 'model__min_child_weight': 88}",0.5
4,"{'model__alpha': 852.4222407421319, 'model__eta': 0.2131083107655044, 'model__max_depth': 12, 'model__min_child_weight': 21}",0.5
5,"{'model__alpha': 632.3020424212817, 'model__eta': 0.6120324054487416, 'model__max_depth': 13, 'model__min_child_weight': 108}",0.5
6,"{'model__alpha': 23.616899804909735, 'model__eta': 0.5252387475042306, 'model__max_depth': 15, 'model__min_child_weight': 42}",0.877764
7,"{'model__alpha': 47.7866156932422, 'model__eta': 0.9737811482175905, 'model__max_depth': 15, 'model__min_child_weight': 62}",0.854456
8,"{'model__alpha': 92.7819655241085, 'model__eta': 0.6187586792458479, 'model__max_depth': 12, 'model__min_child_weight': 55}",0.828152
9,"{'model__alpha': 1006.8294036286511, 'model__eta': 0.46728363261004247, 'model__max_depth': 5, 'model__min_child_weight': 51}",0.5


In [5]:
history_DecisionTree.shape

(8000, 18)

In [None]:
history_RandomForest.head()

In [19]:
history_XGBoost.head()

Unnamed: 0,dataset,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__alpha,param_model__eta,param_model__max_depth,param_model__min_child_weight,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0,0.120718,0.009384,0.015755,0.004705,383.530058,0.950762,11,72,"{'model__alpha': 383.5300582621992, 'model__et...",0.5,0.5,0.5,0.5,0.5,0.5,0.0,72
1,0,0.130825,0.014063,0.021409,0.013045,613.027264,0.156843,3,87,"{'model__alpha': 613.0272643802655, 'model__et...",0.5,0.5,0.5,0.5,0.5,0.5,0.0,72
2,0,0.158108,0.035043,0.017295,0.002255,59.478595,0.866307,4,104,"{'model__alpha': 59.47859542273625, 'model__et...",0.840699,0.839393,0.830057,0.802664,0.842297,0.831022,0.014805,19
3,0,0.117092,0.014543,0.014069,0.002245,725.067296,0.021541,2,88,"{'model__alpha': 725.0672962256506, 'model__et...",0.5,0.5,0.5,0.5,0.5,0.5,0.0,72
4,0,0.113698,0.008851,0.015426,0.002006,852.422241,0.213108,12,21,"{'model__alpha': 852.4222407421319, 'model__et...",0.5,0.5,0.5,0.5,0.5,0.5,0.0,72


# New defaults below

In [19]:
def get_best_params_overall(df):
    df['params_str'] = df['params'].apply(lambda x: str(x))
    grouped_mean = df.groupby(['params_str'])['mean_test_score'].mean().reset_index()
    grouped_mean.sort_values(by='mean_test_score', ascending=False, inplace=True)
    return grouped_mean.iloc[0, 0], grouped_mean.iloc[0, 1]

In [20]:
best_params_DecisionTree, best_params_DecisionTree_score = get_best_params_overall(history_DecisionTree)
print(f"Best params for DecisionTree: {best_params_DecisionTree}") 
print(f"with score: {best_params_DecisionTree_score}")

Best params for DecisionTree: {'model__criterion': 'gini', 'model__max_depth': 17, 'model__min_samples_leaf': 10, 'model__min_samples_split': 58}
with score: 0.9035694410593584


In [21]:
best_params_RandomForest, best_params_RandomForest_score = get_best_params_overall(history_RandomForest)
print(f"Best params for RandomForest: {best_params_RandomForest}") 
print(f"with score: {best_params_RandomForest_score}")

Best params for RandomForest: {'model__max_features': 0.6325405332263061, 'model__max_samples': 0.43556377347004105, 'model__min_samples_leaf': 2, 'model__n_estimators': 375}
with score: 0.9398295218158871


In [22]:
best_params_XGBoost, best_params_XGBoost_score = get_best_params_overall(history_XGBoost)
print(f"Best params for XGBoost: {best_params_XGBoost}")
print(f"With score: {best_params_XGBoost_score}")

Best params for XGBoost: {'model__alpha': 5.148166434938389, 'model__eta': 0.06101428497741022, 'model__max_depth': 9, 'model__min_child_weight': 26}
With score: 0.9312130131165086


# Tunability

##### Now let's compute tunability of each of the ML algorithms. We'll start with looking for the optimal configuration of the hyperparameters for each of the datasets

In [23]:
def get_best_params_per_dataset(df):
    df['params_str'] = df['params'].apply(lambda x: str(x))
    best_params_per_dataset = df.sort_values(['dataset', 'rank_test_score'], ascending=[True, True]).groupby('dataset').first().reset_index()
    best_params_per_dataset.rename(columns={'params_str': 'best_params', 'mean_test_score': 'best_score'}, inplace=True)
    best_params_per_dataset = best_params_per_dataset[['dataset', 'best_params', 'best_score']]
    default_params, _ = get_best_params_overall(df)
    score_for_default_params = df[df['params_str'] == default_params][['dataset', 'mean_test_score']].rename(columns={'mean_test_score': 'default_score'})
    best_params_per_dataset = best_params_per_dataset.merge(score_for_default_params, on='dataset', how='left')
    best_params_per_dataset['abs_tunability'] = best_params_per_dataset['best_score'] - best_params_per_dataset['default_score']
    best_params_per_dataset['rel_tunability (%)'] = best_params_per_dataset['abs_tunability'] / best_params_per_dataset['default_score'] * 100 
    return best_params_per_dataset

In [24]:
best_params_per_dataset_DecisionTree = get_best_params_per_dataset(history_DecisionTree)
best_params_per_dataset_DecisionTree

Unnamed: 0,dataset,best_params,best_score,default_score,abs_tunability,rel_tunability (%)
0,0,"{'model__criterion': 'gini', 'model__max_depth': 16, 'model__min_samples_leaf': 1, 'model__min_samples_split': 55}",0.875537,0.86955,0.005987,0.688491
1,1,"{'model__criterion': 'entropy', 'model__max_depth': 10, 'model__min_samples_leaf': 1, 'model__min_samples_split': 58}",0.97676,0.971298,0.005462,0.56235
2,2,"{'model__criterion': 'gini', 'model__max_depth': 20, 'model__min_samples_leaf': 6, 'model__min_samples_split': 23}",0.987566,0.976419,0.011146,1.141524
3,3,"{'model__criterion': 'entropy', 'model__max_depth': 7, 'model__min_samples_leaf': 45, 'model__min_samples_split': 47}",0.81752,0.79701,0.02051,2.573347


In [25]:
best_params_per_dataset_RandomForest = get_best_params_per_dataset(history_RandomForest)
best_params_per_dataset_RandomForest

Unnamed: 0,dataset,best_params,best_score,default_score,abs_tunability,rel_tunability (%)
0,0,"{'model__max_features': 0.6325405332263061, 'model__max_samples': 0.43556377347004105, 'model__min_samples_leaf': 2, 'model__n_estimators': 375}",0.924874,0.924874,0.0,0.0
1,1,"{'model__max_features': 0.7758047357929561, 'model__max_samples': 0.9471137538091479, 'model__min_samples_leaf': 2, 'model__n_estimators': 225}",0.985769,0.985332,0.000437,0.044341
2,2,"{'model__max_features': 0.7758047357929561, 'model__max_samples': 0.9471137538091479, 'model__min_samples_leaf': 2, 'model__n_estimators': 225}",0.999979,0.99996,1.9e-05,0.001863
3,3,"{'model__max_features': 0.8791183075621647, 'model__max_samples': 0.9502588200622692, 'model__min_samples_leaf': 11, 'model__n_estimators': 1086}",0.849182,0.849151,3.1e-05,0.00364


In [26]:
best_params_per_dataset_XGBoost = get_best_params_per_dataset(history_XGBoost)
best_params_per_dataset_XGBoost

Unnamed: 0,dataset,best_params,best_score,default_score,abs_tunability,rel_tunability (%)
0,0,"{'model__alpha': 5.148166434938389, 'model__eta': 0.06101428497741022, 'model__max_depth': 9, 'model__min_child_weight': 26}",0.900224,0.900224,0.0,0.0
1,1,"{'model__alpha': 13.600928904086004, 'model__eta': 0.5424625491648044, 'model__max_depth': 13, 'model__min_child_weight': 20}",0.986025,0.985244,0.00078,0.07921
2,2,"{'model__alpha': 0.9526994373401294, 'model__eta': 0.9033181355211285, 'model__max_depth': 10, 'model__min_child_weight': 13}",0.998642,0.991926,0.006716,0.677032
3,3,"{'model__alpha': 5.148166434938389, 'model__eta': 0.06101428497741022, 'model__max_depth': 9, 'model__min_child_weight': 26}",0.847458,0.847458,0.0,0.0
