In [1]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer, make_column_selector

from sklearn.model_selection import cross_validate

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

import numpy as np
import os
import pandas as pd

# Data


In [2]:
data_path = "../data"
DATA = []
for file in os.listdir(data_path):
    DATA.append(pd.read_csv(os.path.join(data_path,file)))

In [3]:
for data in DATA:
    print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        5000 non-null   int64 
 1   job        5000 non-null   object
 2   marital    5000 non-null   object
 3   education  5000 non-null   object
 4   default    5000 non-null   object
 5   balance    5000 non-null   int64 
 6   housing    5000 non-null   object
 7   loan       5000 non-null   object
 8   contact    5000 non-null   object
 9   day        5000 non-null   int64 
 10  month      5000 non-null   object
 11  duration   5000 non-null   int64 
 12  campaign   5000 non-null   int64 
 13  pdays      5000 non-null   int64 
 14  previous   5000 non-null   int64 
 15  poutcome   5000 non-null   object
 16  y          5000 non-null   object
dtypes: int64(7), object(10)
memory usage: 664.2+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 

# Pipeline

In [4]:
num_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer()),
    ('scale', MinMaxScaler())
])

cat_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy="most_frequent")),
    ('one-hot', OneHotEncoder(handle_unknown='ignore'))
])


col_trans = ColumnTransformer([
    ('num_pipeline', num_pipeline, make_column_selector(dtype_include = np.number)),
    ('cat_pipeline', cat_pipeline, make_column_selector(dtype_include = np.object_))
])


In [5]:
classifiers = [
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    XGBClassifier(),
]

pipelines = []

In [6]:
for classifier in classifiers:
    pipelines.append((type(classifier), Pipeline([("transformer", col_trans), ("model", classifier)])))

In [20]:
pipelines

[(sklearn.tree._classes.DecisionTreeClassifier,
  Pipeline(steps=[('transformer',
                   ColumnTransformer(transformers=[('num_pipeline',
                                                    Pipeline(steps=[('impute',
                                                                     SimpleImputer()),
                                                                    ('scale',
                                                                     MinMaxScaler())]),
                                                    <sklearn.compose._column_transformer.make_column_selector object at 0x00000280296D4250>),
                                                   ('cat_pipeline',
                                                    Pipeline(steps=[('impute',
                                                                     SimpleImputer(strategy='most_frequent')),
                                                                    ('one-hot',
                                      

In [7]:
from sklearn.calibration import LabelEncoder


scores = []

for pipe in pipelines:
    pipe_score = []
    for data in DATA:
        score = cross_validate(pipe[1], data.iloc[:, :-1], LabelEncoder().fit_transform(data.iloc[:,-1]), cv = 5, scoring="accuracy")
        pipe_score.append(score["test_score"].mean())

    scores.append((pipe[0], pipe_score))

In [8]:
os.listdir("../data")

['banking_final.csv',
 'flights_final.csv',
 'mushrooms_final.csv',
 'weather_final.csv']

In [9]:
scores

[(sklearn.tree._classes.DecisionTreeClassifier,
  [0.8688, 0.9102, 0.9815999999999999, 0.791]),
 (sklearn.ensemble._forest.RandomForestClassifier,
  [0.8998000000000002, 0.944, 0.9996, 0.8366]),
 (xgboost.sklearn.XGBClassifier,
  [0.9016, 0.9469999999999998, 0.9974000000000001, 0.8368])]

# Random Searching - searching for new defaults

In [53]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform

param_distributions = [
    {
        "model__max_depth": randint(1,30),
        "model__min_samples_split":randint(2,60),
        "model__criterion":["gini","entropy"],
        "model__min_samples_leaf":randint(1,60)
    },
    {
        "model__n_estimators":randint(1,2000),
        "model__min_samples_leaf":randint(1,5000),
        "model__min_samples_split":randint(2,60),
        "model__max_features":uniform(0,1)
    },
    {
        "model__max_depth": randint(1,15),
        "model__min_child_weight": randint(1,128),
        "model__eta": uniform(0,1),
        "model__alpha": uniform(2**(-10),2**(10))
    }
]
best_params = [[],[],[],[]]
pipe_best_models = []
pipe_best_scores = []
history = [[],[],[]]
for i,pipe in enumerate(pipelines):    
    for j,data in enumerate(DATA):
        rs = RandomizedSearchCV(pipe[1], 
                                param_distributions= param_distributions[i],
                                verbose=True,
                                random_state=42,
                                cv=5,
                                n_iter=200
                                )
        rs.fit(data.iloc[:, :-1],LabelEncoder().fit_transform(data.iloc[:,-1]))
        pipe_best_scores.append(rs.best_score_)
        pipe_best_models.append(rs.best_estimator_)
        best_params[j].append(rs.best_params_)
        history[i].append(rs.cv_results_)    


Fitting 5 folds for each of 200 candidates, totalling 1000 fits
Fitting 5 folds for each of 200 candidates, totalling 1000 fits
Fitting 5 folds for each of 200 candidates, totalling 1000 fits
Fitting 5 folds for each of 200 candidates, totalling 1000 fits
Fitting 5 folds for each of 200 candidates, totalling 1000 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Fitting 5 folds for each of 200 candidates, totalling 1000 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Fitting 5 folds for each of 200 candidates, totalling 1000 fits
Fitting 5 folds for each of 200 candidates, totalling 1000 fits
Fitting 5 folds for each of 200 candidates, totalling 1000 fits
Fitting 5 folds for each of 200 candidates, totalling 1000 fits
Fitting 5 folds for each of 200 candidates, totalling 1000 fits
Fitting 5 folds for each of 200 candidates, totalling 1000 fits


In [54]:
history_datasets = []
for h in history:
    df = pd.concat([pd.DataFrame(h[i]) for i in range(len(h))], keys=range(len(h)), names=['dataset'])
    df = df.reset_index()
    df.drop(columns='level_1', inplace=True)
    history_datasets.append(df)
print(f"{pipelines[0][0]} shape: {history_datasets[0].shape}")
print(f"{pipelines[1][0]} shape: {history_datasets[1].shape}")
print(f"{pipelines[2][0]} shape: {history_datasets[2].shape}")

<class 'sklearn.tree._classes.DecisionTreeClassifier'> shape: (800, 18)
<class 'sklearn.ensemble._forest.RandomForestClassifier'> shape: (800, 18)
<class 'xgboost.sklearn.XGBClassifier'> shape: (800, 18)


In [55]:
model_names = ['DecisionTree','RandomForest','XGBoost']

# Saving history to csv files

In [58]:
for i, df in enumerate(history_datasets):
    df.to_csv(f'../data/history_dataset_{model_names[i]}.csv', index=False)

# Reading history from csv

In [62]:
history_DecisionTree = pd.read_csv('../data/history_dataset_DecisionTree.csv')
history_RandomForest = pd.read_csv('../data/history_dataset_RandomForest.csv')
history_XGBoost = pd.read_csv('../data/history_dataset_XGBoost.csv')

In [63]:
history_DecisionTree.head()

Unnamed: 0,dataset,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__criterion,param_model__max_depth,param_model__min_samples_leaf,param_model__min_samples_split,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0,0.034176,0.006215,0.004794,0.002535,gini,20,29,16,"{'model__criterion': 'gini', 'model__max_depth...",0.902,0.894,0.891,0.884,0.886,0.8914,0.006375,121
1,0,0.028392,0.001495,0.004094,0.002321,gini,8,21,40,"{'model__criterion': 'gini', 'model__max_depth...",0.904,0.889,0.885,0.887,0.889,0.8908,0.006765,134
2,0,0.031333,0.001139,0.005442,0.002487,entropy,19,23,12,"{'model__criterion': 'entropy', 'model__max_de...",0.889,0.898,0.895,0.89,0.874,0.8892,0.00828,151
3,0,0.026831,0.00397,0.006345,0.003212,gini,24,53,37,"{'model__criterion': 'gini', 'model__max_depth...",0.901,0.903,0.891,0.896,0.891,0.8964,0.004964,24
4,0,0.034022,0.001552,0.007549,0.001566,entropy,24,3,23,"{'model__criterion': 'entropy', 'model__max_de...",0.887,0.89,0.872,0.881,0.87,0.88,0.007925,200


In [64]:
history_RandomForest.head()

Unnamed: 0,dataset,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__max_features,param_model__min_samples_leaf,param_model__min_samples_split,param_model__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0,1.640918,0.006159,0.049864,0.000418,0.37454,861,16,1131,"{'model__max_features': 0.3745401188473625, 'm...",0.883,0.883,0.883,0.883,0.883,0.883,0.0,3
1,0,0.136891,0.001464,0.009309,0.000396,0.779691,3093,40,122,"{'model__max_features': 0.7796910002727693, 'm...",0.883,0.883,0.883,0.883,0.883,0.883,0.0,3
2,0,0.101864,0.000262,0.008216,0.000592,0.155995,4427,12,88,"{'model__max_features': 0.15599452033620265, '...",0.883,0.883,0.883,0.883,0.883,0.883,0.0,3
3,0,0.144345,0.000598,0.009973,0.000584,0.333709,2920,25,131,"{'model__max_features': 0.33370861113902184, '...",0.883,0.883,0.883,0.883,0.883,0.883,0.0,3
4,0,1.595966,0.040971,0.0622,0.005089,0.020584,770,25,1516,"{'model__max_features': 0.020584494295802447, ...",0.883,0.883,0.883,0.883,0.883,0.883,0.0,3


In [65]:
history_XGBoost.head()

Unnamed: 0,dataset,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__alpha,param_model__eta,param_model__max_depth,param_model__min_child_weight,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0,0.042777,0.001839,0.008124,6.8e-05,383.530058,0.950714,11,72,"{'model__alpha': 383.5300582621992, 'model__et...",0.883,0.883,0.883,0.883,0.883,0.883,0.0,7
1,0,0.043513,0.000703,0.007536,0.000553,613.027264,0.156019,3,87,"{'model__alpha': 613.0272643802655, 'model__et...",0.883,0.883,0.883,0.883,0.883,0.883,0.0,7
2,0,0.043707,0.001379,0.008389,0.000387,59.478595,0.866176,4,104,"{'model__alpha': 59.47859542273625, 'model__et...",0.883,0.883,0.883,0.883,0.883,0.883,0.0,7
3,0,0.04349,0.000772,0.0081,8e-06,725.067296,0.020584,2,88,"{'model__alpha': 725.0672962256506, 'model__et...",0.883,0.883,0.883,0.883,0.883,0.883,0.0,7
4,0,0.045204,0.001118,0.008311,0.00047,852.422241,0.212339,12,21,"{'model__alpha': 852.4222407421319, 'model__et...",0.883,0.883,0.883,0.883,0.883,0.883,0.0,7


# New defaults below

In [80]:
df = history_DecisionTree
df['params_str'] = df['params'].apply(lambda x: str(x))
grouped_mean = df.groupby(['params_str'])['mean_test_score'].mean().reset_index()
grouped_mean.sort_values(by='mean_test_score', ascending=False, inplace=True)
best_params_DecisionTree = grouped_mean.iloc[0, 0]
best_params_DecisionTree_score = grouped_mean.iloc[0, 1]
print(f"Best params for DecisionTree: {best_params_DecisionTree}") 
print(f"with score: {best_params_DecisionTree_score}")

Best params for DecisionTree: {'model__criterion': 'entropy', 'model__max_depth': 28, 'model__min_samples_leaf': 5, 'model__min_samples_split': 48}
with score: 0.8935500000000001


In [81]:
df = history_RandomForest
df['params_str'] = df['params'].apply(lambda x: str(x))
grouped_mean = df.groupby(['params_str'])['mean_test_score'].mean().reset_index()
grouped_mean.sort_values(by='mean_test_score', ascending=False, inplace=True)
best_params_RandomForest = grouped_mean.iloc[0, 0]
best_params_RandomForest_score = grouped_mean.iloc[0, 1]
print(f"Best params for RandomForest: {best_params_RandomForest}") 
print(f"with score: {best_params_RandomForest_score}")

Best params for RandomForest: {'model__max_features': 0.9191765518355596, 'model__min_samples_leaf': 10, 'model__min_samples_split': 57, 'model__n_estimators': 798}
with score: 0.90745


In [82]:
df = history_XGBoost
df['params_str'] = df['params'].apply(lambda x: str(x))
grouped_mean = df.groupby(['params_str'])['mean_test_score'].mean().reset_index()
grouped_mean.sort_values(by='mean_test_score', ascending=False, inplace=True)
best_params_XGBoost = grouped_mean.iloc[0, 0]
best_params_XGBoost_score = grouped_mean.iloc[0, 1]
print(f"Best params for XGBoost: {best_params_XGBoost}")
print(f"With score: {best_params_XGBoost_score}")

Best params for XGBoost: {'model__alpha': 14.73990891937001, 'model__eta': 0.11607264050691624, 'model__max_depth': 5, 'model__min_child_weight': 47}
With score: 0.88395
