# Libraries & utilities

In [1]:
%matplotlib inline
from matplotlib import rcParams
import matplotlib.pyplot as plt
import seaborn as sns

import pandas as pd
import numpy as np

# To enable HalvingGridSearchCV (Note: method is still experimental)
from sklearn.experimental import enable_halving_search_cv

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, HalvingGridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

# Introducing the dataset

The dataset is located in `../data/bank-full.csv`. It has 17 columns; the target column ($y$) is the column named `y`.

Columns summary:

| column name | simple description |
|-------------|------------------------|
| age | age |
| job | type of job |
| marital | marital status |
| education | education level |
| default | has credit in default? |
| balance | average yearly balance, in euros  |
| housing | has housing loan? |
| loan | has personal loan? |
| contact | contact communication |
| month | last contact month of year |
| day | last contact day of the month |
| duration |  last contact duration in seconds |
| campaign |  number of contacts performed during campaign for this client |
| pdays | number of days that passed by after the client was last contacted from a previous campaign (-1 means client was not previously contacted) |
| previous | number of contacts performed before this campaign and for this client |
| poutcome | outcome of the previous marketing campaign |
| **output :** |  |
| y | has the client subscribed to a term deposit? |

In [2]:
## READING AND CLEANING DATA
# Load the semicolon-separated dataset and discard the `duration` column right away.
df = pd.read_csv("../data/bank-full.csv", sep=";").drop(columns=["duration"])

# Collapse `month` + `day` into a single day-of-year feature, then drop the two source columns.
# The year "1984" is only a placeholder needed to build a parseable date string; it is a leap
# year, so the day-of-year values are computed on a 366-day calendar.
df = df.assign(
    dayoftheyear=lambda frame: pd.to_datetime(
        "1984-" + frame["month"] + "-" + frame["day"].astype(str),
        format="%Y-%b-%d",
    ).dt.dayofyear.astype(int)
).drop(columns=["day", "month"])

In [None]:
# Create the X (features) and y (target) objects
# `df["y"]` keeps the string labels "0"/"1": the frame stays exactly as other cells expect it,
# and the target column keeps a non-numeric dtype inside `df`.
df["y"] = df["y"].replace({"no": "0", "yes": "1"})
# The model, however, must be trained on integer labels: `scoring="f1"` evaluates the positive
# class with `pos_label=1`, which is not a valid label when the classes are the strings "0"/"1".
y = df["y"].astype(int)
X = df.drop(columns="y")

# Train/test split; the fixed seed keeps the partition reproducible across runs
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=37)

  df["y"] = df["y"].replace({"no": 0, "yes":1})


# Modeling phase

![lala](../img/grid-search-phd.png)

## Pipeline

In [None]:
# Feature groups consumed by the preprocessing pipeline
twocat_features = ["default", "housing", "loan"]  # binary columns
multicat_features = ["job", "marital", "education", "contact", "poutcome"]  # multi-level categoricals
# Every numeric column of the feature matrix. Selecting from `X` (not `df`) guarantees the target
# can never end up in this list, and `select_dtypes` matches every integer/float width instead of
# only the platform's default `int` / `float` dtypes.
numerical_features = X.select_dtypes(include="number").columns.to_numpy()

In [None]:
# Pipeline creation
# One single-step sub-pipeline per feature group; the ColumnTransformer below routes each
# group of columns to its sub-pipeline.
num_pipeline = Pipeline(steps=[("scaler", StandardScaler())])

# Explicit category order for each of the three binary columns: "no" first, "yes" second.
twocat_pipeline = Pipeline(
    steps=[
        (
            "ordinal_encoder",
            OrdinalEncoder(
                categories=[
                    ["no", "yes"],
                    ["no", "yes"],
                    ["no", "yes"],
                ]
            ),
        )
    ]
)

# Unknown categories at transform time raise an error rather than being silently ignored.
multicat_pipeline = Pipeline(
    steps=[("onehot_encoder", OneHotEncoder(handle_unknown="error"))]
)

# Columns not listed in any group are passed through untouched.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, numerical_features),
        ("twocat", twocat_pipeline, twocat_features),
        ("multicat", multicat_pipeline, multicat_features),
    ],
    remainder="passthrough",
)

## Gridsearch

In [None]:
# Exhaustive search over forest size and tree depth: 10 x 10 = 100 candidates.
# The preprocessor is the first pipeline step, so it is fitted on the data passed to `fit`
# and the grid search then runs on the transformed features.
grid_search = Pipeline([
    ('preprocessor', preprocessor),
    ('gridsearch', GridSearchCV(
        RandomForestClassifier(
            max_samples=0.8,
            class_weight='balanced',
            n_jobs=-2,
            # Seed the forest (same seed as the train/test split) so that the candidate
            # ranking is repeatable on a fresh Restart & Run All.
            random_state=37,
        ),
        param_grid={
            "n_estimators" : np.linspace(1, 200, 10, dtype=int),
            "max_depth" : np.linspace(1, 30, 10, dtype=int)
            },
        verbose=1,
        n_jobs=-2,
        scoring="f1"
        )
    )
])

grid_search.fit(X_train, y_train)
# One row per candidate, best mean cross-validated F1 first
result_grid_search = pd.DataFrame(grid_search["gridsearch"].cv_results_).sort_values(by="rank_test_score")
result_grid_search

Fitting 5 folds for each of 100 candidates, totalling 500 fits


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
43,2.289895,0.187721,0.269454,0.087526,13,67,"{'max_depth': 13, 'n_estimators': 67}",0.434734,0.463866,0.444812,0.429851,0.424242,0.439501,0.013932,1
48,6.057443,0.729462,0.643763,0.228306,13,177,"{'max_depth': 13, 'n_estimators': 177}",0.428969,0.460126,0.446309,0.416519,0.434470,0.437279,0.014909,2
45,3.395758,0.472137,0.306625,0.046743,13,111,"{'max_depth': 13, 'n_estimators': 111}",0.433898,0.453363,0.450083,0.420299,0.419394,0.435407,0.014316,3
46,4.380652,0.577029,0.417021,0.146320,13,133,"{'max_depth': 13, 'n_estimators': 133}",0.423804,0.461538,0.443080,0.421175,0.426667,0.435253,0.015200,4
47,5.520803,0.564756,0.745998,0.147281,13,155,"{'max_depth': 13, 'n_estimators': 155}",0.432492,0.456200,0.442222,0.425659,0.417622,0.434839,0.013394,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70,0.349741,0.084480,0.047735,0.005618,23,1,"{'max_depth': 23, 'n_estimators': 1}",0.291691,0.318258,0.282881,0.276018,0.265871,0.286944,0.017794,96
90,0.450228,0.089710,0.040860,0.007474,30,1,"{'max_depth': 30, 'n_estimators': 1}",0.282216,0.293417,0.285009,0.254990,0.299805,0.283088,0.015363,97
60,0.304636,0.068396,0.031906,0.009860,20,1,"{'max_depth': 20, 'n_estimators': 1}",0.293473,0.261179,0.281024,0.285571,0.293994,0.283048,0.011977,98
80,0.334635,0.103225,0.033836,0.017121,26,1,"{'max_depth': 26, 'n_estimators': 1}",0.290383,0.296252,0.273452,0.267717,0.254914,0.276543,0.015068,99


In [32]:
# All candidates evaluated with max_depth == 13, ordered by increasing number of trees
(
    result_grid_search
    .loc[result_grid_search["param_max_depth"].eq(13)]
    .sort_values(by="param_n_estimators")
)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
40,0.296366,0.06941,0.046773,0.012299,13,1,"{'max_depth': 13, 'n_estimators': 1}",0.334654,0.348544,0.329327,0.322665,0.333636,0.333765,0.00851,61
41,0.73698,0.116217,0.08023,0.033131,13,23,"{'max_depth': 13, 'n_estimators': 23}",0.41639,0.450704,0.424209,0.396376,0.430861,0.423708,0.017781,17
42,1.286379,0.055698,0.128417,0.031047,13,45,"{'max_depth': 13, 'n_estimators': 45}",0.432967,0.450201,0.438889,0.424939,0.419664,0.433332,0.010704,6
43,2.289895,0.187721,0.269454,0.087526,13,67,"{'max_depth': 13, 'n_estimators': 67}",0.434734,0.463866,0.444812,0.429851,0.424242,0.439501,0.013932,1
44,2.893417,0.293981,0.292778,0.171543,13,89,"{'max_depth': 13, 'n_estimators': 89}",0.427273,0.456728,0.440963,0.417112,0.418968,0.432209,0.014869,8
45,3.395758,0.472137,0.306625,0.046743,13,111,"{'max_depth': 13, 'n_estimators': 111}",0.433898,0.453363,0.450083,0.420299,0.419394,0.435407,0.014316,3
46,4.380652,0.577029,0.417021,0.14632,13,133,"{'max_depth': 13, 'n_estimators': 133}",0.423804,0.461538,0.44308,0.421175,0.426667,0.435253,0.0152,4
47,5.520803,0.564756,0.745998,0.147281,13,155,"{'max_depth': 13, 'n_estimators': 155}",0.432492,0.4562,0.442222,0.425659,0.417622,0.434839,0.013394,5
48,6.057443,0.729462,0.643763,0.228306,13,177,"{'max_depth': 13, 'n_estimators': 177}",0.428969,0.460126,0.446309,0.416519,0.43447,0.437279,0.014909,2
49,6.338828,0.798536,0.626365,0.102589,13,200,"{'max_depth': 13, 'n_estimators': 200}",0.429613,0.460126,0.446927,0.409392,0.420158,0.433243,0.018239,7
