# Machine Learning Project
by Alexandre Waerniers and Vincent Lamy,

students at Albert School x Mines Paris PSL

# Imports

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import os

from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer, OrdinalEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score, accuracy_score

from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.under_sampling import RandomUnderSampler


from utils import train_k_fold, bool_contact, train_ts

# Get project path
cwd = os.getcwd()
print(cwd)

d:\ALBERTSCHOOL\SupervisedML\supervised_ml_project_waerniers_lamy


# Download datasets

In [3]:
# raw dataset
bank_full = pd.read_csv(os.path.join(cwd, "data", "bank-additional-full.csv"), sep=";")

bank_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             41188 non-null  int64  
 1   job             41188 non-null  object 
 2   marital         41188 non-null  object 
 3   education       41188 non-null  object 
 4   default         41188 non-null  object 
 5   housing         41188 non-null  object 
 6   loan            41188 non-null  object 
 7   contact         41188 non-null  object 
 8   month           41188 non-null  object 
 9   day_of_week     41188 non-null  object 
 10  duration        41188 non-null  int64  
 11  campaign        41188 non-null  int64  
 12  pdays           41188 non-null  int64  
 13  previous        41188 non-null  int64  
 14  poutcome        41188 non-null  object 
 15  emp.var.rate    41188 non-null  float64
 16  cons.price.idx  41188 non-null  float64
 17  cons.conf.idx   41188 non-null 

# Define metrics, parameters and models

In [4]:
# metrics = {
#     'precision': make_scorer(precision_score, pos_label=1, average='binary', zero_division=0),
#     'recall': make_scorer(recall_score, pos_label=1, average='binary', zero_division=0),
#     'f1': make_scorer(f1_score, pos_label=1, average='binary', zero_division=0),
#     'accuracy': make_scorer(accuracy_score)
# }

metrics = ['precision', 'recall', 'f1', 'accuracy']

In [5]:
# First param grid (basic)
param_grids = {
    'Logistic Regression': {
        'classifier__C': [0.1, 1, 10, 100],
        'classifier__penalty': ['l2'],
        'classifier__solver': ['lbfgs', 'liblinear']
    },

    'Random Forest': {
        'classifier__n_estimators': [50, 100, 200],
        'classifier__max_depth': [None, 5, 10],
        'classifier__min_samples_split': [2, 5],
        'classifier__min_samples_leaf': [1, 2]
    },

    'Gradient Boosting': {
        'classifier__n_estimators': [50, 100, 200],
        'classifier__learning_rate': [0.01, 0.1, 0.2],
        'classifier__max_depth': [3, 5, 7],
        'classifier__subsample': [0.8, 1.0]
    },

    'XGBoost': {
        'classifier__n_estimators': [50, 100],
        'classifier__max_depth': [3, 5],
        'classifier__learning_rate': [0.01, 0.1]
    },

    'CatBoost': {
        'classifier__iterations': [100, 200],
        'classifier__depth': [3, 5],
        'classifier__learning_rate': [0.01, 0.1]
    }
}

In [6]:
# # Second param grid (improved but too long to compute so not used)

# param_grids = {
#     'Logistic Regression': {
#         'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100],
#         'classifier__penalty': ['l1', 'l2', 'elasticnet'],
#         'classifier__solver': ['lbfgs', 'liblinear', 'saga'],
#         'classifier__class_weight': [None, 'balanced'],
#         'classifier__l1_ratio': [0, 0.5, 1]  # only used with elasticnet
#     },

#     'Random Forest': {
#         'classifier__n_estimators': [100, 300, 500],
#         'classifier__max_depth': [None, 5, 10, 20],
#         'classifier__min_samples_split': [2, 5, 10],
#         'classifier__min_samples_leaf': [1, 2, 4],
#         'classifier__max_features': ['sqrt', 'log2', None],
#         'classifier__bootstrap': [True, False],
#         'classifier__class_weight': [None, 'balanced']
#     },

#     'Gradient Boosting': {
#         'classifier__n_estimators': [100, 300, 500],
#         'classifier__learning_rate': [0.01, 0.05, 0.1],
#         'classifier__max_depth': [3, 5, 7],
#         'classifier__min_samples_split': [2, 5, 10],
#         'classifier__min_samples_leaf': [1, 2, 4],
#         'classifier__subsample': [0.6, 0.8, 1.0],
#         'classifier__max_features': ['sqrt', 'log2', None],
#         'classifier__loss': ['log_loss', 'exponential']
#     },

#     'XGBoost': {
#         'classifier__n_estimators': [100, 300, 500],
#         'classifier__max_depth': [3, 5, 7, 10],
#         'classifier__learning_rate': [0.01, 0.05, 0.1],
#         'classifier__subsample': [0.6, 0.8, 1.0],
#         'classifier__colsample_bytree': [0.6, 0.8, 1.0],
#         'classifier__gamma': [0, 0.1, 0.3],
#         'classifier__reg_alpha': [0, 0.1, 1],
#         'classifier__reg_lambda': [1, 1.5, 2],
#         'classifier__scale_pos_weight': [1, 2]  # useful for class imbalance
#     },

#     'CatBoost': {
#         'classifier__iterations': [200, 500, 800],
#         'classifier__depth': [4, 6, 8, 10],
#         'classifier__learning_rate': [0.01, 0.05, 0.1],
#         'classifier__l2_leaf_reg': [1, 3, 5, 7],
#         'classifier__border_count': [32, 64, 128],
#         'classifier__bagging_temperature': [0, 0.5, 1.0],
#         'classifier__random_strength': [0, 0.5, 1.0],
#         'classifier__boosting_type': ['Ordered', 'Plain']
#     }
# }


In [14]:
# Models to use during training
models = {
    'Logistic Regression': LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42),
    'Random Forest': RandomForestClassifier(class_weight="balanced", random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(),
    'XGBoost': XGBClassifier(),
    'CatBoost': CatBoostClassifier(verbose=0)
}

# Data Preparation

In [8]:
bank_full.y = bank_full.y.map({"yes": 1, "no":0})

bank_stable = bank_full.iloc[39130:].reset_index(drop=True)

# chronological train/test split
train_size = 0.8
split = int(len(bank_stable)*train_size)
train_set = bank_stable.iloc[:split].copy()
test_set = bank_stable.iloc[split:].copy()

# Split X and y from train dataset
X_train = train_set.drop(columns=['y'])

# Map target
y_train = train_set.y

In [9]:
print(len(X_train), len(test_set))

1646 412


# Data preprocessing, feature engineering and training

In [16]:
# Columns to drop
drop_cols = ["duration", "pdays", "emp.var.rate", "cons.conf.idx", "euribor3m", "month"]

# Columns on which we will apply Standard Scaler
std_cols = ["age", "cons.price.idx",  "nr.employed"]

# Columns on which we will apply Min Max Scaler
minmax_cols = ["campaign"]

# Columns on which we will apply One-Hot Encoder
onehot_cols = ['job', 'marital', 'default', 'housing', 'loan', 'poutcome']

# Columns on which we will apply Ordinal Encoder...
ordinal_cols = ["education", "day_of_week"]
# ... and their respective orders
ordinal_categories = [["unknown", "illiterate","basic.4y","basic.6y","basic.9y","high.school","professional.course","university.degree"],
                      ["mon", "tue", "wed", "thu", "fri"]
                      ]

# Column on which we will apply feature engineering
bool_cols = ["contact"]

# Feature engineering functions
ft_eng_contact = FunctionTransformer(bool_contact, validate=False)

# Column transformers for preprocessing
preprocessor = ColumnTransformer(transformers=[('drop_cols', 'drop', drop_cols),
                                               ('std_scale', StandardScaler(), std_cols),
                                               ('minmax_scale', MinMaxScaler(), minmax_cols),
                                               ('one_hot', OneHotEncoder(), onehot_cols),
                                               ('ordinal', OrdinalEncoder(categories = ordinal_categories), ordinal_cols)])


# Training for each selected model
for model_name, model in tqdm(models.items(), desc="Evaluating models"):

    pipeline = ImbPipeline(steps=[('fetaure_eng', ft_eng_contact),
                                  ('preprocessor', preprocessor),
                                  ('classifier', model)])

    train_ts(X_train=X_train,           # training set features
                 y_train=y_train,           # training set target
                 pipeline=pipeline,         # pipeline to use
                 n_folds=5,                 # number of folds
                 model_name = model_name,   # current model name
                 param_grids=param_grids,   # put {} to avoid K fold and process a classic train/val training
                 scoring_metrics=metrics,
                 refit_metric='recall',     # optimizing metric, choose from 'recall', 'precision', 'f1', 'accuracy', etc...
                 logs=pd.read_csv(os.path.join(cwd, 'data', 'logs.csv')))

pd.read_csv(os.path.join(cwd, 'data', 'logs.csv')).tail(10)

Evaluating models:   0%|          | 0/5 [00:00<?, ?it/s]

Fold 0: [124 150]
Fold 1: [137 137]
Fold 2: [123 151]
Fold 3: [112 162]
Fold 4: [143 131]


Evaluating models:  20%|██        | 1/5 [00:01<00:04,  1.04s/it]


Model: Logistic Regression
Grid Search: True
Precision: 0.7093 ± 0.0750
Recall   : 0.5473 ± 0.1152
F1       : 0.6044 ± 0.0590
Accuracy : 0.6285 ± 0.0297
Time     : 1.00s
Pipeline saved at: saved_pipelines\Logistic_Regression_pipeline_1763047673.pkl

######################################################################

Fold 0: [124 150]
Fold 1: [137 137]
Fold 2: [123 151]
Fold 3: [112 162]
Fold 4: [143 131]


Evaluating models:  40%|████      | 2/5 [00:16<00:29,  9.73s/it]


Model: Random Forest
Grid Search: True
Precision: 0.6753 ± 0.0745
Recall   : 0.6528 ± 0.0932
F1       : 0.6542 ± 0.0400
Accuracy : 0.6380 ± 0.0235
Time     : 15.66s
Pipeline saved at: saved_pipelines\Random_Forest_pipeline_1763047689.pkl

######################################################################

Fold 0: [124 150]
Fold 1: [137 137]
Fold 2: [123 151]
Fold 3: [112 162]
Fold 4: [143 131]


Evaluating models:  60%|██████    | 3/5 [01:02<00:52, 26.26s/it]


Model: Gradient Boosting
Grid Search: True
Precision: 0.6096 ± 0.0831
Recall   : 0.7593 ± 0.1883
F1       : 0.6518 ± 0.0639
Accuracy : 0.5876 ± 0.0318
Time     : 45.75s
Pipeline saved at: saved_pipelines\Gradient_Boosting_pipeline_1763047735.pkl

######################################################################

Fold 0: [124 150]
Fold 1: [137 137]
Fold 2: [123 151]
Fold 3: [112 162]
Fold 4: [143 131]


Evaluating models:  80%|████████  | 4/5 [01:04<00:16, 16.51s/it]


Model: XGBoost
Grid Search: True
Precision: 0.6003 ± 0.0855
Recall   : 0.7706 ± 0.1891
F1       : 0.6498 ± 0.0643
Accuracy : 0.5781 ± 0.0324
Time     : 1.51s
Pipeline saved at: saved_pipelines\XGBoost_pipeline_1763047736.pkl

######################################################################

Fold 0: [124 150]
Fold 1: [137 137]
Fold 2: [123 151]
Fold 3: [112 162]
Fold 4: [143 131]


Evaluating models: 100%|██████████| 5/5 [01:10<00:00, 14.09s/it]


Model: CatBoost
Grid Search: True
Precision: 0.6423 ± 0.0770
Recall   : 0.6550 ± 0.0879
F1       : 0.6405 ± 0.0415
Accuracy : 0.6124 ± 0.0386
Time     : 6.05s
Pipeline saved at: saved_pipelines\CatBoost_pipeline_1763047742.pkl

######################################################################






Unnamed: 0,Model,Folds,Grid_search,Precision_mean,Precision_std,Recall_mean,Recall_std,F1_mean,F1_std,Accuracy_mean,Accuracy_std,Time,Pipeline_file,Best_params,Timestamp
139,Logistic Regression,5,True,0.728229,0.078058,0.542745,0.124906,0.606003,0.047462,0.633577,0.032529,1.385964,saved_pipelines\Logistic_Regression_pipeline_1...,,
140,Random Forest,5,True,0.683737,0.073806,0.651695,0.093854,0.658451,0.045599,0.645255,0.031259,16.443338,saved_pipelines\Random_Forest_pipeline_1763047...,,
141,Gradient Boosting,5,True,0.622709,0.069398,0.711275,0.073707,0.657021,0.025121,0.605109,0.048823,39.338586,saved_pipelines\Gradient_Boosting_pipeline_176...,,
142,XGBoost,5,True,0.600321,0.085541,0.770572,0.189094,0.649834,0.064262,0.578102,0.032365,1.838812,saved_pipelines\XGBoost_pipeline_1763047439.pkl,,
143,CatBoost,5,True,0.678687,0.060576,0.626971,0.098286,0.643474,0.037826,0.635766,0.018523,6.308091,saved_pipelines\CatBoost_pipeline_1763047446.pkl,,
144,Logistic Regression,5,True,0.709309,0.074985,0.547267,0.115182,0.604364,0.058985,0.628467,0.029686,0.999099,saved_pipelines\Logistic_Regression_pipeline_1...,,
145,Random Forest,5,True,0.675317,0.07452,0.652817,0.093202,0.654243,0.040044,0.637956,0.023471,15.655715,saved_pipelines\Random_Forest_pipeline_1763047...,,
146,Gradient Boosting,5,True,0.609638,0.083082,0.759345,0.188335,0.651797,0.063867,0.587591,0.031817,45.750493,saved_pipelines\Gradient_Boosting_pipeline_176...,,
147,XGBoost,5,True,0.600321,0.085541,0.770572,0.189094,0.649834,0.064262,0.578102,0.032365,1.505121,saved_pipelines\XGBoost_pipeline_1763047736.pkl,,
148,CatBoost,5,True,0.642276,0.077009,0.655046,0.087855,0.640494,0.04154,0.612409,0.038583,6.045561,saved_pipelines\CatBoost_pipeline_1763047742.pkl,,


After multiple training attempts, we have decided that the model that seems best for our use case is :

    Random_Forest_pipeline_1762872489.pkl

It maximizes recall while maintaining a good accuracy.

Even though accuracy is not our main concern, it is reassuring that it doesn't drop compared to the other attempts.

In [11]:
trained_models = pd.read_csv(os.path.join(cwd, 'data', 'logs.csv'))
trained_models[trained_models.Pipeline_file == "saved_pipelines\Random_Forest_pipeline_1762872489.pkl"]

Unnamed: 0,Model,Folds,Grid_search,Precision_mean,Precision_std,Recall_mean,Recall_std,F1_mean,F1_std,Accuracy_mean,Accuracy_std,Time,Pipeline_file,Best_params,Timestamp
51,Random Forest,5,True,0.374057,0.014647,0.618067,0.021961,0.465846,0.014598,0.840463,0.004512,62.044769,saved_pipelines\Random_Forest_pipeline_1762872...,,
