# Machine Learning Project
by Alexandre Waerniers and Vincent Lamy,

students at Albert School x Mines Paris PSL

# Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import os

from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer, OrdinalEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score, accuracy_score

from imblearn.pipeline import Pipeline as ImbPipeline


from utils import bool_contact, train_ts

# Get project path
cwd = os.getcwd()
print(cwd)

d:\ALBERTSCHOOL\SupervisedML\supervised_ml_project_waerniers_lamy


# Download datasets

In [2]:
# raw dataset
bank_full = pd.read_csv(os.path.join(cwd, "data", "bank-additional-full.csv"), sep=";")
bank_full.y = bank_full.y.map({"yes": 1, "no":0})

bank_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             41188 non-null  int64  
 1   job             41188 non-null  object 
 2   marital         41188 non-null  object 
 3   education       41188 non-null  object 
 4   default         41188 non-null  object 
 5   housing         41188 non-null  object 
 6   loan            41188 non-null  object 
 7   contact         41188 non-null  object 
 8   month           41188 non-null  object 
 9   day_of_week     41188 non-null  object 
 10  duration        41188 non-null  int64  
 11  campaign        41188 non-null  int64  
 12  pdays           41188 non-null  int64  
 13  previous        41188 non-null  int64  
 14  poutcome        41188 non-null  object 
 15  emp.var.rate    41188 non-null  float64
 16  cons.price.idx  41188 non-null  float64
 17  cons.conf.idx   41188 non-null 

# Define metrics, parameters and models

In [3]:
metrics = ['precision', 'recall', 'f1', 'accuracy']

In [4]:
# # First param grid (basic)
# param_grids = {
#     'Logistic Regression': {
#         'classifier__C': [0.1, 1, 10, 100],
#         'classifier__penalty': ['l2'],
#         'classifier__solver': ['lbfgs', 'liblinear']
#     },

#     'Random Forest': {
#         'classifier__n_estimators': [50, 100, 200],
#         'classifier__max_depth': [None, 5, 10],
#         'classifier__min_samples_split': [2, 5],
#         'classifier__min_samples_leaf': [1, 2]
#     },

#     'Gradient Boosting': {
#         'classifier__n_estimators': [50, 100, 200],
#         'classifier__learning_rate': [0.01, 0.1, 0.2],
#         'classifier__max_depth': [3, 5, 7],
#         'classifier__subsample': [0.8, 1.0]
#     },

#     'XGBoost': {
#         'classifier__n_estimators': [50, 100],
#         'classifier__max_depth': [3, 5],
#         'classifier__learning_rate': [0.01, 0.1]
#     },

#     'CatBoost': {
#         'classifier__iterations': [100, 200],
#         'classifier__depth': [3, 5],
#         'classifier__learning_rate': [0.01, 0.1]
#     }
# }

In [5]:
# # Second Params Grid (complex)
# param_grids = {
#     'Logistic Regression': [
#         # liblinear supports l1, l2
#         {
#             'classifier__solver': ['liblinear'],
#             'classifier__penalty': ['l1', 'l2'],
#             'classifier__C': [0.01, 0.1, 1],
#             'classifier__class_weight': [None, 'balanced']
#         },
#         # lbfgs supports only l2
#         {
#             'classifier__solver': ['lbfgs'],
#             'classifier__penalty': ['l2'],
#             'classifier__C': [0.01, 0.1, 1],
#             'classifier__class_weight': [None, 'balanced']
#         },
#         # saga supports l1, l2, elasticnet
#         {
#             'classifier__solver': ['saga'],
#             'classifier__penalty': ['l1', 'l2', 'elasticnet'],
#             'classifier__C': [0.01, 0.1, 1],
#             'classifier__l1_ratio': [0.5],  # only used with elasticnet
#             'classifier__class_weight': [None, 'balanced']
#         }
#     ],

#     'Random Forest': {
#         'classifier__n_estimators': [100, 300],
#         'classifier__max_depth': [None, 10],
#         'classifier__min_samples_split': [2, 5],
#         'classifier__min_samples_leaf': [1, 2],
#         'classifier__max_features': ['sqrt', 'log2'],
#         'classifier__bootstrap': [True],
#         'classifier__class_weight': [None, 'balanced']
#     },

#     'Gradient Boosting': {
#         'classifier__n_estimators': [100, 300],
#         'classifier__learning_rate': [0.05, 0.1],
#         'classifier__max_depth': [3, 5],
#         'classifier__min_samples_split': [2, 5],
#         'classifier__min_samples_leaf': [1, 2],
#         'classifier__subsample': [0.8, 1.0],
#         'classifier__max_features': ['sqrt', None],
#         'classifier__loss': ['log_loss']  # keep only main loss for binary classification
#     },

#     'XGBoost': {
#         'classifier__n_estimators': [100, 300],
#         'classifier__max_depth': [3, 5],
#         'classifier__learning_rate': [0.05, 0.1],
#         'classifier__subsample': [0.8, 1.0],
#         'classifier__colsample_bytree': [0.8, 1.0],
#         'classifier__gamma': [0, 0.1],
#         'classifier__reg_alpha': [0, 0.1],
#         'classifier__reg_lambda': [1, 1.5],
#         'classifier__scale_pos_weight': [1]
#     },

#     'CatBoost': {
#         'classifier__iterations': [200, 500],
#         'classifier__depth': [6, 8],
#         'classifier__learning_rate': [0.05, 0.1],
#         'classifier__l2_leaf_reg': [3, 5],
#         'classifier__border_count': [32, 64],
#         'classifier__bagging_temperature': [0, 0.5],
#         'classifier__random_strength': [0, 0.5],
#         'classifier__boosting_type': ['Ordered']
#     }
# }


In [None]:
# Third params grid (used grid, more evolved than the 1st)

param_grids = {
    'Logistic Regression': {
        'classifier__C': [0.1, 1.0, 10.0],  # 3 values
        'classifier__penalty': ['l2'],
        'classifier__solver': ['lbfgs']  # Removed liblinear (lbfgs is faster)
    },  # Total: 3 combinations

    'Random Forest': {
        'classifier__n_estimators': [100, 200],  # 2 values
        'classifier__max_depth': [10, 20],  # 2 values (removed None - too slow)
        'classifier__min_samples_split': [5, 10],  # 2 values
        'classifier__class_weight': ['balanced']  # Fixed
    },  # Total: 8 combinations

    'Gradient Boosting': {
        'classifier__n_estimators': [50, 100],  # 2 values
        'classifier__learning_rate': [0.05, 0.1],  # 2 values
        'classifier__max_depth': [3, 5],  # 2 values
        'classifier__subsample': [0.8]  # Fixed
    },  # Total: 8 combinations

    'XGBoost': {
        'classifier__n_estimators': [50, 100],  # 2 values
        'classifier__max_depth': [3, 5],  # 2 values
        'classifier__learning_rate': [0.05, 0.1],  # 2 values
        'classifier__scale_pos_weight': [1]  # Fixed (or use balanced ratio)
    },  # Total: 8 combinations

    'CatBoost': {
        'classifier__iterations': [100, 200],  # 2 values
        'classifier__depth': [4, 6],  # 2 values
        'classifier__learning_rate': [0.05, 0.1],  # 2 values
        'classifier__auto_class_weights': ['Balanced']  # Fixed
    }  # Total: 8 combinations
}

In [7]:
# Models to use during training
models = {
    'Logistic Regression': LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42),
    'Random Forest': RandomForestClassifier(class_weight="balanced", random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(),
    'XGBoost': XGBClassifier(),
    'CatBoost': CatBoostClassifier(verbose=0)
}

# Data Preparation

In [None]:
split_idx = 36224 # 39130 or 36224

bank_stable = bank_full.iloc[split_idx:].reset_index(drop=True)

# chronological train/test split
train_size = 0.8
split = int(len(bank_stable)*train_size)
train_set = bank_stable.iloc[:split].copy()
test_set = bank_stable.iloc[split:].copy()

# Split X and y from train dataset
X_train = train_set.drop(columns=['y'])

# Map target
y_train = train_set.y

print(len(X_train), len(test_set))

# Data preprocessing, feature engineering and training

keep cons.price.idx, remove emp.var.rate

keep eurobr3m, remove nr.employed

keep cons.price.idx, remove cons.conf.idx since its still higly correlated with emp.var.rate and euribor3m

In [18]:
# Columns to drop
drop_cols = ["duration", "month", 'emp.var.rate', 'cons.conf.idx', 'nr.employed']

# Columns on which we will apply Standard Scaler
std_cols = ["age", "campaign", "pdays", "previous", 'euribor3m', 'cons.price.idx',]

# Columns on which we will apply Min Max Scaler
minmax_cols = ["campaign"]

# Columns on which we will apply One-Hot Encoder
onehot_cols = ['job', 'marital', 'default', 'housing', 'loan', 'poutcome', "contact", "education", "day_of_week"]

# Columns on which we will apply Ordinal Encoder...
ordinal_cols = ["education", "day_of_week"]
# ... and their respective orders
ordinal_categories = [["unknown", "illiterate","basic.4y","basic.6y","basic.9y","high.school","professional.course","university.degree"],
                      ["mon", "tue", "wed", "thu", "fri"]
                      ]

# Column on which we will apply feature engineering
bool_cols = ["contact"]

# Feature engineering functions
ft_eng_contact = FunctionTransformer(bool_contact, validate=False)

# Column transformers for preprocessing
preprocessor = ColumnTransformer(transformers=[('drop_cols', 'drop', drop_cols),
                                               ('std_scale', StandardScaler(), std_cols),
                                               # ('minmax_scale', MinMaxScaler(), minmax_cols),
                                               ('one_hot', OneHotEncoder(), onehot_cols),
                                               # ('ordinal', OrdinalEncoder(categories = ordinal_categories), ordinal_cols)
                                               ])


# Training for each selected model
for model_name, model in tqdm(models.items(), desc="Evaluating models"):

    pipeline = ImbPipeline(steps=[# ('fetaure_eng', ft_eng_contact),
                                  ('preprocessor', preprocessor),
                                  ('classifier', model)])

    train_ts(X_train=X_train,               # training set features
                 y_train=y_train,           # training set target
                 pipeline=pipeline,         # pipeline to use
                 n_folds=5,                 # number of folds
                 model_name = model_name,   # current model name
                 param_grids=param_grids,   # put {} to avoid K fold and process a classic train/val training
                 scoring_metrics=metrics,
                 refit_metric='accuracy',     # optimizing metric, choose from 'recall', 'precision', 'f1', 'accuracy', etc...
                 logs=pd.read_csv(os.path.join(cwd, 'data', 'train_logs.csv')))

pd.read_csv(os.path.join(cwd, 'data', 'train_logs.csv')).tail(10)

Evaluating models:  20%|██        | 1/5 [00:04<00:18,  4.72s/it]


Predictions per fold:

Fold 0:
  Class:     [0 1]
  True:      [442 219]
  Predicted: [555 106]
  Accuracy:  0.6596
  Precision: 0.4717
  Recall:    0.2283
  F1-score:  0.3077

Fold 1:
  Class:     [0 1]
  True:      [415 246]
  Predicted: [519 142]
  Accuracy:  0.6974
  Precision: 0.6620
  Recall:    0.3821
  F1-score:  0.4845

Fold 2:
  Class:     [0 1]
  True:      [363 298]
  Predicted: [451 210]
  Accuracy:  0.6248
  Precision: 0.6190
  Recall:    0.4362
  F1-score:  0.5118

Fold 3:
  Class:     [0 1]
  True:      [313 348]
  Predicted: [283 378]
  Accuracy:  0.6127
  Precision: 0.6217
  Recall:    0.6753
  F1-score:  0.6474

Fold 4:
  Class:     [0 1]
  True:      [310 351]
  Predicted: [176 485]
  Accuracy:  0.6793
  Precision: 0.6433
  Recall:    0.8889
  F1-score:  0.7464

Model: Logistic Regression
Grid Search: True
Accuracy : 0.6548 ± 0.0320
Precision: 0.6035 ± 0.0677
Recall   : 0.5222 ± 0.2329
F1       : 0.5396 ± 0.1496
Time     : 4.72s
Pipeline saved at: saved_pipelines\L




Predictions per fold:

Fold 0:
  Class:     [0 1]
  True:      [442 219]
  Predicted: [559 102]
  Accuracy:  0.7110
  Precision: 0.6373
  Recall:    0.2968
  F1-score:  0.4050

Fold 1:
  Class:     [0 1]
  True:      [415 246]
  Predicted: [529 132]
  Accuracy:  0.7398
  Precision: 0.7803
  Recall:    0.4187
  F1-score:  0.5450

Fold 2:
  Class:     [0 1]
  True:      [363 298]
  Predicted: [412 249]
  Accuracy:  0.7776
  Precision: 0.8032
  Recall:    0.6711
  F1-score:  0.7313

Fold 3:
  Class:     [0 1]
  True:      [313 348]
  Predicted: [258 403]
  Accuracy:  0.8048
  Precision: 0.7717
  Recall:    0.8937
  F1-score:  0.8282

Fold 4:
  Class:     [0 1]
  True:      [310 351]
  Predicted: [265 396]
  Accuracy:  0.8411
  Precision: 0.8106
  Recall:    0.9145
  F1-score:  0.8594

Model: Random Forest
Grid Search: True
Accuracy : 0.7749 ± 0.0461
Precision: 0.7606 ± 0.0633
Recall   : 0.6390 ± 0.2480
F1       : 0.6738 ± 0.1735
Time     : 4.41s
Pipeline saved at: saved_pipelines\Random_

Evaluating models:  60%|██████    | 3/5 [00:13<00:08,  4.34s/it]


Predictions per fold:

Fold 0:
  Class:     [0 1]
  True:      [442 219]
  Predicted: [586  75]
  Accuracy:  0.6823
  Precision: 0.5600
  Recall:    0.1918
  F1-score:  0.2857

Fold 1:
  Class:     [0 1]
  True:      [415 246]
  Predicted: [545 116]
  Accuracy:  0.7126
  Precision: 0.7414
  Recall:    0.3496
  F1-score:  0.4751

Fold 2:
  Class:     [0 1]
  True:      [363 298]
  Predicted: [516 145]
  Accuracy:  0.6505
  Precision: 0.7310
  Recall:    0.3557
  F1-score:  0.4786

Fold 3:
  Class:     [0 1]
  True:      [313 348]
  Predicted: [357 304]
  Accuracy:  0.6369
  Precision: 0.6776
  Recall:    0.5920
  F1-score:  0.6319

Fold 4:
  Class:     [0 1]
  True:      [310 351]
  Predicted: [363 298]
  Accuracy:  0.6929
  Precision: 0.7483
  Recall:    0.6353
  F1-score:  0.6872

Model: Gradient Boosting
Grid Search: True
Accuracy : 0.6750 ± 0.0277
Precision: 0.6917 ± 0.0704
Recall   : 0.4249 ± 0.1655
F1       : 0.5117 ± 0.1405
Time     : 4.04s
Pipeline saved at: saved_pipelines\Gra

Evaluating models:  80%|████████  | 4/5 [00:14<00:03,  3.22s/it]


Predictions per fold:

Fold 0:
  Class:     [0 1]
  True:      [442 219]
  Predicted: [593  68]
  Accuracy:  0.6899
  Precision: 0.6029
  Recall:    0.1872
  F1-score:  0.2857

Fold 1:
  Class:     [0 1]
  True:      [415 246]
  Predicted: [542 119]
  Accuracy:  0.7050
  Precision: 0.7143
  Recall:    0.3455
  F1-score:  0.4658

Fold 2:
  Class:     [0 1]
  True:      [363 298]
  Predicted: [519 142]
  Accuracy:  0.6490
  Precision: 0.7324
  Recall:    0.3490
  F1-score:  0.4727

Fold 3:
  Class:     [0 1]
  True:      [313 348]
  Predicted: [359 302]
  Accuracy:  0.6278
  Precision: 0.6689
  Recall:    0.5805
  F1-score:  0.6215

Fold 4:
  Class:     [0 1]
  True:      [310 351]
  Predicted: [363 298]
  Accuracy:  0.6868
  Precision: 0.7416
  Recall:    0.6296
  F1-score:  0.6810

Model: XGBoost
Grid Search: True
Accuracy : 0.6717 ± 0.0287
Precision: 0.6920 ± 0.0511
Recall   : 0.4184 ± 0.1640
F1       : 0.5054 ± 0.1380
Time     : 1.47s
Pipeline saved at: saved_pipelines\XGBoost_pipel

Evaluating models: 100%|██████████| 5/5 [00:22<00:00,  4.46s/it]


Predictions per fold:

Fold 0:
  Class:     [0 1]
  True:      [442 219]
  Predicted: [562  99]
  Accuracy:  0.6641
  Precision: 0.4848
  Recall:    0.2192
  F1-score:  0.3019

Fold 1:
  Class:     [0 1]
  True:      [415 246]
  Predicted: [539 122]
  Accuracy:  0.7065
  Precision: 0.7131
  Recall:    0.3537
  F1-score:  0.4728

Fold 2:
  Class:     [0 1]
  True:      [363 298]
  Predicted: [432 229]
  Accuracy:  0.6838
  Precision: 0.6943
  Recall:    0.5336
  F1-score:  0.6034

Fold 3:
  Class:     [0 1]
  True:      [313 348]
  Predicted: [234 427]
  Accuracy:  0.6566
  Precision: 0.6417
  Recall:    0.7874
  F1-score:  0.7071

Fold 4:
  Class:     [0 1]
  True:      [310 351]
  Predicted: [234 427]
  Accuracy:  0.7126
  Precision: 0.6885
  Recall:    0.8376
  F1-score:  0.7558

Model: CatBoost
Grid Search: True
Accuracy : 0.6847 ± 0.0222
Precision: 0.6445 ± 0.0832
Recall   : 0.5463 ± 0.2397
F1       : 0.5682 ± 0.1648
Time     : 7.52s
Pipeline saved at: saved_pipelines\CatBoost_pip




Unnamed: 0,Model,Folds,Grid_search,Precision_mean,Precision_std,Recall_mean,Recall_std,F1_mean,F1_std,Accuracy_mean,Accuracy_std,Time,Pipeline_file
45,Logistic Regression,5,True,0.611653,0.100242,0.444134,0.077803,0.51438,0.086959,0.640242,0.033012,0.341241,saved_pipelines\Logistic_Regression_pipeline_1...
46,Random Forest,5,True,0.752626,0.066298,0.560174,0.06788,0.641552,0.065023,0.731619,0.028167,8.423356,saved_pipelines\Random_Forest_pipeline_1763224...
47,Gradient Boosting,5,True,0.689554,0.099339,0.371141,0.089359,0.480878,0.09713,0.658699,0.0344,4.180241,saved_pipelines\Gradient_Boosting_pipeline_176...
48,XGBoost,5,True,0.685032,0.099502,0.368514,0.088932,0.477623,0.097187,0.656581,0.034379,1.214185,saved_pipelines\XGBoost_pipeline_1763224479.pkl
49,CatBoost,5,True,0.668652,0.102048,0.396695,0.087622,0.497114,0.09596,0.657489,0.033918,6.344752,saved_pipelines\CatBoost_pipeline_1763224485.pkl
50,Logistic Regression,5,True,0.603542,0.067746,0.522168,0.232929,0.539567,0.149649,0.654766,0.03197,4.72163,saved_pipelines\Logistic_Regression_pipeline_1...
51,Random Forest,5,True,0.760618,0.063312,0.63897,0.247968,0.673777,0.173476,0.774887,0.046051,4.405922,saved_pipelines\Random_Forest_pipeline_1763225...
52,Gradient Boosting,5,True,0.691674,0.070376,0.424872,0.165524,0.511704,0.14054,0.675038,0.027688,4.040267,saved_pipelines\Gradient_Boosting_pipeline_176...
53,XGBoost,5,True,0.692021,0.0511,0.418365,0.163985,0.505356,0.138016,0.67171,0.028657,1.46554,saved_pipelines\XGBoost_pipeline_1763225786.pkl
54,CatBoost,5,True,0.644499,0.083217,0.546271,0.239687,0.568202,0.164766,0.68472,0.022206,7.522618,saved_pipelines\CatBoost_pipeline_1763225794.pkl
