# Machine Learning Project
by Alexandre Waerniers and Vincent Lamy,

students at Albert School x Mines Paris PSL

# Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import os

from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer, OrdinalEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score, accuracy_score

from imblearn.pipeline import Pipeline as ImbPipeline


from utils import bool_contact, train_ts, modify_pdays

# Get project path
cwd = os.getcwd()
print(cwd)

d:\ALBERTSCHOOL\SupervisedML\supervised_ml_project_waerniers_lamy


# Download datasets

In [2]:
# raw dataset
bank_full = pd.read_csv(os.path.join(cwd, "data", "bank-additional-full.csv"), sep=";")
bank_full.y = bank_full.y.map({"yes": 1, "no":0})

bank_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             41188 non-null  int64  
 1   job             41188 non-null  object 
 2   marital         41188 non-null  object 
 3   education       41188 non-null  object 
 4   default         41188 non-null  object 
 5   housing         41188 non-null  object 
 6   loan            41188 non-null  object 
 7   contact         41188 non-null  object 
 8   month           41188 non-null  object 
 9   day_of_week     41188 non-null  object 
 10  duration        41188 non-null  int64  
 11  campaign        41188 non-null  int64  
 12  pdays           41188 non-null  int64  
 13  previous        41188 non-null  int64  
 14  poutcome        41188 non-null  object 
 15  emp.var.rate    41188 non-null  float64
 16  cons.price.idx  41188 non-null  float64
 17  cons.conf.idx   41188 non-null 

# Define metrics, parameters and models

In [3]:
metrics = ['precision', 'recall', 'f1', 'accuracy']

In [4]:
# # First param grid (basic)
# param_grids = {
#     'Logistic Regression': {
#         'classifier__C': [0.1, 1, 10, 100],
#         'classifier__penalty': ['l2'],
#         'classifier__solver': ['lbfgs', 'liblinear']
#     },

#     'Random Forest': {
#         'classifier__n_estimators': [50, 100, 200],
#         'classifier__max_depth': [None, 5, 10],
#         'classifier__min_samples_split': [2, 5],
#         'classifier__min_samples_leaf': [1, 2]
#     },

#     'Gradient Boosting': {
#         'classifier__n_estimators': [50, 100, 200],
#         'classifier__learning_rate': [0.01, 0.1, 0.2],
#         'classifier__max_depth': [3, 5, 7],
#         'classifier__subsample': [0.8, 1.0]
#     },

#     'XGBoost': {
#         'classifier__n_estimators': [50, 100],
#         'classifier__max_depth': [3, 5],
#         'classifier__learning_rate': [0.01, 0.1]
#     },

#     'CatBoost': {
#         'classifier__iterations': [100, 200],
#         'classifier__depth': [3, 5],
#         'classifier__learning_rate': [0.01, 0.1]
#     }
# }

In [5]:
# # Second Params Grid (complex)
# param_grids = {
#     'Logistic Regression': [
#         # liblinear supports l1, l2
#         {
#             'classifier__solver': ['liblinear'],
#             'classifier__penalty': ['l1', 'l2'],
#             'classifier__C': [0.01, 0.1, 1],
#             'classifier__class_weight': [None, 'balanced']
#         },
#         # lbfgs supports only l2
#         {
#             'classifier__solver': ['lbfgs'],
#             'classifier__penalty': ['l2'],
#             'classifier__C': [0.01, 0.1, 1],
#             'classifier__class_weight': [None, 'balanced']
#         },
#         # saga supports l1, l2, elasticnet
#         {
#             'classifier__solver': ['saga'],
#             'classifier__penalty': ['l1', 'l2', 'elasticnet'],
#             'classifier__C': [0.01, 0.1, 1],
#             'classifier__l1_ratio': [0.5],  # only used with elasticnet
#             'classifier__class_weight': [None, 'balanced']
#         }
#     ],

#     'Random Forest': {
#         'classifier__n_estimators': [100, 300],
#         'classifier__max_depth': [None, 10],
#         'classifier__min_samples_split': [2, 5],
#         'classifier__min_samples_leaf': [1, 2],
#         'classifier__max_features': ['sqrt', 'log2'],
#         'classifier__bootstrap': [True],
#         'classifier__class_weight': [None, 'balanced']
#     },

#     'Gradient Boosting': {
#         'classifier__n_estimators': [100, 300],
#         'classifier__learning_rate': [0.05, 0.1],
#         'classifier__max_depth': [3, 5],
#         'classifier__min_samples_split': [2, 5],
#         'classifier__min_samples_leaf': [1, 2],
#         'classifier__subsample': [0.8, 1.0],
#         'classifier__max_features': ['sqrt', None],
#         'classifier__loss': ['log_loss']  # keep only main loss for binary classification
#     },

#     'XGBoost': {
#         'classifier__n_estimators': [100, 300],
#         'classifier__max_depth': [3, 5],
#         'classifier__learning_rate': [0.05, 0.1],
#         'classifier__subsample': [0.8, 1.0],
#         'classifier__colsample_bytree': [0.8, 1.0],
#         'classifier__gamma': [0, 0.1],
#         'classifier__reg_alpha': [0, 0.1],
#         'classifier__reg_lambda': [1, 1.5],
#         'classifier__scale_pos_weight': [1]
#     },

#     'CatBoost': {
#         'classifier__iterations': [200, 500],
#         'classifier__depth': [6, 8],
#         'classifier__learning_rate': [0.05, 0.1],
#         'classifier__l2_leaf_reg': [3, 5],
#         'classifier__border_count': [32, 64],
#         'classifier__bagging_temperature': [0, 0.5],
#         'classifier__random_strength': [0, 0.5],
#         'classifier__boosting_type': ['Ordered']
#     }
# }


In [6]:
# Third params grid (used grid, more evolved than the 1st)

param_grids = {
    'Logistic Regression': {
        'classifier__C': [0.1, 1.0, 10.0],  # 3 values
        'classifier__penalty': ['l2'],
        'classifier__solver': ['lbfgs']  # Removed liblinear (lbfgs is faster)
    },  # Total: 3 combinations

    'Random Forest': {
        'classifier__n_estimators': [100, 200],  # 2 values
        'classifier__max_depth': [10, 20],  # 2 values (removed None - too slow)
        'classifier__min_samples_split': [5, 10],  # 2 values
        'classifier__class_weight': ['balanced']  # Fixed
    },  # Total: 8 combinations

    'Gradient Boosting': {
        'classifier__n_estimators': [50, 100],  # 2 values
        'classifier__learning_rate': [0.05, 0.1],  # 2 values
        'classifier__max_depth': [3, 5],  # 2 values
        'classifier__subsample': [0.8]  # Fixed
    },  # Total: 8 combinations

    'XGBoost': {
        'classifier__n_estimators': [50, 100],  # 2 values
        'classifier__max_depth': [3, 5],  # 2 values
        'classifier__learning_rate': [0.05, 0.1],  # 2 values
        'classifier__scale_pos_weight': [1]  # Fixed (or use balanced ratio)
    },  # Total: 8 combinations

    'CatBoost': {
        'classifier__iterations': [100, 200],  # 2 values
        'classifier__depth': [4, 6],  # 2 values
        'classifier__learning_rate': [0.05, 0.1],  # 2 values
        'classifier__auto_class_weights': ['Balanced']  # Fixed
    }  # Total: 8 combinations
}

In [7]:
# Models to use during training
models = {
    'Logistic Regression': LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42),
    'Random Forest': RandomForestClassifier(class_weight="balanced", random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(),
    'XGBoost': XGBClassifier(),
    'CatBoost': CatBoostClassifier(verbose=0)
}

# Data Preparation

In [8]:
split_idx = 36224 # 39130 or 36224

bank_stable = bank_full.iloc[split_idx:].reset_index(drop=True)

# chronological train/test split
train_size = 0.8
split = int(len(bank_stable)*train_size)
train_set = bank_stable.iloc[:split].copy()
test_set = bank_stable.iloc[split:].copy()

# Split X and y from train dataset
X_train = train_set.drop(columns=['y'])

# Map target
y_train = train_set.y

print(len(X_train), len(test_set))

3971 993


# Data preprocessing, feature engineering and training

keep cons.price.idx, remove emp.var.rate

keep eurobr3m, remove nr.employed

keep cons.price.idx, remove cons.conf.idx since its still higly correlated with emp.var.rate and euribor3m

In [9]:
# Columns to drop
drop_cols = ["duration"]

# Columns on which we will apply Standard Scaler
std_cols = ['euribor3m', 'cons.price.idx', 'emp.var.rate', 'cons.conf.idx', 'nr.employed']

# Columns on which we will apply Min Max Scaler
minmax_cols = ["age", "campaign", "pdays", "previous"]

# Columns on which we will apply One-Hot Encoder
onehot_cols = ['job', 'marital', 'default', 'housing', 'loan', 'poutcome', "contact", "month"]

# Columns on which we will apply Ordinal Encoder...
ordinal_cols = ["education", "day_of_week"]
# ... and their respective orders
ordinal_categories = [["unknown", "illiterate","basic.4y","basic.6y","basic.9y","high.school","professional.course","university.degree"],
                      ["mon", "tue", "wed", "thu", "fri"]
                      ]

# Feature engineering functions
ft_eng_pdays = FunctionTransformer(modify_pdays, validate=False)

# Column transformers for preprocessing
preprocessor = ColumnTransformer(transformers=[('drop_cols', 'drop', drop_cols),
                                               ('std_scale', StandardScaler(), std_cols),
                                               ('minmax_scale', MinMaxScaler(), minmax_cols),
                                               ('one_hot', OneHotEncoder(), onehot_cols),
                                               ('ordinal', OrdinalEncoder(categories = ordinal_categories), ordinal_cols)
                                               ])


# Training for each selected model
for model_name, model in tqdm(models.items(), desc="Evaluating models"):

    pipeline = ImbPipeline(steps=[('feature_eng', ft_eng_pdays),
                                  ('preprocessor', preprocessor),
                                  ('classifier', model)])

    train_ts(X_train=X_train,               # training set features
                 y_train=y_train,           # training set target
                 pipeline=pipeline,         # pipeline to use
                 n_folds=5,                 # number of folds
                 model_name = model_name,   # current model name
                 param_grids=param_grids,   # put {} to avoid K fold and process a classic train/val training
                 scoring_metrics=metrics,
                 refit_metric='f1',     # optimizing metric, choose from 'recall', 'precision', 'f1', 'accuracy', etc...
                 logs=pd.read_csv(os.path.join(cwd, 'data', 'train_logs.csv')))

pd.read_csv(os.path.join(cwd, 'data', 'train_logs.csv')).tail(5)

Evaluating models:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluating models:  20%|██        | 1/5 [00:09<00:36,  9.12s/it]


Model: Logistic Regression
Grid Search: True
Accuracy : 0.6472 ± 0.0402
Precision: 0.6025 ± 0.0679
Recall   : 0.5209 ± 0.2435
F1       : 0.5313 ± 0.1511
Time     : 9.07s
Pipeline saved at: saved_pipelines\Logistic_Regression_pipeline_1763571117.pkl

######################################################################



Evaluating models:  40%|████      | 2/5 [00:15<00:23,  7.73s/it]


Model: Random Forest
Grid Search: True
Accuracy : 0.7785 ± 0.0330
Precision: 0.7848 ± 0.0242
Recall   : 0.6513 ± 0.2450
F1       : 0.6845 ± 0.1510
Time     : 6.55s
Pipeline saved at: saved_pipelines\Random_Forest_pipeline_1763571123.pkl

######################################################################



Evaluating models:  60%|██████    | 3/5 [00:22<00:14,  7.24s/it]


Model: Gradient Boosting
Grid Search: True
Accuracy : 0.6805 ± 0.0244
Precision: 0.7065 ± 0.0452
Recall   : 0.4378 ± 0.1723
F1       : 0.5219 ± 0.1386
Time     : 6.61s
Pipeline saved at: saved_pipelines\Gradient_Boosting_pipeline_1763571130.pkl

######################################################################



Evaluating models:  80%|████████  | 4/5 [00:25<00:05,  5.45s/it]


Model: XGBoost
Grid Search: True
Accuracy : 0.6735 ± 0.0269
Precision: 0.6908 ± 0.0469
Recall   : 0.4258 ± 0.1687
F1       : 0.5102 ± 0.1396
Time     : 2.63s
Pipeline saved at: saved_pipelines\XGBoost_pipeline_1763571133.pkl

######################################################################



Evaluating models: 100%|██████████| 5/5 [00:34<00:00,  6.95s/it]


Model: CatBoost
Grid Search: True
Accuracy : 0.6838 ± 0.0262
Precision: 0.6405 ± 0.0873
Recall   : 0.5579 ± 0.2575
F1       : 0.5697 ± 0.1675
Time     : 9.49s
Pipeline saved at: saved_pipelines\CatBoost_pipeline_1763571142.pkl

######################################################################






Unnamed: 0,Model,Folds,Grid_search,Precision_mean,Precision_std,Recall_mean,Recall_std,F1_mean,F1_std,Accuracy_mean,Accuracy_std,Time,Pipeline_file
81,Logistic Regression,5,True,0.60249,0.067874,0.520865,0.243487,0.531343,0.151065,0.647201,0.040225,9.06963,saved_pipelines\Logistic_Regression_pipeline_1...
82,Random Forest,5,True,0.784766,0.024238,0.651311,0.244987,0.684457,0.151024,0.778517,0.032987,6.549681,saved_pipelines\Random_Forest_pipeline_1763571...
83,Gradient Boosting,5,True,0.706529,0.045169,0.437772,0.172253,0.521881,0.138575,0.680484,0.024364,6.609526,saved_pipelines\Gradient_Boosting_pipeline_176...
84,XGBoost,5,True,0.690831,0.046939,0.425819,0.168727,0.510183,0.139618,0.673525,0.026883,2.631409,saved_pipelines\XGBoost_pipeline_1763571133.pkl
85,CatBoost,5,True,0.640461,0.087331,0.557884,0.257492,0.569707,0.167537,0.683812,0.026203,9.491301,saved_pipelines\CatBoost_pipeline_1763571142.pkl
