# Machine Learning Project
by Alexandre Waerniers and Vincent Lamy,

students at Albert School x Mines Paris PSL

# Imports

In [35]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import os

from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer, OrdinalEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score, accuracy_score

from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.under_sampling import RandomUnderSampler


from utils import train_k_fold, bool_contact, train_ts

# Get project path
cwd = os.getcwd()
print(cwd)

d:\ALBERTSCHOOL\SupervisedML\supervised_ml_project_waerniers_lamy


# Download datasets

In [36]:
# raw dataset
bank_full = pd.read_csv(os.path.join(cwd, "data", "bank-additional-full.csv"), sep=";")

bank_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             41188 non-null  int64  
 1   job             41188 non-null  object 
 2   marital         41188 non-null  object 
 3   education       41188 non-null  object 
 4   default         41188 non-null  object 
 5   housing         41188 non-null  object 
 6   loan            41188 non-null  object 
 7   contact         41188 non-null  object 
 8   month           41188 non-null  object 
 9   day_of_week     41188 non-null  object 
 10  duration        41188 non-null  int64  
 11  campaign        41188 non-null  int64  
 12  pdays           41188 non-null  int64  
 13  previous        41188 non-null  int64  
 14  poutcome        41188 non-null  object 
 15  emp.var.rate    41188 non-null  float64
 16  cons.price.idx  41188 non-null  float64
 17  cons.conf.idx   41188 non-null 

# Define metrics, parameters and models

In [37]:
# metrics = {
#     'precision': make_scorer(precision_score, pos_label=1, average='binary', zero_division=0),
#     'recall': make_scorer(recall_score, pos_label=1, average='binary', zero_division=0),
#     'f1': make_scorer(f1_score, pos_label=1, average='binary', zero_division=0),
#     'accuracy': make_scorer(accuracy_score)
# }

metrics = ['precision', 'recall', 'f1', 'accuracy']

In [38]:
# First param grid (basic)
param_grids = {
    'Logistic Regression': {
        'classifier__C': [0.1, 1, 10, 100],
        'classifier__penalty': ['l2'],
        'classifier__solver': ['lbfgs', 'liblinear']
    },

    'Random Forest': {
        'classifier__n_estimators': [50, 100, 200],
        'classifier__max_depth': [None, 5, 10],
        'classifier__min_samples_split': [2, 5],
        'classifier__min_samples_leaf': [1, 2]
    },

    'Gradient Boosting': {
        'classifier__n_estimators': [50, 100, 200],
        'classifier__learning_rate': [0.01, 0.1, 0.2],
        'classifier__max_depth': [3, 5, 7],
        'classifier__subsample': [0.8, 1.0]
    },

    'XGBoost': {
        'classifier__n_estimators': [50, 100],
        'classifier__max_depth': [3, 5],
        'classifier__learning_rate': [0.01, 0.1]
    },

    'CatBoost': {
        'classifier__iterations': [100, 200],
        'classifier__depth': [3, 5],
        'classifier__learning_rate': [0.01, 0.1]
    }
}

In [39]:
# # Second param grid (improved but too long to compute so not used)

# param_grids = {
#     'Logistic Regression': {
#         'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100],
#         'classifier__penalty': ['l1', 'l2', 'elasticnet'],
#         'classifier__solver': ['lbfgs', 'liblinear', 'saga'],
#         'classifier__class_weight': [None, 'balanced'],
#         'classifier__l1_ratio': [0, 0.5, 1]  # only used with elasticnet
#     },

#     'Random Forest': {
#         'classifier__n_estimators': [100, 300, 500],
#         'classifier__max_depth': [None, 5, 10, 20],
#         'classifier__min_samples_split': [2, 5, 10],
#         'classifier__min_samples_leaf': [1, 2, 4],
#         'classifier__max_features': ['sqrt', 'log2', None],
#         'classifier__bootstrap': [True, False],
#         'classifier__class_weight': [None, 'balanced']
#     },

#     'Gradient Boosting': {
#         'classifier__n_estimators': [100, 300, 500],
#         'classifier__learning_rate': [0.01, 0.05, 0.1],
#         'classifier__max_depth': [3, 5, 7],
#         'classifier__min_samples_split': [2, 5, 10],
#         'classifier__min_samples_leaf': [1, 2, 4],
#         'classifier__subsample': [0.6, 0.8, 1.0],
#         'classifier__max_features': ['sqrt', 'log2', None],
#         'classifier__loss': ['log_loss', 'exponential']
#     },

#     'XGBoost': {
#         'classifier__n_estimators': [100, 300, 500],
#         'classifier__max_depth': [3, 5, 7, 10],
#         'classifier__learning_rate': [0.01, 0.05, 0.1],
#         'classifier__subsample': [0.6, 0.8, 1.0],
#         'classifier__colsample_bytree': [0.6, 0.8, 1.0],
#         'classifier__gamma': [0, 0.1, 0.3],
#         'classifier__reg_alpha': [0, 0.1, 1],
#         'classifier__reg_lambda': [1, 1.5, 2],
#         'classifier__scale_pos_weight': [1, 2]  # useful for class imbalance
#     },

#     'CatBoost': {
#         'classifier__iterations': [200, 500, 800],
#         'classifier__depth': [4, 6, 8, 10],
#         'classifier__learning_rate': [0.01, 0.05, 0.1],
#         'classifier__l2_leaf_reg': [1, 3, 5, 7],
#         'classifier__border_count': [32, 64, 128],
#         'classifier__bagging_temperature': [0, 0.5, 1.0],
#         'classifier__random_strength': [0, 0.5, 1.0],
#         'classifier__boosting_type': ['Ordered', 'Plain']
#     }
# }


In [40]:
# third params grid

param_grids = {
    'Logistic Regression': {
        'classifier__C': [0.1, 1.0, 10.0],  # 3 values
        'classifier__penalty': ['l2'],
        'classifier__solver': ['lbfgs']  # Removed liblinear (lbfgs is faster)
    },  # Total: 3 combinations

    'Random Forest': {
        'classifier__n_estimators': [100, 200],  # 2 values
        'classifier__max_depth': [10, 20],  # 2 values (removed None - too slow)
        'classifier__min_samples_split': [5, 10],  # 2 values
        'classifier__class_weight': ['balanced']  # Fixed
    },  # Total: 8 combinations

    'Gradient Boosting': {
        'classifier__n_estimators': [50, 100],  # 2 values
        'classifier__learning_rate': [0.05, 0.1],  # 2 values
        'classifier__max_depth': [3, 5],  # 2 values
        'classifier__subsample': [0.8]  # Fixed
    },  # Total: 8 combinations

    'XGBoost': {
        'classifier__n_estimators': [50, 100],  # 2 values
        'classifier__max_depth': [3, 5],  # 2 values
        'classifier__learning_rate': [0.05, 0.1],  # 2 values
        'classifier__scale_pos_weight': [1]  # Fixed (or use balanced ratio)
    },  # Total: 8 combinations

    'CatBoost': {
        'classifier__iterations': [100, 200],  # 2 values
        'classifier__depth': [4, 6],  # 2 values
        'classifier__learning_rate': [0.05, 0.1],  # 2 values
        'classifier__auto_class_weights': ['Balanced']  # Fixed
    }  # Total: 8 combinations
}

In [41]:
# Models to use during training
models = {
    'Logistic Regression': LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42),
    'Random Forest': RandomForestClassifier(class_weight="balanced", random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(),
    'XGBoost': XGBClassifier(),
    'CatBoost': CatBoostClassifier(verbose=0)
}

# Data Preparation

In [42]:
bank_full.y = bank_full.y.map({"yes": 1, "no":0})

# split_idx = 27680 # 2008 dec -> 13 508 rows
split_idx = 39130 # 2009 mar ->  2 058 rows

bank_stable = bank_full.iloc[split_idx:].reset_index(drop=True)

# chronological train/test split
train_size = 0.8
split = int(len(bank_stable)*train_size)
train_set = bank_stable.iloc[:split].copy()
test_set = bank_stable.iloc[split:].copy()

# Split X and y from train dataset
X_train = train_set.drop(columns=['y'])

# Map target
y_train = train_set.y

In [43]:
print(len(X_train), len(test_set))

1646 412


In [48]:
X_train.columns

Index(['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx',
       'cons.conf.idx', 'euribor3m', 'nr.employed'],
      dtype='object')

# Data preprocessing, feature engineering and training

In [49]:
# Columns to drop
drop_cols = ["duration", "month", 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed']

# Columns on which we will apply Standard Scaler
std_cols = ["age", "campaign", "pdays", "previous"]

# Columns on which we will apply Min Max Scaler
minmax_cols = ["campaign"]

# Columns on which we will apply One-Hot Encoder
onehot_cols = ['job', 'marital', 'default', 'housing', 'loan', 'poutcome', "education", "day_of_week", "contact"]

# Columns on which we will apply Ordinal Encoder...
ordinal_cols = ["education", "day_of_week"]
# ... and their respective orders
ordinal_categories = [["unknown", "illiterate","basic.4y","basic.6y","basic.9y","high.school","professional.course","university.degree"],
                      ["mon", "tue", "wed", "thu", "fri"]
                      ]

# Column on which we will apply feature engineering
bool_cols = ["contact"]

# Feature engineering functions
ft_eng_contact = FunctionTransformer(bool_contact, validate=False)

# Column transformers for preprocessing
preprocessor = ColumnTransformer(transformers=[('drop_cols', 'drop', drop_cols),
                                               ('std_scale', StandardScaler(), std_cols),
                                               # ('minmax_scale', MinMaxScaler(), minmax_cols),
                                               ('one_hot', OneHotEncoder(), onehot_cols),
                                               # ('ordinal', OrdinalEncoder(categories = ordinal_categories), ordinal_cols)
                                               ])


# Training for each selected model
for model_name, model in tqdm(models.items(), desc="Evaluating models"):

    pipeline = ImbPipeline(steps=[# ('fetaure_eng', ft_eng_contact),
                                  ('preprocessor', preprocessor),
                                  ('classifier', model)])

    train_ts(X_train=X_train,               # training set features
                 y_train=y_train,           # training set target
                 pipeline=pipeline,         # pipeline to use
                 n_folds=5,                 # number of folds
                 model_name = model_name,   # current model name
                 param_grids=param_grids,   # put {} to avoid K fold and process a classic train/val training
                 scoring_metrics=metrics,
                 refit_metric='f1',     # optimizing metric, choose from 'recall', 'precision', 'f1', 'accuracy', etc...
                 logs=pd.read_csv(os.path.join(cwd, 'data', 'logs.csv')))

pd.read_csv(os.path.join(cwd, 'data', 'logs.csv')).tail(10)

Evaluating models:  20%|██        | 1/5 [00:04<00:18,  4.55s/it]


Model: Logistic Regression
Grid Search: True
Precision: 0.6947 ± 0.0620
Recall   : 0.6632 ± 0.0505
F1       : 0.6766 ± 0.0439
Accuracy : 0.6628 ± 0.0468
Time     : 4.53s
Pipeline saved at: saved_pipelines\Logistic_Regression_pipeline_1763107310.pkl

######################################################################



Evaluating models:  40%|████      | 2/5 [00:08<00:12,  4.32s/it]


Model: Random Forest
Grid Search: True
Precision: 0.6846 ± 0.0477
Recall   : 0.6813 ± 0.0513
F1       : 0.6819 ± 0.0427
Accuracy : 0.6628 ± 0.0417
Time     : 4.11s
Pipeline saved at: saved_pipelines\Random_Forest_pipeline_1763107314.pkl

######################################################################



Evaluating models:  60%|██████    | 3/5 [00:11<00:07,  3.52s/it]


Model: Gradient Boosting
Grid Search: True
Precision: 0.6582 ± 0.0607
Recall   : 0.7471 ± 0.0489
F1       : 0.6985 ± 0.0490
Accuracy : 0.6562 ± 0.0589
Time     : 2.55s
Pipeline saved at: saved_pipelines\Gradient_Boosting_pipeline_1763107317.pkl

######################################################################



Evaluating models:  80%|████████  | 4/5 [00:12<00:02,  2.60s/it]


Model: XGBoost
Grid Search: True
Precision: 0.6639 ± 0.0582
Recall   : 0.7315 ± 0.0531
F1       : 0.6946 ± 0.0481
Accuracy : 0.6577 ± 0.0553
Time     : 1.18s
Pipeline saved at: saved_pipelines\XGBoost_pipeline_1763107318.pkl

######################################################################



Evaluating models: 100%|██████████| 5/5 [00:18<00:00,  3.65s/it]


Model: CatBoost
Grid Search: True
Precision: 0.6738 ± 0.0708
Recall   : 0.6862 ± 0.0393
F1       : 0.6788 ± 0.0505
Accuracy : 0.6540 ± 0.0550
Time     : 5.73s
Pipeline saved at: saved_pipelines\CatBoost_pipeline_1763107324.pkl

######################################################################






Unnamed: 0,Model,Folds,Grid_search,Precision_mean,Precision_std,Recall_mean,Recall_std,F1_mean,F1_std,Accuracy_mean,Accuracy_std,Time,Pipeline_file,Best_params,Timestamp
100,Logistic Regression,5,True,0.603905,0.048392,0.617603,0.316873,0.54895,0.228893,0.576642,0.056822,0.237205,saved_pipelines\Logistic_Regression_pipeline_1...,,
101,Random Forest,5,True,0.700314,0.051476,0.647846,0.156906,0.657976,0.079936,0.656934,0.043183,3.364913,saved_pipelines\Random_Forest_pipeline_1763105...,,
102,Gradient Boosting,5,True,0.685588,0.052975,0.664241,0.203843,0.64961,0.101951,0.645985,0.046909,2.623885,saved_pipelines\Gradient_Boosting_pipeline_176...,,
103,XGBoost,5,True,0.69179,0.056319,0.628944,0.194438,0.636046,0.107803,0.643066,0.060165,1.086211,saved_pipelines\XGBoost_pipeline_1763105942.pkl,,
104,CatBoost,5,True,0.68368,0.04115,0.618395,0.118361,0.641371,0.064523,0.640146,0.044436,5.33022,saved_pipelines\CatBoost_pipeline_1763105947.pkl,,
105,Logistic Regression,5,True,0.694661,0.062034,0.663219,0.050537,0.676592,0.043871,0.662774,0.046772,4.530564,saved_pipelines\Logistic_Regression_pipeline_1...,,
106,Random Forest,5,True,0.684632,0.047652,0.681339,0.05131,0.681937,0.042696,0.662774,0.041715,4.114512,saved_pipelines\Random_Forest_pipeline_1763107...,,
107,Gradient Boosting,5,True,0.658229,0.060736,0.747091,0.048874,0.69847,0.049038,0.656204,0.058912,2.550936,saved_pipelines\Gradient_Boosting_pipeline_176...,,
108,XGBoost,5,True,0.663853,0.058229,0.731491,0.05309,0.694617,0.048149,0.657664,0.055272,1.17601,saved_pipelines\XGBoost_pipeline_1763107318.pkl,,
109,CatBoost,5,True,0.673781,0.070824,0.686229,0.039253,0.678751,0.050498,0.654015,0.054982,5.73446,saved_pipelines\CatBoost_pipeline_1763107324.pkl,,


After multiple training attempts, we have decided that the model that seems best for our use case is :

    saved_pipelines\Gradient_Boosting_pipeline_1763107317.pkl

preprocessing :

drop_cols = ["duration", "month", 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed']

std_cols = ["age", "campaign", "pdays", "previous"]

onehot_cols = ['job', 'marital', 'default', 'housing', 'loan', 'poutcome', "education", "day_of_week", "contact"]