# Machine Learning Project
by Alexandre Waerniers and Vincent Lamy,

students at Albert School x Mines Paris PSL

# Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import os

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer, OrdinalEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score, accuracy_score

from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.under_sampling import RandomUnderSampler


from utils import train_k_fold, bool_contact

# Project root = the kernel's current working directory.
# The data and log paths used in the following cells are joined onto this value,
# so the notebook must be started from the project folder.
cwd = os.getcwd()
print(cwd)

d:\ALBERTSCHOOL\SupervisedML\supervised_ml_project_waerniers_lamy


# Load raw datasets

In [None]:
# Both raw files sit in <project>/data and use ';' as the field separator
bank_full, bank_test = (
    pd.read_csv(os.path.join(cwd, "data", csv_name), sep=";")
    for csv_name in ("bank-additional-full.csv", "bank-additional.csv")
)

# Define metrics, parameters and models

In [3]:
# Scorers passed to the training helper, keyed by the metric name used when
# selecting the refit metric
metrics = {
    label: make_scorer(score_fn)
    for label, score_fn in [
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1', f1_score),
        ('accuracy', accuracy_score),
    ]
}

In [None]:
# First param grid (basic)
def _classifier_grid(**hyperparams):
    """Prefix every hyper-parameter name with 'classifier__'.

    The prefix routes each parameter to the pipeline step named 'classifier'.
    """
    return {f"classifier__{name}": values for name, values in hyperparams.items()}


# One search space per model; the keys must match the names used in `models`
param_grids = {
    'Logistic Regression': _classifier_grid(
        C=[0.01, 0.1, 1, 10],
        penalty=['l2'],
        solver=['lbfgs', 'liblinear'],
    ),
    'Random Forest': _classifier_grid(
        n_estimators=[50, 100, 200],
        max_depth=[None, 5, 10],
        min_samples_split=[2, 5],
        min_samples_leaf=[1, 2],
    ),
    'Gradient Boosting': _classifier_grid(
        n_estimators=[50, 100, 200],
        learning_rate=[0.01, 0.1, 0.2],
        max_depth=[3, 5, 7],
        subsample=[0.8, 1.0],
    ),
    'XGBoost': _classifier_grid(
        n_estimators=[50, 100],
        max_depth=[3, 5],
        learning_rate=[0.01, 0.1],
    ),
    'CatBoost': _classifier_grid(
        iterations=[100, 200],
        depth=[3, 5],
        learning_rate=[0.01, 0.1],
    ),
}

In [None]:
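# # Second param grid (broader search, kept for reference only: too slow to compute, so not used).
# # NOTE: its Logistic Regression grid contains invalid penalty/solver pairs
# # ('lbfgs' with 'l1'/'elasticnet', 'liblinear' with 'elasticnet'); those fits fail and are scored as NaN.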

# param_grids = {
#     'Logistic Regression': {
#         'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100],
#         'classifier__penalty': ['l1', 'l2', 'elasticnet'],
#         'classifier__solver': ['lbfgs', 'liblinear', 'saga'],
#         'classifier__class_weight': [None, 'balanced'],
#         'classifier__l1_ratio': [0, 0.5, 1]  # only used with elasticnet
#     },

#     'Random Forest': {
#         'classifier__n_estimators': [100, 300, 500],
#         'classifier__max_depth': [None, 5, 10, 20],
#         'classifier__min_samples_split': [2, 5, 10],
#         'classifier__min_samples_leaf': [1, 2, 4],
#         'classifier__max_features': ['sqrt', 'log2', None],
#         'classifier__bootstrap': [True, False],
#         'classifier__class_weight': [None, 'balanced']
#     },

#     'Gradient Boosting': {
#         'classifier__n_estimators': [100, 300, 500],
#         'classifier__learning_rate': [0.01, 0.05, 0.1],
#         'classifier__max_depth': [3, 5, 7],
#         'classifier__min_samples_split': [2, 5, 10],
#         'classifier__min_samples_leaf': [1, 2, 4],
#         'classifier__subsample': [0.6, 0.8, 1.0],
#         'classifier__max_features': ['sqrt', 'log2', None],
#         'classifier__loss': ['log_loss', 'exponential']
#     },

#     'XGBoost': {
#         'classifier__n_estimators': [100, 300, 500],
#         'classifier__max_depth': [3, 5, 7, 10],
#         'classifier__learning_rate': [0.01, 0.05, 0.1],
#         'classifier__subsample': [0.6, 0.8, 1.0],
#         'classifier__colsample_bytree': [0.6, 0.8, 1.0],
#         'classifier__gamma': [0, 0.1, 0.3],
#         'classifier__reg_alpha': [0, 0.1, 1],
#         'classifier__reg_lambda': [1, 1.5, 2],
#         'classifier__scale_pos_weight': [1, 2]  # useful for class imbalance
#     },

#     'CatBoost': {
#         'classifier__iterations': [200, 500, 800],
#         'classifier__depth': [4, 6, 8, 10],
#         'classifier__learning_rate': [0.01, 0.05, 0.1],
#         'classifier__l2_leaf_reg': [1, 3, 5, 7],
#         'classifier__border_count': [32, 64, 128],
#         'classifier__bagging_temperature': [0, 0.5, 1.0],
#         'classifier__random_strength': [0, 0.5, 1.0],
#         'classifier__boosting_type': ['Ordered', 'Plain']
#     }
# }


In [6]:
# Models to use during training.
# Every estimator receives the same fixed seed (same value as the undersampler's
# random_state) so that repeated runs of the notebook produce comparable scores.
MODEL_SEED = 42

models = {
    'Logistic Regression': LogisticRegression(random_state=MODEL_SEED),
    'Random Forest': RandomForestClassifier(random_state=MODEL_SEED),
    'Gradient Boosting': GradientBoostingClassifier(random_state=MODEL_SEED),
    'XGBoost': XGBClassifier(random_state=MODEL_SEED),
    # verbose=0 silences CatBoost's per-iteration training log
    'CatBoost': CatBoostClassifier(verbose=0, random_seed=MODEL_SEED)
}

# Data Preparation

In [7]:
# Target: encode the 'y' column as 1 ("yes") / 0 ("no")
target_encoding = {"yes": 1, "no": 0}
y = bank_full["y"].map(target_encoding)

# Features: every column except the target
X = bank_full.drop(columns="y")

# Data preprocessing, feature engineering and training

In [8]:
# Columns to drop
drop_cols = ["duration", "pdays"]

# Columns on which we will apply Standard Scaler
std_cols = ["age", "emp.var.rate", "cons.price.idx", "cons.conf.idx", "euribor3m", "nr.employed"]

# Columns on which we will apply Min Max Scaler
minmax_cols = ["campaign"]

# Columns on which we will apply One-Hot Encoder
onehot_cols = ['job', 'marital', 'default', 'housing', 'loan', "month", 'poutcome']

# Columns on which we will apply Ordinal Encoder...
ordinal_cols = ["education", "day_of_week"]
# ... and their respective orders (one list per column, lowest rank first)
ordinal_categories = [["unknown", "illiterate","basic.4y","basic.6y","basic.9y","high.school","professional.course","university.degree"],
                      ["mon", "tue", "wed", "thu", "fri"]
                      ]

# Column on which we will apply feature engineering
bool_cols = ["contact"]

# Feature engineering functions
# validate=False: the function receives the input object as-is (no array conversion)
ft_eng_contact = FunctionTransformer(bool_contact, validate=False)

# Column transformers for preprocessing
# NOTE(review): 'contact' (bool_cols) is not listed in any transformer below and
# ColumnTransformer's default remainder is 'drop', so the output of bool_contact
# appears to be discarded before the classifier. Confirm against utils.bool_contact;
# if unintended, add ('contact', 'passthrough', bool_cols) to the list.
preprocessor = ColumnTransformer(transformers=[('drop_cols', 'drop', drop_cols),
                                               ('std_scale', StandardScaler(), std_cols),
                                               ('minmax_scale', MinMaxScaler(), minmax_cols),
                                               # handle_unknown='ignore': a category missing from a CV training
                                               # fold is encoded as all zeros instead of raising at transform time
                                               ('one_hot', OneHotEncoder(handle_unknown='ignore'), onehot_cols),
                                               ('ordinal', OrdinalEncoder(categories=ordinal_categories), ordinal_cols)])

# Undersampler (a float sampling_strategy is the minority/majority ratio targeted
# after resampling; the seed keeps the sampled rows identical between runs)
undersampler = RandomUnderSampler(sampling_strategy=0.7, random_state=42)

# Location of the run log: read before every training call and displayed at the end
logs_path = os.path.join(cwd, 'data', 'logs.csv')

# Build one pipeline per candidate model and hand it to the training helper
for model_name, model in tqdm(models.items(), desc="Evaluating models"):

    # Order matters: feature engineering -> preprocessing -> undersampling -> classifier
    steps = [
        ('fetaure_eng', ft_eng_contact),
        ('preprocessor', preprocessor),
        ('undersampler', undersampler),
        ('classifier', model),
    ]
    pipeline = ImbPipeline(steps=steps)

    train_k_fold(
        X_train=X,                      # features of the training set
        y_train=y,                      # target of the training set
        pipeline=pipeline,              # pipeline evaluated in this iteration
        n_folds=5,                      # number of folds
        model_name=model_name,          # name of the current model
        param_grids=param_grids,        # pass {} to skip K fold and run a classic train/val training
        scoring_metrics=metrics,        # scorer functions
        refit_metric='recall',          # metric to optimize: 'recall', 'precision', 'f1', 'accuracy', ...
        logs=pd.read_csv(logs_path),    # current content of the log file
    )

# Show the ten most recent log entries
pd.read_csv(logs_path).tail(10)

Evaluating models:   0%|          | 0/5 [00:00<?, ?it/s]

540 fits failed out of a total of 1620.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
180 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\alexa\anaconda3\envs\env_ml\Lib\site-packages\sklearn\model_selection\_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\alexa\anaconda3\envs\env_ml\Lib\site-packages\sklearn\base.py", line 1365, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\alexa\anaconda3\envs\env_ml\Lib\site-packages\imblearn\pipeline.py", line 526, in fit
    self._final_estimator.fit(Xt, yt, **last_step_params["fit"])
  File "c:\Users\alexa\anacon


Model: Logistic Regression
Grid Search: True
Precision: 0.2446 ± 0.0110
Recall   : 0.7116 ± 0.0137
F1       : 0.3640 ± 0.0127
Accuracy : 0.7199 ± 0.0055
Time     : 201.71s
Pipeline saved at: saved_pipelines\Logistic_Regression_pipeline_1762876056.pkl

######################################################################



Evaluating models:  20%|██        | 1/5 [35:56<2:23:44, 2156.06s/it]


KeyboardInterrupt: 

After multiple training attempts, we decided that the model best suited to our use case is:

    Random_Forest_pipeline_1762872489.pkl

It maximizes recall while maintaining good accuracy.

Even though accuracy is not our main concern, it is reassuring that it doesn't drop compared to the other attempts.

In [None]:
# Look up the logged cross-validation scores of the selected pipeline.
# Raw string: the logged path contains a Windows backslash, and "\R" inside a
# regular string literal is an invalid escape sequence (a warning today, an
# error in future Python versions). The resulting value is unchanged.
selected_pipeline_file = r"saved_pipelines\Random_Forest_pipeline_1762872489.pkl"

trained_models = pd.read_csv(os.path.join(cwd, 'data', 'logs.csv'))
trained_models[trained_models.Pipeline_file == selected_pipeline_file]

Unnamed: 0,Model,Folds,Grid_search,Precision_mean,Precision_std,Recall_mean,Recall_std,F1_mean,F1_std,Accuracy_mean,Accuracy_std,Time,Pipeline_file
51,Random Forest,5,True,0.374057,0.014647,0.618067,0.021961,0.465846,0.014598,0.840463,0.004512,62.044769,saved_pipelines\Random_Forest_pipeline_1762872...
