# Machine Learning Project
by Alexandre Waerniers and Vincent Lamy,

students at Albert School x Mines Paris PSL

# Imports

In [1]:
import pandas as pd
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import time
from tqdm import tqdm
from scipy.stats import chi2_contingency
from sklearn.model_selection import train_test_split, KFold, cross_validate
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score, accuracy_score
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
import joblib

from utils import train_k_fold

# Get project path
cwd = os.getcwd()
print(cwd)

d:\ALBERTSCHOOL\SupervisedML\supervised_ml_project_waerniers_lamy


In [2]:
# # Refresh logs
# logs = pd.DataFrame(columns=['Model','Folds', 'Grid_search', 'Precision_mean','Precision_std','Recall_mean','Recall_std','F1_mean','F1_std','Accuracy_mean','Accuracy_std','Time','Pipeline_file'])
# logs.to_csv(os.path.join(cwd, 'data', 'logs.csv'), index=False)

# Raw datasets

Citation Request:

  This dataset is public available for research. The details are described in [Moro et al., 2011]. 
  Please include this citation if you plan to use this database:

  [Moro et al., 2011] S. Moro, R. Laureano and P. Cortez. Using Data Mining for Bank Direct Marketing: An Application of the CRISP-DM Methodology. 
  In P. Novais et al. (Eds.), Proceedings of the European Simulation and Modelling Conference - ESM'2011, pp. 117-121, Guimarães, Portugal, October, 2011. EUROSIS.

  Available at: [pdf] http://hdl.handle.net/1822/14838
                [bib] http://www3.dsi.uminho.pt/pcortez/bib/2011-esm-1.txt

In [3]:
# Link to web page : https://archive.ics.uci.edu/dataset/222/bank+marketing

bank_full = pd.read_csv(os.path.join(cwd, "data", "bank-additional-full.csv"), sep=";")
bank_test = pd.read_csv(os.path.join(cwd, "data", "bank-additional.csv"), sep=";")

In [4]:
metrics = {
    'precision': make_scorer(precision_score),
    'recall': make_scorer(recall_score),
    'f1': make_scorer(f1_score),
    'accuracy': make_scorer(accuracy_score)
}

In [5]:
param_grids = {
    'Logistic Regression': {
        'classifier__C': [0.01, 0.1, 1, 10],
        'classifier__penalty': ['l2'],
        'classifier__solver': ['lbfgs', 'liblinear']
    },

    'Random Forest': {
        'classifier__n_estimators': [50, 100, 200],
        'classifier__max_depth': [None, 5, 10],
        'classifier__min_samples_split': [2, 5],
        'classifier__min_samples_leaf': [1, 2]
    },

    'Gradient Boosting': {
        'classifier__n_estimators': [50, 100, 200],
        'classifier__learning_rate': [0.01, 0.1, 0.2],
        'classifier__max_depth': [3, 5, 7],
        'classifier__subsample': [0.8, 1.0]
    },

    'XGBoost': {
        'classifier__n_estimators': [50, 100],
        'classifier__max_depth': [3, 5],
        'classifier__learning_rate': [0.01, 0.1]
    },

    'CatBoost': {
        'classifier__iterations': [100, 200],
        'classifier__depth': [3, 5],
        'classifier__learning_rate': [0.01, 0.1]
    }
}

In [6]:
models = {
    'Logistic Regression': LogisticRegression(),
    # 'Random Forest': RandomForestClassifier(),
    # 'Gradient Boosting': GradientBoostingClassifier(),
    'XGBoost': XGBClassifier(),
    'CatBoost': CatBoostClassifier(verbose=0)
    # 'k-Nearest Neighbors': KNeighborsClassifier(),
    # 'Support Vector Machine': SVC()
}

In [7]:
X = bank_full.drop(columns=['y'])
y = bank_full.y.map({"yes": 1, "no":0})

num_cols = X.select_dtypes(include=["number"]).columns.tolist()
cat_cols = X.select_dtypes(include=["object", "category"]).columns.tolist()

print(f"Numerical Features   : {num_cols}")
print(f"Categorical Features : {cat_cols}")

Numerical Features   : ['age', 'duration', 'campaign', 'pdays', 'previous', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed']
Categorical Features : ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome']


In [8]:
# Fonction pour créer la variable 'age_cat'
def add_age_cat(X: pd.DataFrame):
    X['age_cat'] = pd.cut(
        X['age'],
        bins=[0, 25, 30, 35, 40, 45, 50, 55, 60, 100],
        labels=['<25', '25-30', '30-35', '35-40', '40-45', '45-50', '50-55', '55-60', '60+']
    )
    return X

def pdays_replace(X: pd.DataFrame):
    X.loc[X.pdays == 999, 'pdays'] = -1
    return X

In [11]:
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.under_sampling import RandomUnderSampler

In [14]:
num_cols = ['age', 'campaign', 'pdays', 'previous', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed']
bool_cols = []
onehot_cols = ['job', 'marital', 'default', 'housing', 'loan', 'poutcome']

ordinal_cols = ["education", "contact", "month", "day_of_week"]
ordinal_categories = [["unknown", "illiterate","basic.4y","basic.6y","basic.9y","high.school","professional.course","university.degree"],
                      ["cellular", "telephone"],
                      ["mar", "apr", "may", "jun", "jul", "aug", "sep", "oct", "nov", "dec"],
                      ["mon", "tue", "wed", "thu", "fri"]
                      ]

# Feature engineering functions
ft_eng_age = FunctionTransformer(add_age_cat,validate=False)
ft_eng_pdays = FunctionTransformer(pdays_replace, validate=False)

# Column transformers for preprocessing
preprocessor = ColumnTransformer(transformers=[('drop_cols', 'drop', ['duration']),
                                               ('numerical', StandardScaler(), num_cols),
                                               ('one_hot', OneHotEncoder(), onehot_cols),
                                               ('ordinal', OrdinalEncoder(categories = ordinal_categories), ordinal_cols)])

undersampler = RandomUnderSampler(sampling_strategy=0.75, random_state=42)

for model_name, model in tqdm(models.items(), desc="Evaluating models"):

    pipeline = ImbPipeline(steps=[#('ft_eng_age', ft_eng_age),
                               # ('ft_eng_pdays', ft_eng_pdays),
                               ('preprocessor', preprocessor),
                               ('undersample', undersampler),
                               ('classifier', model)])

    train_k_fold(X_train=X,
                 y_train=y,
                 pipeline=pipeline,
                 n_folds=5,
                 model_name = model_name,
                 param_grids=param_grids,
                 scoring_metrics=metrics,
                 refit_metric='f1',
                 logs=pd.read_csv(os.path.join(cwd, 'data', 'logs.csv')))

pd.read_csv(os.path.join(cwd, 'data', 'logs.csv'))

Evaluating models:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluating models:  33%|███▎      | 1/3 [00:04<00:09,  4.82s/it]


Model: Logistic Regression
Grid Search: True
Precision: 0.3533 ± 0.0135
Recall   : 0.6055 ± 0.0183
F1       : 0.4460 ± 0.0125
Accuracy : 0.8306 ± 0.0073
Time     : 4.77s
Pipeline saved at: saved_pipelines\Logistic_Regression_pipeline_1762711885.pkl

######################################################################



Evaluating models:  67%|██████▋   | 2/3 [00:10<00:05,  5.45s/it]


Model: XGBoost
Grid Search: True
Precision: 0.4431 ± 0.0199
Recall   : 0.5623 ± 0.0200
F1       : 0.4954 ± 0.0166
Accuracy : 0.8710 ± 0.0049
Time     : 5.82s
Pipeline saved at: saved_pipelines\XGBoost_pipeline_1762711891.pkl

######################################################################



Evaluating models: 100%|██████████| 3/3 [00:26<00:00,  8.99s/it]


Model: CatBoost
Grid Search: True
Precision: 0.4061 ± 0.0156
Recall   : 0.6149 ± 0.0229
F1       : 0.4889 ± 0.0160
Accuracy : 0.8553 ± 0.0048
Time     : 16.23s
Pipeline saved at: saved_pipelines\CatBoost_pipeline_1762711908.pkl

######################################################################






Unnamed: 0,Model,Folds,Grid_search,Precision_mean,Precision_std,Recall_mean,Recall_std,F1_mean,F1_std,Accuracy_mean,Accuracy_std,Time,Pipeline_file
0,Logistic Regression,5,True,0.657496,0.054721,0.233207,0.007963,0.343883,0.01344,0.899801,0.002287,16.448983,saved_pipelines\Logistic_Regression_pipeline_1...
1,XGBoost,5,True,0.648041,0.032703,0.262738,0.005966,0.373627,0.007391,0.900821,0.002247,12.320881,saved_pipelines\XGBoost_pipeline_1762705040.pkl
2,CatBoost,5,True,0.652612,0.027247,0.260726,0.005806,0.372367,0.005999,0.901039,0.002787,55.581001,saved_pipelines\CatBoost_pipeline_1762705096.pkl
3,Logistic Regression,5,True,0.657496,0.054721,0.233207,0.007963,0.343883,0.01344,0.899801,0.002287,10.365798,saved_pipelines\Logistic_Regression_pipeline_1...
4,XGBoost,5,True,0.648041,0.032703,0.262738,0.005966,0.373627,0.007391,0.900821,0.002247,11.110414,saved_pipelines\XGBoost_pipeline_1762705246.pkl
5,CatBoost,5,True,0.652612,0.027247,0.260726,0.005806,0.372367,0.005999,0.901039,0.002787,55.365255,saved_pipelines\CatBoost_pipeline_1762705302.pkl
6,Logistic Regression,5,True,0.650473,0.057464,0.224662,0.008311,0.33347,0.013226,0.898878,0.002601,17.980262,saved_pipelines\Logistic_Regression_pipeline_1...
7,XGBoost,5,True,0.651072,0.033992,0.263534,0.006811,0.374837,0.006535,0.901039,0.002442,12.51029,saved_pipelines\XGBoost_pipeline_1762706368.pkl
8,CatBoost,5,True,0.640595,0.032256,0.263708,0.004797,0.373306,0.004366,0.900311,0.002504,53.742503,saved_pipelines\CatBoost_pipeline_1762706422.pkl
9,Logistic Regression,5,True,0.65821,0.052671,0.207624,0.003568,0.315289,0.007446,0.898441,0.002763,10.326283,saved_pipelines\Logistic_Regression_pipeline_1...


# Test

In [None]:
X_test = bank_test.drop(columns=["y"])
y_test = bank_test["y"].map({"yes": 1, "no": 0})

model_name = "Logistic_Regression"
id = "1762519768" 

pipeline_path = os.path.join(cwd, "saved_pipelines", f"{model_name}_pipeline_{id}.pkl")

pipeline = joblib.load(pipeline_path)

y_pred = pipeline.predict(X_test)

print("Accuracy :", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall   :", recall_score(y_test, y_pred))
print("F1 score :", f1_score(y_test, y_pred))

FileNotFoundError: [Errno 2] No such file or directory: 'd:\\ALBERTSCHOOL\\SupervisedML\\supervised_ml_project_waerniers_lamy\\saved_pipelines\\Logistic_Regression_pipeline_1762519768.pkl'