# Machine Learning Project
by Alexandre Waerniers and Vincent Lamy,

students at Albert School x Mines Paris PSL

# Imports

In [None]:
import pandas as pd
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import time
from tqdm import tqdm
from scipy.stats import chi2_contingency
from sklearn.model_selection import train_test_split, KFold, cross_validate
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score, accuracy_score
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
import joblib

# Get project path
cwd = os.getcwd()
print(cwd)

d:\ALBERTSCHOOL\SupervisedML\supervised_ml_project_waerniers_lamy


In [91]:
# # Refresh logs
logs = pd.DataFrame(columns=['Model','Folds', 'Grid_search', 'Grid_params', 'Precision_mean','Precision_std','Recall_mean','Recall_std','F1_mean','F1_std','Accuracy_mean','Accuracy_std','Time','Params_models'])
logs.to_csv(os.path.join(cwd, 'data', 'logs.csv'), index=False)
logs

Unnamed: 0,Model,Folds,Grid_search,Grid_params,Precision_mean,Precision_std,Recall_mean,Recall_std,F1_mean,F1_std,Accuracy_mean,Accuracy_std,Time,Params_models


# Functions

In [12]:
def compare_distributions(df1, df2, n_bins):
    """
    Plot side-by-side distributions for 2 DataFrames showing %
    """
    if df1.columns.tolist() == df2.columns.tolist():

        sns.set(style="whitegrid")

        num_cols = df1.select_dtypes(include=["number"]).columns
        cat_cols = df1.select_dtypes(include=["object", "category"]).columns

        # Numeric columns
        for col in num_cols:
            plt.figure(figsize=(10, 4))
            plt.suptitle(f"Distribution of {col}", fontsize=14)
            
            # Left-hand plot
            plt.subplot(1, 2, 1)
            sns.histplot(df1[col], bins=n_bins, kde=False, stat="percent", color="steelblue")
            plt.title("df1")
            plt.xlabel(col)
            plt.ylabel("Percentage (%)")

            # right-hand plot
            plt.subplot(1, 2, 2)
            sns.histplot(df2[col], bins=n_bins, kde=False, stat="percent", color="orange")
            plt.title("df2")
            plt.xlabel(col)
            plt.ylabel("Percentage (%)")

            plt.tight_layout()
            plt.show()

            # Print bin edges used
            bin_edges = np.histogram_bin_edges(df1[col].dropna(), bins=n_bins)
            print(f"Bins for '{col}':")
            print(bin_edges)
            print("-" * 80)

        # Categorical columns
        for col in cat_cols:
            plt.figure(figsize=(10, 4))
            plt.suptitle(f"Distribution of {col}", fontsize=14)

            # Compute normalized frequencies (%)
            df1_counts = (df1[col].value_counts(normalize=True) * 100).rename("df1_%")
            df2_counts = (df2[col].value_counts(normalize=True) * 100).rename("df2_%")

            combined = pd.concat([df1_counts, df2_counts], axis=1).fillna(0)
            combined = combined.reset_index().rename(columns={"index": col})

            # Train plot
            plt.subplot(1, 2, 1)
            sns.barplot(data=combined, y=col, x="df1_%", color="steelblue")
            plt.title("df1")
            plt.xlabel("Percentage (%)")
            plt.ylabel(col)

            # Test plot
            plt.subplot(1, 2, 2)
            sns.barplot(data=combined, y=col, x="df2_%", color="orange")
            plt.title("df2")
            plt.xlabel("Percentage (%)")
            plt.ylabel(col)

            plt.tight_layout()
            plt.show()
        
    else:
        print("ERROR : Dataframes do not have the same columns")


def plot_heatmap(data: pd.DataFrame, title: str):

    sns.set(style="white", font_scale=1.1)
    plt.figure(figsize=(12, 6))
    sns.heatmap(
        data,
        annot=True,          # show correlation values
        fmt=".2f",           # format as 2 decimals
        cmap="coolwarm",     # color palette
        square=False,        # make cells square
        linewidths=0.5,      # line between cells
        cbar_kws={"shrink": 0.8, "label": "Correlation"}  # smaller colorbar
    )

    plt.title(title, fontsize=16, pad=20)
    plt.xticks(rotation=45, ha="right")
    plt.yticks(rotation=0)
    plt.tight_layout()
    plt.show()


def cramers_v(x, y):
    confusion_matrix = pd.crosstab(x, y)
    chi2 = chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum().sum()
    r, k = confusion_matrix.shape
    return np.sqrt((chi2 / n) / (min(k-1, r-1)))

# Raw datasets

Citation Request:

  This dataset is public available for research. The details are described in [Moro et al., 2011]. 
  Please include this citation if you plan to use this database:

  [Moro et al., 2011] S. Moro, R. Laureano and P. Cortez. Using Data Mining for Bank Direct Marketing: An Application of the CRISP-DM Methodology. 
  In P. Novais et al. (Eds.), Proceedings of the European Simulation and Modelling Conference - ESM'2011, pp. 117-121, Guimarães, Portugal, October, 2011. EUROSIS.

  Available at: [pdf] http://hdl.handle.net/1822/14838
                [bib] http://www3.dsi.uminho.pt/pcortez/bib/2011-esm-1.txt

In [13]:
# Link to web page : https://archive.ics.uci.edu/dataset/222/bank+marketing

bank_full = pd.read_csv(os.path.join(cwd, "data", "bank-additional-full.csv"), sep=";")
bank_test = pd.read_csv(os.path.join(cwd, "data", "bank-additional.csv"), sep=";")

In [None]:
""" 
Input variables:

# bank client data:

1 - age (numeric)
2 - job :
        type of job (categorical: "admin.","blue-collar","entrepreneur","housemaid","management","retired","self-employed","services","student","technician","unemployed","unknown")

3 - marital :
        marital status (categorical: "divorced","married","single","unknown"; note: "divorced" means divorced or widowed)

4 - education (categorical: "basic.4y","basic.6y","basic.9y","high.school","illiterate","professional.course","university.degree","unknown")

5 - default:
        has credit in default? (categorical: "no","yes","unknown")

6 - housing:
        has housing loan? (categorical: "no","yes","unknown")

7 - loan:
        has personal loan? (categorical: "no","yes","unknown")

# related with the last contact of the current campaign:

8 - contact:
        contact communication type (categorical: "cellular","telephone")

9 - month:
        last contact month of year (categorical: "jan", "feb", "mar", ..., "nov", "dec")

10 - day_of_week:
        last contact day of the week (categorical: "mon","tue","wed","thu","fri")

11 - duration: last contact duration, in seconds (numeric). 
        Important note:  
        this attribute highly affects the output target (e.g., if duration=0 then y="no"). 
        Yet, the duration is not known before a call is performed.
        Also, after the end of the call y is obviously known.
        Thus, this input should only be included for benchmark purposes and should be discarded if the intention is to have a realistic predictive model.

# other attributes:

12 - campaign:
        number of contacts performed during this campaign and for this client (numeric, includes last contact)

13 - pdays:
        number of days that passed by after the client was last contacted from a previous campaign (numeric; 999 means client was not previously contacted)

14 - previous:
        number of contacts performed before this campaign and for this client (numeric)

15 - poutcome:
        outcome of the previous marketing campaign (categorical: "failure","nonexistent","success")

# social and economic context attributes

16 - emp.var.rate: employment variation rate - quarterly indicator (numeric) :
        Measures how employment levels have changed compared to the previous quarter.

17 - cons.price.idx: consumer price index - monthly indicator (numeric) :
        Measures the average change in prices of a fixed basket of goods and services (inflation).

18 - cons.conf.idx: consumer confidence index - monthly indicator (numeric) :
        Reflects how optimistic or pessimistic consumers are about the economy.

19 - euribor3m: euribor 3 month rate - daily indicator (numeric) :
        Euro Interbank Offered Rate for loans with a 3-month maturity
        Basically, the average interest rate at which major European banks lend money to each other for 3 months

20 - nr.employed: number of employees - quarterly indicator (numeric)

Output variable (desired target):

21 - y - has the client subscribed a term deposit? (binary: "yes","no")

Missing Attribute Values: There are several missing values in some categorical attributes, all coded with the "unknown" label. These missing values can be treated as a possible class label or using deletion or imputation techniques. 
"""

' \nInput variables:\n\nBank client data:\n\n1 - age (numeric)\n2 - job : type of job (categorical: "admin.","unknown","unemployed","management","housemaid","entrepreneur","student","blue-collar","self-employed","retired","technician","services") \n3 - marital : marital status (categorical: "married","divorced","single"; note: "divorced" means divorced or widowed)\n4 - education (categorical: "unknown","secondary","primary","tertiary")\n5 - default: has credit in default? (binary: "yes","no")\n6 - balance: average yearly balance, in euros (numeric) \n7 - housing: has housing loan? (binary: "yes","no")\n8 - loan: has personal loan? (binary: "yes","no")\n\nRelated with the last contact of the current campaign:\n\n9 - contact: contact communication type (categorical: "unknown","telephone","cellular") \n10 - day: last contact day of the month (numeric)\n11 - month: last contact month of year (categorical: "jan", "feb", "mar", ..., "nov", "dec")\n12 - duration: last contact duration, in sec

In [100]:
X = bank_full.drop(columns=['y'])
y = bank_full.y.map({"yes": 1, "no":0})

num_cols = X.select_dtypes(include=["number"]).columns.tolist()
cat_cols = X.select_dtypes(include=["object", "category"]).columns.tolist()
print(f"Numerical Features   : {num_cols}")
print(f"Categorical Features : {cat_cols}")

Numerical Features   : ['age', 'duration', 'campaign', 'pdays', 'previous', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed']
Categorical Features : ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome']


In [15]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_cols),
        ('cat', OneHotEncoder(), cat_cols)
    ])

In [None]:
def train_k_fold_grid(X_train: pd.Series,
                      y_train: pd.Series,
                      preprocessor: ColumnTransformer,
                      n_folds: int,
                      param_grids: dict,
                      models: dict,
                      scoring_metrics: dict,
                      logs: pd.DataFrame,
                      models_dir: str = "saved_models",
                      pipelines_dir: str = "saved_pipelines"):

    # Ensure the folder exists
    os.makedirs(models_dir, exist_ok=True)
    os.makedirs(pipelines_dir, exist_ok=True)

    kfold = KFold(n_splits=n_folds, shuffle=True, random_state=777)
    results_list = []

    for model_name, model in tqdm(models.items(), desc="Evaluating models"):
        
        pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', model)])

        # Check if we have a parameter grid for this model
        grid_use = False
        grid_params = param_grids.get(model_name, None)
        
        start_time = time.time()

        if grid_params:
            grid_use = True
            print(f"Parameter grid for {model_name}: {grid_params}")

            grid = GridSearchCV(
                estimator=pipeline,
                param_grid=grid_params,
                scoring=scoring_metrics,
                cv=kfold,
                refit='f1',
                n_jobs=-1
            )

            grid.fit(X_train, y_train)
            estimator_pipeline = grid.best_estimator_
            estimator_model = grid.best_estimator_.named_steps['classifier']
            parameters = grid.best_params_

            precision_mean = grid.cv_results_["mean_test_precision"][0]
            precision_std = grid.cv_results_["std_test_precision"][0]
            recall_mean = grid.cv_results_["mean_test_recall"][0]
            recall_std = grid.cv_results_["std_test_recall"][0]
            f1_mean = grid.cv_results_["mean_test_f1"][0]
            f1_std = grid.cv_results_["std_test_f1"][0]
            accuracy_mean = grid.cv_results_["mean_test_accuracy"][0]
            accuracy_std = grid.cv_results_["std_test_accuracy"][0]

        else:
            cv_results = cross_validate(
                pipeline, X_train, y_train, cv=kfold,
                scoring=scoring_metrics, return_estimator=True
            )
            estimator_pipeline = cv_results['estimator']
            # Save per-fold parameters
            parameters = [est.named_steps['classifier'].get_params() for est in estimator_pipeline]
            # For non-grid CV, just pick the first fold for single model save
            estimator_model = estimator_pipeline[0].named_steps['classifier']

            precision_mean = cv_results['test_precision'].mean()
            precision_std = cv_results['test_precision'].std()
            recall_mean = cv_results['test_recall'].mean()
            recall_std = cv_results['test_recall'].std()
            f1_mean = cv_results['test_f1'].mean()
            f1_std = cv_results['test_f1'].std()
            accuracy_mean = cv_results['test_accuracy'].mean()
            accuracy_std = cv_results['test_accuracy'].std()

        compute_time = time.time() - start_time

        # Save pipeline
        timestamp = int(time.time())
        pipeline_filename = f"{model_name.replace(' ', '_')}_pipeline_{timestamp}.pkl"
        pipeline_path = os.path.join(pipelines_dir, pipeline_filename)
        joblib.dump(estimator_pipeline, pipeline_path)

        # Save model (classifier) separately
        model_filename = f"{model_name.replace(' ', '_')}_model_{timestamp}.pkl"
        model_path = os.path.join(models_dir, model_filename)
        joblib.dump(estimator_model, model_path)

        print(f"Saved pipeline at {pipeline_path}")
        print(f"Saved model at {model_path}")

        train_results = {
            'Model': model_name,
            'Folds': n_folds,
            'Grid_search': grid_use,
            'Grid_params': grid_params,
            'Precision_mean': precision_mean,
            'Precision_std': precision_std,
            'Recall_mean': recall_mean,
            'Recall_std': recall_std,
            'F1_mean': f1_mean,
            'F1_std': f1_std,
            'Accuracy_mean': accuracy_mean,
            'Accuracy_std': accuracy_std,
            'Time': compute_time,
            'Pipeline_file': pipeline_path,
            'Model_file': model_path,
            'Params_models': parameters
        }

        results_list.append(train_results)

        # Print results for this model
        print()
        for k, v in list(train_results.items())[:-3]:  # skip pipeline file, model file, params_models
            print(f" {k} : {v}")
        print("\n#####################################################################\n")

    # Update logs
    logs = pd.concat([logs, pd.DataFrame(results_list)], ignore_index=True)
    logs.to_csv(os.path.join(os.getcwd(), 'data', 'logs.csv'), index=False)

    return logs


In [68]:
metrics = {
    'precision': make_scorer(precision_score),
    'recall': make_scorer(recall_score),
    'f1': make_scorer(f1_score),
    'accuracy': make_scorer(accuracy_score)
}

In [66]:
param_grids = {
    'Logistic Regression': {
        'classifier__C': [0.01, 0.1, 1, 10],
        'classifier__penalty': ['l2'],  # 'l1' requires solver='liblinear' or 'saga'
        'classifier__solver': ['lbfgs', 'liblinear']
    },

    'Random Forest': {
        'classifier__n_estimators': [50, 100, 200],
        'classifier__max_depth': [None, 5, 10],
        'classifier__min_samples_split': [2, 5],
        'classifier__min_samples_leaf': [1, 2]
    },

    'Gradient Boosting': {
        'classifier__n_estimators': [50, 100, 200],
        'classifier__learning_rate': [0.01, 0.1, 0.2],
        'classifier__max_depth': [3, 5, 7],
        'classifier__subsample': [0.8, 1.0]
    },

    'XGBoost': {
        'classifier__n_estimators': [50, 100],
        'classifier__max_depth': [3, 5],
        'classifier__learning_rate': [0.01, 0.1]
    },

    'CatBoost': {
        'classifier__iterations': [100, 200],
        'classifier__depth': [3, 5],
        'classifier__learning_rate': [0.01, 0.1]
    }
}

In [77]:
models = {
    'Logistic Regression': LogisticRegression(),
    # 'Random Forest': RandomForestClassifier(),
    # 'Gradient Boosting': GradientBoostingClassifier(),
    'XGBoost': XGBClassifier()
    # 'CatBoost': CatBoostClassifier(verbose=0)
    # 'k-Nearest Neighbors': KNeighborsClassifier(),
    # 'Support Vector Machine': SVC()
}

In [99]:
train_k_fold_grid(X_train=X,
                  y_train=y,
                  preprocessor=preprocessor,
                  n_folds=5,
                  param_grids=param_grids,
                  models=models,
                  scoring_metrics=metrics,
                  logs=pd.read_csv(os.path.join(cwd, 'data', 'logs.csv')))

pd.read_csv(os.path.join(cwd, 'data', 'logs.csv'))

Evaluating models:   0%|          | 0/2 [00:00<?, ?it/s]

Parameter grid for Logistic Regression: {'classifier__C': [0.01, 0.1, 1, 10], 'classifier__penalty': ['l2'], 'classifier__solver': ['lbfgs', 'liblinear']}


Evaluating models:  50%|█████     | 1/2 [00:09<00:09,  9.34s/it]

{'classifier__C': 10, 'classifier__penalty': 'l2', 'classifier__solver': 'lbfgs'}
Saved pipeline at saved_pipelines\Logistic_Regression_pipeline_1762428372.pkl
Saved model at saved_models\Logistic_Regression_model_1762428372.pkl

 Model : Logistic Regression
 Folds : 5
 Grid_search : True
 Grid_params : {'classifier__C': [0.01, 0.1, 1, 10], 'classifier__penalty': ['l2'], 'classifier__solver': ['lbfgs', 'liblinear']}
 Precision_mean : 0.6711497941239819
 Precision_std : 0.025301816600836274
 Recall_mean : 0.3963369048112805
 Recall_std : 0.00585357623295847
 F1_mean : 0.4982900955296142
 F1_std : 0.011165844798484757
 Accuracy_mean : 0.9101438381514046
 Accuracy_std : 0.002034434140238951
 Time : 9.323189973831177

#####################################################################

Parameter grid for XGBoost: {'classifier__n_estimators': [50, 100], 'classifier__max_depth': [3, 5], 'classifier__learning_rate': [0.01, 0.1]}


Evaluating models: 100%|██████████| 2/2 [00:22<00:00, 11.01s/it]

{'classifier__learning_rate': 0.1, 'classifier__max_depth': 5, 'classifier__n_estimators': 100}
Saved pipeline at saved_pipelines\XGBoost_pipeline_1762428385.pkl
Saved model at saved_models\XGBoost_model_1762428385.pkl

 Model : XGBoost
 Folds : 5
 Grid_search : True
 Grid_params : {'classifier__n_estimators': [50, 100], 'classifier__max_depth': [3, 5], 'classifier__learning_rate': [0.01, 0.1]}
 Precision_mean : 0.0
 Precision_std : 0.0
 Recall_mean : 0.0
 Recall_std : 0.0
 F1_mean : 0.0
 F1_std : 0.0
 Accuracy_mean : 0.8873459788011762
 Accuracy_std : 0.004413957950006394
 Time : 12.680991649627686

#####################################################################






Unnamed: 0,Model,Folds,Grid_search,Grid_params,Precision_mean,Precision_std,Recall_mean,Recall_std,F1_mean,F1_std,Accuracy_mean,Accuracy_std,Time,Params_models,Pipeline_file,Model_file
0,Logistic Regression,5,True,"{'classifier__C': [0.01, 0.1, 1, 10], 'classif...",0.67115,0.025302,0.396337,0.005854,0.49829,0.011166,0.910144,0.002034,17.656423,"{'classifier__C': 10, 'classifier__penalty': '...",saved_pipelines\Logistic_Regression_pipeline_1...,saved_models\Logistic_Regression_model_1762428...
1,XGBoost,5,True,"{'classifier__n_estimators': [50, 100], 'class...",0.0,0.0,0.0,0.0,0.0,0.0,0.887346,0.004414,13.553396,"{'classifier__learning_rate': 0.1, 'classifier...",saved_pipelines\XGBoost_pipeline_1762428162.pkl,saved_models\XGBoost_model_1762428162.pkl
2,Logistic Regression,5,True,"{'classifier__C': [0.01, 0.1, 1, 10], 'classif...",0.67115,0.025302,0.396337,0.005854,0.49829,0.011166,0.910144,0.002034,9.32319,"{'classifier__C': 10, 'classifier__penalty': '...",saved_pipelines\Logistic_Regression_pipeline_1...,saved_models\Logistic_Regression_model_1762428...
3,XGBoost,5,True,"{'classifier__n_estimators': [50, 100], 'class...",0.0,0.0,0.0,0.0,0.0,0.0,0.887346,0.004414,12.680992,"{'classifier__learning_rate': 0.1, 'classifier...",saved_pipelines\XGBoost_pipeline_1762428385.pkl,saved_models\XGBoost_model_1762428385.pkl


# Test

In [97]:
X_test = bank_test.drop(columns=["y"])
y_test = bank_test.y.map({"yes":1, "no":0})

# Choose the id and name of the model you want to test
id = "1762428162"
model_name = "XGBoost"

# Get the saved pipeline and model
model_path = os.path.join(cwd, 'saved_models', model_name+"_model_"+id+".pkl")
pipeline_path = os.path.join(cwd, 'saved_pipelines', model_name+"_pipeline_"+id+".pkl")

# Load pipeline and model
pipeline = joblib.load(pipeline_path)
model = joblib.load(model_path)

# Preprocess X_test using loaded pipeline
X_test_preprocessed = pipeline.named_steps['preprocessor'].transform(X_test)

# Test using loaded model
y_pred = model.predict(X_test_preprocessed)

# result, confusion matrix, metrics ...
f1_score(y_pred, y_test)

0.6585956416464891

# Train, Validation Split & Test set

In [122]:
X = bank_full.drop(columns=['y'])
y = bank_full.y

X_test = bank_test.drop(columns=["y"])
y_test = bank_test.y

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, stratify=y, random_state=42)

if (X_train.shape[0] == len(y_train)) & (X_val.shape[0] == len(y_val)) & (X_test.shape[0] == len(y_test)):

    print(f"X_train shape : {X_train.shape},  y_train lenght : {len(y_train)}")
    print(f"X_val shape   : {X_val.shape},   y_val lenght   : {len(y_val)}")
    print(f"X_test shape  : {X_test.shape},   y_test lenght  : {len(y_test)}")
else:
    print("ERROR - Shapes and lengths don't match.")

X_train shape : (37069, 20),  y_train lenght : 37069
X_val shape   : (4119, 20),   y_val lenght   : 4119
X_test shape  : (4119, 20),   y_test lenght  : 4119
