In [578]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px
import scipy.stats as stats
import pylab as pl

from sklearn.compose import make_column_transformer, TransformedTargetRegressor
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, plot_tree

import warnings
warnings.filterwarnings('ignore')

from sklearn import set_config
set_config(display="diagram")

from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_auc_score,precision_recall_curve, roc_curve,confusion_matrix,accuracy_score, recall_score, precision_score,plot_confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import mean_squared_error

<h2 style="color:red;">Business Understanding</h2>

<p>The Business goal is  to come up with the best machine learning classification model to predict if a future client will subscribe a term deposit or not based on several independent variables such as education level, marital status, if has housing loan or not, personal loan or not, etc. The best machine learning classification model is selected by ranking up four different machine learning models: KNeighborsClassifier, Logistic Regression, Support Vector Machine, and Decision Tree by their metrics and other indicators such as the Precision-recal curve, and confusion matrix. The dataset used to train those four models is related to the marketing of bank products over the telephone as mentioned before. The analysis will be done using python & jupyter notebook.</p>

The original dataset, named "bank-full.csv," is formatted as a .csv file and comprises 17 columns with a total of 6316 rows. Among these columns, the target/independent variable is denoted as "y," and it is a categorical (nominal feature) representing whether the client has subscribed to a term deposit. Notably, this variable exhibits an imbalance, as will become evident later. In addition to "y," only two other columns, namely "age" and "balance," are numerical in nature. The column "duration" is included primarily for benchmarking purposes and should be omitted if the goal is to construct a realistic predictive model. The remaining columns in the dataset are categorical (nominal).

As a consequence, a significant portion of the dataset provided exhibits imbalance even before entering the modeling phase. It's worth mentioning that none of the columns contain "NaN" values, and no duplicates were identified. To gain a deeper understanding of this dataset, it is advisable to initiate a data preparation phase, involving data cleaning processes, as a preliminary step.

<h5>Read in the bank-full.csv file.</h5>

In [579]:
df = pd.read_csv('Downloads/bank-full.csv')

In [580]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45211 non-null  int64 
 1   job        45211 non-null  object
 2   marital    45211 non-null  object
 3   education  45211 non-null  object
 4   default    45211 non-null  object
 5   balance    45211 non-null  int64 
 6   housing    45211 non-null  object
 7   loan       45211 non-null  object
 8   contact    45211 non-null  object
 9   day        45211 non-null  int64 
 10  month      45211 non-null  object
 11  duration   45211 non-null  int64 
 12  campaign   45211 non-null  int64 
 13  pdays      45211 non-null  int64 
 14  previous   45211 non-null  int64 
 15  poutcome   45211 non-null  object
 16  y          45211 non-null  object
dtypes: int64(7), object(10)
memory usage: 5.9+ MB


In [558]:
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


<h5>Investigate the dataset for missing or problematic data.</h5>

In [413]:
df.isnull().sum()

age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

In [414]:
df.loc[df.duplicated()]

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y


<h2 style="color:red;">Data Preparation</h2>

<h3 style="color:red;">Basic Cleaning</h3>

In [581]:
df.dropna()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,technician,married,tertiary,no,825,no,no,cellular,17,nov,977,3,-1,0,unknown,yes
45207,71,retired,divorced,primary,no,1729,no,no,cellular,17,nov,456,2,-1,0,unknown,yes
45208,72,retired,married,secondary,no,5715,no,no,cellular,17,nov,1127,5,184,3,success,yes
45209,57,blue-collar,married,secondary,no,668,no,no,telephone,17,nov,508,4,-1,0,unknown,no


<h5>Feature "unknown" is contained in several diferent variables, therefore, it was renamed as shown below:</h5>

In [582]:
df['job'] = df['job'].replace({'unknown': 'unknown_job'})
df['education'] = df['education'].replace({'unknown': 'unknown_education'})
df['poutcome'] = df['poutcome'].replace({'unknown': 'unknown_poutcome'})
df['contact'] = df['contact'].replace({'unknown': 'unknown_contact'})

<p style="color:red;"> Removing values equal to -1 in the column "pdays"</p>

In [583]:
df=df.query('pdays!=-1')

<h4 style='color:red;font-size:18px'>Modelling</h4>

<h6 style='color:blue;font-size:18px'>KNeighborsClassifier</h6>

In [516]:
pipeline_knn=Pipeline([('transformer', ohe_transformer),('scale', StandardScaler()), ('knn', KNeighborsClassifier())])
pipeline_knn

<h6>Hyperparameters being tested</h6>

In [517]:
param2_knn={'knn__n_neighbors':np.array(range(1,21,2)),'knn__weights' : ['uniform', 'distance'],'knn__p':[1,2]}

In [592]:
precision_preds_grid_knn=GridSearchCV(pipeline_knn, param_grid=param2_knn,scoring='roc_auc',cv = 5)
precision_preds_grid_knn.fit(X_train,y_train)


best_score_test_knn=precision_preds_grid_knn.score(X_test,y_test)
best_score_train_knn=precision_preds_grid_knn.score(X_train,y_train)
best_params_precision_preds_knn=precision_preds_grid_knn.best_params_


print(f'roc_auc train: {best_score_train_knn: .3f}',f'roc_auc test: {best_score_test_knn: .3f}')
print("Tuned Hyperparameters KNearestNeighborsClassifier:", best_params_precision_preds_knn)

n_splits_knn  = precision_preds_grid_knn.n_splits_ 
n_iter_knn = pd.DataFrame(precision_preds_grid_knn.cv_results_).shape[0]
mean_time_knn=np.mean(precision_preds_grid_knn.cv_results_['mean_fit_time'])
print(f'Elapsed Time using GridSearchCV: {mean_time_knn * n_splits_knn * n_iter_knn: .3f}')

roc_auc train:  1.000 roc_auc test:  0.852
Tuned Hyperparameters KNearestNeighborsClassifier: {'knn__n_neighbors': 19, 'knn__p': 1, 'knn__weights': 'distance'}
Elapsed Time using GridSearchCV:  4.287


<h6 style='color:blue;font-size:18px'>Logistic Regression</h6>

In [519]:
pipeline_lgr=Pipeline([('transformer', ohe_transformer),('scale', StandardScaler()), 
                       ('log', LogisticRegression(random_state=42, class_weight='w'))])
pipeline_lgr

<h6>Hyperparameters being tested</h6>

In [520]:
param2_lgr = {'log__penalty' : ['l1','l2'],'log__C': np.logspace(-3,3,7),
              'log__solver': ['newton-cg', 'lbfgs', 'liblinear']}

In [593]:
pipeline_lgr=Pipeline([('transformer', ohe_transformer),('scale', StandardScaler()), ('log', LogisticRegression(random_state=42, class_weight='w'))])
precision_preds_grid_lgr=GridSearchCV(pipeline_lgr, param_grid=param2_lgr,scoring='roc_auc',cv = 5)
precision_preds_grid_lgr.fit(X_train,y_train)


best_score_test_lgr=precision_preds_grid_lgr.score(X_test,y_test)
best_score_train_lgr=precision_preds_grid_lgr.score(X_train,y_train)
best_params_precision_preds_lgr=precision_preds_grid_lgr.best_params_

print(f'roc_auc train: {best_score_train_lgr: .3f}',f'roc_auc test: {best_score_test_lgr: .3f}')
print("Tuned Hyperparameters Logistic Regression :", best_params_precision_preds_lgr)

n_splits_lgr  = precision_preds_grid_lgr.n_splits_ 
n_iter_lgr = pd.DataFrame(precision_preds_grid_lgr.cv_results_).shape[0]
mean_time_lgr=np.mean(precision_preds_grid_lgr.cv_results_['mean_fit_time'])
print(f'Elapsed Time using GridSearchCV: {mean_time_lgr* n_splits_lgr * n_iter_lgr: .3f}')

roc_auc train:  0.853 roc_auc test:  0.863
Tuned Hyperparameters Logistic Regression : {'log__C': 0.01, 'log__penalty': 'l2', 'log__solver': 'newton-cg'}
Elapsed Time using GridSearchCV:  6.342


<h6 style='color:blue;font-size:18px'>Support Vector Machine</h6>

In [522]:
pipeline_svc=Pipeline(steps=[('transformer', ohe_transformer),('scale', StandardScaler()),
                             ('svc', SVC(probability=True))])
pipeline_svc

<h6>Hyperparameters being tested</h6>

In [523]:
param2_svc = {'svc__kernel': ['rbf', 'poly', 'linear','sigmoid'],'svc__gamma': [0.001,0.1, 1.0,10]}

In [594]:
pipeline_svc=Pipeline(steps=[('transformer', ohe_transformer),('scale', StandardScaler()),('svc', SVC(probability=True))])
precision_preds_grid_svc=GridSearchCV(pipeline_svc, param_grid=param2_svc,scoring='roc_auc',cv=5)
precision_preds_grid_svc.fit(X_train,y_train)


best_score_test_svc=precision_preds_grid_svc.score(X_test,y_test)
best_score_train_svc=precision_preds_grid_svc.score(X_train,y_train)
best_params_precision_preds_svc=precision_preds_grid_svc.best_params_

print(f'roc_auc train: {best_score_train_svc: .3f}',f'roc_auc test: {best_score_test_svc: .3f}')
print("Tuned Hyperparameters SVC :", best_params_precision_preds_svc)

n_splits_svc  = precision_preds_grid_svc.n_splits_ 
n_iter_svc = pd.DataFrame(precision_preds_grid_svc.cv_results_).shape[0]
mean_time_svc=np.mean(precision_preds_grid_svc.cv_results_['mean_fit_time'])
print(f'Elapsed Time using GridSearchCV: {mean_time_svc * n_splits_svc * n_iter_svc: .3f}')

roc_auc train:  0.871 roc_auc test:  0.847
Tuned Hyperparameters SVC : {'svc__gamma': 0.001, 'svc__kernel': 'rbf'}
Elapsed Time using GridSearchCV:  423.447


<h6 style='color:blue;font-size:18px'>Decision Tree</h6>

In [525]:
pipeline_tree=Pipeline(steps=[('transformer', ohe_transformer),('scale', StandardScaler()),
                              ('tree', DecisionTreeClassifier(random_state=42))])
pipeline_tree

<h6>Hyperparameters being tested</h6>

In [526]:
param2_tree = {'tree__max_depth': [i for i in range(1, 10)],'tree__min_samples_split': [0.01,0.05,0.1, 0.2, 0.25],
               'tree__criterion': ['gini', 'entropy','log_loss'],'tree__min_samples_leaf': [i for i in range(1, 6)]}

In [595]:
tree_grid = GridSearchCV(pipeline_tree, param_grid=param2_tree,scoring='roc_auc',cv=5)
tree_grid.fit(X_train, y_train)


best_score_test_tree=tree_grid.score(X_test,y_test)
best_score_train_tree=tree_grid.score(X_train,y_train)
best_params_precision_preds_tree=tree_grid.best_params_

print(f'roc_auc train: {best_score_train_svc: .3f}',f'roc_auc test: {best_score_test_svc: .3f}')
print("Tuned Hyperparameters Decision Tree :", best_params_precision_preds_tree)

n_splits_tree  = tree_grid.n_splits_ 
n_iter_tree = pd.DataFrame(tree_grid.cv_results_).shape[0]
mean_time_tree=np.mean(tree_grid.cv_results_['mean_fit_time'])
print(f'Elapsed Time using GridSearchCV: {mean_time_tree * n_splits_tree * n_iter_tree: .3f}')

roc_auc train:  0.871 roc_auc test:  0.847
Tuned Hyperparameters Decision Tree : {'tree__criterion': 'gini', 'tree__max_depth': 9, 'tree__min_samples_leaf': 5, 'tree__min_samples_split': 0.05}
Elapsed Time using GridSearchCV:  93.143


<h2 style="color:red;">Main Conclusions & Recomendations<h2>

Following the data cleaning process, removal of outliers, and the conversion of categorical data into binary (0 and 1) values, the finalized dataset comprises 77 columns and 6316 rows. The target column "y" represents whether a prospective client will subscribe to a term deposit or not.

In the overall evaluation, the Logistic Regression model emerges as the best classification model for the dataset under examination in this study. Nevertheless, the other three models, particularly the Decision Tree and KNeighborsClassifier, perform reasonably well.

Notably, before commencing the regression modeling, the numerical variable "balance" was scaled by dividing it by 100. This adjustment was made because the majority of the columns predominantly contain values of 0 and 1, contributing to improvements in the model's metrics.

Five numerical independent variables were included in the analysis, namely 'age,' 'balance,' 'duration,' 'previous,' 'campaign,' and 'pdays.' The 'duration' variable was utilized solely for benchmarking purposes and was subsequently excluded from the modeling phase. The remaining variables, initially nominal, were converted into binary values (0 and 1). Consequently, a significant portion of the independent variables employed in the modeling phase assumes binary values.

The metric employed for optimizing the parameters of each model was 'roc_auc,' representing the area under the ROC curve. This metric is particularly suitable for handling imbalanced data, as is the case in this dataset.

In addition to 'roc_auc,' the precision-recall curve was selected as an indicator, especially effective for moderate to severely imbalanced data, which accurately characterizes the dataset used in this analysis.

Across all categories, including the client's age, job, marital status, education level, housing and personal loans, and communication methods, clients who subscribed to term deposits tend to maintain higher balances in their accounts.

In [None]:
y_score_knn = precision_preds_grid_knn.predict_proba(X_test)[:, 1]
y_score_lgr = precision_preds_grid_lgr.predict_proba(X_test)[:, 1]
y_score_svc = precision_preds_grid_svc.predict_proba(X_test)[:, 1]
y_score_tree = tree_grid.predict_proba(X_test)[:, 1]
precision_knn, recall_knn, thresholds_knn = precision_recall_curve(y_test, y_score_knn,pos_label="yes")
precision_lgr, recall_lgr, thresholds_lgr = precision_recall_curve(y_test, y_score_lgr,pos_label="yes")
precision_svc, recall_svc, thresholds_svc = precision_recall_curve(y_test, y_score_svc,pos_label="yes")
precision_tree, recall_tree, thresholds_tree = precision_recall_curve(y_test, y_score_tree,pos_label="yes")
fig, ax = plt.subplots()
ax.plot(recall_knn, precision_knn, color='purple',label='knn')
ax.plot(recall_lgr, precision_lgr, color='red',label='Logistic Regression')
ax.plot(recall_svc, precision_svc, color='blue',label='SVC')
ax.plot(recall_tree, precision_tree, color='green',label='Decision Tree')

#add axis labels to plot
ax.set_title('Precision-Recall Curve')
ax.set_ylabel('Precision')
ax.set_xlabel('Recall')
plt.legend(loc='best')

#display plot:
