We will use the classification models of XGBoost to predict the profile of target customers.  
To proceed with the machine learning, we have to convert the categorical variables into numerical variables first using one hot encoding.

In [1]:
%pip install xgboost
%pip install imbalanced-learn

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

import xgboost as xgb

from imblearn.over_sampling import SMOTE
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split 
from sklearn.metrics import accuracy_score 
from sklearn.metrics import confusion_matrix
from imblearn.pipeline import make_pipeline as make_imb_pipeline

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

In [3]:
customerdata = pd.read_csv('customer_bank_marketing_data.csv')
customerdata

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,deposit_num
0,59,administrative,married,secondary,no,2343,yes,no,1
1,56,administrative,married,secondary,no,45,no,no,1
2,41,technician,married,secondary,no,1270,yes,no,1
3,55,services,married,secondary,no,2476,yes,no,1
4,54,administrative,married,tertiary,no,184,no,no,1
...,...,...,...,...,...,...,...,...,...
45202,59,management,married,tertiary,no,138,yes,yes,0
45203,37,management,married,tertiary,no,1428,no,no,0
45204,34,blue-collar,single,secondary,no,1475,yes,no,0
45205,57,blue-collar,married,secondary,no,668,no,no,0


# One Hot Encoding

In [4]:
#all categorical variables are converted into dummy variables using the pd.get_dummies function.
#each new added columns represents one category and contains 1 if the category is present.

catcols = ['job','marital','education','default','housing','loan']
for var in catcols:
    customerdata = pd.concat([customerdata.drop(var, axis=1), pd.get_dummies(customerdata[var], prefix=var, prefix_sep='_', drop_first=True, dummy_na=False, dtype=int)], axis=1)
customerdata   

Unnamed: 0,age,balance,deposit_num,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,...,job_unemployed,job_unknown,marital_married,marital_single,education_secondary,education_tertiary,education_unknown,default_yes,housing_yes,loan_yes
0,59,2343,1,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,1,0
1,56,45,1,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
2,41,1270,1,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,1,0
3,55,2476,1,0,0,0,0,0,0,1,...,0,0,1,0,1,0,0,0,1,0
4,54,184,1,0,0,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45202,59,138,0,0,0,0,1,0,0,0,...,0,0,1,0,0,1,0,0,1,1
45203,37,1428,0,0,0,0,1,0,0,0,...,0,0,1,0,0,1,0,0,0,0
45204,34,1475,0,1,0,0,0,0,0,0,...,0,0,0,1,1,0,0,0,1,0
45205,57,668,0,1,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0


# XGBoost Model Training

In [5]:
#the dataframe x will contain all the independent variables
#the dataframe y will only contain the dependent variable -> deposit_num

x = customerdata.drop(columns = 'deposit_num')
y = customerdata[['deposit_num']]
customerdata

Unnamed: 0,age,balance,deposit_num,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,...,job_unemployed,job_unknown,marital_married,marital_single,education_secondary,education_tertiary,education_unknown,default_yes,housing_yes,loan_yes
0,59,2343,1,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,1,0
1,56,45,1,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
2,41,1270,1,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,1,0
3,55,2476,1,0,0,0,0,0,0,1,...,0,0,1,0,1,0,0,0,1,0
4,54,184,1,0,0,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45202,59,138,0,0,0,0,1,0,0,0,...,0,0,1,0,0,1,0,0,1,1
45203,37,1428,0,0,0,0,1,0,0,0,...,0,0,1,0,0,1,0,0,0,0
45204,34,1475,0,1,0,0,0,0,0,0,...,0,0,0,1,1,0,0,0,1,0
45205,57,668,0,1,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0


In [6]:
#the test size is set to 30% of the data
#random state is set to ensure the reproducibility of results.
#it allows us to obtain the same random behaviour everytime we run the code

test_size = 0.3
rand_state = 40

In [None]:
from sklearn.metrics import classification_report, f1_score, roc_auc_score
from sklearn.model_selection import cross_val_score, RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from scipy.stats import randint, uniform

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=test_size, random_state=rand_state)

# Define the imbalanced-learn pipeline
pipeline = make_imb_pipeline(
    SMOTE(sampling_strategy='auto', random_state=42),  #Apply SMOTE for oversampling
    xgb.XGBClassifier(n_estimators=100, learning_rate=0.08, gamma=0, subsample=1.0, colsample_bytree=1, max_depth=7)
)

# Train the model using the pipeline
pipeline.fit(x_train, y_train)

# Make predictions on testing set
y_test_pred = pipeline.predict(x_test)

# Evaluate the model
print('Initial model performance:')
print('F1 Score:', f1_score(y_test, y_test_pred))
print('Accuracy:', accuracy_score(y_test, y_test_pred))
print('AUC Score:', roc_auc_score(y_test, y_test_pred))

# Define the parameter grid for GridSearchCV
param_grid = {
    'xgbclassifier__n_estimators': [100, 200, 300],
    'xgbclassifier__max_depth': [5, 10, 15],
    'xgbclassifier__learning_rate': [0.05, 0.1, 0.2],
    'xgbclassifier__subsample': uniform(0.6, 0.4).rvs(size=10).tolist(),  # Tuning subsample
    'xgbclassifier__colsample_bytree': uniform(0.6, 0.4).rvs(size=10).tolist()  # Tuning colsample_bytree
}

# Perform GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='f1')
grid_search.fit(x_train, y_train)

best_params = grid_search.best_params_

# Retrain the model with the best parameters
pipeline.set_params(**best_params)
pipeline.fit(x_train, y_train)

# Make predictions on testing set
y_test_pred_tuned = pipeline.predict(x_test)

# Evaluate the tuned model
print()
print('Tuned model performance:')
print('F1 Score:', f1_score(y_test, y_test_pred_tuned))
print('Accuracy:', accuracy_score(y_test, y_test_pred_tuned))
print('AUC Score:', roc_auc_score(y_test, y_test_pred_tuned))

Initial model performance:
F1 Score: 0.27489564698867025
Accuracy: 0.8206886382068864
AUC Score: 0.5878114774729424


We are using SMOTE oversampling here to handle class imblanace and improve the performance of the model as the class where deposit=1 is significantly underrepresented in the data.

n_estimators - represent the number of trees to build. Higher values allow the model to learn more complex patterns but can increase the risk of overfitting

learning_rate - controls the step size at each iteration while finding minimum cost function. Lower value make the model training more robust

gamma - specify the minimum loss reduction required to make a further partition on a leaf node.

subsample - determine the fraction of samples to be used for training each tree

colsample_bytree - determines the fraction of columns to be randomly sampled and used to build each tree

max_depth - set the maximum depth of the tree

In [None]:
cm = confusion_matrix(y_test, y_test_pred)
# Extracting values from confusion matrix
tn = cm[0, 0]
fp = cm[0, 1]
fn = cm[1, 0]
tp = cm[1, 1]

# Calculate rates
tpr = tp / (tp + fn)  # True Positive Rate
fpr = fp / (fp + tn)  # False Positive Rate
tnr = tn / (tn + fp)  # True Negative Rate
fnr = fn / (fn + tp)  # False Negative Rate

# Print rates
print("True Positive Rate (TPR):", tpr)
print("False Positive Rate (FPR):", fpr)
print("True Negative Rate (TNR):", tnr)
print("False Negative Rate (FNR):", fnr)

#Print Confusion Matrix
print()
print("Confusion Matrix for XGBoost: ")
sb.heatmap(cm, annot=True, fmt=".0f", annot_kws={"size":18})

In [None]:
# Get feature importances from the trained XGBoost model
feature_importances = pipeline.named_steps['xgbclassifier'].feature_importances_

# Create a DataFrame to store feature names and their importances
headers = ["name", "score"]
values = sorted(zip(x_train.columns, feature_importances), key=lambda x: x[1], reverse=True)
xgb_feature_importances = pd.DataFrame(values, columns=headers)

# Plot the feature importances
x_pos = np.arange(0, len(xgb_feature_importances))
plt.bar(x_pos, xgb_feature_importances['score'])
plt.xticks(x_pos, xgb_feature_importances['name'])
plt.xticks(rotation=90)
plt.title('Feature Importances (XGBoost)')
plt.show()

From the graph, we can see that the top 3 variables that we can used to estimate our target customers would be job_housemaid, housing_yes, loan_yes

# Conclusion from XGBoost

From the machine leraning model of XGBoost:

Initial model performance:  
F1 Score: 0.27489564698867025  
Accuracy: 0.8206886382068864  
AUC Score: 0.5878114774729424  

Tuned model performance:  
F1 Score: 0.3194530764449969   
Accuracy: 0.8385312983853129  
AUC Score: 0.6118592163772129  

True Positive Rate (TPR): 0.28041362530413627  
False Positive Rate (FPR): 0.10479067035825153  
True Negative Rate (TNR): 0.8952093296417485   
False Negative Rate (FNR): 0.7195863746958637   

The XGBoost model shows improvements in these performance metrics after tuning. However, the F1 score is still low mainly due to the highly imbalanced classes in the dataset, meaning the class wehre deposit=0 significantly outnumbers the class deposit=1. Hence, this model may struggle to correctly identify the minority class. 