## 1. Import dataset

In [None]:
import pandas as pd
# Read the BankChurners CSV from the Kaggle input directory and preview the first rows
DATA_PATH = "/kaggle/input/credit-card-customers/BankChurners.csv"
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Drop the two columns whose names start with 'Naive_Bayes_Classifier_'.
# errors='ignore' keeps this cell re-runnable: executing it a second time (when the
# columns are already gone) is a no-op instead of raising KeyError.
df = df.drop(columns = ['Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1',
                        'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2'],
             errors='ignore')

In [None]:
# Print the frame's (rows, columns) shape, then preview the first rows
print(f"Rows, Columns: {df.shape}\n")
df.head()

In [None]:
# Dataframe information: summary of the columns printed by DataFrame.info()
df.info()

## 2. EDA : Exploratory data analysis

In [None]:
# Import necessary packages
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Count the null entries in every column
df_ms = df.isna().sum()

# Report the per-column totals
print(f"Total Missing value for each column:\n\n{df_ms}")

In [None]:
# Check unique and duplicated values: distinct CLIENTNUM count, repeated CLIENTNUM
# entries, and fully duplicated rows
n_unique_ids = len(df['CLIENTNUM'].unique())
n_duplicated_ids = df['CLIENTNUM'].duplicated().sum()
n_duplicated_rows = df.duplicated().sum()

print(f"Check unique of CLIENTNUM = {n_unique_ids}")
print(f"Check duplicated of CLIENTNUM = {n_duplicated_ids}")
print(f"Check duplicated of dataframe = {n_duplicated_rows}")

In [None]:
# Drop CLIENTNUM column.
# Reassign instead of mutating in place, and tolerate an already-missing column, so
# that re-running this cell does not raise KeyError.
df = df.drop(columns=['CLIENTNUM'], errors='ignore')

In [None]:
# Categorical and numerical columns: object-typed columns are treated as categorical,
# every other column as numerical (original column order is kept in both lists)
cat_col, num_col = [], []
for column_name in df.columns:
    bucket = cat_col if df[column_name].dtype == 'object' else num_col
    bucket.append(column_name)

print(f"cat_col = {len(cat_col)}\nnum_col = {len(num_col)}\n")
print(f"Categorical col: {cat_col}\nNumerical col: {num_col}")

In [None]:
# Explore Attrition_Flag value counts (number of rows per distinct flag value)
df['Attrition_Flag'].value_counts()

In [None]:
# Pie chart of the share of rows per Attrition_Flag value
counts_att_flag = df['Attrition_Flag'].value_counts()

pie_options = dict(autopct='%1.2f%%', startangle=30, explode=[0, 0.02])

plt.figure(figsize=(9, 4))
plt.pie(counts_att_flag, labels=counts_att_flag.index, **pie_options)
plt.title('Attrition Flag')

plt.show()

In [None]:
# Explore categorical columns: one count plot per column, bars split by Attrition_Flag
fig1, axarr1 = plt.subplots(2, 3, figsize=(18, 9))
plt.subplots_adjust(hspace=0.8, wspace=0.3)

# Display title for each categorical column, in the same order as cat_col
title_cat = ['Attrition Flag', 'Gender', 'Education Level', 'Marital Status', 'Income Category', 'Card Category']

# Pin the hue order: the legend below is relabelled positionally, so the order of the
# hue levels must not depend on which value happens to appear first in the data.
hue_order = ['Existing Customer', 'Attrited Customer']

# Generate the count plots, one axes per categorical column
for ax, col, title in zip(axarr1.flat, cat_col, title_cat):
    sns.countplot(data=df, x=col, hue='Attrition_Flag', hue_order=hue_order, ax=ax)
    ax.set_xlabel(col.replace('_', ' '))
    ax.set_ylabel('Count')
    # Rotate via tick_params; set_xticklabels without fixed tick positions triggers a
    # matplotlib warning.
    ax.tick_params(axis='x', labelrotation=90)
    ax.set_title(title)
    ax.legend(['Existing', 'Attrited'])  # same order as hue_order

plt.show()

In [None]:
# Report every column that holds at least one literal 'Unknown' entry
for col in df.columns:
    unknown_var = df[col].eq('Unknown').any()
    if not unknown_var:
        continue
    print(f"{col} has an unknown value: {unknown_var}")

In [None]:
# Attrition_Flag breakdown of the rows whose Education_Level is 'Unknown'
qr_edl = df['Education_Level'].eq('Unknown')
educ_lv = df.loc[qr_edl, 'Attrition_Flag'].value_counts()
print(f"Education_Level (Unknown):\n{educ_lv}")

In [None]:
# Attrition_Flag breakdown of the rows whose Marital_Status is 'Unknown'
qr_mrs = df['Marital_Status'].eq('Unknown')
mar_sta = df.loc[qr_mrs, 'Attrition_Flag'].value_counts()
print(f"Marital Status (Unknown):\n{mar_sta}")

In [None]:
# Attrition_Flag breakdown of the rows whose Income_Category is 'Unknown'
qr_icc = df['Income_Category'].eq('Unknown')
inc_cat = df.loc[qr_icc, 'Attrition_Flag'].value_counts()
print(f"Income Category (Unknown):\n{inc_cat}")

In [None]:
# Explore numerical columns: one box plot per feature, split by attrition status
fig2, axarr2 = plt.subplots(4, 4, figsize=(16, 16))
plt.subplots_adjust(hspace=0.5, wspace=0.6)

# Display title for each numerical column, in the same order as num_col
titles_num = ['Customers age in years', 'Number of dependents', 'Period of relationship with bank',
              'Total Relationship count', 'Months Inactive (12 months)', 'Number of contacts (12 months)',
              'Credit limit on the credit card', 'Total Revolving Balance', 'Open to buy credit line (Avg. 12 months)',
              'Change in transaction amount (Q4/Q1)', 'Total transaction amount', 'Total transaction count',
              'Change in transaction count (Q4/Q1)', 'Average card utilization ratio']

# Pin the category order so the short tick labels below cannot end up swapped
flag_order = ['Existing Customer', 'Attrited Customer']

# Generate the box plots, one axes per numerical column
for ax, col, title in zip(axarr2.flat, num_col, titles_num):
    sns.boxplot(data=df, x='Attrition_Flag', y=col, hue='Attrition_Flag',
                order=flag_order, hue_order=flag_order, ax=ax)
    ax.set_xticks(range(len(flag_order)))
    ax.set_xticklabels(['Existing', 'Attrited'], rotation=0)
    ax.set_xlabel('Attrition Flag')
    # The y axis carries the feature's values, not a count
    ax.set_ylabel(col.replace('_', ' '))
    ax.set_title(title)
    # hue duplicates the x axis, so the legend adds nothing; remove it if one was drawn
    if ax.get_legend() is not None:
        ax.get_legend().remove()

# Hide grid cells that have no feature to show
for unused_ax in axarr2.flat[len(num_col):]:
    unused_ax.set_visible(False)

plt.show()

In [None]:
# Descriptive statistics of the dataframe (output of DataFrame.describe())
df.describe()

**NOTE :**

*   No anomalous data were detected; the data appear to be relatively clean.
*   The outlier values in each column are valid and consistent with the meaning of that column.



## 3. Data Preprocessing

In [None]:
# Import necessary packages
import numpy as np
from sklearn.impute import SimpleImputer

In [None]:
# Review dataframe before preprocessing: print its (rows, columns) shape and preview the first rows
print(f"Rows, Columns: {df.shape}\n")
df.head()

**NOTE :**

The following columns contain `Unknown` values:
*   "Education_Level" has an unknown value
*   "Marital_Status" has an unknown value
*   "Income_Category" has an unknown value

In [None]:
# Display Education level value counts before editing
print(f"Education level (Before):\n{df['Education_Level'].value_counts()}\n")

# Replace 'Unknown' values with missing values.
# The result is assigned back to the column: calling replace(..., inplace=True) on a
# column selection is chained assignment and does not update df under pandas
# Copy-on-Write.
replace_un = {'Unknown': np.nan}
df['Education_Level'] = df['Education_Level'].replace(replace_un)

# Fill the missing values in Education_Level with the most frequent level
imp1 = SimpleImputer(strategy="most_frequent")
df['Education_Level'] = imp1.fit_transform(df[['Education_Level']]).ravel()

# Display Education level value counts after imputing with the mode
print(f"Education level (Impute with mode):\n{df['Education_Level'].value_counts()}\n")

# Ordinal encoding of the education levels.
# Applied to this column only (a frame-wide df.replace would rewrite matching values in
# every column) and cast to int so the dtype does not depend on replace() inference.
educt_lavel = {'Uneducated':0, 'High School':1, 'College':2,
               'Graduate':3, 'Post-Graduate':4, 'Doctorate':5}
df['Education_Level'] = df['Education_Level'].replace(educt_lavel).astype(int)

# Display Education level value counts after editing
print(f"Education level (After):\n{df['Education_Level'].value_counts()}")

In [None]:
# Display Marital status value counts before editing
print(f"Marital status (Before):\n{df['Marital_Status'].value_counts()}\n")

# Replace 'Unknown' values with missing values.
# The result is assigned back to the column: calling replace(..., inplace=True) on a
# column selection is chained assignment and does not update df under pandas
# Copy-on-Write.
replace_un = {'Unknown': np.nan}
df['Marital_Status'] = df['Marital_Status'].replace(replace_un)

# Fill the missing values in Marital_Status with the most frequent status
imp2 = SimpleImputer(strategy="most_frequent")
df['Marital_Status'] = imp2.fit_transform(df[['Marital_Status']]).ravel()

# Display Marital status value counts after imputing with the mode
print(f"Marital status (Impute with mode):\n{df['Marital_Status'].value_counts()}\n")

# Integer encoding of the marital status.
# Applied to this column only (a frame-wide df.replace would rewrite matching values in
# every column) and cast to int so the dtype does not depend on replace() inference.
marital_status = {'Single':0, 'Married':1, 'Divorced':2}
df['Marital_Status'] = df['Marital_Status'].replace(marital_status).astype(int)

# Display Marital status value counts after editing
print(f"Marital status (After):\n{df['Marital_Status'].value_counts()}")

In [None]:
# Display Income category value counts before editing
print(f"Income category (Before):\n{df['Income_Category'].value_counts()}\n")

# Replace 'Unknown' values with missing values.
# The result is assigned back to the column: calling replace(..., inplace=True) on a
# column selection is chained assignment and does not update df under pandas
# Copy-on-Write.
replace_un = {'Unknown': np.nan}
df['Income_Category'] = df['Income_Category'].replace(replace_un)

# Fill the missing values in Income_Category with the most frequent category
imp3 = SimpleImputer(strategy="most_frequent")
df['Income_Category'] = imp3.fit_transform(df[['Income_Category']]).ravel()

# Display Income category value counts after imputing with the mode
print(f"Income category (Impute with mode):\n{df['Income_Category'].value_counts()}\n")

# Ordinal encoding of the income brackets.
# Applied to this column only (a frame-wide df.replace would rewrite matching values in
# every column) and cast to int so the dtype does not depend on replace() inference.
income_cat = {'Less than $40K': 0, '$40K - $60K': 1, '$60K - $80K': 2,
              '$80K - $120K': 3, '$120K +': 4}
df['Income_Category'] = df['Income_Category'].replace(income_cat).astype(int)

# Display Income category value counts after editing
print(f"Income category (After):\n{df['Income_Category'].value_counts()}")

In [None]:
# Display Attrition flag value counts before editing
print(f"Attrition Flag (Before):\n{df['Attrition_Flag'].value_counts()}\n")

# Binary encoding of the target: 0 = existing customer, 1 = attrited customer.
# Assigned back to the column (an in-place replace on a column selection does not update
# df under pandas Copy-on-Write) and cast to int so the dtype is explicit.
att_flag = {'Existing Customer':0, 'Attrited Customer':1}
df['Attrition_Flag'] = df['Attrition_Flag'].replace(att_flag).astype(int)

# Display attrition flag value counts after editing
print(f"Attrition Flag (After):\n{df['Attrition_Flag'].value_counts()}")

In [None]:
# Display Gender value counts before editing
print(f"Gender (Before):\n{df['Gender'].value_counts()}\n")

# Binary encoding of gender: 0 = 'F', 1 = 'M'.
# Assigned back to the column (an in-place replace on a column selection does not update
# df under pandas Copy-on-Write) and cast to int so the dtype is explicit.
gender = {'F':0, 'M':1}
df['Gender'] = df['Gender'].replace(gender).astype(int)

# Display gender value counts after editing
print(f"Gender (After):\n{df['Gender'].value_counts()}")

In [None]:
# Display Card category value counts before editing
print(f"Card category (Before):\n{df['Card_Category'].value_counts()}\n")

# Ordinal encoding of the card tiers.
# Assigned back to the column (an in-place replace on a column selection does not update
# df under pandas Copy-on-Write) and cast to int so the dtype is explicit.
card_cat = {'Blue':0, 'Silver':1, 'Gold':2, 'Platinum':3}
df['Card_Category'] = df['Card_Category'].replace(card_cat).astype(int)

# Display Card category value counts after editing
print(f"Card category (After):\n{df['Card_Category'].value_counts()}")

In [None]:
# Dataframe information after encoding (summary printed by DataFrame.info())
df.info()

In [None]:
# Review dataframe (after editing): print its (rows, columns) shape and preview the first rows
print(f"Rows, Columns: {df.shape}\n")
df.head()

## 4. Feature Selection

In [None]:
# Import necessary packages
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
# Pairwise correlation coefficients of all columns, drawn as an annotated heatmap
corr_matrix_df = df.corr()

# Available colormap names can be listed with plt.colormaps()
heatmap_options = dict(vmin=-1, vmax=1, linewidths=0.5, cmap='rocket_r', annot=True, fmt='.2f')

plt.figure(figsize=(12, 10))
sns.heatmap(corr_matrix_df, **heatmap_options)

In [None]:
# Label column and feature matrix (every column except the label)
y = df['Attrition_Flag']
x = df.drop(columns=['Attrition_Flag'])

In [None]:
# Fit four ensemble models of depth-1 trees on all features to compare their
# feature importances
stump_settings = dict(n_estimators=250, max_depth=1, random_state=0)

model_rf = RandomForestClassifier(**stump_settings)
model_xb = XGBClassifier(learning_rate=0.1, **stump_settings)
model_et = ExtraTreesClassifier(**stump_settings)
model_gd = GradientBoostingClassifier(learning_rate=0.1, **stump_settings)

# Training model
for selector in (model_rf, model_xb, model_et):
    selector.fit(x, y)
model_gd.fit(x, y)

In [None]:
# Random forest feature importances, labelled by feature name, smallest first
fi_rf = pd.Series(data=model_rf.feature_importances_, index=x.columns)
fi_rf = fi_rf.sort_values()
fi_rf

In [None]:
# XGBoost feature importances, labelled by feature name, smallest first
fi_xb = pd.Series(data=model_xb.feature_importances_, index=x.columns)
fi_xb = fi_xb.sort_values()
fi_xb

In [None]:
# ExtraTrees feature importances, labelled by feature name, smallest first
fi_et = pd.Series(data=model_et.feature_importances_, index=x.columns)
fi_et = fi_et.sort_values()
fi_et

In [None]:
# GradientBoosting feature importances, labelled by feature name, smallest first
fi_gd = pd.Series(data=model_gd.feature_importances_, index=x.columns)
fi_gd = fi_gd.sort_values()
fi_gd

In [None]:
# Display feature importances of all four models on a 2x2 grid of horizontal bar charts
model_list = [fi_rf, fi_xb, fi_et, fi_gd]
model_names = ['Random forest', 'XGBoost', 'ExtraTrees','GradientBoosting']

fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(18, 10))

# Loop names are chosen so they do not shadow the builtin `id` or reuse `model` / `col`,
# which refer to estimators and column names elsewhere in the notebook.
for panel_idx, (importances, model_label) in enumerate(zip(model_list, model_names)):
    grid_row, grid_col = divmod(panel_idx, 2)
    ax = importances.plot(kind='barh', color='c',
                          title=f"Feature Importance ({model_label})",
                          ax=axes[grid_row, grid_col])
    # Red dashed line marks the 0.1 importance level
    ax.axvline(x=0.1, linestyle='--', color='red', linewidth=1.5)
    ax.grid(axis='x', linestyle='-')

    # Write the numeric importance at the end of each bar
    for bar_pos, importance in enumerate(importances):
        ax.text(importance, bar_pos, f'{importance:.4f}')

plt.subplots_adjust(hspace=0.2, wspace=0.4)
plt.show()

In [None]:
# Choose which model's importances drive the feature selection: keep exactly one of the
# lines below active. Features whose importance exceeds 0.1 are kept in 'df_fi'.

importance_source = fi_rf    # [Random forest] --> Best Choice!
#importance_source = fi_xb    # [XGBoost]
#importance_source = fi_et    # [ExtraTrees]
#importance_source = fi_gd    # [GradientBoosting]

selected_features = importance_source.index[importance_source > 0.1]
df_fi = df[selected_features]

# Overview df_fi dataframe
df_fi.head()

## 5. Feature Scaling

In [None]:
# Standardize the features (Z-values) using the "StandardScaler" class
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
# Carry over df_fi's index: the scaled frame is later combined with a column of df by
# index alignment, so it must keep the same row labels as the source frame.
df_fi_scaled = pd.DataFrame(sc.fit_transform(df_fi), columns=df_fi.columns, index=df_fi.index)

In [None]:
# Preview the first rows of the standardized features
df_fi_scaled.head()

In [None]:
# Display the variance of each standardized feature.
df_fi_scaled.var()

In [None]:
# Put the standardized features and the Attrition_Flag label side by side
label_frame = df[['Attrition_Flag']]
df_scaled = pd.concat([df_fi_scaled, label_frame], axis='columns')

In [None]:
# Review the first rows of df_scaled after combining features and label
df_scaled.head()

## 6. Imbalanced Data

In [None]:
# Check imbalanced data (before SMOTE): number of rows per Attrition_Flag value
df_scaled['Attrition_Flag'].value_counts()

In [None]:
# Dividing dataset into features & label.
# Features are "every column except the label" rather than the first four columns, so
# the split stays correct when the importance threshold selects a different number of
# features.
x = df_scaled.drop(columns=['Attrition_Flag'])
y = df_scaled['Attrition_Flag']

In [None]:
# Balance the classes by oversampling with SMOTE (seeded for repeatability)
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)

# Fit on the features / label and return the oversampled pair
x_resampled, y_resampled = smote.fit_resample(X=x, y=y)

In [None]:
# Recheck imbalanced data (after SMOTE): number of rows per class in the resampled label
y_resampled.value_counts()

In [None]:
# Class counts before and after SMOTE
counts_before_smote = y.value_counts()
counts_after_smote = y_resampled.value_counts()

# One pie per stage: (counts, percentage format, start angle, title)
pie_panels = [
    (counts_before_smote, '%1.2f%%', 30, 'Imbalanced Data Distribution (Before SMOTE)'),
    (counts_after_smote, '%1.1f%%', 90, 'Imbalanced Data Distribution (After SMOTE)'),
]

plt.figure(figsize=(9, 4))
for panel_no, (class_counts, pct_format, angle, panel_title) in enumerate(pie_panels, start=1):
    plt.subplot(1, 2, panel_no)
    plt.pie(class_counts, labels=class_counts.index, autopct=pct_format,
            startangle=angle, explode=[0, 0.02])
    plt.title(panel_title)

# The second subplot is still the current axes, so the legend is attached to it
plt.legend(['0 : Existing Customer', '1 : Attrited Customer'], loc=9, bbox_to_anchor=(-0.225, 0.9))

plt.tight_layout()
plt.show()

## 7. Model Development

In [None]:
# Import necessary packages
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn import metrics
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import f1_score, precision_score, recall_score

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from xgboost import XGBClassifier

from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc

In [None]:
# Divide data and class labels into training set and test set with 80:20 ratio.
#
# The split is taken from the original (not yet oversampled) x / y, and SMOTE is then
# applied to the training portion only. Oversampling before splitting lets synthetic
# points interpolated from test rows leak into the training set and fills the test set
# with synthetic samples, which inflates every score computed on it.
# stratify=y keeps the class ratio of the full data in both parts.
#
# Variable names are kept for the cells below: x_train_res / y_train_res are the
# oversampled training data, x_test_res / y_test_res are untouched real rows.
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

x_train_raw, x_test_res, y_train_raw, y_test_res = train_test_split(
    x, y, test_size=0.2, random_state=0, stratify=y)
x_train_res, y_train_res = SMOTE(random_state=42).fit_resample(x_train_raw, y_train_raw)

In [None]:
# Print the shapes of the training and test features / labels
print(f"x_train: {x_train_res.shape}\ny_train: {y_train_res.shape}\n\nx_test: {x_test_res.shape}\ny_test: {y_test_res.shape}")

### 7.1) Find the parameter for KNN


In [None]:
# Find the k parameter for the KNN model by scanning k = 1..9
k_neighbors = np.arange(1, 10)

# Train one model per k and report its accuracy on the test split
for k in k_neighbors:
    knn = KNeighborsClassifier(n_neighbors = k)
    knn.fit(x_train_res, y_train_res)
    test_score = knn.score(x_test_res, y_test_res)
    # Label with the actual k; the '%' format spec already appends the percent sign
    print(f"Accuracy (n = {k}) : {test_score:.4%}")

In [None]:
# 5-fold grid search over the KNN hyperparameters
param_knn_grid = {
    'n_neighbors': list(range(1, 10)),
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'brute', 'kd_tree', 'ball_tree'],
}

knn = KNeighborsClassifier(n_jobs=-1)
knn_grid = GridSearchCV(estimator=knn, param_grid=param_knn_grid, cv=5, n_jobs=-1)
knn_grid.fit(x_train_res, y_train_res)

# Best estimator, its parameters and its mean cross-validated score
for search_result in (knn_grid.best_estimator_, knn_grid.best_params_, knn_grid.best_score_):
    print(f"{search_result}")

### 7.2) Find the parameter for Random Forest

In [None]:
# Find parameters for the Random Forest model using a 5-fold grid search
param_ranfor_grid = {'n_estimators': [100,200,300,400,500,600],
                     'max_features': [0.3,0.5,0.7,0.9],
                     'max_depth': [None, 10, 20, 30, 40, 50]}

ranfor = RandomForestClassifier(random_state=0)
# n_jobs=-1 runs the 144 candidates x 5 folds in parallel, like the other searches in
# this notebook; with the fixed random_state the selected parameters are unchanged.
ranfor_grid = GridSearchCV(ranfor, param_ranfor_grid, cv=5, refit=True, n_jobs=-1)
ranfor_grid.fit(x_train_res, y_train_res)

print(f"{ranfor_grid.best_estimator_}\n")
print(f"{ranfor_grid.best_params_}\n")
print(f"{ranfor_grid.best_score_}")

### 7.3) Find the parameter for XGBoost

In [None]:
# 5-fold grid search over the XGBoost hyperparameters
param_xgb_grid = {
    'n_estimators': [100, 150, 200, 250, 300],
    'max_depth': [1, 3, 5, 7, 9],
    'learning_rate': [0.001, 0.01, 0.1, 1],
    'gamma': [0, 0.5, 1],
}

xgb = XGBClassifier()
xgb_grid = GridSearchCV(estimator=xgb, param_grid=param_xgb_grid, cv=5, n_jobs=-1)
xgb_grid.fit(x_train_res, y_train_res)

# Best estimator, its parameters and its mean cross-validated score
for search_result in (xgb_grid.best_estimator_, xgb_grid.best_params_):
    print(f"{search_result}\n")
print(f"{xgb_grid.best_score_}")

### 7.4) Find the parameter for ExtraTrees

In [None]:
# Find parameters for the ExtraTrees model using a 5-fold grid search
param_ext_grid = {'n_estimators': [100,150,200,250,300],
                  'max_depth': [1,3,5,7,9],
                  'bootstrap':[True, False]}

# Seed the estimator so the search returns the same winner on every run
# (the final ExtraTrees model in this notebook also uses random_state=0).
ext = ExtraTreesClassifier(random_state=0)
ext_grid = GridSearchCV(ext, param_ext_grid, cv=5, n_jobs=-1)
ext_grid.fit(x_train_res, y_train_res)

print(f"{ext_grid.best_estimator_}\n")
print(f"{ext_grid.best_params_}\n")
print(f"{ext_grid.best_score_}")

### 7.5) Find the parameter for AdaBoost

In [None]:
# Find parameters for the AdaBoost model using a 5-fold grid search
# NOTE(review): 'SAMME.R' is deprecated in recent scikit-learn releases - confirm that
# the installed version still accepts it.
param_adb_grid = {'n_estimators': [50,100,150,200,250,300],
                  'learning_rate': [0.001, 0.01, 0.1, 1],
                  'algorithm':['SAMME.R','SAMME']}

# Seed the estimator so the search is repeatable
# (the final AdaBoost model in this notebook also uses random_state=0).
adb = AdaBoostClassifier(random_state=0)
adb_grid = GridSearchCV(adb, param_adb_grid, cv=5, n_jobs=-1)
adb_grid.fit(x_train_res, y_train_res)

print(f"{adb_grid.best_estimator_}\n")
print(f"{adb_grid.best_params_}\n")
print(f"{adb_grid.best_score_}")

### 7.6) Model Building and Evaluation

In [None]:
# Candidate models as [estimator, display name] pairs, using the tuned hyperparameters.
# Every estimator that draws random numbers during fitting is seeded with
# random_state=0 so that the scores and the ranking are the same on every run.
algo = [[KNeighborsClassifier(n_neighbors=1, n_jobs=-1, algorithm='auto', weights='uniform'), 'KNeighborsClassifier'],
        [LogisticRegression(solver='lbfgs'), 'LogisticRegression'],
        [Perceptron(random_state=0), 'Perceptron'],
        [DecisionTreeClassifier(min_samples_split=10, random_state=0), 'DecisionTreeClassifier'],
        [GradientBoostingClassifier(n_estimators=300, learning_rate = 0.1, random_state = 0), 'GradientBoostingClassifier'],
        [RandomForestClassifier(n_estimators=200, max_features=0.7, max_depth=None, random_state=0), 'RandomForestClassifier'],
        [BaggingClassifier(n_jobs=-1, random_state=0), 'BaggingClassifier'],
        [AdaBoostClassifier(n_estimators=300, algorithm='SAMME.R', learning_rate=1, random_state=0), 'AdaBoostClassifier'],
        [ExtraTreesClassifier(n_estimators=250, max_depth=9, bootstrap=True, random_state = 0), 'ExtraTreesClassifier'],
        [GaussianNB(), 'GaussianNB'],
        [MLPClassifier(random_state=0), 'MLPClassifier'],
        [SVC(kernel='linear'), 'SVC_linear'],
        [XGBClassifier(n_estimators=300, gamma= 0, learning_rate=0.1, max_depth=9, random_state=0), 'XGBoostClassifier'],
        [GaussianProcessClassifier(kernel=None, n_jobs=-1), 'GaussianProcessClassifier']]

In [None]:
# Model evaluation: fit every candidate, then report accuracy, confusion matrix and
# classification report on the test split. model_score collects [accuracy, name] pairs.
model_score = []

for a in algo:
    model = a[0]
    model.fit(x_train_res, y_train_res)
    # Predict once and derive the accuracy from those predictions; calling
    # model.score() as well would run inference on the test set a second time.
    y_pred = model.predict(x_test_res)
    score = accuracy_score(y_test_res, y_pred)
    model_score.append([score, a[1]])
    print(f"{a[1]:20} score: {score:.04f}")
    print(confusion_matrix(y_test_res, y_pred))
    print(classification_report(y_test_res, y_pred))
    print('-' * 100)

print(model_score)
# Pairs compare by score first, so max() yields the best-scoring model
print(f"\nBest score = {max(model_score)}")

In [None]:
# The accuracy score for all models, as a list of [score, name] pairs
model_score

In [None]:
# Create models score dataframe from the [score, name] pairs
models_score = pd.DataFrame(model_score, columns=['Score', 'Classifier'])

# Create column "Ranked" (1 = best score).
# method='min' gives tied models the same, best shared rank; the default average rank
# can be fractional (e.g. 1.5) and would be truncated by the int cast.
models_score['Ranked'] = models_score['Score'].rank(ascending=False, method='min').astype(int)

# Reorder columns in a new DataFrame
models_score = models_score.reindex(columns=['Ranked', 'Score', 'Classifier'])

# Display the ranking results of the model.
ranked_model = models_score.sort_values('Score', ascending=False)
ranked_model

In [None]:
# Horizontal bar chart of the accuracy ranking, one bar per classifier
import plotly.express as px

# Bar annotation: raw score followed by the same value as a percentage
score_text = ranked_model['Score'].apply(lambda x: f'{x:.4f} ({x:.2%})')
axis_labels = {'Score': 'Accuracy score', 'Classifier':'Classification Models'}

fig = px.bar(ranked_model, x='Score', y='Classifier', color='Classifier',
             text=score_text, labels=axis_labels,
             height=500, width=1100, template='plotly')

chart_title = dict(text='Visualization of the ranking results of the model.', x=0.5,
                   xanchor='center', yanchor='top')
fig.update_layout(title=chart_title)
fig.show()

### 7.7) Receiver Operating Characteristic (ROC)

In [None]:
# Receiver Operating Characteristic (ROC)
# Generate models: one single-entry {label: estimator} dict per model. Settings mirror
# the tuned models evaluated above (including the tuned RandomForest), and estimators
# that draw random numbers are seeded so the curves are repeatable.
algo_dict = [{'KNN': KNeighborsClassifier(n_neighbors=1, n_jobs=-1, algorithm='auto', weights='uniform')},
             {'Logistic R.': LogisticRegression(solver='lbfgs')},
             {'Perceptron': Perceptron(random_state=0)},
             {'DecisionTree': DecisionTreeClassifier(min_samples_split=10, random_state=0)},
             {'GradientBoosting': GradientBoostingClassifier(n_estimators=300, learning_rate = 0.1, random_state = 0)},
             {'RandomForest': RandomForestClassifier(n_estimators=200, max_features=0.7, max_depth=None, random_state=0)},
             {'Bagging': BaggingClassifier(n_jobs=-1, random_state=0)},
             {'AdaBoost': AdaBoostClassifier(n_estimators=300, algorithm='SAMME.R', learning_rate=1, random_state=0)},
             {'ExtraTrees': ExtraTreesClassifier(n_estimators=250, max_depth=9, bootstrap=True, random_state = 0)},
             {'NaiveBayes': GaussianNB()},
             {'MLPC': MLPClassifier(random_state=0)},
             {'XGBoost': XGBClassifier(n_estimators=300, gamma= 0, learning_rate=0.1, max_depth=9, random_state=0)},
             {'SVC': SVC(kernel='linear', probability=True, random_state=0)},
             {'GaussianProcess': GaussianProcessClassifier(kernel=None, n_jobs=-1)}]

# Plot ROC curve for each model
plt.figure(figsize=(9.5, 8))
for model_dict in algo_dict:
    model_name, model = next(iter(model_dict.items()))

    # Prefer class-1 probabilities; fall back to decision_function scores (which
    # roc_curve also accepts) so models without predict_proba still get a curve.
    if hasattr(model, 'predict_proba'):
        use_proba = True
    elif hasattr(model, 'decision_function'):
        use_proba = False
    else:
        continue  # no continuous score available for this model

    model.fit(x_train_res, y_train_res)
    if use_proba:
        y_pred_prob = model.predict_proba(x_test_res)[:, 1]
    else:
        y_pred_prob = model.decision_function(x_test_res)

    fpr, tpr, thresholds = roc_curve(y_test_res, y_pred_prob)
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, lw=1.5, label='%s (AUC = %0.2f)' % (model_name, roc_auc))

# Diagonal reference line of a random classifier
plt.plot([0, 1], [0, 1], color='gray', lw=1.5, linestyle='--', label='Random Guess')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC)')
plt.legend(loc="lower right")
plt.show()

### 7.8) Apply the model with the highest evaluation

In [None]:
# Build the XGBoost model with the tuned hyperparameters
xgb_params = dict(n_estimators=300, gamma=0, learning_rate=0.1, max_depth=9, random_state=0)
xgb_model = XGBClassifier(**xgb_params)

# Train on the training split
xgb_model.fit(x_train_res, y_train_res)

# Predict the labels of the test split
y_pred_xgb = xgb_model.predict(x_test_res)

# Confusion matrix of actual vs. predicted labels
conf = confusion_matrix(y_test_res, y_pred_xgb)
print(f"Confusion matrix:\n\n{conf}\n")

# Annotation for each cell of the 2x2 matrix: name, count and share of all test rows
group_names = ['True Negative', 'False Positive','False Negative','True Positive']
categories = ['0 : Existing', '1 : Attrited']

cell_counts = conf.flatten()
group_counts = [f"{count:0.0f}" for count in cell_counts]
group_percentages = [f"{share:.2%}" for share in cell_counts / np.sum(conf)]
labels = np.array([
    f"{cell_name}\n{cell_count}\n{cell_share}"
    for cell_name, cell_count, cell_share in zip(group_names, group_counts, group_percentages)
]).reshape(2, 2)

sns.heatmap(conf, annot=labels, fmt='', cmap='Blues',
            xticklabels=categories, yticklabels=categories)

# Summary metrics shown underneath the x-axis label
f1 = f1_score(y_test_res, y_pred_xgb)
recall = recall_score(y_test_res, y_pred_xgb)
precision = precision_score(y_test_res, y_pred_xgb)
accuracy = accuracy_score(y_test_res, y_pred_xgb)

metric_lines = [
    f"F1 score : {f1:.3f}",
    f"Recall score : {recall:.4f}",
    f"Precision score : {precision:.4f}",
    f"Accuracy score : {accuracy:.4f}",
]
plt.xlabel("Predicted Label\n\n" + "\n".join(metric_lines))

plt.ylabel('Actual Label')
plt.title('Confusion Matrix')
plt.show()

# Display classification report
conf_rp = classification_report(y_test_res, y_pred_xgb)
print("\nClassification Report:\n")
print(f"{conf_rp}\n")

# Display accuracy score of the XGBoost model
acs = accuracy_score(y_test_res, y_pred_xgb)
print(f"Accuracy score (XGBoost model) = {acs:.4f} ({acs:.2%})")

In [None]:
# ROC curve and AUC of the fitted XGBClassifier model
# Predicted probability of the positive class (second predict_proba column)
y_pred_prob_xgb = xgb_model.predict_proba(x_test_res)[:, 1]

# ROC curve points and the area under the curve
fpr, tpr, thresholds = roc_curve(y_test_res, y_pred_prob_xgb)
roc_auc = auc(fpr, tpr)

# Draw the model curve, then the diagonal of a random classifier
line_style = dict(lw=1.5)
plt.figure(figsize=(7, 5))
plt.plot(fpr, tpr, color='blue', label='ROC curve (AUC = %0.2f)' % roc_auc, **line_style)
plt.plot([0, 1], [0, 1], color='gray', linestyle='--', label='Random Guess', **line_style)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC)')
plt.legend(loc="lower right")
plt.show()

## 8. Conclusion

XGBoost and Random Forest are the best-performing algorithms for predicting bank customer churn: they achieve the highest accuracy (95.68% and 95.32%, respectively) and near-perfect ROC-AUC scores on the test data (0.9895 and 0.9879, respectively). This indicates that both models discriminate well between the two classes.
