# BANK CUSTOMER CHURN PREDICTION

It is the dataset of a U.S. bank customer for getting the information that , this particular customer will leave bank or not.



In [None]:
!pip install -U kaleido



## STEP 1: Import data and preprocess

In [None]:
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
input_data = pd.read_csv("Churn_Modelling.csv")
input_data.head(5)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [None]:
input_data.shape

(10000, 14)

In [None]:
input_data.isnull().sum()

RowNumber          0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

In [None]:
input_data.nunique()

RowNumber          10000
CustomerId         10000
Surname             2932
CreditScore          460
Geography              3
Gender                 2
Age                   70
Tenure                11
Balance             6382
NumOfProducts          4
HasCrCard              2
IsActiveMember         2
EstimatedSalary     9999
Exited                 2
dtype: int64

Remove features irrelevant to the predictive outcome.

In [None]:
input_data = input_data.drop(["RowNumber", "CustomerId", "Surname"], axis = 1)

In [None]:
input_data.dtypes

CreditScore          int64
Geography           object
Gender              object
Age                  int64
Tenure               int64
Balance            float64
NumOfProducts        int64
HasCrCard            int64
IsActiveMember       int64
EstimatedSalary    float64
Exited               int64
dtype: object

As I can see from the array above, the continuous values are indeed of int or float type. Only the features that describe a category such as the Geography location and the gender, are of type object. Therefore, no data transformation is needed.

In [None]:
input_data.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


**Tenure:** From how many years customer is in bank

**Balance:** Average balance of customer

**Number of Products:** Number of bank product facilities customer is using



## Step 2: Exploratory data analysis (EDA)

In [None]:
# first lets create a pie chart for the categorical data only
countries = list(dict.fromkeys(input_data["Geography"]))
print(countries)

['France', 'Spain', 'Germany']


In [None]:
input_data['Geography'] = input_data['Geography'].astype('str')

In [None]:
def update_layout_plot():
    fig.update_layout(
    uniformtext_minsize=12, uniformtext_mode='hide',
    font=dict(color='white'),
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',
    title_font=dict(color="white"),
    legend_title_font=dict(color="white"),
    legend_font=dict(color="white"),
    showlegend=True,
    xaxis_title_font=dict(color="white"),
    yaxis_title_font=dict(color="white"),
    xaxis_tickfont=dict(color="white"),
    yaxis_tickfont=dict(color="white")
)

**Feature Correllation**

In [None]:
import plotly.express as px
correlation_matrix = input_data.corr()
fig = px.imshow(correlation_matrix, x=correlation_matrix.columns, y=correlation_matrix.columns, zmin=-1, zmax=1, color_continuous_scale='RdBu_r')
# update_layout_plot()
fig.show()
fig.write_image("feature_correlation.png", format="png")

  correlation_matrix = input_data.corr()


In [None]:
from collections import Counter
import plotly.express as px
import pandas as pd

def pie_chart(input_list: list, chart_title: str):
  # Count the frequency of each category
  category_counts = Counter(input_list)

  # Create a DataFrame
  df = pd.DataFrame({'Category': list(category_counts.keys()), 'Count': list(category_counts.values())})

  # Create a dynamic pie chart with Plotly Express
  fig = px.pie(df, names='Category', values='Count', title=chart_title)

  # Make the background transparent
  # fig.update_layout(
  #     paper_bgcolor='rgba(0,0,0,0)',
  #     plot_bgcolor='rgba(0,0,0,0)',
  #     title_font=dict(color="white"),  # Set title text color to white
  #     legend_title_font=dict(color="white"),  # Set legend title text color to white
  #     legend_font=dict(color="white"),  # Set legend text color to white
  #     showlegend=True  # Show the legend
  # )

  # Set the label text color to white
  fig.update_traces(textinfo='percent+label')
  fig.write_image("customers_age_piechart.png", format="png")  # Show the pie chart
  fig.show()

In [None]:
pie_chart(input_list=input_data["Geography"], chart_title="Customers & Region")


In [None]:
pie_chart(input_list=input_data["Gender"], chart_title="Customers & Gender")


In [None]:
bins = [0, 18, 30, 45, 60, 100]  # Define age range boundaries
labels = ['0-18', '19-30', '31-45', '46-60', '61-100']  # Labels for age ranges

input_data['AgeRange'] = pd.cut(input_data['Age'], bins=bins, labels=labels)

In [None]:
age_group_counts = input_data.groupby(['AgeRange', 'Exited']).size().unstack(fill_value=0)
age_group_counts

Exited,0,1
AgeRange,Unnamed: 1_level_1,Unnamed: 2_level_1
0-18,20,2
19-30,1800,146
31-45,4989,932
46-60,805,842
61-100,349,115


In [None]:
string_counts = {string: list(input_data['AgeRange']).count(string) for string in set(input_data['AgeRange'])}
string_counts

{'19-30': 1946, '61-100': 464, '0-18': 22, '46-60': 1647, '31-45': 5921}

In [None]:
pie_chart(input_list=input_data["AgeRange"], chart_title="Customers & AgeRange")
# update_layout_plot()
fig.write_image("customers_age.png", format="png")

In [None]:
def exited_feature(feature:str):
  group_counts = input_data.groupby([feature, 'Exited']).size().unstack(fill_value=0)
  group_counts['Stayed'] = (group_counts[0] / (group_counts[0] + group_counts[1])) * 100
  group_counts['Exited'] = (group_counts[1] / (group_counts[0] + group_counts[1])) * 100

  fig = px.bar(group_counts.reset_index(), x=feature, y=['Stayed', 'Exited'], title=f'Percentage of exited by {feature}')

  fig.update_traces(
      text=round(group_counts['Stayed'],2),
      textposition='inside',
      selector=dict(name='Stayed')
  )
  fig.update_traces(
      text=round(group_counts['Exited'],2),
      textposition='inside',
      selector=dict(name='Exited')
  )

  fig.update_layout(barmode='stack')
  fig.update_xaxes(title_text=feature)
  fig.update_yaxes(title_text='Percentage')

  return fig

In [None]:
fig = exited_feature(feature="AgeRange")
# update_layout_plot()
fig.show()
fig.write_image("exited_age.png", format="png")

In [None]:
import pandas as pd
import plotly.express as px


df = input_data

age_ranges = ['0-18', '19-30', '31-45', '46-60', '61-100']

probabilities_label_0 = []
probabilities_label_1 = []

for specific_age_range in age_ranges:
    age_range_mask = df['AgeRange'] == specific_age_range
    probability_label_0 = ((df['Exited'] == 0) & age_range_mask).mean()
    probability_label_1 = ((df['Exited'] == 1) & age_range_mask).mean()
    probabilities_label_0.append(probability_label_0)
    probabilities_label_1.append(probability_label_1)

label_0_df = pd.DataFrame({'AgeRange': age_ranges, 'Probability': probabilities_label_0, 'Label': 'Stayed'})
label_1_df = pd.DataFrame({'AgeRange': age_ranges, 'Probability': probabilities_label_1, 'Label': 'Exited'})


combined_df = pd.concat([label_0_df, label_1_df], ignore_index=True)

fig = px.bar(combined_df, x='AgeRange', y='Probability', color='Label', text='Probability',
             title='Probabilities of Stayed and Exited in Different Age Ranges')

fig.update_xaxes(title_text='Age Range')
fig.update_yaxes(title_text='Probability')
fig.update_traces(texttemplate='%{text:.2%}', textposition='outside')

# update_layout_plot()

fig.show()
fig.write_image("fprob_exited_age.png", format="png")

In [None]:
fig = px.box(input_data, x="Exited", y="Balance", color="Exited")

# Customize the plot
fig.update_layout(
    title="Box Plot of Balance by Exit Status",
    xaxis_title="Exited",
    yaxis_title="Balance",
    legend_title="Exited",
)

# update_layout_plot()

fig.show()
fig.write_image("exit_balance_box_plot.png", format="png")

**COMMENTS**

From the diagram above and the category of 0, which describes the customers of staying in the bank, the min value equals to Q1. This means that a significant portion of the data is concentrated within the lower range of values. In other words, there's a clustering of data points at or near the minimum value.

Also, the maximum value is significantly far away from the third quartile (Q3) in a dataset, it indicates that there is a substantial range of values, including outliers or extreme values, in the upper tail of the distribution, for both classes. Therefore, a substantial portion of the data falls within the lower 75% of the range, while a relatively small number of data points extend into the upper 25% of the range.

The boxes of two classes have some differences in their medians or quartiles, about 30k, and it confirms the correlation between "Balance" and the likelihood of a customer exiting the bank.

The boxes do not seem to have the same distribution, and that suggests a strong correlation between them.

In [None]:
# TODO number of products and balance

# fig = px.box(data_frame=input_data, x='NumOfProducts', y='Balance', title='Box Plot of Balance Grouped by Number of Products')
colors = ["red", "blue", "green", "purple"]
fig = px.box(data_frame=input_data, x='NumOfProducts', y='Balance', color='NumOfProducts',
             color_discrete_map={1: colors[0], 2: colors[1], 3: colors[2], 4: colors[3]},
             title='Box Plot of Balance Grouped by Number of Products')

# update_layout_plot()
fig.show()
fig.write_image("numofprod_balance_box_plot.png", format="png")

In [None]:
# Group the data by 'numofproducts' and calculate summary statistics
summary_stats = input_data.groupby('NumOfProducts')['Balance'].agg(['count', 'mean', 'median', 'std', 'min', 'max'])

# Rename the columns for clarity
summary_stats = summary_stats.rename(columns={'count': 'Count', 'mean': 'Mean', 'median': 'Median', 'std': 'Standard Deviation',
                                            'min': 'Min', 'max': 'Max', '25%': 'Q1', '75%': 'Q3'})

# Display the summary statistics for each 'numofproducts' category
print(summary_stats)

               Count          Mean      Median  Standard Deviation  Min  \
NumOfProducts                                                             
1               5084  98551.870614  111886.035        53234.092790  0.0   
2               4590  51879.145813       0.000        62644.474506  0.0   
3                266  75458.328195   95973.935        62924.623833  0.0   
4                 60  93733.135000  116808.195        57507.383750  0.0   

                     Max  
NumOfProducts             
1              238387.56  
2              214346.96  
3              250898.09  
4              195238.29  


## Step 3: SMOTE ANALYSIS

**Identify Minority Class**

In [None]:
value_counts = {}
for value in input_data['Exited']:
    if value in value_counts:
        value_counts[value] += 1
    else:
        value_counts[value] = 1
value_counts

{1: 2037, 0: 7963}

 There is a significant difference in the number of samples between the two classes

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

In [None]:
X = input_data.drop('Exited', axis=1)
y = input_data['Exited']

numerical_input_data = X.drop(columns=['Geography', 'Gender', 'AgeRange'], axis=1)
categ_input_data = X[['Geography','Gender']]

In [None]:
scaler = StandardScaler()
numerical_features_standardized = scaler.fit_transform(numerical_input_data)

In [None]:
encoder = LabelEncoder()
categorical_features_encoded = categ_input_data.apply(encoder.fit_transform)
categorical_features_encoded.head()

Unnamed: 0,Geography,Gender
0,0,0
1,2,0
2,0,0
3,0,0
4,2,0


In [None]:
# Combine standardized numerical and encoded categorical features
X_processed = pd.concat([pd.DataFrame(numerical_features_standardized, columns=numerical_input_data.columns),
                         categorical_features_encoded], axis=1)
X_processed.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geography,Gender
0,-0.326221,0.293517,-1.04176,-1.225848,-0.911583,0.646092,0.970243,0.021886,0,0
1,-0.440036,0.198164,-1.387538,0.11735,-0.911583,-1.547768,0.970243,0.216534,2,0
2,-1.536794,0.293517,1.032908,1.333053,2.527057,0.646092,-1.03067,0.240687,0,0
3,0.501521,0.007457,-1.387538,-1.225848,0.807737,-1.547768,-1.03067,-0.108918,0,0
4,2.063884,0.388871,-1.04176,0.785728,-0.911583,0.646092,0.970243,-0.365276,2,0


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2, random_state=42)

In [None]:
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
print(len(X_train), len(X_train_resampled))
print(len(y_train), len(y_train_resampled))

8000 12712
8000 12712


In [None]:
y_train_resampled_list = y_train_resampled.to_list()

In [None]:
value_counts = {}
for value in y_train_resampled_list:
    if value in value_counts:
        value_counts[value] += 1
    else:
        value_counts[value] = 1
value_counts

{0: 6356, 1: 6356}

## Step 4: Model Training & Evaluation

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(max_iter=1000).fit(X_train_resampled, y_train_resampled)

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score

# Train set
predictions_lr = pd.DataFrame()
predictions_lr['true'] = y_train_resampled
predictions_lr['preds'] = lr.predict(X_train_resampled)

# Test set
predictions_test_lr = pd.DataFrame()
predictions_test_lr['true'] = y_test
predictions_test_lr['preds'] = lr.predict(X_test)
predictions_test_lr['preds_over'] = lr.predict(X_test)

train_acc_lr = accuracy_score(predictions_lr.true, predictions_lr.preds)
test_acc_lr = accuracy_score(predictions_test_lr.true, predictions_test_lr.preds)
test_acc_over_lr = accuracy_score(predictions_test_lr.true, predictions_test_lr.preds_over)

y_pred_lr = lr.predict(X_test)
acc_lr = accuracy_score(y_test,y_pred_lr)
prec_lr = precision_score(y_test,y_pred_lr)
conf_mat_lr = confusion_matrix(y_test,y_pred_lr)

print(f"Train Acc (Logistic Regression): {train_acc_lr:.4f}")
print(f"Test Acc (Logistic Regression): {test_acc_lr:.4f}")
print(f"Test Acc Oversampled (Logistic Regression): {test_acc_over_lr:.4f}")

Train Acc (Logistic Regression): 0.7082
Test Acc (Logistic Regression): 0.7135
Test Acc Oversampled (Logistic Regression): 0.7135


In [None]:
confusion_df = pd.DataFrame(conf_mat_lr, index=['Actual 0 (Stayed)', 'Actual 1 (Exited)'], columns=['Predicted 0', 'Predicted 1'])

fig = px.imshow(confusion_df, labels=dict(x="Predicted", y="Actual", color="Count"),
                x=['Predicted 0', 'Predicted 1'],
                y=['Actual 0', 'Actual 1'],
                color_continuous_scale="Blues")

fig.update_xaxes(side="bottom")
fig.update_layout(title_text='Confusion Matrix', title_x=0.5)

# update_layout_plot()

fig.show()
fig.write_image("feature_correlation_matrix_lr.png", format="png")

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(max_depth=2, random_state=0)
clf = clf.fit(X_train_resampled, y_train_resampled)

In [None]:
from sklearn.metrics import accuracy_score

# Train set
predictions_rf = pd.DataFrame()
predictions_rf['true'] = y_train_resampled
predictions_rf['preds'] = clf.predict(X_train_resampled)

# Test set
predictions_test_rf = pd.DataFrame()
predictions_test_rf['true'] = y_test
predictions_test_rf['preds'] = clf.predict(X_test)
predictions_test_rf['preds_over'] = clf.predict(X_test)

train_acc_rf = accuracy_score(predictions_rf.true, predictions_rf.preds)
test_acc_rf = accuracy_score(predictions_test_rf.true, predictions_test_rf.preds)
test_acc_over_rf = accuracy_score(predictions_test_rf.true, predictions_test_rf.preds_over)

print(f"Train Acc (Logistic Regression): {train_acc_rf:.4f}")
print(f"Test Acc (Logistic Regression): {test_acc_rf:.4f}")
print(f"Test Acc Oversampled (Logistic Regression): {test_acc_over_rf:.4f}")

Train Acc (Logistic Regression): 0.7629
Test Acc (Logistic Regression): 0.7620
Test Acc Oversampled (Logistic Regression): 0.7620


In [None]:
y_pred_rf = clf.predict(X_test)
acc_rf = accuracy_score(y_test,y_pred_rf)
prec_rf = precision_score(y_test,y_pred_rf)
conf_mat_rf = confusion_matrix(y_test,y_pred_rf)

In [None]:
confusion_df = pd.DataFrame(conf_mat_rf, index=['Actual 0 (Stayed)', 'Actual 1 (Exited)'], columns=['Predicted 0', 'Predicted 1'])

fig = px.imshow(confusion_df, labels=dict(x="Predicted", y="Actual", color="Count"),
                x=['Predicted 0', 'Predicted 1'],
                y=['Actual 0', 'Actual 1'],
                color_continuous_scale="Blues")

fig.update_xaxes(side="bottom")
fig.update_layout(title_text='Confusion Matrix', title_x=0.5)

# update_layout_plot()

fig.show()
fig.write_image("feature_correlation_matrix_rf.png", format="png")

### XGBoost

In [None]:
from xgboost import XGBClassifier,XGBRegressor

model = XGBClassifier()
model = model.fit(X_train_resampled, y_train_resampled)

In [None]:
from sklearn.metrics import accuracy_score

# Train set
predictions_xgb = pd.DataFrame()
predictions_xgb['true'] = y_train_resampled
predictions_xgb['preds'] = model.predict(X_train_resampled)

# Test set
predictions_test_xgb = pd.DataFrame()
predictions_test_xgb['true'] = y_test
predictions_test_xgb['preds'] = model.predict(X_test)
predictions_test_xgb['preds_over'] = model.predict(X_test)

train_acc_xgb = accuracy_score(predictions_xgb.true, predictions_xgb.preds)
test_acc_xgb = accuracy_score(predictions_test_xgb.true, predictions_test_xgb.preds)
test_acc_over_xgb = accuracy_score(predictions_test_xgb.true, predictions_test_xgb.preds_over)

print(f"Train Acc (Logistic Regression): {train_acc_xgb:.4f}")
print(f"Test Acc (Logistic Regression): {test_acc_xgb:.4f}")
print(f"Test Acc Oversampled (Logistic Regression): {test_acc_over_xgb:.4f}")

Train Acc (Logistic Regression): 0.9607
Test Acc (Logistic Regression): 0.8535
Test Acc Oversampled (Logistic Regression): 0.8535


In [None]:
y_pred_xgb = clf.predict(X_test)
acc_xgb = accuracy_score(y_test,y_pred_xgb)
prec_xgb = precision_score(y_test,y_pred_xgb)
conf_mat_xgb = confusion_matrix(y_test,y_pred_xgb)

In [None]:
confusion_df = pd.DataFrame(conf_mat_xgb, index=['Actual 0 (Stayed)', 'Actual 1 (Exited)'], columns=['Predicted 0', 'Predicted 1'])

fig = px.imshow(confusion_df, labels=dict(x="Predicted", y="Actual", color="Count"),
                x=['Predicted 0', 'Predicted 1'],
                y=['Actual 0', 'Actual 1'],
                color_continuous_scale="Blues")

fig.update_xaxes(side="bottom")
fig.update_layout(title_text='Confusion Matrix', title_x=0.5)

# update_layout_plot()

fig.show()
fig.write_image("feature_correlation_matrix_xgb.png", format="png")

#### Hyperparameter tunning

In [None]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [None]:
# param_grid = {
#     'learning_rate': [0.01, 0.1, 0.2],
#     'n_estimators': [100, 200, 300],
#     'max_depth': [3, 4, 5],
#     'min_child_weight': [1, 2, 3],
#     'subsample': [0.8, 0.9, 1.0],
#     'colsample_bytree': [0.8, 0.9, 1.0],
# }

# xgb_classifier = xgb.XGBClassifier()

# grid_search = GridSearchCV(estimator=xgb_classifier, param_grid=param_grid, scoring='accuracy', cv=5, n_jobs=-1)
# grid_search.fit(X_train_resampled, y_train_resampled)
# best_params = grid_search.best_params_


In [None]:
# best_params

In [None]:
# best_xgb_classifier = xgb.XGBClassifier(**best_params)
# best_xgb_classifier.fit(X_train_resampled, y_train_resampled)

In [None]:
# from sklearn.metrics import accuracy_score

# # Train set
# predictions_xgb = pd.DataFrame()
# predictions_xgb['true'] = y_train_resampled
# predictions_xgb['preds'] = best_xgb_classifier.predict(X_train_resampled)

# # Test set
# predictions_test_xgb = pd.DataFrame()
# predictions_test_xgb['true'] = y_test
# predictions_test_xgb['preds'] = clf.predict(X_test)
# predictions_test_xgb['preds_over'] = best_xgb_classifier.predict(X_test)

# train_acc_xgb = accuracy_score(predictions_xgb.true, predictions_xgb.preds)
# test_acc_xgb = accuracy_score(predictions_test_xgb.true, predictions_test_xgb.preds)
# test_acc_over_xgb = accuracy_score(predictions_test_xgb.true, predictions_test_xgb.preds_over)

# print(f"Train Acc (Logistic Regression): {train_acc_xgb:.4f}")
# print(f"Test Acc (Logistic Regression): {test_acc_xgb:.4f}")
# print(f"Test Acc Oversampled (Logistic Regression): {test_acc_over_xgb:.4f}")