In [None]:
import pandas as pd

In [None]:
# Read the raw bank-churn CSV (path is relative to the notebook's working directory).
DATA_PATH = 'Bank Customer Churn Prediction.csv'
df = pd.read_csv(DATA_PATH)

## EDA

In [None]:
# Preview the first 10 rows to get a feel for the columns and value formats.
df.head(10)

In [None]:
df.shape  # 10000 rows (customers) and 12 columns (including customer_id and the churn target)

In [None]:
# List the column names available in the raw data.
df.columns

In [None]:
# Column dtypes and non-null counts in one summary.
df.info()

In [None]:
df.isnull().sum()  # per-column count of missing values: none in the raw dataset

Although the original dataset had no missing values,
I simulated missing data to reflect real-world scenarios
and to demonstrate how to handle it properly
using median imputation.

In [None]:
import numpy as np


In [None]:
# Inject artificial gaps: blank out a random 5% of rows in each chosen column.
# Seeding NumPy's global RNG makes the sampled rows the same on every run.
np.random.seed(42)

columns_to_blank = ['age', 'balance', 'estimated_salary']
for column in columns_to_blank:
    sampled_rows = df.sample(frac=0.05).index  # a fresh 5% draw for each column
    df.loc[sampled_rows, column] = np.nan

In [None]:
# Confirm the simulated gaps: the three altered columns should now report missing values.
df.isnull().sum()

In [None]:
# Handle the missing values by filling them with the median of each column
# (the median is more robust than the mean for numerical data with outliers).
#
# The result is assigned back instead of calling `df[col].fillna(..., inplace=True)`:
# that form mutates a temporary Series, which raises a FutureWarning in pandas 2.x
# and leaves `df` silently unchanged once Copy-on-Write is active (pandas 3.0).
imputed_cols = ['age', 'balance', 'estimated_salary']
df[imputed_cols] = df[imputed_cols].fillna(df[imputed_cols].median())


In [None]:
df.isnull().sum()  # verify the imputation: no missing values should remain

In [None]:
df['churn'].value_counts()
# 0 --> customer stayed, 1 --> customer left
# imbalanced data (customers who stayed >> customers who left)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Bar chart of the target classes to visualise the class imbalance.
sns.countplot(data=df, x='churn')
plt.show()

### Numerical data

In [None]:
df.describe() # statistical summary of numerical columns
# --- Key Insight on Balance Column ---
# 1. Mean (77.5k) is lower than Median (97.3k).
# 2. This happens because at least 25% of customers have a 0.0 balance (Min and 25% are both 0).
# 3. This large group of zero-balance customers pulls the average (Mean) down below the Median.

As we can see here, the median (50%) is higher than the mean, which indicates a left-skewed (negatively skewed) balance distribution. The skew is driven by the large group of customers (at least 25%) who have a zero balance.

In [None]:
# One 30-bin histogram per numeric column of interest, each shown as its own figure.
for numeric_col in ('age', 'balance'):
    sns.histplot(df[numeric_col], bins=30)
    plt.show()

In [None]:
# Box plot comparing the age distribution of customers who left vs. those who stayed.
sns.boxplot(data=df, x='churn', y='age')
plt.show()

### categorical data

In [None]:
# Number of customers per country.
df['country'].value_counts()

In [None]:
# Customers per country, split by churn outcome (bar colour = churn value).
sns.countplot(data=df, x='country', hue='churn')
plt.show()
# --- Key Insight on Country vs Churn ---
# 1. France has the largest customer base among the three countries.
# 2. Germany shows a significantly higher churn rate
#    (its orange "left" bar is almost half the height of its blue "stayed" bar).
# 3. Spain has the lowest number of churned customers.
# 4. Customers in Germany are leaving at a higher percentage than those in France and Spain.

In [None]:
# Customers per gender, split by churn outcome (bar colour = churn value).
sns.countplot(data=df, x='gender', hue='churn')
plt.show()
# --- Key Insight on Gender vs Churn ---
# 1. The customer base is almost evenly split between males and females.
# 2. However, females show a slightly higher churn rate than males.
# 3. This suggests female customers may be more likely to leave the bank than male customers.

In [None]:
# Quick look at the first rows again before plotting the next categorical column.
df.head()

In [None]:
# Customers grouped by the active_member flag on the x-axis; bar colour encodes churn
# (blue = stayed, orange = left), not whether the customer is active.
sns.countplot(data=df, x='active_member', hue='churn')
plt.show()
# --- Key Insight on Active Member vs Churn ---
# 1. Active members are significantly more likely to stay with the bank.
# 2. Inactive members show a much higher churn rate.

### correlation

In [None]:
# Only numeric columns can go into the correlation heatmap, so keep the
# int64/float64 columns and leave the categorical (text) columns out.
# The categorical columns are deliberately left un-encoded during the EDA phase
# so that the visualizations and insights stay human-readable.
# (This is a readability choice; it has no bearing on data leakage.)
numeric_dtypes = ['int64', 'float64']
df_numeric = df.select_dtypes(include=numeric_dtypes)

In [None]:
# Pairwise correlations between the numeric columns, annotated with their values.
corr_matrix = df_numeric.corr()
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)
plt.show()

## Data preprocessing

In [None]:
# customer_id is only a row identifier and is not useful for prediction, so drop it.
# Reassigning with errors='ignore' keeps this cell re-runnable: the previous in-place
# drop raised KeyError when the cell was executed a second time.
df = df.drop(columns='customer_id', errors='ignore')

In [None]:
# Separate the target from the predictors:
# y is the churn label, X is every remaining column.
y = df['churn']
X = df.drop(columns='churn')

In [None]:
X.select_dtypes(include='object').columns
# Object-dtype (text) columns: these are the categorical features that need encoding

In [None]:
# One-hot encode the categorical columns.
# drop_first=True drops one dummy per category to avoid the Dummy Variable Trap:
# the dropped level is implied when all remaining dummies are False, so keeping it
# would only add redundant information.
X = pd.get_dummies(data=X, drop_first=True)

In [None]:
X.head()
# Here France was dropped as the first country level,
# so a row that is False for both Spain and Germany is a French customer.
# The Female gender column was dropped as well: False for Male means Female.


In [None]:
# Rows and number of feature columns after one-hot encoding.
X.shape 

In [None]:
import sklearn
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

#I used StandardScaler to ensure that all features are on the same scale.
# This prevents features with large values (like Salary) from dominating the model
# and helps algorithms like Logistic Regression or SVM to converge faster and perform better


In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split
(
    X_scaled, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)
#I split the data into 80% training and 20% testing
# I used "stratify=y" to ensure that both sets have the same proportion 
# of classes as the original dataset


In [None]:
X_train.shape
# The training set holds 8000 of the 10000 rows, i.e. 80%.
# The number of columns is the same in the train and test sets, because the model
# must see the same features during training and testing.

In [None]:
X_test.shape
# The test set holds 2000 of the 10000 rows, i.e. 20%.

## Modeling

### logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline linear classifier; max_iter is raised to 1000 to give the solver
# more iterations to converge.
log_model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
log_model


In [None]:
# Class predictions of the logistic-regression model on the held-out test set.
y_pred_log = log_model.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Overall accuracy plus per-class precision / recall / F1 for logistic regression.
log_accuracy = accuracy_score(y_test, y_pred_log)
log_report = classification_report(y_test, y_pred_log)

print("Accuracy:", log_accuracy)
print(log_report)



In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest: 200 trees, each limited to depth 10; fixed seed for reproducibility.
rf_params = {
    'n_estimators': 200,
    'max_depth': 10,
    'random_state': 42,
}
rf_model = RandomForestClassifier(**rf_params)

rf_model.fit(X_train, y_train)


In [None]:
# Class predictions of the random-forest model on the held-out test set.
y_pred_rf = rf_model.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Overall accuracy plus per-class precision / recall / F1 for the random forest.
rf_accuracy = accuracy_score(y_test, y_pred_rf)
rf_report = classification_report(y_test, y_pred_rf)

print("Accuracy:", rf_accuracy)
print(rf_report)


In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting: 200 shallow (depth-3) trees, learning rate 0.1, fixed seed.
gb_params = {
    'n_estimators': 200,
    'learning_rate': 0.1,
    'max_depth': 3,
    'random_state': 42,
}
gb_model = GradientBoostingClassifier(**gb_params)
gb_model.fit(X_train, y_train)

# Score the held-out test set and report the same metrics as the other models.
y_pred_gb = gb_model.predict(X_test)
gb_accuracy = accuracy_score(y_test, y_pred_gb)

print("Accuracy:", gb_accuracy)
print(classification_report(y_test, y_pred_gb))


# I was going to use XGBoost, but it caused memory issues on my system.
# So instead I used Gradient Boosting, which improves prediction accuracy by
# sequentially correcting errors made by previous models,
# making it more robust than single estimators.


In [None]:
# Visualizing the Confusion Matrix for the Gradient Boosting model
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

cm = confusion_matrix(y_test, y_pred_gb)

# Rows are the actual classes, columns the predicted classes; cells show raw counts.
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title("Confusion Matrix - Gradient Boosting")
plt.show()

# The most critical error is predicting a churned customer as non-churned,
# because the company loses the opportunity to take preventive action.

In [None]:
import pandas as pd

# Pair each feature name with the importance the gradient-boosting model gave it,
# then rank the features from most to least important.
importance = gb_model.feature_importances_
features = X.columns

unsorted_importance = pd.DataFrame({'Feature': features, 'Importance': importance})
feature_importance_df = unsorted_importance.sort_values(by='Importance', ascending=False)

feature_importance_df.head(10)


In [None]:
# Horizontal bar chart of the ten most important features.
top_features = feature_importance_df.head(10)

fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(data=top_features, x='Importance', y='Feature', ax=ax)
ax.set_title("Top Features Affecting Customer Churn")
plt.show()


In [None]:
import joblib

# Persist both the trained model and the fitted scaler so that deployment
# applies exactly the same preprocessing as training.
MODEL_PATH = "gradient_boosting_churn_model.pkl"
SCALER_PATH = "scaler.pkl"

joblib.dump(gb_model, MODEL_PATH)
joblib.dump(scaler, SCALER_PATH)

### Deployment

In [None]:
import streamlit as st
import joblib
import numpy as np

In [None]:
# Load the persisted artefacts. joblib files are pickles, so only load files
# that this project produced itself.
scaler = joblib.load("scaler.pkl")
model = joblib.load("gradient_boosting_churn_model.pkl")

# Page header
st.title("Customer Churn Prediction App")
st.write("Enter customer details to predict churn")

In [None]:
# Inputs
# The model and scaler were fitted on the full one-hot-encoded feature set, so the
# form must collect every feature. The previous form passed only 6 values, which
# makes `scaler.transform` fail on the feature-count mismatch.
# NOTE(review): Streamlit widgets only render when this code is run as a script via
# `streamlit run`, not from a notebook kernel - consider moving it to an app file.
credit_score = st.number_input("Credit Score", 300, 900)
country = st.selectbox("Country", ["France", "Germany", "Spain"])
gender = st.selectbox("Gender", ["Female", "Male"])
age = st.number_input("Age", 18, 100)
tenure = st.number_input("Tenure (years)", 0, 10)
balance = st.number_input("Balance")
products = st.number_input("Number of Products", 1, 4)
has_card = st.selectbox("Has Credit Card?", [0, 1])
is_active = st.selectbox("Is Active Member?", [0, 1])
salary = st.number_input("Estimated Salary")

if st.button("Predict Churn"):
    # NOTE(review): the keys below assume the training column names and category
    # spellings (e.g. "country_Germany", "gender_Male", "products_number",
    # "credit_card", "tenure") - confirm against X.columns from training.
    # France / Female are the dropped baseline levels (drop_first=True), so they
    # are represented by all of their dummies being 0.
    raw_features = {
        "credit_score": credit_score,
        "age": age,
        "tenure": tenure,
        "balance": balance,
        "products_number": products,
        "credit_card": has_card,
        "active_member": is_active,
        "estimated_salary": salary,
        "country_Germany": int(country == "Germany"),
        "country_Spain": int(country == "Spain"),
        "gender_Male": int(gender == "Male"),
    }

    # Prefer the column order recorded by the scaler at fit time; fall back to the
    # order above when the scaler carries no feature names.
    fitted_names = getattr(scaler, "feature_names_in_", None)
    feature_order = list(raw_features) if fitted_names is None else list(fitted_names)

    # Fail loudly instead of feeding the model a misaligned row.
    missing = [name for name in feature_order if name not in raw_features]
    if missing:
        st.error(f"The model expects features this form does not collect: {missing}")
        st.stop()

    data = np.array([[raw_features[name] for name in feature_order]], dtype=float)
    data = scaler.transform(data)

    prediction = model.predict(data)

    if prediction[0] == 1:
        st.error("⚠️ Customer is likely to churn")
    else:
        st.success("✅ Customer is likely to stay")