In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [None]:
# Read the pre-cleaned churn dataset from the notebook's working directory.
csv_path = 'cleaned_churn_dataset.csv'
df = pd.read_csv(csv_path)

## Encoding:

The dataset has no categorical columns, so I skipped encoding because there is no need for it.

## Train/Test Data

I have separated the features (X) from the target variable (Y) and then split the dataset into training and testing sets.
This allows the model to be trained on one portion of the data and evaluated on how well it performs on unseen data.

In [None]:
# Separate the predictors from the target column. `columns=` already selects
# the axis, so no extra `axis` argument is needed.
X = df.drop(columns='Churn')
Y = df['Churn']

# Hold out 20% of the rows for evaluation. stratify=Y keeps the class
# proportions of the target the same in the training and test partitions, so
# the evaluation metrics are not distorted by an unlucky split. random_state
# fixes the shuffle so the split is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42, stratify=Y
)

In [None]:
# Display the shape of each partition as a quick sanity check on the split.
tuple(part.shape for part in (X_train, X_test, Y_train, Y_test))

## Training RandomForestClassifier on Data

I have trained a *Random Forest Classifier* to predict customer churn.

Random Forest works by training multiple decision trees and combining their predictions, making it more accurate and stable than a single decision tree. It also provides feature importance scores, allowing us to understand which factors influence churn the most.

In [None]:
# Forest hyperparameters: 200 trees, no depth limit, a fixed seed for
# reproducibility, and n_jobs=-1 to run on all available processors.
forest_params = {
    'n_estimators': 200,
    'max_depth': None,
    'random_state': 42,
    'n_jobs': -1,
}
RF = RandomForestClassifier(**forest_params)
RF.fit(X_train, Y_train)

# Predict labels for the held-out test rows.
Y_prediction_RF = RF.predict(X_test)

In [None]:
# Confusion Matrix
# Rows are the actual classes and columns the predicted classes, matching the
# axis labels set below.
cm = confusion_matrix(Y_test, Y_prediction_RF)

# fmt='d' prints the cell counts as plain integers. seaborn's default
# annotation format ('.2g') would show counts of 100 or more in scientific
# notation (e.g. 1.6e+03), which is hard to read.
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix (Random Forest)')
plt.xlabel('Predicted Values')
plt.ylabel('Actual Values')
plt.show()

In [None]:
# Classification Report & Accuracy
# Compute both metrics up front, then print the per-class report followed by
# the overall accuracy expressed as a percentage.
report_text = classification_report(Y_test, Y_prediction_RF)
accuracy = accuracy_score(Y_test, Y_prediction_RF)

print('classification_report:', report_text, sep='\n')
print(f'Accuracy Score: {accuracy * 100:.2f}% ')

## Feature Importance

In [None]:
# Order the features from most to least important according to the fitted forest.
importances = RF.feature_importances_
indices = np.argsort(importances)[::-1]
n_features = len(importances)
positions = range(n_features)

# One colour per bar, sampled evenly across the 'summer' colormap.
colors = plt.cm.summer(np.linspace(0, 1, n_features))

fig, ax = plt.subplots(figsize=(12, 6))
ax.set_title("Feature Importance (Random Forest)")

# Bars are drawn in descending order of importance, each labelled with its
# feature name rotated vertically.
ax.bar(positions, importances[indices], color=colors)
ax.set_xticks(positions)
ax.set_xticklabels(X.columns[indices], rotation=90)

fig.tight_layout()
plt.show()

## Closing Words:

Based on the Random Forest feature importance results, the strongest factors influencing churn are related to customer activity and engagement, such as total transactions, variety of activities, and monthly transaction frequency.

Overall, the model clearly highlights that customer engagement is the primary driver of churn, and the bank can reduce churn by increasing transaction activity, improving digital engagement, and providing personalized services to low-activity customers.