In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (classification_report, confusion_matrix, 
                             roc_auc_score, roc_curve, accuracy_score)
from sklearn.cluster import KMeans
import warnings

from IPython.display import Markdown

In [6]:
# Load the customer table and derive the binary modelling target.
# Churn is 1 where Attrition_Flag equals 'Attrited Customer' and 0 for every other value.
df = pd.read_csv('BankChurners.csv')
is_attrited = df['Attrition_Flag'].eq('Attrited Customer')
df['Churn'] = is_attrited.astype(int)

In [7]:
# Feature engineering: derive six candidate churn predictors from existing
# columns, then report each one's Pearson correlation with the Churn target.
display(Markdown("\n **FEATURE ENGINEERING**"))
display(Markdown("\n Creating new predictive features"))


# --- Ratio features -------------------------------------------------------
# 1. Engagement Score: total transaction count divided by months on book
df['Engagement_Score'] = df['Total_Trans_Ct'].div(df['Months_on_book'])

# 2. Inactivity Rate: inactive months in the last 12, expressed as a fraction of 12
df['Inactivity_Rate'] = df['Months_Inactive_12_mon'].div(12)

# --- Binary flags (1 when the condition holds, otherwise 0) ---------------
# 3. High utilization: average utilization ratio strictly above 0.7
df['High_Utilization'] = df['Avg_Utilization_Ratio'].gt(0.7).astype(int)

# 4. Low transaction: strictly fewer than 50 transactions
df['Low_Transaction'] = df['Total_Trans_Ct'].lt(50).astype(int)

# 5. Few products: relationship count of 2 or less
df['Few_Products'] = df['Total_Relationship_Count'].le(2).astype(int)

# 6. Declining spending: Total_Amt_Chng_Q4_Q1 strictly below 0.7
#    NOTE(review): presumably this column is a Q4-over-Q1 amount ratio — confirm
#    against the data dictionary before relying on the 0.7 cut-off.
df['Declining_Spending'] = df['Total_Amt_Chng_Q4_Q1'].lt(0.7).astype(int)

display(Markdown("**Definitions**"))
definition_lines = (
    "   Engagement Score = Transactions per month",
    "   Inactivity Rate = Percentage of year inactive ",
    "   High Utilization = >70% credit used",
    "   Low Transaction = <50 transactions made",
    "   Few Products = ≤2 products bought",
    "   Declining Spending = Q4 vs Q1 decrease in spending \n",
)
for definition in definition_lines:
    print(definition)


# Correlation of each new feature with the target
display(Markdown("\n **New Features - Correlation with Churn:**"))

new_features = ['Engagement_Score', 'Inactivity_Rate', 'High_Utilization',
                'Low_Transaction', 'Few_Products', 'Declining_Spending']

# Compute one 2x2 correlation matrix per feature and pick the feature/Churn cell.
churn_correlations = {
    name: df[[name, 'Churn']].corr().loc[name, 'Churn']
    for name in new_features
}
for name, value in churn_correlations.items():
    print(f"   {name:<25} {value:>+.4f}")


 **FEATURE ENGINEERING**


 Creating new predictive features

**Definitions**

   Engagement Score = Transactions per month
   Inactivity Rate = Percentage of year inactive 
   High Utilization = >70% credit used
   Low Transaction = <50 transactions made
   Few Products = ≤2 products bought
   Declining Spending = Q4 vs Q1 decrease in spending 




 **New Features - Correlation with Churn:**

   Engagement_Score          -0.2872
   Inactivity_Rate           +0.1524
   High_Utilization          -0.0291
   Low_Transaction           +0.3909
   Few_Products              +0.1532
   Declining_Spending        +0.0821


In [15]:
display(Markdown("\n **PREPARING DATA FOR MACHINE LEARNING**"))

# Encoding categorical variables: each listed column gets an integer-coded
# companion column named '<column>_Encoded' on df.
print("Encoding categorical variables")
categorical_cols = ['Gender', 'Education_Level', 'Marital_Status', 
                    'Income_Category', 'Card_Category']

# Fit a separate encoder per column and keep every fitted encoder, keyed by
# column name. Re-fitting a single shared encoder would discard the mapping of
# every column except the last, making it impossible to inverse_transform the
# codes or to apply the same mapping to new data later.
# NOTE(review): LabelEncoder assigns codes in sorted label order, so the integers
# carry no business ordering (e.g. for Income_Category or Education_Level).
label_encoders = {}
for col in categorical_cols:
    # `le` is rebound on each pass; after the loop it refers to the encoder of
    # the last column, exactly as it did when a single encoder was reused.
    le = LabelEncoder()
    df[col + '_Encoded'] = le.fit_transform(df[col])
    label_encoders[col] = le
    print(f"    Encoded {col}")




 **PREPARING DATA FOR MACHINE LEARNING**

Encoding categorical variables
    Encoded Gender
    Encoded Education_Level
    Encoded Marital_Status
    Encoded Income_Category
    Encoded Card_Category


In [17]:
# Assemble the model input columns from three groups and report their sizes.
print("Selecting features for prediction")

# Group 1: numeric columns taken directly from the dataset
original_features = [
    'Customer_Age', 'Dependent_count', 'Months_on_book',
    'Total_Relationship_Count', 'Months_Inactive_12_mon',
    'Contacts_Count_12_mon', 'Credit_Limit',
    'Total_Revolving_Bal', 'Total_Trans_Amt',
    'Total_Trans_Ct', 'Avg_Utilization_Ratio',
]

# Group 2: the '<column>_Encoded' companions of the categorical columns
encoded_features = [f"{col}_Encoded" for col in categorical_cols]

# Group 3: features derived in the feature engineering cell
# NOTE(review): Inactivity_Rate is Months_Inactive_12_mon / 12, and both are
# included here, so the two columns are exact linear copies of each other.
engineered_features = [
    'Engagement_Score', 'Inactivity_Rate',
    'High_Utilization', 'Low_Transaction',
    'Few_Products', 'Declining_Spending',
]

# Final column order: original, then encoded, then engineered
all_features = [*original_features, *encoded_features, *engineered_features]

print(f"   Total features selected: {len(all_features)}")
feature_groups = (
    ("Original", original_features),
    ("Categorical (encoded)", encoded_features),
    ("Engineered", engineered_features),
)
for group_label, group_cols in feature_groups:
    print(f"      - {group_label}: {len(group_cols)}")



Selecting features for prediction
   Total features selected: 22
      - Original: 11
      - Categorical (encoded): 5
      - Engineered: 6


In [24]:
# Feature matrix (selected columns) and target vector (Churn)
X, y = df[all_features], df['Churn']

# Hold out 20% of rows for testing. stratify=y keeps the class proportions of y
# in both partitions; random_state=42 makes the split repeatable.
print("Splitting data (80% for training, 20% for testing)")
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Report size and churn rate for each partition
partitions = (
    ("Training", "train", X_train, y_train),
    ("Testing", "test", X_test, y_test),
)
for heading, short_name, part_X, part_y in partitions:
    print(f"\n  {heading} set:")
    print(f"    {heading} set: {len(part_X):,} customers")
    print(f"    Churn rate in {short_name}: {part_y.mean()*100:.2f}%")



Splitting data (80% for training, 20% for testing)

  Training set:
    Training set: 8,101 customers
    Churn rate in train: 16.07%

  Testing set:
    Testing set: 2,026 customers
    Churn rate in test: 16.04%


In [28]:
# Standardize the feature columns for the linear model.
print("Standardizing features")

# The scaler learns its statistics from the training partition only; the same
# fitted scaler is then applied to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("   Features standardized (mean=0, std=1)")

Standardizing features
   Features standardized (mean=0, std=1)


In [32]:
# Model 01: logistic regression fitted on the standardized training features
display(Markdown("\n **MODEL 01: LOGISTIC REGRESSION**"))

# max_iter is raised to 1000 iterations; random_state is fixed at 42.
log_model = LogisticRegression(max_iter=1000, random_state=42).fit(
    X_train_scaled, y_train
)
print("Model trained successfully!")



 **MODEL 01: LOGISTIC REGRESSION**

Model trained successfully!


In [43]:
# Score the held-out test set with the logistic regression model
print("\nMaking predictions")
y_pred_log = log_model.predict(X_test_scaled)
# Second column of predict_proba is used as the churn score for AUC
y_pred_prob_log = log_model.predict_proba(X_test_scaled)[:, 1]

# Headline metrics
separator = "-" * 80
print("\nModel Performance:")
print(separator)

accuracy = accuracy_score(y_test, y_pred_log)
auc = roc_auc_score(y_test, y_pred_prob_log)

print(f"Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")
print(f"\nAUC Score: {auc:.4f}")
print("\n")

# Verdict bands, checked from highest floor to lowest; the AUC must be strictly
# greater than a floor to earn that band, otherwise the fallback text is shown.
auc_bands = (
    (0.90, "**EXCELLENT - Outstanding Predictive Power!**"),
    (0.80, "**VERY GOOD - Strong Predictive Ability!**"),
    (0.70, "**GOOD - Decent Predictive Performance**"),
)
auc_verdict = next(
    (text for floor, text in auc_bands if auc > floor),
    "**FAIR - Needs Improvement**",
)
display(Markdown(auc_verdict))

print(separator)

# Per-class precision / recall / F1 (label 0 -> 'Retained', label 1 -> 'Churned')
print("\nClassification Report:")
class_names = ['Retained', 'Churned']
print(classification_report(y_test, y_pred_log, target_names=class_names))



Making predictions

Model Performance:
--------------------------------------------------------------------------------
Accuracy: 0.8968 (89.68%)

AUC Score: 0.9033




**EXCELLENT - Outstanding Predictive Power!**

--------------------------------------------------------------------------------

Classification Report:
              precision    recall  f1-score   support

    Retained       0.91      0.97      0.94      1701
     Churned       0.77      0.51      0.61       325

    accuracy                           0.90      2026
   macro avg       0.84      0.74      0.78      2026
weighted avg       0.89      0.90      0.89      2026



In [45]:
# Rank features by the magnitude of their logistic regression coefficient
display(Markdown("**TOP 15 MOST IMPORTANT FEATURES AND THEIR COEFFICIENT:**"))

# NOTE(review): the 'Coefficient' column stores absolute values, so the listing
# shows strength only — the direction (sign) of each effect is not retained.
coef_magnitudes = np.abs(log_model.coef_[0])
feature_importance = pd.DataFrame(
    {'Feature': all_features, 'Coefficient': coef_magnitudes}
).sort_values('Coefficient', ascending=False)

# Print the 15 largest magnitudes, name left-aligned in a 35-character column
top_coefficients = feature_importance.head(15)
for name, magnitude in zip(top_coefficients['Feature'],
                           top_coefficients['Coefficient']):
    print(f"   {name:<35} {magnitude:.4f}")

**TOP 15 MOST IMPORTANT FEATURES AND THEIR COEFFICIENT:**

   Total_Trans_Ct                      2.3628
   Total_Trans_Amt                     1.3120
   Few_Products                        0.6571
   Total_Revolving_Bal                 0.6479
   Contacts_Count_12_mon               0.6301
   Avg_Utilization_Ratio               0.5297
   High_Utilization                    0.4636
   Gender_Encoded                      0.3588
   Engagement_Score                    0.2850
   Total_Relationship_Count            0.2794
   Inactivity_Rate                     0.2550
   Months_Inactive_12_mon              0.2550
   Months_on_book                      0.1981
   Marital_Status_Encoded              0.1964
   Dependent_count                     0.1946


In [47]:
# Model 02: random forest fitted on the unscaled training features
display(Markdown("**MODEL 02: RANDOM FOREST CLASSIFIER**"))

# 100 trees, depth capped at 10, fixed seed, n_jobs=-1 as in the estimator call
rf_params = dict(n_estimators=100, max_depth=10, random_state=42, n_jobs=-1)
rf_model = RandomForestClassifier(**rf_params)
rf_model.fit(X_train, y_train)
print("   Model trained successfully!")

**MODEL 02: RANDOM FOREST CLASSIFIER**

   Model trained successfully!


In [52]:
# Score the held-out test set with the random forest model
print("\nMaking predictions")

y_pred_rf = rf_model.predict(X_test)
# Second column of predict_proba is used as the churn score for AUC
y_pred_prob_rf = rf_model.predict_proba(X_test)[:, 1]

# Headline metrics
separator = "-" * 80
print("\nModel performance:")
print(separator)

accuracy_rf = accuracy_score(y_test, y_pred_rf)
auc_rf = roc_auc_score(y_test, y_pred_prob_rf)

print(f"Accuracy: {accuracy_rf:.4f} ({accuracy_rf*100:.2f}%)")
print(f"\nAUC Score: {auc_rf:.4f}")
print("\n")

# Verdict bands, checked from highest floor to lowest; the AUC must be strictly
# greater than a floor to earn that band, otherwise the fallback text is shown.
auc_bands = (
    (0.90, "**EXCELLENT - Outstanding Predictive Power!**"),
    (0.80, "**VERY GOOD - Strong Predictive Ability!**"),
    (0.70, "**GOOD - Decent Predictive Performance**"),
)
auc_verdict_rf = next(
    (text for floor, text in auc_bands if auc_rf > floor),
    "**FAIR - Needs Improvement**",
)
display(Markdown(auc_verdict_rf))


print(separator)
# Per-class precision / recall / F1 (label 0 -> 'Retained', label 1 -> 'Churned')
print("\nClassification Report:")
class_names = ['Retained', 'Churned']
print(classification_report(y_test, y_pred_rf, target_names=class_names))




Making predictions

Model performance:
--------------------------------------------------------------------------------
Accuracy: 0.9388 (93.88%)

AUC Score: 0.9755




**EXCELLENT - Outstanding Predictive Power!**

--------------------------------------------------------------------------------

Classification Report:
              precision    recall  f1-score   support

    Retained       0.95      0.98      0.96      1701
     Churned       0.89      0.70      0.79       325

    accuracy                           0.94      2026
   macro avg       0.92      0.84      0.88      2026
weighted avg       0.94      0.94      0.94      2026



In [57]:
# Rank features by the random forest's feature_importances_ values
display(Markdown("**TOP 15 MOST IMPORTANT FEATURES AND THEIR IMPORTANCE PERCENTAGE:**"))


feature_importance_rf = pd.DataFrame(
    {'Feature': all_features, 'Importance': rf_model.feature_importances_}
).sort_values('Importance', ascending=False)

# Print the top 15 as both the raw score and the score multiplied by 100
top_rows = feature_importance_rf.head(15)
for name, score in top_rows.itertuples(index=False, name=None):
    print(f"   {name:<35} {score:.4f} ({score * 100:.2f}%)")

# Call out the single highest-ranked feature (first row after sorting)
top_feature = feature_importance_rf.iloc[0]
top_name = top_feature['Feature']
top_share = top_feature['Importance'] * 100
print(f"\n{top_name}, is the number one predictor and it accounts for {top_share:.2f}% of prediction power.")

**TOP 15 MOST IMPORTANT FEATURES AND THEIR IMPORTANCE PERCENTAGE:**

   Total_Trans_Amt                     0.1844 (18.44%)
   Total_Trans_Ct                      0.1581 (15.81%)
   Total_Revolving_Bal                 0.1129 (11.29%)
   Engagement_Score                    0.0822 (8.22%)
   Avg_Utilization_Ratio               0.0793 (7.93%)
   Low_Transaction                     0.0638 (6.38%)
   Total_Relationship_Count            0.0624 (6.24%)
   Few_Products                        0.0420 (4.20%)
   Credit_Limit                        0.0325 (3.25%)
   Contacts_Count_12_mon               0.0300 (3.00%)
   Customer_Age                        0.0272 (2.72%)
   Months_on_book                      0.0212 (2.12%)
   Months_Inactive_12_mon              0.0211 (2.11%)
   Inactivity_Rate                     0.0211 (2.11%)
   Gender_Encoded                      0.0107 (1.07%)

Total_Trans_Amt, is the number one predictor and it accounts for 18.44% of prediction power.


In [59]:
# Section header introducing the comparison of the two fitted models.
# NOTE(review): the heading text is emitted exactly as written, including the
# "COMPARISION" spelling — correct it together with the stored cell output.
comparison_heading = Markdown("**MODEL COMPARISION**")
display(comparison_heading)
print("Logistic Regression Vs Random Forest ")


**MODEL COMPARISION**

Logistic Regression Vs Random Forest 
