In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (classification_report, confusion_matrix, 
                             roc_auc_score, roc_curve, accuracy_score)
from sklearn.cluster import KMeans
import warnings

# `display` is imported explicitly. IPython injects it into the notebook
# namespace, but the explicit import keeps this cell working when the notebook
# is exported to a plain Python script or executed under an older IPython.
from IPython.display import Markdown, display

# Suppress every warning for the remainder of the session.
# NOTE(review): this blanket filter also hides warnings raised by the numeric
# and modelling libraries imported above -- consider narrowing it to specific
# categories once the noisy sources are known.
warnings.filterwarnings('ignore')

# Visualization style: seaborn grid theme and a wide default figure size.
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)

# Render the notebook title and a one-sentence statement of purpose as Markdown.
display(Markdown("**Variable Importance Analysis**"))
display(Markdown(
    "Identifying which variables best predict customer churn and "
    "build models to identify at risk customers before they leave"
))


**Variable Importance Analysis**

Identifying which variables best predict customer churn and build models to identify at risk customers before they leave

In [11]:
# Load the customer table from the notebook's working directory.
df = pd.read_csv('BankChurners.csv')

# Fail fast on inputs that would silently corrupt the churn statistics below:
# an empty table makes the churn rate undefined, and a missing Attrition_Flag
# compares unequal to 'Attrited Customer', so it would be counted as retained.
assert len(df) > 0, "BankChurners.csv contains no rows"
assert df['Attrition_Flag'].notna().all(), (
    "Attrition_Flag has missing values; they would be counted as retained"
)

print(" Dataset loaded successfully!")
print(f"   Total customers: {len(df):,}")
# Column count is reported before the derived 'Churn' column is added.
print(f"   Total features: {len(df.columns)}")

# Create binary churn variable: 1 when Attrition_Flag is 'Attrited Customer',
# 0 for every other value.
df['Churn'] = (df['Attrition_Flag'] == 'Attrited Customer').astype(int)

# Number of churned customers and their share of all rows, as a percentage.
churn_count = df['Churn'].sum()
churn_rate = (churn_count / len(df)) * 100

print(f"\n Churn Overview:")
print(f"   Churned customers: {churn_count:,} ({churn_rate:.2f}%)")
print(f"   Retained customers: {len(df) - churn_count:,} ({100-churn_rate:.2f}%)")

 Dataset loaded successfully!
   Total customers: 10,127
   Total features: 23

 Churn Overview:
   Churned customers: 1,627 (16.07%)
   Retained customers: 8,500 (83.93%)
