# Customer Churn Prediction for Banking Service

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings 
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report, roc_curve, auc)

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as ImbPipeline

import joblib
import pickle

# Apply the matplotlib 'seaborn-v0_8-darkgrid' style sheet to every figure drawn below.
plt.style.use('seaborn-v0_8-darkgrid')
# Make seaborn's "husl" palette the default colour cycle.
sns.set_palette("husl")

: 

# 1. Load and Explore the Dataset

In [None]:
# Read the raw customer table from disk, report its size and preview five random rows.
df = pd.read_csv(filepath_or_buffer='Churn_Modelling.csv')
raw_shape = df.shape
print("Dataset shape: ", raw_shape)
print('ramdom 5 rows:')
df.sample(n=5)

In [None]:
# Structural overview (dtypes / non-null counts) followed by descriptive statistics.
print("Dataset Info: ")
df.info()
# An empty first argument reproduces the blank separator line before the heading.
print("", 'Basic Statestics: ', sep="\n")
summary_statistics = df.describe()
summary_statistics

In [None]:
# Count the missing entries in every column of the raw table.
missing_per_column = df.isna().sum()
print("Missing Values:", missing_per_column, sep="\n")

In [None]:
# List the column labels of the raw table.
column_names = list(df.columns)
print("Column Names: \n", column_names)

# 2. Data Cleaning

In [None]:
# Remove the three identifier columns, then preview five random rows of the result.
identifier_columns = ['RowNumber', 'CustomerId', 'Surname']
df_cleaned = df.drop(columns=identifier_columns)
df_cleaned.sample(n=5)

In [None]:
# Per-column count of missing values in the cleaned dataset.
cleaned_missing = df_cleaned.isna().sum()
cleaned_missing

In [None]:
# Show the dtype of every column that remains after cleaning.
df_cleaned.dtypes

In [None]:
# Find the object-typed columns and print their distinct values and frequencies.
object_columns_frame = df_cleaned.select_dtypes(include=['object'])
categorical_cols = object_columns_frame.columns
print("Categorical columns:", list(categorical_cols))
for col in categorical_cols:
    column_values = df_cleaned[col]
    distinct_values = column_values.unique()
    frequencies = column_values.value_counts()
    print(f"\n {col}: {distinct_values}")
    print(f"Value counts: \n {frequencies}")

In [None]:
# Class counts of the target column and the overall share of churned customers.
target = df_cleaned["Exited"]
class_counts = target.value_counts()
churn_rate = target.mean()
print("Target variable distribution: ")
print(class_counts)
print(f"\nChurn Rate: {churn_rate: .2%}")

In [None]:
# Bar chart of how many customers fall in each target class.
plt.figure(figsize=(8, 6))
target_ax = sns.countplot(data=df_cleaned, x='Exited')
target_ax.set_title('Distribution of Churn (Exited)')
target_ax.set_xlabel('Exited (0= No, 1 =Yes)')
target_ax.set_ylabel('Count')
plt.show()

# 3. Exploratory Data Analysis (EDA)

In [None]:
# 2x3 overview: five count plots split by churn plus the churn rate per tenure year.
fig, axes = plt.subplots(2, 3, figsize=(18, 12))

# (feature column, grid position, panel title, x-axis label) for each count plot
count_panels = [
    ('Geography', (0, 0), 'Churn by Geography', 'Geography'),
    ('Gender', (0, 1), 'Churn by Gender', 'Gender'),
    ('HasCrCard', (0, 2), 'Churn by Credit Card Ownership', 'Has Credit Card (0= No, 1 =Yes)'),
    ('IsActiveMember', (1, 0), 'Churn by Active Membership', 'Is Active Member (0= No, 1= Yes)'),
    ('NumOfProducts', (1, 1), 'Churn by Number of Products', 'Number'),
]
for feature, position, panel_title, x_label in count_panels:
    panel = axes[position]
    sns.countplot(data=df_cleaned, x=feature, hue='Exited', ax=panel)
    panel.set_title(panel_title)
    panel.set_xlabel(x_label)
    panel.set_ylabel('Count')

# Last panel: mean of the 0/1 target per tenure value, drawn as a line
tenure_panel = axes[1, 2]
churn_by_tenure = df_cleaned.groupby('Tenure')['Exited'].mean()
churn_by_tenure.plot(kind='line', ax=tenure_panel, marker='o')
tenure_panel.set_title('Churn Rate by Tenure')
tenure_panel.set_xlabel('Tenure (Years)')
tenure_panel.set_ylabel('Churn Rate')

plt.tight_layout()
plt.show()

# Numerical features analysis

In [None]:
# Box plots of four numeric features, split by churn outcome, on a 2x2 grid.
numerical_cols = ['CreditScore', 'Age', 'Balance', 'EstimatedSalary']
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
# axes.flat walks the grid row by row, i.e. the same order as (idx // 2, idx % 2)
for panel, feature in zip(axes.flat, numerical_cols):
    sns.boxplot(data=df_cleaned, x='Exited', y=feature, ax=panel)
    panel.set_title(f'{feature} vs Churn')
    panel.set_xlabel('Exited (0= No, 1= Yes)')
    panel.set_ylabel(feature)

plt.tight_layout()
plt.show()

# Correlation matrix

In [None]:
# Label-encode the categorical columns on a copy so every column can enter the correlation.
df_encoded = df_cleaned.copy()
label_encoders = {name: LabelEncoder() for name in categorical_cols}
for name, encoder in label_encoders.items():
    # each fitted encoder stays available in label_encoders
    df_encoded[name] = encoder.fit_transform(df_encoded[name])

corr_matrix = df_encoded.corr()

# Annotated heatmap with the colour scale centred on zero
plt.figure(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', center=0)
plt.title('Correlation Matrix')
plt.show()

In [None]:
# Correlation of every column with the target, strongest positive first.
exited_column = corr_matrix['Exited']
corr_with_target = exited_column.sort_values(ascending=False)
print("Top correlations with Exited (Churn): ", corr_with_target, sep="\n")

# 4. Feature Engineering and Preprocessing

In [None]:
# Work on an independent (deep) copy so preprocessing never touches df_cleaned.
df_processed = df_cleaned.copy(deep=True)

In [None]:
# One-hot encode the two text columns; drop_first removes the first level of each.
one_hot_columns = ['Geography', 'Gender']
df_processed = pd.get_dummies(df_processed, columns=one_hot_columns, drop_first=True)


In [None]:
# Report the size and the column labels of the encoded table.
processed_shape = df_processed.shape
processed_columns = list(df_processed.columns)
print("Processed dataset shape:", processed_shape)
print("\nProcessed columns: ")
print(processed_columns)


# Split features and target

In [None]:
# Separate the target column from the feature matrix.
y = df_processed['Exited']
X = df_processed.drop(columns=['Exited'])
print('Features shape:', X.shape)

# Split into train and test sets

In [None]:
# Hold out 20% of the rows for testing; stratifying on y keeps the class ratio in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Scale numerical features

In [None]:
# Columns to standardise (the one-hot and binary flag columns are left untouched)
num_cols = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary']

# Learn mean/variance from the training rows only, then apply the same transform to both splits
scaler = StandardScaler()
scaler.fit(X_train[num_cols])

X_train_scaled = X_train.copy()
X_train_scaled[num_cols] = scaler.transform(X_train[num_cols])

X_test_scaled = X_test.copy()
X_test_scaled[num_cols] = scaler.transform(X_test[num_cols])

print("Scaling completed!")

# 5. Handle Class Imbalance

In [None]:
# Class sizes and churn share in the training split before any resampling.
n_stayed = (y_train == 0).sum()
n_churned = (y_train == 1).sum()
train_churn_rate = y_train.mean()
print("Before handling imbalance: ")
print(f"Class 0 (No Churn): {n_stayed} samples")
print(f"Class 1 (Churn): {n_churned} samples")
print(f"Churn rates: {train_churn_rate:.2%}")

# Option 1: SMOTE (Synthetic Minority Over-sampling Technique)


In [None]:
# Oversample the scaled training split with SMOTE and report the new class sizes.
smote = SMOTE(random_state=42)
resampled = smote.fit_resample(X_train_scaled, y_train)
X_train_smote, y_train_smote = resampled

n_stayed_smote = (y_train_smote == 0).sum()
n_churned_smote = (y_train_smote == 1).sum()
print("\nAfter SMOTE: ")
print(f" Class 0 (No Churn): {n_stayed_smote} samples")
print(f" Class 1 (Churn): {n_churned_smote} samples")

# 6. Model Building and Training

# Initialize models

In [None]:
# Candidate classifiers keyed by display name; insertion order is the order they are trained in.
models = {}
models['Logistic Regression'] = LogisticRegression(max_iter=1000, random_state=42)
models['Decision Tree'] = DecisionTreeClassifier(random_state=42)
models['Gradient Boosting'] = GradientBoostingClassifier(random_state=42)
models['Random Forest'] = RandomForestClassifier(n_jobs=1, random_state=42)


In [None]:
# Train and evaluate each model.
# Fitting, scoring and storing all happen INSIDE the loop, so every model gets
# its own entry in `results` (previously only the last model was scored/stored).
results = {}
for model_name, model in models.items():
    print(f"\nTraining {model_name}...")

    if model_name == 'Logistic Regression':
        # Logistic Regression is fitted on the original (non-resampled) training data
        X_fit, y_fit = X_train_scaled, y_train
    else:
        # Tree-based models are fitted on the SMOTE-resampled training data
        X_fit, y_fit = X_train_smote, y_train_smote

    model.fit(X_fit, y_fit)

    # All models are evaluated on the same untouched, scaled test split
    y_pred = model.predict(X_test_scaled)
    y_pred_prob = model.predict_proba(X_test_scaled)[:, 1]

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred_prob)

    # Store results (one entry per model; each key appears exactly once)
    results[model_name] = {
        'model': model,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'roc_auc': roc_auc,
        'y_pred': y_pred,
        'y_pred_prob': y_pred_prob
    }

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall: .4f}")
    print(f"F1-Score: {f1: .4f}")
    print(f"ROC-AUC: {roc_auc: .4f}")

# 7. Model Comparison and Evaluation

In [None]:
# Collect the stored metrics into one table: a row per model, a column per metric.
metric_keys = {
    'Accuracy': 'accuracy',
    'Precision': 'precision',
    'Recall': 'recall',
    'F1-Score': 'f1',
    'ROC-AUC': 'roc_auc',
}
comparison_columns = {'Model': list(results)}
for header, key in metric_keys.items():
    comparison_columns[header] = [entry[key] for entry in results.values()]
results_df = pd.DataFrame(comparison_columns)

print("Model Performance Comparison:")
ranked_by_f1 = results_df.sort_values(by='F1-Score', ascending=False)
print(ranked_by_f1)

# Visualize model performance

In [None]:
# One bar chart per metric on a 2x2 grid, with the value written above each bar.
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
metrics_to_plot = ['Accuracy', 'F1-Score', 'Recall', 'ROC-AUC']
colors = ['skyblue', 'lightcoral', 'lightgreen', 'gold']
# axes.flat walks the grid row by row, i.e. the same order as (idx // 2, idx % 2)
for panel, metric, bar_color in zip(axes.flat, metrics_to_plot, colors):
    scores = results_df[metric]
    panel.bar(results_df['Model'], scores, color=bar_color)
    panel.set_title(f'{metric} Comparison')
    panel.set_ylabel(metric)
    panel.tick_params(axis='x', rotation=45)

    # Value label slightly above the top of each bar
    for bar_position, score in enumerate(scores):
        panel.text(bar_position, score + 0.01, f'{score: .3f}', ha='center', va='bottom')
plt.tight_layout()
plt.show()

# Confusion matrices for all models

In [None]:
# One confusion-matrix heatmap per stored model, laid out two per row.
# The grid is sized from the number of models; squeeze=False guarantees a 2-D
# axes array so indexing works for any model count (the previous call created
# a single Axes, which cannot be indexed with [row, col]).
n_models = len(results)
n_grid_cols = 2
n_grid_rows = max(1, (n_models + n_grid_cols - 1) // n_grid_cols)
fig, axes = plt.subplots(n_grid_rows, n_grid_cols,
                         figsize=(15, 6 * n_grid_rows), squeeze=False)
panels = axes.ravel()
class_names = ['No Churn', 'Churn']

for panel, (model_name, result) in zip(panels, results.items()):
    # labels=[0, 1] pins the matrix to 2x2 in a fixed order matching class_names
    cm = confusion_matrix(y_test, result['y_pred'], labels=[0, 1])
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=panel)
    panel.set_title(f'Confusion Matrix - {model_name}')
    panel.set_xlabel('Predicted')
    panel.set_ylabel('Actual')
    panel.set_xticklabels(class_names)
    panel.set_yticklabels(class_names)

# Hide grid cells that have no model (odd number of models, or none at all)
for unused_panel in panels[n_models:]:
    unused_panel.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
# Overlay the ROC curve of every stored model, with the diagonal as the chance baseline.
plt.figure(figsize=(10, 8))
roc_ax = plt.gca()
for model_name, result in results.items():
    fpr, tpr, _ = roc_curve(y_test, result['y_pred_prob'])
    roc_auc = auc(fpr, tpr)
    curve_label = f'{model_name} (AUC = {roc_auc:.3f})'
    roc_ax.plot(fpr, tpr, lw=2, label=curve_label)
roc_ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Random Classifier')
roc_ax.set_xlim([0.0, 1.0])
roc_ax.set_ylim([0.0, 1.05])
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('ROC Curves Comparision')
roc_ax.legend(loc='lower right')
roc_ax.grid(True)
plt.show()

# 8. Feature Importance Analysis

In [None]:
# Pair each training column with the Random Forest's importance score, highest first.
best_model = results["Random Forest"]['model']
importance_table = pd.DataFrame({
    'feature': X_train_scaled.columns,
    'importance': best_model.feature_importances_
})
feature_importance = importance_table.sort_values(by='importance', ascending=False)
top_ten = feature_importance.head(10)
print('Top 10 Most Important Features:')
print(top_ten)

# Visualize feature importance

In [None]:
# Horizontal bar chart of (at most) the 15 highest-ranked features.
plt.figure(figsize=(12, 8))
top_features = feature_importance.head(15)
importance_ax = sns.barplot(data=top_features, x='importance', y='feature')
importance_ax.set_title('Top 15 Feature Importance (Random Forest)')
importance_ax.set_xlabel('Importance')
importance_ax.set_ylabel('Features')
plt.tight_layout()
plt.show()

# 9. Hyperparameter tuning for best model

In [None]:
print("Hyperparameter tuning for Random Forest")

# Search space: every combination of these values is tried
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[10, 20, 30, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2'],
)

# Exhaustive search with 3-fold cross-validation, ranked by F1
rf = RandomForestClassifier(n_jobs=-1, random_state=42)
grid_search = GridSearchCV(
    rf,
    param_grid,
    scoring='f1',
    cv=3,
    n_jobs=-1,
    verbose=1,
)


In [None]:
# Run the search on the SMOTE-resampled training data.
# NOTE(review): the CV folds are cut from already-resampled data, so validation
# folds may contain synthetic samples and the CV score may be optimistic - confirm
# whether resampling should instead happen inside each fold.
print("Performing gred search...")
grid_search.fit(X_train_smote, y_train_smote)

# Report the winning parameter combination and its mean cross-validated score
best_params = grid_search.best_params_
best_cv_score = grid_search.best_score_
print(f"\nBest parameters: {best_params}")
print(f"Best cross-validation score: {best_cv_score:.4f}")

# Evaluate tuned model

In [None]:
# Score the best estimator found by the grid search on the held-out test split.
best_rf = grid_search.best_estimator_
y_pred_tuned = best_rf.predict(X_test_scaled)
y_pred_prob_tuned = best_rf.predict_proba(X_test_scaled)[:,1]

# Calculate metrics
accuracy_tuned = accuracy_score(y_test, y_pred_tuned)
precision_tuned = precision_score(y_test,y_pred_tuned)
recall_tuned = recall_score(y_test, y_pred_tuned)
f1_tuned = f1_score(y_test, y_pred_tuned)
roc_auc_tuned = roc_auc_score(y_test, y_pred_prob_tuned)
print("Tuned Random Forest performance:")
print(f"Accuracy: {accuracy_tuned: .4f}")
print(f"Precision: {precision_tuned: .4f}")
print(f"Recall: {recall_tuned: .4f}")
# F1 is the metric the search optimised and the one compared below, so report it too
print(f"F1-Score: {f1_tuned: .4f}")
print(f"ROC-AUC: {roc_auc_tuned: .4f}")

# Compare with the untuned Random Forest trained earlier
print(f"\n Improvement in F1-Score: {(f1_tuned - results['Random Forest']['f1']):.4f}")

# 10. Business Insights and Recommendations

# Analyze key factors contributing to Churn


In [None]:
# Print churn rates broken down by the main customer attributes.
# This cell only reads df_cleaned; it adds no columns to it, so earlier cells
# that copy or encode df_cleaned can be re-run safely afterwards.
print("Key Factors Contributing to Customer Churn:")
# 1. Geography
geo_churn = df_cleaned.groupby('Geography')['Exited'].mean().sort_values(ascending = False)
print("\n1. Churn Rate by Geography: ")
for geo, rate in geo_churn.items():
    print(f" {geo}: {rate: .2%}")

# 2. Gender
gender_churn = df_cleaned.groupby('Gender')['Exited'].mean()
print("\n2. Churn Rage by Gender:")
for gender, rate in gender_churn.items():
    print(f" {gender}: {rate:.2%}")

# 3. Active Membership (index labels are the 0/1 flag values)
active_churn = df_cleaned.groupby('IsActiveMember')['Exited'].mean()
print("\n3. Churn Rate by Active Membership:")
print(f"  Inactive Members: {active_churn.loc[0]: .2%}")
print(f"  Active Members: {active_churn.loc[1]: .2%}")

# 4. Age groups
# Left-closed bins make the labels literal: '<30' is [0, 30), '30-40' is [30, 40),
# ..., '60+' is [60, inf). The grouping key is a standalone Series rather than a
# new df_cleaned column.
age_group = pd.cut(df_cleaned['Age'],
                   bins = [0, 30, 40, 50, 60, np.inf],
                   right = False,
                   labels = ['<30', '30-40', '40-50','50-60','60+'])
# observed=False keeps every age group in the output, even one with no customers
age_churn = df_cleaned.groupby(age_group, observed=False)['Exited'].mean()
print("\n4. Churn Rate by Age Group")
for age_group_label, rate in age_churn.items():
    print(f"  {age_group_label}: {rate:.2%}")

# 5. Balance analysis (median computed once and reused for both halves)
balance_median = df_cleaned['Balance'].median()
above_median = df_cleaned['Balance'] > balance_median
high_balance_churn = df_cleaned.loc[above_median, 'Exited'].mean()
low_balance_churn = df_cleaned.loc[~above_median, 'Exited'].mean()
print('\n5. Churn Rate by Balance:')
print(f"  High Balance(> median): {high_balance_churn: .2%}")
print(f"  Low Balance(<= median): {low_balance_churn:.2%}")

# 11. Save the Best Model

In [None]:
# Prefer the tuned forest when the tuning cell has been run; otherwise use the untuned one.
if 'best_rf' in locals():
    best_model_name = 'tuned_random_forest'
    best_model_instance = best_rf
else:
    best_model_name = 'random_forest'
    best_model_instance = results['Random Forest']['model']


In [None]:
# Persist the chosen model and the fitted scaler with joblib
model_file = f'best_churn_model_{best_model_name}.pkl'
joblib.dump(best_model_instance, model_file)
joblib.dump(scaler, 'scaler.pkl')

# Persist the full results dictionary with pickle
with open('model_results.pkl', 'wb') as results_file:
    pickle.dump(results, results_file)

print(f"Best model saved as 'best_churn_model_{best_model_name}.pkl'")
print("Scalar saved as 'scaler.pkl'")
print("Results saved as 'model_results.pkl'")


In [None]:
# Create a function for making predictions on new data
def predict_churn(customer_data, model_path = 'best_churn_model_tuned_random_forest.pkl',
                 scaler_path = 'scaler.pkl'):
    """
    Predict churn for new customer data.

    The input goes through the same steps as the training data: identifier
    columns are dropped, ``Geography`` and ``Gender`` are one-hot encoded,
    columns are aligned to the feature order the model was fitted on, and the
    numeric features are transformed with the saved scaler.

    Parameters:
    customer_data (DataFrame or dict): New customer data in raw (un-encoded)
        form. A dict is treated as one customer. Must contain ``Geography``
        and ``Gender``.
    model_path (str): Path to saved model
    scaler_path (str): Path to saved scaler

    Returns:
    tuple: (prediction, probability, interpretation) for the FIRST row of
        ``customer_data``: the predicted class, the predicted probability of
        class 1, and a short text label.

    Raises:
    KeyError: if ``Geography`` or ``Gender`` is missing from ``customer_data``.
    """
    # Load model and scaler
    model = joblib.load(model_path)
    scaler = joblib.load(scaler_path)

    # Convert to DataFrame if dict; otherwise copy so the caller's frame is untouched
    if isinstance(customer_data, dict):
        customer_df = pd.DataFrame([customer_data])
    else:
        customer_df = customer_data.copy()

    # Drop identifier columns if present
    customer_df = customer_df.drop(columns=['RowNumber', 'CustomerId', 'Surname'],
                                   errors='ignore')

    # Feature names/order come from the fitted model when it recorded them;
    # the notebook's X_train is only a fallback for models that did not.
    expected_cols = getattr(model, 'feature_names_in_', None)
    if expected_cols is None:
        expected_cols = X_train.columns
    expected_cols = list(expected_cols)

    # One-hot encode against the TRAINING dummy columns rather than calling
    # get_dummies(drop_first=True) on the new data: with a single customer the
    # only level present is itself the "first" level and would be dropped, so
    # every customer would be encoded as the baseline category.
    for source in ('Geography', 'Gender'):
        raw_values = customer_df[source]
        prefix = f'{source}_'
        for col in expected_cols:
            if col.startswith(prefix):
                level = col[len(prefix):]
                customer_df[col] = (raw_values == level).astype(int)
        customer_df = customer_df.drop(columns=source)

    # Any expected column still missing is filled with 0 (kept from the
    # previous behaviour), then columns are put in the model's order.
    for col in expected_cols:
        if col not in customer_df.columns:
            customer_df[col] = 0
    customer_df = customer_df[expected_cols].copy()

    # Scale numerical features: use the column list the scaler was fitted on
    # when it recorded one, else the list used in the notebook's scaling cell.
    num_cols = list(getattr(
        scaler, 'feature_names_in_',
        ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary']))
    customer_df[num_cols] = scaler.transform(customer_df[num_cols])

    # Make prediction (first row only)
    prediction = model.predict(customer_df)[0]
    probability = model.predict_proba(customer_df)[0][1]

    # Interpretation
    interpretation = 'High risk of churn' if prediction == 1 else "Low risk of churn"
    return prediction, probability, interpretation
        

# 12. Conclusion and Summary

In [None]:
# Closing report: dataset size, best model by F1, fixed findings and recommendations.
banner = "="*60
print("-"*60)
print("CUSTOMER CHURN PREDICTION PROJECT - SUMMARY")
print(banner)

print(f"\nDataset: {df.shape[0]} customers, {df.shape[1]} features")
print(f"Churn Rate: {df_cleaned['Exited'].mean():.2%}")

# Row of results_df with the highest F1-Score
print("\nBest Performing Model:")
best_row_label = results_df['F1-Score'].idxmax()
best_metrics = results_df.loc[best_row_label]
best_model_name = best_metrics['Model']
print(f"  Model: {best_model_name}")
print(f"  F1-Score: {best_metrics['F1-Score']:.4f}")
print(f"  Accuracy: {best_metrics['Accuracy']:.4f}")
print(f"  Recall: {best_metrics['Recall']:.4f}")
print(f"  Precision: {best_metrics['Precision']:.4f}")

key_findings = (
    "1. Geography: Germany has the highest churn rate",
    "2. Age: Older customers are more likely to churn",
    "3. Activity: Inactive members have higher churn rates",
    "4. Products: Customers with 1 product have highest churn",
    "5. Balance: Customers with high balance are more likely to churn",
)
print("\nKey Findings:")
for finding in key_findings:
    print(finding)

recommendations = (
    "1. Focus retention efforts on German customers",
    "2. Create engagement programs for inactive members",
    "3. Develop targeted offers for customers with single products",
    "4. Implement loyalty programs for high-balance customers",
    "5. Use the model to identify at-risk customers proactively",
)
print("\nBusiness Recommendations:")
for recommendation in recommendations:
    print(recommendation)

print("\n" + banner)
print("Project completed successfully!")
print(banner)