### PYCARET AUTOML FOR CUSTOMER CHURN PREDICTION

In [9]:
# import necessary libraries

import pandas as pd
import numpy as np
from pycaret.classification import *
import warnings
warnings.filterwarnings('ignore')

print("Done!")

Done!


In [10]:
# pycaret AutoML for customer churn

print("\n ======================LOADING PREPROCESSED DATA======================")

# Load the data you prepared earlier
churn_predictive_data = pd.read_csv('data/churn_predictive_data.csv')
# make copy before manipulations
churn_data = churn_predictive_data.copy()


print(f"\nData loaded: {churn_data.shape[0]:,} customers, {churn_data.shape[1]} features")
display(churn_data.head())



Data loaded: 10,000 customers, 11 features


Unnamed: 0,creditscore,geography,gender,age,tenure,balance,numofproducts,hascrcard,isactivemember,estimatedsalary,exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


### PYCARET SETUP WITH IMPACTFUL PARAMETERS

### PyCaret's AutoML:

- Finds the best model and hyperparameters

- Handles preprocessing and feature engineering

- Does cross-validation and model comparison

In [11]:

print("\n =====================PYCARET SETUP WITH ALL USEFUL PARAMETERS=====================")

setup_params = {
    'data': churn_data,                   # Your data
    'target': 'exited',                   # Target column
    'train_size': 0.8,                    # 80% for training
    'test_data': None,                    # Will be created automatically
    'preprocess': True,                   # Let PyCaret handle preprocessing
    'imputation_type': 'simple',          # Simple imputation for any missing
    'numeric_imputation': 'median',       # Fill numeric with median
    'categorical_imputation': 'mode',     # Fill categorical with mode
    
    # critical for our imbalance of (3.9:1 ratio)
    'fix_imbalance': True,                # Handle class imbalance
    'fix_imbalance_method': 'smote',      # Use SMOTE for balancing
    
    # Feature engineering
    'remove_multicollinearity': True,     # Remove highly correlated features
    'multicollinearity_threshold': 0.9,   # Threshold for correlation
    'remove_outliers': True,              # Remove outliers using PCA
    'outliers_threshold': 0.05,           # Remove top 5% outliers
    'feature_selection': True,            # Perform feature selection
    'feature_selection_method': 'classic',        # Classic feature selection
    'feature_selection_estimator': 'lightgbm',    # Use LightGBM for selection
    
    # Encoding
    'categorical_features': ['geography', 'gender'], # Specify categorical
   
    
    # Scaling
    'normalize': True,                    # Normalize features
    'normalize_method': 'zscore',         # Use z-score normalization
    
    # Transformation
    'transformation': True,               # Make data more Gaussian-like
    'transformation_method': 'yeo-johnson', # Yeo-Johnson transformation
    
    # Ignore features
    'ignore_features': [],                # No features to ignore
    
    # Session
    'session_id': 42,                     # For reproducibility
    'verbose': True,                      # Detailed output
    'profile': False,                     # Don't create data profile (can be heavy)
}

# Initialize setup
print("\nInitializing PyCaret setup...")
# setup_params.pop("high_cardinality_features", None)
# setup_params.pop("high_cardinality_method", None)
 
clf_setup = setup(**setup_params)

# Display setup information
print("\n===============================SETUP COMPLETE===============================")
print(f"   - Dataset Shape: {clf_setup.dataset.shape}")
print(f"   - Features Processed: {clf_setup.X_train.shape[1]}")


print(
    f"   - Target Distribution: "
    f"{clf_setup.y_train.value_counts().to_dict()}"
)




Initializing PyCaret setup...


Unnamed: 0,Description,Value
0,Session id,42
1,Target,exited
2,Target type,Binary
3,Original data shape,"(10000, 11)"
4,Transformed data shape,"(14306, 3)"
5,Transformed train set shape,"(12306, 3)"
6,Transformed test set shape,"(2000, 3)"
7,Numeric features,8
8,Categorical features,2
9,Preprocess,True



   - Dataset Shape: (10000, 11)
   - Features Processed: 10
   - Target Distribution: {0: 6370, 1: 1630}


### COMPARE MODELS WITH IMBALANCE-FOCUSED METRICS

In [12]:
# define metrics that matter for imbalanced data

sort_by_metric = 'F1'  # F1-score balances precision and recall

# F1: balances both false positives and false negatives.
# Other good options: 'Recall', 'AUC', 'Precision'

print(f"Comparing models sorted by: '{sort_by_metric}'")
# For imbalanced data, F1 and AUC are most important

print("For imbalanced data, F1 and AUC are most important")

# Compare all models
print("Training and comparing models...")
best_model = compare_models(
    fold=10,              # 10-fold cross-validation
    round=4,             # Round metrics to 4 decimals
    sort=sort_by_metric, # Sort by chosen metric
    verbose=True,        # Show progress
)

Comparing models sorted by: 'F1'
For imbalanced data, F1 and AUC are most important
Training and comparing models...


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.513,0.6135,0.7485,0.2675,0.3911,0.1271,0.1672,1.076
ridge,Ridge Classifier,0.5127,0.6135,0.7485,0.2673,0.3909,0.1268,0.1669,0.537
lda,Linear Discriminant Analysis,0.5127,0.6135,0.7485,0.2673,0.3909,0.1268,0.1669,0.497
svm,SVM - Linear Kernel,0.5128,0.6157,0.7454,0.2673,0.3899,0.1259,0.1655,0.59
qda,Quadratic Discriminant Analysis,0.5064,0.6117,0.7595,0.2636,0.3897,0.1231,0.1662,0.53
nb,Naive Bayes,0.5064,0.6138,0.7583,0.2635,0.3894,0.1227,0.1654,0.544
gbc,Gradient Boosting Classifier,0.5883,0.6132,0.5859,0.2902,0.3782,0.1374,0.1525,0.927
ada,Ada Boost Classifier,0.5756,0.6206,0.6141,0.277,0.3776,0.1306,0.1502,0.637
catboost,CatBoost Classifier,0.6306,0.5884,0.4282,0.2879,0.3293,0.1031,0.1086,6.444
lightgbm,Light Gradient Boosting Machine,0.6364,0.5836,0.408,0.2881,0.324,0.0994,0.1036,0.602


### lr: Logistic Regression. This is the best model
* Selected from prioritizing `F1`
* `F1`: Best for imbalanced data and it balances the precion and recall
* `F1`: balances both false positives and false negatives.

### CREATE LOGISTIC REGRESSION MODEL WITH HYPERPARAMETERS

In [13]:
### Creating the best Model
print("\n ===============================CREATING LOGISTIC REGRESSION MODEL===============================")

# Create Logistic Regression with specific parameters for imbalance
lr_model = create_model(
    'lr',    
    fold=10,
    cross_validation=True,
    class_weight='balanced',   # VERY important for minority churn
    verbose=True
)




Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.4538,0.5702,0.7423,0.2345,0.3564,0.0677,0.1029
1,0.5025,0.5764,0.7485,0.2547,0.3801,0.1092,0.1545
2,0.4762,0.5947,0.8037,0.2529,0.3847,0.1083,0.1654
3,0.7062,0.7663,0.7301,0.3839,0.5032,0.3221,0.3557
4,0.4562,0.5898,0.7669,0.2395,0.365,0.0789,0.1215
5,0.6762,0.7375,0.7239,0.3554,0.4768,0.28,0.3172
6,0.4688,0.5767,0.7546,0.2421,0.3666,0.084,0.1257
7,0.455,0.5478,0.6933,0.2265,0.3414,0.0494,0.0726
8,0.4462,0.5732,0.7485,0.2328,0.3552,0.0644,0.0995
9,0.4888,0.6021,0.773,0.253,0.3812,0.1071,0.1571


### From the above

* Recall is consistently high => Mean = 74.85%
This is good for churn prediction, because the model catches ~75% of `actual churners across folds`.
* Precision is low, Mean = 26.75% Means many predicted churners are actually not churning (high false positives).
* For BankCo: This could mean retention offers are sent to many loyal customers, `higher cost`.
### Pupose of the Bank

* If BankCo wants to save as many churners as possible, Thi model (Recall = 74%) is okay despite higher cost.

### Model Analysis

`Strengths for churn prediction`:

- High recall => catches most churners.

- Better than random guessing (AUC > 0.5).

### Weaknesses:

* Low precision => high false positive rate => costly campaigns.

* Low precision is problematic => Model may be too expensive.

* Model instability across folds => unreliable in production.

* Overall discriminative power is weak (AUC only 0.61).



<!-- ### CALIBRATE PROBABILITIES, `IMPORTANT FOR IMBALANCE` -->

In [None]:
# print("\n ================================CALIBRATING PROBABILITIES================================")

# print("Calibrating probabilities for better churn predictions...")
# calibrated_model = calibrate_model(
#     lr_model,
#     method='isotonic',  # Better for imbalanced data than 'sigmoid'
#     fold=10,
#     verbose=True
# )




### `After calibration , calibrate_model`:
* When the model outputs `0.40`, it means that about 40 out of 100 similar customers are expected to churn, making the predicted probabilities `reliable` for decisions.


### FINALIZE ON ENTIRE DATASET

In [14]:

print("\n =====================FINALIZING MODEL=====================")

print("finalizing model on entire dataset...")
finalized_model = finalize_model(lr_model)

print("=> Model finalized")
print("   - Trained on 100% of available data")
print("   - and Ready for deployment")


finalizing model on entire dataset...
=> Model finalized
   - Trained on 100% of available data
   - and Ready for deployment


### Plot confusion matrix

In [22]:
plot_model(
    finalized_model,
    plot='confusion_matrix',
    plot_kwargs={'percent': True},
    save=True
)

'Confusion Matrix.png'

In [16]:



print("AUC-ROC Curve:")
plot_model(
    finalized_model,
    plot='auc',
    save=True
)



# 7.3 Precision-Recall Curve, BEST for imbalance
print(" Precision-Recall Curve (recommended for imbalance):")
plot_model(
    finalized_model,
    plot='pr',
    save=True
)

# 7.4 Feature Importance
print(" Feature Importance:")
try:
    plot_model(
        finalized_model,
        plot='feature',
        save=True
    )
except:
    print(" Feature importance plot not available for ensemble")
    # Plot learning curve instead
    plot_model(
        finalized_model,
        plot='learning',
        save=True
    )

AUC-ROC Curve:


 Precision-Recall Curve (recommended for imbalance):


 Feature Importance:
 Feature importance plot not available for ensemble


### THRESHOLD ANALYSIS FOR IMBALANCED DATA


### `For 3.9:1 imbalance ratio, we can consider these thresholds`: 
- data is imbalanced (here 3.9 non-churners for every 1 churner)
- 0.5 (default): Balanced but misses churners
- 0.3-0.4: Better recall, catches more churners
- 0.25: High recall, many false positives

---
Problem for weak churn prediction:  80% of available data show customer who didn't churn, while only 20% are those who churned. And I'm interested to predict the customer who might leave, the minority!!

How can I approach this problem of imbalanced data to ensure I can have the best model that can predict minority!?
The problem is that: A model can get 80% accuracy just by predicting “everyone stayed” but that’s useless for churn.

---


### SAVE EVERYTHING

In [17]:

# Save model

save_model(finalized_model, 'saved_model/churn_lr_final')
print("Model saved: 'churn_lr_final.pkl'")

Transformation Pipeline and Model Successfully Saved
Model saved: 'churn_lr_final.pkl'
