## 0. Environment & Dependencies

Import the libraries needed for data preparation, modeling, plotting, and evaluation, and define the constants used for reproducible splits and Top-K metrics.


In [None]:
import numpy as np
import pandas as pd

from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    roc_auc_score,
    roc_curve,
    precision_recall_curve,
    average_precision_score,
    f1_score,
    balanced_accuracy_score,
    brier_score_loss,
)

import matplotlib.pyplot as plt
import seaborn as sns

import joblib

# Seed passed to both train_test_split calls and to the random forest so reruns
# reproduce the same splits and model.
RANDOM_STATE = 42
# Default fraction of top-scored samples used for Precision@K / Recall@K
# (0.1 means the top 10% of rows).
TOPK_RATIO = 0.1


## 1. Load Data

Bring the cleaned churn dataset into memory and inspect the head to confirm schema.


In [None]:
# Location of the cleaned churn dataset, relative to the working directory.
DATA_PATH = Path('churn_clean.csv')

# Fail fast with a real exception instead of `assert`: assertions are stripped
# under `python -O`, and `exists()` would also accept a directory of that name.
if not DATA_PATH.is_file():
    raise FileNotFoundError(f"Data file not found: {DATA_PATH}")

df = pd.read_csv(DATA_PATH)
# Bare expression so the notebook renders the first rows for a schema check.
df.head()


## 2. Prepare Features & Target

Separate the `Exited` label from features and capture categorical/numeric column names for preprocessing.


In [None]:
# Separate the feature matrix from the `Exited` label column.
X = df.drop(columns='Exited')
y = df['Exited']

# Pick numeric (and boolean) columns positively and treat every other dtype as
# categorical. Selecting "everything that is not object" as numeric would hand
# `category`, dedicated string, or datetime columns to the scaler.
num_cols = X.select_dtypes(include=['number', 'bool']).columns.tolist()
# Preserve the original column order of X for the categorical list.
cat_cols = [col for col in X.columns if col not in num_cols]
# Bare expression so the notebook displays both lists.
cat_cols, num_cols


## 3. Metric Helpers

Reusable helper functions for Precision@K/Recall@K and a consolidated evaluation dict.


In [None]:
def topk_precision_recall(y_true, y_proba, k_ratio: float = TOPK_RATIO):
    """Compute Precision@K and Recall@K for the top-scoring fraction of samples.

    Samples are ranked by descending score and the first
    ``K = max(1, floor(n * k_ratio))`` of them are treated as predicted
    positives.

    Args:
        y_true: Array-like of binary labels where positives are 1 (or True).
        y_proba: Array-like of scores for the positive class, same length as
            ``y_true``.
        k_ratio: Fraction of samples to keep, in ``(0, 1]``.

    Returns:
        dict with keys ``'K'`` (int), ``'Precision@K'`` (float) and
        ``'Recall@K'`` (float). ``'Recall@K'`` is 0.0 when ``y_true`` has no
        positives.

    Raises:
        ValueError: If ``k_ratio`` is outside ``(0, 1]`` or the two inputs do
            not contain the same number of elements.
    """
    # Explicit exception rather than `assert`, which disappears under `python -O`.
    if not 0 < k_ratio <= 1:
        raise ValueError('k_ratio must be in (0, 1].')

    # Flatten so (n, 1) column vectors behave like plain 1-D inputs; the float
    # cast keeps the negation below valid for boolean or unsigned scores.
    y_true = np.asarray(y_true).ravel()
    y_proba = np.asarray(y_proba, dtype=float).ravel()
    if y_true.shape != y_proba.shape:
        raise ValueError(
            'y_true and y_proba must have the same length, '
            f'got {y_true.size} and {y_proba.size}.'
        )

    n = y_true.size
    # The small epsilon guards against float error in the product
    # (e.g. 100 * 0.29 == 28.999999999999996 would otherwise floor to 28).
    k = max(1, int(np.floor(n * k_ratio + 1e-9)))
    # Stable sort: tied scores keep their input order instead of depending on
    # the sorting algorithm's internals.
    order = np.argsort(-y_proba, kind='stable')
    y_topk = y_true[order[:k]]

    tp_at_k = y_topk.sum()
    total_pos = y_true.sum()

    # Cast to built-in floats so the result serializes without NumPy scalars.
    precision_at_k = float(tp_at_k / k)
    recall_at_k = float(tp_at_k / total_pos) if total_pos > 0 else 0.0

    return {
        'K': k,
        'Precision@K': precision_at_k,
        'Recall@K': recall_at_k,
    }


def evaluate_classifier(y_true, y_proba, threshold: float = 0.5, k_ratio: float = TOPK_RATIO):
    """Score positive-class probabilities against true labels.

    Probabilities at or above ``threshold`` are turned into hard predictions
    for F1 and Balanced Accuracy; ROC-AUC, PR-AUC and the Brier score use the
    probabilities directly. The Top-K entries from ``topk_precision_recall``
    are appended to the same dictionary.
    """
    # Unwrap objects exposing `.values` (e.g. a pandas Series); otherwise coerce.
    if hasattr(y_true, 'values'):
        labels = y_true.values
    else:
        labels = np.asarray(y_true)
    scores = np.asarray(y_proba)
    hard_preds = (scores >= threshold).astype(int)

    return {
        'ROC-AUC': roc_auc_score(labels, scores),
        'PR-AUC': average_precision_score(labels, scores),
        'F1': f1_score(labels, hard_preds),
        'BalancedAccuracy': balanced_accuracy_score(labels, hard_preds),
        'BrierScore': brier_score_loss(labels, scores),
        **topk_precision_recall(labels, scores, k_ratio=k_ratio),
    }


## 4. 60/20/20 Split

Perform stratified train/validation/test splits (60/20/20) to maintain class balance.


In [None]:
# First cut: hold out 40% of the rows, stratified on the label; 60% is training data.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, stratify=y, random_state=RANDOM_STATE
)

# Second cut: halve the hold-out into validation and test (20% / 20% overall).
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=RANDOM_STATE
)

# Print each split's shape and positive-class share to check the stratification.
for split_label, split_features, split_target in (
    ('Train:', X_train, y_train),
    ('Validation:', X_val, y_val),
    ('Test:', X_test, y_test),
):
    print(split_label, split_features.shape, 'Target ratio:', split_target.mean().round(3))


## 5. Preprocessing & Random Forest Pipeline

One-hot encode categoricals, scale numeric features, and attach a class-weighted, depth-limited random forest.


In [None]:
# Ask for dense one-hot output. If this scikit-learn build rejects the
# `sparse_output` keyword with a TypeError, retry with the `sparse` keyword.
encoder_options = {'handle_unknown': 'ignore'}
try:
    ohe = OneHotEncoder(sparse_output=False, **encoder_options)
except TypeError:
    ohe = OneHotEncoder(sparse=False, **encoder_options)

# Route categorical columns to the encoder and numeric columns to the scaler.
preprocess = ColumnTransformer(
    transformers=[
        ('categorical', ohe, cat_cols),
        ('numeric', StandardScaler(), num_cols),
    ]
)

# Hyperparameters for the forest, kept in one mapping.
rf_params = {
    'n_estimators': 500,
    'max_depth': 12,
    'min_samples_split': 4,
    'min_samples_leaf': 4,
    'class_weight': 'balanced_subsample',
    'random_state': RANDOM_STATE,
    'n_jobs': -1,
}

# Chain preprocessing and the classifier so they are fitted and applied together.
rf_pipeline = Pipeline(
    steps=[
        ('preprocess', preprocess),
        ('model', RandomForestClassifier(**rf_params)),
    ]
)
rf_pipeline


## 6. Train The Random Forest

Fit the pipeline on the training split so preprocessing and the ensemble stay coupled.


In [None]:
# Fit the preprocessing steps and the forest together on the training split;
# validation and test rows are not passed to fit().
rf_pipeline.fit(X_train, y_train)
# Bare expression so the notebook renders the fitted pipeline.
rf_pipeline


## 7. Generate Validation/Test Predictions

Store predictions and probabilities for validation/test splits to drive metrics and plots.


In [None]:
# Held-out splits to score, keyed by the label used in later tables and plots.
splits_to_score = {
    'Validation': (X_val, y_val),
    'Test': (X_test, y_test),
}

predictions = {}
for dataset_name, (X_split, y_split) in splits_to_score.items():
    predictions[dataset_name] = {
        'y_true': y_split,
        # Hard labels as produced by the pipeline's own predict().
        'y_pred': rf_pipeline.predict(X_split),
        # Second predict_proba column, used downstream as the positive-class score.
        'y_proba': rf_pipeline.predict_proba(X_split)[:, 1],
    }

predictions


## 8. Aggregate Key Metrics

Summarize ROC-AUC, PR-AUC, F1, Balanced Accuracy, Brier Score, and Top-K metrics per split.


In [None]:
# One row per split: the split label followed by every metric from evaluate_classifier.
metrics_rows = [
    {
        'Dataset': dataset_name,
        **evaluate_classifier(values['y_true'], values['y_proba'], k_ratio=TOPK_RATIO),
    }
    for dataset_name, values in predictions.items()
]

metrics_df = pd.DataFrame(metrics_rows)
metrics_df


## 9. Classification Reports & Confusion Matrices

Print detailed classification reports and keep confusion matrices for visualization.


In [None]:
# Keep one confusion matrix per split and print the per-class report alongside it.
conf_matrices = {}
for dataset_name, values in predictions.items():
    actual, predicted = values['y_true'], values['y_pred']
    conf_matrices[dataset_name] = confusion_matrix(actual, predicted)
    print(f"Random Forest - {dataset_name} classification report")
    print(classification_report(actual, predicted))

conf_matrices


## 10. Figure Output Directory

Ensure the figure directory exists before saving plots.


In [None]:
# Directory the ROC, PR and confusion-matrix figures are saved into.
FIG_DIR = Path('figures')
# parents/exist_ok let this cell be rerun without raising if the folder exists.
FIG_DIR.mkdir(parents=True, exist_ok=True)


## 11. Compute ROC/PR Curve Data

Capture FPR/TPR and Precision/Recall arrays plus AUC/AP values for both splits.


In [None]:
def _curve_summary(labels, scores):
    """Return ROC and PR curve arrays plus their AUC / AP summaries for one split."""
    fpr, tpr, _ = roc_curve(labels, scores)
    prec, rec, _ = precision_recall_curve(labels, scores)
    return {
        'fpr': fpr,
        'tpr': tpr,
        'precision': prec,
        'recall': rec,
        'auc': roc_auc_score(labels, scores),
        'ap': average_precision_score(labels, scores),
    }


# Curve data per split, keyed the same way as `predictions`.
curve_data = {
    dataset_name: _curve_summary(values['y_true'], values['y_proba'])
    for dataset_name, values in predictions.items()
}
curve_data


## 12. Plot ROC Curves

Overlay validation/test ROC curves to compare ensemble performance.


In [None]:
# Solid line for validation; any other split falls back to dashed.
split_linestyles = {'Validation': '-'}

fig, ax = plt.subplots(figsize=(6, 4))
for dataset_name, roc_info in curve_data.items():
    ax.plot(
        roc_info['fpr'],
        roc_info['tpr'],
        linestyle=split_linestyles.get(dataset_name, '--'),
        label=f"{dataset_name} AUC = {roc_info['auc']:.3f}",
    )
# Diagonal reference line from (0, 0) to (1, 1).
ax.plot([0, 1], [0, 1], 'k--', linewidth=1)
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve — Random Forest Baseline')
ax.legend(loc='lower right')
fig.tight_layout()
fig.savefig(FIG_DIR / 'roc_curve_rf_baseline.png', dpi=200)
plt.show()


## 13. Plot Precision-Recall Curves

Visualize the recall–precision trade-off for each split.


In [None]:
# Solid line for validation; any other split falls back to dashed.
split_linestyles = {'Validation': '-'}

fig, ax = plt.subplots(figsize=(6, 4))
for dataset_name, pr_info in curve_data.items():
    ax.plot(
        pr_info['recall'],
        pr_info['precision'],
        linestyle=split_linestyles.get(dataset_name, '--'),
        label=f"{dataset_name} AP = {pr_info['ap']:.3f}",
    )
ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
ax.set_title('Precision–Recall Curve — Random Forest Baseline')
ax.legend(loc='lower left')
fig.tight_layout()
fig.savefig(FIG_DIR / 'pr_curve_rf_baseline.png', dpi=200)
plt.show()


## 14. Plot Confusion Matrices

Heatmaps for validation and test confusion matrices offer a quick error view.


In [None]:
# One heatmap per split, side by side, with shared styling.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(9, 4))
heatmap_style = {'annot': True, 'fmt': 'd', 'cmap': 'Blues', 'cbar': False}
for dataset_name, ax in zip(('Validation', 'Test'), axes):
    sns.heatmap(conf_matrices[dataset_name], ax=ax, **heatmap_style)
    ax.set_title(f'{dataset_name} Confusion Matrix')
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
fig.tight_layout()
fig.savefig(FIG_DIR / 'confusion_matrices_rf_baseline.png', dpi=200)
plt.show()


## 15. Feature Importances

Inspect which preprocessed (one-hot encoded and scaled) features drive the random forest's decisions.


In [None]:
# Pull the fitted steps out of the pipeline once.
fitted_preprocess = rf_pipeline.named_steps['preprocess']
fitted_forest = rf_pipeline.named_steps['model']

# Label each importance with the matching output column of the preprocessing step.
feature_names = fitted_preprocess.get_feature_names_out()
importances = pd.Series(data=fitted_forest.feature_importances_, index=feature_names)

# Keep the 20 largest importances, highest first.
top_features = importances.sort_values(ascending=False).iloc[:20]
top_features


In [None]:
# Horizontal bars: importance on the x-axis, feature name on the y-axis.
fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(x=top_features.values, y=top_features.index, color='tab:green', ax=ax)
ax.set_xlabel('Feature importance')
ax.set_ylabel('Feature')
ax.set_title('Top 20 Features — Random Forest Baseline')
fig.tight_layout()
plt.show()


## 16. Export Metrics & Predictions

Persist the summary metrics and raw prediction probabilities for downstream comparison.


In [None]:
# Folder that receives the CSV exports; created on demand.
OUTPUT_DIR = Path('reports')
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Summary metrics: one row per split.
metrics_path = OUTPUT_DIR / 'random_forest_metrics_en.csv'
metrics_df.to_csv(metrics_path, index=False)

# Long-format predictions: one frame per split, tagged with the split name.
pred_rows = [
    pd.DataFrame({
        'dataset': dataset_name,
        'y_true': values['y_true'].values,
        'y_proba': values['y_proba'],
    })
    for dataset_name, values in predictions.items()
]
pred_df = pd.concat(pred_rows, ignore_index=True)
predictions_path = OUTPUT_DIR / 'random_forest_predictions_en.csv'
pred_df.to_csv(predictions_path, index=False)

print('Saved metrics and predictions to', OUTPUT_DIR)


## 17. Persist The Trained Pipeline

Save the fitted RF pipeline (preprocessing + estimator) as a joblib artifact for reuse.


In [None]:
# Folder and file name for the serialized pipeline artifact.
MODEL_DIR = Path('models')
MODEL_DIR.mkdir(parents=True, exist_ok=True)
MODEL_PATH = MODEL_DIR / 'random_forest_baseline.joblib'
# Dump the whole fitted pipeline so the preprocessing is stored with the forest.
# NOTE(review): joblib artifacts are presumably pickle-based — confirm they are
# only ever loaded from trusted locations and with a compatible scikit-learn version.
joblib.dump(rf_pipeline, MODEL_PATH)
# Bare expression so the notebook shows where the artifact was written.
MODEL_PATH
