# =============================================================
# MILESTONE 3: Machine Learning Model Development and Optimization
# =============================================================

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
from pathlib import Path
import joblib
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
    confusion_matrix, classification_report, roc_curve, precision_recall_curve
)
from xgboost import XGBClassifier

In [None]:
# Colour palette (hex codes) shared by the figures in this notebook.
PEACH = '#FFCBA4'
PEACH_DARK = '#FF9A76'
SAGE = '#A8C686'
SAGE_DARK = '#7A9B57'
NEUTRAL = '#F5F5DC'
ACCENT = '#E07B39'

# Global plotting defaults: seaborn grid theme first, then matplotlib
# figure size and base font size on top of it.
sns.set_style('whitegrid')
plt.rcParams.update({
    'figure.figsize': (10, 6),
    'font.size': 12,
})

In [None]:
# Resolve the project root. When the notebook is launched from the project's
# "notebooks" folder the root is one level up; otherwise the working directory
# is taken as the root. Only the final path component is compared, so an
# unrelated ancestor whose name merely contains "notebooks" does not trigger
# the hop to the parent directory.
_cwd = Path.cwd()
PROJECT_ROOT = _cwd.parent if _cwd.name == 'notebooks' else _cwd

# Input data and output locations, all relative to the project root.
DATA_PATH = PROJECT_ROOT / "data" / "processed" / "final_processed_data.csv"
STATIC_VIZ = PROJECT_ROOT / "visualizations" / "static"
INTER_VIZ = PROJECT_ROOT / "visualizations" / "interactive"
MODELS_DIR = PROJECT_ROOT / "models" / "trained_models"

# Later cells write files into these folders; create them up front so the
# writes do not fail on a fresh checkout.
for _out_dir in (STATIC_VIZ, INTER_VIZ, MODELS_DIR):
    _out_dir.mkdir(parents=True, exist_ok=True)

In [None]:
# Load the processed dataset. 'Churn' is the target (cast to int); every
# remaining column is used as a feature.
df = pd.read_csv(DATA_PATH)
y = df['Churn'].astype(int)
X = df.drop(columns='Churn')
print(f"Dataset: {df.shape[0]:,} samples × {df.shape[1]-1} features | Churn rate: {y.mean()*100:.2f}%")

In [None]:
# 80/20 hold-out split, stratified on the target so both partitions keep the
# same churn rate; the fixed seed makes the split reproducible.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)
print(f"Train: {X_train.shape[0]} | Test: {X_test.shape[0]}")

In [None]:
print("\n[2/9] Training baseline models...")

# Baseline candidates, each seeded with the same random_state.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1),
    'XGBoost': XGBClassifier(n_estimators=200, eval_metric='logloss', random_state=42, n_jobs=-1),
}

# Metrics computed from hard label predictions, in the order they are stored.
label_metrics = (
    ('accuracy', accuracy_score),
    ('precision', precision_score),
    ('recall', recall_score),
    ('f1', f1_score),
)

# One record per model: the fitted estimator, its test-set predictions and
# its test-set metrics.
results = []
for name, model in models.items():
    print(f"  → {name}...")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # Second predict_proba column is used as the positive-class score.
    y_proba = model.predict_proba(X_test)[:, 1]

    record = {
        'model_name': name,
        'model_obj': model,
        'y_pred': y_pred,
        'y_proba': y_proba,
    }
    for metric_key, metric_fn in label_metrics:
        record[metric_key] = metric_fn(y_test, y_pred)
    # ROC-AUC is computed from the scores rather than the hard labels.
    record['auc'] = roc_auc_score(y_test, y_proba)
    results.append(record)

# Tabular summary: display column name -> key in each result record.
summary_columns = {
    'Model': 'model_name',
    'Accuracy': 'accuracy',
    'Precision': 'precision',
    'Recall': 'recall',
    'F1': 'f1',
    'ROC-AUC': 'auc',
}
results_df = pd.DataFrame([
    {column: entry[key] for column, key in summary_columns.items()}
    for entry in results
])

print("\nBaseline Results:")
print(results_df.round(4))

In [None]:
print("\n[3/9] Hyperparameter tuning...")

# XGBoost Tuning
# Exhaustive grid search with 3-fold CV, scored by ROC-AUC.
print("  → Tuning XGBoost...")
xgb_grid = GridSearchCV(
    XGBClassifier(eval_metric='logloss', random_state=42),
    param_grid={
        'n_estimators': [200, 300],
        # Was misspelled 'mav_depth', so tree depth was never actually varied.
        'max_depth': [4, 6],
        'learning_rate': [0.05, 0.1],
        'subsample': [0.8, 1.0],
        'colsample_bytree': [0.8, 1.0]
    },
    scoring='roc_auc', cv=3, n_jobs=-1, verbose=0
)
xgb_grid.fit(X_train, y_train)
best_xgb = xgb_grid.best_estimator_
print(f"     Best AUC: {xgb_grid.best_score_:.4f}")

# RF Tuning
# Same search protocol as above (3-fold CV, ROC-AUC).
print("  → Tuning Random Forest...")
rf_grid = GridSearchCV(
    RandomForestClassifier(random_state=42, n_jobs=-1),
    param_grid={
        'n_estimators': [300, 500],
        'max_depth': [None, 20],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2]
    },
    scoring='roc_auc', cv=3, n_jobs=-1, verbose=0
)
rf_grid.fit(X_train, y_train)
best_rf = rf_grid.best_estimator_
print(f"     Best AUC: {rf_grid.best_score_:.4f}")

# No explicit re-fit is needed here: GridSearchCV defaults to refit=True, so
# each best_estimator_ has already been retrained on the full training set
# with the winning parameters.

# Final predictions
# Evaluate the tuned models on the held-out test set and append them to the
# same `results` list as the baselines so all models can be compared together.
for name, model in [('XGBoost (Tuned)', best_xgb), ('Random Forest (Tuned)', best_rf)]:
    y_pred = model.predict(X_test)
    # Second predict_proba column is used as the positive-class score.
    y_proba = model.predict_proba(X_test)[:, 1]
    results.append({
        'model_name': name,
        'model_obj': model,
        'y_pred': y_pred,
        'y_proba': y_proba,
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred),
        'recall': recall_score(y_test, y_pred),
        'f1': f1_score(y_test, y_pred),
        'auc': roc_auc_score(y_test, y_proba)
    })

# Comparison table over every model evaluated so far (baselines + tuned),
# rounded to 4 decimals for display.
final_df = pd.DataFrame([{
    'Model': r['model_name'],
    'Accuracy': r['accuracy'],
    'Precision': r['precision'],
    'Recall': r['recall'],
    'F1': r['f1'],
    'ROC-AUC': r['auc']
} for r in results]).round(4)

print("\nFinal Model Comparison:")
print(final_df)

In [None]:
# Pick the winner by test ROC-AUC straight from `results`. The display table
# `final_df` is rounded to 4 decimals, so selecting from it could tie models
# whose scores differ only beyond that precision; using the raw values also
# avoids relying on the table's index lining up with list positions.
# On exact ties the earliest entry wins.
best_idx = max(range(len(results)), key=lambda i: results[i]['auc'])
best_result = results[best_idx]
best_model = best_result['model_obj']
best_name = best_result['model_name']

print(f"\nBEST MODEL: {best_name} | ROC-AUC: {best_result['auc']:.4f}")

# Save best model
joblib.dump(best_model, MODELS_DIR / "best_churn_model.pkl")
print(f"Model saved → {MODELS_DIR}/best_churn_model.pkl")

In [None]:
# Confusion matrix of the best model on the test set.
# scikit-learn lays it out with true labels on rows and predictions on columns.
cm = confusion_matrix(y_test, best_result['y_pred'])
tn, fp, fn, tp = cm.ravel()

# Interactive (Plotly)
class_labels = ['No Churn', 'Churn']
fig = go.Figure(data=go.Heatmap(
    z=cm, x=class_labels, y=class_labels,
    colorscale=[[0, SAGE_DARK], [0.5, NEUTRAL], [1, PEACH_DARK]],
    text=cm, texttemplate="%{text}", textfont={"size": 20}
))
fig.update_layout(
    title=f'Confusion Matrix - {best_name}',
    height=500,
    # Label the axes so it is unambiguous which one is predicted vs. actual.
    xaxis_title='Predicted',
    yaxis_title='Actual',
    # Heatmaps draw the first row at the bottom by default; reverse the axis
    # so row 0 ('No Churn') is on top, matching the usual matrix layout.
    yaxis_autorange='reversed',
)
fig.write_html(INTER_VIZ / "01_confusion_matrix.html")
fig.show()

In [None]:
# Side-by-side ROC and precision-recall curves for the three best models.
fig = make_subplots(rows=1, cols=2, subplot_titles=("ROC Curve", "Precision-Recall Curve"))

colors = [SAGE_DARK, PEACH_DARK, ACCENT]
# Rank by test ROC-AUC so the plot really shows the top models, rather than
# whichever entries happened to be appended to `results` last.
top_results = sorted(results, key=lambda res: res['auc'], reverse=True)[:len(colors)]
for color, res in zip(colors, top_results):
    # ROC
    fpr, tpr, _ = roc_curve(y_test, res['y_proba'])
    fig.add_trace(go.Scatter(x=fpr, y=tpr, mode='lines',
                             name=f"{res['model_name']} (AUC={res['auc']:.3f})",
                             line=dict(color=color)), row=1, col=1)
    # PR (legend entry suppressed: the ROC trace already carries the label)
    pr_precision, pr_recall, _ = precision_recall_curve(y_test, res['y_proba'])
    fig.add_trace(go.Scatter(x=pr_recall, y=pr_precision, mode='lines',
                             name=res['model_name'], line=dict(color=color),
                             showlegend=False), row=1, col=2)

# Reference lines: the diagonal on the ROC panel and the positive-class rate
# on the PR panel. mode='lines' keeps these two-point traces marker-free.
fig.add_trace(go.Scatter(x=[0, 1], y=[0, 1], mode='lines',
                         line=dict(dash='dash', color='gray'),
                         showlegend=False), row=1, col=1)
fig.add_trace(go.Scatter(x=[0, 1], y=[y_test.mean()] * 2, mode='lines',
                         line=dict(dash='dash', color='gray'),
                         showlegend=False), row=1, col=2)

fig.update_layout(height=550, title_text="Model Comparison: ROC & Precision-Recall Curves", template="plotly_white")
fig.write_html(INTER_VIZ / "02_roc_pr_curves.html")
fig.show()

In [None]:
# Feature-importance chart; skipped for models that do not expose
# `feature_importances_`.
if hasattr(best_model, 'feature_importances_'):
    importance_table = pd.DataFrame({
        'Feature': X.columns,
        'Importance': best_model.feature_importances_,
    })
    # Keep the 15 features with the largest importance values.
    imp = importance_table.sort_values('Importance', ascending=False).head(15)

    # Interactive
    bar_options = dict(
        x='Importance',
        y='Feature',
        orientation='h',
        color='Importance',
        color_continuous_scale='Peach',
    )
    fig = px.bar(imp, **bar_options)
    fig.update_layout(title=f'Top 15 Features - {best_name}', height=600)
    fig.write_html(INTER_VIZ / "03_feature_importance.html")
    fig.show()