# Phase 5: Monitoring & Automation

This notebook demonstrates the monitoring and automation capabilities for the Telco Churn Prediction system, focusing on:

1. **Drift Detection**: Using PSI and KS tests to detect data, prediction, and label drift
2. **Performance Monitoring**: Tracking model performance over time and comparing with baseline
3. **Alerting**: Setting up alerts for drift and performance degradation
4. **Dashboards**: Generating monitoring dashboards and visualizations
5. **Retraining Pipeline**: Orchestrating automated retraining workflows

## Objectives
- Understand drift detection mechanisms
- Monitor model performance in production
- Set up alerting for critical issues
- Visualize monitoring metrics
- Execute automated retraining pipeline


In [1]:
import sys
import warnings
from pathlib import Path

BANNER = "=" * 80

print(BANNER)
print("INITIALIZING NOTEBOOK ENVIRONMENT")
print(BANNER)

# Silence noisy library warnings so the notebook output stays readable
for _category in (UserWarning, FutureWarning):
    warnings.filterwarnings("ignore", category=_category)

# The project root is the parent directory when that directory contains `src`
# (notebook launched from a subfolder); otherwise the current working directory.
print("\nLocating project root...")
_cwd = Path().resolve()
PROJECT_ROOT = _cwd.parent if (_cwd.parent / "src").exists() else _cwd
_root_entry = str(PROJECT_ROOT)
if _root_entry not in sys.path:
    # Put the root first so the `src.*` imports below resolve to this checkout
    sys.path.insert(0, _root_entry)

print(f"[OK] Project root found: {PROJECT_ROOT}")

# Remaining imports happen only after PROJECT_ROOT is on sys.path,
# because the `src.*` modules are resolved relative to it.
print("\nImporting required libraries...")
from datetime import UTC, datetime

import numpy as np
import pandas as pd
from rich.console import Console
from rich.table import Table

from src.monitoring.alerts import AlertManager
from src.monitoring.dashboard import MonitoringDashboard

# Monitoring and pipeline modules used by the later sections
from src.monitoring.drift import DriftDetector
from src.monitoring.performance import PerformanceMetrics, PerformanceMonitor
from src.pipelines.retraining_dag import RetrainingDAG

# Shared rich console used by every cell below
console = Console()

print(f"\n{BANNER}")
print("ENVIRONMENT INITIALIZED")
print(BANNER)
print("\n[OK] All imports successful")
print("[OK] Notebook ready for execution")

INITIALIZING NOTEBOOK ENVIRONMENT

Locating project root...
[OK] Project root found: C:\Users\tiwar\OneDrive - The University of Melbourne\Desktop\New folder\telco-churn-retention

Importing required libraries...



ENVIRONMENT INITIALIZED

[OK] All imports successful
[OK] Notebook ready for execution


## 1. Prerequisites Check

Before running monitoring, we need to ensure:
- Processed data exists (from Phase 1-2)
- Models have been trained (Phase 3)
- Reference data is available for comparison


In [2]:
# Verify that the artifacts this notebook relies on are present
console.print("\n[bold cyan]Checking Prerequisites...[/bold cyan]")

data_dir = PROJECT_ROOT / "data" / "processed"
models_dir = PROJECT_ROOT / "models"
reports_dir = PROJECT_ROOT / "reports" / "monitoring"

# Processed data: pick the run directory whose name sorts last and
# report whether it holds both the train and target parquet files.
if not data_dir.exists():
    console.print("[red][FAIL] Processed data directory not found.[/red]")
else:
    processed_dirs = [entry for entry in data_dir.iterdir() if entry.is_dir()]
    if not processed_dirs:
        console.print(
            "[red][FAIL] No processed data directories found. Please run Phase 1-2 first.[/red]"
        )
    else:
        latest_processed = max(processed_dirs, key=lambda run_dir: run_dir.name)
        console.print(f"[green][OK] Processed data found: {latest_processed.name}[/green]")

        train_path = latest_processed / "train.parquet"
        target_path = latest_processed / "target.parquet"

        if not (train_path.exists() and target_path.exists()):
            console.print(
                "[yellow][WARN] Train/target files not found in latest processed directory[/yellow]"
            )
        else:
            console.print(f"  - Train data: {train_path}")
            console.print(f"  - Target data: {target_path}")

# Models: a missing or empty models directory is only a warning
if not models_dir.exists():
    console.print("[yellow][WARN] Models directory not found.[/yellow]")
else:
    model_dirs = [entry for entry in models_dir.iterdir() if entry.is_dir()]
    if not model_dirs:
        console.print(
            "[yellow][WARN] No model directories found. Monitoring can still run without models.[/yellow]"
        )
    else:
        latest_model = max(model_dirs, key=lambda model_dir: model_dir.name)
        console.print(f"[green][OK] Models found: {latest_model.name}[/green]")

# Reports directory is created on demand so later cells can write into it
reports_dir.mkdir(parents=True, exist_ok=True)
console.print(f"[green][OK] Reports directory ready: {reports_dir}[/green]")

## 2. Drift Detection

We'll demonstrate drift detection using PSI (Population Stability Index) and KS (Kolmogorov-Smirnov) tests.


In [3]:
# Build the drift detector from explicitly named thresholds
PSI_THRESHOLD = 0.2
KS_P_VALUE_THRESHOLD = 0.05
drift_detector = DriftDetector(
    psi_threshold=PSI_THRESHOLD,
    ks_threshold=KS_P_VALUE_THRESHOLD,
)

# Echo the values the detector actually stored
console.print("\n[bold cyan]Drift Detection Setup[/bold cyan]")
console.print(f"PSI threshold: {drift_detector.psi_threshold}")
console.print(f"KS p-value threshold: {drift_detector.ks_threshold}")

In [4]:
# Load reference and current data
# In practice, reference would be training data and current would be production data

# Knobs for the simulated-drift fallback (used when only one processed run exists)
DRIFT_SIMULATION_SEED = 42  # fixed seed so Restart & Run All reproduces the same drift
DRIFT_NOISE_FRACTION = 0.1  # noise std expressed as a fraction of each column's std
DRIFT_MAX_COLUMNS = 5  # only the first N numeric columns are perturbed

data_dir = PROJECT_ROOT / "data" / "processed"

# Start from "no data" so the drift-report cell can always test these names,
# and so a failed load never leaves frames from a previous run in place.
reference_data = None
current_data = None

if data_dir.exists():
    processed_dirs = sorted([d for d in data_dir.iterdir() if d.is_dir()], key=lambda p: p.name)

    if len(processed_dirs) >= 2:
        # Use two different timestamps as reference and current
        reference_dir = processed_dirs[-2]  # Second to last
        current_dir = processed_dirs[-1]  # Latest

        reference_data = pd.read_parquet(reference_dir / "train.parquet")
        current_data = pd.read_parquet(current_dir / "train.parquet")

        console.print(
            f"\n[green][OK] Loaded reference data: {reference_dir.name} ({len(reference_data)} samples)[/green]"
        )
        console.print(
            f"[green][OK] Loaded current data: {current_dir.name} ({len(current_data)} samples)[/green]"
        )
    elif len(processed_dirs) == 1:
        # Use same data but simulate drift by adding noise
        reference_data = pd.read_parquet(processed_dirs[0] / "train.parquet")
        current_data = reference_data.copy()

        # Seeded generator: the global np.random state is unseeded, which made the
        # drift report (and any alerts derived from it) differ on every run.
        rng = np.random.default_rng(DRIFT_SIMULATION_SEED)

        # Simulate drift by adding zero-mean Gaussian noise to numeric columns
        numeric_cols = current_data.select_dtypes(include=[np.number]).columns
        for col in numeric_cols[:DRIFT_MAX_COLUMNS]:
            noise_scale = current_data[col].std() * DRIFT_NOISE_FRACTION
            noise = rng.normal(0, noise_scale, len(current_data))
            current_data[col] = current_data[col] + noise

        console.print(f"\n[green][OK] Using data from {processed_dirs[0].name}[/green]")
        console.print(
            "[yellow][WARN] Simulating drift by adding noise to numeric features[/yellow]"
        )
    else:
        console.print("[red][FAIL] Not enough processed data directories found[/red]")
else:
    console.print("[red][FAIL] Processed data directory not found[/red]")

In [5]:
# Generate, persist and summarise the drift report (requires both data frames)
if reference_data is None or current_data is None:
    console.print("[red][FAIL] Cannot generate drift report - data not available[/red]")
else:
    console.print("\n[bold cyan]Generating Drift Report...[/bold cyan]")

    drift_report = drift_detector.generate_drift_report(
        reference_data=reference_data,
        current_data=current_data,
    )

    # Persist the report alongside the other monitoring artifacts
    report_path = reports_dir / "drift_report.json"
    drift_report.to_json(report_path)
    console.print(f"[green][OK] Drift report saved to {report_path}[/green]")

    # Headline result
    console.print("\n[bold]Drift Detection Summary:[/bold]")
    console.print(f"  Overall drift detected: {drift_report.overall_drift_detected}")

    summary = drift_report.drift_summary
    if summary:
        console.print(f"  Total features checked: {summary.get('total_features_checked', 0)}")
        console.print(f"  Features with drift: {summary.get('features_with_drift', 0)}")

        # Features flagged as drifting, highest PSI first (missing PSI sorts as 0)
        drifting_features = sorted(
            (m for m in drift_report.data_drift if m.drift_detected),
            key=lambda x: x.psi or 0,
            reverse=True,
        )
        if drifting_features:
            console.print("\n[bold]Top Drifting Features:[/bold]")

            table = Table(title="Drift Metrics")
            column_specs = (
                ("Feature", {"style": "cyan"}),
                ("PSI", {"justify": "right"}),
                ("Severity", {"style": "magenta"}),
                ("Type", {"style": "blue"}),
            )
            for header, options in column_specs:
                table.add_column(header, **options)

            # Only the ten strongest drifters are listed
            for metric in drifting_features[:10]:
                psi_val = "N/A" if metric.psi is None else f"{metric.psi:.4f}"
                table.add_row(
                    metric.feature_name, psi_val, metric.drift_severity, metric.feature_type
                )

            console.print(table)

## 3. Performance Monitoring

Track model performance over time and compare with baseline metrics.


In [6]:
# Build the performance monitor from one mapping of degradation thresholds
degradation_thresholds = {
    "roc_auc_threshold": 0.05,
    "accuracy_threshold": 0.05,
    "f1_threshold": 0.05,
}
performance_monitor = PerformanceMonitor(**degradation_thresholds)

# Echo the values the monitor actually stored
console.print("\n[bold cyan]Performance Monitoring Setup[/bold cyan]")
console.print(f"ROC-AUC degradation threshold: {performance_monitor.roc_auc_threshold}")
console.print(f"Accuracy degradation threshold: {performance_monitor.accuracy_threshold}")
console.print(f"F1 degradation threshold: {performance_monitor.f1_threshold}")

In [7]:
# Build sample baseline/current metrics and compare them
# In practice, these would come from model evaluation on production data

TIMESTAMP_FORMAT = "%Y%m%dT%H%M%SZ"


def _utc_timestamp() -> str:
    """Return the current UTC time formatted with TIMESTAMP_FORMAT."""
    return datetime.now(UTC).strftime(TIMESTAMP_FORMAT)


def _print_headline_metrics(header: str, metrics: PerformanceMetrics) -> None:
    """Print `header`, then the ROC-AUC, accuracy and F1 of `metrics`."""
    console.print(header)
    console.print(f"  ROC-AUC: {metrics.roc_auc:.4f}")
    console.print(f"  Accuracy: {metrics.accuracy:.4f}")
    console.print(f"  F1: {metrics.f1:.4f}")


# Baseline metrics (from training)
baseline_metrics = PerformanceMetrics(
    timestamp=_utc_timestamp(),
    roc_auc=0.88,
    accuracy=0.82,
    precision=0.75,
    recall=0.70,
    f1=0.72,
    sample_size=1000,
)

# Current metrics (simulated - every metric slightly below the baseline)
current_metrics = PerformanceMetrics(
    timestamp=_utc_timestamp(),
    roc_auc=0.85,
    accuracy=0.80,
    precision=0.72,
    recall=0.68,
    f1=0.70,
    sample_size=1000,
)

_print_headline_metrics("\n[bold]Baseline Metrics:[/bold]", baseline_metrics)
_print_headline_metrics("\n[bold]Current Metrics:[/bold]", current_metrics)

# Compare with baseline
performance_report = performance_monitor.compare_with_baseline(current_metrics, baseline_metrics)

console.print("\n[bold]Performance Comparison:[/bold]")
console.print(f"  Degradation detected: {performance_report.performance_degradation}")
console.print(f"  Severity: {performance_report.degradation_severity}")

metric_changes = performance_report.metric_changes
if metric_changes:
    console.print("\n[bold]Metric Changes:[/bold]")
    # Signed format so the direction of each change is visible
    for metric, change in metric_changes.items():
        console.print(f"  {metric}: {change:+.4f}")

# Persist the comparison next to the other monitoring artifacts
perf_report_path = reports_dir / "performance_report.json"
performance_report.to_json(perf_report_path)
console.print(f"\n[green][OK] Performance report saved to {perf_report_path}[/green]")

## 4. Alerting System

Demonstrate alerting capabilities for drift and performance degradation.


In [8]:
import json  # used at the end of this cell to read back the alert log

# Initialize alert manager
# Using file-based alerts for demonstration (no actual Slack/email needed)
alert_file = reports_dir / "alerts.json"
alert_manager = AlertManager(alert_file=alert_file)

console.print("\n[bold cyan]Alert Manager Setup[/bold cyan]")
console.print(f"Alert logging file: {alert_file}")
console.print("[yellow]Note: Slack/email alerts require webhook URLs/SMTP config[/yellow]")

# Send alerts if drift or degradation detected.
# The `in locals()` guards cover the case where an earlier cell did not
# produce the report (e.g. no processed data was available).
if "drift_report" in locals() and drift_report.overall_drift_detected:
    console.print("\n[bold cyan]Sending Drift Alert...[/bold cyan]")
    alert_sent = alert_manager.alert_on_drift(drift_report, threshold_severity="low")
    if alert_sent:
        console.print("[green][OK] Drift alert sent[/green]")
    else:
        console.print("[yellow][WARN] Drift alert not sent (below threshold)[/yellow]")

if "performance_report" in locals() and performance_report.performance_degradation:
    console.print("\n[bold cyan]Sending Performance Degradation Alert...[/bold cyan]")
    alert_sent = alert_manager.alert_on_performance_degradation(
        performance_report, threshold_severity="low"
    )
    if alert_sent:
        console.print("[green][OK] Performance degradation alert sent[/green]")
    else:
        console.print("[yellow][WARN] Performance alert not sent (below threshold)[/yellow]")

# Display alerts if file exists
# NOTE(review): assumes the file holds a JSON list of dicts with
# 'severity' and 'title' keys - confirm against AlertManager.
if alert_file.exists():
    with open(alert_file) as f:
        alerts = json.load(f)

    if alerts:
        console.print(f"\n[bold]Recent Alerts ({len(alerts)}):[/bold]")
        for alert in alerts[-5:]:  # Show last 5
            # markup=False: a lowercase severity such as "[low]" would otherwise be
            # parsed by rich as a style tag and silently dropped from the output;
            # it also protects titles that happen to contain square brackets.
            console.print(f"  [{alert['severity']}] {alert['title']}", markup=False)

## 5. Monitoring Dashboards

Generate visualizations for drift metrics and performance trends.


In [9]:
# Dashboard generator writing into the monitoring reports directory
dashboard = MonitoringDashboard(reports_dir)

console.print("\n[bold cyan]Generating Monitoring Dashboards...[/bold cyan]")

# Each dashboard is produced only if the cell that creates its input actually ran
has_drift_report = "drift_report" in locals()
has_both_metrics = {"baseline_metrics", "current_metrics"} <= locals().keys()

# Drift metrics dashboard
if has_drift_report:
    dashboard.plot_drift_metrics(drift_report)
    console.print("[green][OK] Drift metrics dashboard generated[/green]")

# Performance trends, using a two-point simulated history
if has_both_metrics:
    metrics_history = [baseline_metrics, current_metrics]
    dashboard.plot_performance_trends(metrics_history)
    console.print("[green][OK] Performance trends dashboard generated[/green]")

# Summary dashboard needs both reports; the history is optional
if has_drift_report and "performance_report" in locals():
    dashboard.generate_summary_dashboard(
        drift_report=drift_report,
        performance_report=performance_report,
        metrics_history=locals().get("metrics_history"),
    )
    console.print("[green][OK] Summary dashboard generated[/green]")

console.print(f"\n[bold]Dashboards saved to:[/bold] {reports_dir}")

## 6. Retraining Pipeline

Demonstrate the automated retraining pipeline that orchestrates the full workflow.


In [10]:
# Note: Running the full retraining pipeline can take time
# This cell demonstrates how to initialize and run it

# Tunable settings, defined once so the DAG arguments and the log output cannot disagree
RAW_DATA_FILENAME = "telco_data_28_11_2025.csv"
MLFLOW_EXPERIMENT = "telco_churn"
MIN_ROC_AUC_FOR_PROMOTION = 0.85

console.print("\n[bold cyan]Retraining Pipeline Setup[/bold cyan]")
console.print(
    "[yellow]Note: Full pipeline execution is commented out to avoid long runtime[/yellow]"
)
console.print("[yellow]Uncomment the dag.run() line to execute the full pipeline[/yellow]")

# Initialize retraining DAG
raw_data_path = PROJECT_ROOT / "data" / "raw" / RAW_DATA_FILENAME
processed_dir = PROJECT_ROOT / "data" / "processed"
models_dir = PROJECT_ROOT / "models"
validation_dir = PROJECT_ROOT / "reports" / "validation"

# The DAG is only constructed when the raw input file is present
if raw_data_path.exists():
    dag = RetrainingDAG(
        raw_data_path=raw_data_path,
        processed_dir=processed_dir,
        models_dir=models_dir,
        validation_report_dir=validation_dir,
        mlflow_experiment=MLFLOW_EXPERIMENT,
        min_roc_auc=MIN_ROC_AUC_FOR_PROMOTION,
        enable_promotion=True,
    )

    console.print("[green][OK] Retraining DAG initialized[/green]")
    console.print(f"  Raw data: {raw_data_path}")
    console.print(f"  Processed dir: {processed_dir}")
    console.print(f"  Models dir: {models_dir}")
    # Printed from the same constant that was passed to the DAG above
    console.print(f"  Min ROC-AUC for promotion: {MIN_ROC_AUC_FOR_PROMOTION}")

    # Uncomment to run the full pipeline:
    # console.print("\n[bold yellow]Executing retraining pipeline...[/bold yellow]")
    # summary = dag.run()
    # console.print(f"\n[bold]Pipeline Status:[/bold] {summary['status']}")
    # console.print(f"Successful tasks: {summary['successful_tasks']}/{summary['total_tasks']}")

else:
    console.print(
        "[red][FAIL] Raw data file not found. Cannot initialize retraining pipeline.[/red]"
    )