# Module 4.3 Solutions: MLOps & Experiment Tracking

This notebook contains solutions to the exercises and challenges from Module 4.3.

---

## Table of Contents

1. [MLflow Setup Exercise](#mlflow-exercise)
2. [W&B Learning Rate Scheduler Comparison](#wandb-exercise)
3. [Custom Evaluation Pipeline](#evaluation-exercise)
4. [Drift Monitoring Simulation](#drift-exercise)
5. [Model Versioning Workflow](#registry-exercise)
6. [Reproducibility Pipeline](#reproducibility-exercise)

---

In [None]:
# Common imports
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import pandas as pd
import os
import json
from datetime import datetime

# Set seeds for reproducibility
def set_seed(seed=42):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU, plus every CUDA device when CUDA is available).

    Args:
        seed: Integer seed applied to each generator.
    """
    # Imported locally so the helper does not depend on a module-level import.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)


set_seed(42)

---

## 1. MLflow Setup Exercise <a id="mlflow-exercise"></a>

**Task:** Compare different optimization algorithms using MLflow tracking.

This solution demonstrates:
- Creating experiments and runs
- Logging parameters and metrics (model logging is covered in the Model Versioning Workflow section)
- Comparing runs programmatically

In [None]:
import mlflow

# Configure MLflow to store runs in a local directory.
from pathlib import Path

TRACKING_DIR = "./mlruns"
os.makedirs(TRACKING_DIR, exist_ok=True)
# Path.as_uri() builds a well-formed file URI on every platform
# (e.g. file:///C:/... on Windows); concatenating "file://" with an
# absolute path only yields a valid URI on POSIX systems.
mlflow.set_tracking_uri(Path(os.path.abspath(TRACKING_DIR)).as_uri())

# Create synthetic data: the label is 1 when a random linear projection
# of the features is positive.
n_samples = 1000
n_features = 20
n_train = n_samples * 4 // 5  # first 80% (800 rows) train, the rest validate
X = torch.randn(n_samples, n_features)
y = (X @ torch.randn(n_features) > 0).float()
train_X, val_X = X[:n_train], X[n_train:]
train_y, val_y = y[:n_train], y[n_train:]

# Simple classifier
class SimpleClassifier(nn.Module):
    """Two-layer perceptron producing one sigmoid output per input row."""

    def __init__(self, input_dim, hidden_dim):
        super().__init__()
        # Layers are built input-to-output so parameter initialisation
        # consumes the random number generator in a fixed order.
        hidden_layer = nn.Linear(input_dim, hidden_dim)
        activation = nn.ReLU()
        output_layer = nn.Linear(hidden_dim, 1)
        squash = nn.Sigmoid()
        self.net = nn.Sequential(hidden_layer, activation, output_layer, squash)

    def forward(self, x):
        probabilities = self.net(x)
        # Drop the trailing singleton dimension: (..., 1) -> (...)
        return probabilities.squeeze(-1)

In [None]:
# SOLUTION: Optimizer Comparison

# Train the same classifier once per optimizer, one MLflow run each, and
# rank the optimizers by the best validation accuracy they reached.
mlflow.set_experiment("Optimizer-Comparison")

# (display name, factory taking the model parameters) pairs.
optimizers_to_try = [
    ("Adam", lambda p: optim.Adam(p, lr=0.01)),
    ("SGD", lambda p: optim.SGD(p, lr=0.01, momentum=0.9)),
    ("AdamW", lambda p: optim.AdamW(p, lr=0.01)),
    ("RMSprop", lambda p: optim.RMSprop(p, lr=0.01)),
]

results = []

for opt_name, opt_fn in optimizers_to_try:
    # Reseed so every optimizer starts from identical initial weights.
    set_seed(42)

    run_params = {
        "optimizer": opt_name,
        "learning_rate": 0.01,
        "hidden_dim": 64,
        "epochs": 20
    }

    with mlflow.start_run(run_name=opt_name):
        mlflow.log_params(run_params)

        model = SimpleClassifier(n_features, 64)
        optimizer = opt_fn(model.parameters())
        criterion = nn.BCELoss()

        # Starts with 0 so the running maximum matches a 0-initialised tracker.
        accuracy_trace = [0]
        for epoch in range(20):
            # One full-batch gradient step.
            model.train()
            optimizer.zero_grad()
            loss = criterion(model(train_X), train_y)
            loss.backward()
            optimizer.step()

            # Accuracy on the held-out split, thresholding at 0.5.
            model.eval()
            with torch.no_grad():
                predicted = (model(val_X) > 0.5).float()
                val_acc = (predicted == val_y).float().mean().item()

            mlflow.log_metrics(
                {"train_loss": loss.item(), "val_accuracy": val_acc},
                step=epoch,
            )
            accuracy_trace.append(val_acc)

        best_val_acc = max(accuracy_trace)
        mlflow.log_metric("best_val_accuracy", best_val_acc)
        results.append((opt_name, best_val_acc))
        print(f"{opt_name}: {best_val_acc:.4f}")

# Summary: highest accuracy first; ties keep their original order.
print("\nRanked by accuracy:")
for opt_name, acc in sorted(results, key=lambda entry: entry[1], reverse=True):
    print(f"  {opt_name}: {acc:.4f}")

---

## 2. W&B Learning Rate Scheduler Comparison <a id="wandb-exercise"></a>

**Task:** Compare different learning rate schedulers and visualize results.

In [None]:
# SOLUTION: Learning Rate Scheduler Comparison

from torch.optim.lr_scheduler import StepLR, CosineAnnealingLR, OneCycleLR

# Since W&B requires authentication, we'll simulate the tracking
class SimpleTracker:
    """In-memory stand-in for an experiment tracker.

    Logged records are kept in ``logs`` in the order they were received.
    """

    def __init__(self, name):
        self.name, self.logs = name, []

    def log(self, metrics):
        """Append one metrics record to ``logs`` (stored as-is, not copied)."""
        self.logs += [metrics]

def train_with_scheduler(scheduler_name, scheduler_fn, epochs=30):
    """Train the classifier under one learning-rate schedule, logging each epoch.

    Args:
        scheduler_name: Label stored on the tracker. When ``scheduler_fn`` is
            ``None`` it also selects a preset: ``"step"``, ``"cosine"`` or
            ``"onecycle"``; any other value trains without a scheduler.
        scheduler_fn: Optional callable ``optimizer -> scheduler`` (it may
            return ``None``). When given, it takes precedence over the
            name-based presets. Pass ``None`` to select by name.
        epochs: Number of full-batch training epochs.

    Returns:
        A ``(tracker, best_acc)`` tuple. ``tracker.logs`` holds one record per
        epoch with the keys ``epoch``, ``loss``, ``val_accuracy`` and
        ``learning_rate``; ``best_acc`` is the highest validation accuracy
        seen (``0`` when ``epochs`` is 0).
    """
    set_seed(42)
    tracker = SimpleTracker(scheduler_name)

    model = SimpleClassifier(n_features, 64)
    optimizer = optim.Adam(model.parameters(), lr=0.1)

    # Create scheduler: an explicit factory wins over the name-based presets.
    if scheduler_fn is not None:
        scheduler = scheduler_fn(optimizer)
    elif scheduler_name == "step":
        scheduler = StepLR(optimizer, step_size=10, gamma=0.5)
    elif scheduler_name == "cosine":
        scheduler = CosineAnnealingLR(optimizer, T_max=epochs)
    elif scheduler_name == "onecycle":
        scheduler = OneCycleLR(optimizer, max_lr=0.1, epochs=epochs, steps_per_epoch=1)
    else:
        scheduler = None

    criterion = nn.BCELoss()
    best_acc = 0

    for epoch in range(epochs):
        # Record the rate this epoch's update actually uses; reading it after
        # scheduler.step() would report the next epoch's rate instead.
        current_lr = optimizer.param_groups[0]['lr']

        model.train()
        optimizer.zero_grad()
        loss = criterion(model(train_X), train_y)
        loss.backward()
        optimizer.step()

        # Compare with None explicitly rather than relying on truthiness.
        if scheduler is not None:
            scheduler.step()

        model.eval()
        with torch.no_grad():
            val_acc = ((model(val_X) > 0.5).float() == val_y).float().mean().item()

        tracker.log({
            "epoch": epoch,
            "loss": loss.item(),
            "val_accuracy": val_acc,
            "learning_rate": current_lr
        })

        best_acc = max(best_acc, val_acc)

    return tracker, best_acc

# Run comparison
# Train once per scheduler option, keeping each tracker and its best accuracy.
schedulers = ["step", "cosine", "onecycle", "none"]
scheduler_results = {}

for sched in schedulers:
    tracker, acc = train_with_scheduler(sched, None)
    scheduler_results[sched] = dict(tracker=tracker, accuracy=acc)
    print(f"{sched}: {acc:.4f}")

# On ties the earliest scheduler in the list wins.
best_scheduler = max(
    scheduler_results,
    key=lambda name: scheduler_results[name]["accuracy"],
)
print("\nBest scheduler:", best_scheduler)

---

## 3. Custom Evaluation Pipeline <a id="evaluation-exercise"></a>

**Task:** Create a code generation evaluation pipeline.

In [None]:
# SOLUTION: Code Generation Evaluation

from dataclasses import dataclass
from typing import List, Dict, Any

@dataclass
class EvaluationResult:
    """Outcome of a single evaluation check.

    Attributes:
        metric_name: Identifier of the check that produced the result.
        score: Numeric score assigned by the check.
        details: Optional extra information; ``None`` when not supplied.
    """
    metric_name: str
    score: float
    details: dict = None


class CodeEvaluator:
    """Evaluator for code generation tasks.

    Every check inspects the source text only; the code under evaluation is
    compiled or parsed but never executed.
    """

    @staticmethod
    def check_syntax(code: str) -> EvaluationResult:
        """Check if code has valid Python syntax.

        Returns a score of 1.0 when ``code`` compiles and 0.0 otherwise, with
        the compiler's message under ``details["error"]``.
        """
        try:
            compile(code, '<string>', 'exec')
        except (SyntaxError, ValueError) as e:
            # Some Python versions report source containing null bytes as a
            # ValueError rather than a SyntaxError; both mean "does not compile".
            return EvaluationResult("syntax", 0.0, {"error": str(e)})
        return EvaluationResult("syntax", 1.0, {"valid": True})

    @staticmethod
    def check_docstring(code: str) -> EvaluationResult:
        """Check if code contains a docstring.

        When ``code`` parses, a docstring on the module or on any class or
        function counts. Triple-quoted strings used as ordinary values do not.
        When ``code`` does not parse, the check falls back to looking for a
        triple-quote marker in the text.
        """
        import ast

        try:
            tree = ast.parse(code)
        except (SyntaxError, ValueError):
            # Unparsable source: best-effort textual heuristic.
            has_docstring = '"""' in code or "'''" in code
        else:
            documentable = (
                ast.Module, ast.ClassDef, ast.FunctionDef, ast.AsyncFunctionDef
            )
            has_docstring = any(
                ast.get_docstring(node) is not None
                for node in ast.walk(tree)
                if isinstance(node, documentable)
            )
        return EvaluationResult(
            "docstring",
            1.0 if has_docstring else 0.0,
            {"has_docstring": has_docstring}
        )

    @staticmethod
    def check_function_presence(code: str, required: List[str]) -> EvaluationResult:
        """Check if required functions are present.

        A function counts as present when ``def <name>`` appears with
        ``<name>`` as a complete identifier, so ``calculate_sum_all`` does not
        satisfy a requirement for ``calculate_sum``. The score is the fraction
        of required names found, or 0 when ``required`` is empty.
        """
        import re

        found = sum(
            1 for fn in required
            # (?!\w) rejects longer identifiers that merely start with fn.
            if re.search(rf"\bdef\s+{re.escape(fn)}(?!\w)", code)
        )
        score = found / len(required) if required else 0
        return EvaluationResult(
            "functions",
            score,
            {"found": found, "required": len(required)}
        )

    @staticmethod
    def check_type_hints(code: str) -> EvaluationResult:
        """Check for type hints.

        Textual heuristic: looks for a return arrow or a ``: str`` / ``: int``
        annotation. Code without hints still receives a score of 0.5.
        """
        has_hints = "->" in code or ": str" in code or ": int" in code
        return EvaluationResult(
            "type_hints",
            1.0 if has_hints else 0.5,
            {"has_type_hints": has_hints}
        )

    def evaluate(self, code: str, required_functions: List[str] = None) -> Dict[str, Any]:
        """Run all evaluations.

        Args:
            code: Source text to evaluate.
            required_functions: Function names that must be defined; ``None``
                is treated as an empty list.

        Returns:
            Mapping of metric name to score, plus ``"overall"`` holding the
            mean of the individual scores.
        """
        results = [
            self.check_syntax(code),
            self.check_docstring(code),
            self.check_function_presence(code, required_functions or []),
            self.check_type_hints(code)
        ]

        scores = {r.metric_name: r.score for r in results}
        scores["overall"] = np.mean(list(scores.values()))

        return scores

# Test the evaluator
# Exercise the evaluator on three snippets of decreasing quality.
evaluator = CodeEvaluator()

# Typed and documented.
good_code = '''
def calculate_sum(a: int, b: int) -> int:
    """Calculate the sum of two numbers."""
    return a + b
'''

# No docstring and no type hints.
undocumented_code = '''
def calculate_product(a, b):
    return a * b
'''

# Does not compile.
broken_code = '''
def broken(
    return x
'''

test_codes = [good_code, undocumented_code, broken_code]

for number, code in enumerate(test_codes, start=1):
    scores = evaluator.evaluate(code, required_functions=["calculate_sum"])
    print(f"\nCode {number}:")
    for metric, score in scores.items():
        print(f"  {metric}: {score:.2f}")

---

## 4. Drift Monitoring Simulation <a id="drift-exercise"></a>

**Task:** Simulate data arriving over time with increasing drift.

In [None]:
# SOLUTION: Drift Simulation Over Time

def generate_data_with_drift(n_samples, drift_intensity):
    """Build a synthetic frame whose distributions shift with ``drift_intensity``.

    ``feature1`` is normal with its mean moved by ``drift_intensity``,
    ``feature2`` is a fixed normal, and ``feature3`` is Poisson with rate
    ``3 + int(drift_intensity * 2)``. ``target`` is 1 where ``feature1`` is
    positive and ``prediction`` copies ``target``.
    """
    # Draw the columns in this order so the global NumPy generator is
    # consumed in a fixed sequence.
    shifted_normal = np.random.normal(0 + drift_intensity, 1, n_samples)
    stable_normal = np.random.normal(5, 2, n_samples)
    poisson_rate = 3 + int(drift_intensity * 2)
    event_counts = np.random.poisson(poisson_rate, n_samples)

    df = pd.DataFrame({
        'feature1': shifted_normal,
        'feature2': stable_normal,
        'feature3': event_counts,
    })
    df['target'] = (df['feature1'] > 0).astype(int)
    # Predictions mirror the labels, i.e. a perfect model is assumed.
    df['prediction'] = df['target']
    return df

class SimpleDriftMonitor:
    """Simple drift monitor for simulation.

    The drift of a feature is the absolute difference between the current and
    reference column means, divided by the reference column's standard
    deviation. The overall drift is the mean of the per-feature values.
    """

    # Columns monitored when the caller does not choose any.
    DEFAULT_FEATURES = ('feature1', 'feature2', 'feature3')

    def __init__(self, reference_data, feature_columns=None):
        """Store the reference data and the columns to monitor.

        Args:
            reference_data: Data providing the baseline mean and standard
                deviation for each monitored column.
            feature_columns: Optional iterable of column names to monitor;
                defaults to ``DEFAULT_FEATURES``.

        Raises:
            ValueError: If ``feature_columns`` is given but empty.
        """
        self.reference = reference_data
        chosen = self.DEFAULT_FEATURES if feature_columns is None else feature_columns
        self.feature_columns = list(chosen)
        if not self.feature_columns:
            raise ValueError("feature_columns must contain at least one column")
        self.history = []

    def check(self, current_data):
        """Check for drift using simple mean comparison.

        Appends a record holding ``overall_drift`` and one entry per monitored
        column to ``history``, then returns the overall drift.
        """
        drift_scores = {}

        for col in self.feature_columns:
            ref_mean = self.reference[col].mean()
            cur_mean = current_data[col].mean()
            ref_std = self.reference[col].std()

            # Normalized difference; the epsilon guards against a zero
            # reference standard deviation.
            drift_scores[col] = abs(cur_mean - ref_mean) / (ref_std + 1e-10)

        overall_drift = np.mean(list(drift_scores.values()))

        self.history.append({
            'overall_drift': overall_drift,
            **drift_scores
        })

        return overall_drift

# Simulate 30 days of data
# Baseline without drift; every simulated day is compared against it.
reference = generate_data_with_drift(1000, 0)
monitor = SimpleDriftMonitor(reference)

print("Simulating 30 days of data with increasing drift...")
print("Day | Drift Score | Alert")
print("-" * 40)

for day in range(30):
    # Drift grows linearly with the day index: 0 on day 0 up to 1.45 on day 29.
    drift_intensity = day / 20
    daily_data = generate_data_with_drift(100, drift_intensity)

    drift_score = monitor.check(daily_data)

    # Alert level: above 1.0 is critical, above 0.5 is a warning.
    alert = (
        "CRITICAL" if drift_score > 1.0
        else "WARNING" if drift_score > 0.5
        else "OK"
    )

    # Report every alert, plus a heartbeat line every fifth day.
    if alert != "OK" or day % 5 == 0:
        print(f"{day:3d} | {drift_score:11.3f} | {alert}")

# Summary over the whole simulation.
history_df = pd.DataFrame(monitor.history)
overall_series = history_df['overall_drift']
print(f"\nMax drift observed: {overall_series.max():.3f}")
print(f"Days with warnings: {(overall_series > 0.5).sum()}")

---

## 5. Model Versioning Workflow <a id="registry-exercise"></a>

**Task:** Implement a complete model versioning workflow.

In [None]:
# SOLUTION: Complete Model Versioning Workflow

from mlflow.tracking import MlflowClient

# Train three model sizes, log each as an MLflow run and register every
# trained model under the same registry name.
mlflow.set_experiment("Model-Versioning-Workflow")
client = MlflowClient()

# Model configurations to try
configs = [
    dict(hidden_dim=32, lr=0.01, name="small"),
    dict(hidden_dim=64, lr=0.005, name="medium"),
    dict(hidden_dim=128, lr=0.001, name="large"),
]

model_results = []

for config in configs:
    # Same seed for every configuration so only the hyperparameters differ.
    set_seed(42)

    with mlflow.start_run(run_name=f"v-{config['name']}") as active_run:
        # The whole config, including its name, is logged as run parameters.
        mlflow.log_params(config)

        model = SimpleClassifier(n_features, config['hidden_dim'])
        optimizer = optim.Adam(model.parameters(), lr=config['lr'])
        criterion = nn.BCELoss()

        # Twenty full-batch training steps.
        model.train()
        for epoch in range(20):
            optimizer.zero_grad()
            loss = criterion(model(train_X), train_y)
            loss.backward()
            optimizer.step()

        # Validation accuracy with a 0.5 decision threshold.
        model.eval()
        with torch.no_grad():
            hits = (model(val_X) > 0.5).float() == val_y
            val_acc = hits.float().mean().item()

        mlflow.log_metric("val_accuracy", val_acc)

        # Logging with a registered name creates a new registry version.
        mlflow.pytorch.log_model(
            model,
            artifact_path="model",
            registered_model_name="VersionedClassifier"
        )

        model_results.append({
            "name": config['name'],
            "run_id": active_run.info.run_id,
            "accuracy": val_acc
        })

        print(f"{config['name']}: accuracy = {val_acc:.4f}")

# Find best model (the first one wins on ties)
best = max(model_results, key=lambda record: record['accuracy'])
print(f"\nBest model: {best['name']} (accuracy: {best['accuracy']:.4f})")

In [None]:
# Promotion workflow
# Picks the registered version whose run logged the highest 'val_accuracy',
# moves it to Staging, prints simulated validation results, then moves it to
# Production while archiving the versions already there.
print("\nPromotion Workflow:")
print("-" * 40)

# Any failure inside the workflow is printed as a note instead of being
# raised, so this cell never stops the notebook.
try:
    versions = client.search_model_versions("name='VersionedClassifier'")
    
    # Find best version
    # best_accuracy starts at 0 and the comparison below is strict, so a
    # version whose run has no 'val_accuracy' metric (defaulting to 0) is
    # never selected.
    best_version = None
    best_accuracy = 0
    
    for version in versions:
        # Each registered version is scored by the metric of the run that produced it.
        run = client.get_run(version.run_id)
        acc = run.data.metrics.get('val_accuracy', 0)
        if acc > best_accuracy:
            best_accuracy = acc
            best_version = version
    
    # Nothing is promoted when no version qualified above.
    if best_version:
        print(f"Best version: {best_version.version} (accuracy: {best_accuracy:.4f})")
        
        # Promote to staging
        # NOTE(review): stage transitions are presumably deprecated in newer
        # MLflow releases in favour of aliases - confirm against the
        # installed version.
        client.transition_model_version_stage(
            name="VersionedClassifier",
            version=best_version.version,
            stage="Staging"
        )
        print(f"Version {best_version.version} promoted to Staging")
        
        # Simulate validation checks passing
        # These lines only print; no accuracy or latency check is executed.
        print("Running validation checks...")
        print("  Accuracy check: PASSED")
        print("  Latency check: PASSED")
        
        # Promote to production
        client.transition_model_version_stage(
            name="VersionedClassifier",
            version=best_version.version,
            stage="Production",
            archive_existing_versions=True
        )
        print(f"Version {best_version.version} promoted to Production!")
        
except Exception as e:
    print(f"Note: {e}")

---

## 6. Reproducibility Pipeline <a id="reproducibility-exercise"></a>

**Task:** Create a complete reproducible training pipeline.

In [None]:
# SOLUTION: Complete Reproducibility Pipeline

import sys
import platform

class ReproducibleTrainer:
    """Trainer with built-in reproducibility.

    Reseeds every random number generator before each training run and
    records the software environment when constructed.

    Args:
        config: Dict providing ``hidden_dim``, ``lr`` and ``epochs`` for
            training; ``seed`` is optional and defaults to 42.
    """

    def __init__(self, config):
        self.config = config
        self.seed = config.get('seed', 42)
        self.env_info = self._capture_environment()

    def _set_seeds(self):
        """Set all random seeds (Python, NumPy, PyTorch CPU and CUDA)."""
        # Imported locally so the class does not depend on a module-level import.
        import random

        random.seed(self.seed)
        np.random.seed(self.seed)
        torch.manual_seed(self.seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(self.seed)
            # Ask cuDNN for deterministic kernels and disable its autotuner.
            torch.backends.cudnn.deterministic = True
            torch.backends.cudnn.benchmark = False

    def _capture_environment(self):
        """Capture environment info (library versions, platform, CUDA version)."""
        return {
            "python": sys.version.split()[0],
            "torch": torch.__version__,
            "numpy": np.__version__,
            "platform": platform.system(),
            "cuda": torch.version.cuda if torch.cuda.is_available() else None
        }

    def train(self):
        """Run training with reproducibility.

        Reseeds, trains a fresh classifier for ``config['epochs']`` full-batch
        steps and evaluates it on the validation split.

        Returns:
            Dict with ``final_loss`` (loss of the last epoch),
            ``val_accuracy`` and ``first_param`` (one weight of the first
            layer, used as a fingerprint of the trained model).
        """
        self._set_seeds()

        # Create model
        model = SimpleClassifier(
            n_features,
            self.config['hidden_dim']
        )
        optimizer = optim.Adam(model.parameters(), lr=self.config['lr'])
        criterion = nn.BCELoss()

        # Training
        losses = []
        for epoch in range(self.config['epochs']):
            model.train()
            loss = criterion(model(train_X), train_y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            losses.append(loss.item())

        # Evaluation
        model.eval()
        with torch.no_grad():
            val_acc = ((model(val_X) > 0.5).float() == val_y).float().mean().item()
            first_param = model.net[0].weight[0, 0].item()

        return {
            "final_loss": losses[-1],
            "val_accuracy": val_acc,
            "first_param": first_param  # For verification
        }

    def verify_reproducibility(self, n_runs=3):
        """Verify training is reproducible.

        Runs ``train`` ``n_runs`` times, printing each result, and compares
        every reported quantity (loss, accuracy and the fingerprint weight)
        against the first run.

        Returns:
            ``True`` when every run matches the first within ``1e-6`` on all
            three quantities (also when ``n_runs`` is 0), ``False`` otherwise.
        """
        results = []
        for i in range(n_runs):
            result = self.train()
            results.append(result)
            print(f"Run {i+1}: loss={result['final_loss']:.6f}, "
                  f"acc={result['val_accuracy']:.4f}, "
                  f"param={result['first_param']:.6f}")

        if not results:
            return True

        # A single weight can coincide while loss or accuracy differ, so all
        # reported quantities take part in the comparison.
        baseline = results[0]
        compared_keys = ('final_loss', 'val_accuracy', 'first_param')
        all_same = all(
            abs(r[key] - baseline[key]) < 1e-6
            for r in results
            for key in compared_keys
        )

        return all_same

# Run reproducibility verification
# Configuration for the reproducibility check.
config = dict(seed=42, hidden_dim=64, lr=0.01, epochs=10)

trainer = ReproducibleTrainer(config)

# Show the environment captured when the trainer was constructed.
print("Environment:")
for key, value in trainer.env_info.items():
    print(f"  {key}: {value}")

print("\nVerifying reproducibility with 3 runs:")
is_reproducible = trainer.verify_reproducibility()

verdict = (
    "\n✅ PASSED: All runs produced identical results!"
    if is_reproducible
    else "\n❌ FAILED: Runs produced different results"
)
print(verdict)

---

## Summary

This solutions notebook demonstrated:

1. **MLflow Tracking**: Comparing optimizers with proper experiment tracking
2. **W&B-style Tracking**: Learning rate scheduler comparison with logging
3. **Custom Evaluation**: Building a code quality evaluator
4. **Drift Detection**: Simulating and monitoring drift over time
5. **Model Registry**: Complete versioning and promotion workflow
6. **Reproducibility**: Building a fully reproducible training pipeline

All these patterns are essential for production ML systems!

---

## Cleanup

In [None]:
# Cleanup
import gc

# Run a garbage collection pass first, then release PyTorch's cached CUDA
# memory when a GPU is present.
gc.collect()
cuda_present = torch.cuda.is_available()
if cuda_present:
    torch.cuda.empty_cache()

print("Solutions notebook complete!")