# Task 15.3 Solution: MLflow Experiment Tracking

This notebook provides solutions for the MLflow exercise.

---

## Exercise Solution: Complete Experiment Tracking Workflow

**Task:** Create a complete experiment tracking workflow.

Requirements:
1. Create a new experiment
2. Run 5+ training simulations with different hyperparameters
3. Log parameters, metrics, and artifacts
4. Query for the best run
5. Visualize results

In [None]:
import json
import os
import random
import tempfile
from datetime import datetime
from pathlib import Path

import matplotlib.pyplot as plt
import mlflow
import numpy as np
import pandas as pd

# Setup MLflow
# The tracking store lives in ../mlflow relative to the current working directory.
NOTEBOOK_DIR = Path(os.getcwd())
MLFLOW_DIR = str((NOTEBOOK_DIR / "../mlflow").resolve())
os.makedirs(MLFLOW_DIR, exist_ok=True)

# Build the file URI with pathlib instead of string concatenation: as_uri()
# percent-encodes characters such as spaces and '#', and produces a valid
# "file:///C:/..." form on Windows, where f"file://{path}" would be malformed.
mlflow.set_tracking_uri(Path(MLFLOW_DIR).as_uri())
print(f"MLflow tracking URI: {mlflow.get_tracking_uri()}")

In [None]:
# Step 1: Create a new experiment
EXPERIMENT_NAME = "my-first-experiment"

# Look the experiment up first so re-running this cell reuses it.
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)

if experiment is not None:
    experiment_id = experiment.experiment_id
    status_message = f"‚úÖ Using existing experiment: {EXPERIMENT_NAME}"
else:
    # Tags are attached only when the experiment is first created.
    experiment_tags = {
        "project": "module-15-solution",
        "author": "student",
        "purpose": "hyperparameter-exploration"
    }
    experiment_id = mlflow.create_experiment(EXPERIMENT_NAME, tags=experiment_tags)
    status_message = f"‚úÖ Created experiment: {EXPERIMENT_NAME}"

print(status_message)

mlflow.set_experiment(EXPERIMENT_NAME)

In [None]:
# Step 2: Define hyperparameter grid for 5+ simulations

# One tuple per planned run, in the column order given by _GRID_FIELDS.
_GRID_FIELDS = ("learning_rate", "batch_size", "dropout", "warmup_steps")
_GRID_ROWS = [
    (1e-5, 8, 0.1, 100),
    (1e-4, 8, 0.1, 100),
    (1e-4, 16, 0.2, 200),
    (1e-4, 32, 0.1, 100),
    (1e-3, 16, 0.2, 50),
    (5e-4, 16, 0.15, 150),
]

HYPERPARAM_GRID = [dict(zip(_GRID_FIELDS, row)) for row in _GRID_ROWS]

print(f"üìã Planned {len(HYPERPARAM_GRID)} experiments:")
for position, hp in enumerate(HYPERPARAM_GRID, start=1):
    print(f"   {position}. lr={hp['learning_rate']}, bs={hp['batch_size']}, dropout={hp['dropout']}")

In [None]:
def simulate_training(hyperparams: dict, n_epochs: int = 10, seed: "int | None" = None) -> dict:
    """
    Simulate a training run with given hyperparameters.

    No model is trained: per-epoch metrics come from a closed-form curve
    plus Gaussian noise.

    Args:
        hyperparams: Mapping with the keys 'learning_rate', 'batch_size' and
            'dropout'. Other keys (e.g. 'warmup_steps') are accepted but do
            not influence the simulation.
        n_epochs: Number of epochs to simulate; must be at least 1.
        seed: Optional seed for the noise. When given, a private
            random.Random(seed) is used so the call is reproducible and does
            not touch the global RNG. When None, the module-level `random`
            generator is used.

    Returns:
        A dict with:
          - 'history': list of per-epoch dicts with 'epoch', 'train_loss',
            'val_loss' and 'accuracy'
          - 'final_train_loss', 'final_val_loss', 'final_accuracy': values
            from the last epoch
          - 'best_accuracy': maximum accuracy over all epochs

    Raises:
        ValueError: If n_epochs is smaller than 1 (there would be no final
            epoch to report).
        KeyError: If a required hyperparameter key is missing.
    """
    if n_epochs < 1:
        raise ValueError(f"n_epochs must be >= 1, got {n_epochs}")

    lr = hyperparams['learning_rate']
    bs = hyperparams['batch_size']
    dropout = hyperparams['dropout']

    # Either the shared module-level generator or an isolated, seeded one.
    rng = random if seed is None else random.Random(seed)

    # Simulated training dynamics (all factors are relative to lr=1e-4, bs=16):
    #   - lr_factor scales how fast the loss decays and the accuracy rises
    #   - bs_factor slightly slows the loss decay for batch sizes above 16
    #   - dropout_penalty shifts the loss curve up and the accuracy down
    lr_factor = 1.0 + (lr - 1e-4) * 5000  # LR effect
    bs_factor = 1.0 - (bs - 16) * 0.005   # Batch size effect
    dropout_penalty = dropout * 0.5       # Dropout slows learning

    # Noise scale is constant across epochs, so compute it once.
    # Note that it shrinks as lr_factor grows.
    noise_sigma = 0.02 * (1.5 - lr_factor * 0.1)

    history = []
    base_loss = 2.0

    for epoch in range(n_epochs):
        # A single noise draw per epoch is shared by all three metrics
        noise = rng.gauss(0, noise_sigma)

        # Training loss decays geometrically and is floored at 0.05
        train_loss = base_loss * (0.8 ** (epoch * lr_factor * bs_factor - dropout_penalty)) + noise
        train_loss = max(0.05, train_loss)

        # Validation loss sits above the training loss; the gap widens with dropout
        val_loss = train_loss * (1.1 + dropout * 0.2) + noise * 0.5

        # Accuracy rises linearly with the epoch and is capped at 0.98
        accuracy = min(0.98, 0.5 + epoch * 0.05 * lr_factor - dropout_penalty * 0.1 + noise)

        history.append({
            "epoch": epoch,
            "train_loss": train_loss,
            "val_loss": val_loss,
            "accuracy": accuracy
        })

    last_epoch = history[-1]
    return {
        "history": history,
        "final_train_loss": last_epoch["train_loss"],
        "final_val_loss": last_epoch["val_loss"],
        "final_accuracy": last_epoch["accuracy"],
        "best_accuracy": max(h["accuracy"] for h in history)
    }

# Smoke-test the simulator on the first grid entry
test_result = simulate_training(HYPERPARAM_GRID[0])
test_final_accuracy = test_result['final_accuracy']
print(f"Test simulation: acc={test_final_accuracy:.4f}")

In [None]:
def create_training_plot(history: list, output_path: str) -> None:
    """
    Create and save a training progress plot.

    Draws two side-by-side panels: train/validation loss curves on the left
    and accuracy on the right (y-axis fixed to 0..1).

    Args:
        history: List of per-epoch dicts, each with the keys 'epoch',
            'train_loss', 'val_loss' and 'accuracy'.
        output_path: File path the figure is written to (dpi=100).
    """
    epochs = [h['epoch'] for h in history]
    train_losses = [h['train_loss'] for h in history]
    val_losses = [h['val_loss'] for h in history]
    accuracies = [h['accuracy'] for h in history]

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))

    # Close this specific figure even if plotting or saving fails, so a bad
    # output path does not leave an open figure behind for every run.
    try:
        ax1.plot(epochs, train_losses, 'b-', label='Train Loss')
        ax1.plot(epochs, val_losses, 'r--', label='Val Loss')
        ax1.set_xlabel('Epoch')
        ax1.set_ylabel('Loss')
        ax1.set_title('Loss Curves')
        ax1.legend()
        ax1.grid(True, alpha=0.3)

        ax2.plot(epochs, accuracies, 'g-', linewidth=2)
        ax2.set_xlabel('Epoch')
        ax2.set_ylabel('Accuracy')
        ax2.set_title('Accuracy')
        ax2.grid(True, alpha=0.3)
        ax2.set_ylim(0, 1)

        fig.tight_layout()
        fig.savefig(output_path, dpi=100)
    finally:
        plt.close(fig)

In [None]:
# Step 3: Run all experiments with full logging

N_EPOCHS = 10           # epochs per simulated run; the same value is logged as a param
SWEEP_ID = "sweep-001"  # tag shared by every run of this sweep
SWEEP_SEED = 42         # seeds the simulated training noise

print("\nüî¨ Running hyperparameter sweep...")
print("=" * 60)

# Seed once, right before the sweep, so re-running this cell reproduces the
# same metrics regardless of how much randomness earlier cells consumed.
random.seed(SWEEP_SEED)

run_ids = []

# Artifacts are staged in a throw-away directory instead of a hard-coded /tmp
# path: this works on any OS and leaves no stray files behind. Each file is
# handed to mlflow.log_artifact before the directory is removed.
with tempfile.TemporaryDirectory() as scratch_dir:
    for i, hyperparams in enumerate(HYPERPARAM_GRID):
        run_name = f"run-{i+1}_lr{hyperparams['learning_rate']}_bs{hyperparams['batch_size']}"

        with mlflow.start_run(run_name=run_name) as run:
            # Log hyperparameters
            mlflow.log_params(hyperparams)
            mlflow.log_param("n_epochs", N_EPOCHS)
            mlflow.log_param("model_type", "simulated")

            # Add tags
            mlflow.set_tag("run_index", str(i + 1))
            mlflow.set_tag("sweep_id", SWEEP_ID)
            mlflow.set_tag("sweep_seed", str(SWEEP_SEED))

            # Run training simulation with the same epoch count that was logged
            results = simulate_training(hyperparams, n_epochs=N_EPOCHS)

            # Log metrics for each epoch (step = epoch index)
            for h in results['history']:
                mlflow.log_metrics({
                    "train_loss": h['train_loss'],
                    "val_loss": h['val_loss'],
                    "accuracy": h['accuracy']
                }, step=h['epoch'])

            # Log final metrics
            mlflow.log_metrics({
                "final_train_loss": results['final_train_loss'],
                "final_val_loss": results['final_val_loss'],
                "final_accuracy": results['final_accuracy'],
                "best_accuracy": results['best_accuracy']
            })

            # Create and log artifact (training plot)
            plot_path = os.path.join(scratch_dir, f"training_plot_{i+1}.png")
            create_training_plot(results['history'], plot_path)
            mlflow.log_artifact(plot_path, "plots")

            # Log config as JSON artifact
            config_path = os.path.join(scratch_dir, f"config_{i+1}.json")
            with open(config_path, 'w') as f:
                json.dump({"hyperparams": hyperparams, "results": results}, f, indent=2)
            mlflow.log_artifact(config_path, "configs")

            run_ids.append(run.info.run_id)

            print(f"Run {i+1}/{len(HYPERPARAM_GRID)}: "
                  f"lr={hyperparams['learning_rate']}, bs={hyperparams['batch_size']} "
                  f"‚Üí acc={results['final_accuracy']:.4f}")

print(f"\n‚úÖ Completed {len(HYPERPARAM_GRID)} experiments!")

In [None]:
# Step 4: Query for best run

# Fetch the experiment's runs, ordered by final accuracy (descending).
runs_df = mlflow.search_runs(
    experiment_ids=[experiment_id],
    filter_string="",
    order_by=["metrics.final_accuracy DESC"],
)

print("\nüìä All Runs (sorted by accuracy):")
print("=" * 80)

display_cols = [
    'run_id',
    'params.learning_rate',
    'params.batch_size',
    'params.dropout',
    'metrics.final_accuracy',
    'metrics.final_val_loss',
]
# Show only the columns that are actually present in the result.
available_cols = [col for col in display_cols if col in runs_df.columns]

top_runs_table = runs_df[available_cols].head(10)
print(top_runs_table.to_string(index=False))

In [None]:
# Best run details
def _format_metric(value, precision: int = 4) -> str:
    """Format a numeric metric with fixed precision, or 'N/A' when it is missing.

    The original code applied ':.4f' directly to the 'N/A' fallback string,
    which raises ValueError; missing (None/NaN) and non-numeric values are
    rendered as 'N/A' here instead.
    """
    try:
        number = float(value)
    except (TypeError, ValueError):
        return "N/A"
    if number != number:  # NaN is the only value not equal to itself
        return "N/A"
    return f"{number:.{precision}f}"


if runs_df.empty:
    # Without this guard, .iloc[0] fails with an uninformative IndexError.
    raise RuntimeError("No runs found in runs_df - run the sweep and query cells first.")

# runs_df is expected to be sorted by final accuracy (descending), so row 0 is the best run.
best_run = runs_df.iloc[0]

print("\nüèÜ BEST RUN:")
print("=" * 60)
print(f"Run ID: {best_run['run_id'][:16]}...")
print(f"\nHyperparameters:")
print(f"  Learning Rate: {best_run.get('params.learning_rate', 'N/A')}")
print(f"  Batch Size: {best_run.get('params.batch_size', 'N/A')}")
print(f"  Dropout: {best_run.get('params.dropout', 'N/A')}")
print(f"  Warmup Steps: {best_run.get('params.warmup_steps', 'N/A')}")
print(f"\nResults:")
print(f"  Final Accuracy: {_format_metric(best_run.get('metrics.final_accuracy'))}")
print(f"  Final Val Loss: {_format_metric(best_run.get('metrics.final_val_loss'))}")

In [None]:
# Step 5: Visualize results

fig, axes = plt.subplots(2, 2, figsize=(14, 10))
(ax1, ax2), (ax3, ax4) = axes

# Keep only runs that logged a learning rate, and add float-typed copies of
# the hyperparameter columns for plotting.
has_learning_rate = runs_df['params.learning_rate'].notna()
viz_df = runs_df[has_learning_rate].assign(
    lr=lambda frame: frame['params.learning_rate'].astype(float),
    bs=lambda frame: frame['params.batch_size'].astype(float),
    dropout=lambda frame: frame['params.dropout'].astype(float),
)
final_accuracy = viz_df['metrics.final_accuracy']

# 1. Learning Rate vs Accuracy (colour = batch size)
scatter = ax1.scatter(
    viz_df['lr'],
    final_accuracy * 100,
    c=viz_df['bs'],
    cmap='viridis',
    s=150,
)
ax1.set_xscale('log')
ax1.set_xlabel('Learning Rate')
ax1.set_ylabel('Final Accuracy (%)')
ax1.set_title('Learning Rate vs Accuracy')
fig.colorbar(scatter, ax=ax1, label='Batch Size')
ax1.grid(True, alpha=0.3)

# 2. Batch Size vs Validation Loss (colour = dropout)
scatter2 = ax2.scatter(
    viz_df['bs'],
    viz_df['metrics.final_val_loss'],
    c=viz_df['dropout'],
    cmap='coolwarm',
    s=150,
)
ax2.set_xlabel('Batch Size')
ax2.set_ylabel('Final Validation Loss')
ax2.set_title('Batch Size vs Val Loss')
fig.colorbar(scatter2, ax=ax2, label='Dropout')
ax2.grid(True, alpha=0.3)

# 3. Accuracy Distribution, with best and mean marked
accuracies = final_accuracy.values * 100
best_accuracy_pct = accuracies.max()
mean_accuracy_pct = accuracies.mean()
ax3.hist(accuracies, bins=10, edgecolor='black', alpha=0.7)
ax3.axvline(x=best_accuracy_pct, color='green', linestyle='--', label=f'Best: {best_accuracy_pct:.1f}%')
ax3.axvline(x=mean_accuracy_pct, color='blue', linestyle='--', label=f'Mean: {mean_accuracy_pct:.1f}%')
ax3.set_xlabel('Final Accuracy (%)')
ax3.set_ylabel('Count')
ax3.set_title('Accuracy Distribution')
ax3.legend()

# 4. Hyperparameter Importance (simple correlation with final accuracy)
correlation_inputs = (
    ('Learning Rate', 'lr'),
    ('Batch Size', 'bs'),
    ('Dropout', 'dropout'),
)
correlations = {
    label: viz_df[column].corr(final_accuracy)
    for label, column in correlation_inputs
}
# Green for positive correlation, red otherwise
colors = ['green' if value > 0 else 'red' for value in correlations.values()]
ax4.barh(list(correlations.keys()), list(correlations.values()), color=colors)
ax4.set_xlabel('Correlation with Accuracy')
ax4.set_title('Hyperparameter Importance')
ax4.axvline(x=0, color='black', linestyle='-')
ax4.set_xlim(-1, 1)

fig.tight_layout()
analysis_path = f"{MLFLOW_DIR}/sweep_analysis.png"
fig.savefig(analysis_path, dpi=150)
plt.show()

print(f"\nüìÅ Analysis saved to {analysis_path}")

---

## Key Takeaways

1. **Structured Experiments**: Named experiments group related runs
2. **Complete Logging**: Log params, metrics over time, and artifacts
3. **Easy Querying**: MLflow's search API makes finding best runs simple
4. **Visualization**: Combine MLflow data with matplotlib for insights

---

## Next Steps

To view these results in the MLflow UI, run the following from this notebook's directory (the `../mlflow` path is relative to it):

```bash
mlflow ui --backend-store-uri ../mlflow --host 0.0.0.0 --port 5000
```

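Then open http://localhost:5000. Because `--host 0.0.0.0` listens on all network interfaces, the UI is also reachable from other machines at `http://<hostname>:5000`; omit `--host` to keep it local-only.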

In [None]:
# Closing summary with the command that opens the MLflow UI on this store
closing_lines = (
    "\n‚úÖ Solution complete!",
    "\nTo view results in MLflow UI, run:",
    f"   mlflow ui --backend-store-uri {MLFLOW_DIR}",
)
for closing_line in closing_lines:
    print(closing_line)