# Experiment Tracking Example

This notebook demonstrates how to use the Experiment Tracker to log, track, and compare machine learning experiments in the Mental Health Risk Assessment System.

## Setup

In [None]:
import os
import sys
sys.path.append('..')

import joblib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

from src.ds.experiment_tracker import ExperimentTracker
from src.ds.storage import FileSystemStorage
from src.database.connection import get_db_connection

## Initialize Experiment Tracker

In [None]:
# Set up the two backends the tracker is constructed with: a filesystem
# artifact store rooted at ../experiments/artifacts and a database connection.
artifact_root = "../experiments/artifacts"
storage = FileSystemStorage(base_path=artifact_root)
db = get_db_connection()

# Build the tracker on top of both backends
tracker = ExperimentTracker(
    db_connection=db,
    storage_backend=storage,
)

print("✓ Experiment tracker initialized")

## Load Sample Data

In [None]:
# Build a reproducible synthetic table of patient assessments
np.random.seed(42)
n_samples = 1000

# Every column draws from the global NumPy RNG, so the order of the draws
# below must stay fixed for the seeded data to come out the same.
# NOTE(review): randint's upper bound is exclusive, so the three scores top out
# at 26 / 20 / 79 -- confirm whether 27 / 21 / 80 were meant to be reachable.
feature_columns = {}
feature_columns['age'] = np.random.randint(18, 80, n_samples)
feature_columns['phq9_score'] = np.random.randint(0, 27, n_samples)
feature_columns['gad7_score'] = np.random.randint(0, 21, n_samples)
feature_columns['pcl5_score'] = np.random.randint(0, 80, n_samples)
feature_columns['sleep_hours'] = np.random.uniform(3, 10, n_samples)
feature_columns['previous_episodes'] = np.random.randint(0, 5, n_samples)
data = pd.DataFrame(feature_columns)

# Label a row as high risk (1) when any one of the three scores exceeds its cutoff
phq9_elevated = data['phq9_score'] > 15
gad7_elevated = data['gad7_score'] > 10
pcl5_elevated = data['pcl5_score'] > 40
data['high_risk'] = (phq9_elevated | gad7_elevated | pcl5_elevated).astype(int)

print(f"Dataset shape: {data.shape}")
print(f"High risk cases: {data['high_risk'].sum()} ({data['high_risk'].mean()*100:.1f}%)")
data.head()

## Experiment 1: Baseline Random Forest

In [None]:
# Open the baseline run; the tags record the model family, the run's purpose
# and the dataset label.
baseline_tags = {
    "model_type": "random_forest",
    "purpose": "baseline",
    "dataset": "synthetic_v1",
}
run = tracker.start_run(
    experiment_name="mental_health_risk_prediction",
    run_name="baseline_random_forest",
    tags=baseline_tags,
)

print(f"Started run: {run.run_id}")

In [None]:
# Separate the label column from the feature columns
y = data['high_risk']
X = data.drop(columns='high_risk')

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
split_test_size = 0.2
split_seed = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=split_test_size, random_state=split_seed
)

# Hand the dataset size and the split settings to the tracker
tracker.log_params({
    "n_samples": len(data),
    "n_features": X.shape[1],
    "test_size": split_test_size,
    "random_state": split_seed,
})

In [None]:
# Hyperparameters for the baseline forest
model_params = dict(
    n_estimators=100,
    max_depth=10,
    min_samples_split=5,
    random_state=42,
)

# Log the hyperparameters before fitting
tracker.log_params(model_params)

# fit() returns the estimator itself, so construction and training are chained
model = RandomForestClassifier(**model_params).fit(X_train, y_train)

print("✓ Model trained")

In [None]:
# Score the held-out split: hard labels feed accuracy and F1, while ROC AUC
# uses column 1 of predict_proba (the probability of class 1).
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]

metrics = dict(
    accuracy=accuracy_score(y_test, y_pred),
    f1_score=f1_score(y_test, y_pred),
    roc_auc=roc_auc_score(y_test, y_pred_proba),
)

# Send all three metrics to the tracker in one call
tracker.log_metrics(metrics)

print("Model Performance:")
for metric, value in metrics.items():
    print(f"  {metric}: {value:.4f}")

In [None]:
# Persist the fitted baseline model to disk, then register the file with the
# tracker. `os` and `joblib` are imported in the notebook's setup cell so that
# every dependency is declared in one place.
os.makedirs("../models", exist_ok=True)  # no error if the directory already exists
model_path = "../models/baseline_rf.pkl"
joblib.dump(model, model_path)

tracker.log_artifact(model_path, artifact_type="model")
print("✓ Model saved and logged")

In [None]:
# Rank the features by the fitted forest's importances, highest first.
# `plt` and `os` are imported in the notebook's setup cell.
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': model.feature_importances_
}).sort_values('importance', ascending=False)

# Draw on an explicit Figure/Axes pair rather than pyplot's implicit current
# figure, so the object being saved is unambiguous.
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(feature_importance['feature'], feature_importance['importance'])
ax.set_xlabel('Importance')
ax.set_title('Feature Importance - Baseline Random Forest')
fig.tight_layout()

# Make sure the output directory exists, and save before showing the figure
os.makedirs("../plots", exist_ok=True)
plot_path = "../plots/baseline_feature_importance.png"
fig.savefig(plot_path)
plt.show()

tracker.log_artifact(plot_path, artifact_type="plot")
print("✓ Feature importance plot logged")

In [None]:
# Close the baseline run, reporting it as finished
final_status = "FINISHED"
tracker.end_run(status=final_status)
print("✓ Run completed")

## Experiment 2: Tuned Random Forest

In [None]:
# Second run: same experiment and data split, different forest settings
tuning_tags = {
    "model_type": "random_forest",
    "purpose": "hyperparameter_tuning",
    "dataset": "synthetic_v1",
}
run2 = tracker.start_run(
    experiment_name="mental_health_risk_prediction",
    run_name="tuned_random_forest",
    tags=tuning_tags,
)

# Log the same dataset description that the baseline run recorded
dataset_params = {
    "n_samples": len(data),
    "n_features": X.shape[1],
    "test_size": 0.2,
    "random_state": 42,
}
tracker.log_params(dataset_params)

# More trees and a deeper maximum depth than the baseline configuration
tuned_params = dict(
    n_estimators=200,
    max_depth=15,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features="sqrt",
    random_state=42,
)
tracker.log_params(tuned_params)

# Fit on the training split created earlier (fit() returns the estimator)
model2 = RandomForestClassifier(**tuned_params).fit(X_train, y_train)

# Score on the same held-out split as the baseline
y_pred2 = model2.predict(X_test)
y_pred_proba2 = model2.predict_proba(X_test)[:, 1]

metrics2 = dict(
    accuracy=accuracy_score(y_test, y_pred2),
    f1_score=f1_score(y_test, y_pred2),
    roc_auc=roc_auc_score(y_test, y_pred_proba2),
)
tracker.log_metrics(metrics2)

print("Tuned Model Performance:")
for metric, value in metrics2.items():
    print(f"  {metric}: {value:.4f}")

# Persist and register the tuned model; this relies on ../models already
# existing from the baseline model's save step.
model_path2 = "../models/tuned_rf.pkl"
joblib.dump(model2, model_path2)
tracker.log_artifact(model_path2, artifact_type="model")

tracker.end_run(status="FINISHED")
print("\n✓ Tuned model run completed")

## Compare Experiments

In [None]:
# Fetch every run in the experiment, ordered by ROC AUC descending
runs = tracker.search_runs(
    experiment_name="mental_health_risk_prediction",
    order_by=["metrics.roc_auc DESC"]
)

print(f"Found {len(runs)} runs\n")

# Use a dedicated loop variable. Iterating with `run` would rebind the global
# `run` (the baseline run handle) to the last search result, and the comparison
# cell that reads `run.run_id` would then compare the wrong runs.
for found_run in runs:
    print(f"Run: {found_run.run_name}")
    print(f"  ID: {found_run.run_id}")
    print(f"  Status: {found_run.status}")
    print("  Metrics:")
    for metric, values in found_run.metrics.items():
        if values:
            # Latest entry is the last one; its first element is taken as the
            # metric value (as the indexing implies -- entry layout comes from the tracker).
            print(f"    {metric}: {values[-1][0]:.4f}")
        else:
            # Nothing recorded for this metric: formatting None with ':.4f'
            # would raise TypeError, so print a placeholder instead.
            print(f"    {metric}: n/a")
    print()

In [None]:
# Put the baseline and tuned runs side by side on the three logged metrics.
# NOTE(review): `run` must still be bound to the baseline run returned by
# start_run at this point; any cell executed in between that rebinds `run`
# changes which runs get compared.
compared_run_ids = [run.run_id, run2.run_id]
compared_metrics = ["accuracy", "f1_score", "roc_auc"]
comparison = tracker.compare_runs(
    run_ids=compared_run_ids,
    metric_names=compared_metrics,
)

print("Run Comparison:")
print(comparison)

## Retrieve and Use Logged Artifacts

In [None]:
# `runs` was requested ordered by ROC AUC descending, so its first entry is
# the best-scoring run.
best_run = runs[0]

print(f"Best run: {best_run.run_name}")
# Latest logged ROC AUC: last entry of the metric's list, first element of that entry
print(f"ROC AUC: {best_run.metrics['roc_auc'][-1][0]:.4f}")

# Show what was stored alongside the best run, one artifact per line
print(f"\nArtifacts ({len(best_run.artifacts)}):")
for artifact in best_run.artifacts:
    print(f"  - {artifact.artifact_type}: {artifact.path}")

## Summary

This notebook demonstrated:

1. **Initializing** the experiment tracker
2. **Starting runs** with metadata and tags
3. **Logging parameters** (hyperparameters and dataset info)
4. **Logging metrics** (accuracy, F1, ROC AUC)
5. **Logging artifacts** (models and plots)
6. **Searching and comparing** runs
7. **Retrieving** the best run and listing its logged artifacts

### Next Steps

- Integrate with hyperparameter optimization
- Link experiments to model registry
- Set up automated experiment tracking in production pipelines
- Use experiment tracking for A/B testing