# Hyperparameter Optimization Example

This notebook demonstrates how to use the Hyperparameter Optimizer with experiment tracking to systematically tune model parameters for the Mental Health Risk Assessment System.

## Setup

In [None]:
import sys
sys.path.append('..')

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import roc_auc_score, f1_score

from src.ds.hyperparameter_optimizer import HyperparameterOptimizer
from src.ds.experiment_tracker import ExperimentTracker
from src.ds.storage import FileSystemStorage
from src.database.connection import get_db_connection

## Initialize Components

In [None]:
# --- Experiment tracking ---
# The tracker is built from a filesystem storage backend (base path
# ../experiments/artifacts) and a connection from the project's DB helper.
storage = FileSystemStorage(base_path="../experiments/artifacts")
db = get_db_connection()
tracker = ExperimentTracker(db_connection=db, storage_backend=storage)

# --- Hyperparameter search ---
# Strategy is selected by name; the alternatives are "grid" and "random".
optimizer = HyperparameterOptimizer(
    strategy="bayesian",
    experiment_tracker=tracker,
)

print("✓ Components initialized")

## Prepare Data

In [None]:
# Generate synthetic patient assessment data.
#
# np.random.randint draws from the half-open interval [low, high), so the
# documented maximum total of each screening instrument is passed as
# `max + 1`; otherwise the top score could never appear in the data.
PHQ9_MAX = 27  # PHQ-9 total score range: 0-27
GAD7_MAX = 21  # GAD-7 total score range: 0-21
PCL5_MAX = 80  # PCL-5 total score range: 0-80

np.random.seed(42)  # fixed seed so the synthetic cohort is reproducible
n_samples = 1000

data = pd.DataFrame({
    'age': np.random.randint(18, 80, n_samples),
    'phq9_score': np.random.randint(0, PHQ9_MAX + 1, n_samples),
    'gad7_score': np.random.randint(0, GAD7_MAX + 1, n_samples),
    'pcl5_score': np.random.randint(0, PCL5_MAX + 1, n_samples),
    'sleep_hours': np.random.uniform(3, 10, n_samples),
    'previous_episodes': np.random.randint(0, 5, n_samples),
    'social_support': np.random.randint(1, 10, n_samples)
})

# Create target: a patient is high risk when any of these rules fires
#   1. elevated depression AND anxiety scores,
#   2. elevated PTSD score,
#   3. several previous episodes combined with low social support.
data['high_risk'] = (
    ((data['phq9_score'] > 15) & (data['gad7_score'] > 10)) |
    (data['pcl5_score'] > 50) |
    ((data['previous_episodes'] >= 3) & (data['social_support'] < 4))
).astype(int)

# Sanity checks on the generated frame
assert data['phq9_score'].between(0, PHQ9_MAX).all()
assert data['gad7_score'].between(0, GAD7_MAX).all()
assert data['pcl5_score'].between(0, PCL5_MAX).all()
assert data['high_risk'].isin([0, 1]).all()

# Separate the label from the features, then hold out 20% of the rows for
# testing; stratifying on the label keeps the class ratio in both splits.
y = data['high_risk']
X = data.drop(columns=['high_risk'])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")
print(f"Positive class ratio: {y_train.mean():.2%}")

## Example 1: Random Forest Optimization

In [None]:
# Objective for the Random Forest search
def rf_objective(params):
    """Score one Random Forest configuration by cross-validated ROC AUC.

    Args:
        params: Mapping with the keys 'n_estimators', 'max_depth',
            'min_samples_split' and 'min_samples_leaf' (each cast to int
            here) plus 'max_features' (passed through unchanged).

    Returns:
        Mean of the 5-fold `roc_auc` scores computed on the notebook-level
        X_train / y_train.
    """
    # Dict literal keeps the same lookup order as the keyword arguments.
    forest_kwargs = {
        'n_estimators': int(params['n_estimators']),
        'max_depth': int(params['max_depth']),
        'min_samples_split': int(params['min_samples_split']),
        'min_samples_leaf': int(params['min_samples_leaf']),
        'max_features': params['max_features'],
    }
    candidate = RandomForestClassifier(random_state=42, **forest_kwargs)

    # Cross-validate instead of scoring a single split
    fold_aucs = cross_val_score(
        candidate,
        X_train,
        y_train,
        scoring='roc_auc',
        cv=5,
        n_jobs=-1,
    )
    return fold_aucs.mean()

# Search space for the Random Forest: 2-tuples for the numeric parameters
# (presumably (low, high) bounds - confirm against HyperparameterOptimizer)
# and a list of choices for max_features.
rf_param_space = dict(
    n_estimators=(50, 300),
    max_depth=(5, 30),
    min_samples_split=(2, 20),
    min_samples_leaf=(1, 10),
    max_features=['sqrt', 'log2'],
)

print("Parameter space defined")

In [None]:
# Search the Random Forest space: 50 trials, n_jobs=4 passed to the optimizer
print("Starting Random Forest optimization...")
print("This may take a few minutes...\n")

rf_result = optimizer.optimize(
    param_space=rf_param_space,
    objective_function=rf_objective,
    n_jobs=4,
    n_trials=50,
)

# Report the best score and the configuration that produced it
print("\n✓ Optimization complete!")
print(f"\nBest ROC AUC: {rf_result.best_score:.4f}")
print("\nBest parameters:")
for name, chosen in rf_result.best_params.items():
    print(f"  {name}: {chosen}")

In [None]:
# Two-panel figure: score per trial (left) and parameter importance (right)
import matplotlib.pyplot as plt

fig, (history_ax, importance_ax) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: score of each trial, with the best score as a dashed reference
history_ax.plot(rf_result.optimization_history, marker='o')
history_ax.axhline(y=rf_result.best_score, color='r', linestyle='--', label='Best Score')
history_ax.set_xlabel('Trial')
history_ax.set_ylabel('ROC AUC')
history_ax.set_title('Optimization History - Random Forest')
history_ax.legend()
history_ax.grid(True, alpha=0.3)

# Right panel: importances sorted ascending so the largest bar ends up on top
importance_records = {
    'parameter': list(rf_result.param_importance.keys()),
    'importance': list(rf_result.param_importance.values())
}
importance_df = pd.DataFrame(importance_records).sort_values('importance', ascending=True)

importance_ax.barh(importance_df['parameter'], importance_df['importance'])
importance_ax.set_xlabel('Importance')
importance_ax.set_title('Parameter Importance')

fig.tight_layout()
plt.show()

In [None]:
# Refit a Random Forest on the full training split with the winning
# configuration; the integer-valued parameters are cast as in rf_objective.
rf_best_params = rf_result.best_params
best_rf = RandomForestClassifier(
    random_state=42,
    **{
        'n_estimators': int(rf_best_params['n_estimators']),
        'max_depth': int(rf_best_params['max_depth']),
        'min_samples_split': int(rf_best_params['min_samples_split']),
        'min_samples_leaf': int(rf_best_params['min_samples_leaf']),
        'max_features': rf_best_params['max_features'],
    },
)
best_rf.fit(X_train, y_train)

# Held-out evaluation: positive-class probabilities for AUC, hard labels for F1
y_pred_proba = best_rf.predict_proba(X_test)[:, 1]
y_pred = best_rf.predict(X_test)

test_auc = roc_auc_score(y_test, y_pred_proba)
test_f1 = f1_score(y_test, y_pred)

print("Test Set Performance:")
print(f"  ROC AUC: {test_auc:.4f}")
print(f"  F1 Score: {test_f1:.4f}")

## Example 2: Gradient Boosting Optimization

In [None]:
# Objective for the Gradient Boosting search
def gb_objective(params):
    """Score one Gradient Boosting configuration by cross-validated ROC AUC.

    Args:
        params: Mapping with the keys 'n_estimators', 'max_depth' and
            'min_samples_split' (each cast to int here) plus 'learning_rate'
            and 'subsample' (passed through unchanged).

    Returns:
        Mean of the 5-fold `roc_auc` scores computed on the notebook-level
        X_train / y_train.
    """
    # Dict literal keeps the same lookup order as the keyword arguments.
    booster_kwargs = {
        'n_estimators': int(params['n_estimators']),
        'learning_rate': params['learning_rate'],
        'max_depth': int(params['max_depth']),
        'min_samples_split': int(params['min_samples_split']),
        'subsample': params['subsample'],
    }
    candidate = GradientBoostingClassifier(random_state=42, **booster_kwargs)

    fold_aucs = cross_val_score(
        candidate,
        X_train,
        y_train,
        scoring='roc_auc',
        cv=5,
        n_jobs=-1,
    )
    return fold_aucs.mean()

# Search space for Gradient Boosting: one 2-tuple per parameter
# (presumably (low, high) bounds - confirm against HyperparameterOptimizer).
gb_param_space = dict(
    n_estimators=(50, 200),
    learning_rate=(0.01, 0.3),
    max_depth=(3, 10),
    min_samples_split=(2, 20),
    subsample=(0.6, 1.0),
)

print("Gradient Boosting parameter space defined")

In [None]:
# Search the Gradient Boosting space: 50 trials, n_jobs=4 passed to the optimizer
print("Starting Gradient Boosting optimization...")
print("This may take a few minutes...\n")

gb_result = optimizer.optimize(
    param_space=gb_param_space,
    objective_function=gb_objective,
    n_jobs=4,
    n_trials=50,
)

# Report the best score and the configuration that produced it
print("\n✓ Optimization complete!")
print(f"\nBest ROC AUC: {gb_result.best_score:.4f}")
print("\nBest parameters:")
for name, chosen in gb_result.best_params.items():
    print(f"  {name}: {chosen}")

In [None]:
# Two-panel figure for Gradient Boosting: score per trial (left) and
# parameter importance (right). Relies on `plt` imported in an earlier cell.
fig, (history_ax, importance_ax) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: score of each trial, with the best score as a dashed reference
history_ax.plot(gb_result.optimization_history, marker='o', color='green')
history_ax.axhline(y=gb_result.best_score, color='r', linestyle='--', label='Best Score')
history_ax.set_xlabel('Trial')
history_ax.set_ylabel('ROC AUC')
history_ax.set_title('Optimization History - Gradient Boosting')
history_ax.legend()
history_ax.grid(True, alpha=0.3)

# Right panel: importances sorted ascending so the largest bar ends up on top
importance_records = {
    'parameter': list(gb_result.param_importance.keys()),
    'importance': list(gb_result.param_importance.values())
}
importance_df = pd.DataFrame(importance_records).sort_values('importance', ascending=True)

importance_ax.barh(importance_df['parameter'], importance_df['importance'], color='green')
importance_ax.set_xlabel('Importance')
importance_ax.set_title('Parameter Importance')

fig.tight_layout()
plt.show()

## Compare Models

In [None]:
# Refit Gradient Boosting on the full training split with the winning
# configuration; the integer-valued parameters are cast as in gb_objective.
gb_best_params = gb_result.best_params
best_gb = GradientBoostingClassifier(
    random_state=42,
    **{
        'n_estimators': int(gb_best_params['n_estimators']),
        'learning_rate': gb_best_params['learning_rate'],
        'max_depth': int(gb_best_params['max_depth']),
        'min_samples_split': int(gb_best_params['min_samples_split']),
        'subsample': gb_best_params['subsample'],
    },
)
best_gb.fit(X_train, y_train)

# Held-out evaluation: positive-class probabilities for AUC, hard labels for F1
gb_pred_proba = best_gb.predict_proba(X_test)[:, 1]
gb_pred = best_gb.predict(X_test)

gb_test_auc = roc_auc_score(y_test, gb_pred_proba)
gb_test_f1 = f1_score(y_test, gb_pred)

# Side-by-side summary: one row per model, CV score next to test metrics
comparison = pd.DataFrame([
    {
        'Model': 'Random Forest',
        'CV ROC AUC': rf_result.best_score,
        'Test ROC AUC': test_auc,
        'Test F1': test_f1,
    },
    {
        'Model': 'Gradient Boosting',
        'CV ROC AUC': gb_result.best_score,
        'Test ROC AUC': gb_test_auc,
        'Test F1': gb_test_f1,
    },
])

print("\nModel Comparison:")
print("=" * 70)
print(comparison.to_string(index=False))
print("=" * 70)

In [None]:
# Bar charts comparing the two tuned models
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
auc_ax, f1_ax = axes

# Left: grouped bars, CV AUC to the left of each tick and test AUC to the right
models = comparison['Model']
x = np.arange(len(models))
width = 0.35

for shift, metric in ((-width/2, 'CV ROC AUC'), (width/2, 'Test ROC AUC')):
    auc_ax.bar(x + shift, comparison[metric], width, label=metric, alpha=0.8)
auc_ax.set_ylabel('ROC AUC')
auc_ax.set_title('Model Performance Comparison')
auc_ax.set_xticks(x)
auc_ax.set_xticklabels(models)
auc_ax.legend()
auc_ax.grid(True, alpha=0.3)

# Right: test F1 per model
f1_ax.bar(models, comparison['Test F1'], color=['blue', 'green'], alpha=0.7)
f1_ax.set_ylabel('F1 Score')
f1_ax.set_title('Test F1 Score Comparison')
f1_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## Analyze Optimization Trials

In [None]:
# Tabulate every Random Forest trial, then show the best and worst five.
# Assumes each trial record exposes a 'score' field - confirm against the
# optimizer's all_trials format.
trials_df = pd.DataFrame(rf_result.all_trials)

print(f"Total trials: {len(trials_df)}")

for heading, pick_five in (
    ("\nTop 5 trials:", trials_df.nlargest),
    ("\nBottom 5 trials:", trials_df.nsmallest),
):
    print(heading)
    print(pick_five(5, 'score'))

In [None]:
# Scatter each numeric hyperparameter against the trial score, one panel each
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# (parameter column, point colour); None keeps matplotlib's default colour
panels = [
    ('n_estimators', None),
    ('max_depth', 'orange'),
    ('min_samples_split', 'green'),
    ('min_samples_leaf', 'red'),
]

# axes.flat walks the grid row by row: [0, 0], [0, 1], [1, 0], [1, 1]
for panel_ax, (param_name, point_color) in zip(axes.flat, panels):
    scatter_style = {'alpha': 0.6}
    if point_color is not None:
        scatter_style['color'] = point_color
    panel_ax.scatter(trials_df[param_name], trials_df['score'], **scatter_style)
    panel_ax.set_xlabel(param_name)
    panel_ax.set_ylabel('ROC AUC')
    panel_ax.set_title(f'{param_name} vs Performance')
    panel_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## Export Optimization Results

In [None]:
# Write the optimizer's own visualization to ../optimization_results
import os

results_dir = "../optimization_results"
os.makedirs(results_dir, exist_ok=True)  # no error if the directory already exists

# NOTE(review): the optimizer has run both the RF and the GB search by this
# point; confirm that visualize_optimization() renders the run implied by the
# "rf_optimization.png" file name.
viz_path = optimizer.visualize_optimization(
    output_path=results_dir + "/rf_optimization.png"
)

print(f"✓ Optimization visualization saved to: {viz_path}")

In [None]:
# Save best parameters of both searches as JSON
import json


def _json_default(value):
    """Convert NumPy values that the json module cannot encode natively.

    json calls this hook only for objects it does not know how to serialize.
    NumPy scalars (e.g. np.int64, np.bool_) become the matching Python
    scalar and arrays become lists; anything else raises TypeError, which
    is what json does by default.
    """
    if isinstance(value, np.generic):
        return value.item()
    if isinstance(value, np.ndarray):
        return value.tolist()
    raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")


best_params_path = "../optimization_results/best_params.json"

# Serialize before opening the file so a serialization error cannot leave a
# truncated best_params.json behind.
best_params_json = json.dumps(
    {
        'random_forest': rf_result.best_params,
        'gradient_boosting': gb_result.best_params
    },
    indent=2,
    default=_json_default,
)
with open(best_params_path, 'w', encoding='utf-8') as f:
    f.write(best_params_json)

print(f"✓ Best parameters saved to: {best_params_path}")

## Summary

This notebook demonstrated:

1. **Setting up** hyperparameter optimization with experiment tracking
2. **Defining** objective functions and parameter spaces
3. **Running** Bayesian optimization for multiple models
4. **Visualizing** optimization history and parameter importance
5. **Comparing** optimized models
6. **Analyzing** trial results and parameter relationships
7. **Exporting** results for documentation

### Key Findings

- Bayesian optimization efficiently explores parameter space
- Parameter importance helps focus future tuning efforts
- Cross-validation provides robust performance estimates
- Automated tracking ensures reproducibility

### Next Steps

- Try different optimization strategies (grid, random)
- Optimize for different metrics (F1, precision, recall)
- Implement early stopping for faster optimization
- Set up automated hyperparameter tuning pipelines