# Task 13.6: Agent Benchmark - Solutions

This notebook contains complete solutions for the Agent Benchmarking exercises.

---

## Challenge: Continuous Evaluation Pipeline

**Solution:** A pipeline that tracks performance over time and alerts on degradation.

In [None]:
import sys
from pathlib import Path
from datetime import datetime
from typing import Dict, List, Callable, Optional
from dataclasses import dataclass, field
import json
import statistics

# Put the sibling ``scripts`` directory (parent of the current working
# directory / scripts) at the front of the import path.
# NOTE(review): this depends on where the notebook is launched from -- confirm
# the working directory is the notebook's own folder.
sys.path.insert(0, str(Path.cwd().parent / 'scripts'))

# Project-local benchmarking helpers; presumably resolved through the
# ``scripts`` path entry added above.
from benchmark_utils import (
    TestCase, BenchmarkResults, AgentEvaluator, TestCategory, Difficulty,
    generate_report, save_results_json
)

print("Imports ready!")

In [None]:
@dataclass
class PerformanceSnapshot:
    """Result summary of one evaluation run, stamped with the time it was taken.

    Instances are plain records: the pipeline creates one per run and keeps
    them in chronological order to build a performance history.
    """

    # When the evaluation run was recorded.
    timestamp: datetime
    # Aggregate score for the run (rendered as a percentage in reports).
    overall_score: float
    # Score per category, keyed by category name.
    category_scores: Dict[str, float]
    # Mean latency across the run's test cases, in milliseconds.
    mean_latency_ms: float
    # Number of test cases that passed.
    tests_passed: int
    # Number of test cases that were executed.
    tests_total: int
    # Free-form label of the agent version that was evaluated.
    version: str = field(default="unknown")

@dataclass
class Alert:
    """Record of a single threshold violation detected during evaluation.

    The creation time is filled in automatically unless a ``timestamp`` is
    passed explicitly.
    """

    # Severity label: one of "info", "warning" or "critical".
    severity: str
    # Human-readable description of what went wrong.
    message: str
    # Name of the metric that triggered the alert.
    metric: str
    # Observed value of that metric.
    current_value: float
    # Threshold the observed value was compared against.
    threshold: float
    # Creation time; evaluated per instance via the default factory.
    timestamp: datetime = field(default_factory=datetime.now)

print("Data classes defined!")

In [None]:
class ContinuousEvaluationPipeline:
    """Continuous evaluation pipeline with alerting.

    Each call to ``run_evaluation`` benchmarks the agent against the configured
    test cases, appends a ``PerformanceSnapshot`` to ``history`` and appends an
    ``Alert`` to ``alerts`` for every threshold that the run violates.

    Attributes:
        agent_func: Callable under evaluation (handed to ``AgentEvaluator``).
        test_cases: Test cases executed on every run.
        agent_name: Display name used in log lines and reports.
        history: Snapshots of all runs, oldest first.
        alerts: All alerts raised so far, oldest first.
        thresholds: Alerting limits; may be adjusted after construction.
    """

    # Alerts younger than this many seconds are listed as "recent" in the
    # trend report.
    RECENT_ALERT_WINDOW_SECONDS = 3600

    def __init__(
        self,
        agent_func: Callable,
        test_cases: List[TestCase],
        agent_name: str = "Agent"
    ):
        """Store the agent, its test cases and the default alert thresholds.

        Args:
            agent_func: Callable to benchmark.
            test_cases: Test cases to run on each evaluation.
            agent_name: Name shown in output and stored in saved history.
        """
        self.agent_func = agent_func
        self.test_cases = test_cases
        self.agent_name = agent_name
        self.history: List[PerformanceSnapshot] = []
        self.alerts: List[Alert] = []

        # Thresholds
        self.thresholds = {
            "min_score": 0.7,
            "max_latency_ms": 5000,
            "min_pass_rate": 0.8,
            # Absolute drop in overall_score versus the previous run
            # (0.1 == ten percentage points).
            "score_degradation": 0.1,
        }

    def run_evaluation(self, version: str = "current") -> PerformanceSnapshot:
        """Run a single evaluation and record results.

        Args:
            version: Label stored on the snapshot to identify the agent build.

        Returns:
            The snapshot created for this run (also appended to ``history``).
        """
        print(f"\n🔍 Running evaluation for {self.agent_name} (v{version})...")

        evaluator = AgentEvaluator(self.agent_func, verbose=False)
        results = evaluator.run_benchmark(self.test_cases, self.agent_name)

        # Create snapshot
        snapshot = PerformanceSnapshot(
            timestamp=datetime.now(),
            overall_score=results.overall_score,
            category_scores=results.category_scores,
            mean_latency_ms=results.timing_stats["mean_latency_ms"],
            tests_passed=sum(1 for r in results.results if r.passed),
            tests_total=len(results.results),
            version=version
        )

        # The snapshot must be in history before checking alerts: the
        # degradation check compares it against history[-2].
        self.history.append(snapshot)

        # Check for alerts
        self._check_alerts(snapshot)

        return snapshot

    def _check_alerts(self, snapshot: PerformanceSnapshot):
        """Check thresholds and create alerts.

        Expects ``snapshot`` to already be the last entry of ``history`` so
        that the previous run is found at ``history[-2]``.

        Args:
            snapshot: The run to validate against ``thresholds``.
        """

        # Check minimum score
        if snapshot.overall_score < self.thresholds["min_score"]:
            self.alerts.append(Alert(
                severity="warning",
                message="Score below minimum threshold",
                metric="overall_score",
                current_value=snapshot.overall_score,
                threshold=self.thresholds["min_score"]
            ))

        # Check latency
        if snapshot.mean_latency_ms > self.thresholds["max_latency_ms"]:
            self.alerts.append(Alert(
                severity="warning",
                message="Latency above threshold",
                metric="mean_latency_ms",
                current_value=snapshot.mean_latency_ms,
                threshold=self.thresholds["max_latency_ms"]
            ))

        # Check pass rate. With zero executed tests the rate is undefined, so
        # the check is skipped instead of raising ZeroDivisionError.
        if snapshot.tests_total > 0:
            pass_rate = snapshot.tests_passed / snapshot.tests_total
            if pass_rate < self.thresholds["min_pass_rate"]:
                self.alerts.append(Alert(
                    severity="critical",
                    message="Pass rate below minimum",
                    metric="pass_rate",
                    current_value=pass_rate,
                    threshold=self.thresholds["min_pass_rate"]
                ))

        # Check for degradation from previous run
        if len(self.history) >= 2:
            prev = self.history[-2]
            score_drop = prev.overall_score - snapshot.overall_score
            if score_drop > self.thresholds["score_degradation"]:
                self.alerts.append(Alert(
                    severity="critical",
                    message="Significant score degradation from previous run",
                    metric="score_degradation",
                    current_value=score_drop,
                    threshold=self.thresholds["score_degradation"]
                ))

    def get_trend_report(self) -> str:
        """Generate a trend report from history.

        Returns:
            A multi-line text report with score and latency statistics over
            all recorded runs plus up to five of the most recent alerts, or a
            short notice when no run has been recorded yet.
        """
        if not self.history:
            return "No evaluation history available."

        lines = [
            "="*60,
            f"PERFORMANCE TREND REPORT: {self.agent_name}",
            "="*60,
            f"Total evaluations: {len(self.history)}",
            ""
        ]

        # Score trend
        scores = [s.overall_score for s in self.history]
        lines.append("Score Trend:")
        lines.append(f"  Latest: {scores[-1]:.1%}")
        lines.append(f"  Average: {statistics.mean(scores):.1%}")
        lines.append(f"  Min: {min(scores):.1%}")
        lines.append(f"  Max: {max(scores):.1%}")

        # Latency trend
        latencies = [s.mean_latency_ms for s in self.history]
        lines.append("\nLatency Trend:")
        lines.append(f"  Latest: {latencies[-1]:.0f}ms")
        lines.append(f"  Average: {statistics.mean(latencies):.0f}ms")

        # Recent alerts. total_seconds() is required here: timedelta.seconds
        # drops the days component, which would make an alert from 25 hours
        # ago look one hour old.
        now = datetime.now()
        recent_alerts = [
            a for a in self.alerts
            if (now - a.timestamp).total_seconds() < self.RECENT_ALERT_WINDOW_SECONDS
        ]
        if recent_alerts:
            lines.append("\n⚠️ Recent Alerts:")
            for alert in recent_alerts[-5:]:
                lines.append(f"  [{alert.severity.upper()}] {alert.message}")
        else:
            lines.append("\n✅ No recent alerts")

        lines.append("\n" + "="*60)

        return "\n".join(lines)

    def save_history(self, path: str):
        """Save evaluation history to JSON.

        Writes the agent name, every snapshot and every alert to ``path``;
        timestamps are serialized as ISO 8601 strings.

        Args:
            path: Destination file; overwritten if it already exists.
        """
        data = {
            "agent_name": self.agent_name,
            "history": [
                {
                    "timestamp": s.timestamp.isoformat(),
                    "overall_score": s.overall_score,
                    "category_scores": s.category_scores,
                    "mean_latency_ms": s.mean_latency_ms,
                    "tests_passed": s.tests_passed,
                    "tests_total": s.tests_total,
                    "version": s.version
                }
                for s in self.history
            ],
            "alerts": [
                {
                    "severity": a.severity,
                    "message": a.message,
                    "metric": a.metric,
                    "current_value": a.current_value,
                    "threshold": a.threshold,
                    "timestamp": a.timestamp.isoformat()
                }
                for a in self.alerts
            ]
        }

        # Explicit encoding keeps the output independent of the platform's
        # default locale.
        with open(path, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2)

        print(f"History saved to: {path}")

print("ContinuousEvaluationPipeline defined!")

In [None]:
# Create test cases
# Two easy factual-retrieval questions whose answers both appear in the fixed
# knowledge string used by the test agent defined later in this notebook.
test_cases = [
    TestCase(
        id="test_001",
        query="What is DGX Spark's memory?",
        expected_answer="128GB unified memory",
        category=TestCategory.FACTUAL_RETRIEVAL,
        difficulty=Difficulty.EASY,
        keywords=["128GB", "unified"]
    ),
    TestCase(
        id="test_002",
        query="How many CUDA cores?",
        expected_answer="6,144 CUDA cores",
        category=TestCategory.FACTUAL_RETRIEVAL,
        difficulty=Difficulty.EASY,
        # NOTE(review): the keyword is "6144" (no comma) while the expected
        # answer is written "6,144" -- confirm that keyword matching in
        # benchmark_utils normalizes separators, otherwise this keyword may
        # never match.
        keywords=["6144", "CUDA"]
    ),
]

# Create a simple test agent
# The language model is reached through LangChain's community Ollama wrapper.
# NOTE(review): presumably requires a running Ollama server with the
# "llama3.1:8b" model available -- confirm before executing this cell.
from langchain_community.llms import Ollama
llm = Ollama(model="llama3.1:8b", temperature=0.1)

def test_agent(query: str) -> str:
    """Answer ``query`` by prompting the module-level ``llm`` with a fixed fact.

    The prompt embeds a single hard-coded sentence of background knowledge
    followed by the question; whatever ``llm.invoke`` returns is passed back
    unchanged.
    """
    context = "DGX Spark has 128GB unified memory and 6,144 CUDA cores."
    prompt = f"Based on: {context}\n\nAnswer: {query}"
    return llm.invoke(prompt)

print("Test setup complete!")

In [None]:
# Create and use the pipeline
pipeline = ContinuousEvaluationPipeline(
    agent_func=test_agent,
    test_cases=test_cases,
    agent_name="Test RAG Agent"
)

# Simulate multiple evaluation runs
# The same agent function is benchmarked on every iteration; only the version
# label stored on the resulting snapshot changes.
for version in ["1.0.0", "1.0.1", "1.1.0"]:
    snapshot = pipeline.run_evaluation(version=version)
    print(f"  Score: {snapshot.overall_score:.1%}, Latency: {snapshot.mean_latency_ms:.0f}ms")

In [None]:
# View the trend report
# Prints score and latency statistics over all runs recorded so far, followed
# by the pipeline's recent alerts (if any).
print(pipeline.get_trend_report())

In [None]:
# Save history
# Results go to <parent of cwd>/data/benchmark_results; the directory is
# created on demand and an existing one is reused.
output_dir = Path.cwd().parent / "data" / "benchmark_results"
output_dir.mkdir(parents=True, exist_ok=True)
# save_history takes a string path, hence the explicit str() conversion.
pipeline.save_history(str(output_dir / "continuous_eval_history.json"))

## Key Takeaways

1. **Continuous evaluation** catches regressions early
2. **Alerting thresholds** provide automatic monitoring
3. **Version tracking** helps identify which changes caused issues
4. **Trend analysis** shows long-term performance patterns
5. **Persistent history** enables post-mortem analysis