# Lab 3.6.6: Agent Benchmark - SOLUTIONS

**Complete solutions with explanations and alternative approaches**

---

## Setup

In [None]:
from typing import Dict, Any, List, Optional, Callable, Tuple
from dataclasses import dataclass, field
from enum import Enum
import json
import time
import statistics
from collections import defaultdict

# Confirms that the import cell above executed without raising.
print("Setup complete!")

---

## Exercise 1 Solution: Comprehensive Benchmark Framework

**Task**: Build a complete agent evaluation framework with multiple metrics.

In [None]:
class TestCategory(Enum):
    """Categories of benchmark tests.

    Each member's value is the lowercase string form of the category, so a
    member can be looked up from that string, e.g. ``TestCategory("factual")``.
    """
    FACTUAL = "factual"
    REASONING = "reasoning"
    CALCULATION = "calculation"
    TOOL_USE = "tool_use"
    MULTI_STEP = "multi_step"
    CREATIVE = "creative"


@dataclass
class BenchmarkTest:
    """A single benchmark test case.

    Only ``id``, ``question``, ``expected_answer`` and ``category`` are
    required; the remaining fields have defaults.
    """
    # Identifier for the test (e.g. "FACT001" in the sample data).
    id: str
    # Text passed to the agent under test.
    question: str
    # Reference answer the agent response is scored against.
    expected_answer: str
    # Category the test belongs to.
    category: TestCategory
    # Terms scored by the keyword-match metric; an empty list scores 1.0.
    keywords: List[str] = field(default_factory=list)
    # Tool names scored by the tool-usage metric; an empty list scores 1.0.
    required_tools: List[str] = field(default_factory=list)
    # Free-form difficulty label; defaults to "medium".
    difficulty: str = "medium"
    # Arbitrary extra data attached to the test.
    metadata: Dict[str, Any] = field(default_factory=dict)


@dataclass
class TestResult:
    """Result of running a benchmark test."""
    # Identifier of the BenchmarkTest this result belongs to.
    test_id: str
    # Raw text returned by the agent ("" when the agent call raised).
    agent_response: str
    # Reference answer copied from the test.
    expected_answer: str
    # Metric name -> score; the "overall" key holds the weighted total.
    scores: Dict[str, float]
    # Wall-clock duration of the agent call in milliseconds.
    latency_ms: float
    # Tool names the agent reported using.
    tools_used: List[str]
    # True when the overall score reached the benchmark's pass threshold.
    passed: bool
    # Exception message when the agent call failed, otherwise None.
    error: Optional[str] = None


class MetricCalculator:
    """
    Collection of evaluation metrics for agent responses.

    Every metric is a static method that compares an agent response (or the
    tools it used) against a reference and returns a float score where
    higher is better and 1.0 is the best possible value.
    """

    @staticmethod
    def keyword_match(response: str, keywords: List[str]) -> float:
        """
        Calculate keyword match score.

        Score = (matched keywords) / (total keywords)

        A keyword is matched when it occurs anywhere in the response as a
        case-insensitive substring.

        Args:
            response: Agent response text.
            keywords: Keywords expected in the response.

        Returns:
            Fraction of keywords found, or 1.0 when no keywords are given.
        """
        if not keywords:
            return 1.0

        response_lower = response.lower()
        matched = sum(1 for kw in keywords if kw.lower() in response_lower)
        return matched / len(keywords)

    @staticmethod
    def exact_match(response: str, expected: str) -> float:
        """
        Check whether the expected answer appears verbatim in the response.

        Both strings are lowercased and runs of whitespace are collapsed to
        single spaces before comparing. The check is containment, not
        equality: a response that embeds the expected answer inside a longer
        sentence still scores 1.0.

        Args:
            response: Agent response text.
            expected: Reference answer.

        Returns:
            1.0 if the normalized expected answer is a substring of the
            normalized response, otherwise 0.0.
        """
        norm_response = " ".join(response.lower().split())
        norm_expected = " ".join(expected.lower().split())
        return 1.0 if norm_expected in norm_response else 0.0

    @staticmethod
    def token_f1(response: str, expected: str) -> float:
        """
        Calculate token-level F1 score.

        Tokens are the unique lowercase whitespace-separated words of each
        string; punctuation stays attached to its word.

        Args:
            response: Agent response text.
            expected: Reference answer.

        Returns:
            Harmonic mean of token precision and recall. When the expected
            answer has no tokens the score is 1.0 only if the response has
            none either.
        """
        response_tokens = set(response.lower().split())
        expected_tokens = set(expected.lower().split())

        if not expected_tokens:
            return 1.0 if not response_tokens else 0.0

        overlap = response_tokens & expected_tokens
        if not overlap:
            return 0.0

        # A non-empty overlap guarantees both token sets are non-empty, so
        # neither division can fail and precision + recall is positive.
        precision = len(overlap) / len(response_tokens)
        recall = len(overlap) / len(expected_tokens)
        return 2 * (precision * recall) / (precision + recall)

    @staticmethod
    def semantic_similarity(response: str, expected: str) -> float:
        """
        Approximate semantic similarity using character n-gram overlap.

        Both strings are lowercased, spaces are removed, and the Jaccard
        index of their character trigram sets is returned.

        Note: For production, use proper embedding-based similarity.

        Args:
            response: Agent response text.
            expected: Reference answer.

        Returns:
            Jaccard similarity of the two trigram sets, or 0.0 when either
            string is empty after removing spaces.
        """
        def get_ngrams(text: str, n: int = 3) -> set:
            compact = text.lower().replace(" ", "")
            if len(compact) < n:
                # Too short to hold a full n-gram. Use the whole string as a
                # single gram so short identical answers such as "5" vs "5"
                # compare as equal instead of scoring 0.
                return {compact} if compact else set()
            return {compact[i:i + n] for i in range(len(compact) - n + 1)}

        response_ngrams = get_ngrams(response)
        expected_ngrams = get_ngrams(expected)

        if not expected_ngrams or not response_ngrams:
            return 0.0

        shared = response_ngrams & expected_ngrams
        combined = response_ngrams | expected_ngrams
        return len(shared) / len(combined)

    @staticmethod
    def tool_usage_score(tools_used: List[str], required_tools: List[str]) -> float:
        """
        Score based on correct tool usage.

        Tool names are compared case-insensitively as sets. The score is the
        F1 of precision (share of used tools that were required) and recall
        (share of required tools that were used).

        Args:
            tools_used: Tool names the agent reported using.
            required_tools: Tool names the test requires.

        Returns:
            1.0 when no tools are required, 0.0 when no required tool was
            used, otherwise the F1 score.
        """
        if not required_tools:
            return 1.0

        used = {tool.lower() for tool in tools_used}
        required = {tool.lower() for tool in required_tools}

        correct = used & required
        if not correct:
            return 0.0

        precision = len(correct) / len(used)
        recall = len(correct) / len(required)
        return 2 * (precision * recall) / (precision + recall)

    @staticmethod
    def response_length_penalty(response: str, min_len: int = 10, max_len: int = 1000) -> float:
        """
        Penalize responses that are too short or too long.

        Length is measured in whitespace-separated words.

        Args:
            response: Agent response text.
            min_len: Word count below which the score is scaled down.
            max_len: Word count above which the score is scaled down.

        Returns:
            ``length / min_len`` when too short, ``max_len / length`` when
            too long, otherwise 1.0.
        """
        length = len(response.split())

        if length < min_len:
            return length / min_len
        if length > max_len:
            return max_len / length
        return 1.0


class AgentBenchmark:
    """
    Comprehensive agent benchmarking framework.

    Features:
    - Multiple evaluation metrics
    - Category-based testing
    - Performance tracking
    - Comparative analysis

    Attributes:
        name: Label printed in the run banner and included in summaries.
        pass_threshold: Minimum overall score for a test to count as passed.
        tests: Registered test cases.
        results: Results of the most recent ``run_benchmark`` call.
        metrics: Metric calculator used to score responses.
    """

    # Contribution of each metric to the overall score (sums to 1.0).
    _METRIC_WEIGHTS = {
        "keyword_match": 0.25,
        "exact_match": 0.15,
        "token_f1": 0.20,
        "semantic": 0.15,
        "tool_usage": 0.15,
        "length_quality": 0.10,
    }

    def __init__(self, name: str = "AgentBenchmark", pass_threshold: float = 0.6):
        self.name = name
        self.pass_threshold = pass_threshold
        self.tests: List["BenchmarkTest"] = []
        self.results: List["TestResult"] = []
        self.metrics = MetricCalculator()

    def add_test(self, test: "BenchmarkTest") -> None:
        """Add a benchmark test."""
        self.tests.append(test)

    def add_tests_from_dict(self, tests_data: List[Dict]) -> None:
        """
        Add multiple tests from dictionary format.

        Each dictionary must contain ``id``, ``question`` and
        ``expected_answer``. Optional keys are ``category`` (default
        "factual"), ``keywords``, ``required_tools``, ``difficulty``
        (default "medium") and ``metadata``.

        Raises:
            KeyError: If a required key is missing.
            ValueError: If ``category`` is not a valid ``TestCategory`` value.
        """
        for data in tests_data:
            test = BenchmarkTest(
                id=data["id"],
                question=data["question"],
                expected_answer=data["expected_answer"],
                category=TestCategory(data.get("category", "factual")),
                keywords=data.get("keywords", []),
                required_tools=data.get("required_tools", []),
                difficulty=data.get("difficulty", "medium"),
                # Copy so later edits to the source dict do not leak into the test.
                metadata=dict(data.get("metadata", {})),
            )
            self.add_test(test)

    def evaluate_response(self, test: "BenchmarkTest", response: str,
                          tools_used: List[str], latency_ms: float) -> "TestResult":
        """
        Evaluate a single response against a test.

        Args:
            test: Test case the response answers.
            response: Text returned by the agent.
            tools_used: Tool names the agent reported using.
            latency_ms: Time the agent took, in milliseconds.

        Returns:
            A ``TestResult`` holding one score per metric plus the weighted
            ``"overall"`` score; ``passed`` is True when the overall score
            is at least ``pass_threshold``.
        """
        scores = {
            "keyword_match": self.metrics.keyword_match(response, test.keywords),
            "exact_match": self.metrics.exact_match(response, test.expected_answer),
            "token_f1": self.metrics.token_f1(response, test.expected_answer),
            "semantic": self.metrics.semantic_similarity(response, test.expected_answer),
            "tool_usage": self.metrics.tool_usage_score(tools_used, test.required_tools),
            "length_quality": self.metrics.response_length_penalty(response),
        }

        # Weighted overall score, computed before "overall" is added to the dict.
        overall = sum(scores[k] * self._METRIC_WEIGHTS[k] for k in scores)
        scores["overall"] = overall

        return TestResult(
            test_id=test.id,
            agent_response=response,
            expected_answer=test.expected_answer,
            scores=scores,
            latency_ms=latency_ms,
            tools_used=tools_used,
            passed=overall >= self.pass_threshold,
        )

    def run_benchmark(self, agent_fn: Callable[[str], Tuple[str, List[str]]],
                      categories: Optional[List["TestCategory"]] = None) -> Dict[str, Any]:
        """
        Run benchmark against an agent.

        Any exception raised while calling the agent or scoring its response
        is recorded as a failed result with an overall score of 0.0; the run
        continues with the next test. Previous results are discarded.

        Args:
            agent_fn: Function that takes question and returns (response, tools_used)
            categories: Filter to specific categories (None = all)

        Returns:
            Benchmark results summary
        """
        tests = self.tests
        if categories:
            tests = [t for t in tests if t.category in categories]

        print(f"\n{'='*60}")
        print(f"RUNNING BENCHMARK: {self.name}")
        print(f"{'='*60}")
        print(f"Tests: {len(tests)}")

        self.results = []

        for i, test in enumerate(tests, 1):
            print(f"\n[{i}/{len(tests)}] {test.id}: {test.question[:50]}...")

            start_time = time.perf_counter()
            try:
                response, tools_used = agent_fn(test.question)
                latency_ms = (time.perf_counter() - start_time) * 1000

                result = self.evaluate_response(test, response, tools_used, latency_ms)

            except Exception as e:
                # A misbehaving agent must not abort the whole benchmark.
                latency_ms = (time.perf_counter() - start_time) * 1000
                result = TestResult(
                    test_id=test.id,
                    agent_response="",
                    expected_answer=test.expected_answer,
                    scores={"overall": 0.0},
                    latency_ms=latency_ms,
                    tools_used=[],
                    passed=False,
                    error=str(e),
                )

            self.results.append(result)
            status = "PASS" if result.passed else "FAIL"
            print(f"  Status: {status} (score: {result.scores.get('overall', 0):.2%}, latency: {result.latency_ms:.1f}ms)")

        return self.get_summary()

    def get_summary(self) -> Dict[str, Any]:
        """
        Generate benchmark summary.

        Returns:
            ``{"error": "No results"}`` when nothing has been run, otherwise
            a dict with pass/fail counts, overall-score statistics, latency
            statistics and the mean overall score per category.
        """
        if not self.results:
            return {"error": "No results"}

        passed = sum(1 for r in self.results if r.passed)
        overall_scores = [r.scores.get("overall", 0) for r in self.results]
        latencies = [r.latency_ms for r in self.results]

        # Category breakdown. Results are matched to tests by id rather than
        # by position, because a category-filtered run produces fewer
        # results than there are registered tests.
        category_of = {test.id: test.category.value for test in self.tests}
        by_category = defaultdict(list)
        for result in self.results:
            category = category_of.get(result.test_id, "unknown")
            by_category[category].append(result.scores.get("overall", 0))

        category_scores = {
            cat: statistics.mean(scores)
            for cat, scores in by_category.items()
        }

        return {
            "benchmark": self.name,
            "total_tests": len(self.results),
            "passed": passed,
            "failed": len(self.results) - passed,
            "pass_rate": passed / len(self.results),
            "scores": {
                "mean": statistics.mean(overall_scores),
                "median": statistics.median(overall_scores),
                # stdev needs at least two data points.
                "std": statistics.stdev(overall_scores) if len(overall_scores) > 1 else 0,
                "min": min(overall_scores),
                "max": max(overall_scores),
            },
            "latency_ms": {
                "mean": statistics.mean(latencies),
                "median": statistics.median(latencies),
                # Value at index int(n * 0.95) of the sorted latencies.
                "p95": sorted(latencies)[int(len(latencies) * 0.95)] if latencies else 0,
            },
            "by_category": category_scores,
        }


# Create benchmark with test cases
print("=" * 60)
print("AGENT BENCHMARK FRAMEWORK SOLUTION")
print("=" * 60)

# A test passes when its weighted overall score is at least 0.5.
benchmark = AgentBenchmark(name="AI Agent Evaluation", pass_threshold=0.5)

# Add test cases
# Each dict uses the keys read by AgentBenchmark.add_tests_from_dict;
# omitted optional keys (e.g. "required_tools") fall back to its defaults.
test_cases = [
    {
        "id": "FACT001",
        "question": "What is the capital of France?",
        "expected_answer": "The capital of France is Paris.",
        "category": "factual",
        "keywords": ["Paris", "capital", "France"],
        "difficulty": "easy",
    },
    {
        "id": "CALC001",
        "question": "What is 25 multiplied by 4?",
        "expected_answer": "25 multiplied by 4 equals 100.",
        "category": "calculation",
        "keywords": ["100"],
        "required_tools": ["calculator"],
        "difficulty": "easy",
    },
    {
        "id": "REASON001",
        "question": "If it takes 5 machines 5 minutes to make 5 widgets, how long would it take 100 machines to make 100 widgets?",
        "expected_answer": "It would take 5 minutes. Each machine makes 1 widget in 5 minutes, so 100 machines make 100 widgets in 5 minutes.",
        "category": "reasoning",
        "keywords": ["5 minutes", "each machine"],
        "difficulty": "medium",
    },
    {
        "id": "TOOL001",
        "question": "Search for information about machine learning and summarize the key concepts.",
        "expected_answer": "Machine learning is a subset of AI that enables systems to learn from data. Key concepts include supervised learning, unsupervised learning, and neural networks.",
        "category": "tool_use",
        "keywords": ["machine learning", "AI", "learning", "data"],
        "required_tools": ["search"],
        "difficulty": "medium",
    },
]

# Register all four cases on the benchmark instance.
benchmark.add_tests_from_dict(test_cases)

# Simulate agent responses
def simulated_agent(question: str) -> Tuple[str, List[str]]:
    """Return a canned (response, tools_used) pair chosen by keywords in the question.

    Rules are checked in order and the first match wins; questions matching
    no rule get a generic "not enough information" reply with no tools.
    """
    lowered = question.lower()

    # Geography question.
    if "capital" in lowered:
        return "Paris is the capital of France.", []

    # Arithmetic question: answered with the calculator tool.
    if "multipl" in lowered or "25" in question:
        return "The result of 25 multiplied by 4 is 100.", ["calculator"]

    # Machines-and-widgets reasoning puzzle.
    if "machine" in lowered and "widget" in lowered:
        return "It takes 5 minutes. Each machine independently produces one widget in 5 minutes.", []

    # Research request: answered with the search tool.
    if "search" in lowered:
        return (
            "Machine learning is an AI technology that allows systems to learn patterns from data. Key concepts include supervised learning, unsupervised learning, and deep neural networks.",
            ["search"],
        )

    return "I don't have enough information to answer this question.", []


# Run benchmark
# run_benchmark prints per-test progress and returns the summary dict.
results = benchmark.run_benchmark(simulated_agent)

# Headline numbers from the summary.
print(f"\n--- Benchmark Summary ---")
print(f"Pass rate: {results['pass_rate']:.1%}")
print(f"Mean score: {results['scores']['mean']:.2%}")
print(f"Mean latency: {results['latency_ms']['mean']:.1f}ms")
# Mean overall score for each category that had at least one result.
print(f"\nBy category:")
for cat, score in results['by_category'].items():
    print(f"  {cat}: {score:.2%}")

---

## Exercise 2 Solution: Comparative Benchmarking

**Task**: Build a framework for comparing multiple agents on the same benchmark.

In [None]:
class ComparativeBenchmark:
    """
    Framework for comparing multiple agents.

    Features:
    - Run same tests across agents
    - Statistical comparison
    - Ranking and visualization

    Attributes:
        benchmark: Benchmark every agent is evaluated on.
        agent_results: Agent name -> summary dict returned by the benchmark.
    """

    def __init__(self, benchmark: "AgentBenchmark"):
        self.benchmark = benchmark
        self.agent_results: Dict[str, Dict] = {}

    def evaluate_agent(self, name: str,
                       agent_fn: Callable[[str], Tuple[str, List[str]]]) -> Dict:
        """
        Evaluate a single agent.

        Args:
            name: Label the agent's results are stored under; evaluating the
                same name again overwrites the earlier results.
            agent_fn: Function that takes a question and returns
                (response, tools_used).

        Returns:
            The summary dict produced by the benchmark run.
        """
        print(f"\n>>> Evaluating: {name}")
        results = self.benchmark.run_benchmark(agent_fn)
        self.agent_results[name] = results
        return results

    def _scored_results(self) -> Dict[str, Dict]:
        """Return only the summaries that contain scores.

        A run over an empty benchmark yields an error dict without a
        "scores" key; such entries cannot be ranked and are skipped.
        """
        return {
            name: summary
            for name, summary in self.agent_results.items()
            if "scores" in summary
        }

    def compare(self) -> Dict[str, Any]:
        """
        Compare all evaluated agents.

        Returns:
            ``{"error": "No agents evaluated"}`` when no agent has a scored
            summary, otherwise a dict with the ranking (best mean score
            first), the best agent, the score spread and per-category
            winners.
        """
        scored = self._scored_results()
        if not scored:
            return {"error": "No agents evaluated"}

        # Rank by mean score, highest first. The sort is stable, so agents
        # with equal scores keep their evaluation order.
        rankings = sorted(
            scored.items(),
            key=lambda item: item[1]["scores"]["mean"],
            reverse=True,
        )
        best_mean = rankings[0][1]["scores"]["mean"]
        worst_mean = rankings[-1][1]["scores"]["mean"]

        return {
            "agents_compared": len(scored),
            "rankings": [
                {
                    "rank": position,
                    "agent": name,
                    "mean_score": summary["scores"]["mean"],
                    "pass_rate": summary["pass_rate"],
                    "mean_latency_ms": summary["latency_ms"]["mean"],
                }
                for position, (name, summary) in enumerate(rankings, 1)
            ],
            "best_agent": rankings[0][0],
            "score_spread": {
                "best": best_mean,
                "worst": worst_mean,
                "difference": best_mean - worst_mean,
            },
            "category_winners": self._get_category_winners(),
        }

    def _get_category_winners(self) -> Dict[str, str]:
        """
        Find best agent for each category.

        An agent with no score for a category counts as 0 there. On a tie
        the agent evaluated first wins.
        """
        scored = self._scored_results()

        categories = set()
        for summary in scored.values():
            categories.update(summary.get("by_category", {}).keys())

        def category_score(name: str, category: str) -> float:
            return scored[name].get("by_category", {}).get(category, 0)

        # max() returns the first maximal item, i.e. the earliest-evaluated
        # agent among those tied for the top score.
        return {
            category: max(
                scored,
                key=lambda name, category=category: category_score(name, category),
            )
            for category in categories
        }

    def generate_report(self) -> str:
        """
        Generate comparison report.

        Returns:
            A multi-line text report, or the message "No agents evaluated"
            when there is nothing to compare.
        """
        comparison = self.compare()
        if "error" in comparison:
            # Nothing to rank; return the message instead of failing on the
            # missing ranking keys.
            return comparison["error"]

        lines = [
            "=" * 60,
            "AGENT COMPARISON REPORT",
            "=" * 60,
            f"\nAgents compared: {comparison['agents_compared']}",
            f"Best overall: {comparison['best_agent']}",
            f"\n--- Rankings ---",
        ]

        for entry in comparison["rankings"]:
            lines.append(
                f"{entry['rank']}. {entry['agent']}: "
                f"score={entry['mean_score']:.2%}, "
                f"pass={entry['pass_rate']:.0%}, "
                f"latency={entry['mean_latency_ms']:.1f}ms"
            )

        lines.append(f"\n--- Category Winners ---")
        for cat, winner in comparison["category_winners"].items():
            lines.append(f"  {cat}: {winner}")

        return "\n".join(lines)


# Define different agent variants
def agent_v1(question: str) -> Tuple[str, List[str]]:
    """Baseline agent: short canned answers selected by keyword."""
    time.sleep(0.01)  # Stand-in for processing time

    lowered = question.lower()

    if "capital" in lowered:
        return "Paris is the capital of France.", []

    if "25" in question:
        return "100", ["calculator"]

    if "widget" in lowered:
        return "5 minutes", []

    if "search" in lowered:
        return "Machine learning is AI that learns from data.", ["search"]

    # No rule matched.
    return "Unknown", []

def agent_v2(question: str) -> Tuple[str, List[str]]:
    """More thorough agent: longer canned answers, slightly higher delay."""
    time.sleep(0.015)  # Simulated cost of the more detailed answers

    lowered = question.lower()

    # (rule matched, response, tools used) in priority order; first hit wins.
    rules = [
        (
            "capital" in lowered,
            "The capital of France is Paris, known as the City of Light.",
            [],
        ),
        (
            "25" in question,
            "25 multiplied by 4 equals 100. I used the calculator to verify.",
            ["calculator"],
        ),
        (
            "widget" in lowered,
            "It takes 5 minutes. Each machine produces one widget in 5 minutes, so 100 machines can make 100 widgets simultaneously in 5 minutes.",
            [],
        ),
        (
            "search" in lowered,
            "Machine learning is a subset of AI that enables systems to learn from data. Key concepts include supervised learning, unsupervised learning, and neural networks for deep learning.",
            ["search"],
        ),
    ]

    for matched, answer, tools in rules:
        if matched:
            return answer, tools

    return "I don't have information about that.", []

def agent_v3(question: str) -> Tuple[str, List[str]]:
    """Low-latency agent whose answers are terse, tool-free and sometimes wrong."""
    time.sleep(0.005)  # Shortest simulated delay of the three agents

    lowered = question.lower()

    if "capital" in lowered:
        # Right answer, but only a single word.
        return "Paris", []

    if "25" in question:
        # Right number, but the calculator tool is not reported.
        return "100", []

    if "widget" in lowered:
        # Deliberately incorrect answer.
        return "100 minutes", []

    if "search" in lowered:
        # Vague answer and the search tool is not reported.
        return "ML is about data.", []

    return "?", []


# Run comparative benchmark
print("=" * 60)
print("COMPARATIVE BENCHMARKING SOLUTION")
print("=" * 60)

# Reset benchmark
# A fresh AgentBenchmark is built from the same test_cases list used earlier.
benchmark = AgentBenchmark(name="Comparison Test", pass_threshold=0.5)
benchmark.add_tests_from_dict(test_cases)

comparator = ComparativeBenchmark(benchmark)

# Evaluate agents
comparator.evaluate_agent("Agent-v1-Basic", agent_v1)

# Reset benchmark for next agent
# (run_benchmark also clears benchmark.results itself before each run.)
benchmark.results = []
comparator.evaluate_agent("Agent-v2-Improved", agent_v2)

benchmark.results = []
comparator.evaluate_agent("Agent-v3-Fast", agent_v3)

# Generate report
print(comparator.generate_report())

---

## Exercise 3 Solution: Stress Testing

**Task**: Build a stress testing framework for agent performance under load.

In [None]:
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading


@dataclass
class StressTestResult:
    """Aggregated outcome of one load-test run.

    Stores the raw counts, elapsed time, per-request latencies and error
    messages; rates and latency statistics are derived on access.
    """
    total_requests: int
    successful: int
    failed: int
    total_time_s: float
    latencies_ms: List[float]
    errors: List[str]

    @property
    def success_rate(self) -> float:
        """Share of requests that succeeded; 0 when no requests were made."""
        if not self.total_requests:
            return 0
        return self.successful / self.total_requests

    @property
    def throughput(self) -> float:
        """Successful requests per second; 0 when the elapsed time is zero."""
        if not self.total_time_s:
            return 0
        return self.successful / self.total_time_s

    @property
    def avg_latency(self) -> float:
        """Mean latency in milliseconds; 0 when no latencies were recorded."""
        if not self.latencies_ms:
            return 0
        return statistics.mean(self.latencies_ms)

    @property
    def p99_latency(self) -> float:
        """Latency at index int(n * 0.99) of the sorted samples; 0 when empty."""
        ordered = sorted(self.latencies_ms)
        if not ordered:
            return 0
        position = int(len(ordered) * 0.99)
        return ordered[position]


class AgentStressTest:
    """
    Stress testing framework for agents.

    Features:
    - Concurrent request simulation
    - Throughput measurement
    - Latency profiling
    - Error tracking

    Attributes:
        agent_fn: Function that takes a question and returns
            (response, tools_used).
        lock: Lock held while per-request outcomes are aggregated.
    """

    def __init__(self, agent_fn: Callable[[str], Tuple[str, List[str]]]):
        self.agent_fn = agent_fn
        self.lock = threading.Lock()

    def _run_single_request(self, question: str) -> Tuple[bool, float, Optional[str]]:
        """
        Run a single request.

        Returns:
            ``(success, latency_ms, error)``. ``error`` is None on success.
            On failure it is the exception message, or the exception class
            name when the message is empty, so a failure always carries a
            non-empty description.
        """
        start = time.perf_counter()
        try:
            # Unpacking stays inside the try so a malformed return value is
            # counted as a failed request.
            _response, _tools = self.agent_fn(question)
            latency = (time.perf_counter() - start) * 1000
            return True, latency, None
        except Exception as e:
            latency = (time.perf_counter() - start) * 1000
            return False, latency, str(e) or type(e).__name__

    def run_load_test(self, questions: List[str],
                      concurrent_users: int = 10,
                      requests_per_user: int = 5) -> "StressTestResult":
        """
        Run load test with concurrent users.

        Args:
            questions: Pool of questions to sample from
            concurrent_users: Number of concurrent simulated users
            requests_per_user: Requests each user makes

        Returns:
            Aggregated counts, latencies and up to ten error messages.
        """
        import random

        total_requests = concurrent_users * requests_per_user
        print(f"\nRunning load test: {concurrent_users} users x {requests_per_user} requests = {total_requests} total")

        latencies = []
        errors = []
        successful = 0

        # Pick the questions up front so sampling is not part of the timing.
        tasks = [random.choice(questions) for _ in range(total_requests)]

        start_time = time.perf_counter()

        with ThreadPoolExecutor(max_workers=concurrent_users) as executor:
            futures = [executor.submit(self._run_single_request, q) for q in tasks]

            for future in as_completed(futures):
                success, latency, error = future.result()
                with self.lock:
                    latencies.append(latency)
                    if success:
                        successful += 1
                    else:
                        # Record every failure, even one with no message.
                        errors.append(error or "unknown error")

        total_time = time.perf_counter() - start_time

        return StressTestResult(
            total_requests=total_requests,
            successful=successful,
            failed=total_requests - successful,
            total_time_s=total_time,
            latencies_ms=latencies,
            errors=errors[:10],  # Keep first 10 errors
        )

    def run_ramp_test(self, questions: List[str],
                      user_counts: Optional[List[int]] = None) -> List["StressTestResult"]:
        """
        Run tests with increasing user counts.

        Args:
            questions: Pool of questions to sample from.
            user_counts: Concurrency levels to test, in order. Defaults to
                ``[1, 5, 10, 20]``.

        Returns:
            One result per entry of ``user_counts``, in the same order. Each
            level issues three requests per simulated user.
        """
        if user_counts is None:
            # Built per call so no list object is shared between calls.
            user_counts = [1, 5, 10, 20]

        results = []

        for users in user_counts:
            result = self.run_load_test(questions, concurrent_users=users, requests_per_user=3)
            results.append(result)
            print(f"  {users} users: {result.throughput:.1f} req/s, {result.avg_latency:.1f}ms avg latency")

        return results


# Run stress test
print("=" * 60)
print("STRESS TESTING SOLUTION")
print("=" * 60)

# Pool of questions the load generator samples from at random.
test_questions = [
    "What is the capital of France?",
    "Calculate 100 * 5",
    "Explain machine learning",
    "What is 2 + 2?",
]

stress_tester = AgentStressTest(agent_v2)

# Single load test
print("\n--- Single Load Test ---")
result = stress_tester.run_load_test(test_questions, concurrent_users=5, requests_per_user=4)

print(f"\nResults:")
print(f"  Success rate: {result.success_rate:.1%}")
print(f"  Throughput: {result.throughput:.1f} requests/second")
print(f"  Avg latency: {result.avg_latency:.1f}ms")
print(f"  P99 latency: {result.p99_latency:.1f}ms")

# Ramp test
print("\n--- Ramp Test ---")
# Single source of truth for the concurrency levels, so the analysis below
# cannot drift out of sync with the levels that were actually run.
ramp_user_counts = [1, 2, 5, 10]
ramp_results = stress_tester.run_ramp_test(test_questions, user_counts=ramp_user_counts)

print("\n--- Scaling Analysis ---")
for users, result in zip(ramp_user_counts, ramp_results):
    print(f"{users} users: throughput={result.throughput:.1f}/s, latency={result.avg_latency:.1f}ms, success={result.success_rate:.1%}")

---

## Challenge Solution: Automated Regression Testing

**Task**: Build a regression testing system that tracks agent performance over time.

In [None]:
@dataclass
class BenchmarkRun:
    """A single benchmark run with timestamp."""
    # Sequential identifier such as "run_001", assigned when the run is recorded.
    run_id: str
    # Time the run was recorded, as returned by time.time().
    timestamp: float
    # Version label of the agent that produced the results.
    version: str
    # Summary dict for the run; the tracker reads its "scores", "latency_ms"
    # and "pass_rate" entries.
    results: Dict[str, Any]


class RegressionTracker:
    """
    Tracks agent performance over time.

    Features:
    - Historical tracking
    - Regression detection
    - Trend analysis
    - Alerting

    Attributes:
        regression_threshold: Drop in mean score (absolute) beyond which a
            score regression is reported.
        history: Recorded runs, oldest first.
        baseline: Run that new runs are compared against; the first recorded
            run unless changed with ``set_baseline``.
    """

    def __init__(self, regression_threshold: float = 0.05):
        self.regression_threshold = regression_threshold
        self.history: List["BenchmarkRun"] = []
        self.baseline: Optional["BenchmarkRun"] = None

    def record_run(self, version: str, results: Dict[str, Any]) -> "BenchmarkRun":
        """
        Record a benchmark run.

        Args:
            version: Version label of the agent that was benchmarked.
            results: Summary dict for the run.

        Returns:
            The stored run, with a sequential ``run_id`` and the current time.
        """
        run = BenchmarkRun(
            run_id=f"run_{len(self.history) + 1:03d}",
            timestamp=time.time(),
            version=version,
            results=results,
        )

        self.history.append(run)

        # Set first run as baseline if none exists
        if self.baseline is None:
            self.baseline = run

        return run

    def set_baseline(self, run_id: str) -> None:
        """
        Set a specific run as baseline.

        Raises:
            ValueError: If no recorded run has the given ``run_id``.
        """
        for run in self.history:
            if run.run_id == run_id:
                self.baseline = run
                return
        raise ValueError(f"Run not found: {run_id}")

    def check_regression(self, current: "BenchmarkRun") -> Dict[str, Any]:
        """
        Check for regressions against baseline.

        A score regression is reported when the mean score dropped by more
        than ``regression_threshold`` (severity "high" beyond a 0.1 drop).
        A latency regression is reported when mean latency grew by more than
        20% of the baseline latency (severity "high" beyond 40%).

        Returns:
            ``{"status": "no_baseline"}`` when no baseline is set, otherwise
            a dict with the status ("ok" or "regression_detected"), both
            version labels, the raw changes and the list of regressions.
        """
        if not self.baseline:
            return {"status": "no_baseline"}

        baseline_score = self.baseline.results.get("scores", {}).get("mean", 0)
        current_score = current.results.get("scores", {}).get("mean", 0)

        baseline_latency = self.baseline.results.get("latency_ms", {}).get("mean", 0)
        current_latency = current.results.get("latency_ms", {}).get("mean", 0)

        score_change = current_score - baseline_score
        latency_change = current_latency - baseline_latency

        regressions = []

        if score_change < -self.regression_threshold:
            regressions.append({
                "metric": "score",
                "baseline": baseline_score,
                "current": current_score,
                "change": score_change,
                "severity": "high" if score_change < -0.1 else "medium",
            })

        latency_threshold = baseline_latency * 0.2  # 20% slowdown
        if latency_change > latency_threshold:
            regressions.append({
                "metric": "latency",
                "baseline": baseline_latency,
                "current": current_latency,
                "change": latency_change,
                "severity": "high" if latency_change > latency_threshold * 2 else "medium",
            })

        return {
            "status": "regression_detected" if regressions else "ok",
            "baseline_version": self.baseline.version,
            "current_version": current.version,
            "score_change": score_change,
            "latency_change": latency_change,
            "regressions": regressions,
        }

    def get_trend(self, metric: str = "score", window: int = 5) -> Dict[str, Any]:
        """
        Analyze trend over recent runs.

        Args:
            metric: One of "score", "latency" or "pass_rate".
            window: Number of most recent runs to look at.

        Returns:
            ``{"status": "insufficient_data"}`` with fewer than two recorded
            runs, ``{"error": ...}`` for an unknown metric, otherwise a dict
            with the direction ("improving", "degrading", "stable" or
            "unknown"), the values, the latest value and the average.
        """
        if len(self.history) < 2:
            return {"status": "insufficient_data"}

        recent = self.history[-window:]

        if metric == "score":
            values = [r.results.get("scores", {}).get("mean", 0) for r in recent]
        elif metric == "latency":
            values = [r.results.get("latency_ms", {}).get("mean", 0) for r in recent]
        elif metric == "pass_rate":
            values = [r.results.get("pass_rate", 0) for r in recent]
        else:
            return {"error": f"Unknown metric: {metric}"}

        # Simple linear trend: endpoint difference divided by the number of
        # values in the window, compared against a fixed +/-0.01 band.
        if len(values) >= 2:
            slope = (values[-1] - values[0]) / len(values)
            if slope > 0.01:
                direction = "improving"
            elif slope < -0.01:
                direction = "degrading"
            else:
                direction = "stable"
        else:
            # Reached when the window holds a single run (e.g. window=1).
            direction = "unknown"

        return {
            "metric": metric,
            "direction": direction,
            "values": values,
            "latest": values[-1] if values else None,
            "average": statistics.mean(values) if values else None,
        }

    def generate_report(self) -> str:
        """
        Generate regression tracking report.

        Returns:
            "No benchmark history" when nothing has been recorded, otherwise
            a multi-line report with the regression check of the latest run
            and the score trend.
        """
        if not self.history:
            return "No benchmark history"

        latest = self.history[-1]
        regression_check = self.check_regression(latest)
        score_trend = self.get_trend("score")

        if "latest" in score_trend:
            latest_score = score_trend["latest"]
            average_score = score_trend["average"]
        else:
            # With a single recorded run there is no trend yet; report that
            # run's own score instead of a misleading 0.00%.
            latest_score = latest.results.get("scores", {}).get("mean", 0)
            average_score = latest_score

        lines = [
            "=" * 60,
            "REGRESSION TRACKING REPORT",
            "=" * 60,
            f"\nTotal runs: {len(self.history)}",
            f"Latest version: {latest.version}",
            f"Baseline version: {self.baseline.version if self.baseline else 'None'}",
            f"\n--- Regression Check ---",
            f"Status: {regression_check['status'].upper()}",
        ]

        if regression_check.get("regressions"):
            lines.append("\nRegressions detected:")
            for reg in regression_check["regressions"]:
                lines.append(
                    f"  [{reg['severity'].upper()}] {reg['metric']}: "
                    f"{reg['baseline']:.3f} -> {reg['current']:.3f} ({reg['change']:+.3f})"
                )

        lines.extend([
            f"\n--- Trend Analysis ---",
            f"Score trend: {score_trend.get('direction', 'unknown')}",
            f"Latest score: {latest_score:.2%}",
            f"Average score: {average_score:.2%}",
        ])

        return "\n".join(lines)


# Demonstrate regression tracking
print("=" * 60)
print("REGRESSION TRACKING SOLUTION")
print("=" * 60)

# A mean-score drop of more than 0.05 against the baseline counts as a regression.
tracker = RegressionTracker(regression_threshold=0.05)

# Simulate multiple versions
# Each entry is (version label, hand-written summary dict) holding only the
# keys the tracker reads.
versions = [
    ("v1.0.0", {"scores": {"mean": 0.75}, "latency_ms": {"mean": 50}, "pass_rate": 0.8}),
    ("v1.1.0", {"scores": {"mean": 0.78}, "latency_ms": {"mean": 48}, "pass_rate": 0.82}),
    ("v1.2.0", {"scores": {"mean": 0.80}, "latency_ms": {"mean": 52}, "pass_rate": 0.85}),
    ("v1.3.0", {"scores": {"mean": 0.72}, "latency_ms": {"mean": 75}, "pass_rate": 0.78}),  # Regression!
    ("v1.3.1", {"scores": {"mean": 0.79}, "latency_ms": {"mean": 55}, "pass_rate": 0.83}),  # Fixed
]

print("\n--- Recording Runs ---")
# The first recorded run (v1.0.0) becomes the baseline, so every later
# version is checked against it.
for version, results in versions:
    run = tracker.record_run(version, results)
    regression = tracker.check_regression(run)
    status = "REGRESSION" if regression["status"] == "regression_detected" else "OK"
    print(f"{version}: score={results['scores']['mean']:.2f} - {status}")

# Generate report
print(tracker.generate_report())

---

## Key Takeaways

1. **Multiple Metrics**: Use diverse metrics (exact match, F1, semantic) for robust evaluation
2. **Comparative Testing**: Compare agents systematically on the same benchmark
3. **Stress Testing**: Understand performance under concurrent load
4. **Regression Tracking**: Monitor performance over time to catch degradations
5. **Category Analysis**: Different agents excel at different task types