# MCP Agent Evaluation Gym

This notebook evaluates three agent types (RandomAgent, RuleBasedAgent, HillClimberAgent)
across all loaded scenarios, trains the hill-climber, and compares traces against the
production corpus.

## 1. Setup

Import all modules, create the environment, and load scenarios.

In [None]:
from __future__ import annotations

import json
from pathlib import Path

from mcp_gym.env import make_env
from mcp_gym.agents import RandomAgent, RuleBasedAgent, HillClimberAgent
from mcp_gym.rewards import MultiDimensionalReward, RewardBreakdown
from mcp_gym.scenarios import load_all_scenarios
from mcp_gym.trace import TraceLogger, TraceAnalyzer, DEFAULT_CORPUS, EpisodeTrace
from mcp_gym.training import train_hill_climber, TrainingConfig, compute_baseline
from mcp_gym.types import EpisodeConfig

SCENARIO_DIR = Path("../scenarios")
SEED = 42

# Read every scenario definition found under SCENARIO_DIR, then list each one
# with its domain and number of phases.
scenarios = load_all_scenarios(SCENARIO_DIR)
print(f"Loaded {len(scenarios)} scenarios:")
for s in scenarios:
    phases = len(s.phases)
    plural = "" if phases == 1 else "s"
    print(f"  - {s.name} ({s.domain}, {phases} phase{plural})")

# A default environment is created here only to ask its action surface which
# tools are registered; at most the first eight are echoed to keep the line short.
env = make_env()
tools = env.surface.registered_actions()
preview = tools[:8]
overflow = "..." if len(tools) > 8 else ""
print(f"\nRegistered tools ({len(tools)}): {preview}{overflow}")

## 2. Run All Agents on All Scenarios

Execute each agent type against every scenario and collect rubric scores.

In [None]:
AGENT_NAMES = ["random", "rule-based", "hill-climber"]


def make_agent(name: str, tools: list[str], seed: int = SEED):
    """Build the agent that corresponds to the label ``name``.

    Args:
        name: Agent label; the recognised values are the entries of
            ``AGENT_NAMES``.
        tools: Tool identifiers handed to the agent constructor.
        seed: Seed forwarded to the agent constructor.

    Returns:
        A new ``RandomAgent``, ``RuleBasedAgent`` or ``HillClimberAgent``.

    Raises:
        ValueError: If ``name`` matches none of the known labels.
    """
    constructors = (
        ("random", RandomAgent),
        ("rule-based", RuleBasedAgent),
        ("hill-climber", HillClimberAgent),
    )
    for label, agent_cls in constructors:
        if name == label:
            return agent_cls(tools, seed=seed)
    raise ValueError(f"Unknown agent: {name}")


def run_episode(agent, scenario_def, seed=SEED):
    """Play a single episode of ``scenario_def`` with ``agent`` and score it.

    A fresh environment is created for the episode. Its step limit is the sum
    of the per-phase ``max_steps`` values (never less than 1) and its token
    budget is taken from the scenario.

    Args:
        agent: Object providing ``reset()`` and ``act(obs)``.
        scenario_def: Scenario definition supplying ``phases``,
            ``token_budget``, ``expected_sequence`` and ``forbidden_actions``.
        seed: Seed used for both the episode config and ``env.reset``.

    Returns:
        A tuple ``(total_reward, breakdown, call_history)``: the sum of the
        step rewards, the result of ``MultiDimensionalReward.compute`` and the
        environment's recorded call history.
    """
    step_limit = max(1, sum(phase.max_steps for phase in scenario_def.phases))
    env = make_env(
        config=EpisodeConfig(
            max_steps=step_limit,
            token_budget=scenario_def.token_budget,
            seed=seed,
        )
    )
    obs, _ = env.reset(seed=seed)
    agent.reset()

    # Step until the environment reports either termination or truncation.
    total_reward = 0.0
    done = False
    while not done:
        obs, reward, terminated, truncated, _ = env.step(agent.act(obs))
        total_reward += reward
        done = terminated or truncated

    scorer = MultiDimensionalReward()
    rubric_context = {
        "expected_sequence": scenario_def.expected_sequence,
        "forbidden_actions": scenario_def.forbidden_actions,
        "token_budget": scenario_def.token_budget,
        # NOTE(review): presumably env._token_budget is the budget still
        # remaining at episode end -- confirm against the env implementation.
        "tokens_used": scenario_def.token_budget - env._token_budget,
    }
    breakdown = scorer.compute(env._call_history, rubric_context)
    return total_reward, breakdown, env._call_history

In [None]:
# results[agent_name][scenario_name] = RewardBreakdown
results: dict[str, dict[str, RewardBreakdown]] = {}

for agent_name in AGENT_NAMES:
    per_scenario = {}
    results[agent_name] = per_scenario
    for scenario_def in scenarios:
        # Each (agent, scenario) pair gets a newly constructed agent; only the
        # rubric breakdown (second element of run_episode's result) is kept.
        outcome = run_episode(make_agent(agent_name, tools), scenario_def)
        per_scenario[scenario_def.name] = outcome[1]

total_runs = len(AGENT_NAMES) * len(scenarios)
print(f"Evaluated {len(AGENT_NAMES)} agents x {len(scenarios)} scenarios = {total_runs} runs")

## 3. Comparison Matrix

Total rubric score for each agent (columns) on each scenario (rows), followed by each agent's average across all scenarios.

In [None]:
# Print the scenario-by-agent matrix of total rubric scores, then one AVERAGE row.

# Compute column widths. `default` keeps this cell runnable when no scenarios
# were loaded: a bare max() over an empty sequence raises ValueError, and the
# average row below already treats an empty scenario list as valid.
name_w = max((len(s.name) for s in scenarios), default=len("Scenario"))
agent_w = 14

# Header: scenario label left-aligned, one right-aligned column per agent.
header = f"{'Scenario':<{name_w}}" + "".join(f"  {a:>{agent_w}}" for a in AGENT_NAMES)
print(header)
print("-" * len(header))

# Rows: one per scenario; per-agent totals are accumulated for the average row.
agent_totals = {a: 0.0 for a in AGENT_NAMES}
for s in scenarios:
    row = f"{s.name:<{name_w}}"
    for a in AGENT_NAMES:
        score = results[a][s.name].total
        agent_totals[a] += score
        row += f"  {score:>{agent_w}.4f}"
    print(row)

# Averages (reported as 0.0 when there are no scenarios to average over).
print("-" * len(header))
avg_row = f"{'AVERAGE':<{name_w}}"
n = len(scenarios)
for a in AGENT_NAMES:
    avg = agent_totals[a] / n if n > 0 else 0.0
    avg_row += f"  {avg:>{agent_w}.4f}"
print(avg_row)

### Per-Dimension Averages

Average score per reward dimension across all scenarios.

In [None]:
# Print, for every reward dimension, each agent's mean score over all scenarios.
# The column width `agent_w` is the one set in the comparison-matrix cell.
dims = ["judgment", "safety", "efficiency", "context_maintenance", "escalation"]
dim_w = 20
n_scenarios = len(scenarios)  # hoisted: constant for every (dimension, agent) pair

header = f"{'Dimension':<{dim_w}}"
for a in AGENT_NAMES:
    header += f"  {a:>{agent_w}}"
print(header)
print("-" * len(header))

for dim in dims:
    row = f"{dim:<{dim_w}}"
    for a in AGENT_NAMES:
        dim_total = sum(getattr(results[a][s.name], dim) for s in scenarios)
        # Same convention as the AVERAGE row of the comparison matrix: report
        # 0.0 instead of raising ZeroDivisionError when no scenarios are loaded.
        avg = dim_total / n_scenarios if n_scenarios > 0 else 0.0
        row += f"  {avg:>{agent_w}.4f}"
    print(row)

## 4. Training Demo

Train the hill-climber for 50 episodes and show before/after improvement.

In [None]:
# Train the hill-climber, then report the best reward, the weight change and
# how the episode rewards moved between the start and the end of training.
train_config = TrainingConfig(
    num_episodes=50,
    perturb_magnitude=0.3,
    seed=SEED,
    scenario_dir=str(SCENARIO_DIR),
)

train_result = train_hill_climber(train_config)

print(f"Training complete: {train_config.num_episodes} episodes")
print(f"Best reward:    {train_result.best_reward:+.4f} (episode {train_result.best_episode})")
print(f"Improvement:    {train_result.improvement:+.4f}")
print()
print("Initial weights:")
for k, v in sorted(train_result.initial_weights.items()):
    print(f"  {k}: {v:.4f}")
print()
print("Final weights:")
for k, v in sorted(train_result.final_weights.items()):
    print(f"  {k}: {v:.4f}")
print()

# Show reward progression (first 10, last 10)
rewards = train_result.episode_rewards
first_10 = rewards[:10]
last_10 = rewards[-10:]
# Divide by the actual window length and fall back to 0.0 for an empty reward
# list; the previous min(10, len(rewards)) denominator was 0 in that case and
# raised ZeroDivisionError.
mean_first = sum(first_10) / len(first_10) if len(first_10) > 0 else 0.0
mean_last = sum(last_10) / len(last_10) if len(last_10) > 0 else 0.0
print("Reward progression:")
print(f"  First 10: {[f'{r:+.3f}' for r in first_10]}")
print(f"  Last  10: {[f'{r:+.3f}' for r in last_10]}")
print(f"  Mean (first 10): {mean_first:+.4f}")
print(f"  Mean (last  10): {mean_last:+.4f}")

## 5. Trace Analysis

Compare agent traces against the DEFAULT_CORPUS of production patterns.

In [None]:
# Re-run every (agent, scenario) pair while logging each step, producing one
# EpisodeTrace per pair for comparison against DEFAULT_CORPUS.
analyzer = TraceAnalyzer(DEFAULT_CORPUS)
all_traces: list[EpisodeTrace] = []

for agent_name in AGENT_NAMES:
    for scenario_def in scenarios:
        # Build a fresh agent for every pair, exactly as the rubric evaluation
        # does. Reusing one instance across scenarios could carry state from
        # one episode into the next, so the traces would no longer describe the
        # runs that were scored.
        agent = make_agent(agent_name, tools)
        logger = TraceLogger()
        max_steps = max(1, sum(p.max_steps for p in scenario_def.phases))
        config = EpisodeConfig(
            max_steps=max_steps,
            token_budget=scenario_def.token_budget,
            seed=SEED,
        )
        env = make_env(config=config)
        obs, _ = env.reset(seed=SEED)
        agent.reset()

        total_reward = 0.0
        terminated = False
        truncated = False
        step_num = 0

        while not terminated and not truncated:
            action_str = agent.act(obs)
            obs, reward, terminated, truncated, info = env.step(action_str)
            total_reward += reward
            step_num += 1

            # Actions are JSON strings carrying "server" and "action" keys and
            # an optional "params" object; steps are numbered from 1.
            parsed = json.loads(action_str)
            logger.log_step(
                step=step_num,
                server=parsed["server"],
                action=parsed["action"],
                params=parsed.get("params", {}),
                success=info.get("success", False),
            )

        trace = logger.finish_episode(agent_name, scenario_def.name, total_reward)
        all_traces.append(trace)

print(f"Collected {len(all_traces)} traces ({len(AGENT_NAMES)} agents x {len(scenarios)} scenarios)")

In [None]:
# Match every collected trace against the corpus and tabulate the first rows.
comparison = analyzer.compare_traces(all_traces)

# Display results
agent_w = 14
# `default` avoids a ValueError from max() when there are no traces to compare;
# the summary cell already treats an empty comparison as a valid outcome.
scenario_w = max((len(r["scenario_name"]) for r in comparison), default=len("Scenario"))
pattern_w = 28
max_rows = 20  # rows shown before the "... and N more rows" line

header = f"{'Agent':<{agent_w}}  {'Scenario':<{scenario_w}}  {'Best Pattern':<{pattern_w}}  {'Score':>6}  {'Reward':>8}"
print(header)
print("-" * len(header))

# NOTE(review): calling these the "top" rows assumes compare_traces returns its
# results best-first -- confirm against TraceAnalyzer.
for r in comparison[:max_rows]:
    pattern_name = r["best_pattern"] or "(none)"
    print(
        f"{r['agent_name']:<{agent_w}}  "
        f"{r['scenario_name']:<{scenario_w}}  "
        f"{pattern_name:<{pattern_w}}  "
        f"{r['best_score']:>6.3f}  "
        f"{r['total_reward']:>+8.3f}"
    )

if len(comparison) > max_rows:
    print(f"... and {len(comparison) - max_rows} more rows")

## 6. Summary

Key findings from the evaluation run.

In [None]:
# Final report: best agent by average rubric score, training gain, the first
# corpus match and per-agent average safety.
print("=" * 60)
print("MCP Agent Evaluation Gym -- Summary")
print("=" * 60)
print()

# Best average agent. Averages fall back to 0.0 when no scenarios were loaded,
# matching the comparison matrix, instead of raising ZeroDivisionError.
n = len(scenarios)
avg_scores = {
    a: (sum(results[a][s.name].total for s in scenarios) / n if n > 0 else 0.0)
    for a in AGENT_NAMES
}

best_agent = max(avg_scores, key=avg_scores.get)
print(f"1. Best overall agent: {best_agent} (avg rubric: {avg_scores[best_agent]:.4f})")
for a in AGENT_NAMES:
    print(f"   - {a}: {avg_scores[a]:.4f}")

print()

# Training improvement
print(f"2. Hill-climber training ({train_config.num_episodes} episodes):")
print(f"   - Improvement: {train_result.improvement:+.4f}")
print(f"   - Best reward: {train_result.best_reward:+.4f} (episode {train_result.best_episode})")

print()

# Best trace match
# NOTE(review): treats comparison[0] as the best match, i.e. assumes
# compare_traces returns results best-first -- confirm.
if comparison:
    best_trace = comparison[0]
    print("3. Best corpus match:")
    print(f"   - Agent: {best_trace['agent_name']}")
    print(f"   - Scenario: {best_trace['scenario_name']}")
    print(f"   - Pattern: {best_trace['best_pattern']}")
    print(f"   - Similarity: {best_trace['best_score']:.4f}")

print()

# Safety analysis
print("4. Safety scores (avg across all scenarios):")
for a in AGENT_NAMES:
    safety_total = sum(results[a][s.name].safety for s in scenarios)
    avg_safety = safety_total / n if n > 0 else 0.0
    print(f"   - {a}: {avg_safety:.4f}")

print()
print("Evaluation complete.")