From 3bf623230fd9f61272e1982e61225f6ded7b5aed Mon Sep 17 00:00:00 2001 From: Chujun Tao Date: Sun, 5 Apr 2026 19:33:34 -0400 Subject: [PATCH 01/12] integrate skillsbench --- benchmarks/skillsbench/README.md | 163 ++++++++++ benchmarks/skillsbench/__init__.py | 1 + benchmarks/skillsbench/config.py | 16 + benchmarks/skillsbench/eval_infer.py | 280 ++++++++++++++++ benchmarks/skillsbench/run_infer.py | 467 +++++++++++++++++++++++++++ benchmarks/utils/report_costs.py | 4 +- pyproject.toml | 2 + 7 files changed, 932 insertions(+), 1 deletion(-) create mode 100644 benchmarks/skillsbench/README.md create mode 100644 benchmarks/skillsbench/__init__.py create mode 100644 benchmarks/skillsbench/config.py create mode 100644 benchmarks/skillsbench/eval_infer.py create mode 100644 benchmarks/skillsbench/run_infer.py diff --git a/benchmarks/skillsbench/README.md b/benchmarks/skillsbench/README.md new file mode 100644 index 000000000..60ff73652 --- /dev/null +++ b/benchmarks/skillsbench/README.md @@ -0,0 +1,163 @@ +# SkillsBench Evaluation + +This module provides integration with [SkillsBench](https://www.skillsbench.ai/), a benchmark for evaluating AI agents on real-world skill-based tasks. The integration uses [Harbor](https://harborframework.com) as the evaluation harness with the `openhands-sdk` agent. + +## Overview + +SkillsBench comprises tasks across 11 domains, evaluating the efficacy of Skills augmentation in LLM-based agents.Domains contain + +- Software engineering +- Office & white collar +- Natural science +- Media & content production +- Cybersecurity +- Finance +- Robotics +- Manufacturing +- Energy +- Mathematics +- Healthcare + +## Prerequisites + +1. **Install Harbor**: Harbor is the official harness for running SkillsBench. + + ```bash + pip install harbor + # or + uv pip install harbor + ``` + +2. **Docker**: Harbor requires Docker to be installed and running. + +3. **LLM API Key**: Configure your LLM provider credentials. 
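Before running anything, a quick sanity check of the prerequisites can save time. A minimal sketch, assuming Docker and the Harbor CLI are already on your `PATH` (`harbor --version` is the same check `skillsbench-infer` performs before starting):

```bash
docker info        # confirms the Docker daemon is reachable
harbor --version   # confirms the Harbor CLI is installed
```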
+ +## Usage + +### Running Inference + +Run the SkillsBench evaluation using the OpenHands SDK agent: + +```bash +# Run full evaluation +uv run skillsbench-infer .llm_config/claude.json + +# Run specific tasks +uv run skillsbench-infer .llm_config/claude.json --task-id benchflow/weighted-gdp-calc + +# Run tasks from a file +uv run skillsbench-infer .llm_config/claude.json --select tasks.txt + +# Limit the run to 5 tasks (useful for smoke tests) +uv run skillsbench-infer .llm_config/claude.json --n-limit 5 + +# Run with multiple workers +uv run skillsbench-infer .llm_config/claude.json --num-workers 4 +``` + +### LLM Configuration + +Create an LLM configuration file (e.g., `.llm_config/claude.json`): + +```json +{ + "model": "anthropic/claude-sonnet-4-20250514", + "api_key": "YOUR_API_KEY" +} +``` + +Or use a LiteLLM proxy: + +```json +{ + "model": "litellm_proxy/anthropic/claude-sonnet-4-20250514", + "base_url": "https://your-proxy.example.com", + "api_key": "YOUR_API_KEY" +} +``` + +### Evaluating Results + +After running inference, evaluate the results: + +```bash +uv run skillsbench-eval ./evaluation_outputs/.../output.jsonl +``` + +This generates a report file (`output.report.json`) with: +- Total/completed/resolved instance counts +- Success rate +- Aggregate metrics (cost, tokens) + +## Output Format + +### Inference Output (`output.jsonl`) + +Each line contains: + +```json +{ + "instance_id": "benchflow/task-name", + "test_result": { + "trial_name": "...", + "trial_uri": "...", + "rewards": {"reward": 1.0}, + "passed": true + }, + "instruction": "", + "error": null, + "history": [], + "metrics": { + "total_prompt_tokens": 5000, + "total_completion_tokens": 1000, + "total_cost_usd": 0.05 + } +} +``` + +### Evaluation Report (`output.report.json`) + +```json +{ + "total_instances": 100, + "completed_instances": 95, + "resolved_instances": 80, + "unresolved_instances": 15, + "error_instances": 5, + "aggregate_metrics": { + "total_cost_usd": 5.25, + "total_prompt_tokens": 500000, + "total_completion_tokens": 100000 + } +} +``` + +## Architecture + +The integration follows the Harbor agent adapter pattern: + +1. **Harbor Harness**: Manages task containers and lifecycle +2. **OpenHands SDK Agent**: Runs inside containers to solve tasks +3. 
**ATIF Trajectories**: Results stored in Agent Trajectory Interchange Format + +```text +┌──────────────────────────────────────────────────┐ +│ Harbor Harness │ +│ ┌────────────────────────────────────────────┐ │ +│ │ Task Container │ │ +│ │ ┌──────────────────────────────────────┐ │ │ +│ │ │ OpenHands SDK Agent │ │ │ +│ │ │ - Terminal tool │ │ │ +│ │ │ - File editor tool │ │ │ +│ │ │ - Task tracker tool │ │ │ +│ │ └──────────────────────────────────────┘ │ │ +│ └────────────────────────────────────────────┘ │ +└──────────────────────────────────────────────────┘ +``` + +## References + +- [SkillsBench](https://www.skillsbench.ai/) - The benchmark +- [Harbor](https://harborframework.com) - The evaluation harness +- [OpenHands SDK](https://github.com/OpenHands/software-agent-sdk) - The agent SDK +- [ATIF Specification](https://github.com/laude-institute/harbor/blob/main/docs/rfcs/0001-trajectory-format.md) - Trajectory format diff --git a/benchmarks/skillsbench/__init__.py b/benchmarks/skillsbench/__init__.py new file mode 100644 index 000000000..c02f7bafb --- /dev/null +++ b/benchmarks/skillsbench/__init__.py @@ -0,0 +1 @@ +# SkillsBench evaluation benchmark diff --git a/benchmarks/skillsbench/config.py b/benchmarks/skillsbench/config.py new file mode 100644 index 000000000..8b55a92b0 --- /dev/null +++ b/benchmarks/skillsbench/config.py @@ -0,0 +1,16 @@ +"""SkillsBench configuration defaults.""" + +# Default inference settings (only include values actually used by argparse) +INFER_DEFAULTS = { + "dataset": "benchflow/skillsbench", + "output_dir": "./evaluation_outputs", + "num_workers": 1, +} + +# Harbor configuration defaults +HARBOR_DEFAULTS = { + # Harbor executable + "harbor_executable": "harbor", + # Default agent name for openhands-sdk + "agent_name": "openhands-sdk", +} diff --git a/benchmarks/skillsbench/eval_infer.py b/benchmarks/skillsbench/eval_infer.py new file mode 100644 index 000000000..f55a91736 --- /dev/null +++ b/benchmarks/skillsbench/eval_infer.py @@ -0,0 +1,280 @@ +#!/usr/bin/env python3 +"""SkillsBench Evaluation Script. + +This script processes SkillsBench output and generates evaluation reports. +It reads the output.jsonl produced by run_infer, aggregates results, +and writes a summary report. + +Usage: + uv run skillsbench-eval +""" + +import argparse +import json +import sys +from pathlib import Path + +from benchmarks.utils.laminar import LaminarService +from benchmarks.utils.report_costs import generate_cost_report +from openhands.sdk import get_logger + + +logger = get_logger(__name__) + + +def process_skillsbench_results( + input_file: str, + output_file: str, +) -> dict: + """Process SkillsBench output.jsonl and generate evaluation report. + + SkillsBench format (from harbor conversion): + { + "instance_id": "task_id", + "test_result": { + "trajectory_path": "...", + "total_steps": N, + "final_metrics": {...}, + "passed": true/false # May be populated by harbor grading + }, + "instruction": "...", + "history": [...] + } + + Report format (similar to SWE-Bench): + { + "total_instances": N, + "submitted_instances": N, + "completed_instances": N, + "incomplete_instances": N, + "resolved_instances": N, + "unresolved_instances": N, + "error_instances": N, + "submitted_ids": [...], + "completed_ids": [...], + "incomplete_ids": [...], + "resolved_ids": [...], + "unresolved_ids": [...] 
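        "error_ids": [...],
        "aggregate_metrics": {...}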
+ } + """ + logger.info(f"Processing {input_file} to generate report: {output_file}") + + # Use sets for O(1) lookup and automatic deduplication + # Convert to sorted lists only when building final report + completed_ids: set[str] = set() + resolved_ids: set[str] = set() + unresolved_ids: set[str] = set() + incomplete_ids: set[str] = set() + error_ids: set[str] = set() + + # Aggregate metrics + total_cost_usd = 0.0 + total_prompt_tokens = 0 + total_completion_tokens = 0 + + with open(input_file) as infile: + for line_num, line in enumerate(infile, 1): + try: + line = line.strip() + if not line: + continue + + data = json.loads(line) + + # Extract required fields + instance_id = data.get("instance_id") + if not instance_id: + logger.warning(f"Line {line_num}: Missing instance_id") + continue + + if instance_id in completed_ids: + logger.warning( + f"Line {line_num}: Duplicate instance_id {instance_id}" + ) + continue + + # Check for errors + error = data.get("error") + if error: + error_ids.add(instance_id) + incomplete_ids.add(instance_id) + continue + + # Extract test result + test_result = data.get("test_result", {}) + + # Check if task passed (harbor may include this) + passed = test_result.get("passed") + # If not explicitly set, we mark as completed but ungraded + is_resolved = passed is True + + # Add to completed instances + completed_ids.add(instance_id) + + if is_resolved: + resolved_ids.add(instance_id) + else: + unresolved_ids.add(instance_id) + + # Aggregate metrics + # Use explicit None check to handle zero values correctly + # (using `or` would incorrectly fallback when value is 0) + metrics = data.get("metrics", {}) + final_metrics = test_result.get("final_metrics", {}) + + cost = metrics.get("total_cost_usd") + if cost is None: + cost = final_metrics.get("total_cost_usd", 0.0) + + prompt_tokens = metrics.get("total_prompt_tokens") + if prompt_tokens is None: + prompt_tokens = final_metrics.get("total_prompt_tokens", 0) + + completion_tokens = metrics.get("total_completion_tokens") + if completion_tokens is None: + completion_tokens = final_metrics.get("total_completion_tokens", 0) + + # After the None checks above, these values are guaranteed to be non-None + total_cost_usd += cost + total_prompt_tokens += prompt_tokens + total_completion_tokens += completion_tokens + + except json.JSONDecodeError as e: + logger.error(f"Line {line_num}: Invalid JSON - {e}") + except Exception as e: + logger.error(f"Line {line_num}: Unexpected error - {e}") + + # Check for separate error file (used in manual workflows where errors + # are extracted to a separate file for analysis/retry) + error_path = Path(input_file).with_name(f"{Path(input_file).stem}_errors.jsonl") + if error_path.exists(): + with open(error_path) as error_file: + for line_num, line in enumerate(error_file, 1): + try: + line = line.strip() + if not line: + continue + + data = json.loads(line) + instance_id = data.get("instance_id") + if not instance_id: + continue + if instance_id in completed_ids or instance_id in incomplete_ids: + continue + + incomplete_ids.add(instance_id) + error_ids.add(instance_id) + except (json.JSONDecodeError, Exception) as e: + logger.error(f"Error file line {line_num}: {e}") + + submitted_ids = completed_ids | incomplete_ids + + # Generate report - convert sets to sorted lists for consistent output + report = { + "total_instances": len(submitted_ids), + "submitted_instances": len(submitted_ids), + "completed_instances": len(completed_ids), + "incomplete_instances": len(incomplete_ids), + 
"resolved_instances": len(resolved_ids), + "unresolved_instances": len(unresolved_ids), + "error_instances": len(error_ids), + "submitted_ids": sorted(submitted_ids), + "completed_ids": sorted(completed_ids), + "incomplete_ids": sorted(incomplete_ids), + "resolved_ids": sorted(resolved_ids), + "unresolved_ids": sorted(unresolved_ids), + "error_ids": sorted(error_ids), + # Aggregate metrics + "aggregate_metrics": { + "total_cost_usd": total_cost_usd, + "total_prompt_tokens": total_prompt_tokens, + "total_completion_tokens": total_completion_tokens, + }, + } + + # Write report + with open(output_file, "w") as outfile: + json.dump(report, outfile, indent=4) + + logger.info("Report generated successfully:") + logger.info(f" Total instances: {report['total_instances']}") + logger.info(f" Completed instances: {report['completed_instances']}") + logger.info(f" Resolved instances: {report['resolved_instances']}") + logger.info(f" Unresolved instances: {report['unresolved_instances']}") + logger.info(f" Error instances: {report['error_instances']}") + if report["completed_instances"] > 0: + logger.info( + f" Success rate: " + f"{report['resolved_instances'] / report['completed_instances'] * 100:.1f}%" + ) + logger.info(f" Total cost: ${total_cost_usd:.4f}") + + return report + + +def main() -> None: + """Main entry point for the script.""" + parser = argparse.ArgumentParser( + description="Process SkillsBench output and generate evaluation report", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + uv run skillsbench-eval output.jsonl + uv run skillsbench-eval /path/to/output.jsonl + """, + ) + + parser.add_argument("input_file", help="Path to the SkillsBench output.jsonl file") + parser.add_argument( + "--output-file", + help="Output file for report (default: input_file with .report.json extension)", + ) + + args = parser.parse_args() + + # Validate input file + input_file = Path(args.input_file) + if not input_file.exists(): + logger.error(f"Input file does not exist: {input_file}") + sys.exit(1) + + if not input_file.suffix == ".jsonl": + logger.warning(f"Input file does not have .jsonl extension: {input_file}") + + # Determine output file + if args.output_file: + output_file = Path(args.output_file) + else: + output_file = input_file.with_suffix(".report.json") + + logger.info(f"Input file: {input_file}") + logger.info(f"Output file: {output_file}") + + try: + # Process results and generate report + process_skillsbench_results( + str(input_file), + str(output_file), + ) + except Exception as e: + logger.error(f"Script failed: {e}") + sys.exit(1) + + # Non-critical telemetry and reporting - wrap in try/except so expensive + # multi-hour evaluations don't fail at the telemetry step after completing + try: + LaminarService.get().update_evaluation_scores(str(input_file), str(output_file)) + except Exception as e: + logger.warning(f"Laminar update failed (non-critical): {e}") + + try: + generate_cost_report(str(input_file)) + except Exception as e: + logger.warning(f"Cost report generation failed (non-critical): {e}") + + logger.info("Script completed successfully!") + print(json.dumps({"report_json": str(output_file)})) + + +if __name__ == "__main__": + main() diff --git a/benchmarks/skillsbench/run_infer.py b/benchmarks/skillsbench/run_infer.py new file mode 100644 index 000000000..a8afa7281 --- /dev/null +++ b/benchmarks/skillsbench/run_infer.py @@ -0,0 +1,467 @@ +"""SkillsBench inference script using Harbor with openhands-sdk agent. 
+ +This script runs SkillsBench evaluation using Harbor as the harness +and openhands-sdk as the agent. Results are saved in a format compatible +with the standard evaluation pipeline. + +Usage: + uv run skillsbench-infer --dataset benchflow/skillsbench +""" + +import argparse +import json +import os +import subprocess +import sys +from datetime import datetime, timezone +from pathlib import Path + +from pydantic import SecretStr + +from benchmarks.skillsbench.config import HARBOR_DEFAULTS, INFER_DEFAULTS +from benchmarks.utils.evaluation_utils import construct_eval_output_dir +from benchmarks.utils.report_costs import generate_cost_report +from openhands.sdk import LLM, get_logger + + +logger = get_logger(__name__) + +# Output filename for results +OUTPUT_FILENAME = "output.jsonl" + + +def check_harbor_installed() -> bool: + """Check if harbor CLI is installed and available.""" + harbor_exe = HARBOR_DEFAULTS["harbor_executable"] + try: + result = subprocess.run( + [harbor_exe, "--version"], + capture_output=True, + text=True, + timeout=10, + ) + return result.returncode == 0 + except (FileNotFoundError, subprocess.TimeoutExpired): + return False + + +def run_harbor_evaluation( + llm: LLM, + dataset: str, + output_dir: str, + num_workers: int = 1, + task_ids: list[str] | None = None, + n_limit: int | None = None, +) -> Path: + """Run harbor evaluation with openhands-sdk agent. + + Args: + llm: LLM configuration for the agent. + dataset: Harbor dataset name (e.g., benchflow/skillsbench). + output_dir: Directory to store output files. + num_workers: Number of parallel workers. + task_ids: Optional list of specific task IDs to run. + n_limit: Optional maximum number of dataset tasks to run. + + Returns: + Path to the harbor output directory. + """ + harbor_output_dir = Path(output_dir) / "harbor_output" + harbor_output_dir.mkdir(parents=True, exist_ok=True) + harbor_exe = HARBOR_DEFAULTS["harbor_executable"] + + # Build harbor command using harbor CLI flags. + # Use absolute path for --jobs-dir to avoid CWD-relative path issues. + cmd = [ + harbor_exe, + "run", + "-d", + dataset, + "-a", + HARBOR_DEFAULTS["agent_name"], + "-m", + llm.model, + "--jobs-dir", + str(harbor_output_dir.resolve()), + "--n-concurrent", + str(num_workers), + ] + + # Pass LLM credentials as agent environment variables + if llm.api_key: + api_key = ( + llm.api_key.get_secret_value() + if isinstance(llm.api_key, SecretStr) + else llm.api_key + ) + cmd.extend(["--ae", f"LLM_API_KEY={api_key}"]) + if llm.base_url: + cmd.extend(["--ae", f"LLM_BASE_URL={llm.base_url}"]) + + # Add specific task names if provided + if task_ids: + for task_id in task_ids: + cmd.extend(["--include-task-name", task_id]) + + if n_limit is not None: + cmd.extend(["--n-tasks", str(n_limit)]) + + logger.info(f"Running harbor command: {' '.join(cmd)}") + logger.info(f"Output directory: {harbor_output_dir}") + + # harbor's openhands-sdk agent reads LLM credentials from the host process + # environment (os.environ), not from --ae flags which go to the sandbox. 
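    # Credentials are therefore supplied twice: the --ae flags above target the
    # task sandbox, while the host env below reaches the agent process itself.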
+ env = os.environ.copy() + if llm.api_key: + api_key = ( + llm.api_key.get_secret_value() + if isinstance(llm.api_key, SecretStr) + else llm.api_key + ) + env["LLM_API_KEY"] = api_key + if llm.base_url: + env["LLM_BASE_URL"] = llm.base_url + + try: + result = subprocess.run( + cmd, + capture_output=True, + text=True, + env=env, + ) + + if result.returncode != 0: + logger.error(f"Harbor command failed with code {result.returncode}") + logger.error(f"stdout: {result.stdout}") + logger.error(f"stderr: {result.stderr}") + raise RuntimeError(f"Harbor evaluation failed: {result.stderr}") + + logger.info("Harbor evaluation completed successfully") + logger.info(f"stdout: {result.stdout}") + + except FileNotFoundError: + raise RuntimeError( + "Harbor CLI not found. Please install harbor: pip install harbor" + ) + + return harbor_output_dir + + +def _find_job_dir(harbor_output_dir: Path) -> Path: + """Find the harbor job directory (timestamp-named) inside the output dir.""" + # Harbor creates a timestamp-named subdirectory (e.g., 2026-03-07__16-08-47) + # containing result.json and trial subdirectories + candidates = [ + d + for d in harbor_output_dir.iterdir() + if d.is_dir() and (d / "result.json").exists() + ] + if not candidates: + raise RuntimeError( + f"No harbor job directory found in {harbor_output_dir}. " + f"Expected a timestamp-named directory containing result.json." + ) + # Use the most recent job directory if multiple exist + return sorted(candidates)[-1] + + +def convert_harbor_to_eval_output( + harbor_output_dir: Path, + eval_output_path: Path, +) -> None: + """Convert harbor output to evaluation output format. + + Harbor stores trial results in a job directory structured as: + harbor_output/TIMESTAMP/TRIAL_NAME/result.json + + Each trial's result.json contains task_name, verifier_result, agent_result, + timing info, and exception details. + + Args: + harbor_output_dir: Path to harbor output directory. + eval_output_path: Path to write the converted output.jsonl. + """ + logger.info(f"Converting harbor output from {harbor_output_dir}") + + job_dir = _find_job_dir(harbor_output_dir) + logger.info(f"Using harbor job directory: {job_dir}") + + # Find trial result files (each trial dir has a result.json) + result_files = list(job_dir.glob("*/result.json")) + # Exclude the job-level result.json + result_files = [f for f in result_files if f.parent != job_dir] + + if not result_files: + raise RuntimeError( + f"No trial result files found in {job_dir}. " + f"Expected result.json files in trial subdirectories." 
+ ) + + logger.info(f"Found {len(result_files)} trial results in {job_dir}") + + results: list[dict] = [] + errors: list[dict] = [] + + for result_file in result_files: + try: + with open(result_file) as f: + trial = json.load(f) + + instance_id = trial.get("task_name", result_file.parent.name) + + # Check for exceptions + if trial.get("exception_info"): + errors.append( + { + "instance_id": instance_id, + "error": str(trial["exception_info"]), + "test_result": {}, + } + ) + continue + + # Extract verifier results + verifier_result = trial.get("verifier_result", {}) + rewards = verifier_result.get("rewards", {}) + passed = rewards.get("reward", 0.0) > 0 + + # Extract agent metrics + agent_result = trial.get("agent_result", {}) + + eval_entry = { + "instance_id": instance_id, + "test_result": { + "trial_name": trial.get("trial_name"), + "trial_uri": trial.get("trial_uri"), + "rewards": rewards, + "passed": passed, + }, + "instruction": "", + "error": None, + "history": [], + "metrics": { + "total_prompt_tokens": agent_result.get("n_input_tokens") or 0, + "total_completion_tokens": ( + agent_result.get("n_output_tokens") or 0 + ), + "total_cost_usd": agent_result.get("cost_usd") or 0.0, + }, + } + results.append(eval_entry) + logger.info( + f"Processed trial {instance_id}: reward={rewards.get('reward', 'N/A')}" + ) + + except (json.JSONDecodeError, OSError) as e: + logger.error(f"Failed to process result file {result_file}: {e}") + errors.append( + { + "instance_id": result_file.parent.name, + "error": str(e), + "test_result": {}, + } + ) + + if not results and not errors: + raise RuntimeError(f"No trials processed from {harbor_output_dir}") + + if not results: + logger.warning( + f"All {len(errors)} trials failed in {harbor_output_dir}; " + "writing error entries for downstream reporting" + ) + + # Write results to output.jsonl + with open(eval_output_path, "w") as f: + for entry in results: + f.write(json.dumps(entry) + "\n") + for entry in errors: + f.write(json.dumps(entry) + "\n") + + logger.info( + f"Wrote {len(results)} successful + {len(errors)} failed entries " + f"to {eval_output_path}" + ) + + +def load_task_ids_from_file(filepath: str) -> list[str]: + """Load task IDs from a text file (one per line).""" + task_ids = [] + with open(filepath) as f: + for line in f: + line = line.strip() + if line and not line.startswith("#"): + task_ids.append(line) + return task_ids + + +def main() -> None: + """Main entry point for skillsbench inference.""" + parser = argparse.ArgumentParser( + description="Run SkillsBench evaluation with openhands-sdk via Harbor", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Run full skillsbench evaluation + uv run skillsbench-infer .llm_config/claude.json + + # Run specific tasks + uv run skillsbench-infer .llm_config/claude.json --select tasks.txt + + # Run with custom dataset version + uv run skillsbench-infer .llm_config/claude.json --dataset benchflow/skillsbench@1.0 + """, + ) + + parser.add_argument( + "llm_config_path", + type=str, + help="Path to JSON LLM configuration file", + ) + parser.add_argument( + "--dataset", + type=str, + default=INFER_DEFAULTS["dataset"], + help="Harbor dataset name (e.g., benchflow/skillsbench)", + ) + parser.add_argument( + "--output-dir", + type=str, + default=INFER_DEFAULTS["output_dir"], + help="Base output directory for evaluation results", + ) + parser.add_argument( + "--num-workers", + type=int, + default=INFER_DEFAULTS["num_workers"], + help="Number of parallel workers", + ) + 
parser.add_argument( + "--n-limit", + type=int, + help="Maximum number of dataset tasks to run after Harbor filtering", + ) + parser.add_argument( + "--select", + type=str, + help="Path to text file containing task IDs to run (one per line)", + ) + parser.add_argument( + "--task-id", + type=str, + action="append", + help="Specific task ID to run (can be specified multiple times)", + ) + parser.add_argument( + "--note", + type=str, + help="Optional note for the evaluation run", + ) + parser.add_argument( + "--skip-harbor", + action="store_true", + help="Skip running harbor and only convert existing results", + ) + + args = parser.parse_args() + + # Validate LLM config + if not os.path.isfile(args.llm_config_path): + logger.error(f"LLM config file does not exist: {args.llm_config_path}") + sys.exit(1) + + with open(args.llm_config_path) as f: + llm_config = f.read() + llm = LLM.model_validate_json(llm_config) + logger.info(f"Using LLM: {llm.model}") + + # Check harbor installation + if not args.skip_harbor and not check_harbor_installed(): + logger.error( + "Harbor CLI is not installed. Please install it:\n" + " pip install harbor\n" + " # or\n" + " uv pip install harbor" + ) + sys.exit(1) + + # Construct output directory + dataset_description = args.dataset.replace("/", "__").replace("@", "-") + structured_output_dir = construct_eval_output_dir( + base_dir=args.output_dir, + dataset_name=dataset_description, + model_name=llm.model, + max_iterations=100, # Not directly used but required for path construction + eval_note=args.note, + ) + + logger.info(f"Output directory: {structured_output_dir}") + os.makedirs(structured_output_dir, exist_ok=True) + + # Save metadata + metadata = { + "llm": llm.model_dump_json(), + "dataset": args.dataset, + "timestamp": datetime.now(timezone.utc).isoformat(), + "harbor_agent": HARBOR_DEFAULTS["agent_name"], + "note": args.note, + } + metadata_path = Path(structured_output_dir) / "metadata.json" + with open(metadata_path, "w") as f: + json.dump(metadata, f, indent=2) + + # Collect task IDs if specified + task_ids: list[str] | None = None + if args.select: + loaded_ids = load_task_ids_from_file(args.select) + task_ids = loaded_ids + logger.info(f"Loaded {len(loaded_ids)} task IDs from {args.select}") + elif args.task_id: + task_ids = list(args.task_id) + logger.info(f"Running {len(task_ids)} specified task IDs") + + output_path = Path(structured_output_dir) / OUTPUT_FILENAME + + if not args.skip_harbor: + # Run harbor evaluation + try: + harbor_output_dir = run_harbor_evaluation( + llm=llm, + dataset=args.dataset, + output_dir=structured_output_dir, + num_workers=args.num_workers, + task_ids=task_ids, + n_limit=args.n_limit, + ) + + # Convert harbor output to standard format + convert_harbor_to_eval_output( + harbor_output_dir=harbor_output_dir, + eval_output_path=output_path, + ) + + except Exception as e: + logger.error(f"Evaluation failed: {e}") + sys.exit(1) + else: + # Skip harbor, just convert existing results + harbor_output_dir = Path(structured_output_dir) / "harbor_output" + if harbor_output_dir.exists(): + convert_harbor_to_eval_output( + harbor_output_dir=harbor_output_dir, + eval_output_path=output_path, + ) + else: + logger.error(f"No harbor output found at {harbor_output_dir}") + sys.exit(1) + + # Generate cost report + if output_path.exists(): + generate_cost_report(str(output_path)) + + logger.info("SkillsBench inference completed!") + print(json.dumps({"output_json": str(output_path)})) + + +if __name__ == "__main__": + main() diff --git 
a/benchmarks/utils/report_costs.py b/benchmarks/utils/report_costs.py index 8f38909f3..7a21a3831 100755 --- a/benchmarks/utils/report_costs.py +++ b/benchmarks/utils/report_costs.py @@ -48,7 +48,9 @@ def extract_accumulated_cost(jsonl_data: List[Optional[Dict]]) -> float: if entry is None: continue metrics = entry.get("metrics") or {} - accumulated_cost = metrics.get("accumulated_cost") + accumulated_cost = metrics.get("accumulated_cost") or metrics.get( + "total_cost_usd" + ) if accumulated_cost is not None: total_cost += float(accumulated_cost) diff --git a/pyproject.toml b/pyproject.toml index 69f7b2df5..79c38c8c0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -70,6 +70,8 @@ swebenchmultilingual-eval = "benchmarks.swebenchmultilingual.eval_infer:main" swefficiency-infer = "benchmarks.swefficiency.run_infer:main" terminalbench-infer = "benchmarks.terminalbench.run_infer:main" terminalbench-eval = "benchmarks.terminalbench.eval_infer:main" +skillsbench-infer = "benchmarks.skillsbench.run_infer:main" +skillsbench-eval = "benchmarks.skillsbench.eval_infer:main" [build-system] requires = ["setuptools>=61.0", "wheel"] From 2bb3266da6039ac8dc97f36c8ec22240da48f18c Mon Sep 17 00:00:00 2001 From: Chujun Tao Date: Sun, 5 Apr 2026 19:44:33 -0400 Subject: [PATCH 02/12] add skillsbench tests --- tests/test_skillsbench_eval_infer.py | 125 +++++++++++++++ tests/test_skillsbench_run_infer.py | 221 +++++++++++++++++++++++++++ 2 files changed, 346 insertions(+) create mode 100644 tests/test_skillsbench_eval_infer.py create mode 100644 tests/test_skillsbench_run_infer.py diff --git a/tests/test_skillsbench_eval_infer.py b/tests/test_skillsbench_eval_infer.py new file mode 100644 index 000000000..56d54f27a --- /dev/null +++ b/tests/test_skillsbench_eval_infer.py @@ -0,0 +1,125 @@ +"""Tests for SkillsBench eval_infer module.""" + +import json +from pathlib import Path + +from benchmarks.skillsbench.eval_infer import process_skillsbench_results + + +class TestProcessSkillsbenchResults: + """Tests for the process_skillsbench_results function.""" + + def test_empty_input(self, tmp_path: Path) -> None: + """Test processing empty input file.""" + input_file = tmp_path / "empty.jsonl" + output_file = tmp_path / "empty.report.json" + input_file.write_text("") + + result = process_skillsbench_results(str(input_file), str(output_file)) + + assert result["total_instances"] == 0 + assert result["completed_instances"] == 0 + assert result["resolved_instances"] == 0 + + def test_resolved_instance(self, tmp_path: Path) -> None: + """Test processing a resolved (passed=True) instance.""" + input_file = tmp_path / "resolved.jsonl" + output_file = tmp_path / "resolved.report.json" + + entry = { + "instance_id": "benchflow/weighted-gdp-calc", + "test_result": {"passed": True, "rewards": {"reward": 1.0}}, + "error": None, + } + input_file.write_text(json.dumps(entry) + "\n") + + result = process_skillsbench_results(str(input_file), str(output_file)) + + assert result["resolved_instances"] == 1 + assert result["unresolved_instances"] == 0 + assert "benchflow/weighted-gdp-calc" in result["resolved_ids"] + + def test_unresolved_instance(self, tmp_path: Path) -> None: + """Test processing an unresolved (passed=False) instance.""" + input_file = tmp_path / "unresolved.jsonl" + output_file = tmp_path / "unresolved.report.json" + + entry = { + "instance_id": "benchflow/task-1", + "test_result": {"passed": False, "rewards": {"reward": 0.0}}, + "error": None, + } + input_file.write_text(json.dumps(entry) + "\n") + + result = 
process_skillsbench_results(str(input_file), str(output_file)) + + assert result["resolved_instances"] == 0 + assert result["unresolved_instances"] == 1 + + def test_instance_with_error(self, tmp_path: Path) -> None: + """Test processing an instance that errored.""" + input_file = tmp_path / "error.jsonl" + output_file = tmp_path / "error.report.json" + + entry = { + "instance_id": "benchflow/error-task", + "test_result": {}, + "error": "ValueError: LLM_API_KEY environment variable must be set", + } + input_file.write_text(json.dumps(entry) + "\n") + + result = process_skillsbench_results(str(input_file), str(output_file)) + + assert result["error_instances"] == 1 + assert result["incomplete_instances"] == 1 + assert result["completed_instances"] == 0 + assert "benchflow/error-task" in result["error_ids"] + + def test_multiple_instances(self, tmp_path: Path) -> None: + """Test processing multiple instances with mixed results.""" + input_file = tmp_path / "multi.jsonl" + output_file = tmp_path / "multi.report.json" + + entries = [ + { + "instance_id": "benchflow/task-1", + "test_result": {"passed": True}, + "error": None, + }, + { + "instance_id": "benchflow/task-2", + "test_result": {"passed": False}, + "error": None, + }, + {"instance_id": "benchflow/task-3", "test_result": {}, "error": "Timeout"}, + ] + input_file.write_text("\n".join(json.dumps(e) for e in entries) + "\n") + + result = process_skillsbench_results(str(input_file), str(output_file)) + + assert result["total_instances"] == 3 + assert result["completed_instances"] == 2 + assert result["resolved_instances"] == 1 + assert result["unresolved_instances"] == 1 + assert result["error_instances"] == 1 + + def test_report_file_written(self, tmp_path: Path) -> None: + """Test that report file is written correctly.""" + input_file = tmp_path / "input.jsonl" + output_file = tmp_path / "output.report.json" + + entry = { + "instance_id": "benchflow/task-1", + "test_result": {"passed": True}, + "error": None, + } + input_file.write_text(json.dumps(entry) + "\n") + + process_skillsbench_results(str(input_file), str(output_file)) + + assert output_file.exists() + with open(output_file) as f: + report = json.load(f) + assert "total_instances" in report + assert "resolved_ids" in report + assert "aggregate_metrics" in report diff --git a/tests/test_skillsbench_run_infer.py b/tests/test_skillsbench_run_infer.py new file mode 100644 index 000000000..5f8452cb3 --- /dev/null +++ b/tests/test_skillsbench_run_infer.py @@ -0,0 +1,221 @@ +"""Tests for SkillsBench run_infer module.""" + +import json +from pathlib import Path + +import pytest + +from benchmarks.skillsbench.config import INFER_DEFAULTS +from benchmarks.skillsbench.run_infer import ( + convert_harbor_to_eval_output, + run_harbor_evaluation, +) +from openhands.sdk import LLM + + +class TestRunHarborEvaluation: + """Tests for building Harbor invocation arguments.""" + + def test_default_dataset_matches_harbor_registry(self) -> None: + """Test that the default dataset name matches Harbor's published registry.""" + assert INFER_DEFAULTS["dataset"] == "benchflow/skillsbench" + + def test_run_harbor_evaluation_passes_filters_and_limits( + self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch + ) -> None: + """Test Harbor command includes task filters and n-limit.""" + captured: dict[str, list[str]] = {} + + def fake_run(cmd: list[str], capture_output: bool, text: bool, env: dict): + captured["cmd"] = cmd + return type( + "Completed", + (), + {"returncode": 0, "stdout": "ok", "stderr": ""}, + 
)() + + monkeypatch.setattr("benchmarks.skillsbench.run_infer.subprocess.run", fake_run) + + harbor_output_dir = run_harbor_evaluation( + llm=LLM( + model="litellm_proxy/test-model", + api_key="test-key", + base_url="https://proxy.example.com", + ), + dataset=INFER_DEFAULTS["dataset"], + output_dir=str(tmp_path), + num_workers=2, + task_ids=["benchflow/task-a", "benchflow/task-b"], + n_limit=3, + ) + + expected_output_dir = tmp_path / "harbor_output" + assert harbor_output_dir == expected_output_dir + + cmd = captured["cmd"] + assert cmd[:8] == [ + "harbor", + "run", + "-d", + "benchflow/skillsbench", + "-a", + "openhands-sdk", + "-m", + "litellm_proxy/test-model", + ] + assert "--jobs-dir" in cmd + assert str(expected_output_dir.resolve()) in cmd + assert cmd.count("--include-task-name") == 2 + assert "benchflow/task-a" in cmd + assert "benchflow/task-b" in cmd + assert cmd[cmd.index("--n-concurrent") + 1] == "2" + assert cmd[cmd.index("--n-tasks") + 1] == "3" + + def test_llm_credentials_passed_via_env( + self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch + ) -> None: + """Test that LLM credentials are passed via subprocess env, not --ae flags.""" + captured: dict = {} + + def fake_run(cmd: list[str], capture_output: bool, text: bool, env: dict): + captured["cmd"] = cmd + captured["env"] = env + return type( + "Completed", + (), + {"returncode": 0, "stdout": "ok", "stderr": ""}, + )() + + monkeypatch.setattr("benchmarks.skillsbench.run_infer.subprocess.run", fake_run) + + run_harbor_evaluation( + llm=LLM( + model="test-model", + api_key="my-secret-key", + base_url="https://my-proxy.example.com", + ), + dataset=INFER_DEFAULTS["dataset"], + output_dir=str(tmp_path), + ) + + assert captured["env"]["LLM_API_KEY"] == "my-secret-key" + assert captured["env"]["LLM_BASE_URL"] == "https://my-proxy.example.com" + + +class TestConvertHarborToEvalOutput: + """Tests for convert_harbor_to_eval_output function.""" + + def _create_harbor_structure( + self, tmp_path: Path, trials: list[tuple[str, dict]] + ) -> Path: + """Create a mock Harbor output structure.""" + harbor_dir = tmp_path / "harbor_output" + job_dir = harbor_dir / "2026-01-01__00-00-00" + job_dir.mkdir(parents=True) + (job_dir / "result.json").write_text(json.dumps({"id": "test-job"})) + + for trial_name, trial_result in trials: + trial_dir = job_dir / trial_name + trial_dir.mkdir() + (trial_dir / "result.json").write_text(json.dumps(trial_result)) + + return harbor_dir + + def test_successful_trial_parsing(self, tmp_path: Path) -> None: + """Test successful parsing of harbor trial result.""" + trial_result = { + "task_name": "benchflow/weighted-gdp-calc", + "trial_name": "weighted-gdp-calc__abc123", + "trial_uri": "file:///path/to/trial", + "agent_result": { + "n_input_tokens": 1000, + "n_output_tokens": 200, + "cost_usd": 0.05, + }, + "verifier_result": {"rewards": {"reward": 1.0}}, + "exception_info": None, + } + + harbor_dir = self._create_harbor_structure( + tmp_path, [("weighted-gdp-calc__abc123", trial_result)] + ) + output_file = tmp_path / "output.jsonl" + + convert_harbor_to_eval_output(harbor_dir, output_file) + + assert output_file.exists() + with open(output_file) as f: + entries = [json.loads(line) for line in f] + + assert len(entries) == 1 + assert entries[0]["instance_id"] == "benchflow/weighted-gdp-calc" + assert entries[0]["test_result"]["passed"] is True + assert entries[0]["metrics"]["total_cost_usd"] == 0.05 + + def test_failed_trial(self, tmp_path: Path) -> None: + """Test parsing of a trial with reward 0.""" + 
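        # agent_result values are None here; the converter should coerce them to 0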
trial_result = { + "task_name": "benchflow/task-1", + "trial_name": "task-1__xyz", + "agent_result": { + "n_input_tokens": None, + "n_output_tokens": None, + "cost_usd": None, + }, + "verifier_result": {"rewards": {"reward": 0.0}}, + "exception_info": None, + } + + harbor_dir = self._create_harbor_structure( + tmp_path, [("task-1__xyz", trial_result)] + ) + output_file = tmp_path / "output.jsonl" + convert_harbor_to_eval_output(harbor_dir, output_file) + + with open(output_file) as f: + entries = [json.loads(line) for line in f] + + assert entries[0]["test_result"]["passed"] is False + assert entries[0]["metrics"]["total_cost_usd"] == 0.0 + + def test_trial_with_exception(self, tmp_path: Path) -> None: + """Test that exception trials are written as error entries.""" + trial_result = { + "task_name": "benchflow/error-task", + "trial_name": "error-task__err", + "agent_result": {}, + "verifier_result": {}, + "exception_info": {"type": "ValueError", "message": "LLM_API_KEY not set"}, + } + + harbor_dir = self._create_harbor_structure( + tmp_path, [("error-task__err", trial_result)] + ) + output_file = tmp_path / "output.jsonl" + convert_harbor_to_eval_output(harbor_dir, output_file) + + with open(output_file) as f: + entries = [json.loads(line) for line in f] + + assert len(entries) == 1 + assert entries[0]["instance_id"] == "benchflow/error-task" + assert entries[0]["error"] is not None + assert entries[0]["test_result"] == {} + + def test_missing_job_directory(self, tmp_path: Path) -> None: + """Test handling when no job directory exists.""" + harbor_dir = tmp_path / "harbor_output" + harbor_dir.mkdir() + + with pytest.raises(RuntimeError, match="No harbor job directory found"): + convert_harbor_to_eval_output(harbor_dir, tmp_path / "output.jsonl") + + def test_empty_job_directory(self, tmp_path: Path) -> None: + """Test handling of harbor job dir with no trial subdirs.""" + harbor_dir = tmp_path / "harbor_output" + job_dir = harbor_dir / "2026-01-01__00-00-00" + job_dir.mkdir(parents=True) + (job_dir / "result.json").write_text(json.dumps({"id": "test"})) + + with pytest.raises(RuntimeError, match="No trial result files found"): + convert_harbor_to_eval_output(harbor_dir, tmp_path / "output.jsonl") From 4d31c87c8fb0c7ff8341bc37debe06851ad2a67b Mon Sep 17 00:00:00 2001 From: Chujun Tao Date: Wed, 22 Apr 2026 20:47:30 -0400 Subject: [PATCH 03/12] feat(skillsbench): migrate harness from Harbor to benchflow 0.3.0 Switch the SkillsBench evaluation harness from Harbor/openhands-sdk to benchflow 0.3.0 with the native openhands ACP agent. 
Key changes: - Replace Harbor-specific logic with benchflow CLI invocation (`bench eval create -f config.yaml` / legacy `benchflow job --config`) - Add sparse-checkout task download to avoid cloning the full skillsbench repo - Fix metrics extraction: benchflow 0.3.0 result.json omits cost/token fields; now reads from agent/trajectory.json (harbor-format) or parses agent/openhands.txt stdout (ACP agent) - Fix timestamp detection with regex (_TIMESTAMP_RE) to correctly identify benchflow 0.3.0 job dirs (YYYY-MM-DD__HH-MM-SS) vs plain task dirs - Fix openhands install failure on Ubuntu 24.04 (PEP 668) by injecting PIP_BREAK_SYSTEM_PACKAGES=1 into agent_env - Add provider-specific env var injection for direct Gemini/Anthropic models - Update README and config to reflect benchflow harness Co-Authored-By: Claude Sonnet 4.6 --- .gitignore | 1 + benchmarks/skillsbench/README.md | 42 +- benchmarks/skillsbench/config.py | 11 +- benchmarks/skillsbench/run_infer.py | 655 ++++++++++++++++++--------- tests/test_skillsbench_eval_infer.py | 17 - tests/test_skillsbench_run_infer.py | 442 +++++++++++++----- uv.lock | 50 +- vendor/software-agent-sdk | 2 +- 8 files changed, 824 insertions(+), 396 deletions(-) diff --git a/.gitignore b/.gitignore index 459fad588..9164fd12b 100644 --- a/.gitignore +++ b/.gitignore @@ -216,4 +216,5 @@ workspace/ # Evaluation outputs eval_outputs/ +evaluation_outputs/ builds/ diff --git a/benchmarks/skillsbench/README.md b/benchmarks/skillsbench/README.md index 60ff73652..21339842c 100644 --- a/benchmarks/skillsbench/README.md +++ b/benchmarks/skillsbench/README.md @@ -1,10 +1,10 @@ # SkillsBench Evaluation -This module provides integration with [SkillsBench](https://www.skillsbench.ai/), a benchmark for evaluating AI agents on real-world skill-based tasks. The integration uses [Harbor](https://harborframework.com) as the evaluation harness with the `openhands-sdk` agent. +This module provides integration with [SkillsBench](https://www.skillsbench.ai/), a benchmark for evaluating AI agents on real-world skill-based tasks. The integration uses [benchflow](https://github.com/benchflow-ai/benchflow) as the evaluation harness with the `openhands` agent. ## Overview -SkillsBench comprises tasks across 11 domains, evaluating the efficacy of Skills augmentation in LLM-based agents.Domains contain +SkillsBench comprises tasks across 11 domains, evaluating the efficacy of Skills augmentation in LLM-based agents. Domains include: - Software engineering - Office & white collar @@ -20,23 +20,25 @@ SkillsBench comprises tasks across 11 domains, evaluating the efficacy of Skills ## Prerequisites -1. **Install Harbor**: Harbor is the official harness for running SkillsBench. +1. **Install benchflow**: benchflow is the official harness for running SkillsBench. ```bash - pip install harbor + uv tool install benchflow==0.3.0 # or - uv pip install harbor + pip install benchflow==0.3.0 + # or + uv pip install benchflow==0.3.0 ``` -2. **Docker**: Harbor requires Docker to be installed and running. +2. **Docker**: benchflow requires Docker to be installed and running. -3. **LLM API Key**: Configure your LLM provider credentials. +3. **LLM API Key**: Configure your LLM provider credentials. The benchflow `openhands` agent reads `LLM_API_KEY` and optional `LLM_BASE_URL` from the environment. 
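   An illustrative sketch of those variables (the `skillsbench-infer` wrapper sets them automatically from the LLM config file):

   ```bash
   export LLM_API_KEY="YOUR_API_KEY"
   export LLM_BASE_URL="https://your-proxy.example.com"  # optional, e.g. for a LiteLLM proxy
   ```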
## Usage ### Running Inference -Run the SkillsBench evaluation using the OpenHands SDK agent: +Run the SkillsBench evaluation using the `openhands` agent: ```bash # Run full evaluation @@ -62,7 +64,7 @@ Create an LLM configuration file (e.g., `.llm_config/claude.json`): ```json { "model": "anthropic/claude-sonnet-4-20250514", - "api_key": "YOUR_API_KEY" + "api_key": "YOUR_ANTHROPIC_API_KEY" } ``` @@ -99,8 +101,6 @@ Each line contains: { "instance_id": "benchflow/task-name", "test_result": { - "trial_name": "...", - "trial_uri": "...", "rewards": {"reward": 1.0}, "passed": true }, @@ -134,22 +134,21 @@ Each line contains: ## Architecture -The integration follows the Harbor agent adapter pattern: +The integration uses the benchflow CLI as the evaluation harness: -1. **Harbor Harness**: Manages task containers and lifecycle -2. **OpenHands SDK Agent**: Runs inside containers to solve tasks -3. **ATIF Trajectories**: Results stored in Agent Trajectory Interchange Format +1. **Task download**: the integration clones the SkillsBench task repo locally when the task cache is empty +2. **benchflow job**: Runs all tasks concurrently with `openhands` +3. **Result conversion**: Trial `result.json` files are converted to the standard `output.jsonl` format ```text ┌──────────────────────────────────────────────────┐ -│ Harbor Harness │ +│ benchflow job │ │ ┌────────────────────────────────────────────┐ │ -│ │ Task Container │ │ +│ │ Task Container (Docker) │ │ │ │ ┌──────────────────────────────────────┐ │ │ -│ │ │ OpenHands SDK Agent │ │ │ +│ │ │ openhands │ │ │ │ │ │ - Terminal tool │ │ │ │ │ │ - File editor tool │ │ │ -│ │ │ - Task tracker tool │ │ │ │ │ └──────────────────────────────────────┘ │ │ │ └────────────────────────────────────────────┘ │ └──────────────────────────────────────────────────┘ @@ -158,6 +157,5 @@ The integration follows the Harbor agent adapter pattern: ## References - [SkillsBench](https://www.skillsbench.ai/) - The benchmark -- [Harbor](https://harborframework.com) - The evaluation harness -- [OpenHands SDK](https://github.com/OpenHands/software-agent-sdk) - The agent SDK -- [ATIF Specification](https://github.com/laude-institute/harbor/blob/main/docs/rfcs/0001-trajectory-format.md) - Trajectory format +- [benchflow](https://github.com/benchflow-ai/benchflow) - The evaluation harness +- [benchflow CLI reference](https://github.com/benchflow-ai/benchflow/blob/main/docs/cli-reference.md) - CLI documentation diff --git a/benchmarks/skillsbench/config.py b/benchmarks/skillsbench/config.py index 8b55a92b0..4ed541ab9 100644 --- a/benchmarks/skillsbench/config.py +++ b/benchmarks/skillsbench/config.py @@ -1,16 +1,13 @@ """SkillsBench configuration defaults.""" -# Default inference settings (only include values actually used by argparse) +# Default inference settings INFER_DEFAULTS = { "dataset": "benchflow/skillsbench", "output_dir": "./evaluation_outputs", "num_workers": 1, } -# Harbor configuration defaults -HARBOR_DEFAULTS = { - # Harbor executable - "harbor_executable": "harbor", - # Default agent name for openhands-sdk - "agent_name": "openhands-sdk", +# benchflow configuration defaults +BENCHFLOW_DEFAULTS = { + "agent_name": "openhands", } diff --git a/benchmarks/skillsbench/run_infer.py b/benchmarks/skillsbench/run_infer.py index a8afa7281..2e11a100a 100644 --- a/benchmarks/skillsbench/run_infer.py +++ b/benchmarks/skillsbench/run_infer.py @@ -1,24 +1,31 @@ -"""SkillsBench inference script using Harbor with openhands-sdk agent. 
+"""SkillsBench inference script using the benchflow SDK. -This script runs SkillsBench evaluation using Harbor as the harness -and openhands-sdk as the agent. Results are saved in a format compatible +This script runs SkillsBench evaluation using `benchflow job` as the harness +and `openhands` as the default agent. Results are saved in a format compatible with the standard evaluation pipeline. Usage: - uv run skillsbench-infer --dataset benchflow/skillsbench + uv run skillsbench-infer + + # Run specific tasks + uv run skillsbench-infer --select tasks.txt """ import argparse import json import os +import re +import shutil import subprocess import sys +import tempfile from datetime import datetime, timezone from pathlib import Path +import yaml from pydantic import SecretStr -from benchmarks.skillsbench.config import HARBOR_DEFAULTS, INFER_DEFAULTS +from benchmarks.skillsbench.config import BENCHFLOW_DEFAULTS, INFER_DEFAULTS from benchmarks.utils.evaluation_utils import construct_eval_output_dir from benchmarks.utils.report_costs import generate_cost_report from openhands.sdk import LLM, get_logger @@ -26,256 +33,469 @@ logger = get_logger(__name__) -# Output filename for results +# Matches benchflow 0.3.0 job directory names: YYYY-MM-DD__HH-MM-SS +_TIMESTAMP_RE = re.compile(r"^\d{4}-\d{2}-\d{2}__\d{2}-\d{2}-\d{2}$") + +# "Total cost: $0.0487" +_COST_RE = re.compile(r"Total cost:\s*\$([0-9]+(?:\.[0-9]+)?)") +# "Tokens: ↑ input 404.21K • ... • ↓ output 7.83K" +_TOKENS_RE = re.compile(r"↑ input\s+([\d.]+)([KMB]?)\b.*?↓ output\s+([\d.]+)([KMB]?)\b") + OUTPUT_FILENAME = "output.jsonl" +TASK_REPOS = { + "skillsbench": { + "repo": "https://github.com/benchflow-ai/skillsbench.git", + "subdir": "tasks", + } +} + +_DIRECT_PROVIDER_ENV_VARS: dict[str, tuple[tuple[str, ...], str | None]] = { + "anthropic": (("ANTHROPIC_API_KEY",), "ANTHROPIC_BASE_URL"), + "gemini": (("GEMINI_API_KEY", "GOOGLE_API_KEY"), "GEMINI_BASE_URL"), + "google": (("GEMINI_API_KEY", "GOOGLE_API_KEY"), "GEMINI_BASE_URL"), + "openai": (("OPENAI_API_KEY",), "OPENAI_BASE_URL"), +} + + +def _infer_direct_provider(model: str) -> str | None: + """Infer the provider prefix for direct model names. + + Examples: + - gemini/gemini-2.5-pro -> gemini + - anthropic/claude-sonnet-4-5 -> anthropic + - litellm_proxy/anthropic/... -> None (proxy config uses LLM_* vars) + """ + if not model or model.startswith("litellm_proxy/"): + return None + if "/" in model: + provider = model.split("/", 1)[0].lower() + if provider in _DIRECT_PROVIDER_ENV_VARS: + return provider + return None + + +def _build_benchflow_agent_env(llm: LLM) -> dict[str, str]: + """Build the sandbox environment for benchflow's openhands agent. + + Only LLM-specific variables are returned — these go INTO the sandbox + container via the ``agent_env`` YAML key. The calling process inherits + the host environment normally; dumping ``os.environ`` here would leak + the entire host env into every container. 
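    Example (hypothetical key): for ``model="gemini/gemini-2.5-pro"`` with
    ``api_key="sk-test"``, the returned mapping sets ``LLM_API_KEY``,
    ``GEMINI_API_KEY`` and ``GOOGLE_API_KEY`` to that key.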
+ """ + env: dict[str, str] = {} + api_key: str | None = None + if llm.api_key: + api_key = ( + llm.api_key.get_secret_value() + if isinstance(llm.api_key, SecretStr) + else llm.api_key + ) + env["LLM_API_KEY"] = api_key + if llm.base_url: + env["LLM_BASE_URL"] = llm.base_url + + provider = _infer_direct_provider(llm.model) + if provider and api_key: + key_vars, base_url_var = _DIRECT_PROVIDER_ENV_VARS[provider] + for var_name in key_vars: + env[var_name] = api_key + if llm.base_url and base_url_var: + env[base_url_var] = llm.base_url + + return env + + +def check_benchflow_installed() -> bool: + """Check if benchflow CLI is installed and available. + + Tries ``bench`` first (current name), then falls back to the legacy + ``benchflow`` binary. + """ + for cmd in ("bench", "benchflow"): + try: + result = subprocess.run( + [cmd, "--help"], + capture_output=True, + text=True, + timeout=10, + ) + if result.returncode == 0: + return True + except (FileNotFoundError, subprocess.TimeoutExpired): + continue + return False + -def check_harbor_installed() -> bool: - """Check if harbor CLI is installed and available.""" - harbor_exe = HARBOR_DEFAULTS["harbor_executable"] +def _resolve_task_repo(dataset: str) -> tuple[str, dict[str, str]]: + """Map a benchflow dataset name to its task repository metadata.""" + dataset_name = dataset.split("@", 1)[0].split("/")[-1] try: - result = subprocess.run( - [harbor_exe, "--version"], + return dataset_name, TASK_REPOS[dataset_name] + except KeyError as exc: + raise ValueError( + f"Unsupported SkillsBench dataset: {dataset!r}. " + f"Known datasets: {sorted(TASK_REPOS)}" + ) from exc + + +def ensure_tasks( + dataset: str, + tasks_dir: Path, + task_ids: list[str] | None = None, +) -> None: + """Download tasks for a benchflow dataset into tasks_dir. + + BenchFlow 0.3.0 does not expose ``benchflow tasks pull``, so we clone the + benchmark task repository directly when the local tasks directory is empty. + + When *task_ids* is provided, a sparse checkout is used so only the + requested task subdirectories are downloaded — much faster than a full + clone for large repos. 
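    For example, requesting only ``benchflow/weighted-gdp-calc`` would check out
    just the ``tasks/weighted-gdp-calc`` subtree of the task repository.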
+ """ + if tasks_dir.exists() and any(tasks_dir.iterdir()): + logger.info(f"Tasks already present in {tasks_dir}, skipping download") + return + + _, repo_info = _resolve_task_repo(dataset) + tasks_dir.mkdir(parents=True, exist_ok=True) + clone_dir = tasks_dir.parent / "_clone" + if clone_dir.exists(): + shutil.rmtree(clone_dir, ignore_errors=True) + + subdir = repo_info.get("subdir", "") + + if task_ids: + # Sparse checkout: only download the specific task directories + short_names = [tid.split("/")[-1] for tid in task_ids] + + cmd_clone = [ + "git", + "clone", + "--no-checkout", + "--depth", + "1", + repo_info["repo"], + str(clone_dir), + ] + logger.info(f"Sparse clone: {' '.join(cmd_clone)}") + result = subprocess.run(cmd_clone, capture_output=True, text=True) + if result.returncode != 0: + raise RuntimeError(f"task download failed: {result.stderr}") + + # Init sparse-checkout and set the desired paths + subprocess.run( + ["git", "-C", str(clone_dir), "sparse-checkout", "init", "--cone"], capture_output=True, text=True, - timeout=10, + check=True, ) - return result.returncode == 0 - except (FileNotFoundError, subprocess.TimeoutExpired): - return False + sparse_paths = [f"{subdir}/{name}" if subdir else name for name in short_names] + subprocess.run( + ["git", "-C", str(clone_dir), "sparse-checkout", "set", *sparse_paths], + capture_output=True, + text=True, + check=True, + ) + subprocess.run( + ["git", "-C", str(clone_dir), "checkout"], + capture_output=True, + text=True, + check=True, + ) + else: + # Full shallow clone + cmd = ["git", "clone", "--depth", "1", repo_info["repo"], str(clone_dir)] + logger.info(f"Downloading tasks: {' '.join(cmd)}") + result = subprocess.run(cmd, capture_output=True, text=True) + if result.returncode != 0: + logger.error(f"Failed to clone tasks: {result.stderr}") + raise RuntimeError(f"task download failed: {result.stderr}") + + try: + source_dir = clone_dir / subdir if subdir else clone_dir + + for entry in source_dir.iterdir(): + target = tasks_dir / entry.name + if entry.is_dir(): + shutil.copytree(entry, target, dirs_exist_ok=True) + else: + shutil.copy2(entry, target) + finally: + shutil.rmtree(clone_dir, ignore_errors=True) + logger.info(f"Tasks downloaded to {tasks_dir}") -def run_harbor_evaluation( + +def run_benchflow_job( llm: LLM, - dataset: str, - output_dir: str, + tasks_dir: Path, + jobs_dir: Path, num_workers: int = 1, task_ids: list[str] | None = None, - n_limit: int | None = None, ) -> Path: - """Run harbor evaluation with openhands-sdk agent. + """Run benchflow job command. Args: llm: LLM configuration for the agent. - dataset: Harbor dataset name (e.g., benchflow/skillsbench). - output_dir: Directory to store output files. - num_workers: Number of parallel workers. - task_ids: Optional list of specific task IDs to run. - n_limit: Optional maximum number of dataset tasks to run. + tasks_dir: Path to directory containing task subdirectories. + jobs_dir: Directory for benchflow job output. + num_workers: Number of parallel workers (concurrency). + task_ids: Optional list of task IDs to filter (short names, not full paths). Returns: - Path to the harbor output directory. + Path to jobs_dir. """ - harbor_output_dir = Path(output_dir) / "harbor_output" - harbor_output_dir.mkdir(parents=True, exist_ok=True) - harbor_exe = HARBOR_DEFAULTS["harbor_executable"] - - # Build harbor command using harbor CLI flags. - # Use absolute path for --jobs-dir to avoid CWD-relative path issues. 
- cmd = [ - harbor_exe, - "run", - "-d", - dataset, - "-a", - HARBOR_DEFAULTS["agent_name"], - "-m", - llm.model, - "--jobs-dir", - str(harbor_output_dir.resolve()), - "--n-concurrent", - str(num_workers), - ] - - # Pass LLM credentials as agent environment variables - if llm.api_key: - api_key = ( - llm.api_key.get_secret_value() - if isinstance(llm.api_key, SecretStr) - else llm.api_key - ) - cmd.extend(["--ae", f"LLM_API_KEY={api_key}"]) - if llm.base_url: - cmd.extend(["--ae", f"LLM_BASE_URL={llm.base_url}"]) + jobs_dir.mkdir(parents=True, exist_ok=True) + + agent_env = _build_benchflow_agent_env(llm) + # Ubuntu 24.04 enforces PEP 668 and blocks bare `pip install` without + # --break-system-packages. benchflow's openhands install_cmd uses plain + # `pip install openhands`, which silently fails (exit 0) on Ubuntu 24.04, + # causing "Agent openhands install failed (rc=1)". Setting this env var + # makes pip skip the restriction without modifying the install_cmd. + agent_env.setdefault("PIP_BREAK_SYSTEM_PACKAGES", "1") + config = { + "tasks_dir": str(tasks_dir), + "jobs_dir": str(jobs_dir.resolve()), + "agent": BENCHFLOW_DEFAULTS["agent_name"], + "model": llm.model, + "environment": "docker", + "concurrency": num_workers, + # OpenHands is installed inside the sandbox as root by benchflow's + # registry install command. Running as the default "agent" user can + # lose access to that binary on some task images. + "sandbox_user": None, + "agent_env": agent_env, + } - # Add specific task names if provided - if task_ids: - for task_id in task_ids: - cmd.extend(["--include-task-name", task_id]) + with tempfile.NamedTemporaryFile( + mode="w", suffix=".yaml", prefix="benchflow-job-", delete=False + ) as tmp: + yaml.safe_dump(config, tmp, sort_keys=False) + config_path = tmp.name - if n_limit is not None: - cmd.extend(["--n-tasks", str(n_limit)]) + # Prefer `bench eval create` (current), fall back to legacy `benchflow job` + bench_bin = shutil.which("bench") or shutil.which("benchflow") or "bench" + if "benchflow" in bench_bin: + cmd = [bench_bin, "job", "--config", config_path] + else: + cmd = [bench_bin, "eval", "create", "-f", config_path] - logger.info(f"Running harbor command: {' '.join(cmd)}") - logger.info(f"Output directory: {harbor_output_dir}") + logger.info(f"Running: {' '.join(cmd)}") - # harbor's openhands-sdk agent reads LLM credentials from the host process - # environment (os.environ), not from --ae flags which go to the sandbox. - env = os.environ.copy() - if llm.api_key: - api_key = ( - llm.api_key.get_secret_value() - if isinstance(llm.api_key, SecretStr) - else llm.api_key - ) - env["LLM_API_KEY"] = api_key - if llm.base_url: - env["LLM_BASE_URL"] = llm.base_url + # Inject LLM vars into the host process env so benchflow's provider + # resolution can pick them up; the subprocess inherits normally (env=None). 
+ host_env = os.environ.copy() + host_env.update(agent_env) + result = subprocess.run(cmd, capture_output=True, text=True, env=host_env) + Path(config_path).unlink(missing_ok=True) - try: - result = subprocess.run( - cmd, - capture_output=True, - text=True, - env=env, - ) + if result.returncode != 0: + logger.error(f"benchflow job failed (code {result.returncode})") + logger.error(f"stdout: {result.stdout}") + logger.error(f"stderr: {result.stderr}") + raise RuntimeError(f"benchflow job failed: {result.stderr}") - if result.returncode != 0: - logger.error(f"Harbor command failed with code {result.returncode}") - logger.error(f"stdout: {result.stdout}") - logger.error(f"stderr: {result.stderr}") - raise RuntimeError(f"Harbor evaluation failed: {result.stderr}") + logger.info("benchflow job completed") + logger.info(f"stdout: {result.stdout}") - logger.info("Harbor evaluation completed successfully") - logger.info(f"stdout: {result.stdout}") + return jobs_dir - except FileNotFoundError: - raise RuntimeError( - "Harbor CLI not found. Please install harbor: pip install harbor" - ) - return harbor_output_dir - - -def _find_job_dir(harbor_output_dir: Path) -> Path: - """Find the harbor job directory (timestamp-named) inside the output dir.""" - # Harbor creates a timestamp-named subdirectory (e.g., 2026-03-07__16-08-47) - # containing result.json and trial subdirectories - candidates = [ - d - for d in harbor_output_dir.iterdir() - if d.is_dir() and (d / "result.json").exists() - ] - if not candidates: - raise RuntimeError( - f"No harbor job directory found in {harbor_output_dir}. " - f"Expected a timestamp-named directory containing result.json." +def _extract_trial_metrics(trial_dir: Path) -> dict: + """Extract token/cost metrics from benchflow 0.3.0 trial output files. + + benchflow 0.3.0 does not write cost/token fields to result.json. + Instead, metrics are read from: + 1. agent/trajectory.json → final_metrics (harbor-format agent) + 2. agent/openhands.txt → "Total cost:" and "Tokens:" lines (ACP agent) + """ + # 1. Harbor-format trajectory.json written by openhands-sdk agent + traj_file = trial_dir / "agent" / "trajectory.json" + if traj_file.exists(): + try: + with open(traj_file) as f: + traj = json.load(f) + fm = traj.get("final_metrics") or {} + if fm: + return { + "total_prompt_tokens": int(fm.get("total_prompt_tokens") or 0), + "total_completion_tokens": int( + fm.get("total_completion_tokens") or 0 + ), + "total_cost_usd": float(fm.get("total_cost_usd") or 0.0), + } + except (json.JSONDecodeError, OSError): + pass + + # 2. 
ACP agent log written by openhands acp (benchflow 0.3.0 native) + def _parse_token_count(value: str, suffix: str) -> int: + n = float(value) + return int( + n * {"K": 1_000, "M": 1_000_000, "B": 1_000_000_000}.get(suffix.upper(), 1) ) - # Use the most recent job directory if multiple exist - return sorted(candidates)[-1] + for log_name in ("openhands.txt", "openhands_sdk.txt"): + log_file = trial_dir / "agent" / log_name + if not log_file.exists(): + continue + try: + text = log_file.read_text(errors="replace") + cost_usd = 0.0 + prompt_tokens = 0 + completion_tokens = 0 + m = _COST_RE.search(text) + if m: + cost_usd = float(m.group(1)) + m = _TOKENS_RE.search(text) + if m: + prompt_tokens = _parse_token_count(m.group(1), m.group(2)) + completion_tokens = _parse_token_count(m.group(3), m.group(4)) + if cost_usd or prompt_tokens: + return { + "total_prompt_tokens": prompt_tokens, + "total_completion_tokens": completion_tokens, + "total_cost_usd": cost_usd, + } + except OSError: + pass -def convert_harbor_to_eval_output( - harbor_output_dir: Path, + return { + "total_prompt_tokens": 0, + "total_completion_tokens": 0, + "total_cost_usd": 0.0, + } + + +def convert_benchflow_to_eval_output( + jobs_dir: Path, eval_output_path: Path, + task_ids: list[str] | None = None, ) -> None: - """Convert harbor output to evaluation output format. + """Convert benchflow job output to standard evaluation output format. - Harbor stores trial results in a job directory structured as: - harbor_output/TIMESTAMP/TRIAL_NAME/result.json + benchflow 0.3.0 stores trial results as: + jobs_dir/YYYY-MM-DD__HH-MM-SS/TASK_NAME__UUID8/result.json - Each trial's result.json contains task_name, verifier_result, agent_result, - timing info, and exception details. + Each result.json contains task_name, rewards, error, verifier_error, and timing. Args: - harbor_output_dir: Path to harbor output directory. - eval_output_path: Path to write the converted output.jsonl. + jobs_dir: Path to benchflow jobs directory. + eval_output_path: Path to write output.jsonl. + task_ids: Optional filter for specific task IDs (short names). """ - logger.info(f"Converting harbor output from {harbor_output_dir}") - - job_dir = _find_job_dir(harbor_output_dir) - logger.info(f"Using harbor job directory: {job_dir}") + logger.info(f"Converting benchflow output from {jobs_dir}") + + # benchflow 0.3.0 writes: + # jobs/summary.json + # jobs/TIMESTAMP/TRIAL_NAME/result.json + # while older local outputs may place results directly under jobs/. + job_dirs = [d for d in jobs_dir.iterdir() if d.is_dir()] + timestamp_job_dirs = [d for d in job_dirs if _TIMESTAMP_RE.match(d.name)] + + if timestamp_job_dirs: + selected_job_dir = sorted(timestamp_job_dirs)[-1] + logger.info(f"Using benchflow job directory: {selected_job_dir}") + task_dirs = [d for d in selected_job_dir.iterdir() if d.is_dir()] + else: + task_dirs = job_dirs - # Find trial result files (each trial dir has a result.json) - result_files = list(job_dir.glob("*/result.json")) - # Exclude the job-level result.json - result_files = [f for f in result_files if f.parent != job_dir] + if not task_dirs: + raise RuntimeError(f"No task directories found in {jobs_dir}") - if not result_files: - raise RuntimeError( - f"No trial result files found in {job_dir}. " - f"Expected result.json files in trial subdirectories." 
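For orientation, a minimal, self-contained sketch (not part of the patch) of the `openhands.txt` parsing that `_extract_trial_metrics` performs above. It reuses the `_COST_RE`/`_TOKENS_RE` patterns and suffix handling added in this patch; the sample line is the one the test fixture later in this patch writes into `agent/openhands.txt`.

```python
# Standalone sketch of the ACP log parsing shown above; mirrors the regexes
# and suffix multipliers defined in run_infer.py. Sample line taken from the
# test_metrics_from_acp_agent_log fixture in this patch.
import re

_COST_RE = re.compile(r"Total cost:\s*\$([0-9]+(?:\.[0-9]+)?)")
_TOKENS_RE = re.compile(r"↑ input\s+([\d.]+)([KMB]?)\b.*?↓ output\s+([\d.]+)([KMB]?)\b")


def parse_token_count(value: str, suffix: str) -> int:
    # "404.21", "K" -> 404210; an empty suffix means a multiplier of 1
    multipliers = {"K": 1_000, "M": 1_000_000, "B": 1_000_000_000}
    return int(float(value) * multipliers.get(suffix.upper(), 1))


text = (
    "Tokens: ↑ input 404.21K • cache hit 70.47% • reasoning 579 "
    "• ↓ output 7.83K • $0.0487\n"
    "Total cost: $0.0487\n"
)

cost_usd, prompt_tokens, completion_tokens = 0.0, 0, 0
m = _COST_RE.search(text)
if m:
    cost_usd = float(m.group(1))
m = _TOKENS_RE.search(text)
if m:
    prompt_tokens = parse_token_count(m.group(1), m.group(2))
    completion_tokens = parse_token_count(m.group(3), m.group(4))

# For this line the test in this patch asserts:
# cost_usd == 0.0487, prompt_tokens == 404210, completion_tokens == 7830
```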
- ) + if task_ids: + short_ids = {tid.split("/")[-1] for tid in task_ids} + task_dirs = [d for d in task_dirs if d.name.split("__")[0] in short_ids] - logger.info(f"Found {len(result_files)} trial results in {job_dir}") + logger.info(f"Processing {len(task_dirs)} task directories") results: list[dict] = [] errors: list[dict] = [] - for result_file in result_files: + for task_dir in sorted(task_dirs): + # Find the trial result — benchflow writes trial-0/result.json + trial_results = list(task_dir.glob("trial-*/result.json")) + if not trial_results: + # Fall back to a direct result.json + direct = task_dir / "result.json" + if direct.exists(): + trial_results = [direct] + + if not trial_results: + logger.warning(f"No result.json found in {task_dir}, skipping") + errors.append( + { + "instance_id": f"benchflow/{task_dir.name}", + "error": "No result.json found", + "test_result": {}, + } + ) + continue + + # Use the last trial (highest retry index) + result_file = sorted(trial_results)[-1] + try: with open(result_file) as f: trial = json.load(f) - instance_id = trial.get("task_name", result_file.parent.name) + task_basename = task_dir.name.split("__")[0] + task_name = trial.get("task_name") or f"benchflow/{task_basename}" + # Normalise to benchflow/ form + if "/" not in task_name: + task_name = f"benchflow/{task_name}" - # Check for exceptions - if trial.get("exception_info"): + error = trial.get("error") + verifier_error = trial.get("verifier_error") + + if error or verifier_error: errors.append( { - "instance_id": instance_id, - "error": str(trial["exception_info"]), + "instance_id": task_name, + "error": str(error or verifier_error), "test_result": {}, } ) continue - # Extract verifier results - verifier_result = trial.get("verifier_result", {}) - rewards = verifier_result.get("rewards", {}) - passed = rewards.get("reward", 0.0) > 0 - - # Extract agent metrics - agent_result = trial.get("agent_result", {}) + rewards = trial.get("rewards") or {} + passed = bool(rewards.get("reward", 0.0)) eval_entry = { - "instance_id": instance_id, + "instance_id": task_name, "test_result": { - "trial_name": trial.get("trial_name"), - "trial_uri": trial.get("trial_uri"), "rewards": rewards, "passed": passed, }, "instruction": "", "error": None, "history": [], - "metrics": { - "total_prompt_tokens": agent_result.get("n_input_tokens") or 0, - "total_completion_tokens": ( - agent_result.get("n_output_tokens") or 0 - ), - "total_cost_usd": agent_result.get("cost_usd") or 0.0, - }, + "metrics": _extract_trial_metrics(result_file.parent), } results.append(eval_entry) - logger.info( - f"Processed trial {instance_id}: reward={rewards.get('reward', 'N/A')}" - ) + logger.info(f"Processed {task_name}: reward={rewards.get('reward', 'N/A')}") except (json.JSONDecodeError, OSError) as e: - logger.error(f"Failed to process result file {result_file}: {e}") + logger.error(f"Failed to read {result_file}: {e}") errors.append( { - "instance_id": result_file.parent.name, + "instance_id": f"benchflow/{task_dir.name}", "error": str(e), "test_result": {}, } ) if not results and not errors: - raise RuntimeError(f"No trials processed from {harbor_output_dir}") + raise RuntimeError(f"No trials processed from {jobs_dir}") if not results: logger.warning( - f"All {len(errors)} trials failed in {harbor_output_dir}; " - "writing error entries for downstream reporting" + f"All {len(errors)} trials failed; writing error entries for reporting" ) - # Write results to output.jsonl with open(eval_output_path, "w") as f: - for entry in results: 
- f.write(json.dumps(entry) + "\n") - for entry in errors: + for entry in results + errors: f.write(json.dumps(entry) + "\n") logger.info( @@ -298,18 +518,18 @@ def load_task_ids_from_file(filepath: str) -> list[str]: def main() -> None: """Main entry point for skillsbench inference.""" parser = argparse.ArgumentParser( - description="Run SkillsBench evaluation with openhands-sdk via Harbor", + description="Run SkillsBench evaluation with benchflow and openhands", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" Examples: # Run full skillsbench evaluation uv run skillsbench-infer .llm_config/claude.json - # Run specific tasks + # Run specific tasks from a file uv run skillsbench-infer .llm_config/claude.json --select tasks.txt - # Run with custom dataset version - uv run skillsbench-infer .llm_config/claude.json --dataset benchflow/skillsbench@1.0 + # Run with more concurrency + uv run skillsbench-infer .llm_config/claude.json --num-workers 4 """, ) @@ -322,7 +542,7 @@ def main() -> None: "--dataset", type=str, default=INFER_DEFAULTS["dataset"], - help="Harbor dataset name (e.g., benchflow/skillsbench)", + help="benchflow dataset name (e.g., benchflow/skillsbench)", ) parser.add_argument( "--output-dir", @@ -334,12 +554,12 @@ def main() -> None: "--num-workers", type=int, default=INFER_DEFAULTS["num_workers"], - help="Number of parallel workers", + help="Number of parallel workers (concurrency)", ) parser.add_argument( "--n-limit", type=int, - help="Maximum number of dataset tasks to run after Harbor filtering", + help="Maximum number of tasks to run", ) parser.add_argument( "--select", @@ -358,14 +578,13 @@ def main() -> None: help="Optional note for the evaluation run", ) parser.add_argument( - "--skip-harbor", + "--skip-run", action="store_true", - help="Skip running harbor and only convert existing results", + help="Skip running benchflow and only convert existing results", ) args = parser.parse_args() - # Validate LLM config if not os.path.isfile(args.llm_config_path): logger.error(f"LLM config file does not exist: {args.llm_config_path}") sys.exit(1) @@ -375,87 +594,103 @@ def main() -> None: llm = LLM.model_validate_json(llm_config) logger.info(f"Using LLM: {llm.model}") - # Check harbor installation - if not args.skip_harbor and not check_harbor_installed(): + if not args.skip_run and not check_benchflow_installed(): logger.error( - "Harbor CLI is not installed. Please install it:\n" - " pip install harbor\n" + "benchflow CLI is not installed. 
Please install it:\n" + " uv tool install benchflow==0.3.0\n" " # or\n" - " uv pip install harbor" + " pip install benchflow==0.3.0\n" + " # or\n" + " uv pip install benchflow==0.3.0" ) sys.exit(1) - # Construct output directory dataset_description = args.dataset.replace("/", "__").replace("@", "-") structured_output_dir = construct_eval_output_dir( base_dir=args.output_dir, dataset_name=dataset_description, model_name=llm.model, - max_iterations=100, # Not directly used but required for path construction + max_iterations=100, eval_note=args.note, ) logger.info(f"Output directory: {structured_output_dir}") os.makedirs(structured_output_dir, exist_ok=True) - # Save metadata metadata = { "llm": llm.model_dump_json(), "dataset": args.dataset, "timestamp": datetime.now(timezone.utc).isoformat(), - "harbor_agent": HARBOR_DEFAULTS["agent_name"], + "benchflow_agent": BENCHFLOW_DEFAULTS["agent_name"], "note": args.note, } metadata_path = Path(structured_output_dir) / "metadata.json" with open(metadata_path, "w") as f: json.dump(metadata, f, indent=2) - # Collect task IDs if specified task_ids: list[str] | None = None if args.select: - loaded_ids = load_task_ids_from_file(args.select) - task_ids = loaded_ids - logger.info(f"Loaded {len(loaded_ids)} task IDs from {args.select}") + task_ids = load_task_ids_from_file(args.select) + logger.info(f"Loaded {len(task_ids)} task IDs from {args.select}") elif args.task_id: task_ids = list(args.task_id) logger.info(f"Running {len(task_ids)} specified task IDs") + tasks_dir = Path(structured_output_dir) / "tasks" + jobs_dir = Path(structured_output_dir) / "jobs" output_path = Path(structured_output_dir) / OUTPUT_FILENAME - if not args.skip_harbor: - # Run harbor evaluation + if not args.skip_run: try: - harbor_output_dir = run_harbor_evaluation( + ensure_tasks(args.dataset, tasks_dir, task_ids=task_ids) + + # Apply n_limit by slicing available task directories + effective_task_dirs = tasks_dir + if args.n_limit is not None or task_ids is not None: + all_dirs = sorted(d for d in tasks_dir.iterdir() if d.is_dir()) + if task_ids: + short_ids = {tid.split("/")[-1] for tid in task_ids} + all_dirs = [d for d in all_dirs if d.name in short_ids] + if args.n_limit is not None: + all_dirs = all_dirs[: args.n_limit] + + # Write a filtered tasks dir symlink tree + filtered_tasks_dir = Path(structured_output_dir) / "tasks_filtered" + filtered_tasks_dir.mkdir(exist_ok=True) + for d in all_dirs: + link = filtered_tasks_dir / d.name + if not link.exists(): + link.symlink_to(d.resolve()) + effective_task_dirs = filtered_tasks_dir + + run_benchflow_job( llm=llm, - dataset=args.dataset, - output_dir=structured_output_dir, + tasks_dir=effective_task_dirs, + jobs_dir=jobs_dir, num_workers=args.num_workers, task_ids=task_ids, - n_limit=args.n_limit, ) - # Convert harbor output to standard format - convert_harbor_to_eval_output( - harbor_output_dir=harbor_output_dir, + convert_benchflow_to_eval_output( + jobs_dir=jobs_dir, eval_output_path=output_path, + task_ids=task_ids, ) except Exception as e: logger.error(f"Evaluation failed: {e}") sys.exit(1) else: - # Skip harbor, just convert existing results - harbor_output_dir = Path(structured_output_dir) / "harbor_output" - if harbor_output_dir.exists(): - convert_harbor_to_eval_output( - harbor_output_dir=harbor_output_dir, + if jobs_dir.exists(): + convert_benchflow_to_eval_output( + jobs_dir=jobs_dir, eval_output_path=output_path, + task_ids=task_ids, ) else: - logger.error(f"No harbor output found at {harbor_output_dir}") + 
logger.error(f"No jobs output found at {jobs_dir}") sys.exit(1) - # Generate cost report if output_path.exists(): generate_cost_report(str(output_path)) diff --git a/tests/test_skillsbench_eval_infer.py b/tests/test_skillsbench_eval_infer.py index 56d54f27a..1334da297 100644 --- a/tests/test_skillsbench_eval_infer.py +++ b/tests/test_skillsbench_eval_infer.py @@ -39,23 +39,6 @@ def test_resolved_instance(self, tmp_path: Path) -> None: assert result["unresolved_instances"] == 0 assert "benchflow/weighted-gdp-calc" in result["resolved_ids"] - def test_unresolved_instance(self, tmp_path: Path) -> None: - """Test processing an unresolved (passed=False) instance.""" - input_file = tmp_path / "unresolved.jsonl" - output_file = tmp_path / "unresolved.report.json" - - entry = { - "instance_id": "benchflow/task-1", - "test_result": {"passed": False, "rewards": {"reward": 0.0}}, - "error": None, - } - input_file.write_text(json.dumps(entry) + "\n") - - result = process_skillsbench_results(str(input_file), str(output_file)) - - assert result["resolved_instances"] == 0 - assert result["unresolved_instances"] == 1 - def test_instance_with_error(self, tmp_path: Path) -> None: """Test processing an instance that errored.""" input_file = tmp_path / "error.jsonl" diff --git a/tests/test_skillsbench_run_infer.py b/tests/test_skillsbench_run_infer.py index 5f8452cb3..784b4d1cc 100644 --- a/tests/test_skillsbench_run_infer.py +++ b/tests/test_skillsbench_run_infer.py @@ -4,82 +4,102 @@ from pathlib import Path import pytest +import yaml -from benchmarks.skillsbench.config import INFER_DEFAULTS +from benchmarks.skillsbench.config import BENCHFLOW_DEFAULTS, INFER_DEFAULTS from benchmarks.skillsbench.run_infer import ( - convert_harbor_to_eval_output, - run_harbor_evaluation, + _build_benchflow_agent_env, + convert_benchflow_to_eval_output, + run_benchflow_job, ) from openhands.sdk import LLM -class TestRunHarborEvaluation: - """Tests for building Harbor invocation arguments.""" +class TestRunBenchflowJob: + """Tests for building benchflow job invocation arguments.""" - def test_default_dataset_matches_harbor_registry(self) -> None: - """Test that the default dataset name matches Harbor's published registry.""" + def test_default_dataset_matches_benchflow_registry(self) -> None: + """Test that the default dataset name matches benchflow's published registry.""" assert INFER_DEFAULTS["dataset"] == "benchflow/skillsbench" - def test_run_harbor_evaluation_passes_filters_and_limits( + def test_default_agent_is_openhands(self) -> None: + """Test that the default agent is openhands.""" + assert BENCHFLOW_DEFAULTS["agent_name"] == "openhands" + + def test_run_benchflow_job_passes_model_and_concurrency( self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch ) -> None: - """Test Harbor command includes task filters and n-limit.""" - captured: dict[str, list[str]] = {} + """Test benchflow job command writes the expected YAML config.""" + captured_cmd: list[str] = [] + captured_env: dict[str, str] = {} + captured_config: dict = {} + + # Force legacy benchflow binary path so the command format is deterministic + monkeypatch.setattr( + "benchmarks.skillsbench.run_infer.shutil.which", + lambda name: "/usr/local/bin/benchflow" if name == "benchflow" else None, + ) def fake_run(cmd: list[str], capture_output: bool, text: bool, env: dict): - captured["cmd"] = cmd + captured_cmd[:] = cmd + captured_env.clear() + captured_env.update(env) + with open(cmd[3]) as f: + captured_config.update(yaml.safe_load(f)) return type( "Completed", 
(), - {"returncode": 0, "stdout": "ok", "stderr": ""}, + {"returncode": 0, "stdout": "Score: 1/1 (100%)", "stderr": ""}, )() monkeypatch.setattr("benchmarks.skillsbench.run_infer.subprocess.run", fake_run) - harbor_output_dir = run_harbor_evaluation( + tasks_dir = tmp_path / "tasks" + tasks_dir.mkdir() + jobs_dir = tmp_path / "jobs" + + run_benchflow_job( llm=LLM( - model="litellm_proxy/test-model", + model="anthropic/claude-sonnet-4-5", api_key="test-key", base_url="https://proxy.example.com", ), - dataset=INFER_DEFAULTS["dataset"], - output_dir=str(tmp_path), - num_workers=2, - task_ids=["benchflow/task-a", "benchflow/task-b"], - n_limit=3, - ) - - expected_output_dir = tmp_path / "harbor_output" - assert harbor_output_dir == expected_output_dir - - cmd = captured["cmd"] - assert cmd[:8] == [ - "harbor", - "run", - "-d", - "benchflow/skillsbench", - "-a", - "openhands-sdk", - "-m", - "litellm_proxy/test-model", - ] - assert "--jobs-dir" in cmd - assert str(expected_output_dir.resolve()) in cmd - assert cmd.count("--include-task-name") == 2 - assert "benchflow/task-a" in cmd - assert "benchflow/task-b" in cmd - assert cmd[cmd.index("--n-concurrent") + 1] == "2" - assert cmd[cmd.index("--n-tasks") + 1] == "3" - - def test_llm_credentials_passed_via_env( + tasks_dir=tasks_dir, + jobs_dir=jobs_dir, + num_workers=4, + ) + + cmd = captured_cmd + assert cmd[0] == "/usr/local/bin/benchflow" + assert cmd[1] == "job" + assert cmd[2] == "--config" + assert captured_config["tasks_dir"] == str(tasks_dir) + assert captured_config["jobs_dir"] == str(jobs_dir.resolve()) + assert captured_config["agent"] == "openhands" + assert captured_config["model"] == "anthropic/claude-sonnet-4-5" + assert captured_config["concurrency"] == 4 + assert captured_config["sandbox_user"] is None + + def test_llm_credentials_passed_via_subprocess_env( self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch ) -> None: - """Test that LLM credentials are passed via subprocess env, not --ae flags.""" - captured: dict = {} + """Test that LLM credentials are passed via subprocess env and YAML.""" + captured_cmd: list[str] = [] + captured_env: dict[str, str] = {} + captured_config: dict = {} + + # Force legacy benchflow binary path so the command format is deterministic + monkeypatch.setattr( + "benchmarks.skillsbench.run_infer.shutil.which", + lambda name: "/usr/local/bin/benchflow" if name == "benchflow" else None, + ) def fake_run(cmd: list[str], capture_output: bool, text: bool, env: dict): - captured["cmd"] = cmd - captured["env"] = env + captured_cmd[:] = cmd + captured_env.clear() + captured_env.update(env) + with open(cmd[3]) as f: + captured_config.update(yaml.safe_load(f)) return type( "Completed", (), @@ -88,60 +108,127 @@ def fake_run(cmd: list[str], capture_output: bool, text: bool, env: dict): monkeypatch.setattr("benchmarks.skillsbench.run_infer.subprocess.run", fake_run) - run_harbor_evaluation( + tasks_dir = tmp_path / "tasks" + tasks_dir.mkdir() + + run_benchflow_job( llm=LLM( model="test-model", api_key="my-secret-key", base_url="https://my-proxy.example.com", ), - dataset=INFER_DEFAULTS["dataset"], - output_dir=str(tmp_path), + tasks_dir=tasks_dir, + jobs_dir=tmp_path / "jobs", + ) + + # Credentials in subprocess env + assert captured_env["LLM_API_KEY"] == "my-secret-key" + assert captured_env["LLM_BASE_URL"] == "https://my-proxy.example.com" + assert "--ae" not in captured_cmd + assert captured_config["agent_env"]["LLM_API_KEY"] == "my-secret-key" + assert ( + captured_config["agent_env"]["LLM_BASE_URL"] + == 
"https://my-proxy.example.com" ) - assert captured["env"]["LLM_API_KEY"] == "my-secret-key" - assert captured["env"]["LLM_BASE_URL"] == "https://my-proxy.example.com" + def test_direct_gemini_model_sets_provider_env_vars(self) -> None: + """Direct provider models need provider-specific env vars.""" + env = _build_benchflow_agent_env( + LLM( + model="gemini/gemini-3.1-flash-lite-preview", + api_key="gemini-test-key", + ) + ) + assert env["LLM_API_KEY"] == "gemini-test-key" + assert env["GEMINI_API_KEY"] == "gemini-test-key" + assert env["GOOGLE_API_KEY"] == "gemini-test-key" -class TestConvertHarborToEvalOutput: - """Tests for convert_harbor_to_eval_output function.""" + def test_proxy_model_does_not_set_provider_env_vars(self) -> None: + """LiteLLM proxy configs should keep using generic LLM_* vars only.""" + env = _build_benchflow_agent_env( + LLM( + model="litellm_proxy/anthropic/claude-sonnet-4-20250514", + api_key="proxy-key", + base_url="https://proxy.example.com", + ) + ) - def _create_harbor_structure( - self, tmp_path: Path, trials: list[tuple[str, dict]] - ) -> Path: - """Create a mock Harbor output structure.""" - harbor_dir = tmp_path / "harbor_output" - job_dir = harbor_dir / "2026-01-01__00-00-00" - job_dir.mkdir(parents=True) - (job_dir / "result.json").write_text(json.dumps({"id": "test-job"})) + assert env["LLM_API_KEY"] == "proxy-key" + assert env["LLM_BASE_URL"] == "https://proxy.example.com" + assert "ANTHROPIC_API_KEY" not in env + assert "ANTHROPIC_BASE_URL" not in env - for trial_name, trial_result in trials: - trial_dir = job_dir / trial_name - trial_dir.mkdir() - (trial_dir / "result.json").write_text(json.dumps(trial_result)) - return harbor_dir +class TestConvertBenchflowToEvalOutput: + """Tests for convert_benchflow_to_eval_output function.""" + + def _create_benchflow_structure( + self, tmp_path: Path, tasks: list[tuple[str, dict]] + ) -> Path: + """Create a mock benchflow jobs directory structure. + + benchflow writes: jobs_dir/TASK_NAME/trial-0/result.json + """ + jobs_dir = tmp_path / "jobs" + for task_name, result in tasks: + trial_dir = jobs_dir / task_name / "trial-0" + trial_dir.mkdir(parents=True) + (trial_dir / "result.json").write_text(json.dumps(result)) + return jobs_dir + + def _create_benchflow_timestamped_job( + self, tmp_path: Path, tasks: list[tuple[str, dict]] + ) -> Path: + """Create a mock benchflow 0.3.0 jobs directory structure. + + benchflow writes: jobs/TIMESTAMP/TASK_NAME__RUNID/result.json + """ + jobs_dir = tmp_path / "jobs" + job_dir = jobs_dir / "2026-04-21__23-12-35" + job_dir.mkdir(parents=True) + (jobs_dir / "summary.json").write_text(json.dumps({"total": len(tasks)})) + for task_name, result in tasks: + trial_dir = job_dir / f"{task_name}__abc123" + trial_dir.mkdir(parents=True) + (trial_dir / "result.json").write_text(json.dumps(result)) + return jobs_dir def test_successful_trial_parsing(self, tmp_path: Path) -> None: - """Test successful parsing of harbor trial result.""" + """Test successful parsing of a benchflow trial result. + + benchflow 0.3.0 does not write cost/token fields to result.json. + Metrics are read from agent/trajectory.json (harbor-format agent) + or parsed from agent/openhands.txt (ACP agent stdout). 
+ """ trial_result = { "task_name": "benchflow/weighted-gdp-calc", - "trial_name": "weighted-gdp-calc__abc123", - "trial_uri": "file:///path/to/trial", - "agent_result": { - "n_input_tokens": 1000, - "n_output_tokens": 200, - "cost_usd": 0.05, - }, - "verifier_result": {"rewards": {"reward": 1.0}}, - "exception_info": None, + "rewards": {"reward": 1.0}, + "error": None, } - harbor_dir = self._create_harbor_structure( - tmp_path, [("weighted-gdp-calc__abc123", trial_result)] + jobs_dir = self._create_benchflow_structure( + tmp_path, [("weighted-gdp-calc", trial_result)] + ) + # Write agent/trajectory.json with final_metrics (harbor-format agent output). + # agent/ sits next to result.json, inside the trial-0 subdirectory. + trial_dir = jobs_dir / "weighted-gdp-calc" / "trial-0" + agent_dir = trial_dir / "agent" + agent_dir.mkdir(parents=True, exist_ok=True) + (agent_dir / "trajectory.json").write_text( + json.dumps( + { + "final_metrics": { + "total_prompt_tokens": 1000, + "total_completion_tokens": 200, + "total_cost_usd": 0.05, + } + } + ) ) output_file = tmp_path / "output.jsonl" - convert_harbor_to_eval_output(harbor_dir, output_file) + convert_benchflow_to_eval_output(jobs_dir, output_file) assert output_file.exists() with open(output_file) as f: @@ -151,26 +238,52 @@ def test_successful_trial_parsing(self, tmp_path: Path) -> None: assert entries[0]["instance_id"] == "benchflow/weighted-gdp-calc" assert entries[0]["test_result"]["passed"] is True assert entries[0]["metrics"]["total_cost_usd"] == 0.05 + assert entries[0]["metrics"]["total_prompt_tokens"] == 1000 + assert entries[0]["metrics"]["total_completion_tokens"] == 200 + + def test_metrics_from_acp_agent_log(self, tmp_path: Path) -> None: + """Test that metrics are extracted from agent/openhands.txt (ACP agent).""" + trial_result = { + "task_name": "benchflow/acp-task", + "rewards": {"reward": 1.0}, + "error": None, + } + jobs_dir = self._create_benchflow_timestamped_job( + tmp_path, [("acp-task", trial_result)] + ) + # Write agent/openhands.txt simulating openhands ACP stdout + trial_dir = jobs_dir / "2026-04-21__23-12-35" / "acp-task__abc123" + agent_dir = trial_dir / "agent" + agent_dir.mkdir(parents=True, exist_ok=True) + (agent_dir / "openhands.txt").write_text( + "OpenHands SDK v1.16.0\n" + "Tokens: ↑ input 404.21K • cache hit 70.47% • reasoning 579 • ↓ output 7.83K • $0.0487\n" + "Total cost: $0.0487\n" + ) + output_file = tmp_path / "output.jsonl" + convert_benchflow_to_eval_output(jobs_dir, output_file) + + with open(output_file) as f: + entries = [json.loads(line) for line in f] + + assert len(entries) == 1 + assert entries[0]["metrics"]["total_cost_usd"] == pytest.approx(0.0487) + assert entries[0]["metrics"]["total_prompt_tokens"] == 404210 + assert entries[0]["metrics"]["total_completion_tokens"] == 7830 def test_failed_trial(self, tmp_path: Path) -> None: """Test parsing of a trial with reward 0.""" trial_result = { "task_name": "benchflow/task-1", - "trial_name": "task-1__xyz", - "agent_result": { - "n_input_tokens": None, - "n_output_tokens": None, - "cost_usd": None, - }, - "verifier_result": {"rewards": {"reward": 0.0}}, - "exception_info": None, + "rewards": {"reward": 0.0}, + "error": None, } - harbor_dir = self._create_harbor_structure( - tmp_path, [("task-1__xyz", trial_result)] + jobs_dir = self._create_benchflow_structure( + tmp_path, [("task-1", trial_result)] ) output_file = tmp_path / "output.jsonl" - convert_harbor_to_eval_output(harbor_dir, output_file) + convert_benchflow_to_eval_output(jobs_dir, 
output_file) with open(output_file) as f: entries = [json.loads(line) for line in f] @@ -178,21 +291,19 @@ def test_failed_trial(self, tmp_path: Path) -> None: assert entries[0]["test_result"]["passed"] is False assert entries[0]["metrics"]["total_cost_usd"] == 0.0 - def test_trial_with_exception(self, tmp_path: Path) -> None: - """Test that exception trials are written as error entries.""" + def test_trial_with_error(self, tmp_path: Path) -> None: + """Test that errored trials are written as error entries.""" trial_result = { "task_name": "benchflow/error-task", - "trial_name": "error-task__err", - "agent_result": {}, - "verifier_result": {}, - "exception_info": {"type": "ValueError", "message": "LLM_API_KEY not set"}, + "rewards": {}, + "error": "LLM_API_KEY not set", } - harbor_dir = self._create_harbor_structure( - tmp_path, [("error-task__err", trial_result)] + jobs_dir = self._create_benchflow_structure( + tmp_path, [("error-task", trial_result)] ) output_file = tmp_path / "output.jsonl" - convert_harbor_to_eval_output(harbor_dir, output_file) + convert_benchflow_to_eval_output(jobs_dir, output_file) with open(output_file) as f: entries = [json.loads(line) for line in f] @@ -202,20 +313,121 @@ def test_trial_with_exception(self, tmp_path: Path) -> None: assert entries[0]["error"] is not None assert entries[0]["test_result"] == {} - def test_missing_job_directory(self, tmp_path: Path) -> None: - """Test handling when no job directory exists.""" - harbor_dir = tmp_path / "harbor_output" - harbor_dir.mkdir() + def test_missing_jobs_directory(self, tmp_path: Path) -> None: + """Test handling when jobs directory is empty.""" + jobs_dir = tmp_path / "jobs" + jobs_dir.mkdir() + + with pytest.raises(RuntimeError, match="No task directories found"): + convert_benchflow_to_eval_output(jobs_dir, tmp_path / "output.jsonl") + + def test_task_id_filtering(self, tmp_path: Path) -> None: + """Test that only specified task IDs are converted.""" + trials = [ + ( + "task-a", + { + "task_name": "benchflow/task-a", + "rewards": {"reward": 1.0}, + "error": None, + }, + ), + ( + "task-b", + { + "task_name": "benchflow/task-b", + "rewards": {"reward": 0.0}, + "error": None, + }, + ), + ] + jobs_dir = self._create_benchflow_structure(tmp_path, trials) + output_file = tmp_path / "output.jsonl" - with pytest.raises(RuntimeError, match="No harbor job directory found"): - convert_harbor_to_eval_output(harbor_dir, tmp_path / "output.jsonl") + convert_benchflow_to_eval_output( + jobs_dir, output_file, task_ids=["benchflow/task-a"] + ) - def test_empty_job_directory(self, tmp_path: Path) -> None: - """Test handling of harbor job dir with no trial subdirs.""" - harbor_dir = tmp_path / "harbor_output" - job_dir = harbor_dir / "2026-01-01__00-00-00" - job_dir.mkdir(parents=True) - (job_dir / "result.json").write_text(json.dumps({"id": "test"})) + with open(output_file) as f: + entries = [json.loads(line) for line in f] + + assert len(entries) == 1 + assert entries[0]["instance_id"] == "benchflow/task-a" + + def test_task_name_normalised_to_benchflow_prefix(self, tmp_path: Path) -> None: + """Test that task names without prefix get benchflow/ prepended.""" + trial_result = { + "task_name": "weighted-gdp-calc", # no benchflow/ prefix + "rewards": {"reward": 1.0}, + "error": None, + } + jobs_dir = self._create_benchflow_structure( + tmp_path, [("weighted-gdp-calc", trial_result)] + ) + output_file = tmp_path / "output.jsonl" + convert_benchflow_to_eval_output(jobs_dir, output_file) + + with open(output_file) as f: + 
entries = [json.loads(line) for line in f] + + assert entries[0]["instance_id"] == "benchflow/weighted-gdp-calc" + + def test_timestamped_job_directory_is_processed(self, tmp_path: Path) -> None: + """Test benchflow 0.3.0 timestamped jobs directory layout.""" + trial_result = { + "task_name": "weighted-gdp-calc", + "rewards": {"reward": 1.0}, + "error": None, + "n_input_tokens": 42, + "n_output_tokens": 7, + "cost_usd": 0.01, + } + + jobs_dir = self._create_benchflow_timestamped_job( + tmp_path, [("weighted-gdp-calc", trial_result)] + ) + output_file = tmp_path / "output.jsonl" + + convert_benchflow_to_eval_output(jobs_dir, output_file) + + with open(output_file) as f: + entries = [json.loads(line) for line in f] + + assert len(entries) == 1 + assert entries[0]["instance_id"] == "benchflow/weighted-gdp-calc" + assert entries[0]["test_result"]["passed"] is True - with pytest.raises(RuntimeError, match="No trial result files found"): - convert_harbor_to_eval_output(harbor_dir, tmp_path / "output.jsonl") + def test_task_id_filter_matches_timestamped_trial_dir(self, tmp_path: Path) -> None: + """Test filtering strips the run suffix from trial directory names.""" + jobs_dir = self._create_benchflow_timestamped_job( + tmp_path, + [ + ( + "task-a", + { + "task_name": "task-a", + "rewards": {"reward": 1.0}, + "error": None, + }, + ), + ( + "task-b", + { + "task_name": "task-b", + "rewards": {"reward": 0.0}, + "error": None, + }, + ), + ], + ) + output_file = tmp_path / "output.jsonl" + + convert_benchflow_to_eval_output( + jobs_dir, output_file, task_ids=["benchflow/task-a"] + ) + + with open(output_file) as f: + entries = [json.loads(line) for line in f] + + assert len(entries) == 1 + assert entries[0]["instance_id"] == "benchflow/task-a" diff --git a/uv.lock b/uv.lock index 2cd0b3640..147abedc9 100644 --- a/uv.lock +++ b/uv.lock @@ -1282,6 +1282,7 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/44/69/9b804adb5fd0671f367781560eb5eb586c4d495277c93bde4307b9e28068/greenlet-3.2.4-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:3b67ca49f54cede0186854a008109d6ee71f66bd57bb36abd6d0a0267b540cdd", size = 274079, upload-time = "2025-08-07T13:15:45.033Z" }, { url = "https://files.pythonhosted.org/packages/46/e9/d2a80c99f19a153eff70bc451ab78615583b8dac0754cfb942223d2c1a0d/greenlet-3.2.4-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ddf9164e7a5b08e9d22511526865780a576f19ddd00d62f8a665949327fde8bb", size = 640997, upload-time = "2025-08-07T13:42:56.234Z" }, { url = "https://files.pythonhosted.org/packages/3b/16/035dcfcc48715ccd345f3a93183267167cdd162ad123cd93067d86f27ce4/greenlet-3.2.4-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:f28588772bb5fb869a8eb331374ec06f24a83a9c25bfa1f38b6993afe9c1e968", size = 655185, upload-time = "2025-08-07T13:45:27.624Z" }, + { url = "https://files.pythonhosted.org/packages/31/da/0386695eef69ffae1ad726881571dfe28b41970173947e7c558d9998de0f/greenlet-3.2.4-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:5c9320971821a7cb77cfab8d956fa8e39cd07ca44b6070db358ceb7f8797c8c9", size = 649926, upload-time = "2025-08-07T13:53:15.251Z" }, { url = "https://files.pythonhosted.org/packages/68/88/69bf19fd4dc19981928ceacbc5fd4bb6bc2215d53199e367832e98d1d8fe/greenlet-3.2.4-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:c60a6d84229b271d44b70fb6e5fa23781abb5d742af7b808ae3f6efd7c9c60f6", size = 651839, upload-time = "2025-08-07T13:18:30.281Z" }, { url = 
"https://files.pythonhosted.org/packages/19/0d/6660d55f7373b2ff8152401a83e02084956da23ae58cddbfb0b330978fe9/greenlet-3.2.4-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3b3812d8d0c9579967815af437d96623f45c0f2ae5f04e366de62a12d83a8fb0", size = 607586, upload-time = "2025-08-07T13:18:28.544Z" }, { url = "https://files.pythonhosted.org/packages/8e/1a/c953fdedd22d81ee4629afbb38d2f9d71e37d23caace44775a3a969147d4/greenlet-3.2.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:abbf57b5a870d30c4675928c37278493044d7c14378350b3aa5d484fa65575f0", size = 1123281, upload-time = "2025-08-07T13:42:39.858Z" }, @@ -1292,6 +1293,7 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/49/e8/58c7f85958bda41dafea50497cbd59738c5c43dbbea5ee83d651234398f4/greenlet-3.2.4-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:1a921e542453fe531144e91e1feedf12e07351b1cf6c9e8a3325ea600a715a31", size = 272814, upload-time = "2025-08-07T13:15:50.011Z" }, { url = "https://files.pythonhosted.org/packages/62/dd/b9f59862e9e257a16e4e610480cfffd29e3fae018a68c2332090b53aac3d/greenlet-3.2.4-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cd3c8e693bff0fff6ba55f140bf390fa92c994083f838fece0f63be121334945", size = 641073, upload-time = "2025-08-07T13:42:57.23Z" }, { url = "https://files.pythonhosted.org/packages/f7/0b/bc13f787394920b23073ca3b6c4a7a21396301ed75a655bcb47196b50e6e/greenlet-3.2.4-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:710638eb93b1fa52823aa91bf75326f9ecdfd5e0466f00789246a5280f4ba0fc", size = 655191, upload-time = "2025-08-07T13:45:29.752Z" }, + { url = "https://files.pythonhosted.org/packages/f2/d6/6adde57d1345a8d0f14d31e4ab9c23cfe8e2cd39c3baf7674b4b0338d266/greenlet-3.2.4-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:c5111ccdc9c88f423426df3fd1811bfc40ed66264d35aa373420a34377efc98a", size = 649516, upload-time = "2025-08-07T13:53:16.314Z" }, { url = "https://files.pythonhosted.org/packages/7f/3b/3a3328a788d4a473889a2d403199932be55b1b0060f4ddd96ee7cdfcad10/greenlet-3.2.4-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:d76383238584e9711e20ebe14db6c88ddcedc1829a9ad31a584389463b5aa504", size = 652169, upload-time = "2025-08-07T13:18:32.861Z" }, { url = "https://files.pythonhosted.org/packages/ee/43/3cecdc0349359e1a527cbf2e3e28e5f8f06d3343aaf82ca13437a9aa290f/greenlet-3.2.4-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:23768528f2911bcd7e475210822ffb5254ed10d71f4028387e5a99b4c6699671", size = 610497, upload-time = "2025-08-07T13:18:31.636Z" }, { url = "https://files.pythonhosted.org/packages/b8/19/06b6cf5d604e2c382a6f31cafafd6f33d5dea706f4db7bdab184bad2b21d/greenlet-3.2.4-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:00fadb3fedccc447f517ee0d3fd8fe49eae949e1cd0f6a611818f4f6fb7dc83b", size = 1121662, upload-time = "2025-08-07T13:42:41.117Z" }, @@ -1302,6 +1304,7 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/22/5c/85273fd7cc388285632b0498dbbab97596e04b154933dfe0f3e68156c68c/greenlet-3.2.4-cp314-cp314-macosx_11_0_universal2.whl", hash = "sha256:49a30d5fda2507ae77be16479bdb62a660fa51b1eb4928b524975b3bde77b3c0", size = 273586, upload-time = "2025-08-07T13:16:08.004Z" }, { url = "https://files.pythonhosted.org/packages/d1/75/10aeeaa3da9332c2e761e4c50d4c3556c21113ee3f0afa2cf5769946f7a3/greenlet-3.2.4-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = 
"sha256:299fd615cd8fc86267b47597123e3f43ad79c9d8a22bebdce535e53550763e2f", size = 686346, upload-time = "2025-08-07T13:42:59.944Z" }, { url = "https://files.pythonhosted.org/packages/c0/aa/687d6b12ffb505a4447567d1f3abea23bd20e73a5bed63871178e0831b7a/greenlet-3.2.4-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:c17b6b34111ea72fc5a4e4beec9711d2226285f0386ea83477cbb97c30a3f3a5", size = 699218, upload-time = "2025-08-07T13:45:30.969Z" }, + { url = "https://files.pythonhosted.org/packages/dc/8b/29aae55436521f1d6f8ff4e12fb676f3400de7fcf27fccd1d4d17fd8fecd/greenlet-3.2.4-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:b4a1870c51720687af7fa3e7cda6d08d801dae660f75a76f3845b642b4da6ee1", size = 694659, upload-time = "2025-08-07T13:53:17.759Z" }, { url = "https://files.pythonhosted.org/packages/92/2e/ea25914b1ebfde93b6fc4ff46d6864564fba59024e928bdc7de475affc25/greenlet-3.2.4-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:061dc4cf2c34852b052a8620d40f36324554bc192be474b9e9770e8c042fd735", size = 695355, upload-time = "2025-08-07T13:18:34.517Z" }, { url = "https://files.pythonhosted.org/packages/72/60/fc56c62046ec17f6b0d3060564562c64c862948c9d4bc8aa807cf5bd74f4/greenlet-3.2.4-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:44358b9bf66c8576a9f57a590d5f5d6e72fa4228b763d0e43fee6d3b06d3a337", size = 657512, upload-time = "2025-08-07T13:18:33.969Z" }, { url = "https://files.pythonhosted.org/packages/23/6e/74407aed965a4ab6ddd93a7ded3180b730d281c77b765788419484cdfeef/greenlet-3.2.4-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:2917bdf657f5859fbf3386b12d68ede4cf1f04c90c3a6bc1f013dd68a22e2269", size = 1612508, upload-time = "2025-11-04T12:42:23.427Z" }, @@ -1516,11 +1519,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/2a/39/e50c7c3a983047577ee07d2a9e53faf5a69493943ec3f6a384bdc792deb2/httpx-0.28.1-py3-none-any.whl", hash = "sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad", size = 73517, upload-time = "2024-12-06T15:37:21.509Z" }, ] -[package.optional-dependencies] -socks = [ - { name = "socksio" }, -] - [[package]] name = "httpx-sse" version = "0.4.2" @@ -1816,12 +1814,14 @@ wheels = [ [[package]] name = "litellm" -version = "1.83.0" +version = "1.80.10" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "aiohttp" }, { name = "click" }, { name = "fastuuid" }, + { name = "grpcio", version = "1.67.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14'" }, + { name = "grpcio", version = "1.76.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.14'" }, { name = "httpx" }, { name = "importlib-metadata" }, { name = "jinja2" }, @@ -1832,9 +1832,9 @@ dependencies = [ { name = "tiktoken" }, { name = "tokenizers" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/22/92/6ce9737554994ca8e536e5f4f6a87cc7c4774b656c9eb9add071caf7d54b/litellm-1.83.0.tar.gz", hash = "sha256:860bebc76c4bb27b4cf90b4a77acd66dba25aced37e3db98750de8a1766bfb7a", size = 17333062, upload-time = "2026-03-31T05:08:25.331Z" } +sdist = { url = "https://files.pythonhosted.org/packages/dd/44/0aaa7449e7c4aa05668ec03f1f68a01b1e476591071d9659a68db19371a2/litellm-1.80.10.tar.gz", hash = "sha256:4a4aff7558945c2f7e5c6523e67c1b5525a46b10b0e1ad6b8f847cb13b16779e", size = 12764777, upload-time = "2025-12-14T02:07:05.362Z" } wheels = [ - { url = 
"https://files.pythonhosted.org/packages/19/2c/a670cc050fcd6f45c6199eb99e259c73aea92edba8d5c2fc1b3686d36217/litellm-1.83.0-py3-none-any.whl", hash = "sha256:88c536d339248f3987571493015784671ba3f193a328e1ea6780dbebaa2094a8", size = 15610306, upload-time = "2026-03-31T05:08:21.987Z" }, + { url = "https://files.pythonhosted.org/packages/8c/a9/4814b6aa58f6705df2831eaadeb5bc8240684c8c9d5964245212f85049d1/litellm-1.80.10-py3-none-any.whl", hash = "sha256:9b3e561efaba0eb1291cb1555d3dcb7283cf7f3cb65aadbcdb42e2a8765898c8", size = 11264240, upload-time = "2025-12-14T02:07:02.414Z" }, ] [[package]] @@ -2402,7 +2402,7 @@ wheels = [ [[package]] name = "openhands-agent-server" -version = "1.16.1" +version = "1.16.0" source = { editable = "vendor/software-agent-sdk/openhands-agent-server" } dependencies = [ { name = "aiosqlite" }, @@ -2467,6 +2467,7 @@ dependencies = [ { name = "python-json-logger" }, { name = "requests" }, { name = "swebench" }, + { name = "swesmith" }, { name = "swt-bench" }, { name = "tenacity" }, { name = "toml" }, @@ -2521,6 +2522,7 @@ requires-dist = [ { name = "python-json-logger", specifier = ">=3.3.0" }, { name = "requests" }, { name = "swebench", specifier = "==4.1.0" }, + { name = "swesmith", specifier = ">=0.0.9" }, { name = "swt-bench", git = "https://github.com/logic-star-ai/swt-bench.git?rev=5fdcd446ff05e248ecfffc19d560a210699f71f8" }, { name = "tenacity", specifier = ">=9.1.2" }, { name = "toml" }, @@ -2544,7 +2546,7 @@ dev = [ [[package]] name = "openhands-sdk" -version = "1.16.1" +version = "1.16.0" source = { editable = "vendor/software-agent-sdk/openhands-sdk" } dependencies = [ { name = "agent-client-protocol" }, @@ -2552,7 +2554,7 @@ dependencies = [ { name = "fakeredis", extra = ["lua"] }, { name = "fastmcp" }, { name = "filelock" }, - { name = "httpx", extra = ["socks"] }, + { name = "httpx" }, { name = "litellm" }, { name = "lmnr" }, { name = "pydantic" }, @@ -2575,8 +2577,8 @@ requires-dist = [ { name = "fakeredis", extras = ["lua"], specifier = ">=2.32.1" }, { name = "fastmcp", specifier = ">=3.0.0" }, { name = "filelock", specifier = ">=3.20.1" }, - { name = "httpx", extras = ["socks"], specifier = ">=0.27.0" }, - { name = "litellm", specifier = ">=1.82.6,!=1.82.7,!=1.82.8" }, + { name = "httpx", specifier = ">=0.27.0" }, + { name = "litellm", specifier = "==1.80.10" }, { name = "lmnr", specifier = ">=0.7.24" }, { name = "pydantic", specifier = ">=2.12.5" }, { name = "python-frontmatter", specifier = ">=1.1.0" }, @@ -2588,7 +2590,7 @@ provides-extras = ["boto3"] [[package]] name = "openhands-tools" -version = "1.16.1" +version = "1.16.0" source = { editable = "vendor/software-agent-sdk/openhands-tools" } dependencies = [ { name = "bashlex" }, @@ -2617,7 +2619,7 @@ requires-dist = [ [[package]] name = "openhands-workspace" -version = "1.16.1" +version = "1.16.0" source = { editable = "vendor/software-agent-sdk/openhands-workspace" } dependencies = [ { name = "openhands-agent-server" }, @@ -6707,15 +6709,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e9/44/75a9c9421471a6c4805dbf2356f7c181a29c1879239abab1ea2cc8f38b40/sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2", size = 10235, upload-time = "2024-02-25T23:20:01.196Z" }, ] -[[package]] -name = "socksio" -version = "1.0.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/f8/5c/48a7d9495be3d1c651198fd99dbb6ce190e2274d0f28b9051307bdec6b85/socksio-1.0.0.tar.gz", hash = 
"sha256:f88beb3da5b5c38b9890469de67d0cb0f9d494b78b106ca1845f96c10b91c4ac", size = 19055, upload-time = "2020-04-17T15:50:34.664Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/37/c3/6eeb6034408dac0fa653d126c9204ade96b819c936e136c5e8a6897eee9c/socksio-1.0.0-py3-none-any.whl", hash = "sha256:95dc1f15f9b34e8d7b16f06d74b8ccf48f609af32ab33c608d08761c5dcbb1f3", size = 12763, upload-time = "2020-04-17T15:50:31.878Z" }, -] - [[package]] name = "sortedcontainers" version = "2.4.0" @@ -6841,6 +6834,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/36/67/981d8b642ac3eac7c8a7b7832ff8b2fb74f96b28b5fcd9a8979879e5c46d/swebench-4.1.0-py3-none-any.whl", hash = "sha256:1243776f720047cc9e20a427f7a52b75c13a07abda6154fb60fe77f82ec8af57", size = 157231, upload-time = "2025-09-11T02:57:58.953Z" }, ] +[[package]] +name = "swesmith" +version = "0.0.9" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/07/97/e506b20fa59debc66e4660a86b0e98b45d32c87f23b994ad739e9c5d542a/swesmith-0.0.9.tar.gz", hash = "sha256:1726124ea43577853c6efb0a5a0db5fa3ce5c340e1bed479afa5bab85d8a69da", size = 214830, upload-time = "2026-02-27T01:06:13.455Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/80/2d/71b6ac5dadbe7199085de3815624775744d51b6c554efeeddfb12dc45ce1/swesmith-0.0.9-py3-none-any.whl", hash = "sha256:cbb98a52fc573b38032cde1179b6ce5f5862ce7c31d6931cfd5b8ad4969ce900", size = 275800, upload-time = "2026-02-27T01:06:11.864Z" }, +] + [[package]] name = "swt-bench" version = "1.0.1" diff --git a/vendor/software-agent-sdk b/vendor/software-agent-sdk index 3e0a3a091..acd5adc96 160000 --- a/vendor/software-agent-sdk +++ b/vendor/software-agent-sdk @@ -1 +1 @@ -Subproject commit 3e0a3a0915b369c7e2057c77722e98585855d30a +Subproject commit acd5adc965c08a0f815cf8e5f3166d1d090034d6 From 3963e9c50c4c47bbe7a47f7ace44e1d1cf57dcef Mon Sep 17 00:00:00 2001 From: Chujun Tao Date: Wed, 22 Apr 2026 20:57:53 -0400 Subject: [PATCH 04/12] Revert "feat(skillsbench): migrate harness from Harbor to benchflow 0.3.0" This reverts commit 4d31c87c8fb0c7ff8341bc37debe06851ad2a67b. --- .gitignore | 1 - benchmarks/skillsbench/README.md | 42 +- benchmarks/skillsbench/config.py | 11 +- benchmarks/skillsbench/run_infer.py | 655 +++++++++------------------ tests/test_skillsbench_eval_infer.py | 17 + tests/test_skillsbench_run_infer.py | 442 +++++------------- uv.lock | 50 +- vendor/software-agent-sdk | 2 +- 8 files changed, 396 insertions(+), 824 deletions(-) diff --git a/.gitignore b/.gitignore index 9164fd12b..459fad588 100644 --- a/.gitignore +++ b/.gitignore @@ -216,5 +216,4 @@ workspace/ # Evaluation outputs eval_outputs/ -evaluation_outputs/ builds/ diff --git a/benchmarks/skillsbench/README.md b/benchmarks/skillsbench/README.md index 21339842c..60ff73652 100644 --- a/benchmarks/skillsbench/README.md +++ b/benchmarks/skillsbench/README.md @@ -1,10 +1,10 @@ # SkillsBench Evaluation -This module provides integration with [SkillsBench](https://www.skillsbench.ai/), a benchmark for evaluating AI agents on real-world skill-based tasks. The integration uses [benchflow](https://github.com/benchflow-ai/benchflow) as the evaluation harness with the `openhands` agent. +This module provides integration with [SkillsBench](https://www.skillsbench.ai/), a benchmark for evaluating AI agents on real-world skill-based tasks. The integration uses [Harbor](https://harborframework.com) as the evaluation harness with the `openhands-sdk` agent. 
## Overview -SkillsBench comprises tasks across 11 domains, evaluating the efficacy of Skills augmentation in LLM-based agents. Domains include: +SkillsBench comprises tasks across 11 domains, evaluating the efficacy of Skills augmentation in LLM-based agents.Domains contain - Software engineering - Office & white collar @@ -20,25 +20,23 @@ SkillsBench comprises tasks across 11 domains, evaluating the efficacy of Skills ## Prerequisites -1. **Install benchflow**: benchflow is the official harness for running SkillsBench. +1. **Install Harbor**: Harbor is the official harness for running SkillsBench. ```bash - uv tool install benchflow==0.3.0 + pip install harbor # or - pip install benchflow==0.3.0 - # or - uv pip install benchflow==0.3.0 + uv pip install harbor ``` -2. **Docker**: benchflow requires Docker to be installed and running. +2. **Docker**: Harbor requires Docker to be installed and running. -3. **LLM API Key**: Configure your LLM provider credentials. The benchflow `openhands` agent reads `LLM_API_KEY` and optional `LLM_BASE_URL` from the environment. +3. **LLM API Key**: Configure your LLM provider credentials. ## Usage ### Running Inference -Run the SkillsBench evaluation using the `openhands` agent: +Run the SkillsBench evaluation using the OpenHands SDK agent: ```bash # Run full evaluation @@ -64,7 +62,7 @@ Create an LLM configuration file (e.g., `.llm_config/claude.json`): ```json { "model": "anthropic/claude-sonnet-4-20250514", - "api_key": "YOUR_ANTHROPIC_API_KEY" + "api_key": "YOUR_API_KEY" } ``` @@ -101,6 +99,8 @@ Each line contains: { "instance_id": "benchflow/task-name", "test_result": { + "trial_name": "...", + "trial_uri": "...", "rewards": {"reward": 1.0}, "passed": true }, @@ -134,21 +134,22 @@ Each line contains: ## Architecture -The integration uses the benchflow CLI as the evaluation harness: +The integration follows the Harbor agent adapter pattern: -1. **Task download**: the integration clones the SkillsBench task repo locally when the task cache is empty -2. **benchflow job**: Runs all tasks concurrently with `openhands` -3. **Result conversion**: Trial `result.json` files are converted to the standard `output.jsonl` format +1. **Harbor Harness**: Manages task containers and lifecycle +2. **OpenHands SDK Agent**: Runs inside containers to solve tasks +3. 
**ATIF Trajectories**: Results stored in Agent Trajectory Interchange Format ```text ┌──────────────────────────────────────────────────┐ -│ benchflow job │ +│ Harbor Harness │ │ ┌────────────────────────────────────────────┐ │ -│ │ Task Container (Docker) │ │ +│ │ Task Container │ │ │ │ ┌──────────────────────────────────────┐ │ │ -│ │ │ openhands │ │ │ +│ │ │ OpenHands SDK Agent │ │ │ │ │ │ - Terminal tool │ │ │ │ │ │ - File editor tool │ │ │ +│ │ │ - Task tracker tool │ │ │ │ │ └──────────────────────────────────────┘ │ │ │ └────────────────────────────────────────────┘ │ └──────────────────────────────────────────────────┘ @@ -157,5 +158,6 @@ The integration uses the benchflow CLI as the evaluation harness: ## References - [SkillsBench](https://www.skillsbench.ai/) - The benchmark -- [benchflow](https://github.com/benchflow-ai/benchflow) - The evaluation harness -- [benchflow CLI reference](https://github.com/benchflow-ai/benchflow/blob/main/docs/cli-reference.md) - CLI documentation +- [Harbor](https://harborframework.com) - The evaluation harness +- [OpenHands SDK](https://github.com/OpenHands/software-agent-sdk) - The agent SDK +- [ATIF Specification](https://github.com/laude-institute/harbor/blob/main/docs/rfcs/0001-trajectory-format.md) - Trajectory format diff --git a/benchmarks/skillsbench/config.py b/benchmarks/skillsbench/config.py index 4ed541ab9..8b55a92b0 100644 --- a/benchmarks/skillsbench/config.py +++ b/benchmarks/skillsbench/config.py @@ -1,13 +1,16 @@ """SkillsBench configuration defaults.""" -# Default inference settings +# Default inference settings (only include values actually used by argparse) INFER_DEFAULTS = { "dataset": "benchflow/skillsbench", "output_dir": "./evaluation_outputs", "num_workers": 1, } -# benchflow configuration defaults -BENCHFLOW_DEFAULTS = { - "agent_name": "openhands", +# Harbor configuration defaults +HARBOR_DEFAULTS = { + # Harbor executable + "harbor_executable": "harbor", + # Default agent name for openhands-sdk + "agent_name": "openhands-sdk", } diff --git a/benchmarks/skillsbench/run_infer.py b/benchmarks/skillsbench/run_infer.py index 2e11a100a..a8afa7281 100644 --- a/benchmarks/skillsbench/run_infer.py +++ b/benchmarks/skillsbench/run_infer.py @@ -1,31 +1,24 @@ -"""SkillsBench inference script using the benchflow SDK. +"""SkillsBench inference script using Harbor with openhands-sdk agent. -This script runs SkillsBench evaluation using `benchflow job` as the harness -and `openhands` as the default agent. Results are saved in a format compatible +This script runs SkillsBench evaluation using Harbor as the harness +and openhands-sdk as the agent. Results are saved in a format compatible with the standard evaluation pipeline. 
Usage: - uv run skillsbench-infer - - # Run specific tasks - uv run skillsbench-infer --select tasks.txt + uv run skillsbench-infer --dataset benchflow/skillsbench """ import argparse import json import os -import re -import shutil import subprocess import sys -import tempfile from datetime import datetime, timezone from pathlib import Path -import yaml from pydantic import SecretStr -from benchmarks.skillsbench.config import BENCHFLOW_DEFAULTS, INFER_DEFAULTS +from benchmarks.skillsbench.config import HARBOR_DEFAULTS, INFER_DEFAULTS from benchmarks.utils.evaluation_utils import construct_eval_output_dir from benchmarks.utils.report_costs import generate_cost_report from openhands.sdk import LLM, get_logger @@ -33,469 +26,256 @@ logger = get_logger(__name__) -# Matches benchflow 0.3.0 job directory names: YYYY-MM-DD__HH-MM-SS -_TIMESTAMP_RE = re.compile(r"^\d{4}-\d{2}-\d{2}__\d{2}-\d{2}-\d{2}$") - -# "Total cost: $0.0487" -_COST_RE = re.compile(r"Total cost:\s*\$([0-9]+(?:\.[0-9]+)?)") -# "Tokens: ↑ input 404.21K • ... • ↓ output 7.83K" -_TOKENS_RE = re.compile(r"↑ input\s+([\d.]+)([KMB]?)\b.*?↓ output\s+([\d.]+)([KMB]?)\b") - +# Output filename for results OUTPUT_FILENAME = "output.jsonl" -TASK_REPOS = { - "skillsbench": { - "repo": "https://github.com/benchflow-ai/skillsbench.git", - "subdir": "tasks", - } -} - -_DIRECT_PROVIDER_ENV_VARS: dict[str, tuple[tuple[str, ...], str | None]] = { - "anthropic": (("ANTHROPIC_API_KEY",), "ANTHROPIC_BASE_URL"), - "gemini": (("GEMINI_API_KEY", "GOOGLE_API_KEY"), "GEMINI_BASE_URL"), - "google": (("GEMINI_API_KEY", "GOOGLE_API_KEY"), "GEMINI_BASE_URL"), - "openai": (("OPENAI_API_KEY",), "OPENAI_BASE_URL"), -} - - -def _infer_direct_provider(model: str) -> str | None: - """Infer the provider prefix for direct model names. - - Examples: - - gemini/gemini-2.5-pro -> gemini - - anthropic/claude-sonnet-4-5 -> anthropic - - litellm_proxy/anthropic/... -> None (proxy config uses LLM_* vars) - """ - if not model or model.startswith("litellm_proxy/"): - return None - if "/" in model: - provider = model.split("/", 1)[0].lower() - if provider in _DIRECT_PROVIDER_ENV_VARS: - return provider - return None - - -def _build_benchflow_agent_env(llm: LLM) -> dict[str, str]: - """Build the sandbox environment for benchflow's openhands agent. - - Only LLM-specific variables are returned — these go INTO the sandbox - container via the ``agent_env`` YAML key. The calling process inherits - the host environment normally; dumping ``os.environ`` here would leak - the entire host env into every container. - """ - env: dict[str, str] = {} - api_key: str | None = None - if llm.api_key: - api_key = ( - llm.api_key.get_secret_value() - if isinstance(llm.api_key, SecretStr) - else llm.api_key - ) - env["LLM_API_KEY"] = api_key - if llm.base_url: - env["LLM_BASE_URL"] = llm.base_url - - provider = _infer_direct_provider(llm.model) - if provider and api_key: - key_vars, base_url_var = _DIRECT_PROVIDER_ENV_VARS[provider] - for var_name in key_vars: - env[var_name] = api_key - if llm.base_url and base_url_var: - env[base_url_var] = llm.base_url - - return env - - -def check_benchflow_installed() -> bool: - """Check if benchflow CLI is installed and available. - - Tries ``bench`` first (current name), then falls back to the legacy - ``benchflow`` binary. 
- """ - for cmd in ("bench", "benchflow"): - try: - result = subprocess.run( - [cmd, "--help"], - capture_output=True, - text=True, - timeout=10, - ) - if result.returncode == 0: - return True - except (FileNotFoundError, subprocess.TimeoutExpired): - continue - return False - -def _resolve_task_repo(dataset: str) -> tuple[str, dict[str, str]]: - """Map a benchflow dataset name to its task repository metadata.""" - dataset_name = dataset.split("@", 1)[0].split("/")[-1] +def check_harbor_installed() -> bool: + """Check if harbor CLI is installed and available.""" + harbor_exe = HARBOR_DEFAULTS["harbor_executable"] try: - return dataset_name, TASK_REPOS[dataset_name] - except KeyError as exc: - raise ValueError( - f"Unsupported SkillsBench dataset: {dataset!r}. " - f"Known datasets: {sorted(TASK_REPOS)}" - ) from exc - - -def ensure_tasks( - dataset: str, - tasks_dir: Path, - task_ids: list[str] | None = None, -) -> None: - """Download tasks for a benchflow dataset into tasks_dir. - - BenchFlow 0.3.0 does not expose ``benchflow tasks pull``, so we clone the - benchmark task repository directly when the local tasks directory is empty. - - When *task_ids* is provided, a sparse checkout is used so only the - requested task subdirectories are downloaded — much faster than a full - clone for large repos. - """ - if tasks_dir.exists() and any(tasks_dir.iterdir()): - logger.info(f"Tasks already present in {tasks_dir}, skipping download") - return - - _, repo_info = _resolve_task_repo(dataset) - tasks_dir.mkdir(parents=True, exist_ok=True) - clone_dir = tasks_dir.parent / "_clone" - if clone_dir.exists(): - shutil.rmtree(clone_dir, ignore_errors=True) - - subdir = repo_info.get("subdir", "") - - if task_ids: - # Sparse checkout: only download the specific task directories - short_names = [tid.split("/")[-1] for tid in task_ids] - - cmd_clone = [ - "git", - "clone", - "--no-checkout", - "--depth", - "1", - repo_info["repo"], - str(clone_dir), - ] - logger.info(f"Sparse clone: {' '.join(cmd_clone)}") - result = subprocess.run(cmd_clone, capture_output=True, text=True) - if result.returncode != 0: - raise RuntimeError(f"task download failed: {result.stderr}") - - # Init sparse-checkout and set the desired paths - subprocess.run( - ["git", "-C", str(clone_dir), "sparse-checkout", "init", "--cone"], + result = subprocess.run( + [harbor_exe, "--version"], capture_output=True, text=True, - check=True, + timeout=10, ) - sparse_paths = [f"{subdir}/{name}" if subdir else name for name in short_names] - subprocess.run( - ["git", "-C", str(clone_dir), "sparse-checkout", "set", *sparse_paths], - capture_output=True, - text=True, - check=True, - ) - subprocess.run( - ["git", "-C", str(clone_dir), "checkout"], - capture_output=True, - text=True, - check=True, - ) - else: - # Full shallow clone - cmd = ["git", "clone", "--depth", "1", repo_info["repo"], str(clone_dir)] - logger.info(f"Downloading tasks: {' '.join(cmd)}") - result = subprocess.run(cmd, capture_output=True, text=True) - if result.returncode != 0: - logger.error(f"Failed to clone tasks: {result.stderr}") - raise RuntimeError(f"task download failed: {result.stderr}") - - try: - source_dir = clone_dir / subdir if subdir else clone_dir - - for entry in source_dir.iterdir(): - target = tasks_dir / entry.name - if entry.is_dir(): - shutil.copytree(entry, target, dirs_exist_ok=True) - else: - shutil.copy2(entry, target) - finally: - shutil.rmtree(clone_dir, ignore_errors=True) + return result.returncode == 0 + except (FileNotFoundError, 
subprocess.TimeoutExpired): + return False - logger.info(f"Tasks downloaded to {tasks_dir}") - -def run_benchflow_job( +def run_harbor_evaluation( llm: LLM, - tasks_dir: Path, - jobs_dir: Path, + dataset: str, + output_dir: str, num_workers: int = 1, task_ids: list[str] | None = None, + n_limit: int | None = None, ) -> Path: - """Run benchflow job command. + """Run harbor evaluation with openhands-sdk agent. Args: llm: LLM configuration for the agent. - tasks_dir: Path to directory containing task subdirectories. - jobs_dir: Directory for benchflow job output. - num_workers: Number of parallel workers (concurrency). - task_ids: Optional list of task IDs to filter (short names, not full paths). + dataset: Harbor dataset name (e.g., benchflow/skillsbench). + output_dir: Directory to store output files. + num_workers: Number of parallel workers. + task_ids: Optional list of specific task IDs to run. + n_limit: Optional maximum number of dataset tasks to run. Returns: - Path to jobs_dir. + Path to the harbor output directory. """ - jobs_dir.mkdir(parents=True, exist_ok=True) - - agent_env = _build_benchflow_agent_env(llm) - # Ubuntu 24.04 enforces PEP 668 and blocks bare `pip install` without - # --break-system-packages. benchflow's openhands install_cmd uses plain - # `pip install openhands`, which silently fails (exit 0) on Ubuntu 24.04, - # causing "Agent openhands install failed (rc=1)". Setting this env var - # makes pip skip the restriction without modifying the install_cmd. - agent_env.setdefault("PIP_BREAK_SYSTEM_PACKAGES", "1") - config = { - "tasks_dir": str(tasks_dir), - "jobs_dir": str(jobs_dir.resolve()), - "agent": BENCHFLOW_DEFAULTS["agent_name"], - "model": llm.model, - "environment": "docker", - "concurrency": num_workers, - # OpenHands is installed inside the sandbox as root by benchflow's - # registry install command. Running as the default "agent" user can - # lose access to that binary on some task images. - "sandbox_user": None, - "agent_env": agent_env, - } - - with tempfile.NamedTemporaryFile( - mode="w", suffix=".yaml", prefix="benchflow-job-", delete=False - ) as tmp: - yaml.safe_dump(config, tmp, sort_keys=False) - config_path = tmp.name - - # Prefer `bench eval create` (current), fall back to legacy `benchflow job` - bench_bin = shutil.which("bench") or shutil.which("benchflow") or "bench" - if "benchflow" in bench_bin: - cmd = [bench_bin, "job", "--config", config_path] - else: - cmd = [bench_bin, "eval", "create", "-f", config_path] + harbor_output_dir = Path(output_dir) / "harbor_output" + harbor_output_dir.mkdir(parents=True, exist_ok=True) + harbor_exe = HARBOR_DEFAULTS["harbor_executable"] + + # Build harbor command using harbor CLI flags. + # Use absolute path for --jobs-dir to avoid CWD-relative path issues. 
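+    # Illustrative shape of the assembled command (placeholder values; extended
+    # below with --ae, --include-task-name, and --n-tasks when applicable):
+    #   harbor run -d benchflow/skillsbench -a openhands-sdk -m <model> \
+    #     --jobs-dir /abs/path/to/harbor_output --n-concurrent 4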
+ cmd = [ + harbor_exe, + "run", + "-d", + dataset, + "-a", + HARBOR_DEFAULTS["agent_name"], + "-m", + llm.model, + "--jobs-dir", + str(harbor_output_dir.resolve()), + "--n-concurrent", + str(num_workers), + ] + + # Pass LLM credentials as agent environment variables + if llm.api_key: + api_key = ( + llm.api_key.get_secret_value() + if isinstance(llm.api_key, SecretStr) + else llm.api_key + ) + cmd.extend(["--ae", f"LLM_API_KEY={api_key}"]) + if llm.base_url: + cmd.extend(["--ae", f"LLM_BASE_URL={llm.base_url}"]) - logger.info(f"Running: {' '.join(cmd)}") + # Add specific task names if provided + if task_ids: + for task_id in task_ids: + cmd.extend(["--include-task-name", task_id]) - # Inject LLM vars into the host process env so benchflow's provider - # resolution can pick them up; the subprocess inherits normally (env=None). - host_env = os.environ.copy() - host_env.update(agent_env) - result = subprocess.run(cmd, capture_output=True, text=True, env=host_env) - Path(config_path).unlink(missing_ok=True) + if n_limit is not None: + cmd.extend(["--n-tasks", str(n_limit)]) - if result.returncode != 0: - logger.error(f"benchflow job failed (code {result.returncode})") - logger.error(f"stdout: {result.stdout}") - logger.error(f"stderr: {result.stderr}") - raise RuntimeError(f"benchflow job failed: {result.stderr}") + logger.info(f"Running harbor command: {' '.join(cmd)}") + logger.info(f"Output directory: {harbor_output_dir}") - logger.info("benchflow job completed") - logger.info(f"stdout: {result.stdout}") + # harbor's openhands-sdk agent reads LLM credentials from the host process + # environment (os.environ), not from --ae flags which go to the sandbox. + env = os.environ.copy() + if llm.api_key: + api_key = ( + llm.api_key.get_secret_value() + if isinstance(llm.api_key, SecretStr) + else llm.api_key + ) + env["LLM_API_KEY"] = api_key + if llm.base_url: + env["LLM_BASE_URL"] = llm.base_url - return jobs_dir + try: + result = subprocess.run( + cmd, + capture_output=True, + text=True, + env=env, + ) + if result.returncode != 0: + logger.error(f"Harbor command failed with code {result.returncode}") + logger.error(f"stdout: {result.stdout}") + logger.error(f"stderr: {result.stderr}") + raise RuntimeError(f"Harbor evaluation failed: {result.stderr}") -def _extract_trial_metrics(trial_dir: Path) -> dict: - """Extract token/cost metrics from benchflow 0.3.0 trial output files. + logger.info("Harbor evaluation completed successfully") + logger.info(f"stdout: {result.stdout}") - benchflow 0.3.0 does not write cost/token fields to result.json. - Instead, metrics are read from: - 1. agent/trajectory.json → final_metrics (harbor-format agent) - 2. agent/openhands.txt → "Total cost:" and "Tokens:" lines (ACP agent) - """ - # 1. Harbor-format trajectory.json written by openhands-sdk agent - traj_file = trial_dir / "agent" / "trajectory.json" - if traj_file.exists(): - try: - with open(traj_file) as f: - traj = json.load(f) - fm = traj.get("final_metrics") or {} - if fm: - return { - "total_prompt_tokens": int(fm.get("total_prompt_tokens") or 0), - "total_completion_tokens": int( - fm.get("total_completion_tokens") or 0 - ), - "total_cost_usd": float(fm.get("total_cost_usd") or 0.0), - } - except (json.JSONDecodeError, OSError): - pass - - # 2. 
ACP agent log written by openhands acp (benchflow 0.3.0 native) - def _parse_token_count(value: str, suffix: str) -> int: - n = float(value) - return int( - n * {"K": 1_000, "M": 1_000_000, "B": 1_000_000_000}.get(suffix.upper(), 1) + except FileNotFoundError: + raise RuntimeError( + "Harbor CLI not found. Please install harbor: pip install harbor" ) - for log_name in ("openhands.txt", "openhands_sdk.txt"): - log_file = trial_dir / "agent" / log_name - if not log_file.exists(): - continue - try: - text = log_file.read_text(errors="replace") - cost_usd = 0.0 - prompt_tokens = 0 - completion_tokens = 0 - m = _COST_RE.search(text) - if m: - cost_usd = float(m.group(1)) - m = _TOKENS_RE.search(text) - if m: - prompt_tokens = _parse_token_count(m.group(1), m.group(2)) - completion_tokens = _parse_token_count(m.group(3), m.group(4)) - if cost_usd or prompt_tokens: - return { - "total_prompt_tokens": prompt_tokens, - "total_completion_tokens": completion_tokens, - "total_cost_usd": cost_usd, - } - except OSError: - pass - - return { - "total_prompt_tokens": 0, - "total_completion_tokens": 0, - "total_cost_usd": 0.0, - } + return harbor_output_dir + + +def _find_job_dir(harbor_output_dir: Path) -> Path: + """Find the harbor job directory (timestamp-named) inside the output dir.""" + # Harbor creates a timestamp-named subdirectory (e.g., 2026-03-07__16-08-47) + # containing result.json and trial subdirectories + candidates = [ + d + for d in harbor_output_dir.iterdir() + if d.is_dir() and (d / "result.json").exists() + ] + if not candidates: + raise RuntimeError( + f"No harbor job directory found in {harbor_output_dir}. " + f"Expected a timestamp-named directory containing result.json." + ) + # Use the most recent job directory if multiple exist + return sorted(candidates)[-1] -def convert_benchflow_to_eval_output( - jobs_dir: Path, +def convert_harbor_to_eval_output( + harbor_output_dir: Path, eval_output_path: Path, - task_ids: list[str] | None = None, ) -> None: - """Convert benchflow job output to standard evaluation output format. + """Convert harbor output to evaluation output format. - benchflow 0.3.0 stores trial results as: - jobs_dir/YYYY-MM-DD__HH-MM-SS/TASK_NAME__UUID8/result.json + Harbor stores trial results in a job directory structured as: + harbor_output/TIMESTAMP/TRIAL_NAME/result.json - Each result.json contains task_name, rewards, error, verifier_error, and timing. + Each trial's result.json contains task_name, verifier_result, agent_result, + timing info, and exception details. Args: - jobs_dir: Path to benchflow jobs directory. - eval_output_path: Path to write output.jsonl. - task_ids: Optional filter for specific task IDs (short names). + harbor_output_dir: Path to harbor output directory. + eval_output_path: Path to write the converted output.jsonl. """ - logger.info(f"Converting benchflow output from {jobs_dir}") - - # benchflow 0.3.0 writes: - # jobs/summary.json - # jobs/TIMESTAMP/TRIAL_NAME/result.json - # while older local outputs may place results directly under jobs/. 
- job_dirs = [d for d in jobs_dir.iterdir() if d.is_dir()] - timestamp_job_dirs = [d for d in job_dirs if _TIMESTAMP_RE.match(d.name)] - - if timestamp_job_dirs: - selected_job_dir = sorted(timestamp_job_dirs)[-1] - logger.info(f"Using benchflow job directory: {selected_job_dir}") - task_dirs = [d for d in selected_job_dir.iterdir() if d.is_dir()] - else: - task_dirs = job_dirs + logger.info(f"Converting harbor output from {harbor_output_dir}") - if not task_dirs: - raise RuntimeError(f"No task directories found in {jobs_dir}") + job_dir = _find_job_dir(harbor_output_dir) + logger.info(f"Using harbor job directory: {job_dir}") - if task_ids: - short_ids = {tid.split("/")[-1] for tid in task_ids} - task_dirs = [d for d in task_dirs if d.name.split("__")[0] in short_ids] + # Find trial result files (each trial dir has a result.json) + result_files = list(job_dir.glob("*/result.json")) + # Exclude the job-level result.json + result_files = [f for f in result_files if f.parent != job_dir] + + if not result_files: + raise RuntimeError( + f"No trial result files found in {job_dir}. " + f"Expected result.json files in trial subdirectories." + ) - logger.info(f"Processing {len(task_dirs)} task directories") + logger.info(f"Found {len(result_files)} trial results in {job_dir}") results: list[dict] = [] errors: list[dict] = [] - for task_dir in sorted(task_dirs): - # Find the trial result — benchflow writes trial-0/result.json - trial_results = list(task_dir.glob("trial-*/result.json")) - if not trial_results: - # Fall back to a direct result.json - direct = task_dir / "result.json" - if direct.exists(): - trial_results = [direct] - - if not trial_results: - logger.warning(f"No result.json found in {task_dir}, skipping") - errors.append( - { - "instance_id": f"benchflow/{task_dir.name}", - "error": "No result.json found", - "test_result": {}, - } - ) - continue - - # Use the last trial (highest retry index) - result_file = sorted(trial_results)[-1] - + for result_file in result_files: try: with open(result_file) as f: trial = json.load(f) - task_basename = task_dir.name.split("__")[0] - task_name = trial.get("task_name") or f"benchflow/{task_basename}" - # Normalise to benchflow/ form - if "/" not in task_name: - task_name = f"benchflow/{task_name}" + instance_id = trial.get("task_name", result_file.parent.name) - error = trial.get("error") - verifier_error = trial.get("verifier_error") - - if error or verifier_error: + # Check for exceptions + if trial.get("exception_info"): errors.append( { - "instance_id": task_name, - "error": str(error or verifier_error), + "instance_id": instance_id, + "error": str(trial["exception_info"]), "test_result": {}, } ) continue - rewards = trial.get("rewards") or {} - passed = bool(rewards.get("reward", 0.0)) + # Extract verifier results + verifier_result = trial.get("verifier_result", {}) + rewards = verifier_result.get("rewards", {}) + passed = rewards.get("reward", 0.0) > 0 + + # Extract agent metrics + agent_result = trial.get("agent_result", {}) eval_entry = { - "instance_id": task_name, + "instance_id": instance_id, "test_result": { + "trial_name": trial.get("trial_name"), + "trial_uri": trial.get("trial_uri"), "rewards": rewards, "passed": passed, }, "instruction": "", "error": None, "history": [], - "metrics": _extract_trial_metrics(result_file.parent), + "metrics": { + "total_prompt_tokens": agent_result.get("n_input_tokens") or 0, + "total_completion_tokens": ( + agent_result.get("n_output_tokens") or 0 + ), + "total_cost_usd": agent_result.get("cost_usd") 
or 0.0, + }, } results.append(eval_entry) - logger.info(f"Processed {task_name}: reward={rewards.get('reward', 'N/A')}") + logger.info( + f"Processed trial {instance_id}: reward={rewards.get('reward', 'N/A')}" + ) except (json.JSONDecodeError, OSError) as e: - logger.error(f"Failed to read {result_file}: {e}") + logger.error(f"Failed to process result file {result_file}: {e}") errors.append( { - "instance_id": f"benchflow/{task_dir.name}", + "instance_id": result_file.parent.name, "error": str(e), "test_result": {}, } ) if not results and not errors: - raise RuntimeError(f"No trials processed from {jobs_dir}") + raise RuntimeError(f"No trials processed from {harbor_output_dir}") if not results: logger.warning( - f"All {len(errors)} trials failed; writing error entries for reporting" + f"All {len(errors)} trials failed in {harbor_output_dir}; " + "writing error entries for downstream reporting" ) + # Write results to output.jsonl with open(eval_output_path, "w") as f: - for entry in results + errors: + for entry in results: + f.write(json.dumps(entry) + "\n") + for entry in errors: f.write(json.dumps(entry) + "\n") logger.info( @@ -518,18 +298,18 @@ def load_task_ids_from_file(filepath: str) -> list[str]: def main() -> None: """Main entry point for skillsbench inference.""" parser = argparse.ArgumentParser( - description="Run SkillsBench evaluation with benchflow and openhands", + description="Run SkillsBench evaluation with openhands-sdk via Harbor", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" Examples: # Run full skillsbench evaluation uv run skillsbench-infer .llm_config/claude.json - # Run specific tasks from a file + # Run specific tasks uv run skillsbench-infer .llm_config/claude.json --select tasks.txt - # Run with more concurrency - uv run skillsbench-infer .llm_config/claude.json --num-workers 4 + # Run with custom dataset version + uv run skillsbench-infer .llm_config/claude.json --dataset benchflow/skillsbench@1.0 """, ) @@ -542,7 +322,7 @@ def main() -> None: "--dataset", type=str, default=INFER_DEFAULTS["dataset"], - help="benchflow dataset name (e.g., benchflow/skillsbench)", + help="Harbor dataset name (e.g., benchflow/skillsbench)", ) parser.add_argument( "--output-dir", @@ -554,12 +334,12 @@ def main() -> None: "--num-workers", type=int, default=INFER_DEFAULTS["num_workers"], - help="Number of parallel workers (concurrency)", + help="Number of parallel workers", ) parser.add_argument( "--n-limit", type=int, - help="Maximum number of tasks to run", + help="Maximum number of dataset tasks to run after Harbor filtering", ) parser.add_argument( "--select", @@ -578,13 +358,14 @@ def main() -> None: help="Optional note for the evaluation run", ) parser.add_argument( - "--skip-run", + "--skip-harbor", action="store_true", - help="Skip running benchflow and only convert existing results", + help="Skip running harbor and only convert existing results", ) args = parser.parse_args() + # Validate LLM config if not os.path.isfile(args.llm_config_path): logger.error(f"LLM config file does not exist: {args.llm_config_path}") sys.exit(1) @@ -594,103 +375,87 @@ def main() -> None: llm = LLM.model_validate_json(llm_config) logger.info(f"Using LLM: {llm.model}") - if not args.skip_run and not check_benchflow_installed(): + # Check harbor installation + if not args.skip_harbor and not check_harbor_installed(): logger.error( - "benchflow CLI is not installed. Please install it:\n" - " uv tool install benchflow==0.3.0\n" + "Harbor CLI is not installed. 
Please install it:\n" + " pip install harbor\n" " # or\n" - " pip install benchflow==0.3.0\n" - " # or\n" - " uv pip install benchflow==0.3.0" + " uv pip install harbor" ) sys.exit(1) + # Construct output directory dataset_description = args.dataset.replace("/", "__").replace("@", "-") structured_output_dir = construct_eval_output_dir( base_dir=args.output_dir, dataset_name=dataset_description, model_name=llm.model, - max_iterations=100, + max_iterations=100, # Not directly used but required for path construction eval_note=args.note, ) logger.info(f"Output directory: {structured_output_dir}") os.makedirs(structured_output_dir, exist_ok=True) + # Save metadata metadata = { "llm": llm.model_dump_json(), "dataset": args.dataset, "timestamp": datetime.now(timezone.utc).isoformat(), - "benchflow_agent": BENCHFLOW_DEFAULTS["agent_name"], + "harbor_agent": HARBOR_DEFAULTS["agent_name"], "note": args.note, } metadata_path = Path(structured_output_dir) / "metadata.json" with open(metadata_path, "w") as f: json.dump(metadata, f, indent=2) + # Collect task IDs if specified task_ids: list[str] | None = None if args.select: - task_ids = load_task_ids_from_file(args.select) - logger.info(f"Loaded {len(task_ids)} task IDs from {args.select}") + loaded_ids = load_task_ids_from_file(args.select) + task_ids = loaded_ids + logger.info(f"Loaded {len(loaded_ids)} task IDs from {args.select}") elif args.task_id: task_ids = list(args.task_id) logger.info(f"Running {len(task_ids)} specified task IDs") - tasks_dir = Path(structured_output_dir) / "tasks" - jobs_dir = Path(structured_output_dir) / "jobs" output_path = Path(structured_output_dir) / OUTPUT_FILENAME - if not args.skip_run: + if not args.skip_harbor: + # Run harbor evaluation try: - ensure_tasks(args.dataset, tasks_dir, task_ids=task_ids) - - # Apply n_limit by slicing available task directories - effective_task_dirs = tasks_dir - if args.n_limit is not None or task_ids is not None: - all_dirs = sorted(d for d in tasks_dir.iterdir() if d.is_dir()) - if task_ids: - short_ids = {tid.split("/")[-1] for tid in task_ids} - all_dirs = [d for d in all_dirs if d.name in short_ids] - if args.n_limit is not None: - all_dirs = all_dirs[: args.n_limit] - - # Write a filtered tasks dir symlink tree - filtered_tasks_dir = Path(structured_output_dir) / "tasks_filtered" - filtered_tasks_dir.mkdir(exist_ok=True) - for d in all_dirs: - link = filtered_tasks_dir / d.name - if not link.exists(): - link.symlink_to(d.resolve()) - effective_task_dirs = filtered_tasks_dir - - run_benchflow_job( + harbor_output_dir = run_harbor_evaluation( llm=llm, - tasks_dir=effective_task_dirs, - jobs_dir=jobs_dir, + dataset=args.dataset, + output_dir=structured_output_dir, num_workers=args.num_workers, task_ids=task_ids, + n_limit=args.n_limit, ) - convert_benchflow_to_eval_output( - jobs_dir=jobs_dir, + # Convert harbor output to standard format + convert_harbor_to_eval_output( + harbor_output_dir=harbor_output_dir, eval_output_path=output_path, - task_ids=task_ids, ) except Exception as e: logger.error(f"Evaluation failed: {e}") sys.exit(1) else: - if jobs_dir.exists(): - convert_benchflow_to_eval_output( - jobs_dir=jobs_dir, + # Skip harbor, just convert existing results + harbor_output_dir = Path(structured_output_dir) / "harbor_output" + if harbor_output_dir.exists(): + convert_harbor_to_eval_output( + harbor_output_dir=harbor_output_dir, eval_output_path=output_path, - task_ids=task_ids, ) else: - logger.error(f"No jobs output found at {jobs_dir}") + logger.error(f"No harbor output 
found at {harbor_output_dir}") sys.exit(1) + # Generate cost report if output_path.exists(): generate_cost_report(str(output_path)) diff --git a/tests/test_skillsbench_eval_infer.py b/tests/test_skillsbench_eval_infer.py index 1334da297..56d54f27a 100644 --- a/tests/test_skillsbench_eval_infer.py +++ b/tests/test_skillsbench_eval_infer.py @@ -39,6 +39,23 @@ def test_resolved_instance(self, tmp_path: Path) -> None: assert result["unresolved_instances"] == 0 assert "benchflow/weighted-gdp-calc" in result["resolved_ids"] + def test_unresolved_instance(self, tmp_path: Path) -> None: + """Test processing an unresolved (passed=False) instance.""" + input_file = tmp_path / "unresolved.jsonl" + output_file = tmp_path / "unresolved.report.json" + + entry = { + "instance_id": "benchflow/task-1", + "test_result": {"passed": False, "rewards": {"reward": 0.0}}, + "error": None, + } + input_file.write_text(json.dumps(entry) + "\n") + + result = process_skillsbench_results(str(input_file), str(output_file)) + + assert result["resolved_instances"] == 0 + assert result["unresolved_instances"] == 1 + def test_instance_with_error(self, tmp_path: Path) -> None: """Test processing an instance that errored.""" input_file = tmp_path / "error.jsonl" diff --git a/tests/test_skillsbench_run_infer.py b/tests/test_skillsbench_run_infer.py index 784b4d1cc..5f8452cb3 100644 --- a/tests/test_skillsbench_run_infer.py +++ b/tests/test_skillsbench_run_infer.py @@ -4,102 +4,82 @@ from pathlib import Path import pytest -import yaml -from benchmarks.skillsbench.config import BENCHFLOW_DEFAULTS, INFER_DEFAULTS +from benchmarks.skillsbench.config import INFER_DEFAULTS from benchmarks.skillsbench.run_infer import ( - _build_benchflow_agent_env, - convert_benchflow_to_eval_output, - run_benchflow_job, + convert_harbor_to_eval_output, + run_harbor_evaluation, ) from openhands.sdk import LLM -class TestRunBenchflowJob: - """Tests for building benchflow job invocation arguments.""" +class TestRunHarborEvaluation: + """Tests for building Harbor invocation arguments.""" - def test_default_dataset_matches_benchflow_registry(self) -> None: - """Test that the default dataset name matches benchflow's published registry.""" + def test_default_dataset_matches_harbor_registry(self) -> None: + """Test that the default dataset name matches Harbor's published registry.""" assert INFER_DEFAULTS["dataset"] == "benchflow/skillsbench" - def test_default_agent_is_openhands(self) -> None: - """Test that the default agent is openhands.""" - assert BENCHFLOW_DEFAULTS["agent_name"] == "openhands" - - def test_run_benchflow_job_passes_model_and_concurrency( + def test_run_harbor_evaluation_passes_filters_and_limits( self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch ) -> None: - """Test benchflow job command writes the expected YAML config.""" - captured_cmd: list[str] = [] - captured_env: dict[str, str] = {} - captured_config: dict = {} - - # Force legacy benchflow binary path so the command format is deterministic - monkeypatch.setattr( - "benchmarks.skillsbench.run_infer.shutil.which", - lambda name: "/usr/local/bin/benchflow" if name == "benchflow" else None, - ) + """Test Harbor command includes task filters and n-limit.""" + captured: dict[str, list[str]] = {} def fake_run(cmd: list[str], capture_output: bool, text: bool, env: dict): - captured_cmd[:] = cmd - captured_env.clear() - captured_env.update(env) - with open(cmd[3]) as f: - captured_config.update(yaml.safe_load(f)) + captured["cmd"] = cmd return type( "Completed", (), - {"returncode": 
0, "stdout": "Score: 1/1 (100%)", "stderr": ""}, + {"returncode": 0, "stdout": "ok", "stderr": ""}, )() monkeypatch.setattr("benchmarks.skillsbench.run_infer.subprocess.run", fake_run) - tasks_dir = tmp_path / "tasks" - tasks_dir.mkdir() - jobs_dir = tmp_path / "jobs" - - run_benchflow_job( + harbor_output_dir = run_harbor_evaluation( llm=LLM( - model="anthropic/claude-sonnet-4-5", + model="litellm_proxy/test-model", api_key="test-key", base_url="https://proxy.example.com", ), - tasks_dir=tasks_dir, - jobs_dir=jobs_dir, - num_workers=4, - ) - - cmd = captured_cmd - assert cmd[0] == "/usr/local/bin/benchflow" - assert cmd[1] == "job" - assert cmd[2] == "--config" - assert captured_config["tasks_dir"] == str(tasks_dir) - assert captured_config["jobs_dir"] == str(jobs_dir.resolve()) - assert captured_config["agent"] == "openhands" - assert captured_config["model"] == "anthropic/claude-sonnet-4-5" - assert captured_config["concurrency"] == 4 - assert captured_config["sandbox_user"] is None - - def test_llm_credentials_passed_via_subprocess_env( + dataset=INFER_DEFAULTS["dataset"], + output_dir=str(tmp_path), + num_workers=2, + task_ids=["benchflow/task-a", "benchflow/task-b"], + n_limit=3, + ) + + expected_output_dir = tmp_path / "harbor_output" + assert harbor_output_dir == expected_output_dir + + cmd = captured["cmd"] + assert cmd[:8] == [ + "harbor", + "run", + "-d", + "benchflow/skillsbench", + "-a", + "openhands-sdk", + "-m", + "litellm_proxy/test-model", + ] + assert "--jobs-dir" in cmd + assert str(expected_output_dir.resolve()) in cmd + assert cmd.count("--include-task-name") == 2 + assert "benchflow/task-a" in cmd + assert "benchflow/task-b" in cmd + assert cmd[cmd.index("--n-concurrent") + 1] == "2" + assert cmd[cmd.index("--n-tasks") + 1] == "3" + + def test_llm_credentials_passed_via_env( self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch ) -> None: - """Test that LLM credentials are passed via subprocess env and YAML.""" - captured_cmd: list[str] = [] - captured_env: dict[str, str] = {} - captured_config: dict = {} - - # Force legacy benchflow binary path so the command format is deterministic - monkeypatch.setattr( - "benchmarks.skillsbench.run_infer.shutil.which", - lambda name: "/usr/local/bin/benchflow" if name == "benchflow" else None, - ) + """Test that LLM credentials are passed via subprocess env, not --ae flags.""" + captured: dict = {} def fake_run(cmd: list[str], capture_output: bool, text: bool, env: dict): - captured_cmd[:] = cmd - captured_env.clear() - captured_env.update(env) - with open(cmd[3]) as f: - captured_config.update(yaml.safe_load(f)) + captured["cmd"] = cmd + captured["env"] = env return type( "Completed", (), @@ -108,127 +88,60 @@ def fake_run(cmd: list[str], capture_output: bool, text: bool, env: dict): monkeypatch.setattr("benchmarks.skillsbench.run_infer.subprocess.run", fake_run) - tasks_dir = tmp_path / "tasks" - tasks_dir.mkdir() - - run_benchflow_job( + run_harbor_evaluation( llm=LLM( model="test-model", api_key="my-secret-key", base_url="https://my-proxy.example.com", ), - tasks_dir=tasks_dir, - jobs_dir=tmp_path / "jobs", - ) - - # Credentials in subprocess env - assert captured_env["LLM_API_KEY"] == "my-secret-key" - assert captured_env["LLM_BASE_URL"] == "https://my-proxy.example.com" - assert "--ae" not in captured_cmd - assert captured_config["agent_env"]["LLM_API_KEY"] == "my-secret-key" - assert ( - captured_config["agent_env"]["LLM_BASE_URL"] - == "https://my-proxy.example.com" - ) - - def 
test_direct_gemini_model_sets_provider_env_vars(self) -> None: - """Direct provider models need provider-specific env vars.""" - env = _build_benchflow_agent_env( - LLM( - model="gemini/gemini-3.1-flash-lite-preview", - api_key="gemini-test-key", - ) - ) - - assert env["LLM_API_KEY"] == "gemini-test-key" - assert env["GEMINI_API_KEY"] == "gemini-test-key" - assert env["GOOGLE_API_KEY"] == "gemini-test-key" - - def test_proxy_model_does_not_set_provider_env_vars(self) -> None: - """LiteLLM proxy configs should keep using generic LLM_* vars only.""" - env = _build_benchflow_agent_env( - LLM( - model="litellm_proxy/anthropic/claude-sonnet-4-20250514", - api_key="proxy-key", - base_url="https://proxy.example.com", - ) + dataset=INFER_DEFAULTS["dataset"], + output_dir=str(tmp_path), ) - assert env["LLM_API_KEY"] == "proxy-key" - assert env["LLM_BASE_URL"] == "https://proxy.example.com" - assert "ANTHROPIC_API_KEY" not in env - assert "ANTHROPIC_BASE_URL" not in env + assert captured["env"]["LLM_API_KEY"] == "my-secret-key" + assert captured["env"]["LLM_BASE_URL"] == "https://my-proxy.example.com" -class TestConvertBenchflowToEvalOutput: - """Tests for convert_benchflow_to_eval_output function.""" +class TestConvertHarborToEvalOutput: + """Tests for convert_harbor_to_eval_output function.""" - def _create_benchflow_structure( - self, tmp_path: Path, tasks: list[tuple[str, dict]] - ) -> Path: - """Create a mock benchflow jobs directory structure. - - benchflow writes: jobs_dir/TASK_NAME/trial-0/result.json - """ - jobs_dir = tmp_path / "jobs" - for task_name, result in tasks: - trial_dir = jobs_dir / task_name / "trial-0" - trial_dir.mkdir(parents=True) - (trial_dir / "result.json").write_text(json.dumps(result)) - return jobs_dir - - def _create_benchflow_timestamped_job( - self, tmp_path: Path, tasks: list[tuple[str, dict]] + def _create_harbor_structure( + self, tmp_path: Path, trials: list[tuple[str, dict]] ) -> Path: - """Create a mock benchflow 0.3.0 jobs directory structure. - - benchflow writes: jobs/TIMESTAMP/TASK_NAME__RUNID/result.json - """ - jobs_dir = tmp_path / "jobs" - job_dir = jobs_dir / "2026-04-21__23-12-35" + """Create a mock Harbor output structure.""" + harbor_dir = tmp_path / "harbor_output" + job_dir = harbor_dir / "2026-01-01__00-00-00" job_dir.mkdir(parents=True) - (jobs_dir / "summary.json").write_text(json.dumps({"total": len(tasks)})) - for task_name, result in tasks: - trial_dir = job_dir / f"{task_name}__abc123" - trial_dir.mkdir(parents=True) - (trial_dir / "result.json").write_text(json.dumps(result)) - return jobs_dir + (job_dir / "result.json").write_text(json.dumps({"id": "test-job"})) - def test_successful_trial_parsing(self, tmp_path: Path) -> None: - """Test successful parsing of a benchflow trial result. + for trial_name, trial_result in trials: + trial_dir = job_dir / trial_name + trial_dir.mkdir() + (trial_dir / "result.json").write_text(json.dumps(trial_result)) - benchflow 0.3.0 does not write cost/token fields to result.json. - Metrics are read from agent/trajectory.json (harbor-format agent) - or parsed from agent/openhands.txt (ACP agent stdout). 
- """ + return harbor_dir + + def test_successful_trial_parsing(self, tmp_path: Path) -> None: + """Test successful parsing of harbor trial result.""" trial_result = { "task_name": "benchflow/weighted-gdp-calc", - "rewards": {"reward": 1.0}, - "error": None, + "trial_name": "weighted-gdp-calc__abc123", + "trial_uri": "file:///path/to/trial", + "agent_result": { + "n_input_tokens": 1000, + "n_output_tokens": 200, + "cost_usd": 0.05, + }, + "verifier_result": {"rewards": {"reward": 1.0}}, + "exception_info": None, } - jobs_dir = self._create_benchflow_structure( - tmp_path, [("weighted-gdp-calc", trial_result)] - ) - # Write agent/trajectory.json with final_metrics (harbor-format agent output). - # agent/ sits next to result.json, inside the trial-0 subdirectory. - trial_dir = jobs_dir / "weighted-gdp-calc" / "trial-0" - agent_dir = trial_dir / "agent" - agent_dir.mkdir(parents=True, exist_ok=True) - (agent_dir / "trajectory.json").write_text( - json.dumps( - { - "final_metrics": { - "total_prompt_tokens": 1000, - "total_completion_tokens": 200, - "total_cost_usd": 0.05, - } - } - ) + harbor_dir = self._create_harbor_structure( + tmp_path, [("weighted-gdp-calc__abc123", trial_result)] ) output_file = tmp_path / "output.jsonl" - convert_benchflow_to_eval_output(jobs_dir, output_file) + convert_harbor_to_eval_output(harbor_dir, output_file) assert output_file.exists() with open(output_file) as f: @@ -238,52 +151,26 @@ def test_successful_trial_parsing(self, tmp_path: Path) -> None: assert entries[0]["instance_id"] == "benchflow/weighted-gdp-calc" assert entries[0]["test_result"]["passed"] is True assert entries[0]["metrics"]["total_cost_usd"] == 0.05 - assert entries[0]["metrics"]["total_prompt_tokens"] == 1000 - assert entries[0]["metrics"]["total_completion_tokens"] == 200 - - def test_metrics_from_acp_agent_log(self, tmp_path: Path) -> None: - """Test that metrics are extracted from agent/openhands.txt (ACP agent).""" - trial_result = { - "task_name": "benchflow/acp-task", - "rewards": {"reward": 1.0}, - "error": None, - } - jobs_dir = self._create_benchflow_timestamped_job( - tmp_path, [("acp-task", trial_result)] - ) - # Write agent/openhands.txt simulating openhands ACP stdout - trial_dir = jobs_dir / "2026-04-21__23-12-35" / "acp-task__abc123" - agent_dir = trial_dir / "agent" - agent_dir.mkdir(parents=True, exist_ok=True) - (agent_dir / "openhands.txt").write_text( - "OpenHands SDK v1.16.0\n" - "Tokens: ↑ input 404.21K • cache hit 70.47% • reasoning 579 • ↓ output 7.83K • $0.0487\n" - "Total cost: $0.0487\n" - ) - output_file = tmp_path / "output.jsonl" - convert_benchflow_to_eval_output(jobs_dir, output_file) - - with open(output_file) as f: - entries = [json.loads(line) for line in f] - - assert len(entries) == 1 - assert entries[0]["metrics"]["total_cost_usd"] == pytest.approx(0.0487) - assert entries[0]["metrics"]["total_prompt_tokens"] == 404210 - assert entries[0]["metrics"]["total_completion_tokens"] == 7830 def test_failed_trial(self, tmp_path: Path) -> None: """Test parsing of a trial with reward 0.""" trial_result = { "task_name": "benchflow/task-1", - "rewards": {"reward": 0.0}, - "error": None, + "trial_name": "task-1__xyz", + "agent_result": { + "n_input_tokens": None, + "n_output_tokens": None, + "cost_usd": None, + }, + "verifier_result": {"rewards": {"reward": 0.0}}, + "exception_info": None, } - jobs_dir = self._create_benchflow_structure( - tmp_path, [("task-1", trial_result)] + harbor_dir = self._create_harbor_structure( + tmp_path, [("task-1__xyz", trial_result)] ) 
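+        # agent_result values of None should be coerced to 0 / 0.0 by the converter.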
output_file = tmp_path / "output.jsonl" - convert_benchflow_to_eval_output(jobs_dir, output_file) + convert_harbor_to_eval_output(harbor_dir, output_file) with open(output_file) as f: entries = [json.loads(line) for line in f] @@ -291,19 +178,21 @@ def test_failed_trial(self, tmp_path: Path) -> None: assert entries[0]["test_result"]["passed"] is False assert entries[0]["metrics"]["total_cost_usd"] == 0.0 - def test_trial_with_error(self, tmp_path: Path) -> None: - """Test that errored trials are written as error entries.""" + def test_trial_with_exception(self, tmp_path: Path) -> None: + """Test that exception trials are written as error entries.""" trial_result = { "task_name": "benchflow/error-task", - "rewards": {}, - "error": "LLM_API_KEY not set", + "trial_name": "error-task__err", + "agent_result": {}, + "verifier_result": {}, + "exception_info": {"type": "ValueError", "message": "LLM_API_KEY not set"}, } - jobs_dir = self._create_benchflow_structure( - tmp_path, [("error-task", trial_result)] + harbor_dir = self._create_harbor_structure( + tmp_path, [("error-task__err", trial_result)] ) output_file = tmp_path / "output.jsonl" - convert_benchflow_to_eval_output(jobs_dir, output_file) + convert_harbor_to_eval_output(harbor_dir, output_file) with open(output_file) as f: entries = [json.loads(line) for line in f] @@ -313,121 +202,20 @@ def test_trial_with_error(self, tmp_path: Path) -> None: assert entries[0]["error"] is not None assert entries[0]["test_result"] == {} - def test_missing_jobs_directory(self, tmp_path: Path) -> None: - """Test handling when jobs directory is empty.""" - jobs_dir = tmp_path / "jobs" - jobs_dir.mkdir() - - with pytest.raises(RuntimeError, match="No task directories found"): - convert_benchflow_to_eval_output(jobs_dir, tmp_path / "output.jsonl") - - def test_task_id_filtering(self, tmp_path: Path) -> None: - """Test that only specified task IDs are converted.""" - trials = [ - ( - "task-a", - { - "task_name": "benchflow/task-a", - "rewards": {"reward": 1.0}, - "error": None, - }, - ), - ( - "task-b", - { - "task_name": "benchflow/task-b", - "rewards": {"reward": 0.0}, - "error": None, - }, - ), - ] - jobs_dir = self._create_benchflow_structure(tmp_path, trials) - output_file = tmp_path / "output.jsonl" + def test_missing_job_directory(self, tmp_path: Path) -> None: + """Test handling when no job directory exists.""" + harbor_dir = tmp_path / "harbor_output" + harbor_dir.mkdir() - convert_benchflow_to_eval_output( - jobs_dir, output_file, task_ids=["benchflow/task-a"] - ) + with pytest.raises(RuntimeError, match="No harbor job directory found"): + convert_harbor_to_eval_output(harbor_dir, tmp_path / "output.jsonl") - with open(output_file) as f: - entries = [json.loads(line) for line in f] - - assert len(entries) == 1 - assert entries[0]["instance_id"] == "benchflow/task-a" - - def test_task_name_normalised_to_benchflow_prefix(self, tmp_path: Path) -> None: - """Test that task names without prefix get benchflow/ prepended.""" - trial_result = { - "task_name": "weighted-gdp-calc", # no benchflow/ prefix - "rewards": {"reward": 1.0}, - "error": None, - } - jobs_dir = self._create_benchflow_structure( - tmp_path, [("weighted-gdp-calc", trial_result)] - ) - output_file = tmp_path / "output.jsonl" - convert_benchflow_to_eval_output(jobs_dir, output_file) - - with open(output_file) as f: - entries = [json.loads(line) for line in f] - - assert entries[0]["instance_id"] == "benchflow/weighted-gdp-calc" - - def test_timestamped_job_directory_is_processed(self, 
tmp_path: Path) -> None: - """Test benchflow 0.3.0 timestamped jobs directory layout.""" - trial_result = { - "task_name": "weighted-gdp-calc", - "rewards": {"reward": 1.0}, - "error": None, - "n_input_tokens": 42, - "n_output_tokens": 7, - "cost_usd": 0.01, - } - - jobs_dir = self._create_benchflow_timestamped_job( - tmp_path, [("weighted-gdp-calc", trial_result)] - ) - output_file = tmp_path / "output.jsonl" - - convert_benchflow_to_eval_output(jobs_dir, output_file) - - with open(output_file) as f: - entries = [json.loads(line) for line in f] - - assert len(entries) == 1 - assert entries[0]["instance_id"] == "benchflow/weighted-gdp-calc" - assert entries[0]["test_result"]["passed"] is True - - def test_task_id_filter_matches_timestamped_trial_dir(self, tmp_path: Path) -> None: - """Test filtering strips the run suffix from trial directory names.""" - jobs_dir = self._create_benchflow_timestamped_job( - tmp_path, - [ - ( - "task-a", - { - "task_name": "task-a", - "rewards": {"reward": 1.0}, - "error": None, - }, - ), - ( - "task-b", - { - "task_name": "task-b", - "rewards": {"reward": 0.0}, - "error": None, - }, - ), - ], - ) - output_file = tmp_path / "output.jsonl" - - convert_benchflow_to_eval_output( - jobs_dir, output_file, task_ids=["benchflow/task-a"] - ) - - with open(output_file) as f: - entries = [json.loads(line) for line in f] + def test_empty_job_directory(self, tmp_path: Path) -> None: + """Test handling of harbor job dir with no trial subdirs.""" + harbor_dir = tmp_path / "harbor_output" + job_dir = harbor_dir / "2026-01-01__00-00-00" + job_dir.mkdir(parents=True) + (job_dir / "result.json").write_text(json.dumps({"id": "test"})) - assert len(entries) == 1 - assert entries[0]["instance_id"] == "benchflow/task-a" + with pytest.raises(RuntimeError, match="No trial result files found"): + convert_harbor_to_eval_output(harbor_dir, tmp_path / "output.jsonl") diff --git a/uv.lock b/uv.lock index 147abedc9..2cd0b3640 100644 --- a/uv.lock +++ b/uv.lock @@ -1282,7 +1282,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/44/69/9b804adb5fd0671f367781560eb5eb586c4d495277c93bde4307b9e28068/greenlet-3.2.4-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:3b67ca49f54cede0186854a008109d6ee71f66bd57bb36abd6d0a0267b540cdd", size = 274079, upload-time = "2025-08-07T13:15:45.033Z" }, { url = "https://files.pythonhosted.org/packages/46/e9/d2a80c99f19a153eff70bc451ab78615583b8dac0754cfb942223d2c1a0d/greenlet-3.2.4-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ddf9164e7a5b08e9d22511526865780a576f19ddd00d62f8a665949327fde8bb", size = 640997, upload-time = "2025-08-07T13:42:56.234Z" }, { url = "https://files.pythonhosted.org/packages/3b/16/035dcfcc48715ccd345f3a93183267167cdd162ad123cd93067d86f27ce4/greenlet-3.2.4-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:f28588772bb5fb869a8eb331374ec06f24a83a9c25bfa1f38b6993afe9c1e968", size = 655185, upload-time = "2025-08-07T13:45:27.624Z" }, - { url = "https://files.pythonhosted.org/packages/31/da/0386695eef69ffae1ad726881571dfe28b41970173947e7c558d9998de0f/greenlet-3.2.4-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:5c9320971821a7cb77cfab8d956fa8e39cd07ca44b6070db358ceb7f8797c8c9", size = 649926, upload-time = "2025-08-07T13:53:15.251Z" }, { url = "https://files.pythonhosted.org/packages/68/88/69bf19fd4dc19981928ceacbc5fd4bb6bc2215d53199e367832e98d1d8fe/greenlet-3.2.4-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = 
"sha256:c60a6d84229b271d44b70fb6e5fa23781abb5d742af7b808ae3f6efd7c9c60f6", size = 651839, upload-time = "2025-08-07T13:18:30.281Z" }, { url = "https://files.pythonhosted.org/packages/19/0d/6660d55f7373b2ff8152401a83e02084956da23ae58cddbfb0b330978fe9/greenlet-3.2.4-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3b3812d8d0c9579967815af437d96623f45c0f2ae5f04e366de62a12d83a8fb0", size = 607586, upload-time = "2025-08-07T13:18:28.544Z" }, { url = "https://files.pythonhosted.org/packages/8e/1a/c953fdedd22d81ee4629afbb38d2f9d71e37d23caace44775a3a969147d4/greenlet-3.2.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:abbf57b5a870d30c4675928c37278493044d7c14378350b3aa5d484fa65575f0", size = 1123281, upload-time = "2025-08-07T13:42:39.858Z" }, @@ -1293,7 +1292,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/49/e8/58c7f85958bda41dafea50497cbd59738c5c43dbbea5ee83d651234398f4/greenlet-3.2.4-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:1a921e542453fe531144e91e1feedf12e07351b1cf6c9e8a3325ea600a715a31", size = 272814, upload-time = "2025-08-07T13:15:50.011Z" }, { url = "https://files.pythonhosted.org/packages/62/dd/b9f59862e9e257a16e4e610480cfffd29e3fae018a68c2332090b53aac3d/greenlet-3.2.4-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cd3c8e693bff0fff6ba55f140bf390fa92c994083f838fece0f63be121334945", size = 641073, upload-time = "2025-08-07T13:42:57.23Z" }, { url = "https://files.pythonhosted.org/packages/f7/0b/bc13f787394920b23073ca3b6c4a7a21396301ed75a655bcb47196b50e6e/greenlet-3.2.4-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:710638eb93b1fa52823aa91bf75326f9ecdfd5e0466f00789246a5280f4ba0fc", size = 655191, upload-time = "2025-08-07T13:45:29.752Z" }, - { url = "https://files.pythonhosted.org/packages/f2/d6/6adde57d1345a8d0f14d31e4ab9c23cfe8e2cd39c3baf7674b4b0338d266/greenlet-3.2.4-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:c5111ccdc9c88f423426df3fd1811bfc40ed66264d35aa373420a34377efc98a", size = 649516, upload-time = "2025-08-07T13:53:16.314Z" }, { url = "https://files.pythonhosted.org/packages/7f/3b/3a3328a788d4a473889a2d403199932be55b1b0060f4ddd96ee7cdfcad10/greenlet-3.2.4-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:d76383238584e9711e20ebe14db6c88ddcedc1829a9ad31a584389463b5aa504", size = 652169, upload-time = "2025-08-07T13:18:32.861Z" }, { url = "https://files.pythonhosted.org/packages/ee/43/3cecdc0349359e1a527cbf2e3e28e5f8f06d3343aaf82ca13437a9aa290f/greenlet-3.2.4-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:23768528f2911bcd7e475210822ffb5254ed10d71f4028387e5a99b4c6699671", size = 610497, upload-time = "2025-08-07T13:18:31.636Z" }, { url = "https://files.pythonhosted.org/packages/b8/19/06b6cf5d604e2c382a6f31cafafd6f33d5dea706f4db7bdab184bad2b21d/greenlet-3.2.4-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:00fadb3fedccc447f517ee0d3fd8fe49eae949e1cd0f6a611818f4f6fb7dc83b", size = 1121662, upload-time = "2025-08-07T13:42:41.117Z" }, @@ -1304,7 +1302,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/22/5c/85273fd7cc388285632b0498dbbab97596e04b154933dfe0f3e68156c68c/greenlet-3.2.4-cp314-cp314-macosx_11_0_universal2.whl", hash = "sha256:49a30d5fda2507ae77be16479bdb62a660fa51b1eb4928b524975b3bde77b3c0", size = 273586, upload-time = "2025-08-07T13:16:08.004Z" }, { url = 
"https://files.pythonhosted.org/packages/d1/75/10aeeaa3da9332c2e761e4c50d4c3556c21113ee3f0afa2cf5769946f7a3/greenlet-3.2.4-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:299fd615cd8fc86267b47597123e3f43ad79c9d8a22bebdce535e53550763e2f", size = 686346, upload-time = "2025-08-07T13:42:59.944Z" }, { url = "https://files.pythonhosted.org/packages/c0/aa/687d6b12ffb505a4447567d1f3abea23bd20e73a5bed63871178e0831b7a/greenlet-3.2.4-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:c17b6b34111ea72fc5a4e4beec9711d2226285f0386ea83477cbb97c30a3f3a5", size = 699218, upload-time = "2025-08-07T13:45:30.969Z" }, - { url = "https://files.pythonhosted.org/packages/dc/8b/29aae55436521f1d6f8ff4e12fb676f3400de7fcf27fccd1d4d17fd8fecd/greenlet-3.2.4-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:b4a1870c51720687af7fa3e7cda6d08d801dae660f75a76f3845b642b4da6ee1", size = 694659, upload-time = "2025-08-07T13:53:17.759Z" }, { url = "https://files.pythonhosted.org/packages/92/2e/ea25914b1ebfde93b6fc4ff46d6864564fba59024e928bdc7de475affc25/greenlet-3.2.4-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:061dc4cf2c34852b052a8620d40f36324554bc192be474b9e9770e8c042fd735", size = 695355, upload-time = "2025-08-07T13:18:34.517Z" }, { url = "https://files.pythonhosted.org/packages/72/60/fc56c62046ec17f6b0d3060564562c64c862948c9d4bc8aa807cf5bd74f4/greenlet-3.2.4-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:44358b9bf66c8576a9f57a590d5f5d6e72fa4228b763d0e43fee6d3b06d3a337", size = 657512, upload-time = "2025-08-07T13:18:33.969Z" }, { url = "https://files.pythonhosted.org/packages/23/6e/74407aed965a4ab6ddd93a7ded3180b730d281c77b765788419484cdfeef/greenlet-3.2.4-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:2917bdf657f5859fbf3386b12d68ede4cf1f04c90c3a6bc1f013dd68a22e2269", size = 1612508, upload-time = "2025-11-04T12:42:23.427Z" }, @@ -1519,6 +1516,11 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/2a/39/e50c7c3a983047577ee07d2a9e53faf5a69493943ec3f6a384bdc792deb2/httpx-0.28.1-py3-none-any.whl", hash = "sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad", size = 73517, upload-time = "2024-12-06T15:37:21.509Z" }, ] +[package.optional-dependencies] +socks = [ + { name = "socksio" }, +] + [[package]] name = "httpx-sse" version = "0.4.2" @@ -1814,14 +1816,12 @@ wheels = [ [[package]] name = "litellm" -version = "1.80.10" +version = "1.83.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "aiohttp" }, { name = "click" }, { name = "fastuuid" }, - { name = "grpcio", version = "1.67.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14'" }, - { name = "grpcio", version = "1.76.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.14'" }, { name = "httpx" }, { name = "importlib-metadata" }, { name = "jinja2" }, @@ -1832,9 +1832,9 @@ dependencies = [ { name = "tiktoken" }, { name = "tokenizers" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/dd/44/0aaa7449e7c4aa05668ec03f1f68a01b1e476591071d9659a68db19371a2/litellm-1.80.10.tar.gz", hash = "sha256:4a4aff7558945c2f7e5c6523e67c1b5525a46b10b0e1ad6b8f847cb13b16779e", size = 12764777, upload-time = "2025-12-14T02:07:05.362Z" } +sdist = { url = "https://files.pythonhosted.org/packages/22/92/6ce9737554994ca8e536e5f4f6a87cc7c4774b656c9eb9add071caf7d54b/litellm-1.83.0.tar.gz", hash = 
"sha256:860bebc76c4bb27b4cf90b4a77acd66dba25aced37e3db98750de8a1766bfb7a", size = 17333062, upload-time = "2026-03-31T05:08:25.331Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/8c/a9/4814b6aa58f6705df2831eaadeb5bc8240684c8c9d5964245212f85049d1/litellm-1.80.10-py3-none-any.whl", hash = "sha256:9b3e561efaba0eb1291cb1555d3dcb7283cf7f3cb65aadbcdb42e2a8765898c8", size = 11264240, upload-time = "2025-12-14T02:07:02.414Z" }, + { url = "https://files.pythonhosted.org/packages/19/2c/a670cc050fcd6f45c6199eb99e259c73aea92edba8d5c2fc1b3686d36217/litellm-1.83.0-py3-none-any.whl", hash = "sha256:88c536d339248f3987571493015784671ba3f193a328e1ea6780dbebaa2094a8", size = 15610306, upload-time = "2026-03-31T05:08:21.987Z" }, ] [[package]] @@ -2402,7 +2402,7 @@ wheels = [ [[package]] name = "openhands-agent-server" -version = "1.16.0" +version = "1.16.1" source = { editable = "vendor/software-agent-sdk/openhands-agent-server" } dependencies = [ { name = "aiosqlite" }, @@ -2467,7 +2467,6 @@ dependencies = [ { name = "python-json-logger" }, { name = "requests" }, { name = "swebench" }, - { name = "swesmith" }, { name = "swt-bench" }, { name = "tenacity" }, { name = "toml" }, @@ -2522,7 +2521,6 @@ requires-dist = [ { name = "python-json-logger", specifier = ">=3.3.0" }, { name = "requests" }, { name = "swebench", specifier = "==4.1.0" }, - { name = "swesmith", specifier = ">=0.0.9" }, { name = "swt-bench", git = "https://github.com/logic-star-ai/swt-bench.git?rev=5fdcd446ff05e248ecfffc19d560a210699f71f8" }, { name = "tenacity", specifier = ">=9.1.2" }, { name = "toml" }, @@ -2546,7 +2544,7 @@ dev = [ [[package]] name = "openhands-sdk" -version = "1.16.0" +version = "1.16.1" source = { editable = "vendor/software-agent-sdk/openhands-sdk" } dependencies = [ { name = "agent-client-protocol" }, @@ -2554,7 +2552,7 @@ dependencies = [ { name = "fakeredis", extra = ["lua"] }, { name = "fastmcp" }, { name = "filelock" }, - { name = "httpx" }, + { name = "httpx", extra = ["socks"] }, { name = "litellm" }, { name = "lmnr" }, { name = "pydantic" }, @@ -2577,8 +2575,8 @@ requires-dist = [ { name = "fakeredis", extras = ["lua"], specifier = ">=2.32.1" }, { name = "fastmcp", specifier = ">=3.0.0" }, { name = "filelock", specifier = ">=3.20.1" }, - { name = "httpx", specifier = ">=0.27.0" }, - { name = "litellm", specifier = "==1.80.10" }, + { name = "httpx", extras = ["socks"], specifier = ">=0.27.0" }, + { name = "litellm", specifier = ">=1.82.6,!=1.82.7,!=1.82.8" }, { name = "lmnr", specifier = ">=0.7.24" }, { name = "pydantic", specifier = ">=2.12.5" }, { name = "python-frontmatter", specifier = ">=1.1.0" }, @@ -2590,7 +2588,7 @@ provides-extras = ["boto3"] [[package]] name = "openhands-tools" -version = "1.16.0" +version = "1.16.1" source = { editable = "vendor/software-agent-sdk/openhands-tools" } dependencies = [ { name = "bashlex" }, @@ -2619,7 +2617,7 @@ requires-dist = [ [[package]] name = "openhands-workspace" -version = "1.16.0" +version = "1.16.1" source = { editable = "vendor/software-agent-sdk/openhands-workspace" } dependencies = [ { name = "openhands-agent-server" }, @@ -6709,6 +6707,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e9/44/75a9c9421471a6c4805dbf2356f7c181a29c1879239abab1ea2cc8f38b40/sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2", size = 10235, upload-time = "2024-02-25T23:20:01.196Z" }, ] +[[package]] +name = "socksio" +version = "1.0.0" +source = { registry = "https://pypi.org/simple" } 
+sdist = { url = "https://files.pythonhosted.org/packages/f8/5c/48a7d9495be3d1c651198fd99dbb6ce190e2274d0f28b9051307bdec6b85/socksio-1.0.0.tar.gz", hash = "sha256:f88beb3da5b5c38b9890469de67d0cb0f9d494b78b106ca1845f96c10b91c4ac", size = 19055, upload-time = "2020-04-17T15:50:34.664Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/37/c3/6eeb6034408dac0fa653d126c9204ade96b819c936e136c5e8a6897eee9c/socksio-1.0.0-py3-none-any.whl", hash = "sha256:95dc1f15f9b34e8d7b16f06d74b8ccf48f609af32ab33c608d08761c5dcbb1f3", size = 12763, upload-time = "2020-04-17T15:50:31.878Z" }, +] + [[package]] name = "sortedcontainers" version = "2.4.0" @@ -6834,15 +6841,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/36/67/981d8b642ac3eac7c8a7b7832ff8b2fb74f96b28b5fcd9a8979879e5c46d/swebench-4.1.0-py3-none-any.whl", hash = "sha256:1243776f720047cc9e20a427f7a52b75c13a07abda6154fb60fe77f82ec8af57", size = 157231, upload-time = "2025-09-11T02:57:58.953Z" }, ] -[[package]] -name = "swesmith" -version = "0.0.9" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/07/97/e506b20fa59debc66e4660a86b0e98b45d32c87f23b994ad739e9c5d542a/swesmith-0.0.9.tar.gz", hash = "sha256:1726124ea43577853c6efb0a5a0db5fa3ce5c340e1bed479afa5bab85d8a69da", size = 214830, upload-time = "2026-02-27T01:06:13.455Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/80/2d/71b6ac5dadbe7199085de3815624775744d51b6c554efeeddfb12dc45ce1/swesmith-0.0.9-py3-none-any.whl", hash = "sha256:cbb98a52fc573b38032cde1179b6ce5f5862ce7c31d6931cfd5b8ad4969ce900", size = 275800, upload-time = "2026-02-27T01:06:11.864Z" }, -] - [[package]] name = "swt-bench" version = "1.0.1" diff --git a/vendor/software-agent-sdk b/vendor/software-agent-sdk index acd5adc96..3e0a3a091 160000 --- a/vendor/software-agent-sdk +++ b/vendor/software-agent-sdk @@ -1 +1 @@ -Subproject commit acd5adc965c08a0f815cf8e5f3166d1d090034d6 +Subproject commit 3e0a3a0915b369c7e2057c77722e98585855d30a From 8ff7bbab21af4ae2b884981867a6d7302ec5d26f Mon Sep 17 00:00:00 2001 From: Chujun Tao Date: Thu, 23 Apr 2026 16:56:57 -0400 Subject: [PATCH 05/12] Update skillsbench dataset handling Co-authored-by: openhands --- benchmarks/skillsbench/README.md | 16 +- benchmarks/skillsbench/run_infer.py | 351 +++++++++++++++++++++++++--- tests/test_skillsbench_run_infer.py | 231 +++++++++++++++++- 3 files changed, 558 insertions(+), 40 deletions(-) diff --git a/benchmarks/skillsbench/README.md b/benchmarks/skillsbench/README.md index 60ff73652..c2c11928b 100644 --- a/benchmarks/skillsbench/README.md +++ b/benchmarks/skillsbench/README.md @@ -21,11 +21,12 @@ SkillsBench comprises tasks across 11 domains, evaluating the efficacy of Skills ## Prerequisites 1. **Install Harbor**: Harbor is the official harness for running SkillsBench. + This integration is currently validated against `harbor==0.1.33`. ```bash - pip install harbor + pip install harbor==0.1.33 # or - uv pip install harbor + uv pip install harbor==0.1.33 ``` 2. **Docker**: Harbor requires Docker to be installed and running. @@ -34,12 +35,18 @@ SkillsBench comprises tasks across 11 domains, evaluating the efficacy of Skills ## Usage +By default, `skillsbench-infer` keeps a local copy of `tasks/` from +`https://github.com/benchflow-ai/skillsbench` on the `main` branch under +`benchmarks/skillsbench/data/tasks`. 
It stores the synced upstream commit hash in +`benchmarks/skillsbench/data/source.json` and refreshes the local snapshot when the +upstream `main` commit changes. The only supported dataset sources are this synced +SkillsBench snapshot and Harbor registry ids matching `benchflow/skillsbench@...`. + ### Running Inference Run the SkillsBench evaluation using the OpenHands SDK agent: ```bash -# Run full evaluation uv run skillsbench-infer .llm_config/claude.json # Run specific tasks @@ -53,6 +60,9 @@ uv run skillsbench-infer .llm_config/claude.json --n-limit 5 # Run with multiple workers uv run skillsbench-infer .llm_config/claude.json --num-workers 4 + +# Run against a Harbor registry dataset instead of the synced GitHub tasks +uv run skillsbench-infer .llm_config/claude.json --dataset benchflow/skillsbench@1.0 ``` ### LLM Configuration diff --git a/benchmarks/skillsbench/run_infer.py b/benchmarks/skillsbench/run_infer.py index a8afa7281..c8e06eee5 100644 --- a/benchmarks/skillsbench/run_infer.py +++ b/benchmarks/skillsbench/run_infer.py @@ -11,8 +11,11 @@ import argparse import json import os +import re +import shutil import subprocess import sys +import tempfile from datetime import datetime, timezone from pathlib import Path @@ -29,13 +32,21 @@ # Output filename for results OUTPUT_FILENAME = "output.jsonl" +SKILLSBENCH_REPO_URL = "https://github.com/benchflow-ai/skillsbench.git" +SKILLSBENCH_REPO_BRANCH = "main" +DATASET_CACHE_DIR = Path(__file__).parent / "data" +TASKS_CACHE_DIR = DATASET_CACHE_DIR / "tasks" +TASKS_METADATA_PATH = DATASET_CACHE_DIR / "source.json" +REGISTRY_DATASET_PREFIX = "benchflow/skillsbench" +INSTANCE_ID_PREFIX = "benchflow" + def check_harbor_installed() -> bool: """Check if harbor CLI is installed and available.""" harbor_exe = HARBOR_DEFAULTS["harbor_executable"] try: result = subprocess.run( - [harbor_exe, "--version"], + [harbor_exe, "--help"], capture_output=True, text=True, timeout=10, @@ -45,9 +56,252 @@ def check_harbor_installed() -> bool: return False +def _run_command(cmd: list[str], error_message: str) -> str: + """Run a subprocess command and return stdout.""" + result = subprocess.run( + cmd, + capture_output=True, + text=True, + ) + if result.returncode != 0: + stderr = result.stderr.strip() or result.stdout.strip() + raise RuntimeError(f"{error_message}: {stderr}") + return result.stdout.strip() + + +def _get_supported_task_filter_flag(harbor_exe: str) -> str: + """Detect whether Harbor expects --task-name or --include-task-name.""" + try: + result = subprocess.run( + [harbor_exe, "run", "--help"], + capture_output=True, + text=True, + ) + except FileNotFoundError: + return "--include-task-name" + + help_text = f"{result.stdout}\n{result.stderr}" + supported_flags = set(re.findall(r"(? 
str: + """Detect whether Harbor exposes the OpenHands agent as openhands or openhands-sdk.""" + try: + result = subprocess.run( + [harbor_exe, "run", "--help"], + capture_output=True, + text=True, + ) + except FileNotFoundError: + return HARBOR_DEFAULTS["agent_name"] + + help_text = f"{result.stdout}\n{result.stderr}" + compact_help_text = re.sub(r"[^a-z0-9-]+", "", help_text.lower()) + if "openhands-sdk" in compact_help_text: + return "openhands-sdk" + if "openhands" in compact_help_text: + return "openhands" + return HARBOR_DEFAULTS["agent_name"] + + +def get_skillsbench_main_commit( + repo_url: str = SKILLSBENCH_REPO_URL, + branch: str = SKILLSBENCH_REPO_BRANCH, +) -> str: + """Resolve the latest commit hash for the upstream SkillsBench branch.""" + stdout = _run_command( + ["git", "ls-remote", repo_url, f"refs/heads/{branch}"], + "Failed to resolve SkillsBench upstream commit", + ) + commit_hash, _, ref = stdout.partition("\t") + if not commit_hash or ref != f"refs/heads/{branch}": + raise RuntimeError( + f"Unexpected git ls-remote output for {repo_url} {branch}: {stdout}" + ) + return commit_hash + + +def _load_cached_commit(metadata_path: Path = TASKS_METADATA_PATH) -> str | None: + """Load the cached upstream commit hash for the local task snapshot.""" + if not metadata_path.is_file(): + return None + + try: + with open(metadata_path, encoding="utf-8") as f: + metadata = json.load(f) + except (OSError, json.JSONDecodeError) as e: + logger.warning( + "Ignoring unreadable SkillsBench dataset metadata at %s: %s", + metadata_path, + e, + ) + return None + + commit_hash = metadata.get("commit_hash") + return commit_hash if isinstance(commit_hash, str) and commit_hash else None + + +def download_skillsbench_tasks( + commit_hash: str, + tasks_dir: Path = TASKS_CACHE_DIR, + metadata_path: Path = TASKS_METADATA_PATH, + repo_url: str = SKILLSBENCH_REPO_URL, + branch: str = SKILLSBENCH_REPO_BRANCH, +) -> None: + """Download only the SkillsBench tasks directory for a specific commit.""" + data_dir = tasks_dir.parent + data_dir.mkdir(parents=True, exist_ok=True) + + logger.info( + "Downloading SkillsBench tasks from %s@%s into %s", + repo_url, + commit_hash, + tasks_dir, + ) + + with tempfile.TemporaryDirectory(dir=data_dir) as temp_dir: + clone_dir = Path(temp_dir) / "skillsbench" + _run_command( + [ + "git", + "clone", + "--depth", + "1", + "--branch", + branch, + "--filter=blob:none", + "--sparse", + repo_url, + str(clone_dir), + ], + "Failed to clone SkillsBench repository", + ) + _run_command( + ["git", "-C", str(clone_dir), "sparse-checkout", "set", "tasks"], + "Failed to sparsely checkout SkillsBench tasks", + ) + checked_out_commit = _run_command( + ["git", "-C", str(clone_dir), "rev-parse", "HEAD"], + "Failed to read cloned SkillsBench commit", + ) + if checked_out_commit != commit_hash: + raise RuntimeError( + "Cloned SkillsBench commit does not match upstream HEAD: " + f"expected {commit_hash}, got {checked_out_commit}" + ) + + source_tasks_dir = clone_dir / "tasks" + if not source_tasks_dir.is_dir(): + raise RuntimeError( + f"SkillsBench clone at {clone_dir} does not contain a tasks/ directory" + ) + + if tasks_dir.exists(): + shutil.rmtree(tasks_dir) + shutil.copytree(source_tasks_dir, tasks_dir) + + metadata = { + "repo_url": repo_url, + "branch": branch, + "commit_hash": commit_hash, + "synced_at": datetime.now(timezone.utc).isoformat(), + } + with open(metadata_path, "w", encoding="utf-8") as f: + json.dump(metadata, f, indent=2) + + +def ensure_skillsbench_tasks( + tasks_dir: 
Path = TASKS_CACHE_DIR, + metadata_path: Path = TASKS_METADATA_PATH, + repo_url: str = SKILLSBENCH_REPO_URL, + branch: str = SKILLSBENCH_REPO_BRANCH, +) -> Path: + """Ensure a local SkillsBench task snapshot exists and matches upstream HEAD.""" + cached_commit = _load_cached_commit(metadata_path) + has_cached_tasks = tasks_dir.is_dir() and any(tasks_dir.iterdir()) + + try: + upstream_commit = get_skillsbench_main_commit(repo_url=repo_url, branch=branch) + except RuntimeError as e: + if has_cached_tasks and cached_commit: + logger.warning( + "Failed to check SkillsBench upstream HEAD; using cached tasks from " + "%s (%s): %s", + tasks_dir, + cached_commit, + e, + ) + return tasks_dir + raise + + if has_cached_tasks and cached_commit == upstream_commit: + logger.info( + "Using cached SkillsBench tasks at %s (commit %s)", + tasks_dir, + upstream_commit, + ) + return tasks_dir + + if has_cached_tasks: + logger.info( + "Refreshing SkillsBench tasks in %s from commit %s to %s", + tasks_dir, + cached_commit or "", + upstream_commit, + ) + else: + logger.info("No cached SkillsBench tasks found at %s; downloading", tasks_dir) + + download_skillsbench_tasks( + commit_hash=upstream_commit, + tasks_dir=tasks_dir, + metadata_path=metadata_path, + repo_url=repo_url, + branch=branch, + ) + return tasks_dir + + +def resolve_skillsbench_dataset(dataset: str) -> tuple[str, bool]: + """Resolve the dataset argument to a synced local snapshot or registry id.""" + if dataset == INFER_DEFAULTS["dataset"]: + local_tasks_dir = ensure_skillsbench_tasks() + return str(local_tasks_dir.resolve()), True + if dataset == REGISTRY_DATASET_PREFIX or dataset.startswith( + f"{REGISTRY_DATASET_PREFIX}@" + ): + return dataset, False + raise ValueError( + "Unsupported SkillsBench dataset source. Use the default synced " + "SkillsBench snapshot or a Harbor registry id matching " + "'benchflow/skillsbench@'." + ) + + +def _normalize_task_filter_value(task_id: str, *, dataset_is_path: bool) -> str: + """Normalize task filter values for Harbor's local-path dataset handling.""" + if dataset_is_path: + return task_id.rsplit("/", 1)[-1] + return task_id + + +def _canonicalize_instance_id(task_name: str) -> str: + """Normalize SkillsBench task names to stable benchflow/ ids.""" + if "/" in task_name: + return task_name + return f"{INSTANCE_ID_PREFIX}/{task_name}" + + def run_harbor_evaluation( llm: LLM, dataset: str, + *, + dataset_is_path: bool, output_dir: str, num_workers: int = 1, task_ids: list[str] | None = None, @@ -57,7 +311,8 @@ def run_harbor_evaluation( Args: llm: LLM configuration for the agent. - dataset: Harbor dataset name (e.g., benchflow/skillsbench). + dataset: Synced SkillsBench task snapshot path or Harbor registry id. + dataset_is_path: Whether ``dataset`` should be passed via ``--path``. output_dir: Directory to store output files. num_workers: Number of parallel workers. task_ids: Optional list of specific task IDs to run. @@ -69,16 +324,18 @@ def run_harbor_evaluation( harbor_output_dir = Path(output_dir) / "harbor_output" harbor_output_dir.mkdir(parents=True, exist_ok=True) harbor_exe = HARBOR_DEFAULTS["harbor_executable"] + agent_name = _get_supported_agent_name(harbor_exe) + task_filter_flag = _get_supported_task_filter_flag(harbor_exe) # Build harbor command using harbor CLI flags. # Use absolute path for --jobs-dir to avoid CWD-relative path issues. 
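+    # Illustrative shape of the resulting invocation (sketch only, derived from
+    # the list built below; exact flags depend on the detected Harbor version):
+    #   harbor run --path <tasks_dir> -a <agent> -m <model> \
+    #       --jobs-dir <abs_jobs_dir> --n-concurrent <N> [<task filters>] [--n-tasks <K>]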
cmd = [ harbor_exe, "run", - "-d", + "--path" if dataset_is_path else "-d", dataset, "-a", - HARBOR_DEFAULTS["agent_name"], + agent_name, "-m", llm.model, "--jobs-dir", @@ -87,21 +344,17 @@ def run_harbor_evaluation( str(num_workers), ] - # Pass LLM credentials as agent environment variables - if llm.api_key: - api_key = ( - llm.api_key.get_secret_value() - if isinstance(llm.api_key, SecretStr) - else llm.api_key - ) - cmd.extend(["--ae", f"LLM_API_KEY={api_key}"]) - if llm.base_url: - cmd.extend(["--ae", f"LLM_BASE_URL={llm.base_url}"]) - # Add specific task names if provided if task_ids: for task_id in task_ids: - cmd.extend(["--include-task-name", task_id]) + cmd.extend( + [ + task_filter_flag, + _normalize_task_filter_value( + task_id, dataset_is_path=dataset_is_path + ), + ] + ) if n_limit is not None: cmd.extend(["--n-tasks", str(n_limit)]) @@ -131,10 +384,31 @@ def run_harbor_evaluation( ) if result.returncode != 0: - logger.error(f"Harbor command failed with code {result.returncode}") - logger.error(f"stdout: {result.stdout}") - logger.error(f"stderr: {result.stderr}") - raise RuntimeError(f"Harbor evaluation failed: {result.stderr}") + if ( + task_ids + and task_filter_flag == "--task-name" + and "No such option: --task-name" in result.stderr + ): + fallback_cmd = [ + "--include-task-name" if part == "--task-name" else part + for part in cmd + ] + logger.warning( + "Harbor does not support --task-name; retrying with " + "--include-task-name" + ) + result = subprocess.run( + fallback_cmd, + capture_output=True, + text=True, + env=env, + ) + + if result.returncode != 0: + logger.error(f"Harbor command failed with code {result.returncode}") + logger.error(f"stdout: {result.stdout}") + logger.error(f"stderr: {result.stderr}") + raise RuntimeError(f"Harbor evaluation failed: {result.stderr}") logger.info("Harbor evaluation completed successfully") logger.info(f"stdout: {result.stdout}") @@ -207,7 +481,9 @@ def convert_harbor_to_eval_output( with open(result_file) as f: trial = json.load(f) - instance_id = trial.get("task_name", result_file.parent.name) + instance_id = _canonicalize_instance_id( + trial.get("task_name", result_file.parent.name) + ) # Check for exceptions if trial.get("exception_info"): @@ -256,7 +532,7 @@ def convert_harbor_to_eval_output( logger.error(f"Failed to process result file {result_file}: {e}") errors.append( { - "instance_id": result_file.parent.name, + "instance_id": _canonicalize_instance_id(result_file.parent.name), "error": str(e), "test_result": {}, } @@ -302,13 +578,14 @@ def main() -> None: formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" Examples: - # Run full skillsbench evaluation + # Run full skillsbench evaluation using a local tasks/ snapshot synced from + # https://github.com/benchflow-ai/skillsbench main uv run skillsbench-infer .llm_config/claude.json # Run specific tasks uv run skillsbench-infer .llm_config/claude.json --select tasks.txt - # Run with custom dataset version + # Run against a Harbor registry dataset instead of the synced GitHub tasks uv run skillsbench-infer .llm_config/claude.json --dataset benchflow/skillsbench@1.0 """, ) @@ -322,7 +599,11 @@ def main() -> None: "--dataset", type=str, default=INFER_DEFAULTS["dataset"], - help="Harbor dataset name (e.g., benchflow/skillsbench)", + help=( + "SkillsBench dataset source. The default value syncs tasks/ from the " + "benchflow-ai/skillsbench main branch. You can also pass a Harbor " + "registry id like benchflow/skillsbench@1.0." 
+ ), ) parser.add_argument( "--output-dir", @@ -385,6 +666,20 @@ def main() -> None: ) sys.exit(1) + resolved_dataset = args.dataset + dataset_is_path = False + dataset_commit_hash: str | None = None + if not args.skip_harbor: + try: + resolved_dataset, dataset_is_path = resolve_skillsbench_dataset( + args.dataset + ) + except ValueError as e: + logger.error(str(e)) + sys.exit(1) + if dataset_is_path and args.dataset == INFER_DEFAULTS["dataset"]: + dataset_commit_hash = _load_cached_commit() + # Construct output directory dataset_description = args.dataset.replace("/", "__").replace("@", "-") structured_output_dir = construct_eval_output_dir( @@ -402,6 +697,9 @@ def main() -> None: metadata = { "llm": llm.model_dump_json(), "dataset": args.dataset, + "resolved_dataset": resolved_dataset, + "dataset_is_path": dataset_is_path, + "dataset_commit_hash": dataset_commit_hash, "timestamp": datetime.now(timezone.utc).isoformat(), "harbor_agent": HARBOR_DEFAULTS["agent_name"], "note": args.note, @@ -427,7 +725,8 @@ def main() -> None: try: harbor_output_dir = run_harbor_evaluation( llm=llm, - dataset=args.dataset, + dataset=resolved_dataset, + dataset_is_path=dataset_is_path, output_dir=structured_output_dir, num_workers=args.num_workers, task_ids=task_ids, diff --git a/tests/test_skillsbench_run_infer.py b/tests/test_skillsbench_run_infer.py index 5f8452cb3..0632a6a46 100644 --- a/tests/test_skillsbench_run_infer.py +++ b/tests/test_skillsbench_run_infer.py @@ -8,22 +8,135 @@ from benchmarks.skillsbench.config import INFER_DEFAULTS from benchmarks.skillsbench.run_infer import ( convert_harbor_to_eval_output, + ensure_skillsbench_tasks, + resolve_skillsbench_dataset, run_harbor_evaluation, ) from openhands.sdk import LLM +class TestDatasetSync: + """Tests for syncing the local SkillsBench task snapshot.""" + + def test_ensure_skillsbench_tasks_reuses_matching_cache( + self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch + ) -> None: + """Test that an up-to-date cached tasks directory is reused.""" + tasks_dir = tmp_path / "tasks" + tasks_dir.mkdir() + (tasks_dir / "task-a").mkdir() + metadata_path = tmp_path / "source.json" + metadata_path.write_text(json.dumps({"commit_hash": "abc123"})) + + monkeypatch.setattr( + "benchmarks.skillsbench.run_infer.get_skillsbench_main_commit", + lambda repo_url, branch: "abc123", + ) + + called = False + + def fake_download(**kwargs) -> None: + nonlocal called + called = True + + monkeypatch.setattr( + "benchmarks.skillsbench.run_infer.download_skillsbench_tasks", + fake_download, + ) + + resolved = ensure_skillsbench_tasks( + tasks_dir=tasks_dir, + metadata_path=metadata_path, + ) + + assert resolved == tasks_dir + assert called is False + + def test_ensure_skillsbench_tasks_refreshes_stale_cache( + self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch + ) -> None: + """Test that a stale cached commit triggers a redownload.""" + tasks_dir = tmp_path / "tasks" + tasks_dir.mkdir() + metadata_path = tmp_path / "source.json" + metadata_path.write_text(json.dumps({"commit_hash": "old-commit"})) + + monkeypatch.setattr( + "benchmarks.skillsbench.run_infer.get_skillsbench_main_commit", + lambda repo_url, branch: "new-commit", + ) + + captured: dict[str, str] = {} + + def fake_download( + *, + commit_hash: str, + tasks_dir: Path, + metadata_path: Path, + repo_url: str, + branch: str, + ) -> None: + captured["commit_hash"] = commit_hash + captured["tasks_dir"] = str(tasks_dir) + captured["metadata_path"] = str(metadata_path) + tasks_dir.mkdir(exist_ok=True) + + 
monkeypatch.setattr( + "benchmarks.skillsbench.run_infer.download_skillsbench_tasks", + fake_download, + ) + + ensure_skillsbench_tasks( + tasks_dir=tasks_dir, + metadata_path=metadata_path, + ) + + assert captured["commit_hash"] == "new-commit" + assert captured["tasks_dir"] == str(tasks_dir) + assert captured["metadata_path"] == str(metadata_path) + + def test_ensure_skillsbench_tasks_uses_cache_if_remote_check_fails( + self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch + ) -> None: + """Test that a usable cache is kept when the upstream HEAD check fails.""" + tasks_dir = tmp_path / "tasks" + tasks_dir.mkdir() + (tasks_dir / "task-a").mkdir() + metadata_path = tmp_path / "source.json" + metadata_path.write_text(json.dumps({"commit_hash": "cached-commit"})) + + def fake_head(repo_url: str, branch: str) -> str: + raise RuntimeError("network unavailable") + + monkeypatch.setattr( + "benchmarks.skillsbench.run_infer.get_skillsbench_main_commit", + fake_head, + ) + + resolved = ensure_skillsbench_tasks( + tasks_dir=tasks_dir, + metadata_path=metadata_path, + ) + + assert resolved == tasks_dir + + def test_resolve_skillsbench_dataset_preserves_remote_registry_ids(self) -> None: + """Test that explicit Harbor dataset ids are passed through unchanged.""" + resolved_dataset, dataset_is_path = resolve_skillsbench_dataset( + "benchflow/skillsbench@1.0" + ) + + assert resolved_dataset == "benchflow/skillsbench@1.0" + assert dataset_is_path is False + + class TestRunHarborEvaluation: """Tests for building Harbor invocation arguments.""" - def test_default_dataset_matches_harbor_registry(self) -> None: - """Test that the default dataset name matches Harbor's published registry.""" - assert INFER_DEFAULTS["dataset"] == "benchflow/skillsbench" - def test_run_harbor_evaluation_passes_filters_and_limits( self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch ) -> None: - """Test Harbor command includes task filters and n-limit.""" + """Test Harbor command normalizes local task ids and includes main flags.""" captured: dict[str, list[str]] = {} def fake_run(cmd: list[str], capture_output: bool, text: bool, env: dict): @@ -35,6 +148,14 @@ def fake_run(cmd: list[str], capture_output: bool, text: bool, env: dict): )() monkeypatch.setattr("benchmarks.skillsbench.run_infer.subprocess.run", fake_run) + monkeypatch.setattr( + "benchmarks.skillsbench.run_infer._get_supported_task_filter_flag", + lambda harbor_exe: "--include-task-name", + ) + monkeypatch.setattr( + "benchmarks.skillsbench.run_infer._get_supported_agent_name", + lambda harbor_exe: "openhands", + ) harbor_output_dir = run_harbor_evaluation( llm=LLM( @@ -42,7 +163,8 @@ def fake_run(cmd: list[str], capture_output: bool, text: bool, env: dict): api_key="test-key", base_url="https://proxy.example.com", ), - dataset=INFER_DEFAULTS["dataset"], + dataset=str(tmp_path / "tasks"), + dataset_is_path=True, output_dir=str(tmp_path), num_workers=2, task_ids=["benchflow/task-a", "benchflow/task-b"], @@ -56,21 +178,69 @@ def fake_run(cmd: list[str], capture_output: bool, text: bool, env: dict): assert cmd[:8] == [ "harbor", "run", - "-d", - "benchflow/skillsbench", + "--path", + str(tmp_path / "tasks"), "-a", - "openhands-sdk", + "openhands", "-m", "litellm_proxy/test-model", ] assert "--jobs-dir" in cmd assert str(expected_output_dir.resolve()) in cmd assert cmd.count("--include-task-name") == 2 - assert "benchflow/task-a" in cmd - assert "benchflow/task-b" in cmd + assert "task-a" in cmd + assert "task-b" in cmd + assert "benchflow/task-a" not in cmd + assert 
"--ae" not in cmd assert cmd[cmd.index("--n-concurrent") + 1] == "2" assert cmd[cmd.index("--n-tasks") + 1] == "3" + def test_run_harbor_evaluation_retries_with_legacy_task_flag( + self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch + ) -> None: + """Test Harbor falls back to --include-task-name when --task-name fails.""" + captured_cmds: list[list[str]] = [] + + def fake_run(cmd: list[str], capture_output: bool, text: bool, env: dict): + captured_cmds.append(cmd) + if "--task-name" in cmd: + return type( + "Completed", + (), + { + "returncode": 2, + "stdout": "", + "stderr": "No such option: --task-name", + }, + )() + return type( + "Completed", + (), + {"returncode": 0, "stdout": "ok", "stderr": ""}, + )() + + monkeypatch.setattr("benchmarks.skillsbench.run_infer.subprocess.run", fake_run) + monkeypatch.setattr( + "benchmarks.skillsbench.run_infer._get_supported_task_filter_flag", + lambda harbor_exe: "--task-name", + ) + monkeypatch.setattr( + "benchmarks.skillsbench.run_infer._get_supported_agent_name", + lambda harbor_exe: "openhands", + ) + + run_harbor_evaluation( + llm=LLM(model="test-model"), + dataset=str(tmp_path / "tasks"), + dataset_is_path=True, + output_dir=str(tmp_path), + task_ids=["benchflow/task-a"], + ) + + assert len(captured_cmds) == 2 + assert "--task-name" in captured_cmds[0] + assert "--include-task-name" in captured_cmds[1] + def test_llm_credentials_passed_via_env( self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch ) -> None: @@ -87,6 +257,14 @@ def fake_run(cmd: list[str], capture_output: bool, text: bool, env: dict): )() monkeypatch.setattr("benchmarks.skillsbench.run_infer.subprocess.run", fake_run) + monkeypatch.setattr( + "benchmarks.skillsbench.run_infer._get_supported_task_filter_flag", + lambda harbor_exe: "--include-task-name", + ) + monkeypatch.setattr( + "benchmarks.skillsbench.run_infer._get_supported_agent_name", + lambda harbor_exe: "openhands", + ) run_harbor_evaluation( llm=LLM( @@ -95,11 +273,13 @@ def fake_run(cmd: list[str], capture_output: bool, text: bool, env: dict): base_url="https://my-proxy.example.com", ), dataset=INFER_DEFAULTS["dataset"], + dataset_is_path=False, output_dir=str(tmp_path), ) assert captured["env"]["LLM_API_KEY"] == "my-secret-key" assert captured["env"]["LLM_BASE_URL"] == "https://my-proxy.example.com" + assert "--ae" not in captured["cmd"] class TestConvertHarborToEvalOutput: @@ -152,6 +332,35 @@ def test_successful_trial_parsing(self, tmp_path: Path) -> None: assert entries[0]["test_result"]["passed"] is True assert entries[0]["metrics"]["total_cost_usd"] == 0.05 + def test_local_trial_names_are_normalized_to_canonical_instance_ids( + self, tmp_path: Path + ) -> None: + """Test local Harbor task names without namespace keep benchflow ids.""" + trial_result = { + "task_name": "weighted-gdp-calc", + "trial_name": "weighted-gdp-calc__abc123", + "trial_uri": "file:///path/to/trial", + "agent_result": { + "n_input_tokens": 1000, + "n_output_tokens": 200, + "cost_usd": 0.05, + }, + "verifier_result": {"rewards": {"reward": 1.0}}, + "exception_info": None, + } + + harbor_dir = self._create_harbor_structure( + tmp_path, [("weighted-gdp-calc__abc123", trial_result)] + ) + output_file = tmp_path / "output.jsonl" + + convert_harbor_to_eval_output(harbor_dir, output_file) + + with open(output_file) as f: + entries = [json.loads(line) for line in f] + + assert entries[0]["instance_id"] == "benchflow/weighted-gdp-calc" + def test_failed_trial(self, tmp_path: Path) -> None: """Test parsing of a trial with reward 0.""" 
trial_result = { From eb2015016ac2b3508025ea2d4b5ebd482e4ea854 Mon Sep 17 00:00:00 2001 From: Chujun Tao Date: Sun, 5 Apr 2026 19:33:34 -0400 Subject: [PATCH 06/12] integrate skillsbench --- benchmarks/skillsbench/README.md | 163 ++++++++++ benchmarks/skillsbench/__init__.py | 1 + benchmarks/skillsbench/config.py | 16 + benchmarks/skillsbench/eval_infer.py | 280 ++++++++++++++++ benchmarks/skillsbench/run_infer.py | 467 +++++++++++++++++++++++++++ benchmarks/utils/report_costs.py | 4 +- pyproject.toml | 2 + 7 files changed, 932 insertions(+), 1 deletion(-) create mode 100644 benchmarks/skillsbench/README.md create mode 100644 benchmarks/skillsbench/__init__.py create mode 100644 benchmarks/skillsbench/config.py create mode 100644 benchmarks/skillsbench/eval_infer.py create mode 100644 benchmarks/skillsbench/run_infer.py diff --git a/benchmarks/skillsbench/README.md b/benchmarks/skillsbench/README.md new file mode 100644 index 000000000..60ff73652 --- /dev/null +++ b/benchmarks/skillsbench/README.md @@ -0,0 +1,163 @@ +# SkillsBench Evaluation + +This module provides integration with [SkillsBench](https://www.skillsbench.ai/), a benchmark for evaluating AI agents on real-world skill-based tasks. The integration uses [Harbor](https://harborframework.com) as the evaluation harness with the `openhands-sdk` agent. + +## Overview + +SkillsBench comprises tasks across 11 domains, evaluating the efficacy of Skills augmentation in LLM-based agents.Domains contain + +- Software engineering +- Office & white collar +- Natural science +- Media & content production +- Cybersecurity +- Finance +- Robotics +- Manufacturing +- Energy +- Mathematics +- Healthcare + +## Prerequisites + +1. **Install Harbor**: Harbor is the official harness for running SkillsBench. + + ```bash + pip install harbor + # or + uv pip install harbor + ``` + +2. **Docker**: Harbor requires Docker to be installed and running. + +3. **LLM API Key**: Configure your LLM provider credentials. 
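+
+As a rough sketch of how these credentials flow through the runner (illustration
+only; the config path below is an assumption, and the JSON format is described
+under "LLM Configuration" further down), `skillsbench-infer` deserializes the
+file into an `LLM` object and exports the key and base URL to the Harbor process
+environment:
+
+```python
+# Sketch: loading the LLM config and exposing credentials the way run_infer.py does.
+import os
+
+from pydantic import SecretStr
+
+from openhands.sdk import LLM
+
+with open(".llm_config/claude.json") as f:  # hypothetical path
+    llm = LLM.model_validate_json(f.read())
+
+env = os.environ.copy()
+if llm.api_key:
+    key = (
+        llm.api_key.get_secret_value()
+        if isinstance(llm.api_key, SecretStr)
+        else llm.api_key
+    )
+    env["LLM_API_KEY"] = key
+if llm.base_url:
+    env["LLM_BASE_URL"] = llm.base_url
+
+print(llm.model)
+```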
+ +## Usage + +### Running Inference + +Run the SkillsBench evaluation using the OpenHands SDK agent: + +```bash +# Run full evaluation +uv run skillsbench-infer .llm_config/claude.json + +# Run specific tasks +uv run skillsbench-infer .llm_config/claude.json --task-id benchflow/weighted-gdp-calc + +# Run tasks from a file +uv run skillsbench-infer .llm_config/claude.json --select tasks.txt + +# Limit the run to 5 tasks (useful for smoke tests) +uv run skillsbench-infer .llm_config/claude.json --n-limit 5 + +# Run with multiple workers +uv run skillsbench-infer .llm_config/claude.json --num-workers 4 +``` + +### LLM Configuration + +Create an LLM configuration file (e.g., `.llm_config/claude.json`): + +```json +{ + "model": "anthropic/claude-sonnet-4-20250514", + "api_key": "YOUR_API_KEY" +} +``` + +Or use a LiteLLM proxy: + +```json +{ + "model": "litellm_proxy/anthropic/claude-sonnet-4-20250514", + "base_url": "https://your-proxy.example.com", + "api_key": "YOUR_API_KEY" +} +``` + +### Evaluating Results + +After running inference, evaluate the results: + +```bash +uv run skillsbench-eval ./evaluation_outputs/.../output.jsonl +``` + +This generates a report file (`output.report.json`) with: +- Total/completed/resolved instance counts +- Success rate +- Aggregate metrics (cost, tokens) + +## Output Format + +### Inference Output (`output.jsonl`) + +Each line contains: + +```json +{ + "instance_id": "benchflow/task-name", + "test_result": { + "trial_name": "...", + "trial_uri": "...", + "rewards": {"reward": 1.0}, + "passed": true + }, + "instruction": "", + "error": null, + "history": [], + "metrics": { + "total_prompt_tokens": 5000, + "total_completion_tokens": 1000, + "total_cost_usd": 0.05 + } +} +``` + +### Evaluation Report (`output.report.json`) + +```json +{ + "total_instances": 100, + "completed_instances": 95, + "resolved_instances": 80, + "unresolved_instances": 15, + "error_instances": 5, + "aggregate_metrics": { + "total_cost_usd": 5.25, + "total_prompt_tokens": 500000, + "total_completion_tokens": 100000 + } +} +``` + +## Architecture + +The integration follows the Harbor agent adapter pattern: + +1. **Harbor Harness**: Manages task containers and lifecycle +2. **OpenHands SDK Agent**: Runs inside containers to solve tasks +3. 
**ATIF Trajectories**: Results stored in Agent Trajectory Interchange Format + +```text +┌──────────────────────────────────────────────────┐ +│ Harbor Harness │ +│ ┌────────────────────────────────────────────┐ │ +│ │ Task Container │ │ +│ │ ┌──────────────────────────────────────┐ │ │ +│ │ │ OpenHands SDK Agent │ │ │ +│ │ │ - Terminal tool │ │ │ +│ │ │ - File editor tool │ │ │ +│ │ │ - Task tracker tool │ │ │ +│ │ └──────────────────────────────────────┘ │ │ +│ └────────────────────────────────────────────┘ │ +└──────────────────────────────────────────────────┘ +``` + +## References + +- [SkillsBench](https://www.skillsbench.ai/) - The benchmark +- [Harbor](https://harborframework.com) - The evaluation harness +- [OpenHands SDK](https://github.com/OpenHands/software-agent-sdk) - The agent SDK +- [ATIF Specification](https://github.com/laude-institute/harbor/blob/main/docs/rfcs/0001-trajectory-format.md) - Trajectory format diff --git a/benchmarks/skillsbench/__init__.py b/benchmarks/skillsbench/__init__.py new file mode 100644 index 000000000..c02f7bafb --- /dev/null +++ b/benchmarks/skillsbench/__init__.py @@ -0,0 +1 @@ +# SkillsBench evaluation benchmark diff --git a/benchmarks/skillsbench/config.py b/benchmarks/skillsbench/config.py new file mode 100644 index 000000000..8b55a92b0 --- /dev/null +++ b/benchmarks/skillsbench/config.py @@ -0,0 +1,16 @@ +"""SkillsBench configuration defaults.""" + +# Default inference settings (only include values actually used by argparse) +INFER_DEFAULTS = { + "dataset": "benchflow/skillsbench", + "output_dir": "./evaluation_outputs", + "num_workers": 1, +} + +# Harbor configuration defaults +HARBOR_DEFAULTS = { + # Harbor executable + "harbor_executable": "harbor", + # Default agent name for openhands-sdk + "agent_name": "openhands-sdk", +} diff --git a/benchmarks/skillsbench/eval_infer.py b/benchmarks/skillsbench/eval_infer.py new file mode 100644 index 000000000..f55a91736 --- /dev/null +++ b/benchmarks/skillsbench/eval_infer.py @@ -0,0 +1,280 @@ +#!/usr/bin/env python3 +"""SkillsBench Evaluation Script. + +This script processes SkillsBench output and generates evaluation reports. +It reads the output.jsonl produced by run_infer, aggregates results, +and writes a summary report. + +Usage: + uv run skillsbench-eval +""" + +import argparse +import json +import sys +from pathlib import Path + +from benchmarks.utils.laminar import LaminarService +from benchmarks.utils.report_costs import generate_cost_report +from openhands.sdk import get_logger + + +logger = get_logger(__name__) + + +def process_skillsbench_results( + input_file: str, + output_file: str, +) -> dict: + """Process SkillsBench output.jsonl and generate evaluation report. + + SkillsBench format (from harbor conversion): + { + "instance_id": "task_id", + "test_result": { + "trajectory_path": "...", + "total_steps": N, + "final_metrics": {...}, + "passed": true/false # May be populated by harbor grading + }, + "instruction": "...", + "history": [...] + } + + Report format (similar to SWE-Bench): + { + "total_instances": N, + "submitted_instances": N, + "completed_instances": N, + "incomplete_instances": N, + "resolved_instances": N, + "unresolved_instances": N, + "error_instances": N, + "submitted_ids": [...], + "completed_ids": [...], + "incomplete_ids": [...], + "resolved_ids": [...], + "unresolved_ids": [...] 
+ } + """ + logger.info(f"Processing {input_file} to generate report: {output_file}") + + # Use sets for O(1) lookup and automatic deduplication + # Convert to sorted lists only when building final report + completed_ids: set[str] = set() + resolved_ids: set[str] = set() + unresolved_ids: set[str] = set() + incomplete_ids: set[str] = set() + error_ids: set[str] = set() + + # Aggregate metrics + total_cost_usd = 0.0 + total_prompt_tokens = 0 + total_completion_tokens = 0 + + with open(input_file) as infile: + for line_num, line in enumerate(infile, 1): + try: + line = line.strip() + if not line: + continue + + data = json.loads(line) + + # Extract required fields + instance_id = data.get("instance_id") + if not instance_id: + logger.warning(f"Line {line_num}: Missing instance_id") + continue + + if instance_id in completed_ids: + logger.warning( + f"Line {line_num}: Duplicate instance_id {instance_id}" + ) + continue + + # Check for errors + error = data.get("error") + if error: + error_ids.add(instance_id) + incomplete_ids.add(instance_id) + continue + + # Extract test result + test_result = data.get("test_result", {}) + + # Check if task passed (harbor may include this) + passed = test_result.get("passed") + # If not explicitly set, we mark as completed but ungraded + is_resolved = passed is True + + # Add to completed instances + completed_ids.add(instance_id) + + if is_resolved: + resolved_ids.add(instance_id) + else: + unresolved_ids.add(instance_id) + + # Aggregate metrics + # Use explicit None check to handle zero values correctly + # (using `or` would incorrectly fallback when value is 0) + metrics = data.get("metrics", {}) + final_metrics = test_result.get("final_metrics", {}) + + cost = metrics.get("total_cost_usd") + if cost is None: + cost = final_metrics.get("total_cost_usd", 0.0) + + prompt_tokens = metrics.get("total_prompt_tokens") + if prompt_tokens is None: + prompt_tokens = final_metrics.get("total_prompt_tokens", 0) + + completion_tokens = metrics.get("total_completion_tokens") + if completion_tokens is None: + completion_tokens = final_metrics.get("total_completion_tokens", 0) + + # After the None checks above, these values are guaranteed to be non-None + total_cost_usd += cost + total_prompt_tokens += prompt_tokens + total_completion_tokens += completion_tokens + + except json.JSONDecodeError as e: + logger.error(f"Line {line_num}: Invalid JSON - {e}") + except Exception as e: + logger.error(f"Line {line_num}: Unexpected error - {e}") + + # Check for separate error file (used in manual workflows where errors + # are extracted to a separate file for analysis/retry) + error_path = Path(input_file).with_name(f"{Path(input_file).stem}_errors.jsonl") + if error_path.exists(): + with open(error_path) as error_file: + for line_num, line in enumerate(error_file, 1): + try: + line = line.strip() + if not line: + continue + + data = json.loads(line) + instance_id = data.get("instance_id") + if not instance_id: + continue + if instance_id in completed_ids or instance_id in incomplete_ids: + continue + + incomplete_ids.add(instance_id) + error_ids.add(instance_id) + except (json.JSONDecodeError, Exception) as e: + logger.error(f"Error file line {line_num}: {e}") + + submitted_ids = completed_ids | incomplete_ids + + # Generate report - convert sets to sorted lists for consistent output + report = { + "total_instances": len(submitted_ids), + "submitted_instances": len(submitted_ids), + "completed_instances": len(completed_ids), + "incomplete_instances": len(incomplete_ids), + 
"resolved_instances": len(resolved_ids), + "unresolved_instances": len(unresolved_ids), + "error_instances": len(error_ids), + "submitted_ids": sorted(submitted_ids), + "completed_ids": sorted(completed_ids), + "incomplete_ids": sorted(incomplete_ids), + "resolved_ids": sorted(resolved_ids), + "unresolved_ids": sorted(unresolved_ids), + "error_ids": sorted(error_ids), + # Aggregate metrics + "aggregate_metrics": { + "total_cost_usd": total_cost_usd, + "total_prompt_tokens": total_prompt_tokens, + "total_completion_tokens": total_completion_tokens, + }, + } + + # Write report + with open(output_file, "w") as outfile: + json.dump(report, outfile, indent=4) + + logger.info("Report generated successfully:") + logger.info(f" Total instances: {report['total_instances']}") + logger.info(f" Completed instances: {report['completed_instances']}") + logger.info(f" Resolved instances: {report['resolved_instances']}") + logger.info(f" Unresolved instances: {report['unresolved_instances']}") + logger.info(f" Error instances: {report['error_instances']}") + if report["completed_instances"] > 0: + logger.info( + f" Success rate: " + f"{report['resolved_instances'] / report['completed_instances'] * 100:.1f}%" + ) + logger.info(f" Total cost: ${total_cost_usd:.4f}") + + return report + + +def main() -> None: + """Main entry point for the script.""" + parser = argparse.ArgumentParser( + description="Process SkillsBench output and generate evaluation report", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + uv run skillsbench-eval output.jsonl + uv run skillsbench-eval /path/to/output.jsonl + """, + ) + + parser.add_argument("input_file", help="Path to the SkillsBench output.jsonl file") + parser.add_argument( + "--output-file", + help="Output file for report (default: input_file with .report.json extension)", + ) + + args = parser.parse_args() + + # Validate input file + input_file = Path(args.input_file) + if not input_file.exists(): + logger.error(f"Input file does not exist: {input_file}") + sys.exit(1) + + if not input_file.suffix == ".jsonl": + logger.warning(f"Input file does not have .jsonl extension: {input_file}") + + # Determine output file + if args.output_file: + output_file = Path(args.output_file) + else: + output_file = input_file.with_suffix(".report.json") + + logger.info(f"Input file: {input_file}") + logger.info(f"Output file: {output_file}") + + try: + # Process results and generate report + process_skillsbench_results( + str(input_file), + str(output_file), + ) + except Exception as e: + logger.error(f"Script failed: {e}") + sys.exit(1) + + # Non-critical telemetry and reporting - wrap in try/except so expensive + # multi-hour evaluations don't fail at the telemetry step after completing + try: + LaminarService.get().update_evaluation_scores(str(input_file), str(output_file)) + except Exception as e: + logger.warning(f"Laminar update failed (non-critical): {e}") + + try: + generate_cost_report(str(input_file)) + except Exception as e: + logger.warning(f"Cost report generation failed (non-critical): {e}") + + logger.info("Script completed successfully!") + print(json.dumps({"report_json": str(output_file)})) + + +if __name__ == "__main__": + main() diff --git a/benchmarks/skillsbench/run_infer.py b/benchmarks/skillsbench/run_infer.py new file mode 100644 index 000000000..a8afa7281 --- /dev/null +++ b/benchmarks/skillsbench/run_infer.py @@ -0,0 +1,467 @@ +"""SkillsBench inference script using Harbor with openhands-sdk agent. 
+ +This script runs SkillsBench evaluation using Harbor as the harness +and openhands-sdk as the agent. Results are saved in a format compatible +with the standard evaluation pipeline. + +Usage: + uv run skillsbench-infer --dataset benchflow/skillsbench +""" + +import argparse +import json +import os +import subprocess +import sys +from datetime import datetime, timezone +from pathlib import Path + +from pydantic import SecretStr + +from benchmarks.skillsbench.config import HARBOR_DEFAULTS, INFER_DEFAULTS +from benchmarks.utils.evaluation_utils import construct_eval_output_dir +from benchmarks.utils.report_costs import generate_cost_report +from openhands.sdk import LLM, get_logger + + +logger = get_logger(__name__) + +# Output filename for results +OUTPUT_FILENAME = "output.jsonl" + + +def check_harbor_installed() -> bool: + """Check if harbor CLI is installed and available.""" + harbor_exe = HARBOR_DEFAULTS["harbor_executable"] + try: + result = subprocess.run( + [harbor_exe, "--version"], + capture_output=True, + text=True, + timeout=10, + ) + return result.returncode == 0 + except (FileNotFoundError, subprocess.TimeoutExpired): + return False + + +def run_harbor_evaluation( + llm: LLM, + dataset: str, + output_dir: str, + num_workers: int = 1, + task_ids: list[str] | None = None, + n_limit: int | None = None, +) -> Path: + """Run harbor evaluation with openhands-sdk agent. + + Args: + llm: LLM configuration for the agent. + dataset: Harbor dataset name (e.g., benchflow/skillsbench). + output_dir: Directory to store output files. + num_workers: Number of parallel workers. + task_ids: Optional list of specific task IDs to run. + n_limit: Optional maximum number of dataset tasks to run. + + Returns: + Path to the harbor output directory. + """ + harbor_output_dir = Path(output_dir) / "harbor_output" + harbor_output_dir.mkdir(parents=True, exist_ok=True) + harbor_exe = HARBOR_DEFAULTS["harbor_executable"] + + # Build harbor command using harbor CLI flags. + # Use absolute path for --jobs-dir to avoid CWD-relative path issues. + cmd = [ + harbor_exe, + "run", + "-d", + dataset, + "-a", + HARBOR_DEFAULTS["agent_name"], + "-m", + llm.model, + "--jobs-dir", + str(harbor_output_dir.resolve()), + "--n-concurrent", + str(num_workers), + ] + + # Pass LLM credentials as agent environment variables + if llm.api_key: + api_key = ( + llm.api_key.get_secret_value() + if isinstance(llm.api_key, SecretStr) + else llm.api_key + ) + cmd.extend(["--ae", f"LLM_API_KEY={api_key}"]) + if llm.base_url: + cmd.extend(["--ae", f"LLM_BASE_URL={llm.base_url}"]) + + # Add specific task names if provided + if task_ids: + for task_id in task_ids: + cmd.extend(["--include-task-name", task_id]) + + if n_limit is not None: + cmd.extend(["--n-tasks", str(n_limit)]) + + logger.info(f"Running harbor command: {' '.join(cmd)}") + logger.info(f"Output directory: {harbor_output_dir}") + + # harbor's openhands-sdk agent reads LLM credentials from the host process + # environment (os.environ), not from --ae flags which go to the sandbox. 
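+    # For reference (illustration only), the child process therefore sees the
+    # equivalent of:
+    #   LLM_API_KEY=<key> LLM_BASE_URL=<base_url> harbor run ...
+    # with the values taken from the loaded LLM config.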
+ env = os.environ.copy() + if llm.api_key: + api_key = ( + llm.api_key.get_secret_value() + if isinstance(llm.api_key, SecretStr) + else llm.api_key + ) + env["LLM_API_KEY"] = api_key + if llm.base_url: + env["LLM_BASE_URL"] = llm.base_url + + try: + result = subprocess.run( + cmd, + capture_output=True, + text=True, + env=env, + ) + + if result.returncode != 0: + logger.error(f"Harbor command failed with code {result.returncode}") + logger.error(f"stdout: {result.stdout}") + logger.error(f"stderr: {result.stderr}") + raise RuntimeError(f"Harbor evaluation failed: {result.stderr}") + + logger.info("Harbor evaluation completed successfully") + logger.info(f"stdout: {result.stdout}") + + except FileNotFoundError: + raise RuntimeError( + "Harbor CLI not found. Please install harbor: pip install harbor" + ) + + return harbor_output_dir + + +def _find_job_dir(harbor_output_dir: Path) -> Path: + """Find the harbor job directory (timestamp-named) inside the output dir.""" + # Harbor creates a timestamp-named subdirectory (e.g., 2026-03-07__16-08-47) + # containing result.json and trial subdirectories + candidates = [ + d + for d in harbor_output_dir.iterdir() + if d.is_dir() and (d / "result.json").exists() + ] + if not candidates: + raise RuntimeError( + f"No harbor job directory found in {harbor_output_dir}. " + f"Expected a timestamp-named directory containing result.json." + ) + # Use the most recent job directory if multiple exist + return sorted(candidates)[-1] + + +def convert_harbor_to_eval_output( + harbor_output_dir: Path, + eval_output_path: Path, +) -> None: + """Convert harbor output to evaluation output format. + + Harbor stores trial results in a job directory structured as: + harbor_output/TIMESTAMP/TRIAL_NAME/result.json + + Each trial's result.json contains task_name, verifier_result, agent_result, + timing info, and exception details. + + Args: + harbor_output_dir: Path to harbor output directory. + eval_output_path: Path to write the converted output.jsonl. + """ + logger.info(f"Converting harbor output from {harbor_output_dir}") + + job_dir = _find_job_dir(harbor_output_dir) + logger.info(f"Using harbor job directory: {job_dir}") + + # Find trial result files (each trial dir has a result.json) + result_files = list(job_dir.glob("*/result.json")) + # Exclude the job-level result.json + result_files = [f for f in result_files if f.parent != job_dir] + + if not result_files: + raise RuntimeError( + f"No trial result files found in {job_dir}. " + f"Expected result.json files in trial subdirectories." 
+ ) + + logger.info(f"Found {len(result_files)} trial results in {job_dir}") + + results: list[dict] = [] + errors: list[dict] = [] + + for result_file in result_files: + try: + with open(result_file) as f: + trial = json.load(f) + + instance_id = trial.get("task_name", result_file.parent.name) + + # Check for exceptions + if trial.get("exception_info"): + errors.append( + { + "instance_id": instance_id, + "error": str(trial["exception_info"]), + "test_result": {}, + } + ) + continue + + # Extract verifier results + verifier_result = trial.get("verifier_result", {}) + rewards = verifier_result.get("rewards", {}) + passed = rewards.get("reward", 0.0) > 0 + + # Extract agent metrics + agent_result = trial.get("agent_result", {}) + + eval_entry = { + "instance_id": instance_id, + "test_result": { + "trial_name": trial.get("trial_name"), + "trial_uri": trial.get("trial_uri"), + "rewards": rewards, + "passed": passed, + }, + "instruction": "", + "error": None, + "history": [], + "metrics": { + "total_prompt_tokens": agent_result.get("n_input_tokens") or 0, + "total_completion_tokens": ( + agent_result.get("n_output_tokens") or 0 + ), + "total_cost_usd": agent_result.get("cost_usd") or 0.0, + }, + } + results.append(eval_entry) + logger.info( + f"Processed trial {instance_id}: reward={rewards.get('reward', 'N/A')}" + ) + + except (json.JSONDecodeError, OSError) as e: + logger.error(f"Failed to process result file {result_file}: {e}") + errors.append( + { + "instance_id": result_file.parent.name, + "error": str(e), + "test_result": {}, + } + ) + + if not results and not errors: + raise RuntimeError(f"No trials processed from {harbor_output_dir}") + + if not results: + logger.warning( + f"All {len(errors)} trials failed in {harbor_output_dir}; " + "writing error entries for downstream reporting" + ) + + # Write results to output.jsonl + with open(eval_output_path, "w") as f: + for entry in results: + f.write(json.dumps(entry) + "\n") + for entry in errors: + f.write(json.dumps(entry) + "\n") + + logger.info( + f"Wrote {len(results)} successful + {len(errors)} failed entries " + f"to {eval_output_path}" + ) + + +def load_task_ids_from_file(filepath: str) -> list[str]: + """Load task IDs from a text file (one per line).""" + task_ids = [] + with open(filepath) as f: + for line in f: + line = line.strip() + if line and not line.startswith("#"): + task_ids.append(line) + return task_ids + + +def main() -> None: + """Main entry point for skillsbench inference.""" + parser = argparse.ArgumentParser( + description="Run SkillsBench evaluation with openhands-sdk via Harbor", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Run full skillsbench evaluation + uv run skillsbench-infer .llm_config/claude.json + + # Run specific tasks + uv run skillsbench-infer .llm_config/claude.json --select tasks.txt + + # Run with custom dataset version + uv run skillsbench-infer .llm_config/claude.json --dataset benchflow/skillsbench@1.0 + """, + ) + + parser.add_argument( + "llm_config_path", + type=str, + help="Path to JSON LLM configuration file", + ) + parser.add_argument( + "--dataset", + type=str, + default=INFER_DEFAULTS["dataset"], + help="Harbor dataset name (e.g., benchflow/skillsbench)", + ) + parser.add_argument( + "--output-dir", + type=str, + default=INFER_DEFAULTS["output_dir"], + help="Base output directory for evaluation results", + ) + parser.add_argument( + "--num-workers", + type=int, + default=INFER_DEFAULTS["num_workers"], + help="Number of parallel workers", + ) + 
parser.add_argument( + "--n-limit", + type=int, + help="Maximum number of dataset tasks to run after Harbor filtering", + ) + parser.add_argument( + "--select", + type=str, + help="Path to text file containing task IDs to run (one per line)", + ) + parser.add_argument( + "--task-id", + type=str, + action="append", + help="Specific task ID to run (can be specified multiple times)", + ) + parser.add_argument( + "--note", + type=str, + help="Optional note for the evaluation run", + ) + parser.add_argument( + "--skip-harbor", + action="store_true", + help="Skip running harbor and only convert existing results", + ) + + args = parser.parse_args() + + # Validate LLM config + if not os.path.isfile(args.llm_config_path): + logger.error(f"LLM config file does not exist: {args.llm_config_path}") + sys.exit(1) + + with open(args.llm_config_path) as f: + llm_config = f.read() + llm = LLM.model_validate_json(llm_config) + logger.info(f"Using LLM: {llm.model}") + + # Check harbor installation + if not args.skip_harbor and not check_harbor_installed(): + logger.error( + "Harbor CLI is not installed. Please install it:\n" + " pip install harbor\n" + " # or\n" + " uv pip install harbor" + ) + sys.exit(1) + + # Construct output directory + dataset_description = args.dataset.replace("/", "__").replace("@", "-") + structured_output_dir = construct_eval_output_dir( + base_dir=args.output_dir, + dataset_name=dataset_description, + model_name=llm.model, + max_iterations=100, # Not directly used but required for path construction + eval_note=args.note, + ) + + logger.info(f"Output directory: {structured_output_dir}") + os.makedirs(structured_output_dir, exist_ok=True) + + # Save metadata + metadata = { + "llm": llm.model_dump_json(), + "dataset": args.dataset, + "timestamp": datetime.now(timezone.utc).isoformat(), + "harbor_agent": HARBOR_DEFAULTS["agent_name"], + "note": args.note, + } + metadata_path = Path(structured_output_dir) / "metadata.json" + with open(metadata_path, "w") as f: + json.dump(metadata, f, indent=2) + + # Collect task IDs if specified + task_ids: list[str] | None = None + if args.select: + loaded_ids = load_task_ids_from_file(args.select) + task_ids = loaded_ids + logger.info(f"Loaded {len(loaded_ids)} task IDs from {args.select}") + elif args.task_id: + task_ids = list(args.task_id) + logger.info(f"Running {len(task_ids)} specified task IDs") + + output_path = Path(structured_output_dir) / OUTPUT_FILENAME + + if not args.skip_harbor: + # Run harbor evaluation + try: + harbor_output_dir = run_harbor_evaluation( + llm=llm, + dataset=args.dataset, + output_dir=structured_output_dir, + num_workers=args.num_workers, + task_ids=task_ids, + n_limit=args.n_limit, + ) + + # Convert harbor output to standard format + convert_harbor_to_eval_output( + harbor_output_dir=harbor_output_dir, + eval_output_path=output_path, + ) + + except Exception as e: + logger.error(f"Evaluation failed: {e}") + sys.exit(1) + else: + # Skip harbor, just convert existing results + harbor_output_dir = Path(structured_output_dir) / "harbor_output" + if harbor_output_dir.exists(): + convert_harbor_to_eval_output( + harbor_output_dir=harbor_output_dir, + eval_output_path=output_path, + ) + else: + logger.error(f"No harbor output found at {harbor_output_dir}") + sys.exit(1) + + # Generate cost report + if output_path.exists(): + generate_cost_report(str(output_path)) + + logger.info("SkillsBench inference completed!") + print(json.dumps({"output_json": str(output_path)})) + + +if __name__ == "__main__": + main() diff --git 
a/benchmarks/utils/report_costs.py b/benchmarks/utils/report_costs.py index 8f38909f3..7a21a3831 100755 --- a/benchmarks/utils/report_costs.py +++ b/benchmarks/utils/report_costs.py @@ -48,7 +48,9 @@ def extract_accumulated_cost(jsonl_data: List[Optional[Dict]]) -> float: if entry is None: continue metrics = entry.get("metrics") or {} - accumulated_cost = metrics.get("accumulated_cost") + accumulated_cost = metrics.get("accumulated_cost") or metrics.get( + "total_cost_usd" + ) if accumulated_cost is not None: total_cost += float(accumulated_cost) diff --git a/pyproject.toml b/pyproject.toml index 11773729a..3fdcfcbdd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -81,6 +81,8 @@ hybridgym-issuelocalize-infer = "benchmarks.hybridgym_issuelocalize.run_infer:ma hybridgym-issuelocalize-eval = "benchmarks.hybridgym_issuelocalize.eval_infer:main" swesmith-infer = "benchmarks.swesmith.run_infer:main" swesmith-eval = "benchmarks.swesmith.eval_infer:main" +skillsbench-infer = "benchmarks.skillsbench.run_infer:main" +skillsbench-eval = "benchmarks.skillsbench.eval_infer:main" [build-system] requires = ["setuptools>=61.0", "wheel"] From 90e8c9f63f8460f8b3ca46cd87467ec62383fb6c Mon Sep 17 00:00:00 2001 From: Chujun Tao Date: Sun, 5 Apr 2026 19:44:33 -0400 Subject: [PATCH 07/12] add skillsbench tests --- tests/test_skillsbench_eval_infer.py | 125 +++++++++++++++ tests/test_skillsbench_run_infer.py | 221 +++++++++++++++++++++++++++ 2 files changed, 346 insertions(+) create mode 100644 tests/test_skillsbench_eval_infer.py create mode 100644 tests/test_skillsbench_run_infer.py diff --git a/tests/test_skillsbench_eval_infer.py b/tests/test_skillsbench_eval_infer.py new file mode 100644 index 000000000..56d54f27a --- /dev/null +++ b/tests/test_skillsbench_eval_infer.py @@ -0,0 +1,125 @@ +"""Tests for SkillsBench eval_infer module.""" + +import json +from pathlib import Path + +from benchmarks.skillsbench.eval_infer import process_skillsbench_results + + +class TestProcessSkillsbenchResults: + """Tests for the process_skillsbench_results function.""" + + def test_empty_input(self, tmp_path: Path) -> None: + """Test processing empty input file.""" + input_file = tmp_path / "empty.jsonl" + output_file = tmp_path / "empty.report.json" + input_file.write_text("") + + result = process_skillsbench_results(str(input_file), str(output_file)) + + assert result["total_instances"] == 0 + assert result["completed_instances"] == 0 + assert result["resolved_instances"] == 0 + + def test_resolved_instance(self, tmp_path: Path) -> None: + """Test processing a resolved (passed=True) instance.""" + input_file = tmp_path / "resolved.jsonl" + output_file = tmp_path / "resolved.report.json" + + entry = { + "instance_id": "benchflow/weighted-gdp-calc", + "test_result": {"passed": True, "rewards": {"reward": 1.0}}, + "error": None, + } + input_file.write_text(json.dumps(entry) + "\n") + + result = process_skillsbench_results(str(input_file), str(output_file)) + + assert result["resolved_instances"] == 1 + assert result["unresolved_instances"] == 0 + assert "benchflow/weighted-gdp-calc" in result["resolved_ids"] + + def test_unresolved_instance(self, tmp_path: Path) -> None: + """Test processing an unresolved (passed=False) instance.""" + input_file = tmp_path / "unresolved.jsonl" + output_file = tmp_path / "unresolved.report.json" + + entry = { + "instance_id": "benchflow/task-1", + "test_result": {"passed": False, "rewards": {"reward": 0.0}}, + "error": None, + } + input_file.write_text(json.dumps(entry) + "\n") + + result = 
process_skillsbench_results(str(input_file), str(output_file)) + + assert result["resolved_instances"] == 0 + assert result["unresolved_instances"] == 1 + + def test_instance_with_error(self, tmp_path: Path) -> None: + """Test processing an instance that errored.""" + input_file = tmp_path / "error.jsonl" + output_file = tmp_path / "error.report.json" + + entry = { + "instance_id": "benchflow/error-task", + "test_result": {}, + "error": "ValueError: LLM_API_KEY environment variable must be set", + } + input_file.write_text(json.dumps(entry) + "\n") + + result = process_skillsbench_results(str(input_file), str(output_file)) + + assert result["error_instances"] == 1 + assert result["incomplete_instances"] == 1 + assert result["completed_instances"] == 0 + assert "benchflow/error-task" in result["error_ids"] + + def test_multiple_instances(self, tmp_path: Path) -> None: + """Test processing multiple instances with mixed results.""" + input_file = tmp_path / "multi.jsonl" + output_file = tmp_path / "multi.report.json" + + entries = [ + { + "instance_id": "benchflow/task-1", + "test_result": {"passed": True}, + "error": None, + }, + { + "instance_id": "benchflow/task-2", + "test_result": {"passed": False}, + "error": None, + }, + {"instance_id": "benchflow/task-3", "test_result": {}, "error": "Timeout"}, + ] + input_file.write_text("\n".join(json.dumps(e) for e in entries) + "\n") + + result = process_skillsbench_results(str(input_file), str(output_file)) + + assert result["total_instances"] == 3 + assert result["completed_instances"] == 2 + assert result["resolved_instances"] == 1 + assert result["unresolved_instances"] == 1 + assert result["error_instances"] == 1 + + def test_report_file_written(self, tmp_path: Path) -> None: + """Test that report file is written correctly.""" + input_file = tmp_path / "input.jsonl" + output_file = tmp_path / "output.report.json" + + entry = { + "instance_id": "benchflow/task-1", + "test_result": {"passed": True}, + "error": None, + } + input_file.write_text(json.dumps(entry) + "\n") + + process_skillsbench_results(str(input_file), str(output_file)) + + assert output_file.exists() + with open(output_file) as f: + report = json.load(f) + assert "total_instances" in report + assert "resolved_ids" in report + assert "aggregate_metrics" in report diff --git a/tests/test_skillsbench_run_infer.py b/tests/test_skillsbench_run_infer.py new file mode 100644 index 000000000..5f8452cb3 --- /dev/null +++ b/tests/test_skillsbench_run_infer.py @@ -0,0 +1,221 @@ +"""Tests for SkillsBench run_infer module.""" + +import json +from pathlib import Path + +import pytest + +from benchmarks.skillsbench.config import INFER_DEFAULTS +from benchmarks.skillsbench.run_infer import ( + convert_harbor_to_eval_output, + run_harbor_evaluation, +) +from openhands.sdk import LLM + + +class TestRunHarborEvaluation: + """Tests for building Harbor invocation arguments.""" + + def test_default_dataset_matches_harbor_registry(self) -> None: + """Test that the default dataset name matches Harbor's published registry.""" + assert INFER_DEFAULTS["dataset"] == "benchflow/skillsbench" + + def test_run_harbor_evaluation_passes_filters_and_limits( + self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch + ) -> None: + """Test Harbor command includes task filters and n-limit.""" + captured: dict[str, list[str]] = {} + + def fake_run(cmd: list[str], capture_output: bool, text: bool, env: dict): + captured["cmd"] = cmd + return type( + "Completed", + (), + {"returncode": 0, "stdout": "ok", "stderr": ""}, + 
)() + + monkeypatch.setattr("benchmarks.skillsbench.run_infer.subprocess.run", fake_run) + + harbor_output_dir = run_harbor_evaluation( + llm=LLM( + model="litellm_proxy/test-model", + api_key="test-key", + base_url="https://proxy.example.com", + ), + dataset=INFER_DEFAULTS["dataset"], + output_dir=str(tmp_path), + num_workers=2, + task_ids=["benchflow/task-a", "benchflow/task-b"], + n_limit=3, + ) + + expected_output_dir = tmp_path / "harbor_output" + assert harbor_output_dir == expected_output_dir + + cmd = captured["cmd"] + assert cmd[:8] == [ + "harbor", + "run", + "-d", + "benchflow/skillsbench", + "-a", + "openhands-sdk", + "-m", + "litellm_proxy/test-model", + ] + assert "--jobs-dir" in cmd + assert str(expected_output_dir.resolve()) in cmd + assert cmd.count("--include-task-name") == 2 + assert "benchflow/task-a" in cmd + assert "benchflow/task-b" in cmd + assert cmd[cmd.index("--n-concurrent") + 1] == "2" + assert cmd[cmd.index("--n-tasks") + 1] == "3" + + def test_llm_credentials_passed_via_env( + self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch + ) -> None: + """Test that LLM credentials are passed via subprocess env, not --ae flags.""" + captured: dict = {} + + def fake_run(cmd: list[str], capture_output: bool, text: bool, env: dict): + captured["cmd"] = cmd + captured["env"] = env + return type( + "Completed", + (), + {"returncode": 0, "stdout": "ok", "stderr": ""}, + )() + + monkeypatch.setattr("benchmarks.skillsbench.run_infer.subprocess.run", fake_run) + + run_harbor_evaluation( + llm=LLM( + model="test-model", + api_key="my-secret-key", + base_url="https://my-proxy.example.com", + ), + dataset=INFER_DEFAULTS["dataset"], + output_dir=str(tmp_path), + ) + + assert captured["env"]["LLM_API_KEY"] == "my-secret-key" + assert captured["env"]["LLM_BASE_URL"] == "https://my-proxy.example.com" + + +class TestConvertHarborToEvalOutput: + """Tests for convert_harbor_to_eval_output function.""" + + def _create_harbor_structure( + self, tmp_path: Path, trials: list[tuple[str, dict]] + ) -> Path: + """Create a mock Harbor output structure.""" + harbor_dir = tmp_path / "harbor_output" + job_dir = harbor_dir / "2026-01-01__00-00-00" + job_dir.mkdir(parents=True) + (job_dir / "result.json").write_text(json.dumps({"id": "test-job"})) + + for trial_name, trial_result in trials: + trial_dir = job_dir / trial_name + trial_dir.mkdir() + (trial_dir / "result.json").write_text(json.dumps(trial_result)) + + return harbor_dir + + def test_successful_trial_parsing(self, tmp_path: Path) -> None: + """Test successful parsing of harbor trial result.""" + trial_result = { + "task_name": "benchflow/weighted-gdp-calc", + "trial_name": "weighted-gdp-calc__abc123", + "trial_uri": "file:///path/to/trial", + "agent_result": { + "n_input_tokens": 1000, + "n_output_tokens": 200, + "cost_usd": 0.05, + }, + "verifier_result": {"rewards": {"reward": 1.0}}, + "exception_info": None, + } + + harbor_dir = self._create_harbor_structure( + tmp_path, [("weighted-gdp-calc__abc123", trial_result)] + ) + output_file = tmp_path / "output.jsonl" + + convert_harbor_to_eval_output(harbor_dir, output_file) + + assert output_file.exists() + with open(output_file) as f: + entries = [json.loads(line) for line in f] + + assert len(entries) == 1 + assert entries[0]["instance_id"] == "benchflow/weighted-gdp-calc" + assert entries[0]["test_result"]["passed"] is True + assert entries[0]["metrics"]["total_cost_usd"] == 0.05 + + def test_failed_trial(self, tmp_path: Path) -> None: + """Test parsing of a trial with reward 0.""" + 
trial_result = { + "task_name": "benchflow/task-1", + "trial_name": "task-1__xyz", + "agent_result": { + "n_input_tokens": None, + "n_output_tokens": None, + "cost_usd": None, + }, + "verifier_result": {"rewards": {"reward": 0.0}}, + "exception_info": None, + } + + harbor_dir = self._create_harbor_structure( + tmp_path, [("task-1__xyz", trial_result)] + ) + output_file = tmp_path / "output.jsonl" + convert_harbor_to_eval_output(harbor_dir, output_file) + + with open(output_file) as f: + entries = [json.loads(line) for line in f] + + assert entries[0]["test_result"]["passed"] is False + assert entries[0]["metrics"]["total_cost_usd"] == 0.0 + + def test_trial_with_exception(self, tmp_path: Path) -> None: + """Test that exception trials are written as error entries.""" + trial_result = { + "task_name": "benchflow/error-task", + "trial_name": "error-task__err", + "agent_result": {}, + "verifier_result": {}, + "exception_info": {"type": "ValueError", "message": "LLM_API_KEY not set"}, + } + + harbor_dir = self._create_harbor_structure( + tmp_path, [("error-task__err", trial_result)] + ) + output_file = tmp_path / "output.jsonl" + convert_harbor_to_eval_output(harbor_dir, output_file) + + with open(output_file) as f: + entries = [json.loads(line) for line in f] + + assert len(entries) == 1 + assert entries[0]["instance_id"] == "benchflow/error-task" + assert entries[0]["error"] is not None + assert entries[0]["test_result"] == {} + + def test_missing_job_directory(self, tmp_path: Path) -> None: + """Test handling when no job directory exists.""" + harbor_dir = tmp_path / "harbor_output" + harbor_dir.mkdir() + + with pytest.raises(RuntimeError, match="No harbor job directory found"): + convert_harbor_to_eval_output(harbor_dir, tmp_path / "output.jsonl") + + def test_empty_job_directory(self, tmp_path: Path) -> None: + """Test handling of harbor job dir with no trial subdirs.""" + harbor_dir = tmp_path / "harbor_output" + job_dir = harbor_dir / "2026-01-01__00-00-00" + job_dir.mkdir(parents=True) + (job_dir / "result.json").write_text(json.dumps({"id": "test"})) + + with pytest.raises(RuntimeError, match="No trial result files found"): + convert_harbor_to_eval_output(harbor_dir, tmp_path / "output.jsonl") From 4d69c04ac4943f099ad823682edf852f61719fcb Mon Sep 17 00:00:00 2001 From: Chujun Tao Date: Wed, 22 Apr 2026 20:47:30 -0400 Subject: [PATCH 08/12] feat(skillsbench): migrate harness from Harbor to benchflow 0.3.0 Switch the SkillsBench evaluation harness from Harbor/openhands-sdk to benchflow 0.3.0 with the native openhands ACP agent. 
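For reference, a minimal sketch of the two CLI forms the new wrapper can emit
(flags as named in the change list below; `config.yaml` stands in for the
generated job config file):

    bench eval create -f config.yaml        # current `bench` CLI
    benchflow job --config config.yaml      # legacy `benchflow` binary fallback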
Key changes: - Replace Harbor-specific logic with benchflow CLI invocation (`bench eval create -f config.yaml` / legacy `benchflow job --config`) - Add sparse-checkout task download to avoid cloning the full skillsbench repo - Fix metrics extraction: benchflow 0.3.0 result.json omits cost/token fields; now reads from agent/trajectory.json (harbor-format) or parses agent/openhands.txt stdout (ACP agent) - Fix timestamp detection with regex (_TIMESTAMP_RE) to correctly identify benchflow 0.3.0 job dirs (YYYY-MM-DD__HH-MM-SS) vs plain task dirs - Fix openhands install failure on Ubuntu 24.04 (PEP 668) by injecting PIP_BREAK_SYSTEM_PACKAGES=1 into agent_env - Add provider-specific env var injection for direct Gemini/Anthropic models - Update README and config to reflect benchflow harness Co-Authored-By: Claude Sonnet 4.6 --- .gitignore | 1 + benchmarks/skillsbench/README.md | 42 +- benchmarks/skillsbench/config.py | 11 +- benchmarks/skillsbench/run_infer.py | 655 ++++++++++++++++++--------- tests/test_skillsbench_eval_infer.py | 17 - tests/test_skillsbench_run_infer.py | 442 +++++++++++++----- uv.lock | 50 +- vendor/software-agent-sdk | 2 +- 8 files changed, 824 insertions(+), 396 deletions(-) diff --git a/.gitignore b/.gitignore index 459fad588..9164fd12b 100644 --- a/.gitignore +++ b/.gitignore @@ -216,4 +216,5 @@ workspace/ # Evaluation outputs eval_outputs/ +evaluation_outputs/ builds/ diff --git a/benchmarks/skillsbench/README.md b/benchmarks/skillsbench/README.md index 60ff73652..21339842c 100644 --- a/benchmarks/skillsbench/README.md +++ b/benchmarks/skillsbench/README.md @@ -1,10 +1,10 @@ # SkillsBench Evaluation -This module provides integration with [SkillsBench](https://www.skillsbench.ai/), a benchmark for evaluating AI agents on real-world skill-based tasks. The integration uses [Harbor](https://harborframework.com) as the evaluation harness with the `openhands-sdk` agent. +This module provides integration with [SkillsBench](https://www.skillsbench.ai/), a benchmark for evaluating AI agents on real-world skill-based tasks. The integration uses [benchflow](https://github.com/benchflow-ai/benchflow) as the evaluation harness with the `openhands` agent. ## Overview -SkillsBench comprises tasks across 11 domains, evaluating the efficacy of Skills augmentation in LLM-based agents.Domains contain +SkillsBench comprises tasks across 11 domains, evaluating the efficacy of Skills augmentation in LLM-based agents. Domains include: - Software engineering - Office & white collar @@ -20,23 +20,25 @@ SkillsBench comprises tasks across 11 domains, evaluating the efficacy of Skills ## Prerequisites -1. **Install Harbor**: Harbor is the official harness for running SkillsBench. +1. **Install benchflow**: benchflow is the official harness for running SkillsBench. ```bash - pip install harbor + uv tool install benchflow==0.3.0 # or - uv pip install harbor + pip install benchflow==0.3.0 + # or + uv pip install benchflow==0.3.0 ``` -2. **Docker**: Harbor requires Docker to be installed and running. +2. **Docker**: benchflow requires Docker to be installed and running. -3. **LLM API Key**: Configure your LLM provider credentials. +3. **LLM API Key**: Configure your LLM provider credentials. The benchflow `openhands` agent reads `LLM_API_KEY` and optional `LLM_BASE_URL` from the environment. 
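+   For example, when invoking benchflow directly (outside `skillsbench-infer`) the
+   variables can be exported in the shell; `skillsbench-infer` itself injects them
+   from the LLM config file described below (values here are placeholders):
+
+   ```bash
+   export LLM_API_KEY="YOUR_API_KEY"
+   export LLM_BASE_URL="https://your-proxy.example.com"  # optional, proxy setups only
+   ```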
## Usage ### Running Inference -Run the SkillsBench evaluation using the OpenHands SDK agent: +Run the SkillsBench evaluation using the `openhands` agent: ```bash # Run full evaluation @@ -62,7 +64,7 @@ Create an LLM configuration file (e.g., `.llm_config/claude.json`): ```json { "model": "anthropic/claude-sonnet-4-20250514", - "api_key": "YOUR_API_KEY" + "api_key": "YOUR_ANTHROPIC_API_KEY" } ``` @@ -99,8 +101,6 @@ Each line contains: { "instance_id": "benchflow/task-name", "test_result": { - "trial_name": "...", - "trial_uri": "...", "rewards": {"reward": 1.0}, "passed": true }, @@ -134,22 +134,21 @@ Each line contains: ## Architecture -The integration follows the Harbor agent adapter pattern: +The integration uses the benchflow CLI as the evaluation harness: -1. **Harbor Harness**: Manages task containers and lifecycle -2. **OpenHands SDK Agent**: Runs inside containers to solve tasks -3. **ATIF Trajectories**: Results stored in Agent Trajectory Interchange Format +1. **Task download**: the integration clones the SkillsBench task repo locally when the task cache is empty +2. **benchflow job**: Runs all tasks concurrently with `openhands` +3. **Result conversion**: Trial `result.json` files are converted to the standard `output.jsonl` format ```text ┌──────────────────────────────────────────────────┐ -│ Harbor Harness │ +│ benchflow job │ │ ┌────────────────────────────────────────────┐ │ -│ │ Task Container │ │ +│ │ Task Container (Docker) │ │ │ │ ┌──────────────────────────────────────┐ │ │ -│ │ │ OpenHands SDK Agent │ │ │ +│ │ │ openhands │ │ │ │ │ │ - Terminal tool │ │ │ │ │ │ - File editor tool │ │ │ -│ │ │ - Task tracker tool │ │ │ │ │ └──────────────────────────────────────┘ │ │ │ └────────────────────────────────────────────┘ │ └──────────────────────────────────────────────────┘ @@ -158,6 +157,5 @@ The integration follows the Harbor agent adapter pattern: ## References - [SkillsBench](https://www.skillsbench.ai/) - The benchmark -- [Harbor](https://harborframework.com) - The evaluation harness -- [OpenHands SDK](https://github.com/OpenHands/software-agent-sdk) - The agent SDK -- [ATIF Specification](https://github.com/laude-institute/harbor/blob/main/docs/rfcs/0001-trajectory-format.md) - Trajectory format +- [benchflow](https://github.com/benchflow-ai/benchflow) - The evaluation harness +- [benchflow CLI reference](https://github.com/benchflow-ai/benchflow/blob/main/docs/cli-reference.md) - CLI documentation diff --git a/benchmarks/skillsbench/config.py b/benchmarks/skillsbench/config.py index 8b55a92b0..4ed541ab9 100644 --- a/benchmarks/skillsbench/config.py +++ b/benchmarks/skillsbench/config.py @@ -1,16 +1,13 @@ """SkillsBench configuration defaults.""" -# Default inference settings (only include values actually used by argparse) +# Default inference settings INFER_DEFAULTS = { "dataset": "benchflow/skillsbench", "output_dir": "./evaluation_outputs", "num_workers": 1, } -# Harbor configuration defaults -HARBOR_DEFAULTS = { - # Harbor executable - "harbor_executable": "harbor", - # Default agent name for openhands-sdk - "agent_name": "openhands-sdk", +# benchflow configuration defaults +BENCHFLOW_DEFAULTS = { + "agent_name": "openhands", } diff --git a/benchmarks/skillsbench/run_infer.py b/benchmarks/skillsbench/run_infer.py index a8afa7281..2e11a100a 100644 --- a/benchmarks/skillsbench/run_infer.py +++ b/benchmarks/skillsbench/run_infer.py @@ -1,24 +1,31 @@ -"""SkillsBench inference script using Harbor with openhands-sdk agent. 
+"""SkillsBench inference script using the benchflow SDK. -This script runs SkillsBench evaluation using Harbor as the harness -and openhands-sdk as the agent. Results are saved in a format compatible +This script runs SkillsBench evaluation using `benchflow job` as the harness +and `openhands` as the default agent. Results are saved in a format compatible with the standard evaluation pipeline. Usage: - uv run skillsbench-infer --dataset benchflow/skillsbench + uv run skillsbench-infer + + # Run specific tasks + uv run skillsbench-infer --select tasks.txt """ import argparse import json import os +import re +import shutil import subprocess import sys +import tempfile from datetime import datetime, timezone from pathlib import Path +import yaml from pydantic import SecretStr -from benchmarks.skillsbench.config import HARBOR_DEFAULTS, INFER_DEFAULTS +from benchmarks.skillsbench.config import BENCHFLOW_DEFAULTS, INFER_DEFAULTS from benchmarks.utils.evaluation_utils import construct_eval_output_dir from benchmarks.utils.report_costs import generate_cost_report from openhands.sdk import LLM, get_logger @@ -26,256 +33,469 @@ logger = get_logger(__name__) -# Output filename for results +# Matches benchflow 0.3.0 job directory names: YYYY-MM-DD__HH-MM-SS +_TIMESTAMP_RE = re.compile(r"^\d{4}-\d{2}-\d{2}__\d{2}-\d{2}-\d{2}$") + +# "Total cost: $0.0487" +_COST_RE = re.compile(r"Total cost:\s*\$([0-9]+(?:\.[0-9]+)?)") +# "Tokens: ↑ input 404.21K • ... • ↓ output 7.83K" +_TOKENS_RE = re.compile(r"↑ input\s+([\d.]+)([KMB]?)\b.*?↓ output\s+([\d.]+)([KMB]?)\b") + OUTPUT_FILENAME = "output.jsonl" +TASK_REPOS = { + "skillsbench": { + "repo": "https://github.com/benchflow-ai/skillsbench.git", + "subdir": "tasks", + } +} + +_DIRECT_PROVIDER_ENV_VARS: dict[str, tuple[tuple[str, ...], str | None]] = { + "anthropic": (("ANTHROPIC_API_KEY",), "ANTHROPIC_BASE_URL"), + "gemini": (("GEMINI_API_KEY", "GOOGLE_API_KEY"), "GEMINI_BASE_URL"), + "google": (("GEMINI_API_KEY", "GOOGLE_API_KEY"), "GEMINI_BASE_URL"), + "openai": (("OPENAI_API_KEY",), "OPENAI_BASE_URL"), +} + + +def _infer_direct_provider(model: str) -> str | None: + """Infer the provider prefix for direct model names. + + Examples: + - gemini/gemini-2.5-pro -> gemini + - anthropic/claude-sonnet-4-5 -> anthropic + - litellm_proxy/anthropic/... -> None (proxy config uses LLM_* vars) + """ + if not model or model.startswith("litellm_proxy/"): + return None + if "/" in model: + provider = model.split("/", 1)[0].lower() + if provider in _DIRECT_PROVIDER_ENV_VARS: + return provider + return None + + +def _build_benchflow_agent_env(llm: LLM) -> dict[str, str]: + """Build the sandbox environment for benchflow's openhands agent. + + Only LLM-specific variables are returned — these go INTO the sandbox + container via the ``agent_env`` YAML key. The calling process inherits + the host environment normally; dumping ``os.environ`` here would leak + the entire host env into every container. 
+ """ + env: dict[str, str] = {} + api_key: str | None = None + if llm.api_key: + api_key = ( + llm.api_key.get_secret_value() + if isinstance(llm.api_key, SecretStr) + else llm.api_key + ) + env["LLM_API_KEY"] = api_key + if llm.base_url: + env["LLM_BASE_URL"] = llm.base_url + + provider = _infer_direct_provider(llm.model) + if provider and api_key: + key_vars, base_url_var = _DIRECT_PROVIDER_ENV_VARS[provider] + for var_name in key_vars: + env[var_name] = api_key + if llm.base_url and base_url_var: + env[base_url_var] = llm.base_url + + return env + + +def check_benchflow_installed() -> bool: + """Check if benchflow CLI is installed and available. + + Tries ``bench`` first (current name), then falls back to the legacy + ``benchflow`` binary. + """ + for cmd in ("bench", "benchflow"): + try: + result = subprocess.run( + [cmd, "--help"], + capture_output=True, + text=True, + timeout=10, + ) + if result.returncode == 0: + return True + except (FileNotFoundError, subprocess.TimeoutExpired): + continue + return False + -def check_harbor_installed() -> bool: - """Check if harbor CLI is installed and available.""" - harbor_exe = HARBOR_DEFAULTS["harbor_executable"] +def _resolve_task_repo(dataset: str) -> tuple[str, dict[str, str]]: + """Map a benchflow dataset name to its task repository metadata.""" + dataset_name = dataset.split("@", 1)[0].split("/")[-1] try: - result = subprocess.run( - [harbor_exe, "--version"], + return dataset_name, TASK_REPOS[dataset_name] + except KeyError as exc: + raise ValueError( + f"Unsupported SkillsBench dataset: {dataset!r}. " + f"Known datasets: {sorted(TASK_REPOS)}" + ) from exc + + +def ensure_tasks( + dataset: str, + tasks_dir: Path, + task_ids: list[str] | None = None, +) -> None: + """Download tasks for a benchflow dataset into tasks_dir. + + BenchFlow 0.3.0 does not expose ``benchflow tasks pull``, so we clone the + benchmark task repository directly when the local tasks directory is empty. + + When *task_ids* is provided, a sparse checkout is used so only the + requested task subdirectories are downloaded — much faster than a full + clone for large repos. 
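+
+    Illustrative call (a sketch; the output path and task id are examples drawn
+    from this module's defaults and tests):
+
+        ensure_tasks(
+            "benchflow/skillsbench",
+            Path("./evaluation_outputs/tasks"),
+            task_ids=["benchflow/weighted-gdp-calc"],
+        )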
+ """ + if tasks_dir.exists() and any(tasks_dir.iterdir()): + logger.info(f"Tasks already present in {tasks_dir}, skipping download") + return + + _, repo_info = _resolve_task_repo(dataset) + tasks_dir.mkdir(parents=True, exist_ok=True) + clone_dir = tasks_dir.parent / "_clone" + if clone_dir.exists(): + shutil.rmtree(clone_dir, ignore_errors=True) + + subdir = repo_info.get("subdir", "") + + if task_ids: + # Sparse checkout: only download the specific task directories + short_names = [tid.split("/")[-1] for tid in task_ids] + + cmd_clone = [ + "git", + "clone", + "--no-checkout", + "--depth", + "1", + repo_info["repo"], + str(clone_dir), + ] + logger.info(f"Sparse clone: {' '.join(cmd_clone)}") + result = subprocess.run(cmd_clone, capture_output=True, text=True) + if result.returncode != 0: + raise RuntimeError(f"task download failed: {result.stderr}") + + # Init sparse-checkout and set the desired paths + subprocess.run( + ["git", "-C", str(clone_dir), "sparse-checkout", "init", "--cone"], capture_output=True, text=True, - timeout=10, + check=True, ) - return result.returncode == 0 - except (FileNotFoundError, subprocess.TimeoutExpired): - return False + sparse_paths = [f"{subdir}/{name}" if subdir else name for name in short_names] + subprocess.run( + ["git", "-C", str(clone_dir), "sparse-checkout", "set", *sparse_paths], + capture_output=True, + text=True, + check=True, + ) + subprocess.run( + ["git", "-C", str(clone_dir), "checkout"], + capture_output=True, + text=True, + check=True, + ) + else: + # Full shallow clone + cmd = ["git", "clone", "--depth", "1", repo_info["repo"], str(clone_dir)] + logger.info(f"Downloading tasks: {' '.join(cmd)}") + result = subprocess.run(cmd, capture_output=True, text=True) + if result.returncode != 0: + logger.error(f"Failed to clone tasks: {result.stderr}") + raise RuntimeError(f"task download failed: {result.stderr}") + + try: + source_dir = clone_dir / subdir if subdir else clone_dir + + for entry in source_dir.iterdir(): + target = tasks_dir / entry.name + if entry.is_dir(): + shutil.copytree(entry, target, dirs_exist_ok=True) + else: + shutil.copy2(entry, target) + finally: + shutil.rmtree(clone_dir, ignore_errors=True) + logger.info(f"Tasks downloaded to {tasks_dir}") -def run_harbor_evaluation( + +def run_benchflow_job( llm: LLM, - dataset: str, - output_dir: str, + tasks_dir: Path, + jobs_dir: Path, num_workers: int = 1, task_ids: list[str] | None = None, - n_limit: int | None = None, ) -> Path: - """Run harbor evaluation with openhands-sdk agent. + """Run benchflow job command. Args: llm: LLM configuration for the agent. - dataset: Harbor dataset name (e.g., benchflow/skillsbench). - output_dir: Directory to store output files. - num_workers: Number of parallel workers. - task_ids: Optional list of specific task IDs to run. - n_limit: Optional maximum number of dataset tasks to run. + tasks_dir: Path to directory containing task subdirectories. + jobs_dir: Directory for benchflow job output. + num_workers: Number of parallel workers (concurrency). + task_ids: Optional list of task IDs to filter (short names, not full paths). Returns: - Path to the harbor output directory. + Path to jobs_dir. """ - harbor_output_dir = Path(output_dir) / "harbor_output" - harbor_output_dir.mkdir(parents=True, exist_ok=True) - harbor_exe = HARBOR_DEFAULTS["harbor_executable"] - - # Build harbor command using harbor CLI flags. - # Use absolute path for --jobs-dir to avoid CWD-relative path issues. 
- cmd = [ - harbor_exe, - "run", - "-d", - dataset, - "-a", - HARBOR_DEFAULTS["agent_name"], - "-m", - llm.model, - "--jobs-dir", - str(harbor_output_dir.resolve()), - "--n-concurrent", - str(num_workers), - ] - - # Pass LLM credentials as agent environment variables - if llm.api_key: - api_key = ( - llm.api_key.get_secret_value() - if isinstance(llm.api_key, SecretStr) - else llm.api_key - ) - cmd.extend(["--ae", f"LLM_API_KEY={api_key}"]) - if llm.base_url: - cmd.extend(["--ae", f"LLM_BASE_URL={llm.base_url}"]) + jobs_dir.mkdir(parents=True, exist_ok=True) + + agent_env = _build_benchflow_agent_env(llm) + # Ubuntu 24.04 enforces PEP 668 and blocks bare `pip install` without + # --break-system-packages. benchflow's openhands install_cmd uses plain + # `pip install openhands`, which silently fails (exit 0) on Ubuntu 24.04, + # causing "Agent openhands install failed (rc=1)". Setting this env var + # makes pip skip the restriction without modifying the install_cmd. + agent_env.setdefault("PIP_BREAK_SYSTEM_PACKAGES", "1") + config = { + "tasks_dir": str(tasks_dir), + "jobs_dir": str(jobs_dir.resolve()), + "agent": BENCHFLOW_DEFAULTS["agent_name"], + "model": llm.model, + "environment": "docker", + "concurrency": num_workers, + # OpenHands is installed inside the sandbox as root by benchflow's + # registry install command. Running as the default "agent" user can + # lose access to that binary on some task images. + "sandbox_user": None, + "agent_env": agent_env, + } - # Add specific task names if provided - if task_ids: - for task_id in task_ids: - cmd.extend(["--include-task-name", task_id]) + with tempfile.NamedTemporaryFile( + mode="w", suffix=".yaml", prefix="benchflow-job-", delete=False + ) as tmp: + yaml.safe_dump(config, tmp, sort_keys=False) + config_path = tmp.name - if n_limit is not None: - cmd.extend(["--n-tasks", str(n_limit)]) + # Prefer `bench eval create` (current), fall back to legacy `benchflow job` + bench_bin = shutil.which("bench") or shutil.which("benchflow") or "bench" + if "benchflow" in bench_bin: + cmd = [bench_bin, "job", "--config", config_path] + else: + cmd = [bench_bin, "eval", "create", "-f", config_path] - logger.info(f"Running harbor command: {' '.join(cmd)}") - logger.info(f"Output directory: {harbor_output_dir}") + logger.info(f"Running: {' '.join(cmd)}") - # harbor's openhands-sdk agent reads LLM credentials from the host process - # environment (os.environ), not from --ae flags which go to the sandbox. - env = os.environ.copy() - if llm.api_key: - api_key = ( - llm.api_key.get_secret_value() - if isinstance(llm.api_key, SecretStr) - else llm.api_key - ) - env["LLM_API_KEY"] = api_key - if llm.base_url: - env["LLM_BASE_URL"] = llm.base_url + # Inject LLM vars into the host process env so benchflow's provider + # resolution can pick them up; the subprocess inherits normally (env=None). 
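+    # (agent_env holds e.g. LLM_API_KEY, LLM_BASE_URL and, for direct models,
+    # provider keys such as ANTHROPIC_API_KEY or GEMINI_API_KEY.)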
+ host_env = os.environ.copy() + host_env.update(agent_env) + result = subprocess.run(cmd, capture_output=True, text=True, env=host_env) + Path(config_path).unlink(missing_ok=True) - try: - result = subprocess.run( - cmd, - capture_output=True, - text=True, - env=env, - ) + if result.returncode != 0: + logger.error(f"benchflow job failed (code {result.returncode})") + logger.error(f"stdout: {result.stdout}") + logger.error(f"stderr: {result.stderr}") + raise RuntimeError(f"benchflow job failed: {result.stderr}") - if result.returncode != 0: - logger.error(f"Harbor command failed with code {result.returncode}") - logger.error(f"stdout: {result.stdout}") - logger.error(f"stderr: {result.stderr}") - raise RuntimeError(f"Harbor evaluation failed: {result.stderr}") + logger.info("benchflow job completed") + logger.info(f"stdout: {result.stdout}") - logger.info("Harbor evaluation completed successfully") - logger.info(f"stdout: {result.stdout}") + return jobs_dir - except FileNotFoundError: - raise RuntimeError( - "Harbor CLI not found. Please install harbor: pip install harbor" - ) - return harbor_output_dir - - -def _find_job_dir(harbor_output_dir: Path) -> Path: - """Find the harbor job directory (timestamp-named) inside the output dir.""" - # Harbor creates a timestamp-named subdirectory (e.g., 2026-03-07__16-08-47) - # containing result.json and trial subdirectories - candidates = [ - d - for d in harbor_output_dir.iterdir() - if d.is_dir() and (d / "result.json").exists() - ] - if not candidates: - raise RuntimeError( - f"No harbor job directory found in {harbor_output_dir}. " - f"Expected a timestamp-named directory containing result.json." +def _extract_trial_metrics(trial_dir: Path) -> dict: + """Extract token/cost metrics from benchflow 0.3.0 trial output files. + + benchflow 0.3.0 does not write cost/token fields to result.json. + Instead, metrics are read from: + 1. agent/trajectory.json → final_metrics (harbor-format agent) + 2. agent/openhands.txt → "Total cost:" and "Tokens:" lines (ACP agent) + """ + # 1. Harbor-format trajectory.json written by openhands-sdk agent + traj_file = trial_dir / "agent" / "trajectory.json" + if traj_file.exists(): + try: + with open(traj_file) as f: + traj = json.load(f) + fm = traj.get("final_metrics") or {} + if fm: + return { + "total_prompt_tokens": int(fm.get("total_prompt_tokens") or 0), + "total_completion_tokens": int( + fm.get("total_completion_tokens") or 0 + ), + "total_cost_usd": float(fm.get("total_cost_usd") or 0.0), + } + except (json.JSONDecodeError, OSError): + pass + + # 2. 
ACP agent log written by openhands acp (benchflow 0.3.0 native) + def _parse_token_count(value: str, suffix: str) -> int: + n = float(value) + return int( + n * {"K": 1_000, "M": 1_000_000, "B": 1_000_000_000}.get(suffix.upper(), 1) ) - # Use the most recent job directory if multiple exist - return sorted(candidates)[-1] + for log_name in ("openhands.txt", "openhands_sdk.txt"): + log_file = trial_dir / "agent" / log_name + if not log_file.exists(): + continue + try: + text = log_file.read_text(errors="replace") + cost_usd = 0.0 + prompt_tokens = 0 + completion_tokens = 0 + m = _COST_RE.search(text) + if m: + cost_usd = float(m.group(1)) + m = _TOKENS_RE.search(text) + if m: + prompt_tokens = _parse_token_count(m.group(1), m.group(2)) + completion_tokens = _parse_token_count(m.group(3), m.group(4)) + if cost_usd or prompt_tokens: + return { + "total_prompt_tokens": prompt_tokens, + "total_completion_tokens": completion_tokens, + "total_cost_usd": cost_usd, + } + except OSError: + pass -def convert_harbor_to_eval_output( - harbor_output_dir: Path, + return { + "total_prompt_tokens": 0, + "total_completion_tokens": 0, + "total_cost_usd": 0.0, + } + + +def convert_benchflow_to_eval_output( + jobs_dir: Path, eval_output_path: Path, + task_ids: list[str] | None = None, ) -> None: - """Convert harbor output to evaluation output format. + """Convert benchflow job output to standard evaluation output format. - Harbor stores trial results in a job directory structured as: - harbor_output/TIMESTAMP/TRIAL_NAME/result.json + benchflow 0.3.0 stores trial results as: + jobs_dir/YYYY-MM-DD__HH-MM-SS/TASK_NAME__UUID8/result.json - Each trial's result.json contains task_name, verifier_result, agent_result, - timing info, and exception details. + Each result.json contains task_name, rewards, error, verifier_error, and timing. Args: - harbor_output_dir: Path to harbor output directory. - eval_output_path: Path to write the converted output.jsonl. + jobs_dir: Path to benchflow jobs directory. + eval_output_path: Path to write output.jsonl. + task_ids: Optional filter for specific task IDs (short names). """ - logger.info(f"Converting harbor output from {harbor_output_dir}") - - job_dir = _find_job_dir(harbor_output_dir) - logger.info(f"Using harbor job directory: {job_dir}") + logger.info(f"Converting benchflow output from {jobs_dir}") + + # benchflow 0.3.0 writes: + # jobs/summary.json + # jobs/TIMESTAMP/TRIAL_NAME/result.json + # while older local outputs may place results directly under jobs/. + job_dirs = [d for d in jobs_dir.iterdir() if d.is_dir()] + timestamp_job_dirs = [d for d in job_dirs if _TIMESTAMP_RE.match(d.name)] + + if timestamp_job_dirs: + selected_job_dir = sorted(timestamp_job_dirs)[-1] + logger.info(f"Using benchflow job directory: {selected_job_dir}") + task_dirs = [d for d in selected_job_dir.iterdir() if d.is_dir()] + else: + task_dirs = job_dirs - # Find trial result files (each trial dir has a result.json) - result_files = list(job_dir.glob("*/result.json")) - # Exclude the job-level result.json - result_files = [f for f in result_files if f.parent != job_dir] + if not task_dirs: + raise RuntimeError(f"No task directories found in {jobs_dir}") - if not result_files: - raise RuntimeError( - f"No trial result files found in {job_dir}. " - f"Expected result.json files in trial subdirectories." 
- ) + if task_ids: + short_ids = {tid.split("/")[-1] for tid in task_ids} + task_dirs = [d for d in task_dirs if d.name.split("__")[0] in short_ids] - logger.info(f"Found {len(result_files)} trial results in {job_dir}") + logger.info(f"Processing {len(task_dirs)} task directories") results: list[dict] = [] errors: list[dict] = [] - for result_file in result_files: + for task_dir in sorted(task_dirs): + # Find the trial result — benchflow writes trial-0/result.json + trial_results = list(task_dir.glob("trial-*/result.json")) + if not trial_results: + # Fall back to a direct result.json + direct = task_dir / "result.json" + if direct.exists(): + trial_results = [direct] + + if not trial_results: + logger.warning(f"No result.json found in {task_dir}, skipping") + errors.append( + { + "instance_id": f"benchflow/{task_dir.name}", + "error": "No result.json found", + "test_result": {}, + } + ) + continue + + # Use the last trial (highest retry index) + result_file = sorted(trial_results)[-1] + try: with open(result_file) as f: trial = json.load(f) - instance_id = trial.get("task_name", result_file.parent.name) + task_basename = task_dir.name.split("__")[0] + task_name = trial.get("task_name") or f"benchflow/{task_basename}" + # Normalise to benchflow/ form + if "/" not in task_name: + task_name = f"benchflow/{task_name}" - # Check for exceptions - if trial.get("exception_info"): + error = trial.get("error") + verifier_error = trial.get("verifier_error") + + if error or verifier_error: errors.append( { - "instance_id": instance_id, - "error": str(trial["exception_info"]), + "instance_id": task_name, + "error": str(error or verifier_error), "test_result": {}, } ) continue - # Extract verifier results - verifier_result = trial.get("verifier_result", {}) - rewards = verifier_result.get("rewards", {}) - passed = rewards.get("reward", 0.0) > 0 - - # Extract agent metrics - agent_result = trial.get("agent_result", {}) + rewards = trial.get("rewards") or {} + passed = bool(rewards.get("reward", 0.0)) eval_entry = { - "instance_id": instance_id, + "instance_id": task_name, "test_result": { - "trial_name": trial.get("trial_name"), - "trial_uri": trial.get("trial_uri"), "rewards": rewards, "passed": passed, }, "instruction": "", "error": None, "history": [], - "metrics": { - "total_prompt_tokens": agent_result.get("n_input_tokens") or 0, - "total_completion_tokens": ( - agent_result.get("n_output_tokens") or 0 - ), - "total_cost_usd": agent_result.get("cost_usd") or 0.0, - }, + "metrics": _extract_trial_metrics(result_file.parent), } results.append(eval_entry) - logger.info( - f"Processed trial {instance_id}: reward={rewards.get('reward', 'N/A')}" - ) + logger.info(f"Processed {task_name}: reward={rewards.get('reward', 'N/A')}") except (json.JSONDecodeError, OSError) as e: - logger.error(f"Failed to process result file {result_file}: {e}") + logger.error(f"Failed to read {result_file}: {e}") errors.append( { - "instance_id": result_file.parent.name, + "instance_id": f"benchflow/{task_dir.name}", "error": str(e), "test_result": {}, } ) if not results and not errors: - raise RuntimeError(f"No trials processed from {harbor_output_dir}") + raise RuntimeError(f"No trials processed from {jobs_dir}") if not results: logger.warning( - f"All {len(errors)} trials failed in {harbor_output_dir}; " - "writing error entries for downstream reporting" + f"All {len(errors)} trials failed; writing error entries for reporting" ) - # Write results to output.jsonl with open(eval_output_path, "w") as f: - for entry in results: 
- f.write(json.dumps(entry) + "\n") - for entry in errors: + for entry in results + errors: f.write(json.dumps(entry) + "\n") logger.info( @@ -298,18 +518,18 @@ def load_task_ids_from_file(filepath: str) -> list[str]: def main() -> None: """Main entry point for skillsbench inference.""" parser = argparse.ArgumentParser( - description="Run SkillsBench evaluation with openhands-sdk via Harbor", + description="Run SkillsBench evaluation with benchflow and openhands", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" Examples: # Run full skillsbench evaluation uv run skillsbench-infer .llm_config/claude.json - # Run specific tasks + # Run specific tasks from a file uv run skillsbench-infer .llm_config/claude.json --select tasks.txt - # Run with custom dataset version - uv run skillsbench-infer .llm_config/claude.json --dataset benchflow/skillsbench@1.0 + # Run with more concurrency + uv run skillsbench-infer .llm_config/claude.json --num-workers 4 """, ) @@ -322,7 +542,7 @@ def main() -> None: "--dataset", type=str, default=INFER_DEFAULTS["dataset"], - help="Harbor dataset name (e.g., benchflow/skillsbench)", + help="benchflow dataset name (e.g., benchflow/skillsbench)", ) parser.add_argument( "--output-dir", @@ -334,12 +554,12 @@ def main() -> None: "--num-workers", type=int, default=INFER_DEFAULTS["num_workers"], - help="Number of parallel workers", + help="Number of parallel workers (concurrency)", ) parser.add_argument( "--n-limit", type=int, - help="Maximum number of dataset tasks to run after Harbor filtering", + help="Maximum number of tasks to run", ) parser.add_argument( "--select", @@ -358,14 +578,13 @@ def main() -> None: help="Optional note for the evaluation run", ) parser.add_argument( - "--skip-harbor", + "--skip-run", action="store_true", - help="Skip running harbor and only convert existing results", + help="Skip running benchflow and only convert existing results", ) args = parser.parse_args() - # Validate LLM config if not os.path.isfile(args.llm_config_path): logger.error(f"LLM config file does not exist: {args.llm_config_path}") sys.exit(1) @@ -375,87 +594,103 @@ def main() -> None: llm = LLM.model_validate_json(llm_config) logger.info(f"Using LLM: {llm.model}") - # Check harbor installation - if not args.skip_harbor and not check_harbor_installed(): + if not args.skip_run and not check_benchflow_installed(): logger.error( - "Harbor CLI is not installed. Please install it:\n" - " pip install harbor\n" + "benchflow CLI is not installed. 
Please install it:\n" + " uv tool install benchflow==0.3.0\n" " # or\n" - " uv pip install harbor" + " pip install benchflow==0.3.0\n" + " # or\n" + " uv pip install benchflow==0.3.0" ) sys.exit(1) - # Construct output directory dataset_description = args.dataset.replace("/", "__").replace("@", "-") structured_output_dir = construct_eval_output_dir( base_dir=args.output_dir, dataset_name=dataset_description, model_name=llm.model, - max_iterations=100, # Not directly used but required for path construction + max_iterations=100, eval_note=args.note, ) logger.info(f"Output directory: {structured_output_dir}") os.makedirs(structured_output_dir, exist_ok=True) - # Save metadata metadata = { "llm": llm.model_dump_json(), "dataset": args.dataset, "timestamp": datetime.now(timezone.utc).isoformat(), - "harbor_agent": HARBOR_DEFAULTS["agent_name"], + "benchflow_agent": BENCHFLOW_DEFAULTS["agent_name"], "note": args.note, } metadata_path = Path(structured_output_dir) / "metadata.json" with open(metadata_path, "w") as f: json.dump(metadata, f, indent=2) - # Collect task IDs if specified task_ids: list[str] | None = None if args.select: - loaded_ids = load_task_ids_from_file(args.select) - task_ids = loaded_ids - logger.info(f"Loaded {len(loaded_ids)} task IDs from {args.select}") + task_ids = load_task_ids_from_file(args.select) + logger.info(f"Loaded {len(task_ids)} task IDs from {args.select}") elif args.task_id: task_ids = list(args.task_id) logger.info(f"Running {len(task_ids)} specified task IDs") + tasks_dir = Path(structured_output_dir) / "tasks" + jobs_dir = Path(structured_output_dir) / "jobs" output_path = Path(structured_output_dir) / OUTPUT_FILENAME - if not args.skip_harbor: - # Run harbor evaluation + if not args.skip_run: try: - harbor_output_dir = run_harbor_evaluation( + ensure_tasks(args.dataset, tasks_dir, task_ids=task_ids) + + # Apply n_limit by slicing available task directories + effective_task_dirs = tasks_dir + if args.n_limit is not None or task_ids is not None: + all_dirs = sorted(d for d in tasks_dir.iterdir() if d.is_dir()) + if task_ids: + short_ids = {tid.split("/")[-1] for tid in task_ids} + all_dirs = [d for d in all_dirs if d.name in short_ids] + if args.n_limit is not None: + all_dirs = all_dirs[: args.n_limit] + + # Write a filtered tasks dir symlink tree + filtered_tasks_dir = Path(structured_output_dir) / "tasks_filtered" + filtered_tasks_dir.mkdir(exist_ok=True) + for d in all_dirs: + link = filtered_tasks_dir / d.name + if not link.exists(): + link.symlink_to(d.resolve()) + effective_task_dirs = filtered_tasks_dir + + run_benchflow_job( llm=llm, - dataset=args.dataset, - output_dir=structured_output_dir, + tasks_dir=effective_task_dirs, + jobs_dir=jobs_dir, num_workers=args.num_workers, task_ids=task_ids, - n_limit=args.n_limit, ) - # Convert harbor output to standard format - convert_harbor_to_eval_output( - harbor_output_dir=harbor_output_dir, + convert_benchflow_to_eval_output( + jobs_dir=jobs_dir, eval_output_path=output_path, + task_ids=task_ids, ) except Exception as e: logger.error(f"Evaluation failed: {e}") sys.exit(1) else: - # Skip harbor, just convert existing results - harbor_output_dir = Path(structured_output_dir) / "harbor_output" - if harbor_output_dir.exists(): - convert_harbor_to_eval_output( - harbor_output_dir=harbor_output_dir, + if jobs_dir.exists(): + convert_benchflow_to_eval_output( + jobs_dir=jobs_dir, eval_output_path=output_path, + task_ids=task_ids, ) else: - logger.error(f"No harbor output found at {harbor_output_dir}") + 
logger.error(f"No jobs output found at {jobs_dir}") sys.exit(1) - # Generate cost report if output_path.exists(): generate_cost_report(str(output_path)) diff --git a/tests/test_skillsbench_eval_infer.py b/tests/test_skillsbench_eval_infer.py index 56d54f27a..1334da297 100644 --- a/tests/test_skillsbench_eval_infer.py +++ b/tests/test_skillsbench_eval_infer.py @@ -39,23 +39,6 @@ def test_resolved_instance(self, tmp_path: Path) -> None: assert result["unresolved_instances"] == 0 assert "benchflow/weighted-gdp-calc" in result["resolved_ids"] - def test_unresolved_instance(self, tmp_path: Path) -> None: - """Test processing an unresolved (passed=False) instance.""" - input_file = tmp_path / "unresolved.jsonl" - output_file = tmp_path / "unresolved.report.json" - - entry = { - "instance_id": "benchflow/task-1", - "test_result": {"passed": False, "rewards": {"reward": 0.0}}, - "error": None, - } - input_file.write_text(json.dumps(entry) + "\n") - - result = process_skillsbench_results(str(input_file), str(output_file)) - - assert result["resolved_instances"] == 0 - assert result["unresolved_instances"] == 1 - def test_instance_with_error(self, tmp_path: Path) -> None: """Test processing an instance that errored.""" input_file = tmp_path / "error.jsonl" diff --git a/tests/test_skillsbench_run_infer.py b/tests/test_skillsbench_run_infer.py index 5f8452cb3..784b4d1cc 100644 --- a/tests/test_skillsbench_run_infer.py +++ b/tests/test_skillsbench_run_infer.py @@ -4,82 +4,102 @@ from pathlib import Path import pytest +import yaml -from benchmarks.skillsbench.config import INFER_DEFAULTS +from benchmarks.skillsbench.config import BENCHFLOW_DEFAULTS, INFER_DEFAULTS from benchmarks.skillsbench.run_infer import ( - convert_harbor_to_eval_output, - run_harbor_evaluation, + _build_benchflow_agent_env, + convert_benchflow_to_eval_output, + run_benchflow_job, ) from openhands.sdk import LLM -class TestRunHarborEvaluation: - """Tests for building Harbor invocation arguments.""" +class TestRunBenchflowJob: + """Tests for building benchflow job invocation arguments.""" - def test_default_dataset_matches_harbor_registry(self) -> None: - """Test that the default dataset name matches Harbor's published registry.""" + def test_default_dataset_matches_benchflow_registry(self) -> None: + """Test that the default dataset name matches benchflow's published registry.""" assert INFER_DEFAULTS["dataset"] == "benchflow/skillsbench" - def test_run_harbor_evaluation_passes_filters_and_limits( + def test_default_agent_is_openhands(self) -> None: + """Test that the default agent is openhands.""" + assert BENCHFLOW_DEFAULTS["agent_name"] == "openhands" + + def test_run_benchflow_job_passes_model_and_concurrency( self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch ) -> None: - """Test Harbor command includes task filters and n-limit.""" - captured: dict[str, list[str]] = {} + """Test benchflow job command writes the expected YAML config.""" + captured_cmd: list[str] = [] + captured_env: dict[str, str] = {} + captured_config: dict = {} + + # Force legacy benchflow binary path so the command format is deterministic + monkeypatch.setattr( + "benchmarks.skillsbench.run_infer.shutil.which", + lambda name: "/usr/local/bin/benchflow" if name == "benchflow" else None, + ) def fake_run(cmd: list[str], capture_output: bool, text: bool, env: dict): - captured["cmd"] = cmd + captured_cmd[:] = cmd + captured_env.clear() + captured_env.update(env) + with open(cmd[3]) as f: + captured_config.update(yaml.safe_load(f)) return type( "Completed", 
(), - {"returncode": 0, "stdout": "ok", "stderr": ""}, + {"returncode": 0, "stdout": "Score: 1/1 (100%)", "stderr": ""}, )() monkeypatch.setattr("benchmarks.skillsbench.run_infer.subprocess.run", fake_run) - harbor_output_dir = run_harbor_evaluation( + tasks_dir = tmp_path / "tasks" + tasks_dir.mkdir() + jobs_dir = tmp_path / "jobs" + + run_benchflow_job( llm=LLM( - model="litellm_proxy/test-model", + model="anthropic/claude-sonnet-4-5", api_key="test-key", base_url="https://proxy.example.com", ), - dataset=INFER_DEFAULTS["dataset"], - output_dir=str(tmp_path), - num_workers=2, - task_ids=["benchflow/task-a", "benchflow/task-b"], - n_limit=3, - ) - - expected_output_dir = tmp_path / "harbor_output" - assert harbor_output_dir == expected_output_dir - - cmd = captured["cmd"] - assert cmd[:8] == [ - "harbor", - "run", - "-d", - "benchflow/skillsbench", - "-a", - "openhands-sdk", - "-m", - "litellm_proxy/test-model", - ] - assert "--jobs-dir" in cmd - assert str(expected_output_dir.resolve()) in cmd - assert cmd.count("--include-task-name") == 2 - assert "benchflow/task-a" in cmd - assert "benchflow/task-b" in cmd - assert cmd[cmd.index("--n-concurrent") + 1] == "2" - assert cmd[cmd.index("--n-tasks") + 1] == "3" - - def test_llm_credentials_passed_via_env( + tasks_dir=tasks_dir, + jobs_dir=jobs_dir, + num_workers=4, + ) + + cmd = captured_cmd + assert cmd[0] == "/usr/local/bin/benchflow" + assert cmd[1] == "job" + assert cmd[2] == "--config" + assert captured_config["tasks_dir"] == str(tasks_dir) + assert captured_config["jobs_dir"] == str(jobs_dir.resolve()) + assert captured_config["agent"] == "openhands" + assert captured_config["model"] == "anthropic/claude-sonnet-4-5" + assert captured_config["concurrency"] == 4 + assert captured_config["sandbox_user"] is None + + def test_llm_credentials_passed_via_subprocess_env( self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch ) -> None: - """Test that LLM credentials are passed via subprocess env, not --ae flags.""" - captured: dict = {} + """Test that LLM credentials are passed via subprocess env and YAML.""" + captured_cmd: list[str] = [] + captured_env: dict[str, str] = {} + captured_config: dict = {} + + # Force legacy benchflow binary path so the command format is deterministic + monkeypatch.setattr( + "benchmarks.skillsbench.run_infer.shutil.which", + lambda name: "/usr/local/bin/benchflow" if name == "benchflow" else None, + ) def fake_run(cmd: list[str], capture_output: bool, text: bool, env: dict): - captured["cmd"] = cmd - captured["env"] = env + captured_cmd[:] = cmd + captured_env.clear() + captured_env.update(env) + with open(cmd[3]) as f: + captured_config.update(yaml.safe_load(f)) return type( "Completed", (), @@ -88,60 +108,127 @@ def fake_run(cmd: list[str], capture_output: bool, text: bool, env: dict): monkeypatch.setattr("benchmarks.skillsbench.run_infer.subprocess.run", fake_run) - run_harbor_evaluation( + tasks_dir = tmp_path / "tasks" + tasks_dir.mkdir() + + run_benchflow_job( llm=LLM( model="test-model", api_key="my-secret-key", base_url="https://my-proxy.example.com", ), - dataset=INFER_DEFAULTS["dataset"], - output_dir=str(tmp_path), + tasks_dir=tasks_dir, + jobs_dir=tmp_path / "jobs", + ) + + # Credentials in subprocess env + assert captured_env["LLM_API_KEY"] == "my-secret-key" + assert captured_env["LLM_BASE_URL"] == "https://my-proxy.example.com" + assert "--ae" not in captured_cmd + assert captured_config["agent_env"]["LLM_API_KEY"] == "my-secret-key" + assert ( + captured_config["agent_env"]["LLM_BASE_URL"] + == 
"https://my-proxy.example.com" ) - assert captured["env"]["LLM_API_KEY"] == "my-secret-key" - assert captured["env"]["LLM_BASE_URL"] == "https://my-proxy.example.com" + def test_direct_gemini_model_sets_provider_env_vars(self) -> None: + """Direct provider models need provider-specific env vars.""" + env = _build_benchflow_agent_env( + LLM( + model="gemini/gemini-3.1-flash-lite-preview", + api_key="gemini-test-key", + ) + ) + assert env["LLM_API_KEY"] == "gemini-test-key" + assert env["GEMINI_API_KEY"] == "gemini-test-key" + assert env["GOOGLE_API_KEY"] == "gemini-test-key" -class TestConvertHarborToEvalOutput: - """Tests for convert_harbor_to_eval_output function.""" + def test_proxy_model_does_not_set_provider_env_vars(self) -> None: + """LiteLLM proxy configs should keep using generic LLM_* vars only.""" + env = _build_benchflow_agent_env( + LLM( + model="litellm_proxy/anthropic/claude-sonnet-4-20250514", + api_key="proxy-key", + base_url="https://proxy.example.com", + ) + ) - def _create_harbor_structure( - self, tmp_path: Path, trials: list[tuple[str, dict]] - ) -> Path: - """Create a mock Harbor output structure.""" - harbor_dir = tmp_path / "harbor_output" - job_dir = harbor_dir / "2026-01-01__00-00-00" - job_dir.mkdir(parents=True) - (job_dir / "result.json").write_text(json.dumps({"id": "test-job"})) + assert env["LLM_API_KEY"] == "proxy-key" + assert env["LLM_BASE_URL"] == "https://proxy.example.com" + assert "ANTHROPIC_API_KEY" not in env + assert "ANTHROPIC_BASE_URL" not in env - for trial_name, trial_result in trials: - trial_dir = job_dir / trial_name - trial_dir.mkdir() - (trial_dir / "result.json").write_text(json.dumps(trial_result)) - return harbor_dir +class TestConvertBenchflowToEvalOutput: + """Tests for convert_benchflow_to_eval_output function.""" + + def _create_benchflow_structure( + self, tmp_path: Path, tasks: list[tuple[str, dict]] + ) -> Path: + """Create a mock benchflow jobs directory structure. + + benchflow writes: jobs_dir/TASK_NAME/trial-0/result.json + """ + jobs_dir = tmp_path / "jobs" + for task_name, result in tasks: + trial_dir = jobs_dir / task_name / "trial-0" + trial_dir.mkdir(parents=True) + (trial_dir / "result.json").write_text(json.dumps(result)) + return jobs_dir + + def _create_benchflow_timestamped_job( + self, tmp_path: Path, tasks: list[tuple[str, dict]] + ) -> Path: + """Create a mock benchflow 0.3.0 jobs directory structure. + + benchflow writes: jobs/TIMESTAMP/TASK_NAME__RUNID/result.json + """ + jobs_dir = tmp_path / "jobs" + job_dir = jobs_dir / "2026-04-21__23-12-35" + job_dir.mkdir(parents=True) + (jobs_dir / "summary.json").write_text(json.dumps({"total": len(tasks)})) + for task_name, result in tasks: + trial_dir = job_dir / f"{task_name}__abc123" + trial_dir.mkdir(parents=True) + (trial_dir / "result.json").write_text(json.dumps(result)) + return jobs_dir def test_successful_trial_parsing(self, tmp_path: Path) -> None: - """Test successful parsing of harbor trial result.""" + """Test successful parsing of a benchflow trial result. + + benchflow 0.3.0 does not write cost/token fields to result.json. + Metrics are read from agent/trajectory.json (harbor-format agent) + or parsed from agent/openhands.txt (ACP agent stdout). 
+ """ trial_result = { "task_name": "benchflow/weighted-gdp-calc", - "trial_name": "weighted-gdp-calc__abc123", - "trial_uri": "file:///path/to/trial", - "agent_result": { - "n_input_tokens": 1000, - "n_output_tokens": 200, - "cost_usd": 0.05, - }, - "verifier_result": {"rewards": {"reward": 1.0}}, - "exception_info": None, + "rewards": {"reward": 1.0}, + "error": None, } - harbor_dir = self._create_harbor_structure( - tmp_path, [("weighted-gdp-calc__abc123", trial_result)] + jobs_dir = self._create_benchflow_structure( + tmp_path, [("weighted-gdp-calc", trial_result)] + ) + # Write agent/trajectory.json with final_metrics (harbor-format agent output). + # agent/ sits next to result.json, inside the trial-0 subdirectory. + trial_dir = jobs_dir / "weighted-gdp-calc" / "trial-0" + agent_dir = trial_dir / "agent" + agent_dir.mkdir(parents=True, exist_ok=True) + (agent_dir / "trajectory.json").write_text( + json.dumps( + { + "final_metrics": { + "total_prompt_tokens": 1000, + "total_completion_tokens": 200, + "total_cost_usd": 0.05, + } + } + ) ) output_file = tmp_path / "output.jsonl" - convert_harbor_to_eval_output(harbor_dir, output_file) + convert_benchflow_to_eval_output(jobs_dir, output_file) assert output_file.exists() with open(output_file) as f: @@ -151,26 +238,52 @@ def test_successful_trial_parsing(self, tmp_path: Path) -> None: assert entries[0]["instance_id"] == "benchflow/weighted-gdp-calc" assert entries[0]["test_result"]["passed"] is True assert entries[0]["metrics"]["total_cost_usd"] == 0.05 + assert entries[0]["metrics"]["total_prompt_tokens"] == 1000 + assert entries[0]["metrics"]["total_completion_tokens"] == 200 + + def test_metrics_from_acp_agent_log(self, tmp_path: Path) -> None: + """Test that metrics are extracted from agent/openhands.txt (ACP agent).""" + trial_result = { + "task_name": "benchflow/acp-task", + "rewards": {"reward": 1.0}, + "error": None, + } + jobs_dir = self._create_benchflow_timestamped_job( + tmp_path, [("acp-task", trial_result)] + ) + # Write agent/openhands.txt simulating openhands ACP stdout + trial_dir = jobs_dir / "2026-04-21__23-12-35" / "acp-task__abc123" + agent_dir = trial_dir / "agent" + agent_dir.mkdir(parents=True, exist_ok=True) + (agent_dir / "openhands.txt").write_text( + "OpenHands SDK v1.16.0\n" + "Tokens: ↑ input 404.21K • cache hit 70.47% • reasoning 579 • ↓ output 7.83K • $0.0487\n" + "Total cost: $0.0487\n" + ) + output_file = tmp_path / "output.jsonl" + convert_benchflow_to_eval_output(jobs_dir, output_file) + + with open(output_file) as f: + entries = [json.loads(line) for line in f] + + assert len(entries) == 1 + assert entries[0]["metrics"]["total_cost_usd"] == pytest.approx(0.0487) + assert entries[0]["metrics"]["total_prompt_tokens"] == 404210 + assert entries[0]["metrics"]["total_completion_tokens"] == 7830 def test_failed_trial(self, tmp_path: Path) -> None: """Test parsing of a trial with reward 0.""" trial_result = { "task_name": "benchflow/task-1", - "trial_name": "task-1__xyz", - "agent_result": { - "n_input_tokens": None, - "n_output_tokens": None, - "cost_usd": None, - }, - "verifier_result": {"rewards": {"reward": 0.0}}, - "exception_info": None, + "rewards": {"reward": 0.0}, + "error": None, } - harbor_dir = self._create_harbor_structure( - tmp_path, [("task-1__xyz", trial_result)] + jobs_dir = self._create_benchflow_structure( + tmp_path, [("task-1", trial_result)] ) output_file = tmp_path / "output.jsonl" - convert_harbor_to_eval_output(harbor_dir, output_file) + convert_benchflow_to_eval_output(jobs_dir, 
output_file) with open(output_file) as f: entries = [json.loads(line) for line in f] @@ -178,21 +291,19 @@ def test_failed_trial(self, tmp_path: Path) -> None: assert entries[0]["test_result"]["passed"] is False assert entries[0]["metrics"]["total_cost_usd"] == 0.0 - def test_trial_with_exception(self, tmp_path: Path) -> None: - """Test that exception trials are written as error entries.""" + def test_trial_with_error(self, tmp_path: Path) -> None: + """Test that errored trials are written as error entries.""" trial_result = { "task_name": "benchflow/error-task", - "trial_name": "error-task__err", - "agent_result": {}, - "verifier_result": {}, - "exception_info": {"type": "ValueError", "message": "LLM_API_KEY not set"}, + "rewards": {}, + "error": "LLM_API_KEY not set", } - harbor_dir = self._create_harbor_structure( - tmp_path, [("error-task__err", trial_result)] + jobs_dir = self._create_benchflow_structure( + tmp_path, [("error-task", trial_result)] ) output_file = tmp_path / "output.jsonl" - convert_harbor_to_eval_output(harbor_dir, output_file) + convert_benchflow_to_eval_output(jobs_dir, output_file) with open(output_file) as f: entries = [json.loads(line) for line in f] @@ -202,20 +313,121 @@ def test_trial_with_exception(self, tmp_path: Path) -> None: assert entries[0]["error"] is not None assert entries[0]["test_result"] == {} - def test_missing_job_directory(self, tmp_path: Path) -> None: - """Test handling when no job directory exists.""" - harbor_dir = tmp_path / "harbor_output" - harbor_dir.mkdir() + def test_missing_jobs_directory(self, tmp_path: Path) -> None: + """Test handling when jobs directory is empty.""" + jobs_dir = tmp_path / "jobs" + jobs_dir.mkdir() + + with pytest.raises(RuntimeError, match="No task directories found"): + convert_benchflow_to_eval_output(jobs_dir, tmp_path / "output.jsonl") + + def test_task_id_filtering(self, tmp_path: Path) -> None: + """Test that only specified task IDs are converted.""" + trials = [ + ( + "task-a", + { + "task_name": "benchflow/task-a", + "rewards": {"reward": 1.0}, + "error": None, + }, + ), + ( + "task-b", + { + "task_name": "benchflow/task-b", + "rewards": {"reward": 0.0}, + "error": None, + }, + ), + ] + jobs_dir = self._create_benchflow_structure(tmp_path, trials) + output_file = tmp_path / "output.jsonl" - with pytest.raises(RuntimeError, match="No harbor job directory found"): - convert_harbor_to_eval_output(harbor_dir, tmp_path / "output.jsonl") + convert_benchflow_to_eval_output( + jobs_dir, output_file, task_ids=["benchflow/task-a"] + ) - def test_empty_job_directory(self, tmp_path: Path) -> None: - """Test handling of harbor job dir with no trial subdirs.""" - harbor_dir = tmp_path / "harbor_output" - job_dir = harbor_dir / "2026-01-01__00-00-00" - job_dir.mkdir(parents=True) - (job_dir / "result.json").write_text(json.dumps({"id": "test"})) + with open(output_file) as f: + entries = [json.loads(line) for line in f] + + assert len(entries) == 1 + assert entries[0]["instance_id"] == "benchflow/task-a" + + def test_task_name_normalised_to_benchflow_prefix(self, tmp_path: Path) -> None: + """Test that task names without prefix get benchflow/ prepended.""" + trial_result = { + "task_name": "weighted-gdp-calc", # no benchflow/ prefix + "rewards": {"reward": 1.0}, + "error": None, + } + jobs_dir = self._create_benchflow_structure( + tmp_path, [("weighted-gdp-calc", trial_result)] + ) + output_file = tmp_path / "output.jsonl" + convert_benchflow_to_eval_output(jobs_dir, output_file) + + with open(output_file) as f: + 
entries = [json.loads(line) for line in f] + + assert entries[0]["instance_id"] == "benchflow/weighted-gdp-calc" + + def test_timestamped_job_directory_is_processed(self, tmp_path: Path) -> None: + """Test benchflow 0.3.0 timestamped jobs directory layout.""" + trial_result = { + "task_name": "weighted-gdp-calc", + "rewards": {"reward": 1.0}, + "error": None, + "n_input_tokens": 42, + "n_output_tokens": 7, + "cost_usd": 0.01, + } + + jobs_dir = self._create_benchflow_timestamped_job( + tmp_path, [("weighted-gdp-calc", trial_result)] + ) + output_file = tmp_path / "output.jsonl" + + convert_benchflow_to_eval_output(jobs_dir, output_file) + + with open(output_file) as f: + entries = [json.loads(line) for line in f] + + assert len(entries) == 1 + assert entries[0]["instance_id"] == "benchflow/weighted-gdp-calc" + assert entries[0]["test_result"]["passed"] is True - with pytest.raises(RuntimeError, match="No trial result files found"): - convert_harbor_to_eval_output(harbor_dir, tmp_path / "output.jsonl") + def test_task_id_filter_matches_timestamped_trial_dir(self, tmp_path: Path) -> None: + """Test filtering strips the run suffix from trial directory names.""" + jobs_dir = self._create_benchflow_timestamped_job( + tmp_path, + [ + ( + "task-a", + { + "task_name": "task-a", + "rewards": {"reward": 1.0}, + "error": None, + }, + ), + ( + "task-b", + { + "task_name": "task-b", + "rewards": {"reward": 0.0}, + "error": None, + }, + ), + ], + ) + output_file = tmp_path / "output.jsonl" + + convert_benchflow_to_eval_output( + jobs_dir, output_file, task_ids=["benchflow/task-a"] + ) + + with open(output_file) as f: + entries = [json.loads(line) for line in f] + + assert len(entries) == 1 + assert entries[0]["instance_id"] == "benchflow/task-a" diff --git a/uv.lock b/uv.lock index 2cd0b3640..147abedc9 100644 --- a/uv.lock +++ b/uv.lock @@ -1282,6 +1282,7 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/44/69/9b804adb5fd0671f367781560eb5eb586c4d495277c93bde4307b9e28068/greenlet-3.2.4-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:3b67ca49f54cede0186854a008109d6ee71f66bd57bb36abd6d0a0267b540cdd", size = 274079, upload-time = "2025-08-07T13:15:45.033Z" }, { url = "https://files.pythonhosted.org/packages/46/e9/d2a80c99f19a153eff70bc451ab78615583b8dac0754cfb942223d2c1a0d/greenlet-3.2.4-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ddf9164e7a5b08e9d22511526865780a576f19ddd00d62f8a665949327fde8bb", size = 640997, upload-time = "2025-08-07T13:42:56.234Z" }, { url = "https://files.pythonhosted.org/packages/3b/16/035dcfcc48715ccd345f3a93183267167cdd162ad123cd93067d86f27ce4/greenlet-3.2.4-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:f28588772bb5fb869a8eb331374ec06f24a83a9c25bfa1f38b6993afe9c1e968", size = 655185, upload-time = "2025-08-07T13:45:27.624Z" }, + { url = "https://files.pythonhosted.org/packages/31/da/0386695eef69ffae1ad726881571dfe28b41970173947e7c558d9998de0f/greenlet-3.2.4-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:5c9320971821a7cb77cfab8d956fa8e39cd07ca44b6070db358ceb7f8797c8c9", size = 649926, upload-time = "2025-08-07T13:53:15.251Z" }, { url = "https://files.pythonhosted.org/packages/68/88/69bf19fd4dc19981928ceacbc5fd4bb6bc2215d53199e367832e98d1d8fe/greenlet-3.2.4-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:c60a6d84229b271d44b70fb6e5fa23781abb5d742af7b808ae3f6efd7c9c60f6", size = 651839, upload-time = "2025-08-07T13:18:30.281Z" }, { url = 
"https://files.pythonhosted.org/packages/19/0d/6660d55f7373b2ff8152401a83e02084956da23ae58cddbfb0b330978fe9/greenlet-3.2.4-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3b3812d8d0c9579967815af437d96623f45c0f2ae5f04e366de62a12d83a8fb0", size = 607586, upload-time = "2025-08-07T13:18:28.544Z" }, { url = "https://files.pythonhosted.org/packages/8e/1a/c953fdedd22d81ee4629afbb38d2f9d71e37d23caace44775a3a969147d4/greenlet-3.2.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:abbf57b5a870d30c4675928c37278493044d7c14378350b3aa5d484fa65575f0", size = 1123281, upload-time = "2025-08-07T13:42:39.858Z" }, @@ -1292,6 +1293,7 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/49/e8/58c7f85958bda41dafea50497cbd59738c5c43dbbea5ee83d651234398f4/greenlet-3.2.4-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:1a921e542453fe531144e91e1feedf12e07351b1cf6c9e8a3325ea600a715a31", size = 272814, upload-time = "2025-08-07T13:15:50.011Z" }, { url = "https://files.pythonhosted.org/packages/62/dd/b9f59862e9e257a16e4e610480cfffd29e3fae018a68c2332090b53aac3d/greenlet-3.2.4-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cd3c8e693bff0fff6ba55f140bf390fa92c994083f838fece0f63be121334945", size = 641073, upload-time = "2025-08-07T13:42:57.23Z" }, { url = "https://files.pythonhosted.org/packages/f7/0b/bc13f787394920b23073ca3b6c4a7a21396301ed75a655bcb47196b50e6e/greenlet-3.2.4-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:710638eb93b1fa52823aa91bf75326f9ecdfd5e0466f00789246a5280f4ba0fc", size = 655191, upload-time = "2025-08-07T13:45:29.752Z" }, + { url = "https://files.pythonhosted.org/packages/f2/d6/6adde57d1345a8d0f14d31e4ab9c23cfe8e2cd39c3baf7674b4b0338d266/greenlet-3.2.4-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:c5111ccdc9c88f423426df3fd1811bfc40ed66264d35aa373420a34377efc98a", size = 649516, upload-time = "2025-08-07T13:53:16.314Z" }, { url = "https://files.pythonhosted.org/packages/7f/3b/3a3328a788d4a473889a2d403199932be55b1b0060f4ddd96ee7cdfcad10/greenlet-3.2.4-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:d76383238584e9711e20ebe14db6c88ddcedc1829a9ad31a584389463b5aa504", size = 652169, upload-time = "2025-08-07T13:18:32.861Z" }, { url = "https://files.pythonhosted.org/packages/ee/43/3cecdc0349359e1a527cbf2e3e28e5f8f06d3343aaf82ca13437a9aa290f/greenlet-3.2.4-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:23768528f2911bcd7e475210822ffb5254ed10d71f4028387e5a99b4c6699671", size = 610497, upload-time = "2025-08-07T13:18:31.636Z" }, { url = "https://files.pythonhosted.org/packages/b8/19/06b6cf5d604e2c382a6f31cafafd6f33d5dea706f4db7bdab184bad2b21d/greenlet-3.2.4-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:00fadb3fedccc447f517ee0d3fd8fe49eae949e1cd0f6a611818f4f6fb7dc83b", size = 1121662, upload-time = "2025-08-07T13:42:41.117Z" }, @@ -1302,6 +1304,7 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/22/5c/85273fd7cc388285632b0498dbbab97596e04b154933dfe0f3e68156c68c/greenlet-3.2.4-cp314-cp314-macosx_11_0_universal2.whl", hash = "sha256:49a30d5fda2507ae77be16479bdb62a660fa51b1eb4928b524975b3bde77b3c0", size = 273586, upload-time = "2025-08-07T13:16:08.004Z" }, { url = "https://files.pythonhosted.org/packages/d1/75/10aeeaa3da9332c2e761e4c50d4c3556c21113ee3f0afa2cf5769946f7a3/greenlet-3.2.4-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = 
"sha256:299fd615cd8fc86267b47597123e3f43ad79c9d8a22bebdce535e53550763e2f", size = 686346, upload-time = "2025-08-07T13:42:59.944Z" }, { url = "https://files.pythonhosted.org/packages/c0/aa/687d6b12ffb505a4447567d1f3abea23bd20e73a5bed63871178e0831b7a/greenlet-3.2.4-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:c17b6b34111ea72fc5a4e4beec9711d2226285f0386ea83477cbb97c30a3f3a5", size = 699218, upload-time = "2025-08-07T13:45:30.969Z" }, + { url = "https://files.pythonhosted.org/packages/dc/8b/29aae55436521f1d6f8ff4e12fb676f3400de7fcf27fccd1d4d17fd8fecd/greenlet-3.2.4-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:b4a1870c51720687af7fa3e7cda6d08d801dae660f75a76f3845b642b4da6ee1", size = 694659, upload-time = "2025-08-07T13:53:17.759Z" }, { url = "https://files.pythonhosted.org/packages/92/2e/ea25914b1ebfde93b6fc4ff46d6864564fba59024e928bdc7de475affc25/greenlet-3.2.4-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:061dc4cf2c34852b052a8620d40f36324554bc192be474b9e9770e8c042fd735", size = 695355, upload-time = "2025-08-07T13:18:34.517Z" }, { url = "https://files.pythonhosted.org/packages/72/60/fc56c62046ec17f6b0d3060564562c64c862948c9d4bc8aa807cf5bd74f4/greenlet-3.2.4-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:44358b9bf66c8576a9f57a590d5f5d6e72fa4228b763d0e43fee6d3b06d3a337", size = 657512, upload-time = "2025-08-07T13:18:33.969Z" }, { url = "https://files.pythonhosted.org/packages/23/6e/74407aed965a4ab6ddd93a7ded3180b730d281c77b765788419484cdfeef/greenlet-3.2.4-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:2917bdf657f5859fbf3386b12d68ede4cf1f04c90c3a6bc1f013dd68a22e2269", size = 1612508, upload-time = "2025-11-04T12:42:23.427Z" }, @@ -1516,11 +1519,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/2a/39/e50c7c3a983047577ee07d2a9e53faf5a69493943ec3f6a384bdc792deb2/httpx-0.28.1-py3-none-any.whl", hash = "sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad", size = 73517, upload-time = "2024-12-06T15:37:21.509Z" }, ] -[package.optional-dependencies] -socks = [ - { name = "socksio" }, -] - [[package]] name = "httpx-sse" version = "0.4.2" @@ -1816,12 +1814,14 @@ wheels = [ [[package]] name = "litellm" -version = "1.83.0" +version = "1.80.10" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "aiohttp" }, { name = "click" }, { name = "fastuuid" }, + { name = "grpcio", version = "1.67.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14'" }, + { name = "grpcio", version = "1.76.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.14'" }, { name = "httpx" }, { name = "importlib-metadata" }, { name = "jinja2" }, @@ -1832,9 +1832,9 @@ dependencies = [ { name = "tiktoken" }, { name = "tokenizers" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/22/92/6ce9737554994ca8e536e5f4f6a87cc7c4774b656c9eb9add071caf7d54b/litellm-1.83.0.tar.gz", hash = "sha256:860bebc76c4bb27b4cf90b4a77acd66dba25aced37e3db98750de8a1766bfb7a", size = 17333062, upload-time = "2026-03-31T05:08:25.331Z" } +sdist = { url = "https://files.pythonhosted.org/packages/dd/44/0aaa7449e7c4aa05668ec03f1f68a01b1e476591071d9659a68db19371a2/litellm-1.80.10.tar.gz", hash = "sha256:4a4aff7558945c2f7e5c6523e67c1b5525a46b10b0e1ad6b8f847cb13b16779e", size = 12764777, upload-time = "2025-12-14T02:07:05.362Z" } wheels = [ - { url = 
"https://files.pythonhosted.org/packages/19/2c/a670cc050fcd6f45c6199eb99e259c73aea92edba8d5c2fc1b3686d36217/litellm-1.83.0-py3-none-any.whl", hash = "sha256:88c536d339248f3987571493015784671ba3f193a328e1ea6780dbebaa2094a8", size = 15610306, upload-time = "2026-03-31T05:08:21.987Z" }, + { url = "https://files.pythonhosted.org/packages/8c/a9/4814b6aa58f6705df2831eaadeb5bc8240684c8c9d5964245212f85049d1/litellm-1.80.10-py3-none-any.whl", hash = "sha256:9b3e561efaba0eb1291cb1555d3dcb7283cf7f3cb65aadbcdb42e2a8765898c8", size = 11264240, upload-time = "2025-12-14T02:07:02.414Z" }, ] [[package]] @@ -2402,7 +2402,7 @@ wheels = [ [[package]] name = "openhands-agent-server" -version = "1.16.1" +version = "1.16.0" source = { editable = "vendor/software-agent-sdk/openhands-agent-server" } dependencies = [ { name = "aiosqlite" }, @@ -2467,6 +2467,7 @@ dependencies = [ { name = "python-json-logger" }, { name = "requests" }, { name = "swebench" }, + { name = "swesmith" }, { name = "swt-bench" }, { name = "tenacity" }, { name = "toml" }, @@ -2521,6 +2522,7 @@ requires-dist = [ { name = "python-json-logger", specifier = ">=3.3.0" }, { name = "requests" }, { name = "swebench", specifier = "==4.1.0" }, + { name = "swesmith", specifier = ">=0.0.9" }, { name = "swt-bench", git = "https://github.com/logic-star-ai/swt-bench.git?rev=5fdcd446ff05e248ecfffc19d560a210699f71f8" }, { name = "tenacity", specifier = ">=9.1.2" }, { name = "toml" }, @@ -2544,7 +2546,7 @@ dev = [ [[package]] name = "openhands-sdk" -version = "1.16.1" +version = "1.16.0" source = { editable = "vendor/software-agent-sdk/openhands-sdk" } dependencies = [ { name = "agent-client-protocol" }, @@ -2552,7 +2554,7 @@ dependencies = [ { name = "fakeredis", extra = ["lua"] }, { name = "fastmcp" }, { name = "filelock" }, - { name = "httpx", extra = ["socks"] }, + { name = "httpx" }, { name = "litellm" }, { name = "lmnr" }, { name = "pydantic" }, @@ -2575,8 +2577,8 @@ requires-dist = [ { name = "fakeredis", extras = ["lua"], specifier = ">=2.32.1" }, { name = "fastmcp", specifier = ">=3.0.0" }, { name = "filelock", specifier = ">=3.20.1" }, - { name = "httpx", extras = ["socks"], specifier = ">=0.27.0" }, - { name = "litellm", specifier = ">=1.82.6,!=1.82.7,!=1.82.8" }, + { name = "httpx", specifier = ">=0.27.0" }, + { name = "litellm", specifier = "==1.80.10" }, { name = "lmnr", specifier = ">=0.7.24" }, { name = "pydantic", specifier = ">=2.12.5" }, { name = "python-frontmatter", specifier = ">=1.1.0" }, @@ -2588,7 +2590,7 @@ provides-extras = ["boto3"] [[package]] name = "openhands-tools" -version = "1.16.1" +version = "1.16.0" source = { editable = "vendor/software-agent-sdk/openhands-tools" } dependencies = [ { name = "bashlex" }, @@ -2617,7 +2619,7 @@ requires-dist = [ [[package]] name = "openhands-workspace" -version = "1.16.1" +version = "1.16.0" source = { editable = "vendor/software-agent-sdk/openhands-workspace" } dependencies = [ { name = "openhands-agent-server" }, @@ -6707,15 +6709,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e9/44/75a9c9421471a6c4805dbf2356f7c181a29c1879239abab1ea2cc8f38b40/sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2", size = 10235, upload-time = "2024-02-25T23:20:01.196Z" }, ] -[[package]] -name = "socksio" -version = "1.0.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/f8/5c/48a7d9495be3d1c651198fd99dbb6ce190e2274d0f28b9051307bdec6b85/socksio-1.0.0.tar.gz", hash = 
"sha256:f88beb3da5b5c38b9890469de67d0cb0f9d494b78b106ca1845f96c10b91c4ac", size = 19055, upload-time = "2020-04-17T15:50:34.664Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/37/c3/6eeb6034408dac0fa653d126c9204ade96b819c936e136c5e8a6897eee9c/socksio-1.0.0-py3-none-any.whl", hash = "sha256:95dc1f15f9b34e8d7b16f06d74b8ccf48f609af32ab33c608d08761c5dcbb1f3", size = 12763, upload-time = "2020-04-17T15:50:31.878Z" }, -] - [[package]] name = "sortedcontainers" version = "2.4.0" @@ -6841,6 +6834,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/36/67/981d8b642ac3eac7c8a7b7832ff8b2fb74f96b28b5fcd9a8979879e5c46d/swebench-4.1.0-py3-none-any.whl", hash = "sha256:1243776f720047cc9e20a427f7a52b75c13a07abda6154fb60fe77f82ec8af57", size = 157231, upload-time = "2025-09-11T02:57:58.953Z" }, ] +[[package]] +name = "swesmith" +version = "0.0.9" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/07/97/e506b20fa59debc66e4660a86b0e98b45d32c87f23b994ad739e9c5d542a/swesmith-0.0.9.tar.gz", hash = "sha256:1726124ea43577853c6efb0a5a0db5fa3ce5c340e1bed479afa5bab85d8a69da", size = 214830, upload-time = "2026-02-27T01:06:13.455Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/80/2d/71b6ac5dadbe7199085de3815624775744d51b6c554efeeddfb12dc45ce1/swesmith-0.0.9-py3-none-any.whl", hash = "sha256:cbb98a52fc573b38032cde1179b6ce5f5862ce7c31d6931cfd5b8ad4969ce900", size = 275800, upload-time = "2026-02-27T01:06:11.864Z" }, +] + [[package]] name = "swt-bench" version = "1.0.1" diff --git a/vendor/software-agent-sdk b/vendor/software-agent-sdk index 3e0a3a091..acd5adc96 160000 --- a/vendor/software-agent-sdk +++ b/vendor/software-agent-sdk @@ -1 +1 @@ -Subproject commit 3e0a3a0915b369c7e2057c77722e98585855d30a +Subproject commit acd5adc965c08a0f815cf8e5f3166d1d090034d6 From 935f489e5233d919fb9db6ccfa63a39f89b68511 Mon Sep 17 00:00:00 2001 From: Chujun Tao Date: Wed, 22 Apr 2026 20:57:53 -0400 Subject: [PATCH 09/12] Revert "feat(skillsbench): migrate harness from Harbor to benchflow 0.3.0" This reverts commit 4d31c87c8fb0c7ff8341bc37debe06851ad2a67b. --- .gitignore | 1 - benchmarks/skillsbench/README.md | 42 +- benchmarks/skillsbench/config.py | 11 +- benchmarks/skillsbench/run_infer.py | 655 +++++++++------------------ tests/test_skillsbench_eval_infer.py | 17 + tests/test_skillsbench_run_infer.py | 442 +++++------------- uv.lock | 50 +- vendor/software-agent-sdk | 2 +- 8 files changed, 396 insertions(+), 824 deletions(-) diff --git a/.gitignore b/.gitignore index 9164fd12b..459fad588 100644 --- a/.gitignore +++ b/.gitignore @@ -216,5 +216,4 @@ workspace/ # Evaluation outputs eval_outputs/ -evaluation_outputs/ builds/ diff --git a/benchmarks/skillsbench/README.md b/benchmarks/skillsbench/README.md index 21339842c..60ff73652 100644 --- a/benchmarks/skillsbench/README.md +++ b/benchmarks/skillsbench/README.md @@ -1,10 +1,10 @@ # SkillsBench Evaluation -This module provides integration with [SkillsBench](https://www.skillsbench.ai/), a benchmark for evaluating AI agents on real-world skill-based tasks. The integration uses [benchflow](https://github.com/benchflow-ai/benchflow) as the evaluation harness with the `openhands` agent. +This module provides integration with [SkillsBench](https://www.skillsbench.ai/), a benchmark for evaluating AI agents on real-world skill-based tasks. The integration uses [Harbor](https://harborframework.com) as the evaluation harness with the `openhands-sdk` agent. 
## Overview -SkillsBench comprises tasks across 11 domains, evaluating the efficacy of Skills augmentation in LLM-based agents. Domains include: +SkillsBench comprises tasks across 11 domains, evaluating the efficacy of Skills augmentation in LLM-based agents.Domains contain - Software engineering - Office & white collar @@ -20,25 +20,23 @@ SkillsBench comprises tasks across 11 domains, evaluating the efficacy of Skills ## Prerequisites -1. **Install benchflow**: benchflow is the official harness for running SkillsBench. +1. **Install Harbor**: Harbor is the official harness for running SkillsBench. ```bash - uv tool install benchflow==0.3.0 + pip install harbor # or - pip install benchflow==0.3.0 - # or - uv pip install benchflow==0.3.0 + uv pip install harbor ``` -2. **Docker**: benchflow requires Docker to be installed and running. +2. **Docker**: Harbor requires Docker to be installed and running. -3. **LLM API Key**: Configure your LLM provider credentials. The benchflow `openhands` agent reads `LLM_API_KEY` and optional `LLM_BASE_URL` from the environment. +3. **LLM API Key**: Configure your LLM provider credentials. ## Usage ### Running Inference -Run the SkillsBench evaluation using the `openhands` agent: +Run the SkillsBench evaluation using the OpenHands SDK agent: ```bash # Run full evaluation @@ -64,7 +62,7 @@ Create an LLM configuration file (e.g., `.llm_config/claude.json`): ```json { "model": "anthropic/claude-sonnet-4-20250514", - "api_key": "YOUR_ANTHROPIC_API_KEY" + "api_key": "YOUR_API_KEY" } ``` @@ -101,6 +99,8 @@ Each line contains: { "instance_id": "benchflow/task-name", "test_result": { + "trial_name": "...", + "trial_uri": "...", "rewards": {"reward": 1.0}, "passed": true }, @@ -134,21 +134,22 @@ Each line contains: ## Architecture -The integration uses the benchflow CLI as the evaluation harness: +The integration follows the Harbor agent adapter pattern: -1. **Task download**: the integration clones the SkillsBench task repo locally when the task cache is empty -2. **benchflow job**: Runs all tasks concurrently with `openhands` -3. **Result conversion**: Trial `result.json` files are converted to the standard `output.jsonl` format +1. **Harbor Harness**: Manages task containers and lifecycle +2. **OpenHands SDK Agent**: Runs inside containers to solve tasks +3. 
**ATIF Trajectories**: Results stored in Agent Trajectory Interchange Format ```text ┌──────────────────────────────────────────────────┐ -│ benchflow job │ +│ Harbor Harness │ │ ┌────────────────────────────────────────────┐ │ -│ │ Task Container (Docker) │ │ +│ │ Task Container │ │ │ │ ┌──────────────────────────────────────┐ │ │ -│ │ │ openhands │ │ │ +│ │ │ OpenHands SDK Agent │ │ │ │ │ │ - Terminal tool │ │ │ │ │ │ - File editor tool │ │ │ +│ │ │ - Task tracker tool │ │ │ │ │ └──────────────────────────────────────┘ │ │ │ └────────────────────────────────────────────┘ │ └──────────────────────────────────────────────────┘ @@ -157,5 +158,6 @@ The integration uses the benchflow CLI as the evaluation harness: ## References - [SkillsBench](https://www.skillsbench.ai/) - The benchmark -- [benchflow](https://github.com/benchflow-ai/benchflow) - The evaluation harness -- [benchflow CLI reference](https://github.com/benchflow-ai/benchflow/blob/main/docs/cli-reference.md) - CLI documentation +- [Harbor](https://harborframework.com) - The evaluation harness +- [OpenHands SDK](https://github.com/OpenHands/software-agent-sdk) - The agent SDK +- [ATIF Specification](https://github.com/laude-institute/harbor/blob/main/docs/rfcs/0001-trajectory-format.md) - Trajectory format diff --git a/benchmarks/skillsbench/config.py b/benchmarks/skillsbench/config.py index 4ed541ab9..8b55a92b0 100644 --- a/benchmarks/skillsbench/config.py +++ b/benchmarks/skillsbench/config.py @@ -1,13 +1,16 @@ """SkillsBench configuration defaults.""" -# Default inference settings +# Default inference settings (only include values actually used by argparse) INFER_DEFAULTS = { "dataset": "benchflow/skillsbench", "output_dir": "./evaluation_outputs", "num_workers": 1, } -# benchflow configuration defaults -BENCHFLOW_DEFAULTS = { - "agent_name": "openhands", +# Harbor configuration defaults +HARBOR_DEFAULTS = { + # Harbor executable + "harbor_executable": "harbor", + # Default agent name for openhands-sdk + "agent_name": "openhands-sdk", } diff --git a/benchmarks/skillsbench/run_infer.py b/benchmarks/skillsbench/run_infer.py index 2e11a100a..a8afa7281 100644 --- a/benchmarks/skillsbench/run_infer.py +++ b/benchmarks/skillsbench/run_infer.py @@ -1,31 +1,24 @@ -"""SkillsBench inference script using the benchflow SDK. +"""SkillsBench inference script using Harbor with openhands-sdk agent. -This script runs SkillsBench evaluation using `benchflow job` as the harness -and `openhands` as the default agent. Results are saved in a format compatible +This script runs SkillsBench evaluation using Harbor as the harness +and openhands-sdk as the agent. Results are saved in a format compatible with the standard evaluation pipeline. 
Usage: - uv run skillsbench-infer - - # Run specific tasks - uv run skillsbench-infer --select tasks.txt + uv run skillsbench-infer --dataset benchflow/skillsbench """ import argparse import json import os -import re -import shutil import subprocess import sys -import tempfile from datetime import datetime, timezone from pathlib import Path -import yaml from pydantic import SecretStr -from benchmarks.skillsbench.config import BENCHFLOW_DEFAULTS, INFER_DEFAULTS +from benchmarks.skillsbench.config import HARBOR_DEFAULTS, INFER_DEFAULTS from benchmarks.utils.evaluation_utils import construct_eval_output_dir from benchmarks.utils.report_costs import generate_cost_report from openhands.sdk import LLM, get_logger @@ -33,469 +26,256 @@ logger = get_logger(__name__) -# Matches benchflow 0.3.0 job directory names: YYYY-MM-DD__HH-MM-SS -_TIMESTAMP_RE = re.compile(r"^\d{4}-\d{2}-\d{2}__\d{2}-\d{2}-\d{2}$") - -# "Total cost: $0.0487" -_COST_RE = re.compile(r"Total cost:\s*\$([0-9]+(?:\.[0-9]+)?)") -# "Tokens: ↑ input 404.21K • ... • ↓ output 7.83K" -_TOKENS_RE = re.compile(r"↑ input\s+([\d.]+)([KMB]?)\b.*?↓ output\s+([\d.]+)([KMB]?)\b") - +# Output filename for results OUTPUT_FILENAME = "output.jsonl" -TASK_REPOS = { - "skillsbench": { - "repo": "https://github.com/benchflow-ai/skillsbench.git", - "subdir": "tasks", - } -} - -_DIRECT_PROVIDER_ENV_VARS: dict[str, tuple[tuple[str, ...], str | None]] = { - "anthropic": (("ANTHROPIC_API_KEY",), "ANTHROPIC_BASE_URL"), - "gemini": (("GEMINI_API_KEY", "GOOGLE_API_KEY"), "GEMINI_BASE_URL"), - "google": (("GEMINI_API_KEY", "GOOGLE_API_KEY"), "GEMINI_BASE_URL"), - "openai": (("OPENAI_API_KEY",), "OPENAI_BASE_URL"), -} - - -def _infer_direct_provider(model: str) -> str | None: - """Infer the provider prefix for direct model names. - - Examples: - - gemini/gemini-2.5-pro -> gemini - - anthropic/claude-sonnet-4-5 -> anthropic - - litellm_proxy/anthropic/... -> None (proxy config uses LLM_* vars) - """ - if not model or model.startswith("litellm_proxy/"): - return None - if "/" in model: - provider = model.split("/", 1)[0].lower() - if provider in _DIRECT_PROVIDER_ENV_VARS: - return provider - return None - - -def _build_benchflow_agent_env(llm: LLM) -> dict[str, str]: - """Build the sandbox environment for benchflow's openhands agent. - - Only LLM-specific variables are returned — these go INTO the sandbox - container via the ``agent_env`` YAML key. The calling process inherits - the host environment normally; dumping ``os.environ`` here would leak - the entire host env into every container. - """ - env: dict[str, str] = {} - api_key: str | None = None - if llm.api_key: - api_key = ( - llm.api_key.get_secret_value() - if isinstance(llm.api_key, SecretStr) - else llm.api_key - ) - env["LLM_API_KEY"] = api_key - if llm.base_url: - env["LLM_BASE_URL"] = llm.base_url - - provider = _infer_direct_provider(llm.model) - if provider and api_key: - key_vars, base_url_var = _DIRECT_PROVIDER_ENV_VARS[provider] - for var_name in key_vars: - env[var_name] = api_key - if llm.base_url and base_url_var: - env[base_url_var] = llm.base_url - - return env - - -def check_benchflow_installed() -> bool: - """Check if benchflow CLI is installed and available. - - Tries ``bench`` first (current name), then falls back to the legacy - ``benchflow`` binary. 
- """ - for cmd in ("bench", "benchflow"): - try: - result = subprocess.run( - [cmd, "--help"], - capture_output=True, - text=True, - timeout=10, - ) - if result.returncode == 0: - return True - except (FileNotFoundError, subprocess.TimeoutExpired): - continue - return False - -def _resolve_task_repo(dataset: str) -> tuple[str, dict[str, str]]: - """Map a benchflow dataset name to its task repository metadata.""" - dataset_name = dataset.split("@", 1)[0].split("/")[-1] +def check_harbor_installed() -> bool: + """Check if harbor CLI is installed and available.""" + harbor_exe = HARBOR_DEFAULTS["harbor_executable"] try: - return dataset_name, TASK_REPOS[dataset_name] - except KeyError as exc: - raise ValueError( - f"Unsupported SkillsBench dataset: {dataset!r}. " - f"Known datasets: {sorted(TASK_REPOS)}" - ) from exc - - -def ensure_tasks( - dataset: str, - tasks_dir: Path, - task_ids: list[str] | None = None, -) -> None: - """Download tasks for a benchflow dataset into tasks_dir. - - BenchFlow 0.3.0 does not expose ``benchflow tasks pull``, so we clone the - benchmark task repository directly when the local tasks directory is empty. - - When *task_ids* is provided, a sparse checkout is used so only the - requested task subdirectories are downloaded — much faster than a full - clone for large repos. - """ - if tasks_dir.exists() and any(tasks_dir.iterdir()): - logger.info(f"Tasks already present in {tasks_dir}, skipping download") - return - - _, repo_info = _resolve_task_repo(dataset) - tasks_dir.mkdir(parents=True, exist_ok=True) - clone_dir = tasks_dir.parent / "_clone" - if clone_dir.exists(): - shutil.rmtree(clone_dir, ignore_errors=True) - - subdir = repo_info.get("subdir", "") - - if task_ids: - # Sparse checkout: only download the specific task directories - short_names = [tid.split("/")[-1] for tid in task_ids] - - cmd_clone = [ - "git", - "clone", - "--no-checkout", - "--depth", - "1", - repo_info["repo"], - str(clone_dir), - ] - logger.info(f"Sparse clone: {' '.join(cmd_clone)}") - result = subprocess.run(cmd_clone, capture_output=True, text=True) - if result.returncode != 0: - raise RuntimeError(f"task download failed: {result.stderr}") - - # Init sparse-checkout and set the desired paths - subprocess.run( - ["git", "-C", str(clone_dir), "sparse-checkout", "init", "--cone"], + result = subprocess.run( + [harbor_exe, "--version"], capture_output=True, text=True, - check=True, + timeout=10, ) - sparse_paths = [f"{subdir}/{name}" if subdir else name for name in short_names] - subprocess.run( - ["git", "-C", str(clone_dir), "sparse-checkout", "set", *sparse_paths], - capture_output=True, - text=True, - check=True, - ) - subprocess.run( - ["git", "-C", str(clone_dir), "checkout"], - capture_output=True, - text=True, - check=True, - ) - else: - # Full shallow clone - cmd = ["git", "clone", "--depth", "1", repo_info["repo"], str(clone_dir)] - logger.info(f"Downloading tasks: {' '.join(cmd)}") - result = subprocess.run(cmd, capture_output=True, text=True) - if result.returncode != 0: - logger.error(f"Failed to clone tasks: {result.stderr}") - raise RuntimeError(f"task download failed: {result.stderr}") - - try: - source_dir = clone_dir / subdir if subdir else clone_dir - - for entry in source_dir.iterdir(): - target = tasks_dir / entry.name - if entry.is_dir(): - shutil.copytree(entry, target, dirs_exist_ok=True) - else: - shutil.copy2(entry, target) - finally: - shutil.rmtree(clone_dir, ignore_errors=True) + return result.returncode == 0 + except (FileNotFoundError, 
subprocess.TimeoutExpired): + return False - logger.info(f"Tasks downloaded to {tasks_dir}") - -def run_benchflow_job( +def run_harbor_evaluation( llm: LLM, - tasks_dir: Path, - jobs_dir: Path, + dataset: str, + output_dir: str, num_workers: int = 1, task_ids: list[str] | None = None, + n_limit: int | None = None, ) -> Path: - """Run benchflow job command. + """Run harbor evaluation with openhands-sdk agent. Args: llm: LLM configuration for the agent. - tasks_dir: Path to directory containing task subdirectories. - jobs_dir: Directory for benchflow job output. - num_workers: Number of parallel workers (concurrency). - task_ids: Optional list of task IDs to filter (short names, not full paths). + dataset: Harbor dataset name (e.g., benchflow/skillsbench). + output_dir: Directory to store output files. + num_workers: Number of parallel workers. + task_ids: Optional list of specific task IDs to run. + n_limit: Optional maximum number of dataset tasks to run. Returns: - Path to jobs_dir. + Path to the harbor output directory. """ - jobs_dir.mkdir(parents=True, exist_ok=True) - - agent_env = _build_benchflow_agent_env(llm) - # Ubuntu 24.04 enforces PEP 668 and blocks bare `pip install` without - # --break-system-packages. benchflow's openhands install_cmd uses plain - # `pip install openhands`, which silently fails (exit 0) on Ubuntu 24.04, - # causing "Agent openhands install failed (rc=1)". Setting this env var - # makes pip skip the restriction without modifying the install_cmd. - agent_env.setdefault("PIP_BREAK_SYSTEM_PACKAGES", "1") - config = { - "tasks_dir": str(tasks_dir), - "jobs_dir": str(jobs_dir.resolve()), - "agent": BENCHFLOW_DEFAULTS["agent_name"], - "model": llm.model, - "environment": "docker", - "concurrency": num_workers, - # OpenHands is installed inside the sandbox as root by benchflow's - # registry install command. Running as the default "agent" user can - # lose access to that binary on some task images. - "sandbox_user": None, - "agent_env": agent_env, - } - - with tempfile.NamedTemporaryFile( - mode="w", suffix=".yaml", prefix="benchflow-job-", delete=False - ) as tmp: - yaml.safe_dump(config, tmp, sort_keys=False) - config_path = tmp.name - - # Prefer `bench eval create` (current), fall back to legacy `benchflow job` - bench_bin = shutil.which("bench") or shutil.which("benchflow") or "bench" - if "benchflow" in bench_bin: - cmd = [bench_bin, "job", "--config", config_path] - else: - cmd = [bench_bin, "eval", "create", "-f", config_path] + harbor_output_dir = Path(output_dir) / "harbor_output" + harbor_output_dir.mkdir(parents=True, exist_ok=True) + harbor_exe = HARBOR_DEFAULTS["harbor_executable"] + + # Build harbor command using harbor CLI flags. + # Use absolute path for --jobs-dir to avoid CWD-relative path issues. 
+ cmd = [ + harbor_exe, + "run", + "-d", + dataset, + "-a", + HARBOR_DEFAULTS["agent_name"], + "-m", + llm.model, + "--jobs-dir", + str(harbor_output_dir.resolve()), + "--n-concurrent", + str(num_workers), + ] + + # Pass LLM credentials as agent environment variables + if llm.api_key: + api_key = ( + llm.api_key.get_secret_value() + if isinstance(llm.api_key, SecretStr) + else llm.api_key + ) + cmd.extend(["--ae", f"LLM_API_KEY={api_key}"]) + if llm.base_url: + cmd.extend(["--ae", f"LLM_BASE_URL={llm.base_url}"]) - logger.info(f"Running: {' '.join(cmd)}") + # Add specific task names if provided + if task_ids: + for task_id in task_ids: + cmd.extend(["--include-task-name", task_id]) - # Inject LLM vars into the host process env so benchflow's provider - # resolution can pick them up; the subprocess inherits normally (env=None). - host_env = os.environ.copy() - host_env.update(agent_env) - result = subprocess.run(cmd, capture_output=True, text=True, env=host_env) - Path(config_path).unlink(missing_ok=True) + if n_limit is not None: + cmd.extend(["--n-tasks", str(n_limit)]) - if result.returncode != 0: - logger.error(f"benchflow job failed (code {result.returncode})") - logger.error(f"stdout: {result.stdout}") - logger.error(f"stderr: {result.stderr}") - raise RuntimeError(f"benchflow job failed: {result.stderr}") + logger.info(f"Running harbor command: {' '.join(cmd)}") + logger.info(f"Output directory: {harbor_output_dir}") - logger.info("benchflow job completed") - logger.info(f"stdout: {result.stdout}") + # harbor's openhands-sdk agent reads LLM credentials from the host process + # environment (os.environ), not from --ae flags which go to the sandbox. + env = os.environ.copy() + if llm.api_key: + api_key = ( + llm.api_key.get_secret_value() + if isinstance(llm.api_key, SecretStr) + else llm.api_key + ) + env["LLM_API_KEY"] = api_key + if llm.base_url: + env["LLM_BASE_URL"] = llm.base_url - return jobs_dir + try: + result = subprocess.run( + cmd, + capture_output=True, + text=True, + env=env, + ) + if result.returncode != 0: + logger.error(f"Harbor command failed with code {result.returncode}") + logger.error(f"stdout: {result.stdout}") + logger.error(f"stderr: {result.stderr}") + raise RuntimeError(f"Harbor evaluation failed: {result.stderr}") -def _extract_trial_metrics(trial_dir: Path) -> dict: - """Extract token/cost metrics from benchflow 0.3.0 trial output files. + logger.info("Harbor evaluation completed successfully") + logger.info(f"stdout: {result.stdout}") - benchflow 0.3.0 does not write cost/token fields to result.json. - Instead, metrics are read from: - 1. agent/trajectory.json → final_metrics (harbor-format agent) - 2. agent/openhands.txt → "Total cost:" and "Tokens:" lines (ACP agent) - """ - # 1. Harbor-format trajectory.json written by openhands-sdk agent - traj_file = trial_dir / "agent" / "trajectory.json" - if traj_file.exists(): - try: - with open(traj_file) as f: - traj = json.load(f) - fm = traj.get("final_metrics") or {} - if fm: - return { - "total_prompt_tokens": int(fm.get("total_prompt_tokens") or 0), - "total_completion_tokens": int( - fm.get("total_completion_tokens") or 0 - ), - "total_cost_usd": float(fm.get("total_cost_usd") or 0.0), - } - except (json.JSONDecodeError, OSError): - pass - - # 2. 
ACP agent log written by openhands acp (benchflow 0.3.0 native) - def _parse_token_count(value: str, suffix: str) -> int: - n = float(value) - return int( - n * {"K": 1_000, "M": 1_000_000, "B": 1_000_000_000}.get(suffix.upper(), 1) + except FileNotFoundError: + raise RuntimeError( + "Harbor CLI not found. Please install harbor: pip install harbor" ) - for log_name in ("openhands.txt", "openhands_sdk.txt"): - log_file = trial_dir / "agent" / log_name - if not log_file.exists(): - continue - try: - text = log_file.read_text(errors="replace") - cost_usd = 0.0 - prompt_tokens = 0 - completion_tokens = 0 - m = _COST_RE.search(text) - if m: - cost_usd = float(m.group(1)) - m = _TOKENS_RE.search(text) - if m: - prompt_tokens = _parse_token_count(m.group(1), m.group(2)) - completion_tokens = _parse_token_count(m.group(3), m.group(4)) - if cost_usd or prompt_tokens: - return { - "total_prompt_tokens": prompt_tokens, - "total_completion_tokens": completion_tokens, - "total_cost_usd": cost_usd, - } - except OSError: - pass - - return { - "total_prompt_tokens": 0, - "total_completion_tokens": 0, - "total_cost_usd": 0.0, - } + return harbor_output_dir + + +def _find_job_dir(harbor_output_dir: Path) -> Path: + """Find the harbor job directory (timestamp-named) inside the output dir.""" + # Harbor creates a timestamp-named subdirectory (e.g., 2026-03-07__16-08-47) + # containing result.json and trial subdirectories + candidates = [ + d + for d in harbor_output_dir.iterdir() + if d.is_dir() and (d / "result.json").exists() + ] + if not candidates: + raise RuntimeError( + f"No harbor job directory found in {harbor_output_dir}. " + f"Expected a timestamp-named directory containing result.json." + ) + # Use the most recent job directory if multiple exist + return sorted(candidates)[-1] -def convert_benchflow_to_eval_output( - jobs_dir: Path, +def convert_harbor_to_eval_output( + harbor_output_dir: Path, eval_output_path: Path, - task_ids: list[str] | None = None, ) -> None: - """Convert benchflow job output to standard evaluation output format. + """Convert harbor output to evaluation output format. - benchflow 0.3.0 stores trial results as: - jobs_dir/YYYY-MM-DD__HH-MM-SS/TASK_NAME__UUID8/result.json + Harbor stores trial results in a job directory structured as: + harbor_output/TIMESTAMP/TRIAL_NAME/result.json - Each result.json contains task_name, rewards, error, verifier_error, and timing. + Each trial's result.json contains task_name, verifier_result, agent_result, + timing info, and exception details. Args: - jobs_dir: Path to benchflow jobs directory. - eval_output_path: Path to write output.jsonl. - task_ids: Optional filter for specific task IDs (short names). + harbor_output_dir: Path to harbor output directory. + eval_output_path: Path to write the converted output.jsonl. """ - logger.info(f"Converting benchflow output from {jobs_dir}") - - # benchflow 0.3.0 writes: - # jobs/summary.json - # jobs/TIMESTAMP/TRIAL_NAME/result.json - # while older local outputs may place results directly under jobs/. 
- job_dirs = [d for d in jobs_dir.iterdir() if d.is_dir()] - timestamp_job_dirs = [d for d in job_dirs if _TIMESTAMP_RE.match(d.name)] - - if timestamp_job_dirs: - selected_job_dir = sorted(timestamp_job_dirs)[-1] - logger.info(f"Using benchflow job directory: {selected_job_dir}") - task_dirs = [d for d in selected_job_dir.iterdir() if d.is_dir()] - else: - task_dirs = job_dirs + logger.info(f"Converting harbor output from {harbor_output_dir}") - if not task_dirs: - raise RuntimeError(f"No task directories found in {jobs_dir}") + job_dir = _find_job_dir(harbor_output_dir) + logger.info(f"Using harbor job directory: {job_dir}") - if task_ids: - short_ids = {tid.split("/")[-1] for tid in task_ids} - task_dirs = [d for d in task_dirs if d.name.split("__")[0] in short_ids] + # Find trial result files (each trial dir has a result.json) + result_files = list(job_dir.glob("*/result.json")) + # Exclude the job-level result.json + result_files = [f for f in result_files if f.parent != job_dir] + + if not result_files: + raise RuntimeError( + f"No trial result files found in {job_dir}. " + f"Expected result.json files in trial subdirectories." + ) - logger.info(f"Processing {len(task_dirs)} task directories") + logger.info(f"Found {len(result_files)} trial results in {job_dir}") results: list[dict] = [] errors: list[dict] = [] - for task_dir in sorted(task_dirs): - # Find the trial result — benchflow writes trial-0/result.json - trial_results = list(task_dir.glob("trial-*/result.json")) - if not trial_results: - # Fall back to a direct result.json - direct = task_dir / "result.json" - if direct.exists(): - trial_results = [direct] - - if not trial_results: - logger.warning(f"No result.json found in {task_dir}, skipping") - errors.append( - { - "instance_id": f"benchflow/{task_dir.name}", - "error": "No result.json found", - "test_result": {}, - } - ) - continue - - # Use the last trial (highest retry index) - result_file = sorted(trial_results)[-1] - + for result_file in result_files: try: with open(result_file) as f: trial = json.load(f) - task_basename = task_dir.name.split("__")[0] - task_name = trial.get("task_name") or f"benchflow/{task_basename}" - # Normalise to benchflow/ form - if "/" not in task_name: - task_name = f"benchflow/{task_name}" + instance_id = trial.get("task_name", result_file.parent.name) - error = trial.get("error") - verifier_error = trial.get("verifier_error") - - if error or verifier_error: + # Check for exceptions + if trial.get("exception_info"): errors.append( { - "instance_id": task_name, - "error": str(error or verifier_error), + "instance_id": instance_id, + "error": str(trial["exception_info"]), "test_result": {}, } ) continue - rewards = trial.get("rewards") or {} - passed = bool(rewards.get("reward", 0.0)) + # Extract verifier results + verifier_result = trial.get("verifier_result", {}) + rewards = verifier_result.get("rewards", {}) + passed = rewards.get("reward", 0.0) > 0 + + # Extract agent metrics + agent_result = trial.get("agent_result", {}) eval_entry = { - "instance_id": task_name, + "instance_id": instance_id, "test_result": { + "trial_name": trial.get("trial_name"), + "trial_uri": trial.get("trial_uri"), "rewards": rewards, "passed": passed, }, "instruction": "", "error": None, "history": [], - "metrics": _extract_trial_metrics(result_file.parent), + "metrics": { + "total_prompt_tokens": agent_result.get("n_input_tokens") or 0, + "total_completion_tokens": ( + agent_result.get("n_output_tokens") or 0 + ), + "total_cost_usd": agent_result.get("cost_usd") 
or 0.0, + }, } results.append(eval_entry) - logger.info(f"Processed {task_name}: reward={rewards.get('reward', 'N/A')}") + logger.info( + f"Processed trial {instance_id}: reward={rewards.get('reward', 'N/A')}" + ) except (json.JSONDecodeError, OSError) as e: - logger.error(f"Failed to read {result_file}: {e}") + logger.error(f"Failed to process result file {result_file}: {e}") errors.append( { - "instance_id": f"benchflow/{task_dir.name}", + "instance_id": result_file.parent.name, "error": str(e), "test_result": {}, } ) if not results and not errors: - raise RuntimeError(f"No trials processed from {jobs_dir}") + raise RuntimeError(f"No trials processed from {harbor_output_dir}") if not results: logger.warning( - f"All {len(errors)} trials failed; writing error entries for reporting" + f"All {len(errors)} trials failed in {harbor_output_dir}; " + "writing error entries for downstream reporting" ) + # Write results to output.jsonl with open(eval_output_path, "w") as f: - for entry in results + errors: + for entry in results: + f.write(json.dumps(entry) + "\n") + for entry in errors: f.write(json.dumps(entry) + "\n") logger.info( @@ -518,18 +298,18 @@ def load_task_ids_from_file(filepath: str) -> list[str]: def main() -> None: """Main entry point for skillsbench inference.""" parser = argparse.ArgumentParser( - description="Run SkillsBench evaluation with benchflow and openhands", + description="Run SkillsBench evaluation with openhands-sdk via Harbor", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" Examples: # Run full skillsbench evaluation uv run skillsbench-infer .llm_config/claude.json - # Run specific tasks from a file + # Run specific tasks uv run skillsbench-infer .llm_config/claude.json --select tasks.txt - # Run with more concurrency - uv run skillsbench-infer .llm_config/claude.json --num-workers 4 + # Run with custom dataset version + uv run skillsbench-infer .llm_config/claude.json --dataset benchflow/skillsbench@1.0 """, ) @@ -542,7 +322,7 @@ def main() -> None: "--dataset", type=str, default=INFER_DEFAULTS["dataset"], - help="benchflow dataset name (e.g., benchflow/skillsbench)", + help="Harbor dataset name (e.g., benchflow/skillsbench)", ) parser.add_argument( "--output-dir", @@ -554,12 +334,12 @@ def main() -> None: "--num-workers", type=int, default=INFER_DEFAULTS["num_workers"], - help="Number of parallel workers (concurrency)", + help="Number of parallel workers", ) parser.add_argument( "--n-limit", type=int, - help="Maximum number of tasks to run", + help="Maximum number of dataset tasks to run after Harbor filtering", ) parser.add_argument( "--select", @@ -578,13 +358,14 @@ def main() -> None: help="Optional note for the evaluation run", ) parser.add_argument( - "--skip-run", + "--skip-harbor", action="store_true", - help="Skip running benchflow and only convert existing results", + help="Skip running harbor and only convert existing results", ) args = parser.parse_args() + # Validate LLM config if not os.path.isfile(args.llm_config_path): logger.error(f"LLM config file does not exist: {args.llm_config_path}") sys.exit(1) @@ -594,103 +375,87 @@ def main() -> None: llm = LLM.model_validate_json(llm_config) logger.info(f"Using LLM: {llm.model}") - if not args.skip_run and not check_benchflow_installed(): + # Check harbor installation + if not args.skip_harbor and not check_harbor_installed(): logger.error( - "benchflow CLI is not installed. Please install it:\n" - " uv tool install benchflow==0.3.0\n" + "Harbor CLI is not installed. 
Please install it:\n" + " pip install harbor\n" " # or\n" - " pip install benchflow==0.3.0\n" - " # or\n" - " uv pip install benchflow==0.3.0" + " uv pip install harbor" ) sys.exit(1) + # Construct output directory dataset_description = args.dataset.replace("/", "__").replace("@", "-") structured_output_dir = construct_eval_output_dir( base_dir=args.output_dir, dataset_name=dataset_description, model_name=llm.model, - max_iterations=100, + max_iterations=100, # Not directly used but required for path construction eval_note=args.note, ) logger.info(f"Output directory: {structured_output_dir}") os.makedirs(structured_output_dir, exist_ok=True) + # Save metadata metadata = { "llm": llm.model_dump_json(), "dataset": args.dataset, "timestamp": datetime.now(timezone.utc).isoformat(), - "benchflow_agent": BENCHFLOW_DEFAULTS["agent_name"], + "harbor_agent": HARBOR_DEFAULTS["agent_name"], "note": args.note, } metadata_path = Path(structured_output_dir) / "metadata.json" with open(metadata_path, "w") as f: json.dump(metadata, f, indent=2) + # Collect task IDs if specified task_ids: list[str] | None = None if args.select: - task_ids = load_task_ids_from_file(args.select) - logger.info(f"Loaded {len(task_ids)} task IDs from {args.select}") + loaded_ids = load_task_ids_from_file(args.select) + task_ids = loaded_ids + logger.info(f"Loaded {len(loaded_ids)} task IDs from {args.select}") elif args.task_id: task_ids = list(args.task_id) logger.info(f"Running {len(task_ids)} specified task IDs") - tasks_dir = Path(structured_output_dir) / "tasks" - jobs_dir = Path(structured_output_dir) / "jobs" output_path = Path(structured_output_dir) / OUTPUT_FILENAME - if not args.skip_run: + if not args.skip_harbor: + # Run harbor evaluation try: - ensure_tasks(args.dataset, tasks_dir, task_ids=task_ids) - - # Apply n_limit by slicing available task directories - effective_task_dirs = tasks_dir - if args.n_limit is not None or task_ids is not None: - all_dirs = sorted(d for d in tasks_dir.iterdir() if d.is_dir()) - if task_ids: - short_ids = {tid.split("/")[-1] for tid in task_ids} - all_dirs = [d for d in all_dirs if d.name in short_ids] - if args.n_limit is not None: - all_dirs = all_dirs[: args.n_limit] - - # Write a filtered tasks dir symlink tree - filtered_tasks_dir = Path(structured_output_dir) / "tasks_filtered" - filtered_tasks_dir.mkdir(exist_ok=True) - for d in all_dirs: - link = filtered_tasks_dir / d.name - if not link.exists(): - link.symlink_to(d.resolve()) - effective_task_dirs = filtered_tasks_dir - - run_benchflow_job( + harbor_output_dir = run_harbor_evaluation( llm=llm, - tasks_dir=effective_task_dirs, - jobs_dir=jobs_dir, + dataset=args.dataset, + output_dir=structured_output_dir, num_workers=args.num_workers, task_ids=task_ids, + n_limit=args.n_limit, ) - convert_benchflow_to_eval_output( - jobs_dir=jobs_dir, + # Convert harbor output to standard format + convert_harbor_to_eval_output( + harbor_output_dir=harbor_output_dir, eval_output_path=output_path, - task_ids=task_ids, ) except Exception as e: logger.error(f"Evaluation failed: {e}") sys.exit(1) else: - if jobs_dir.exists(): - convert_benchflow_to_eval_output( - jobs_dir=jobs_dir, + # Skip harbor, just convert existing results + harbor_output_dir = Path(structured_output_dir) / "harbor_output" + if harbor_output_dir.exists(): + convert_harbor_to_eval_output( + harbor_output_dir=harbor_output_dir, eval_output_path=output_path, - task_ids=task_ids, ) else: - logger.error(f"No jobs output found at {jobs_dir}") + logger.error(f"No harbor output 
found at {harbor_output_dir}") sys.exit(1) + # Generate cost report if output_path.exists(): generate_cost_report(str(output_path)) diff --git a/tests/test_skillsbench_eval_infer.py b/tests/test_skillsbench_eval_infer.py index 1334da297..56d54f27a 100644 --- a/tests/test_skillsbench_eval_infer.py +++ b/tests/test_skillsbench_eval_infer.py @@ -39,6 +39,23 @@ def test_resolved_instance(self, tmp_path: Path) -> None: assert result["unresolved_instances"] == 0 assert "benchflow/weighted-gdp-calc" in result["resolved_ids"] + def test_unresolved_instance(self, tmp_path: Path) -> None: + """Test processing an unresolved (passed=False) instance.""" + input_file = tmp_path / "unresolved.jsonl" + output_file = tmp_path / "unresolved.report.json" + + entry = { + "instance_id": "benchflow/task-1", + "test_result": {"passed": False, "rewards": {"reward": 0.0}}, + "error": None, + } + input_file.write_text(json.dumps(entry) + "\n") + + result = process_skillsbench_results(str(input_file), str(output_file)) + + assert result["resolved_instances"] == 0 + assert result["unresolved_instances"] == 1 + def test_instance_with_error(self, tmp_path: Path) -> None: """Test processing an instance that errored.""" input_file = tmp_path / "error.jsonl" diff --git a/tests/test_skillsbench_run_infer.py b/tests/test_skillsbench_run_infer.py index 784b4d1cc..5f8452cb3 100644 --- a/tests/test_skillsbench_run_infer.py +++ b/tests/test_skillsbench_run_infer.py @@ -4,102 +4,82 @@ from pathlib import Path import pytest -import yaml -from benchmarks.skillsbench.config import BENCHFLOW_DEFAULTS, INFER_DEFAULTS +from benchmarks.skillsbench.config import INFER_DEFAULTS from benchmarks.skillsbench.run_infer import ( - _build_benchflow_agent_env, - convert_benchflow_to_eval_output, - run_benchflow_job, + convert_harbor_to_eval_output, + run_harbor_evaluation, ) from openhands.sdk import LLM -class TestRunBenchflowJob: - """Tests for building benchflow job invocation arguments.""" +class TestRunHarborEvaluation: + """Tests for building Harbor invocation arguments.""" - def test_default_dataset_matches_benchflow_registry(self) -> None: - """Test that the default dataset name matches benchflow's published registry.""" + def test_default_dataset_matches_harbor_registry(self) -> None: + """Test that the default dataset name matches Harbor's published registry.""" assert INFER_DEFAULTS["dataset"] == "benchflow/skillsbench" - def test_default_agent_is_openhands(self) -> None: - """Test that the default agent is openhands.""" - assert BENCHFLOW_DEFAULTS["agent_name"] == "openhands" - - def test_run_benchflow_job_passes_model_and_concurrency( + def test_run_harbor_evaluation_passes_filters_and_limits( self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch ) -> None: - """Test benchflow job command writes the expected YAML config.""" - captured_cmd: list[str] = [] - captured_env: dict[str, str] = {} - captured_config: dict = {} - - # Force legacy benchflow binary path so the command format is deterministic - monkeypatch.setattr( - "benchmarks.skillsbench.run_infer.shutil.which", - lambda name: "/usr/local/bin/benchflow" if name == "benchflow" else None, - ) + """Test Harbor command includes task filters and n-limit.""" + captured: dict[str, list[str]] = {} def fake_run(cmd: list[str], capture_output: bool, text: bool, env: dict): - captured_cmd[:] = cmd - captured_env.clear() - captured_env.update(env) - with open(cmd[3]) as f: - captured_config.update(yaml.safe_load(f)) + captured["cmd"] = cmd return type( "Completed", (), - {"returncode": 
0, "stdout": "Score: 1/1 (100%)", "stderr": ""}, + {"returncode": 0, "stdout": "ok", "stderr": ""}, )() monkeypatch.setattr("benchmarks.skillsbench.run_infer.subprocess.run", fake_run) - tasks_dir = tmp_path / "tasks" - tasks_dir.mkdir() - jobs_dir = tmp_path / "jobs" - - run_benchflow_job( + harbor_output_dir = run_harbor_evaluation( llm=LLM( - model="anthropic/claude-sonnet-4-5", + model="litellm_proxy/test-model", api_key="test-key", base_url="https://proxy.example.com", ), - tasks_dir=tasks_dir, - jobs_dir=jobs_dir, - num_workers=4, - ) - - cmd = captured_cmd - assert cmd[0] == "/usr/local/bin/benchflow" - assert cmd[1] == "job" - assert cmd[2] == "--config" - assert captured_config["tasks_dir"] == str(tasks_dir) - assert captured_config["jobs_dir"] == str(jobs_dir.resolve()) - assert captured_config["agent"] == "openhands" - assert captured_config["model"] == "anthropic/claude-sonnet-4-5" - assert captured_config["concurrency"] == 4 - assert captured_config["sandbox_user"] is None - - def test_llm_credentials_passed_via_subprocess_env( + dataset=INFER_DEFAULTS["dataset"], + output_dir=str(tmp_path), + num_workers=2, + task_ids=["benchflow/task-a", "benchflow/task-b"], + n_limit=3, + ) + + expected_output_dir = tmp_path / "harbor_output" + assert harbor_output_dir == expected_output_dir + + cmd = captured["cmd"] + assert cmd[:8] == [ + "harbor", + "run", + "-d", + "benchflow/skillsbench", + "-a", + "openhands-sdk", + "-m", + "litellm_proxy/test-model", + ] + assert "--jobs-dir" in cmd + assert str(expected_output_dir.resolve()) in cmd + assert cmd.count("--include-task-name") == 2 + assert "benchflow/task-a" in cmd + assert "benchflow/task-b" in cmd + assert cmd[cmd.index("--n-concurrent") + 1] == "2" + assert cmd[cmd.index("--n-tasks") + 1] == "3" + + def test_llm_credentials_passed_via_env( self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch ) -> None: - """Test that LLM credentials are passed via subprocess env and YAML.""" - captured_cmd: list[str] = [] - captured_env: dict[str, str] = {} - captured_config: dict = {} - - # Force legacy benchflow binary path so the command format is deterministic - monkeypatch.setattr( - "benchmarks.skillsbench.run_infer.shutil.which", - lambda name: "/usr/local/bin/benchflow" if name == "benchflow" else None, - ) + """Test that LLM credentials are passed via subprocess env, not --ae flags.""" + captured: dict = {} def fake_run(cmd: list[str], capture_output: bool, text: bool, env: dict): - captured_cmd[:] = cmd - captured_env.clear() - captured_env.update(env) - with open(cmd[3]) as f: - captured_config.update(yaml.safe_load(f)) + captured["cmd"] = cmd + captured["env"] = env return type( "Completed", (), @@ -108,127 +88,60 @@ def fake_run(cmd: list[str], capture_output: bool, text: bool, env: dict): monkeypatch.setattr("benchmarks.skillsbench.run_infer.subprocess.run", fake_run) - tasks_dir = tmp_path / "tasks" - tasks_dir.mkdir() - - run_benchflow_job( + run_harbor_evaluation( llm=LLM( model="test-model", api_key="my-secret-key", base_url="https://my-proxy.example.com", ), - tasks_dir=tasks_dir, - jobs_dir=tmp_path / "jobs", - ) - - # Credentials in subprocess env - assert captured_env["LLM_API_KEY"] == "my-secret-key" - assert captured_env["LLM_BASE_URL"] == "https://my-proxy.example.com" - assert "--ae" not in captured_cmd - assert captured_config["agent_env"]["LLM_API_KEY"] == "my-secret-key" - assert ( - captured_config["agent_env"]["LLM_BASE_URL"] - == "https://my-proxy.example.com" - ) - - def 
test_direct_gemini_model_sets_provider_env_vars(self) -> None: - """Direct provider models need provider-specific env vars.""" - env = _build_benchflow_agent_env( - LLM( - model="gemini/gemini-3.1-flash-lite-preview", - api_key="gemini-test-key", - ) - ) - - assert env["LLM_API_KEY"] == "gemini-test-key" - assert env["GEMINI_API_KEY"] == "gemini-test-key" - assert env["GOOGLE_API_KEY"] == "gemini-test-key" - - def test_proxy_model_does_not_set_provider_env_vars(self) -> None: - """LiteLLM proxy configs should keep using generic LLM_* vars only.""" - env = _build_benchflow_agent_env( - LLM( - model="litellm_proxy/anthropic/claude-sonnet-4-20250514", - api_key="proxy-key", - base_url="https://proxy.example.com", - ) + dataset=INFER_DEFAULTS["dataset"], + output_dir=str(tmp_path), ) - assert env["LLM_API_KEY"] == "proxy-key" - assert env["LLM_BASE_URL"] == "https://proxy.example.com" - assert "ANTHROPIC_API_KEY" not in env - assert "ANTHROPIC_BASE_URL" not in env + assert captured["env"]["LLM_API_KEY"] == "my-secret-key" + assert captured["env"]["LLM_BASE_URL"] == "https://my-proxy.example.com" -class TestConvertBenchflowToEvalOutput: - """Tests for convert_benchflow_to_eval_output function.""" +class TestConvertHarborToEvalOutput: + """Tests for convert_harbor_to_eval_output function.""" - def _create_benchflow_structure( - self, tmp_path: Path, tasks: list[tuple[str, dict]] - ) -> Path: - """Create a mock benchflow jobs directory structure. - - benchflow writes: jobs_dir/TASK_NAME/trial-0/result.json - """ - jobs_dir = tmp_path / "jobs" - for task_name, result in tasks: - trial_dir = jobs_dir / task_name / "trial-0" - trial_dir.mkdir(parents=True) - (trial_dir / "result.json").write_text(json.dumps(result)) - return jobs_dir - - def _create_benchflow_timestamped_job( - self, tmp_path: Path, tasks: list[tuple[str, dict]] + def _create_harbor_structure( + self, tmp_path: Path, trials: list[tuple[str, dict]] ) -> Path: - """Create a mock benchflow 0.3.0 jobs directory structure. - - benchflow writes: jobs/TIMESTAMP/TASK_NAME__RUNID/result.json - """ - jobs_dir = tmp_path / "jobs" - job_dir = jobs_dir / "2026-04-21__23-12-35" + """Create a mock Harbor output structure.""" + harbor_dir = tmp_path / "harbor_output" + job_dir = harbor_dir / "2026-01-01__00-00-00" job_dir.mkdir(parents=True) - (jobs_dir / "summary.json").write_text(json.dumps({"total": len(tasks)})) - for task_name, result in tasks: - trial_dir = job_dir / f"{task_name}__abc123" - trial_dir.mkdir(parents=True) - (trial_dir / "result.json").write_text(json.dumps(result)) - return jobs_dir + (job_dir / "result.json").write_text(json.dumps({"id": "test-job"})) - def test_successful_trial_parsing(self, tmp_path: Path) -> None: - """Test successful parsing of a benchflow trial result. + for trial_name, trial_result in trials: + trial_dir = job_dir / trial_name + trial_dir.mkdir() + (trial_dir / "result.json").write_text(json.dumps(trial_result)) - benchflow 0.3.0 does not write cost/token fields to result.json. - Metrics are read from agent/trajectory.json (harbor-format agent) - or parsed from agent/openhands.txt (ACP agent stdout). 
- """ + return harbor_dir + + def test_successful_trial_parsing(self, tmp_path: Path) -> None: + """Test successful parsing of harbor trial result.""" trial_result = { "task_name": "benchflow/weighted-gdp-calc", - "rewards": {"reward": 1.0}, - "error": None, + "trial_name": "weighted-gdp-calc__abc123", + "trial_uri": "file:///path/to/trial", + "agent_result": { + "n_input_tokens": 1000, + "n_output_tokens": 200, + "cost_usd": 0.05, + }, + "verifier_result": {"rewards": {"reward": 1.0}}, + "exception_info": None, } - jobs_dir = self._create_benchflow_structure( - tmp_path, [("weighted-gdp-calc", trial_result)] - ) - # Write agent/trajectory.json with final_metrics (harbor-format agent output). - # agent/ sits next to result.json, inside the trial-0 subdirectory. - trial_dir = jobs_dir / "weighted-gdp-calc" / "trial-0" - agent_dir = trial_dir / "agent" - agent_dir.mkdir(parents=True, exist_ok=True) - (agent_dir / "trajectory.json").write_text( - json.dumps( - { - "final_metrics": { - "total_prompt_tokens": 1000, - "total_completion_tokens": 200, - "total_cost_usd": 0.05, - } - } - ) + harbor_dir = self._create_harbor_structure( + tmp_path, [("weighted-gdp-calc__abc123", trial_result)] ) output_file = tmp_path / "output.jsonl" - convert_benchflow_to_eval_output(jobs_dir, output_file) + convert_harbor_to_eval_output(harbor_dir, output_file) assert output_file.exists() with open(output_file) as f: @@ -238,52 +151,26 @@ def test_successful_trial_parsing(self, tmp_path: Path) -> None: assert entries[0]["instance_id"] == "benchflow/weighted-gdp-calc" assert entries[0]["test_result"]["passed"] is True assert entries[0]["metrics"]["total_cost_usd"] == 0.05 - assert entries[0]["metrics"]["total_prompt_tokens"] == 1000 - assert entries[0]["metrics"]["total_completion_tokens"] == 200 - - def test_metrics_from_acp_agent_log(self, tmp_path: Path) -> None: - """Test that metrics are extracted from agent/openhands.txt (ACP agent).""" - trial_result = { - "task_name": "benchflow/acp-task", - "rewards": {"reward": 1.0}, - "error": None, - } - jobs_dir = self._create_benchflow_timestamped_job( - tmp_path, [("acp-task", trial_result)] - ) - # Write agent/openhands.txt simulating openhands ACP stdout - trial_dir = jobs_dir / "2026-04-21__23-12-35" / "acp-task__abc123" - agent_dir = trial_dir / "agent" - agent_dir.mkdir(parents=True, exist_ok=True) - (agent_dir / "openhands.txt").write_text( - "OpenHands SDK v1.16.0\n" - "Tokens: ↑ input 404.21K • cache hit 70.47% • reasoning 579 • ↓ output 7.83K • $0.0487\n" - "Total cost: $0.0487\n" - ) - output_file = tmp_path / "output.jsonl" - convert_benchflow_to_eval_output(jobs_dir, output_file) - - with open(output_file) as f: - entries = [json.loads(line) for line in f] - - assert len(entries) == 1 - assert entries[0]["metrics"]["total_cost_usd"] == pytest.approx(0.0487) - assert entries[0]["metrics"]["total_prompt_tokens"] == 404210 - assert entries[0]["metrics"]["total_completion_tokens"] == 7830 def test_failed_trial(self, tmp_path: Path) -> None: """Test parsing of a trial with reward 0.""" trial_result = { "task_name": "benchflow/task-1", - "rewards": {"reward": 0.0}, - "error": None, + "trial_name": "task-1__xyz", + "agent_result": { + "n_input_tokens": None, + "n_output_tokens": None, + "cost_usd": None, + }, + "verifier_result": {"rewards": {"reward": 0.0}}, + "exception_info": None, } - jobs_dir = self._create_benchflow_structure( - tmp_path, [("task-1", trial_result)] + harbor_dir = self._create_harbor_structure( + tmp_path, [("task-1__xyz", trial_result)] ) 
output_file = tmp_path / "output.jsonl" - convert_benchflow_to_eval_output(jobs_dir, output_file) + convert_harbor_to_eval_output(harbor_dir, output_file) with open(output_file) as f: entries = [json.loads(line) for line in f] @@ -291,19 +178,21 @@ def test_failed_trial(self, tmp_path: Path) -> None: assert entries[0]["test_result"]["passed"] is False assert entries[0]["metrics"]["total_cost_usd"] == 0.0 - def test_trial_with_error(self, tmp_path: Path) -> None: - """Test that errored trials are written as error entries.""" + def test_trial_with_exception(self, tmp_path: Path) -> None: + """Test that exception trials are written as error entries.""" trial_result = { "task_name": "benchflow/error-task", - "rewards": {}, - "error": "LLM_API_KEY not set", + "trial_name": "error-task__err", + "agent_result": {}, + "verifier_result": {}, + "exception_info": {"type": "ValueError", "message": "LLM_API_KEY not set"}, } - jobs_dir = self._create_benchflow_structure( - tmp_path, [("error-task", trial_result)] + harbor_dir = self._create_harbor_structure( + tmp_path, [("error-task__err", trial_result)] ) output_file = tmp_path / "output.jsonl" - convert_benchflow_to_eval_output(jobs_dir, output_file) + convert_harbor_to_eval_output(harbor_dir, output_file) with open(output_file) as f: entries = [json.loads(line) for line in f] @@ -313,121 +202,20 @@ def test_trial_with_error(self, tmp_path: Path) -> None: assert entries[0]["error"] is not None assert entries[0]["test_result"] == {} - def test_missing_jobs_directory(self, tmp_path: Path) -> None: - """Test handling when jobs directory is empty.""" - jobs_dir = tmp_path / "jobs" - jobs_dir.mkdir() - - with pytest.raises(RuntimeError, match="No task directories found"): - convert_benchflow_to_eval_output(jobs_dir, tmp_path / "output.jsonl") - - def test_task_id_filtering(self, tmp_path: Path) -> None: - """Test that only specified task IDs are converted.""" - trials = [ - ( - "task-a", - { - "task_name": "benchflow/task-a", - "rewards": {"reward": 1.0}, - "error": None, - }, - ), - ( - "task-b", - { - "task_name": "benchflow/task-b", - "rewards": {"reward": 0.0}, - "error": None, - }, - ), - ] - jobs_dir = self._create_benchflow_structure(tmp_path, trials) - output_file = tmp_path / "output.jsonl" + def test_missing_job_directory(self, tmp_path: Path) -> None: + """Test handling when no job directory exists.""" + harbor_dir = tmp_path / "harbor_output" + harbor_dir.mkdir() - convert_benchflow_to_eval_output( - jobs_dir, output_file, task_ids=["benchflow/task-a"] - ) + with pytest.raises(RuntimeError, match="No harbor job directory found"): + convert_harbor_to_eval_output(harbor_dir, tmp_path / "output.jsonl") - with open(output_file) as f: - entries = [json.loads(line) for line in f] - - assert len(entries) == 1 - assert entries[0]["instance_id"] == "benchflow/task-a" - - def test_task_name_normalised_to_benchflow_prefix(self, tmp_path: Path) -> None: - """Test that task names without prefix get benchflow/ prepended.""" - trial_result = { - "task_name": "weighted-gdp-calc", # no benchflow/ prefix - "rewards": {"reward": 1.0}, - "error": None, - } - jobs_dir = self._create_benchflow_structure( - tmp_path, [("weighted-gdp-calc", trial_result)] - ) - output_file = tmp_path / "output.jsonl" - convert_benchflow_to_eval_output(jobs_dir, output_file) - - with open(output_file) as f: - entries = [json.loads(line) for line in f] - - assert entries[0]["instance_id"] == "benchflow/weighted-gdp-calc" - - def test_timestamped_job_directory_is_processed(self, 
tmp_path: Path) -> None: - """Test benchflow 0.3.0 timestamped jobs directory layout.""" - trial_result = { - "task_name": "weighted-gdp-calc", - "rewards": {"reward": 1.0}, - "error": None, - "n_input_tokens": 42, - "n_output_tokens": 7, - "cost_usd": 0.01, - } - - jobs_dir = self._create_benchflow_timestamped_job( - tmp_path, [("weighted-gdp-calc", trial_result)] - ) - output_file = tmp_path / "output.jsonl" - - convert_benchflow_to_eval_output(jobs_dir, output_file) - - with open(output_file) as f: - entries = [json.loads(line) for line in f] - - assert len(entries) == 1 - assert entries[0]["instance_id"] == "benchflow/weighted-gdp-calc" - assert entries[0]["test_result"]["passed"] is True - - def test_task_id_filter_matches_timestamped_trial_dir(self, tmp_path: Path) -> None: - """Test filtering strips the run suffix from trial directory names.""" - jobs_dir = self._create_benchflow_timestamped_job( - tmp_path, - [ - ( - "task-a", - { - "task_name": "task-a", - "rewards": {"reward": 1.0}, - "error": None, - }, - ), - ( - "task-b", - { - "task_name": "task-b", - "rewards": {"reward": 0.0}, - "error": None, - }, - ), - ], - ) - output_file = tmp_path / "output.jsonl" - - convert_benchflow_to_eval_output( - jobs_dir, output_file, task_ids=["benchflow/task-a"] - ) - - with open(output_file) as f: - entries = [json.loads(line) for line in f] + def test_empty_job_directory(self, tmp_path: Path) -> None: + """Test handling of harbor job dir with no trial subdirs.""" + harbor_dir = tmp_path / "harbor_output" + job_dir = harbor_dir / "2026-01-01__00-00-00" + job_dir.mkdir(parents=True) + (job_dir / "result.json").write_text(json.dumps({"id": "test"})) - assert len(entries) == 1 - assert entries[0]["instance_id"] == "benchflow/task-a" + with pytest.raises(RuntimeError, match="No trial result files found"): + convert_harbor_to_eval_output(harbor_dir, tmp_path / "output.jsonl") diff --git a/uv.lock b/uv.lock index 147abedc9..2cd0b3640 100644 --- a/uv.lock +++ b/uv.lock @@ -1282,7 +1282,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/44/69/9b804adb5fd0671f367781560eb5eb586c4d495277c93bde4307b9e28068/greenlet-3.2.4-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:3b67ca49f54cede0186854a008109d6ee71f66bd57bb36abd6d0a0267b540cdd", size = 274079, upload-time = "2025-08-07T13:15:45.033Z" }, { url = "https://files.pythonhosted.org/packages/46/e9/d2a80c99f19a153eff70bc451ab78615583b8dac0754cfb942223d2c1a0d/greenlet-3.2.4-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ddf9164e7a5b08e9d22511526865780a576f19ddd00d62f8a665949327fde8bb", size = 640997, upload-time = "2025-08-07T13:42:56.234Z" }, { url = "https://files.pythonhosted.org/packages/3b/16/035dcfcc48715ccd345f3a93183267167cdd162ad123cd93067d86f27ce4/greenlet-3.2.4-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:f28588772bb5fb869a8eb331374ec06f24a83a9c25bfa1f38b6993afe9c1e968", size = 655185, upload-time = "2025-08-07T13:45:27.624Z" }, - { url = "https://files.pythonhosted.org/packages/31/da/0386695eef69ffae1ad726881571dfe28b41970173947e7c558d9998de0f/greenlet-3.2.4-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:5c9320971821a7cb77cfab8d956fa8e39cd07ca44b6070db358ceb7f8797c8c9", size = 649926, upload-time = "2025-08-07T13:53:15.251Z" }, { url = "https://files.pythonhosted.org/packages/68/88/69bf19fd4dc19981928ceacbc5fd4bb6bc2215d53199e367832e98d1d8fe/greenlet-3.2.4-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = 
"sha256:c60a6d84229b271d44b70fb6e5fa23781abb5d742af7b808ae3f6efd7c9c60f6", size = 651839, upload-time = "2025-08-07T13:18:30.281Z" }, { url = "https://files.pythonhosted.org/packages/19/0d/6660d55f7373b2ff8152401a83e02084956da23ae58cddbfb0b330978fe9/greenlet-3.2.4-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3b3812d8d0c9579967815af437d96623f45c0f2ae5f04e366de62a12d83a8fb0", size = 607586, upload-time = "2025-08-07T13:18:28.544Z" }, { url = "https://files.pythonhosted.org/packages/8e/1a/c953fdedd22d81ee4629afbb38d2f9d71e37d23caace44775a3a969147d4/greenlet-3.2.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:abbf57b5a870d30c4675928c37278493044d7c14378350b3aa5d484fa65575f0", size = 1123281, upload-time = "2025-08-07T13:42:39.858Z" }, @@ -1293,7 +1292,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/49/e8/58c7f85958bda41dafea50497cbd59738c5c43dbbea5ee83d651234398f4/greenlet-3.2.4-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:1a921e542453fe531144e91e1feedf12e07351b1cf6c9e8a3325ea600a715a31", size = 272814, upload-time = "2025-08-07T13:15:50.011Z" }, { url = "https://files.pythonhosted.org/packages/62/dd/b9f59862e9e257a16e4e610480cfffd29e3fae018a68c2332090b53aac3d/greenlet-3.2.4-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cd3c8e693bff0fff6ba55f140bf390fa92c994083f838fece0f63be121334945", size = 641073, upload-time = "2025-08-07T13:42:57.23Z" }, { url = "https://files.pythonhosted.org/packages/f7/0b/bc13f787394920b23073ca3b6c4a7a21396301ed75a655bcb47196b50e6e/greenlet-3.2.4-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:710638eb93b1fa52823aa91bf75326f9ecdfd5e0466f00789246a5280f4ba0fc", size = 655191, upload-time = "2025-08-07T13:45:29.752Z" }, - { url = "https://files.pythonhosted.org/packages/f2/d6/6adde57d1345a8d0f14d31e4ab9c23cfe8e2cd39c3baf7674b4b0338d266/greenlet-3.2.4-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:c5111ccdc9c88f423426df3fd1811bfc40ed66264d35aa373420a34377efc98a", size = 649516, upload-time = "2025-08-07T13:53:16.314Z" }, { url = "https://files.pythonhosted.org/packages/7f/3b/3a3328a788d4a473889a2d403199932be55b1b0060f4ddd96ee7cdfcad10/greenlet-3.2.4-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:d76383238584e9711e20ebe14db6c88ddcedc1829a9ad31a584389463b5aa504", size = 652169, upload-time = "2025-08-07T13:18:32.861Z" }, { url = "https://files.pythonhosted.org/packages/ee/43/3cecdc0349359e1a527cbf2e3e28e5f8f06d3343aaf82ca13437a9aa290f/greenlet-3.2.4-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:23768528f2911bcd7e475210822ffb5254ed10d71f4028387e5a99b4c6699671", size = 610497, upload-time = "2025-08-07T13:18:31.636Z" }, { url = "https://files.pythonhosted.org/packages/b8/19/06b6cf5d604e2c382a6f31cafafd6f33d5dea706f4db7bdab184bad2b21d/greenlet-3.2.4-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:00fadb3fedccc447f517ee0d3fd8fe49eae949e1cd0f6a611818f4f6fb7dc83b", size = 1121662, upload-time = "2025-08-07T13:42:41.117Z" }, @@ -1304,7 +1302,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/22/5c/85273fd7cc388285632b0498dbbab97596e04b154933dfe0f3e68156c68c/greenlet-3.2.4-cp314-cp314-macosx_11_0_universal2.whl", hash = "sha256:49a30d5fda2507ae77be16479bdb62a660fa51b1eb4928b524975b3bde77b3c0", size = 273586, upload-time = "2025-08-07T13:16:08.004Z" }, { url = 
"https://files.pythonhosted.org/packages/d1/75/10aeeaa3da9332c2e761e4c50d4c3556c21113ee3f0afa2cf5769946f7a3/greenlet-3.2.4-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:299fd615cd8fc86267b47597123e3f43ad79c9d8a22bebdce535e53550763e2f", size = 686346, upload-time = "2025-08-07T13:42:59.944Z" }, { url = "https://files.pythonhosted.org/packages/c0/aa/687d6b12ffb505a4447567d1f3abea23bd20e73a5bed63871178e0831b7a/greenlet-3.2.4-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:c17b6b34111ea72fc5a4e4beec9711d2226285f0386ea83477cbb97c30a3f3a5", size = 699218, upload-time = "2025-08-07T13:45:30.969Z" }, - { url = "https://files.pythonhosted.org/packages/dc/8b/29aae55436521f1d6f8ff4e12fb676f3400de7fcf27fccd1d4d17fd8fecd/greenlet-3.2.4-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:b4a1870c51720687af7fa3e7cda6d08d801dae660f75a76f3845b642b4da6ee1", size = 694659, upload-time = "2025-08-07T13:53:17.759Z" }, { url = "https://files.pythonhosted.org/packages/92/2e/ea25914b1ebfde93b6fc4ff46d6864564fba59024e928bdc7de475affc25/greenlet-3.2.4-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:061dc4cf2c34852b052a8620d40f36324554bc192be474b9e9770e8c042fd735", size = 695355, upload-time = "2025-08-07T13:18:34.517Z" }, { url = "https://files.pythonhosted.org/packages/72/60/fc56c62046ec17f6b0d3060564562c64c862948c9d4bc8aa807cf5bd74f4/greenlet-3.2.4-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:44358b9bf66c8576a9f57a590d5f5d6e72fa4228b763d0e43fee6d3b06d3a337", size = 657512, upload-time = "2025-08-07T13:18:33.969Z" }, { url = "https://files.pythonhosted.org/packages/23/6e/74407aed965a4ab6ddd93a7ded3180b730d281c77b765788419484cdfeef/greenlet-3.2.4-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:2917bdf657f5859fbf3386b12d68ede4cf1f04c90c3a6bc1f013dd68a22e2269", size = 1612508, upload-time = "2025-11-04T12:42:23.427Z" }, @@ -1519,6 +1516,11 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/2a/39/e50c7c3a983047577ee07d2a9e53faf5a69493943ec3f6a384bdc792deb2/httpx-0.28.1-py3-none-any.whl", hash = "sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad", size = 73517, upload-time = "2024-12-06T15:37:21.509Z" }, ] +[package.optional-dependencies] +socks = [ + { name = "socksio" }, +] + [[package]] name = "httpx-sse" version = "0.4.2" @@ -1814,14 +1816,12 @@ wheels = [ [[package]] name = "litellm" -version = "1.80.10" +version = "1.83.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "aiohttp" }, { name = "click" }, { name = "fastuuid" }, - { name = "grpcio", version = "1.67.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14'" }, - { name = "grpcio", version = "1.76.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.14'" }, { name = "httpx" }, { name = "importlib-metadata" }, { name = "jinja2" }, @@ -1832,9 +1832,9 @@ dependencies = [ { name = "tiktoken" }, { name = "tokenizers" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/dd/44/0aaa7449e7c4aa05668ec03f1f68a01b1e476591071d9659a68db19371a2/litellm-1.80.10.tar.gz", hash = "sha256:4a4aff7558945c2f7e5c6523e67c1b5525a46b10b0e1ad6b8f847cb13b16779e", size = 12764777, upload-time = "2025-12-14T02:07:05.362Z" } +sdist = { url = "https://files.pythonhosted.org/packages/22/92/6ce9737554994ca8e536e5f4f6a87cc7c4774b656c9eb9add071caf7d54b/litellm-1.83.0.tar.gz", hash = 
"sha256:860bebc76c4bb27b4cf90b4a77acd66dba25aced37e3db98750de8a1766bfb7a", size = 17333062, upload-time = "2026-03-31T05:08:25.331Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/8c/a9/4814b6aa58f6705df2831eaadeb5bc8240684c8c9d5964245212f85049d1/litellm-1.80.10-py3-none-any.whl", hash = "sha256:9b3e561efaba0eb1291cb1555d3dcb7283cf7f3cb65aadbcdb42e2a8765898c8", size = 11264240, upload-time = "2025-12-14T02:07:02.414Z" }, + { url = "https://files.pythonhosted.org/packages/19/2c/a670cc050fcd6f45c6199eb99e259c73aea92edba8d5c2fc1b3686d36217/litellm-1.83.0-py3-none-any.whl", hash = "sha256:88c536d339248f3987571493015784671ba3f193a328e1ea6780dbebaa2094a8", size = 15610306, upload-time = "2026-03-31T05:08:21.987Z" }, ] [[package]] @@ -2402,7 +2402,7 @@ wheels = [ [[package]] name = "openhands-agent-server" -version = "1.16.0" +version = "1.16.1" source = { editable = "vendor/software-agent-sdk/openhands-agent-server" } dependencies = [ { name = "aiosqlite" }, @@ -2467,7 +2467,6 @@ dependencies = [ { name = "python-json-logger" }, { name = "requests" }, { name = "swebench" }, - { name = "swesmith" }, { name = "swt-bench" }, { name = "tenacity" }, { name = "toml" }, @@ -2522,7 +2521,6 @@ requires-dist = [ { name = "python-json-logger", specifier = ">=3.3.0" }, { name = "requests" }, { name = "swebench", specifier = "==4.1.0" }, - { name = "swesmith", specifier = ">=0.0.9" }, { name = "swt-bench", git = "https://github.com/logic-star-ai/swt-bench.git?rev=5fdcd446ff05e248ecfffc19d560a210699f71f8" }, { name = "tenacity", specifier = ">=9.1.2" }, { name = "toml" }, @@ -2546,7 +2544,7 @@ dev = [ [[package]] name = "openhands-sdk" -version = "1.16.0" +version = "1.16.1" source = { editable = "vendor/software-agent-sdk/openhands-sdk" } dependencies = [ { name = "agent-client-protocol" }, @@ -2554,7 +2552,7 @@ dependencies = [ { name = "fakeredis", extra = ["lua"] }, { name = "fastmcp" }, { name = "filelock" }, - { name = "httpx" }, + { name = "httpx", extra = ["socks"] }, { name = "litellm" }, { name = "lmnr" }, { name = "pydantic" }, @@ -2577,8 +2575,8 @@ requires-dist = [ { name = "fakeredis", extras = ["lua"], specifier = ">=2.32.1" }, { name = "fastmcp", specifier = ">=3.0.0" }, { name = "filelock", specifier = ">=3.20.1" }, - { name = "httpx", specifier = ">=0.27.0" }, - { name = "litellm", specifier = "==1.80.10" }, + { name = "httpx", extras = ["socks"], specifier = ">=0.27.0" }, + { name = "litellm", specifier = ">=1.82.6,!=1.82.7,!=1.82.8" }, { name = "lmnr", specifier = ">=0.7.24" }, { name = "pydantic", specifier = ">=2.12.5" }, { name = "python-frontmatter", specifier = ">=1.1.0" }, @@ -2590,7 +2588,7 @@ provides-extras = ["boto3"] [[package]] name = "openhands-tools" -version = "1.16.0" +version = "1.16.1" source = { editable = "vendor/software-agent-sdk/openhands-tools" } dependencies = [ { name = "bashlex" }, @@ -2619,7 +2617,7 @@ requires-dist = [ [[package]] name = "openhands-workspace" -version = "1.16.0" +version = "1.16.1" source = { editable = "vendor/software-agent-sdk/openhands-workspace" } dependencies = [ { name = "openhands-agent-server" }, @@ -6709,6 +6707,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e9/44/75a9c9421471a6c4805dbf2356f7c181a29c1879239abab1ea2cc8f38b40/sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2", size = 10235, upload-time = "2024-02-25T23:20:01.196Z" }, ] +[[package]] +name = "socksio" +version = "1.0.0" +source = { registry = "https://pypi.org/simple" } 
+sdist = { url = "https://files.pythonhosted.org/packages/f8/5c/48a7d9495be3d1c651198fd99dbb6ce190e2274d0f28b9051307bdec6b85/socksio-1.0.0.tar.gz", hash = "sha256:f88beb3da5b5c38b9890469de67d0cb0f9d494b78b106ca1845f96c10b91c4ac", size = 19055, upload-time = "2020-04-17T15:50:34.664Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/37/c3/6eeb6034408dac0fa653d126c9204ade96b819c936e136c5e8a6897eee9c/socksio-1.0.0-py3-none-any.whl", hash = "sha256:95dc1f15f9b34e8d7b16f06d74b8ccf48f609af32ab33c608d08761c5dcbb1f3", size = 12763, upload-time = "2020-04-17T15:50:31.878Z" }, +] + [[package]] name = "sortedcontainers" version = "2.4.0" @@ -6834,15 +6841,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/36/67/981d8b642ac3eac7c8a7b7832ff8b2fb74f96b28b5fcd9a8979879e5c46d/swebench-4.1.0-py3-none-any.whl", hash = "sha256:1243776f720047cc9e20a427f7a52b75c13a07abda6154fb60fe77f82ec8af57", size = 157231, upload-time = "2025-09-11T02:57:58.953Z" }, ] -[[package]] -name = "swesmith" -version = "0.0.9" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/07/97/e506b20fa59debc66e4660a86b0e98b45d32c87f23b994ad739e9c5d542a/swesmith-0.0.9.tar.gz", hash = "sha256:1726124ea43577853c6efb0a5a0db5fa3ce5c340e1bed479afa5bab85d8a69da", size = 214830, upload-time = "2026-02-27T01:06:13.455Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/80/2d/71b6ac5dadbe7199085de3815624775744d51b6c554efeeddfb12dc45ce1/swesmith-0.0.9-py3-none-any.whl", hash = "sha256:cbb98a52fc573b38032cde1179b6ce5f5862ce7c31d6931cfd5b8ad4969ce900", size = 275800, upload-time = "2026-02-27T01:06:11.864Z" }, -] - [[package]] name = "swt-bench" version = "1.0.1" diff --git a/vendor/software-agent-sdk b/vendor/software-agent-sdk index acd5adc96..3e0a3a091 160000 --- a/vendor/software-agent-sdk +++ b/vendor/software-agent-sdk @@ -1 +1 @@ -Subproject commit acd5adc965c08a0f815cf8e5f3166d1d090034d6 +Subproject commit 3e0a3a0915b369c7e2057c77722e98585855d30a From 908e8519d7156b416dc732ca33420082f7c308ff Mon Sep 17 00:00:00 2001 From: Chujun Tao Date: Thu, 23 Apr 2026 16:56:57 -0400 Subject: [PATCH 10/12] Update skillsbench dataset handling Co-authored-by: openhands --- benchmarks/skillsbench/README.md | 16 +- benchmarks/skillsbench/run_infer.py | 351 +++++++++++++++++++++++++--- tests/test_skillsbench_run_infer.py | 231 +++++++++++++++++- 3 files changed, 558 insertions(+), 40 deletions(-) diff --git a/benchmarks/skillsbench/README.md b/benchmarks/skillsbench/README.md index 60ff73652..c2c11928b 100644 --- a/benchmarks/skillsbench/README.md +++ b/benchmarks/skillsbench/README.md @@ -21,11 +21,12 @@ SkillsBench comprises tasks across 11 domains, evaluating the efficacy of Skills ## Prerequisites 1. **Install Harbor**: Harbor is the official harness for running SkillsBench. + This integration is currently validated against `harbor==0.1.33`. ```bash - pip install harbor + pip install harbor==0.1.33 # or - uv pip install harbor + uv pip install harbor==0.1.33 ``` 2. **Docker**: Harbor requires Docker to be installed and running. @@ -34,12 +35,18 @@ SkillsBench comprises tasks across 11 domains, evaluating the efficacy of Skills ## Usage +By default, `skillsbench-infer` keeps a local copy of `tasks/` from +`https://github.com/benchflow-ai/skillsbench` on the `main` branch under +`benchmarks/skillsbench/data/tasks`. 
It stores the synced upstream commit hash in +`benchmarks/skillsbench/data/source.json` and refreshes the local snapshot when the +upstream `main` commit changes. The only supported dataset sources are this synced +SkillsBench snapshot and Harbor registry ids matching `benchflow/skillsbench@...`. + ### Running Inference Run the SkillsBench evaluation using the OpenHands SDK agent: ```bash -# Run full evaluation uv run skillsbench-infer .llm_config/claude.json # Run specific tasks @@ -53,6 +60,9 @@ uv run skillsbench-infer .llm_config/claude.json --n-limit 5 # Run with multiple workers uv run skillsbench-infer .llm_config/claude.json --num-workers 4 + +# Run against a Harbor registry dataset instead of the synced GitHub tasks +uv run skillsbench-infer .llm_config/claude.json --dataset benchflow/skillsbench@1.0 ``` ### LLM Configuration diff --git a/benchmarks/skillsbench/run_infer.py b/benchmarks/skillsbench/run_infer.py index a8afa7281..c8e06eee5 100644 --- a/benchmarks/skillsbench/run_infer.py +++ b/benchmarks/skillsbench/run_infer.py @@ -11,8 +11,11 @@ import argparse import json import os +import re +import shutil import subprocess import sys +import tempfile from datetime import datetime, timezone from pathlib import Path @@ -29,13 +32,21 @@ # Output filename for results OUTPUT_FILENAME = "output.jsonl" +SKILLSBENCH_REPO_URL = "https://github.com/benchflow-ai/skillsbench.git" +SKILLSBENCH_REPO_BRANCH = "main" +DATASET_CACHE_DIR = Path(__file__).parent / "data" +TASKS_CACHE_DIR = DATASET_CACHE_DIR / "tasks" +TASKS_METADATA_PATH = DATASET_CACHE_DIR / "source.json" +REGISTRY_DATASET_PREFIX = "benchflow/skillsbench" +INSTANCE_ID_PREFIX = "benchflow" + def check_harbor_installed() -> bool: """Check if harbor CLI is installed and available.""" harbor_exe = HARBOR_DEFAULTS["harbor_executable"] try: result = subprocess.run( - [harbor_exe, "--version"], + [harbor_exe, "--help"], capture_output=True, text=True, timeout=10, @@ -45,9 +56,252 @@ def check_harbor_installed() -> bool: return False +def _run_command(cmd: list[str], error_message: str) -> str: + """Run a subprocess command and return stdout.""" + result = subprocess.run( + cmd, + capture_output=True, + text=True, + ) + if result.returncode != 0: + stderr = result.stderr.strip() or result.stdout.strip() + raise RuntimeError(f"{error_message}: {stderr}") + return result.stdout.strip() + + +def _get_supported_task_filter_flag(harbor_exe: str) -> str: + """Detect whether Harbor expects --task-name or --include-task-name.""" + try: + result = subprocess.run( + [harbor_exe, "run", "--help"], + capture_output=True, + text=True, + ) + except FileNotFoundError: + return "--include-task-name" + + help_text = f"{result.stdout}\n{result.stderr}" + supported_flags = set(re.findall(r"(? 
str: + """Detect whether Harbor exposes the OpenHands agent as openhands or openhands-sdk.""" + try: + result = subprocess.run( + [harbor_exe, "run", "--help"], + capture_output=True, + text=True, + ) + except FileNotFoundError: + return HARBOR_DEFAULTS["agent_name"] + + help_text = f"{result.stdout}\n{result.stderr}" + compact_help_text = re.sub(r"[^a-z0-9-]+", "", help_text.lower()) + if "openhands-sdk" in compact_help_text: + return "openhands-sdk" + if "openhands" in compact_help_text: + return "openhands" + return HARBOR_DEFAULTS["agent_name"] + + +def get_skillsbench_main_commit( + repo_url: str = SKILLSBENCH_REPO_URL, + branch: str = SKILLSBENCH_REPO_BRANCH, +) -> str: + """Resolve the latest commit hash for the upstream SkillsBench branch.""" + stdout = _run_command( + ["git", "ls-remote", repo_url, f"refs/heads/{branch}"], + "Failed to resolve SkillsBench upstream commit", + ) + commit_hash, _, ref = stdout.partition("\t") + if not commit_hash or ref != f"refs/heads/{branch}": + raise RuntimeError( + f"Unexpected git ls-remote output for {repo_url} {branch}: {stdout}" + ) + return commit_hash + + +def _load_cached_commit(metadata_path: Path = TASKS_METADATA_PATH) -> str | None: + """Load the cached upstream commit hash for the local task snapshot.""" + if not metadata_path.is_file(): + return None + + try: + with open(metadata_path, encoding="utf-8") as f: + metadata = json.load(f) + except (OSError, json.JSONDecodeError) as e: + logger.warning( + "Ignoring unreadable SkillsBench dataset metadata at %s: %s", + metadata_path, + e, + ) + return None + + commit_hash = metadata.get("commit_hash") + return commit_hash if isinstance(commit_hash, str) and commit_hash else None + + +def download_skillsbench_tasks( + commit_hash: str, + tasks_dir: Path = TASKS_CACHE_DIR, + metadata_path: Path = TASKS_METADATA_PATH, + repo_url: str = SKILLSBENCH_REPO_URL, + branch: str = SKILLSBENCH_REPO_BRANCH, +) -> None: + """Download only the SkillsBench tasks directory for a specific commit.""" + data_dir = tasks_dir.parent + data_dir.mkdir(parents=True, exist_ok=True) + + logger.info( + "Downloading SkillsBench tasks from %s@%s into %s", + repo_url, + commit_hash, + tasks_dir, + ) + + with tempfile.TemporaryDirectory(dir=data_dir) as temp_dir: + clone_dir = Path(temp_dir) / "skillsbench" + _run_command( + [ + "git", + "clone", + "--depth", + "1", + "--branch", + branch, + "--filter=blob:none", + "--sparse", + repo_url, + str(clone_dir), + ], + "Failed to clone SkillsBench repository", + ) + _run_command( + ["git", "-C", str(clone_dir), "sparse-checkout", "set", "tasks"], + "Failed to sparsely checkout SkillsBench tasks", + ) + checked_out_commit = _run_command( + ["git", "-C", str(clone_dir), "rev-parse", "HEAD"], + "Failed to read cloned SkillsBench commit", + ) + if checked_out_commit != commit_hash: + raise RuntimeError( + "Cloned SkillsBench commit does not match upstream HEAD: " + f"expected {commit_hash}, got {checked_out_commit}" + ) + + source_tasks_dir = clone_dir / "tasks" + if not source_tasks_dir.is_dir(): + raise RuntimeError( + f"SkillsBench clone at {clone_dir} does not contain a tasks/ directory" + ) + + if tasks_dir.exists(): + shutil.rmtree(tasks_dir) + shutil.copytree(source_tasks_dir, tasks_dir) + + metadata = { + "repo_url": repo_url, + "branch": branch, + "commit_hash": commit_hash, + "synced_at": datetime.now(timezone.utc).isoformat(), + } + with open(metadata_path, "w", encoding="utf-8") as f: + json.dump(metadata, f, indent=2) + + +def ensure_skillsbench_tasks( + tasks_dir: 
Path = TASKS_CACHE_DIR, + metadata_path: Path = TASKS_METADATA_PATH, + repo_url: str = SKILLSBENCH_REPO_URL, + branch: str = SKILLSBENCH_REPO_BRANCH, +) -> Path: + """Ensure a local SkillsBench task snapshot exists and matches upstream HEAD.""" + cached_commit = _load_cached_commit(metadata_path) + has_cached_tasks = tasks_dir.is_dir() and any(tasks_dir.iterdir()) + + try: + upstream_commit = get_skillsbench_main_commit(repo_url=repo_url, branch=branch) + except RuntimeError as e: + if has_cached_tasks and cached_commit: + logger.warning( + "Failed to check SkillsBench upstream HEAD; using cached tasks from " + "%s (%s): %s", + tasks_dir, + cached_commit, + e, + ) + return tasks_dir + raise + + if has_cached_tasks and cached_commit == upstream_commit: + logger.info( + "Using cached SkillsBench tasks at %s (commit %s)", + tasks_dir, + upstream_commit, + ) + return tasks_dir + + if has_cached_tasks: + logger.info( + "Refreshing SkillsBench tasks in %s from commit %s to %s", + tasks_dir, + cached_commit or "", + upstream_commit, + ) + else: + logger.info("No cached SkillsBench tasks found at %s; downloading", tasks_dir) + + download_skillsbench_tasks( + commit_hash=upstream_commit, + tasks_dir=tasks_dir, + metadata_path=metadata_path, + repo_url=repo_url, + branch=branch, + ) + return tasks_dir + + +def resolve_skillsbench_dataset(dataset: str) -> tuple[str, bool]: + """Resolve the dataset argument to a synced local snapshot or registry id.""" + if dataset == INFER_DEFAULTS["dataset"]: + local_tasks_dir = ensure_skillsbench_tasks() + return str(local_tasks_dir.resolve()), True + if dataset == REGISTRY_DATASET_PREFIX or dataset.startswith( + f"{REGISTRY_DATASET_PREFIX}@" + ): + return dataset, False + raise ValueError( + "Unsupported SkillsBench dataset source. Use the default synced " + "SkillsBench snapshot or a Harbor registry id matching " + "'benchflow/skillsbench@'." + ) + + +def _normalize_task_filter_value(task_id: str, *, dataset_is_path: bool) -> str: + """Normalize task filter values for Harbor's local-path dataset handling.""" + if dataset_is_path: + return task_id.rsplit("/", 1)[-1] + return task_id + + +def _canonicalize_instance_id(task_name: str) -> str: + """Normalize SkillsBench task names to stable benchflow/ ids.""" + if "/" in task_name: + return task_name + return f"{INSTANCE_ID_PREFIX}/{task_name}" + + def run_harbor_evaluation( llm: LLM, dataset: str, + *, + dataset_is_path: bool, output_dir: str, num_workers: int = 1, task_ids: list[str] | None = None, @@ -57,7 +311,8 @@ def run_harbor_evaluation( Args: llm: LLM configuration for the agent. - dataset: Harbor dataset name (e.g., benchflow/skillsbench). + dataset: Synced SkillsBench task snapshot path or Harbor registry id. + dataset_is_path: Whether ``dataset`` should be passed via ``--path``. output_dir: Directory to store output files. num_workers: Number of parallel workers. task_ids: Optional list of specific task IDs to run. @@ -69,16 +324,18 @@ def run_harbor_evaluation( harbor_output_dir = Path(output_dir) / "harbor_output" harbor_output_dir.mkdir(parents=True, exist_ok=True) harbor_exe = HARBOR_DEFAULTS["harbor_executable"] + agent_name = _get_supported_agent_name(harbor_exe) + task_filter_flag = _get_supported_task_filter_flag(harbor_exe) # Build harbor command using harbor CLI flags. # Use absolute path for --jobs-dir to avoid CWD-relative path issues. 
cmd = [ harbor_exe, "run", - "-d", + "--path" if dataset_is_path else "-d", dataset, "-a", - HARBOR_DEFAULTS["agent_name"], + agent_name, "-m", llm.model, "--jobs-dir", @@ -87,21 +344,17 @@ def run_harbor_evaluation( str(num_workers), ] - # Pass LLM credentials as agent environment variables - if llm.api_key: - api_key = ( - llm.api_key.get_secret_value() - if isinstance(llm.api_key, SecretStr) - else llm.api_key - ) - cmd.extend(["--ae", f"LLM_API_KEY={api_key}"]) - if llm.base_url: - cmd.extend(["--ae", f"LLM_BASE_URL={llm.base_url}"]) - # Add specific task names if provided if task_ids: for task_id in task_ids: - cmd.extend(["--include-task-name", task_id]) + cmd.extend( + [ + task_filter_flag, + _normalize_task_filter_value( + task_id, dataset_is_path=dataset_is_path + ), + ] + ) if n_limit is not None: cmd.extend(["--n-tasks", str(n_limit)]) @@ -131,10 +384,31 @@ def run_harbor_evaluation( ) if result.returncode != 0: - logger.error(f"Harbor command failed with code {result.returncode}") - logger.error(f"stdout: {result.stdout}") - logger.error(f"stderr: {result.stderr}") - raise RuntimeError(f"Harbor evaluation failed: {result.stderr}") + if ( + task_ids + and task_filter_flag == "--task-name" + and "No such option: --task-name" in result.stderr + ): + fallback_cmd = [ + "--include-task-name" if part == "--task-name" else part + for part in cmd + ] + logger.warning( + "Harbor does not support --task-name; retrying with " + "--include-task-name" + ) + result = subprocess.run( + fallback_cmd, + capture_output=True, + text=True, + env=env, + ) + + if result.returncode != 0: + logger.error(f"Harbor command failed with code {result.returncode}") + logger.error(f"stdout: {result.stdout}") + logger.error(f"stderr: {result.stderr}") + raise RuntimeError(f"Harbor evaluation failed: {result.stderr}") logger.info("Harbor evaluation completed successfully") logger.info(f"stdout: {result.stdout}") @@ -207,7 +481,9 @@ def convert_harbor_to_eval_output( with open(result_file) as f: trial = json.load(f) - instance_id = trial.get("task_name", result_file.parent.name) + instance_id = _canonicalize_instance_id( + trial.get("task_name", result_file.parent.name) + ) # Check for exceptions if trial.get("exception_info"): @@ -256,7 +532,7 @@ def convert_harbor_to_eval_output( logger.error(f"Failed to process result file {result_file}: {e}") errors.append( { - "instance_id": result_file.parent.name, + "instance_id": _canonicalize_instance_id(result_file.parent.name), "error": str(e), "test_result": {}, } @@ -302,13 +578,14 @@ def main() -> None: formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" Examples: - # Run full skillsbench evaluation + # Run full skillsbench evaluation using a local tasks/ snapshot synced from + # https://github.com/benchflow-ai/skillsbench main uv run skillsbench-infer .llm_config/claude.json # Run specific tasks uv run skillsbench-infer .llm_config/claude.json --select tasks.txt - # Run with custom dataset version + # Run against a Harbor registry dataset instead of the synced GitHub tasks uv run skillsbench-infer .llm_config/claude.json --dataset benchflow/skillsbench@1.0 """, ) @@ -322,7 +599,11 @@ def main() -> None: "--dataset", type=str, default=INFER_DEFAULTS["dataset"], - help="Harbor dataset name (e.g., benchflow/skillsbench)", + help=( + "SkillsBench dataset source. The default value syncs tasks/ from the " + "benchflow-ai/skillsbench main branch. You can also pass a Harbor " + "registry id like benchflow/skillsbench@1.0." 
+ ), ) parser.add_argument( "--output-dir", @@ -385,6 +666,20 @@ def main() -> None: ) sys.exit(1) + resolved_dataset = args.dataset + dataset_is_path = False + dataset_commit_hash: str | None = None + if not args.skip_harbor: + try: + resolved_dataset, dataset_is_path = resolve_skillsbench_dataset( + args.dataset + ) + except ValueError as e: + logger.error(str(e)) + sys.exit(1) + if dataset_is_path and args.dataset == INFER_DEFAULTS["dataset"]: + dataset_commit_hash = _load_cached_commit() + # Construct output directory dataset_description = args.dataset.replace("/", "__").replace("@", "-") structured_output_dir = construct_eval_output_dir( @@ -402,6 +697,9 @@ def main() -> None: metadata = { "llm": llm.model_dump_json(), "dataset": args.dataset, + "resolved_dataset": resolved_dataset, + "dataset_is_path": dataset_is_path, + "dataset_commit_hash": dataset_commit_hash, "timestamp": datetime.now(timezone.utc).isoformat(), "harbor_agent": HARBOR_DEFAULTS["agent_name"], "note": args.note, @@ -427,7 +725,8 @@ def main() -> None: try: harbor_output_dir = run_harbor_evaluation( llm=llm, - dataset=args.dataset, + dataset=resolved_dataset, + dataset_is_path=dataset_is_path, output_dir=structured_output_dir, num_workers=args.num_workers, task_ids=task_ids, diff --git a/tests/test_skillsbench_run_infer.py b/tests/test_skillsbench_run_infer.py index 5f8452cb3..0632a6a46 100644 --- a/tests/test_skillsbench_run_infer.py +++ b/tests/test_skillsbench_run_infer.py @@ -8,22 +8,135 @@ from benchmarks.skillsbench.config import INFER_DEFAULTS from benchmarks.skillsbench.run_infer import ( convert_harbor_to_eval_output, + ensure_skillsbench_tasks, + resolve_skillsbench_dataset, run_harbor_evaluation, ) from openhands.sdk import LLM +class TestDatasetSync: + """Tests for syncing the local SkillsBench task snapshot.""" + + def test_ensure_skillsbench_tasks_reuses_matching_cache( + self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch + ) -> None: + """Test that an up-to-date cached tasks directory is reused.""" + tasks_dir = tmp_path / "tasks" + tasks_dir.mkdir() + (tasks_dir / "task-a").mkdir() + metadata_path = tmp_path / "source.json" + metadata_path.write_text(json.dumps({"commit_hash": "abc123"})) + + monkeypatch.setattr( + "benchmarks.skillsbench.run_infer.get_skillsbench_main_commit", + lambda repo_url, branch: "abc123", + ) + + called = False + + def fake_download(**kwargs) -> None: + nonlocal called + called = True + + monkeypatch.setattr( + "benchmarks.skillsbench.run_infer.download_skillsbench_tasks", + fake_download, + ) + + resolved = ensure_skillsbench_tasks( + tasks_dir=tasks_dir, + metadata_path=metadata_path, + ) + + assert resolved == tasks_dir + assert called is False + + def test_ensure_skillsbench_tasks_refreshes_stale_cache( + self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch + ) -> None: + """Test that a stale cached commit triggers a redownload.""" + tasks_dir = tmp_path / "tasks" + tasks_dir.mkdir() + metadata_path = tmp_path / "source.json" + metadata_path.write_text(json.dumps({"commit_hash": "old-commit"})) + + monkeypatch.setattr( + "benchmarks.skillsbench.run_infer.get_skillsbench_main_commit", + lambda repo_url, branch: "new-commit", + ) + + captured: dict[str, str] = {} + + def fake_download( + *, + commit_hash: str, + tasks_dir: Path, + metadata_path: Path, + repo_url: str, + branch: str, + ) -> None: + captured["commit_hash"] = commit_hash + captured["tasks_dir"] = str(tasks_dir) + captured["metadata_path"] = str(metadata_path) + tasks_dir.mkdir(exist_ok=True) + + 
monkeypatch.setattr( + "benchmarks.skillsbench.run_infer.download_skillsbench_tasks", + fake_download, + ) + + ensure_skillsbench_tasks( + tasks_dir=tasks_dir, + metadata_path=metadata_path, + ) + + assert captured["commit_hash"] == "new-commit" + assert captured["tasks_dir"] == str(tasks_dir) + assert captured["metadata_path"] == str(metadata_path) + + def test_ensure_skillsbench_tasks_uses_cache_if_remote_check_fails( + self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch + ) -> None: + """Test that a usable cache is kept when the upstream HEAD check fails.""" + tasks_dir = tmp_path / "tasks" + tasks_dir.mkdir() + (tasks_dir / "task-a").mkdir() + metadata_path = tmp_path / "source.json" + metadata_path.write_text(json.dumps({"commit_hash": "cached-commit"})) + + def fake_head(repo_url: str, branch: str) -> str: + raise RuntimeError("network unavailable") + + monkeypatch.setattr( + "benchmarks.skillsbench.run_infer.get_skillsbench_main_commit", + fake_head, + ) + + resolved = ensure_skillsbench_tasks( + tasks_dir=tasks_dir, + metadata_path=metadata_path, + ) + + assert resolved == tasks_dir + + def test_resolve_skillsbench_dataset_preserves_remote_registry_ids(self) -> None: + """Test that explicit Harbor dataset ids are passed through unchanged.""" + resolved_dataset, dataset_is_path = resolve_skillsbench_dataset( + "benchflow/skillsbench@1.0" + ) + + assert resolved_dataset == "benchflow/skillsbench@1.0" + assert dataset_is_path is False + + class TestRunHarborEvaluation: """Tests for building Harbor invocation arguments.""" - def test_default_dataset_matches_harbor_registry(self) -> None: - """Test that the default dataset name matches Harbor's published registry.""" - assert INFER_DEFAULTS["dataset"] == "benchflow/skillsbench" - def test_run_harbor_evaluation_passes_filters_and_limits( self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch ) -> None: - """Test Harbor command includes task filters and n-limit.""" + """Test Harbor command normalizes local task ids and includes main flags.""" captured: dict[str, list[str]] = {} def fake_run(cmd: list[str], capture_output: bool, text: bool, env: dict): @@ -35,6 +148,14 @@ def fake_run(cmd: list[str], capture_output: bool, text: bool, env: dict): )() monkeypatch.setattr("benchmarks.skillsbench.run_infer.subprocess.run", fake_run) + monkeypatch.setattr( + "benchmarks.skillsbench.run_infer._get_supported_task_filter_flag", + lambda harbor_exe: "--include-task-name", + ) + monkeypatch.setattr( + "benchmarks.skillsbench.run_infer._get_supported_agent_name", + lambda harbor_exe: "openhands", + ) harbor_output_dir = run_harbor_evaluation( llm=LLM( @@ -42,7 +163,8 @@ def fake_run(cmd: list[str], capture_output: bool, text: bool, env: dict): api_key="test-key", base_url="https://proxy.example.com", ), - dataset=INFER_DEFAULTS["dataset"], + dataset=str(tmp_path / "tasks"), + dataset_is_path=True, output_dir=str(tmp_path), num_workers=2, task_ids=["benchflow/task-a", "benchflow/task-b"], @@ -56,21 +178,69 @@ def fake_run(cmd: list[str], capture_output: bool, text: bool, env: dict): assert cmd[:8] == [ "harbor", "run", - "-d", - "benchflow/skillsbench", + "--path", + str(tmp_path / "tasks"), "-a", - "openhands-sdk", + "openhands", "-m", "litellm_proxy/test-model", ] assert "--jobs-dir" in cmd assert str(expected_output_dir.resolve()) in cmd assert cmd.count("--include-task-name") == 2 - assert "benchflow/task-a" in cmd - assert "benchflow/task-b" in cmd + assert "task-a" in cmd + assert "task-b" in cmd + assert "benchflow/task-a" not in cmd + assert 
"--ae" not in cmd assert cmd[cmd.index("--n-concurrent") + 1] == "2" assert cmd[cmd.index("--n-tasks") + 1] == "3" + def test_run_harbor_evaluation_retries_with_legacy_task_flag( + self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch + ) -> None: + """Test Harbor falls back to --include-task-name when --task-name fails.""" + captured_cmds: list[list[str]] = [] + + def fake_run(cmd: list[str], capture_output: bool, text: bool, env: dict): + captured_cmds.append(cmd) + if "--task-name" in cmd: + return type( + "Completed", + (), + { + "returncode": 2, + "stdout": "", + "stderr": "No such option: --task-name", + }, + )() + return type( + "Completed", + (), + {"returncode": 0, "stdout": "ok", "stderr": ""}, + )() + + monkeypatch.setattr("benchmarks.skillsbench.run_infer.subprocess.run", fake_run) + monkeypatch.setattr( + "benchmarks.skillsbench.run_infer._get_supported_task_filter_flag", + lambda harbor_exe: "--task-name", + ) + monkeypatch.setattr( + "benchmarks.skillsbench.run_infer._get_supported_agent_name", + lambda harbor_exe: "openhands", + ) + + run_harbor_evaluation( + llm=LLM(model="test-model"), + dataset=str(tmp_path / "tasks"), + dataset_is_path=True, + output_dir=str(tmp_path), + task_ids=["benchflow/task-a"], + ) + + assert len(captured_cmds) == 2 + assert "--task-name" in captured_cmds[0] + assert "--include-task-name" in captured_cmds[1] + def test_llm_credentials_passed_via_env( self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch ) -> None: @@ -87,6 +257,14 @@ def fake_run(cmd: list[str], capture_output: bool, text: bool, env: dict): )() monkeypatch.setattr("benchmarks.skillsbench.run_infer.subprocess.run", fake_run) + monkeypatch.setattr( + "benchmarks.skillsbench.run_infer._get_supported_task_filter_flag", + lambda harbor_exe: "--include-task-name", + ) + monkeypatch.setattr( + "benchmarks.skillsbench.run_infer._get_supported_agent_name", + lambda harbor_exe: "openhands", + ) run_harbor_evaluation( llm=LLM( @@ -95,11 +273,13 @@ def fake_run(cmd: list[str], capture_output: bool, text: bool, env: dict): base_url="https://my-proxy.example.com", ), dataset=INFER_DEFAULTS["dataset"], + dataset_is_path=False, output_dir=str(tmp_path), ) assert captured["env"]["LLM_API_KEY"] == "my-secret-key" assert captured["env"]["LLM_BASE_URL"] == "https://my-proxy.example.com" + assert "--ae" not in captured["cmd"] class TestConvertHarborToEvalOutput: @@ -152,6 +332,35 @@ def test_successful_trial_parsing(self, tmp_path: Path) -> None: assert entries[0]["test_result"]["passed"] is True assert entries[0]["metrics"]["total_cost_usd"] == 0.05 + def test_local_trial_names_are_normalized_to_canonical_instance_ids( + self, tmp_path: Path + ) -> None: + """Test local Harbor task names without namespace keep benchflow ids.""" + trial_result = { + "task_name": "weighted-gdp-calc", + "trial_name": "weighted-gdp-calc__abc123", + "trial_uri": "file:///path/to/trial", + "agent_result": { + "n_input_tokens": 1000, + "n_output_tokens": 200, + "cost_usd": 0.05, + }, + "verifier_result": {"rewards": {"reward": 1.0}}, + "exception_info": None, + } + + harbor_dir = self._create_harbor_structure( + tmp_path, [("weighted-gdp-calc__abc123", trial_result)] + ) + output_file = tmp_path / "output.jsonl" + + convert_harbor_to_eval_output(harbor_dir, output_file) + + with open(output_file) as f: + entries = [json.loads(line) for line in f] + + assert entries[0]["instance_id"] == "benchflow/weighted-gdp-calc" + def test_failed_trial(self, tmp_path: Path) -> None: """Test parsing of a trial with reward 0.""" 
trial_result = { From c1a62a28fac0c999c06408580ba4046c76fb00e5 Mon Sep 17 00:00:00 2001 From: Chujun Tao Date: Tue, 28 Apr 2026 22:35:41 -0400 Subject: [PATCH 11/12] fix: benchflow dataset loading --- benchmarks/skillsbench/README.md | 7 ++++--- benchmarks/skillsbench/run_infer.py | 28 ++++++++++++++++++---------- tests/test_skillsbench_run_infer.py | 18 ++++++++++++++---- uv.lock | 14 ++++++++++++++ 4 files changed, 50 insertions(+), 17 deletions(-) diff --git a/benchmarks/skillsbench/README.md b/benchmarks/skillsbench/README.md index c2c11928b..382c51f02 100644 --- a/benchmarks/skillsbench/README.md +++ b/benchmarks/skillsbench/README.md @@ -39,8 +39,9 @@ By default, `skillsbench-infer` keeps a local copy of `tasks/` from `https://github.com/benchflow-ai/skillsbench` on the `main` branch under `benchmarks/skillsbench/data/tasks`. It stores the synced upstream commit hash in `benchmarks/skillsbench/data/source.json` and refreshes the local snapshot when the -upstream `main` commit changes. The only supported dataset sources are this synced -SkillsBench snapshot and Harbor registry ids matching `benchflow/skillsbench@...`. +upstream `main` commit changes. Dataset aliases matching +`benchflow/skillsbench@...` resolve to this same local Harbor task dataset because +SkillsBench is not yet published in the public Harbor registry. ### Running Inference @@ -61,7 +62,7 @@ uv run skillsbench-infer .llm_config/claude.json --n-limit 5 # Run with multiple workers uv run skillsbench-infer .llm_config/claude.json --num-workers 4 -# Run against a Harbor registry dataset instead of the synced GitHub tasks +# Versioned SkillsBench aliases also resolve to the synced local dataset uv run skillsbench-infer .llm_config/claude.json --dataset benchflow/skillsbench@1.0 ``` diff --git a/benchmarks/skillsbench/run_infer.py b/benchmarks/skillsbench/run_infer.py index c8e06eee5..9e8c8496a 100644 --- a/benchmarks/skillsbench/run_infer.py +++ b/benchmarks/skillsbench/run_infer.py @@ -268,17 +268,22 @@ def ensure_skillsbench_tasks( def resolve_skillsbench_dataset(dataset: str) -> tuple[str, bool]: - """Resolve the dataset argument to a synced local snapshot or registry id.""" - if dataset == INFER_DEFAULTS["dataset"]: - local_tasks_dir = ensure_skillsbench_tasks() - return str(local_tasks_dir.resolve()), True + """Resolve the dataset argument to a synced local SkillsBench snapshot. + + Harbor 0.5.x validates ``--dataset`` values against the registry before + starting a job. SkillsBench is not yet published in the public registry, so + ``benchflow/skillsbench`` and versioned aliases like + ``benchflow/skillsbench@1.0`` must be resolved to the locally synced Harbor + task dataset generated by the SkillsBench adapter. + """ if dataset == REGISTRY_DATASET_PREFIX or dataset.startswith( f"{REGISTRY_DATASET_PREFIX}@" ): - return dataset, False + local_tasks_dir = ensure_skillsbench_tasks() + return str(local_tasks_dir.resolve()), True raise ValueError( "Unsupported SkillsBench dataset source. Use the default synced " - "SkillsBench snapshot or a Harbor registry id matching " + "SkillsBench snapshot or a SkillsBench dataset alias matching " "'benchflow/skillsbench@'." 
) @@ -579,13 +584,14 @@ def main() -> None: epilog=""" Examples: # Run full skillsbench evaluation using a local tasks/ snapshot synced from - # https://github.com/benchflow-ai/skillsbench main + # https://github.com/benchflow-ai/skillsbench main (adapter-generated + # Harbor tasks stored under benchmarks/skillsbench/data/tasks) uv run skillsbench-infer .llm_config/claude.json # Run specific tasks uv run skillsbench-infer .llm_config/claude.json --select tasks.txt - # Run against a Harbor registry dataset instead of the synced GitHub tasks + # Versioned SkillsBench aliases also resolve to the synced local dataset uv run skillsbench-infer .llm_config/claude.json --dataset benchflow/skillsbench@1.0 """, ) @@ -601,8 +607,10 @@ def main() -> None: default=INFER_DEFAULTS["dataset"], help=( "SkillsBench dataset source. The default value syncs tasks/ from the " - "benchflow-ai/skillsbench main branch. You can also pass a Harbor " - "registry id like benchflow/skillsbench@1.0." + "benchflow-ai/skillsbench main branch. Versioned aliases like " + "benchflow/skillsbench@1.0 also resolve to the same local Harbor " + "dataset because SkillsBench is not published in the public Harbor " + "registry yet." ), ) parser.add_argument( diff --git a/tests/test_skillsbench_run_infer.py b/tests/test_skillsbench_run_infer.py index 0632a6a46..ae97989e8 100644 --- a/tests/test_skillsbench_run_infer.py +++ b/tests/test_skillsbench_run_infer.py @@ -120,14 +120,24 @@ def fake_head(repo_url: str, branch: str) -> str: assert resolved == tasks_dir - def test_resolve_skillsbench_dataset_preserves_remote_registry_ids(self) -> None: - """Test that explicit Harbor dataset ids are passed through unchanged.""" + def test_resolve_skillsbench_dataset_maps_aliases_to_local_snapshot( + self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch + ) -> None: + """Test SkillsBench dataset aliases resolve to the local Harbor dataset.""" + tasks_dir = tmp_path / "tasks" + tasks_dir.mkdir() + + monkeypatch.setattr( + "benchmarks.skillsbench.run_infer.ensure_skillsbench_tasks", + lambda: tasks_dir, + ) + resolved_dataset, dataset_is_path = resolve_skillsbench_dataset( "benchflow/skillsbench@1.0" ) - assert resolved_dataset == "benchflow/skillsbench@1.0" - assert dataset_is_path is False + assert resolved_dataset == str(tasks_dir.resolve()) + assert dataset_is_path is True class TestRunHarborEvaluation: diff --git a/uv.lock b/uv.lock index 2cd0b3640..ec4350755 100644 --- a/uv.lock +++ b/uv.lock @@ -1282,6 +1282,7 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/44/69/9b804adb5fd0671f367781560eb5eb586c4d495277c93bde4307b9e28068/greenlet-3.2.4-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:3b67ca49f54cede0186854a008109d6ee71f66bd57bb36abd6d0a0267b540cdd", size = 274079, upload-time = "2025-08-07T13:15:45.033Z" }, { url = "https://files.pythonhosted.org/packages/46/e9/d2a80c99f19a153eff70bc451ab78615583b8dac0754cfb942223d2c1a0d/greenlet-3.2.4-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ddf9164e7a5b08e9d22511526865780a576f19ddd00d62f8a665949327fde8bb", size = 640997, upload-time = "2025-08-07T13:42:56.234Z" }, { url = "https://files.pythonhosted.org/packages/3b/16/035dcfcc48715ccd345f3a93183267167cdd162ad123cd93067d86f27ce4/greenlet-3.2.4-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:f28588772bb5fb869a8eb331374ec06f24a83a9c25bfa1f38b6993afe9c1e968", size = 655185, upload-time = "2025-08-07T13:45:27.624Z" }, + { url = 
"https://files.pythonhosted.org/packages/31/da/0386695eef69ffae1ad726881571dfe28b41970173947e7c558d9998de0f/greenlet-3.2.4-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:5c9320971821a7cb77cfab8d956fa8e39cd07ca44b6070db358ceb7f8797c8c9", size = 649926, upload-time = "2025-08-07T13:53:15.251Z" }, { url = "https://files.pythonhosted.org/packages/68/88/69bf19fd4dc19981928ceacbc5fd4bb6bc2215d53199e367832e98d1d8fe/greenlet-3.2.4-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:c60a6d84229b271d44b70fb6e5fa23781abb5d742af7b808ae3f6efd7c9c60f6", size = 651839, upload-time = "2025-08-07T13:18:30.281Z" }, { url = "https://files.pythonhosted.org/packages/19/0d/6660d55f7373b2ff8152401a83e02084956da23ae58cddbfb0b330978fe9/greenlet-3.2.4-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3b3812d8d0c9579967815af437d96623f45c0f2ae5f04e366de62a12d83a8fb0", size = 607586, upload-time = "2025-08-07T13:18:28.544Z" }, { url = "https://files.pythonhosted.org/packages/8e/1a/c953fdedd22d81ee4629afbb38d2f9d71e37d23caace44775a3a969147d4/greenlet-3.2.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:abbf57b5a870d30c4675928c37278493044d7c14378350b3aa5d484fa65575f0", size = 1123281, upload-time = "2025-08-07T13:42:39.858Z" }, @@ -1292,6 +1293,7 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/49/e8/58c7f85958bda41dafea50497cbd59738c5c43dbbea5ee83d651234398f4/greenlet-3.2.4-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:1a921e542453fe531144e91e1feedf12e07351b1cf6c9e8a3325ea600a715a31", size = 272814, upload-time = "2025-08-07T13:15:50.011Z" }, { url = "https://files.pythonhosted.org/packages/62/dd/b9f59862e9e257a16e4e610480cfffd29e3fae018a68c2332090b53aac3d/greenlet-3.2.4-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cd3c8e693bff0fff6ba55f140bf390fa92c994083f838fece0f63be121334945", size = 641073, upload-time = "2025-08-07T13:42:57.23Z" }, { url = "https://files.pythonhosted.org/packages/f7/0b/bc13f787394920b23073ca3b6c4a7a21396301ed75a655bcb47196b50e6e/greenlet-3.2.4-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:710638eb93b1fa52823aa91bf75326f9ecdfd5e0466f00789246a5280f4ba0fc", size = 655191, upload-time = "2025-08-07T13:45:29.752Z" }, + { url = "https://files.pythonhosted.org/packages/f2/d6/6adde57d1345a8d0f14d31e4ab9c23cfe8e2cd39c3baf7674b4b0338d266/greenlet-3.2.4-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:c5111ccdc9c88f423426df3fd1811bfc40ed66264d35aa373420a34377efc98a", size = 649516, upload-time = "2025-08-07T13:53:16.314Z" }, { url = "https://files.pythonhosted.org/packages/7f/3b/3a3328a788d4a473889a2d403199932be55b1b0060f4ddd96ee7cdfcad10/greenlet-3.2.4-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:d76383238584e9711e20ebe14db6c88ddcedc1829a9ad31a584389463b5aa504", size = 652169, upload-time = "2025-08-07T13:18:32.861Z" }, { url = "https://files.pythonhosted.org/packages/ee/43/3cecdc0349359e1a527cbf2e3e28e5f8f06d3343aaf82ca13437a9aa290f/greenlet-3.2.4-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:23768528f2911bcd7e475210822ffb5254ed10d71f4028387e5a99b4c6699671", size = 610497, upload-time = "2025-08-07T13:18:31.636Z" }, { url = "https://files.pythonhosted.org/packages/b8/19/06b6cf5d604e2c382a6f31cafafd6f33d5dea706f4db7bdab184bad2b21d/greenlet-3.2.4-cp313-cp313-musllinux_1_1_aarch64.whl", hash = 
"sha256:00fadb3fedccc447f517ee0d3fd8fe49eae949e1cd0f6a611818f4f6fb7dc83b", size = 1121662, upload-time = "2025-08-07T13:42:41.117Z" }, @@ -1302,6 +1304,7 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/22/5c/85273fd7cc388285632b0498dbbab97596e04b154933dfe0f3e68156c68c/greenlet-3.2.4-cp314-cp314-macosx_11_0_universal2.whl", hash = "sha256:49a30d5fda2507ae77be16479bdb62a660fa51b1eb4928b524975b3bde77b3c0", size = 273586, upload-time = "2025-08-07T13:16:08.004Z" }, { url = "https://files.pythonhosted.org/packages/d1/75/10aeeaa3da9332c2e761e4c50d4c3556c21113ee3f0afa2cf5769946f7a3/greenlet-3.2.4-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:299fd615cd8fc86267b47597123e3f43ad79c9d8a22bebdce535e53550763e2f", size = 686346, upload-time = "2025-08-07T13:42:59.944Z" }, { url = "https://files.pythonhosted.org/packages/c0/aa/687d6b12ffb505a4447567d1f3abea23bd20e73a5bed63871178e0831b7a/greenlet-3.2.4-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:c17b6b34111ea72fc5a4e4beec9711d2226285f0386ea83477cbb97c30a3f3a5", size = 699218, upload-time = "2025-08-07T13:45:30.969Z" }, + { url = "https://files.pythonhosted.org/packages/dc/8b/29aae55436521f1d6f8ff4e12fb676f3400de7fcf27fccd1d4d17fd8fecd/greenlet-3.2.4-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:b4a1870c51720687af7fa3e7cda6d08d801dae660f75a76f3845b642b4da6ee1", size = 694659, upload-time = "2025-08-07T13:53:17.759Z" }, { url = "https://files.pythonhosted.org/packages/92/2e/ea25914b1ebfde93b6fc4ff46d6864564fba59024e928bdc7de475affc25/greenlet-3.2.4-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:061dc4cf2c34852b052a8620d40f36324554bc192be474b9e9770e8c042fd735", size = 695355, upload-time = "2025-08-07T13:18:34.517Z" }, { url = "https://files.pythonhosted.org/packages/72/60/fc56c62046ec17f6b0d3060564562c64c862948c9d4bc8aa807cf5bd74f4/greenlet-3.2.4-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:44358b9bf66c8576a9f57a590d5f5d6e72fa4228b763d0e43fee6d3b06d3a337", size = 657512, upload-time = "2025-08-07T13:18:33.969Z" }, { url = "https://files.pythonhosted.org/packages/23/6e/74407aed965a4ab6ddd93a7ded3180b730d281c77b765788419484cdfeef/greenlet-3.2.4-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:2917bdf657f5859fbf3386b12d68ede4cf1f04c90c3a6bc1f013dd68a22e2269", size = 1612508, upload-time = "2025-11-04T12:42:23.427Z" }, @@ -2467,6 +2470,7 @@ dependencies = [ { name = "python-json-logger" }, { name = "requests" }, { name = "swebench" }, + { name = "swesmith" }, { name = "swt-bench" }, { name = "tenacity" }, { name = "toml" }, @@ -2521,6 +2525,7 @@ requires-dist = [ { name = "python-json-logger", specifier = ">=3.3.0" }, { name = "requests" }, { name = "swebench", specifier = "==4.1.0" }, + { name = "swesmith", specifier = ">=0.0.9" }, { name = "swt-bench", git = "https://github.com/logic-star-ai/swt-bench.git?rev=5fdcd446ff05e248ecfffc19d560a210699f71f8" }, { name = "tenacity", specifier = ">=9.1.2" }, { name = "toml" }, @@ -6841,6 +6846,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/36/67/981d8b642ac3eac7c8a7b7832ff8b2fb74f96b28b5fcd9a8979879e5c46d/swebench-4.1.0-py3-none-any.whl", hash = "sha256:1243776f720047cc9e20a427f7a52b75c13a07abda6154fb60fe77f82ec8af57", size = 157231, upload-time = "2025-09-11T02:57:58.953Z" }, ] +[[package]] +name = "swesmith" +version = "0.0.9" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/07/97/e506b20fa59debc66e4660a86b0e98b45d32c87f23b994ad739e9c5d542a/swesmith-0.0.9.tar.gz", hash = "sha256:1726124ea43577853c6efb0a5a0db5fa3ce5c340e1bed479afa5bab85d8a69da", size = 214830, upload-time = "2026-02-27T01:06:13.455Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/80/2d/71b6ac5dadbe7199085de3815624775744d51b6c554efeeddfb12dc45ce1/swesmith-0.0.9-py3-none-any.whl", hash = "sha256:cbb98a52fc573b38032cde1179b6ce5f5862ce7c31d6931cfd5b8ad4969ce900", size = 275800, upload-time = "2026-02-27T01:06:11.864Z" }, +] + [[package]] name = "swt-bench" version = "1.0.1" From 87c3bd3f4b29224c45f7126e1d9b11612483ce2a Mon Sep 17 00:00:00 2001 From: Chujun Tao Date: Wed, 29 Apr 2026 10:47:31 -0400 Subject: [PATCH 12/12] enhance: skill loading and readme update --- benchmarks/skillsbench/README.md | 43 ++++++++++- benchmarks/skillsbench/run_infer.py | 110 ++++++++++++++++++++++++++++ 2 files changed, 152 insertions(+), 1 deletion(-) diff --git a/benchmarks/skillsbench/README.md b/benchmarks/skillsbench/README.md index 382c51f02..a67829193 100644 --- a/benchmarks/skillsbench/README.md +++ b/benchmarks/skillsbench/README.md @@ -31,7 +31,14 @@ SkillsBench comprises tasks across 11 domains, evaluating the efficacy of Skills 2. **Docker**: Harbor requires Docker to be installed and running. -3. **LLM API Key**: Configure your LLM provider credentials. +3. **Modal Credentials**: Some tasks (e.g., `mhc-implementation`, `diff-transformer`) run workloads on [Modal](https://modal.com) and require a Modal token. Set the following environment variables before running: + + ```bash + export MODAL_TOKEN_ID=your_token_id + export MODAL_TOKEN_SECRET=your_token_secret + ``` + +4. **LLM API Key**: Configure your LLM provider credentials. ## Usage @@ -64,8 +71,42 @@ uv run skillsbench-infer .llm_config/claude.json --num-workers 4 # Versioned SkillsBench aliases also resolve to the synced local dataset uv run skillsbench-infer .llm_config/claude.json --dataset benchflow/skillsbench@1.0 + +# Run with agent skill definitions injected into task environments +uv run skillsbench-infer .llm_config/claude.json --with-skills + +# Combine task selection with skills injection +uv run skillsbench-infer .llm_config/claude.json --task-id benchflow/weighted-gdp-calc --with-skills +uv run skillsbench-infer .llm_config/claude.json --select tasks.txt --with-skills +uv run skillsbench-infer .llm_config/claude.json --n-limit 5 --with-skills +``` + +### Skills Injection (`--with-skills`) + +The `--with-skills` flag injects agent skill definitions into the Docker environment of each evaluated task. When enabled, the following `COPY` instructions are added to each task's Dockerfile before building: + +```dockerfile +# Claude Code +COPY skills /root/.claude/skills +# Claude Code (Harbor compatibility) +COPY skills /etc/claude-code/.claude/skills +# Codex +COPY skills /root/.codex/skills +# OpenCode +COPY skills /root/.opencode/skill +# Goose +COPY skills /root/.goose/skills +# Factory +COPY skills /root/.factory/skills +# Portable agents format (Goose, Amp) +COPY skills /root/.agents/skills ``` +This makes any skills bundled in the task's `environment/skills/` directory available to the agent at the standard skill lookup paths for each supported agent framework. + +- Dockerfiles are automatically restored to their original content after Harbor finishes, regardless of success or failure. +- The `with_skills` flag is recorded in `metadata.json` alongside each evaluation run. 
+ ### LLM Configuration Create an LLM configuration file (e.g., `.llm_config/claude.json`): diff --git a/benchmarks/skillsbench/run_infer.py b/benchmarks/skillsbench/run_infer.py index 9e8c8496a..535a27d62 100644 --- a/benchmarks/skillsbench/run_infer.py +++ b/benchmarks/skillsbench/run_infer.py @@ -40,6 +40,25 @@ REGISTRY_DATASET_PREFIX = "benchflow/skillsbench" INSTANCE_ID_PREFIX = "benchflow" +# Skills COPY block injected into Dockerfiles when --with-skills is set. +# RUN mkdir -p lines ensure parent directories exist before COPY. +SKILLS_COPY_BLOCK = """\ +# Claude Code +COPY skills /root/.claude/skills +# Claude Code (Harbor compatibility) +COPY skills /etc/claude-code/.claude/skills +# Codex +COPY skills /root/.codex/skills +# OpenCode +COPY skills /root/.opencode/skill +# Goose +COPY skills /root/.goose/skills +# Factory +COPY skills /root/.factory/skills +# Portable agents format (Goose, Amp) +COPY skills /root/.agents/skills +""" + def check_harbor_installed() -> bool: """Check if harbor CLI is installed and available.""" @@ -302,6 +321,66 @@ def _canonicalize_instance_id(task_name: str) -> str: return f"{INSTANCE_ID_PREFIX}/{task_name}" +def get_target_dockerfiles( + tasks_dir: Path, + task_ids: list[str] | None, +) -> list[Path]: + """Return Dockerfile paths for the selected tasks (or all tasks if none specified).""" + if task_ids: + names = [tid.rsplit("/", 1)[-1] for tid in task_ids] + candidates = [tasks_dir / name / "environment" / "Dockerfile" for name in names] + else: + candidates = list(tasks_dir.glob("*/environment/Dockerfile")) + + found = [p for p in candidates if p.is_file()] + missing = [p for p in candidates if not p.is_file()] + for p in missing: + logger.warning("Dockerfile not found (skipping skills injection): %s", p) + return found + + +def inject_skills_into_dockerfiles( + dockerfiles: list[Path], +) -> list[tuple[Path, str]]: + """Inject SKILLS_COPY_BLOCK into Dockerfiles that don't already contain it. + + Returns a list of (path, original_content) for every file that was modified, + so callers can revert with revert_dockerfiles(). + """ + reverts: list[tuple[Path, str]] = [] + for dockerfile in dockerfiles: + original = dockerfile.read_text(encoding="utf-8") + if "COPY skills" in original: + logger.debug("Skills already present in %s, skipping injection", dockerfile) + continue + + # Insert the block after the last WORKDIR directive, or at end of file. 
+ lines = original.splitlines(keepends=True) + insert_at = len(lines) + for i, line in enumerate(lines): + if line.strip().upper().startswith("WORKDIR"): + insert_at = i + 1 + + injected_lines = ( + lines[:insert_at] + ["\n", SKILLS_COPY_BLOCK] + lines[insert_at:] + ) + dockerfile.write_text("".join(injected_lines), encoding="utf-8") + reverts.append((dockerfile, original)) + logger.info("Injected skills COPY block into %s", dockerfile) + + return reverts + + +def revert_dockerfiles(reverts: list[tuple[Path, str]]) -> None: + """Restore Dockerfiles to their original content after skills injection.""" + for dockerfile, original in reverts: + try: + dockerfile.write_text(original, encoding="utf-8") + logger.info("Reverted %s", dockerfile) + except OSError as e: + logger.error("Failed to revert %s: %s", dockerfile, e) + + def run_harbor_evaluation( llm: LLM, dataset: str, @@ -651,6 +730,17 @@ def main() -> None: action="store_true", help="Skip running harbor and only convert existing results", ) + parser.add_argument( + "--with-skills", + action="store_true", + default=False, + help=( + "Inject agent skill definitions into the selected task Dockerfiles before " + "running evaluation. Adds COPY instructions for Claude Code, Codex, " + "OpenCode, Goose, Factory, and portable-agents skill directories. " + "Dockerfiles are restored to their original state after Harbor completes." + ), + ) args = parser.parse_args() @@ -711,6 +801,7 @@ def main() -> None: "timestamp": datetime.now(timezone.utc).isoformat(), "harbor_agent": HARBOR_DEFAULTS["agent_name"], "note": args.note, + "with_skills": args.with_skills, } metadata_path = Path(structured_output_dir) / "metadata.json" with open(metadata_path, "w") as f: @@ -729,6 +820,18 @@ def main() -> None: output_path = Path(structured_output_dir) / OUTPUT_FILENAME if not args.skip_harbor: + # Optionally inject skill definitions into task Dockerfiles + dockerfile_reverts: list[tuple[Path, str]] = [] + if args.with_skills and dataset_is_path: + target_dockerfiles = get_target_dockerfiles( + tasks_dir=Path(resolved_dataset), + task_ids=task_ids, + ) + dockerfile_reverts = inject_skills_into_dockerfiles(target_dockerfiles) + logger.info( + "Injected skills into %d Dockerfile(s)", len(dockerfile_reverts) + ) + # Run harbor evaluation try: harbor_output_dir = run_harbor_evaluation( @@ -750,6 +853,13 @@ def main() -> None: except Exception as e: logger.error(f"Evaluation failed: {e}") sys.exit(1) + finally: + if dockerfile_reverts: + revert_dockerfiles(dockerfile_reverts) + logger.info( + "Reverted %d Dockerfile(s) after evaluation", + len(dockerfile_reverts), + ) else: # Skip harbor, just convert existing results harbor_output_dir = Path(structured_output_dir) / "harbor_output"