From 3bf623230fd9f61272e1982e61225f6ded7b5aed Mon Sep 17 00:00:00 2001 From: Chujun Tao Date: Sun, 5 Apr 2026 19:33:34 -0400 Subject: [PATCH 01/12] integrate skillsbench --- benchmarks/skillsbench/README.md | 163 ++++++++++ benchmarks/skillsbench/__init__.py | 1 + benchmarks/skillsbench/config.py | 16 + benchmarks/skillsbench/eval_infer.py | 280 ++++++++++++++++ benchmarks/skillsbench/run_infer.py | 467 +++++++++++++++++++++++++++ benchmarks/utils/report_costs.py | 4 +- pyproject.toml | 2 + 7 files changed, 932 insertions(+), 1 deletion(-) create mode 100644 benchmarks/skillsbench/README.md create mode 100644 benchmarks/skillsbench/__init__.py create mode 100644 benchmarks/skillsbench/config.py create mode 100644 benchmarks/skillsbench/eval_infer.py create mode 100644 benchmarks/skillsbench/run_infer.py diff --git a/benchmarks/skillsbench/README.md b/benchmarks/skillsbench/README.md new file mode 100644 index 000000000..60ff73652 --- /dev/null +++ b/benchmarks/skillsbench/README.md @@ -0,0 +1,163 @@ +# SkillsBench Evaluation + +This module provides integration with [SkillsBench](https://www.skillsbench.ai/), a benchmark for evaluating AI agents on real-world skill-based tasks. The integration uses [Harbor](https://harborframework.com) as the evaluation harness with the `openhands-sdk` agent. + +## Overview + +SkillsBench comprises tasks across 11 domains, evaluating the efficacy of Skills augmentation in LLM-based agents.Domains contain + +- Software engineering +- Office & white collar +- Natural science +- Media & content production +- Cybersecurity +- Finance +- Robotics +- Manufacturing +- Energy +- Mathematics +- Healthcare + +## Prerequisites + +1. **Install Harbor**: Harbor is the official harness for running SkillsBench. + + ```bash + pip install harbor + # or + uv pip install harbor + ``` + +2. **Docker**: Harbor requires Docker to be installed and running. + +3. **LLM API Key**: Configure your LLM provider credentials. 
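Before running anything, a quick sanity check of the prerequisites can save time. A minimal sketch, assuming Docker and the Harbor CLI are already on your `PATH` (`harbor --version` is the same check `skillsbench-infer` performs before starting):

```bash
docker info        # confirms the Docker daemon is reachable
harbor --version   # confirms the Harbor CLI is installed
```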
+ +## Usage + +### Running Inference + +Run the SkillsBench evaluation using the OpenHands SDK agent: + +```bash +# Run full evaluation +uv run skillsbench-infer .llm_config/claude.json + +# Run specific tasks +uv run skillsbench-infer .llm_config/claude.json --task-id benchflow/weighted-gdp-calc + +# Run tasks from a file +uv run skillsbench-infer .llm_config/claude.json --select tasks.txt + +# Limit the run to 5 tasks (useful for smoke tests) +uv run skillsbench-infer .llm_config/claude.json --n-limit 5 + +# Run with multiple workers +uv run skillsbench-infer .llm_config/claude.json --num-workers 4 +``` + +### LLM Configuration + +Create an LLM configuration file (e.g., `.llm_config/claude.json`): + +```json +{ + "model": "anthropic/claude-sonnet-4-20250514", + "api_key": "YOUR_API_KEY" +} +``` + +Or use a LiteLLM proxy: + +```json +{ + "model": "litellm_proxy/anthropic/claude-sonnet-4-20250514", + "base_url": "https://your-proxy.example.com", + "api_key": "YOUR_API_KEY" +} +``` + +### Evaluating Results + +After running inference, evaluate the results: + +```bash +uv run skillsbench-eval ./evaluation_outputs/.../output.jsonl +``` + +This generates a report file (`output.report.json`) with: +- Total/completed/resolved instance counts +- Success rate +- Aggregate metrics (cost, tokens) + +## Output Format + +### Inference Output (`output.jsonl`) + +Each line contains: + +```json +{ + "instance_id": "benchflow/task-name", + "test_result": { + "trial_name": "...", + "trial_uri": "...", + "rewards": {"reward": 1.0}, + "passed": true + }, + "instruction": "", + "error": null, + "history": [], + "metrics": { + "total_prompt_tokens": 5000, + "total_completion_tokens": 1000, + "total_cost_usd": 0.05 + } +} +``` + +### Evaluation Report (`output.report.json`) + +```json +{ + "total_instances": 100, + "completed_instances": 95, + "resolved_instances": 80, + "unresolved_instances": 15, + "error_instances": 5, + "aggregate_metrics": { + "total_cost_usd": 5.25, + "total_prompt_tokens": 500000, + "total_completion_tokens": 100000 + } +} +``` + +## Architecture + +The integration follows the Harbor agent adapter pattern: + +1. **Harbor Harness**: Manages task containers and lifecycle +2. **OpenHands SDK Agent**: Runs inside containers to solve tasks +3. 
**ATIF Trajectories**: Results stored in Agent Trajectory Interchange Format + +```text +┌──────────────────────────────────────────────────┐ +│ Harbor Harness │ +│ ┌────────────────────────────────────────────┐ │ +│ │ Task Container │ │ +│ │ ┌──────────────────────────────────────┐ │ │ +│ │ │ OpenHands SDK Agent │ │ │ +│ │ │ - Terminal tool │ │ │ +│ │ │ - File editor tool │ │ │ +│ │ │ - Task tracker tool │ │ │ +│ │ └──────────────────────────────────────┘ │ │ +│ └────────────────────────────────────────────┘ │ +└──────────────────────────────────────────────────┘ +``` + +## References + +- [SkillsBench](https://www.skillsbench.ai/) - The benchmark +- [Harbor](https://harborframework.com) - The evaluation harness +- [OpenHands SDK](https://github.com/OpenHands/software-agent-sdk) - The agent SDK +- [ATIF Specification](https://github.com/laude-institute/harbor/blob/main/docs/rfcs/0001-trajectory-format.md) - Trajectory format diff --git a/benchmarks/skillsbench/__init__.py b/benchmarks/skillsbench/__init__.py new file mode 100644 index 000000000..c02f7bafb --- /dev/null +++ b/benchmarks/skillsbench/__init__.py @@ -0,0 +1 @@ +# SkillsBench evaluation benchmark diff --git a/benchmarks/skillsbench/config.py b/benchmarks/skillsbench/config.py new file mode 100644 index 000000000..8b55a92b0 --- /dev/null +++ b/benchmarks/skillsbench/config.py @@ -0,0 +1,16 @@ +"""SkillsBench configuration defaults.""" + +# Default inference settings (only include values actually used by argparse) +INFER_DEFAULTS = { + "dataset": "benchflow/skillsbench", + "output_dir": "./evaluation_outputs", + "num_workers": 1, +} + +# Harbor configuration defaults +HARBOR_DEFAULTS = { + # Harbor executable + "harbor_executable": "harbor", + # Default agent name for openhands-sdk + "agent_name": "openhands-sdk", +} diff --git a/benchmarks/skillsbench/eval_infer.py b/benchmarks/skillsbench/eval_infer.py new file mode 100644 index 000000000..f55a91736 --- /dev/null +++ b/benchmarks/skillsbench/eval_infer.py @@ -0,0 +1,280 @@ +#!/usr/bin/env python3 +"""SkillsBench Evaluation Script. + +This script processes SkillsBench output and generates evaluation reports. +It reads the output.jsonl produced by run_infer, aggregates results, +and writes a summary report. + +Usage: + uv run skillsbench-eval +""" + +import argparse +import json +import sys +from pathlib import Path + +from benchmarks.utils.laminar import LaminarService +from benchmarks.utils.report_costs import generate_cost_report +from openhands.sdk import get_logger + + +logger = get_logger(__name__) + + +def process_skillsbench_results( + input_file: str, + output_file: str, +) -> dict: + """Process SkillsBench output.jsonl and generate evaluation report. + + SkillsBench format (from harbor conversion): + { + "instance_id": "task_id", + "test_result": { + "trajectory_path": "...", + "total_steps": N, + "final_metrics": {...}, + "passed": true/false # May be populated by harbor grading + }, + "instruction": "...", + "history": [...] + } + + Report format (similar to SWE-Bench): + { + "total_instances": N, + "submitted_instances": N, + "completed_instances": N, + "incomplete_instances": N, + "resolved_instances": N, + "unresolved_instances": N, + "error_instances": N, + "submitted_ids": [...], + "completed_ids": [...], + "incomplete_ids": [...], + "resolved_ids": [...], + "unresolved_ids": [...] 
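        "error_ids": [...],
        "aggregate_metrics": {...}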
+ } + """ + logger.info(f"Processing {input_file} to generate report: {output_file}") + + # Use sets for O(1) lookup and automatic deduplication + # Convert to sorted lists only when building final report + completed_ids: set[str] = set() + resolved_ids: set[str] = set() + unresolved_ids: set[str] = set() + incomplete_ids: set[str] = set() + error_ids: set[str] = set() + + # Aggregate metrics + total_cost_usd = 0.0 + total_prompt_tokens = 0 + total_completion_tokens = 0 + + with open(input_file) as infile: + for line_num, line in enumerate(infile, 1): + try: + line = line.strip() + if not line: + continue + + data = json.loads(line) + + # Extract required fields + instance_id = data.get("instance_id") + if not instance_id: + logger.warning(f"Line {line_num}: Missing instance_id") + continue + + if instance_id in completed_ids: + logger.warning( + f"Line {line_num}: Duplicate instance_id {instance_id}" + ) + continue + + # Check for errors + error = data.get("error") + if error: + error_ids.add(instance_id) + incomplete_ids.add(instance_id) + continue + + # Extract test result + test_result = data.get("test_result", {}) + + # Check if task passed (harbor may include this) + passed = test_result.get("passed") + # If not explicitly set, we mark as completed but ungraded + is_resolved = passed is True + + # Add to completed instances + completed_ids.add(instance_id) + + if is_resolved: + resolved_ids.add(instance_id) + else: + unresolved_ids.add(instance_id) + + # Aggregate metrics + # Use explicit None check to handle zero values correctly + # (using `or` would incorrectly fallback when value is 0) + metrics = data.get("metrics", {}) + final_metrics = test_result.get("final_metrics", {}) + + cost = metrics.get("total_cost_usd") + if cost is None: + cost = final_metrics.get("total_cost_usd", 0.0) + + prompt_tokens = metrics.get("total_prompt_tokens") + if prompt_tokens is None: + prompt_tokens = final_metrics.get("total_prompt_tokens", 0) + + completion_tokens = metrics.get("total_completion_tokens") + if completion_tokens is None: + completion_tokens = final_metrics.get("total_completion_tokens", 0) + + # After the None checks above, these values are guaranteed to be non-None + total_cost_usd += cost + total_prompt_tokens += prompt_tokens + total_completion_tokens += completion_tokens + + except json.JSONDecodeError as e: + logger.error(f"Line {line_num}: Invalid JSON - {e}") + except Exception as e: + logger.error(f"Line {line_num}: Unexpected error - {e}") + + # Check for separate error file (used in manual workflows where errors + # are extracted to a separate file for analysis/retry) + error_path = Path(input_file).with_name(f"{Path(input_file).stem}_errors.jsonl") + if error_path.exists(): + with open(error_path) as error_file: + for line_num, line in enumerate(error_file, 1): + try: + line = line.strip() + if not line: + continue + + data = json.loads(line) + instance_id = data.get("instance_id") + if not instance_id: + continue + if instance_id in completed_ids or instance_id in incomplete_ids: + continue + + incomplete_ids.add(instance_id) + error_ids.add(instance_id) + except (json.JSONDecodeError, Exception) as e: + logger.error(f"Error file line {line_num}: {e}") + + submitted_ids = completed_ids | incomplete_ids + + # Generate report - convert sets to sorted lists for consistent output + report = { + "total_instances": len(submitted_ids), + "submitted_instances": len(submitted_ids), + "completed_instances": len(completed_ids), + "incomplete_instances": len(incomplete_ids), + 
"resolved_instances": len(resolved_ids), + "unresolved_instances": len(unresolved_ids), + "error_instances": len(error_ids), + "submitted_ids": sorted(submitted_ids), + "completed_ids": sorted(completed_ids), + "incomplete_ids": sorted(incomplete_ids), + "resolved_ids": sorted(resolved_ids), + "unresolved_ids": sorted(unresolved_ids), + "error_ids": sorted(error_ids), + # Aggregate metrics + "aggregate_metrics": { + "total_cost_usd": total_cost_usd, + "total_prompt_tokens": total_prompt_tokens, + "total_completion_tokens": total_completion_tokens, + }, + } + + # Write report + with open(output_file, "w") as outfile: + json.dump(report, outfile, indent=4) + + logger.info("Report generated successfully:") + logger.info(f" Total instances: {report['total_instances']}") + logger.info(f" Completed instances: {report['completed_instances']}") + logger.info(f" Resolved instances: {report['resolved_instances']}") + logger.info(f" Unresolved instances: {report['unresolved_instances']}") + logger.info(f" Error instances: {report['error_instances']}") + if report["completed_instances"] > 0: + logger.info( + f" Success rate: " + f"{report['resolved_instances'] / report['completed_instances'] * 100:.1f}%" + ) + logger.info(f" Total cost: ${total_cost_usd:.4f}") + + return report + + +def main() -> None: + """Main entry point for the script.""" + parser = argparse.ArgumentParser( + description="Process SkillsBench output and generate evaluation report", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + uv run skillsbench-eval output.jsonl + uv run skillsbench-eval /path/to/output.jsonl + """, + ) + + parser.add_argument("input_file", help="Path to the SkillsBench output.jsonl file") + parser.add_argument( + "--output-file", + help="Output file for report (default: input_file with .report.json extension)", + ) + + args = parser.parse_args() + + # Validate input file + input_file = Path(args.input_file) + if not input_file.exists(): + logger.error(f"Input file does not exist: {input_file}") + sys.exit(1) + + if not input_file.suffix == ".jsonl": + logger.warning(f"Input file does not have .jsonl extension: {input_file}") + + # Determine output file + if args.output_file: + output_file = Path(args.output_file) + else: + output_file = input_file.with_suffix(".report.json") + + logger.info(f"Input file: {input_file}") + logger.info(f"Output file: {output_file}") + + try: + # Process results and generate report + process_skillsbench_results( + str(input_file), + str(output_file), + ) + except Exception as e: + logger.error(f"Script failed: {e}") + sys.exit(1) + + # Non-critical telemetry and reporting - wrap in try/except so expensive + # multi-hour evaluations don't fail at the telemetry step after completing + try: + LaminarService.get().update_evaluation_scores(str(input_file), str(output_file)) + except Exception as e: + logger.warning(f"Laminar update failed (non-critical): {e}") + + try: + generate_cost_report(str(input_file)) + except Exception as e: + logger.warning(f"Cost report generation failed (non-critical): {e}") + + logger.info("Script completed successfully!") + print(json.dumps({"report_json": str(output_file)})) + + +if __name__ == "__main__": + main() diff --git a/benchmarks/skillsbench/run_infer.py b/benchmarks/skillsbench/run_infer.py new file mode 100644 index 000000000..a8afa7281 --- /dev/null +++ b/benchmarks/skillsbench/run_infer.py @@ -0,0 +1,467 @@ +"""SkillsBench inference script using Harbor with openhands-sdk agent. 
+ +This script runs SkillsBench evaluation using Harbor as the harness +and openhands-sdk as the agent. Results are saved in a format compatible +with the standard evaluation pipeline. + +Usage: + uv run skillsbench-infer --dataset benchflow/skillsbench +""" + +import argparse +import json +import os +import subprocess +import sys +from datetime import datetime, timezone +from pathlib import Path + +from pydantic import SecretStr + +from benchmarks.skillsbench.config import HARBOR_DEFAULTS, INFER_DEFAULTS +from benchmarks.utils.evaluation_utils import construct_eval_output_dir +from benchmarks.utils.report_costs import generate_cost_report +from openhands.sdk import LLM, get_logger + + +logger = get_logger(__name__) + +# Output filename for results +OUTPUT_FILENAME = "output.jsonl" + + +def check_harbor_installed() -> bool: + """Check if harbor CLI is installed and available.""" + harbor_exe = HARBOR_DEFAULTS["harbor_executable"] + try: + result = subprocess.run( + [harbor_exe, "--version"], + capture_output=True, + text=True, + timeout=10, + ) + return result.returncode == 0 + except (FileNotFoundError, subprocess.TimeoutExpired): + return False + + +def run_harbor_evaluation( + llm: LLM, + dataset: str, + output_dir: str, + num_workers: int = 1, + task_ids: list[str] | None = None, + n_limit: int | None = None, +) -> Path: + """Run harbor evaluation with openhands-sdk agent. + + Args: + llm: LLM configuration for the agent. + dataset: Harbor dataset name (e.g., benchflow/skillsbench). + output_dir: Directory to store output files. + num_workers: Number of parallel workers. + task_ids: Optional list of specific task IDs to run. + n_limit: Optional maximum number of dataset tasks to run. + + Returns: + Path to the harbor output directory. + """ + harbor_output_dir = Path(output_dir) / "harbor_output" + harbor_output_dir.mkdir(parents=True, exist_ok=True) + harbor_exe = HARBOR_DEFAULTS["harbor_executable"] + + # Build harbor command using harbor CLI flags. + # Use absolute path for --jobs-dir to avoid CWD-relative path issues. + cmd = [ + harbor_exe, + "run", + "-d", + dataset, + "-a", + HARBOR_DEFAULTS["agent_name"], + "-m", + llm.model, + "--jobs-dir", + str(harbor_output_dir.resolve()), + "--n-concurrent", + str(num_workers), + ] + + # Pass LLM credentials as agent environment variables + if llm.api_key: + api_key = ( + llm.api_key.get_secret_value() + if isinstance(llm.api_key, SecretStr) + else llm.api_key + ) + cmd.extend(["--ae", f"LLM_API_KEY={api_key}"]) + if llm.base_url: + cmd.extend(["--ae", f"LLM_BASE_URL={llm.base_url}"]) + + # Add specific task names if provided + if task_ids: + for task_id in task_ids: + cmd.extend(["--include-task-name", task_id]) + + if n_limit is not None: + cmd.extend(["--n-tasks", str(n_limit)]) + + logger.info(f"Running harbor command: {' '.join(cmd)}") + logger.info(f"Output directory: {harbor_output_dir}") + + # harbor's openhands-sdk agent reads LLM credentials from the host process + # environment (os.environ), not from --ae flags which go to the sandbox. 
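    # Credentials are therefore supplied twice: the --ae flags above target the
    # task sandbox, while the host env below reaches the agent process itself.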
+ env = os.environ.copy() + if llm.api_key: + api_key = ( + llm.api_key.get_secret_value() + if isinstance(llm.api_key, SecretStr) + else llm.api_key + ) + env["LLM_API_KEY"] = api_key + if llm.base_url: + env["LLM_BASE_URL"] = llm.base_url + + try: + result = subprocess.run( + cmd, + capture_output=True, + text=True, + env=env, + ) + + if result.returncode != 0: + logger.error(f"Harbor command failed with code {result.returncode}") + logger.error(f"stdout: {result.stdout}") + logger.error(f"stderr: {result.stderr}") + raise RuntimeError(f"Harbor evaluation failed: {result.stderr}") + + logger.info("Harbor evaluation completed successfully") + logger.info(f"stdout: {result.stdout}") + + except FileNotFoundError: + raise RuntimeError( + "Harbor CLI not found. Please install harbor: pip install harbor" + ) + + return harbor_output_dir + + +def _find_job_dir(harbor_output_dir: Path) -> Path: + """Find the harbor job directory (timestamp-named) inside the output dir.""" + # Harbor creates a timestamp-named subdirectory (e.g., 2026-03-07__16-08-47) + # containing result.json and trial subdirectories + candidates = [ + d + for d in harbor_output_dir.iterdir() + if d.is_dir() and (d / "result.json").exists() + ] + if not candidates: + raise RuntimeError( + f"No harbor job directory found in {harbor_output_dir}. " + f"Expected a timestamp-named directory containing result.json." + ) + # Use the most recent job directory if multiple exist + return sorted(candidates)[-1] + + +def convert_harbor_to_eval_output( + harbor_output_dir: Path, + eval_output_path: Path, +) -> None: + """Convert harbor output to evaluation output format. + + Harbor stores trial results in a job directory structured as: + harbor_output/TIMESTAMP/TRIAL_NAME/result.json + + Each trial's result.json contains task_name, verifier_result, agent_result, + timing info, and exception details. + + Args: + harbor_output_dir: Path to harbor output directory. + eval_output_path: Path to write the converted output.jsonl. + """ + logger.info(f"Converting harbor output from {harbor_output_dir}") + + job_dir = _find_job_dir(harbor_output_dir) + logger.info(f"Using harbor job directory: {job_dir}") + + # Find trial result files (each trial dir has a result.json) + result_files = list(job_dir.glob("*/result.json")) + # Exclude the job-level result.json + result_files = [f for f in result_files if f.parent != job_dir] + + if not result_files: + raise RuntimeError( + f"No trial result files found in {job_dir}. " + f"Expected result.json files in trial subdirectories." 
+ ) + + logger.info(f"Found {len(result_files)} trial results in {job_dir}") + + results: list[dict] = [] + errors: list[dict] = [] + + for result_file in result_files: + try: + with open(result_file) as f: + trial = json.load(f) + + instance_id = trial.get("task_name", result_file.parent.name) + + # Check for exceptions + if trial.get("exception_info"): + errors.append( + { + "instance_id": instance_id, + "error": str(trial["exception_info"]), + "test_result": {}, + } + ) + continue + + # Extract verifier results + verifier_result = trial.get("verifier_result", {}) + rewards = verifier_result.get("rewards", {}) + passed = rewards.get("reward", 0.0) > 0 + + # Extract agent metrics + agent_result = trial.get("agent_result", {}) + + eval_entry = { + "instance_id": instance_id, + "test_result": { + "trial_name": trial.get("trial_name"), + "trial_uri": trial.get("trial_uri"), + "rewards": rewards, + "passed": passed, + }, + "instruction": "", + "error": None, + "history": [], + "metrics": { + "total_prompt_tokens": agent_result.get("n_input_tokens") or 0, + "total_completion_tokens": ( + agent_result.get("n_output_tokens") or 0 + ), + "total_cost_usd": agent_result.get("cost_usd") or 0.0, + }, + } + results.append(eval_entry) + logger.info( + f"Processed trial {instance_id}: reward={rewards.get('reward', 'N/A')}" + ) + + except (json.JSONDecodeError, OSError) as e: + logger.error(f"Failed to process result file {result_file}: {e}") + errors.append( + { + "instance_id": result_file.parent.name, + "error": str(e), + "test_result": {}, + } + ) + + if not results and not errors: + raise RuntimeError(f"No trials processed from {harbor_output_dir}") + + if not results: + logger.warning( + f"All {len(errors)} trials failed in {harbor_output_dir}; " + "writing error entries for downstream reporting" + ) + + # Write results to output.jsonl + with open(eval_output_path, "w") as f: + for entry in results: + f.write(json.dumps(entry) + "\n") + for entry in errors: + f.write(json.dumps(entry) + "\n") + + logger.info( + f"Wrote {len(results)} successful + {len(errors)} failed entries " + f"to {eval_output_path}" + ) + + +def load_task_ids_from_file(filepath: str) -> list[str]: + """Load task IDs from a text file (one per line).""" + task_ids = [] + with open(filepath) as f: + for line in f: + line = line.strip() + if line and not line.startswith("#"): + task_ids.append(line) + return task_ids + + +def main() -> None: + """Main entry point for skillsbench inference.""" + parser = argparse.ArgumentParser( + description="Run SkillsBench evaluation with openhands-sdk via Harbor", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Run full skillsbench evaluation + uv run skillsbench-infer .llm_config/claude.json + + # Run specific tasks + uv run skillsbench-infer .llm_config/claude.json --select tasks.txt + + # Run with custom dataset version + uv run skillsbench-infer .llm_config/claude.json --dataset benchflow/skillsbench@1.0 + """, + ) + + parser.add_argument( + "llm_config_path", + type=str, + help="Path to JSON LLM configuration file", + ) + parser.add_argument( + "--dataset", + type=str, + default=INFER_DEFAULTS["dataset"], + help="Harbor dataset name (e.g., benchflow/skillsbench)", + ) + parser.add_argument( + "--output-dir", + type=str, + default=INFER_DEFAULTS["output_dir"], + help="Base output directory for evaluation results", + ) + parser.add_argument( + "--num-workers", + type=int, + default=INFER_DEFAULTS["num_workers"], + help="Number of parallel workers", + ) + 
parser.add_argument( + "--n-limit", + type=int, + help="Maximum number of dataset tasks to run after Harbor filtering", + ) + parser.add_argument( + "--select", + type=str, + help="Path to text file containing task IDs to run (one per line)", + ) + parser.add_argument( + "--task-id", + type=str, + action="append", + help="Specific task ID to run (can be specified multiple times)", + ) + parser.add_argument( + "--note", + type=str, + help="Optional note for the evaluation run", + ) + parser.add_argument( + "--skip-harbor", + action="store_true", + help="Skip running harbor and only convert existing results", + ) + + args = parser.parse_args() + + # Validate LLM config + if not os.path.isfile(args.llm_config_path): + logger.error(f"LLM config file does not exist: {args.llm_config_path}") + sys.exit(1) + + with open(args.llm_config_path) as f: + llm_config = f.read() + llm = LLM.model_validate_json(llm_config) + logger.info(f"Using LLM: {llm.model}") + + # Check harbor installation + if not args.skip_harbor and not check_harbor_installed(): + logger.error( + "Harbor CLI is not installed. Please install it:\n" + " pip install harbor\n" + " # or\n" + " uv pip install harbor" + ) + sys.exit(1) + + # Construct output directory + dataset_description = args.dataset.replace("/", "__").replace("@", "-") + structured_output_dir = construct_eval_output_dir( + base_dir=args.output_dir, + dataset_name=dataset_description, + model_name=llm.model, + max_iterations=100, # Not directly used but required for path construction + eval_note=args.note, + ) + + logger.info(f"Output directory: {structured_output_dir}") + os.makedirs(structured_output_dir, exist_ok=True) + + # Save metadata + metadata = { + "llm": llm.model_dump_json(), + "dataset": args.dataset, + "timestamp": datetime.now(timezone.utc).isoformat(), + "harbor_agent": HARBOR_DEFAULTS["agent_name"], + "note": args.note, + } + metadata_path = Path(structured_output_dir) / "metadata.json" + with open(metadata_path, "w") as f: + json.dump(metadata, f, indent=2) + + # Collect task IDs if specified + task_ids: list[str] | None = None + if args.select: + loaded_ids = load_task_ids_from_file(args.select) + task_ids = loaded_ids + logger.info(f"Loaded {len(loaded_ids)} task IDs from {args.select}") + elif args.task_id: + task_ids = list(args.task_id) + logger.info(f"Running {len(task_ids)} specified task IDs") + + output_path = Path(structured_output_dir) / OUTPUT_FILENAME + + if not args.skip_harbor: + # Run harbor evaluation + try: + harbor_output_dir = run_harbor_evaluation( + llm=llm, + dataset=args.dataset, + output_dir=structured_output_dir, + num_workers=args.num_workers, + task_ids=task_ids, + n_limit=args.n_limit, + ) + + # Convert harbor output to standard format + convert_harbor_to_eval_output( + harbor_output_dir=harbor_output_dir, + eval_output_path=output_path, + ) + + except Exception as e: + logger.error(f"Evaluation failed: {e}") + sys.exit(1) + else: + # Skip harbor, just convert existing results + harbor_output_dir = Path(structured_output_dir) / "harbor_output" + if harbor_output_dir.exists(): + convert_harbor_to_eval_output( + harbor_output_dir=harbor_output_dir, + eval_output_path=output_path, + ) + else: + logger.error(f"No harbor output found at {harbor_output_dir}") + sys.exit(1) + + # Generate cost report + if output_path.exists(): + generate_cost_report(str(output_path)) + + logger.info("SkillsBench inference completed!") + print(json.dumps({"output_json": str(output_path)})) + + +if __name__ == "__main__": + main() diff --git 
a/benchmarks/utils/report_costs.py b/benchmarks/utils/report_costs.py index 8f38909f3..7a21a3831 100755 --- a/benchmarks/utils/report_costs.py +++ b/benchmarks/utils/report_costs.py @@ -48,7 +48,9 @@ def extract_accumulated_cost(jsonl_data: List[Optional[Dict]]) -> float: if entry is None: continue metrics = entry.get("metrics") or {} - accumulated_cost = metrics.get("accumulated_cost") + accumulated_cost = metrics.get("accumulated_cost") or metrics.get( + "total_cost_usd" + ) if accumulated_cost is not None: total_cost += float(accumulated_cost) diff --git a/pyproject.toml b/pyproject.toml index 69f7b2df5..79c38c8c0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -70,6 +70,8 @@ swebenchmultilingual-eval = "benchmarks.swebenchmultilingual.eval_infer:main" swefficiency-infer = "benchmarks.swefficiency.run_infer:main" terminalbench-infer = "benchmarks.terminalbench.run_infer:main" terminalbench-eval = "benchmarks.terminalbench.eval_infer:main" +skillsbench-infer = "benchmarks.skillsbench.run_infer:main" +skillsbench-eval = "benchmarks.skillsbench.eval_infer:main" [build-system] requires = ["setuptools>=61.0", "wheel"] From 2bb3266da6039ac8dc97f36c8ec22240da48f18c Mon Sep 17 00:00:00 2001 From: Chujun Tao Date: Sun, 5 Apr 2026 19:44:33 -0400 Subject: [PATCH 02/12] add skillsbench tests --- tests/test_skillsbench_eval_infer.py | 125 +++++++++++++++ tests/test_skillsbench_run_infer.py | 221 +++++++++++++++++++++++++++ 2 files changed, 346 insertions(+) create mode 100644 tests/test_skillsbench_eval_infer.py create mode 100644 tests/test_skillsbench_run_infer.py diff --git a/tests/test_skillsbench_eval_infer.py b/tests/test_skillsbench_eval_infer.py new file mode 100644 index 000000000..56d54f27a --- /dev/null +++ b/tests/test_skillsbench_eval_infer.py @@ -0,0 +1,125 @@ +"""Tests for SkillsBench eval_infer module.""" + +import json +from pathlib import Path + +from benchmarks.skillsbench.eval_infer import process_skillsbench_results + + +class TestProcessSkillsbenchResults: + """Tests for the process_skillsbench_results function.""" + + def test_empty_input(self, tmp_path: Path) -> None: + """Test processing empty input file.""" + input_file = tmp_path / "empty.jsonl" + output_file = tmp_path / "empty.report.json" + input_file.write_text("") + + result = process_skillsbench_results(str(input_file), str(output_file)) + + assert result["total_instances"] == 0 + assert result["completed_instances"] == 0 + assert result["resolved_instances"] == 0 + + def test_resolved_instance(self, tmp_path: Path) -> None: + """Test processing a resolved (passed=True) instance.""" + input_file = tmp_path / "resolved.jsonl" + output_file = tmp_path / "resolved.report.json" + + entry = { + "instance_id": "benchflow/weighted-gdp-calc", + "test_result": {"passed": True, "rewards": {"reward": 1.0}}, + "error": None, + } + input_file.write_text(json.dumps(entry) + "\n") + + result = process_skillsbench_results(str(input_file), str(output_file)) + + assert result["resolved_instances"] == 1 + assert result["unresolved_instances"] == 0 + assert "benchflow/weighted-gdp-calc" in result["resolved_ids"] + + def test_unresolved_instance(self, tmp_path: Path) -> None: + """Test processing an unresolved (passed=False) instance.""" + input_file = tmp_path / "unresolved.jsonl" + output_file = tmp_path / "unresolved.report.json" + + entry = { + "instance_id": "benchflow/task-1", + "test_result": {"passed": False, "rewards": {"reward": 0.0}}, + "error": None, + } + input_file.write_text(json.dumps(entry) + "\n") + + result = 
process_skillsbench_results(str(input_file), str(output_file)) + + assert result["resolved_instances"] == 0 + assert result["unresolved_instances"] == 1 + + def test_instance_with_error(self, tmp_path: Path) -> None: + """Test processing an instance that errored.""" + input_file = tmp_path / "error.jsonl" + output_file = tmp_path / "error.report.json" + + entry = { + "instance_id": "benchflow/error-task", + "test_result": {}, + "error": "ValueError: LLM_API_KEY environment variable must be set", + } + input_file.write_text(json.dumps(entry) + "\n") + + result = process_skillsbench_results(str(input_file), str(output_file)) + + assert result["error_instances"] == 1 + assert result["incomplete_instances"] == 1 + assert result["completed_instances"] == 0 + assert "benchflow/error-task" in result["error_ids"] + + def test_multiple_instances(self, tmp_path: Path) -> None: + """Test processing multiple instances with mixed results.""" + input_file = tmp_path / "multi.jsonl" + output_file = tmp_path / "multi.report.json" + + entries = [ + { + "instance_id": "benchflow/task-1", + "test_result": {"passed": True}, + "error": None, + }, + { + "instance_id": "benchflow/task-2", + "test_result": {"passed": False}, + "error": None, + }, + {"instance_id": "benchflow/task-3", "test_result": {}, "error": "Timeout"}, + ] + input_file.write_text("\n".join(json.dumps(e) for e in entries) + "\n") + + result = process_skillsbench_results(str(input_file), str(output_file)) + + assert result["total_instances"] == 3 + assert result["completed_instances"] == 2 + assert result["resolved_instances"] == 1 + assert result["unresolved_instances"] == 1 + assert result["error_instances"] == 1 + + def test_report_file_written(self, tmp_path: Path) -> None: + """Test that report file is written correctly.""" + input_file = tmp_path / "input.jsonl" + output_file = tmp_path / "output.report.json" + + entry = { + "instance_id": "benchflow/task-1", + "test_result": {"passed": True}, + "error": None, + } + input_file.write_text(json.dumps(entry) + "\n") + + process_skillsbench_results(str(input_file), str(output_file)) + + assert output_file.exists() + with open(output_file) as f: + report = json.load(f) + assert "total_instances" in report + assert "resolved_ids" in report + assert "aggregate_metrics" in report diff --git a/tests/test_skillsbench_run_infer.py b/tests/test_skillsbench_run_infer.py new file mode 100644 index 000000000..5f8452cb3 --- /dev/null +++ b/tests/test_skillsbench_run_infer.py @@ -0,0 +1,221 @@ +"""Tests for SkillsBench run_infer module.""" + +import json +from pathlib import Path + +import pytest + +from benchmarks.skillsbench.config import INFER_DEFAULTS +from benchmarks.skillsbench.run_infer import ( + convert_harbor_to_eval_output, + run_harbor_evaluation, +) +from openhands.sdk import LLM + + +class TestRunHarborEvaluation: + """Tests for building Harbor invocation arguments.""" + + def test_default_dataset_matches_harbor_registry(self) -> None: + """Test that the default dataset name matches Harbor's published registry.""" + assert INFER_DEFAULTS["dataset"] == "benchflow/skillsbench" + + def test_run_harbor_evaluation_passes_filters_and_limits( + self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch + ) -> None: + """Test Harbor command includes task filters and n-limit.""" + captured: dict[str, list[str]] = {} + + def fake_run(cmd: list[str], capture_output: bool, text: bool, env: dict): + captured["cmd"] = cmd + return type( + "Completed", + (), + {"returncode": 0, "stdout": "ok", "stderr": ""}, + 
)() + + monkeypatch.setattr("benchmarks.skillsbench.run_infer.subprocess.run", fake_run) + + harbor_output_dir = run_harbor_evaluation( + llm=LLM( + model="litellm_proxy/test-model", + api_key="test-key", + base_url="https://proxy.example.com", + ), + dataset=INFER_DEFAULTS["dataset"], + output_dir=str(tmp_path), + num_workers=2, + task_ids=["benchflow/task-a", "benchflow/task-b"], + n_limit=3, + ) + + expected_output_dir = tmp_path / "harbor_output" + assert harbor_output_dir == expected_output_dir + + cmd = captured["cmd"] + assert cmd[:8] == [ + "harbor", + "run", + "-d", + "benchflow/skillsbench", + "-a", + "openhands-sdk", + "-m", + "litellm_proxy/test-model", + ] + assert "--jobs-dir" in cmd + assert str(expected_output_dir.resolve()) in cmd + assert cmd.count("--include-task-name") == 2 + assert "benchflow/task-a" in cmd + assert "benchflow/task-b" in cmd + assert cmd[cmd.index("--n-concurrent") + 1] == "2" + assert cmd[cmd.index("--n-tasks") + 1] == "3" + + def test_llm_credentials_passed_via_env( + self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch + ) -> None: + """Test that LLM credentials are passed via subprocess env, not --ae flags.""" + captured: dict = {} + + def fake_run(cmd: list[str], capture_output: bool, text: bool, env: dict): + captured["cmd"] = cmd + captured["env"] = env + return type( + "Completed", + (), + {"returncode": 0, "stdout": "ok", "stderr": ""}, + )() + + monkeypatch.setattr("benchmarks.skillsbench.run_infer.subprocess.run", fake_run) + + run_harbor_evaluation( + llm=LLM( + model="test-model", + api_key="my-secret-key", + base_url="https://my-proxy.example.com", + ), + dataset=INFER_DEFAULTS["dataset"], + output_dir=str(tmp_path), + ) + + assert captured["env"]["LLM_API_KEY"] == "my-secret-key" + assert captured["env"]["LLM_BASE_URL"] == "https://my-proxy.example.com" + + +class TestConvertHarborToEvalOutput: + """Tests for convert_harbor_to_eval_output function.""" + + def _create_harbor_structure( + self, tmp_path: Path, trials: list[tuple[str, dict]] + ) -> Path: + """Create a mock Harbor output structure.""" + harbor_dir = tmp_path / "harbor_output" + job_dir = harbor_dir / "2026-01-01__00-00-00" + job_dir.mkdir(parents=True) + (job_dir / "result.json").write_text(json.dumps({"id": "test-job"})) + + for trial_name, trial_result in trials: + trial_dir = job_dir / trial_name + trial_dir.mkdir() + (trial_dir / "result.json").write_text(json.dumps(trial_result)) + + return harbor_dir + + def test_successful_trial_parsing(self, tmp_path: Path) -> None: + """Test successful parsing of harbor trial result.""" + trial_result = { + "task_name": "benchflow/weighted-gdp-calc", + "trial_name": "weighted-gdp-calc__abc123", + "trial_uri": "file:///path/to/trial", + "agent_result": { + "n_input_tokens": 1000, + "n_output_tokens": 200, + "cost_usd": 0.05, + }, + "verifier_result": {"rewards": {"reward": 1.0}}, + "exception_info": None, + } + + harbor_dir = self._create_harbor_structure( + tmp_path, [("weighted-gdp-calc__abc123", trial_result)] + ) + output_file = tmp_path / "output.jsonl" + + convert_harbor_to_eval_output(harbor_dir, output_file) + + assert output_file.exists() + with open(output_file) as f: + entries = [json.loads(line) for line in f] + + assert len(entries) == 1 + assert entries[0]["instance_id"] == "benchflow/weighted-gdp-calc" + assert entries[0]["test_result"]["passed"] is True + assert entries[0]["metrics"]["total_cost_usd"] == 0.05 + + def test_failed_trial(self, tmp_path: Path) -> None: + """Test parsing of a trial with reward 0.""" + 
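        # agent_result values are None here; the converter should coerce them to 0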
trial_result = { + "task_name": "benchflow/task-1", + "trial_name": "task-1__xyz", + "agent_result": { + "n_input_tokens": None, + "n_output_tokens": None, + "cost_usd": None, + }, + "verifier_result": {"rewards": {"reward": 0.0}}, + "exception_info": None, + } + + harbor_dir = self._create_harbor_structure( + tmp_path, [("task-1__xyz", trial_result)] + ) + output_file = tmp_path / "output.jsonl" + convert_harbor_to_eval_output(harbor_dir, output_file) + + with open(output_file) as f: + entries = [json.loads(line) for line in f] + + assert entries[0]["test_result"]["passed"] is False + assert entries[0]["metrics"]["total_cost_usd"] == 0.0 + + def test_trial_with_exception(self, tmp_path: Path) -> None: + """Test that exception trials are written as error entries.""" + trial_result = { + "task_name": "benchflow/error-task", + "trial_name": "error-task__err", + "agent_result": {}, + "verifier_result": {}, + "exception_info": {"type": "ValueError", "message": "LLM_API_KEY not set"}, + } + + harbor_dir = self._create_harbor_structure( + tmp_path, [("error-task__err", trial_result)] + ) + output_file = tmp_path / "output.jsonl" + convert_harbor_to_eval_output(harbor_dir, output_file) + + with open(output_file) as f: + entries = [json.loads(line) for line in f] + + assert len(entries) == 1 + assert entries[0]["instance_id"] == "benchflow/error-task" + assert entries[0]["error"] is not None + assert entries[0]["test_result"] == {} + + def test_missing_job_directory(self, tmp_path: Path) -> None: + """Test handling when no job directory exists.""" + harbor_dir = tmp_path / "harbor_output" + harbor_dir.mkdir() + + with pytest.raises(RuntimeError, match="No harbor job directory found"): + convert_harbor_to_eval_output(harbor_dir, tmp_path / "output.jsonl") + + def test_empty_job_directory(self, tmp_path: Path) -> None: + """Test handling of harbor job dir with no trial subdirs.""" + harbor_dir = tmp_path / "harbor_output" + job_dir = harbor_dir / "2026-01-01__00-00-00" + job_dir.mkdir(parents=True) + (job_dir / "result.json").write_text(json.dumps({"id": "test"})) + + with pytest.raises(RuntimeError, match="No trial result files found"): + convert_harbor_to_eval_output(harbor_dir, tmp_path / "output.jsonl") From 4d31c87c8fb0c7ff8341bc37debe06851ad2a67b Mon Sep 17 00:00:00 2001 From: Chujun Tao Date: Wed, 22 Apr 2026 20:47:30 -0400 Subject: [PATCH 03/12] feat(skillsbench): migrate harness from Harbor to benchflow 0.3.0 Switch the SkillsBench evaluation harness from Harbor/openhands-sdk to benchflow 0.3.0 with the native openhands ACP agent. 
Key changes: - Replace Harbor-specific logic with benchflow CLI invocation (`bench eval create -f config.yaml` / legacy `benchflow job --config`) - Add sparse-checkout task download to avoid cloning the full skillsbench repo - Fix metrics extraction: benchflow 0.3.0 result.json omits cost/token fields; now reads from agent/trajectory.json (harbor-format) or parses agent/openhands.txt stdout (ACP agent) - Fix timestamp detection with regex (_TIMESTAMP_RE) to correctly identify benchflow 0.3.0 job dirs (YYYY-MM-DD__HH-MM-SS) vs plain task dirs - Fix openhands install failure on Ubuntu 24.04 (PEP 668) by injecting PIP_BREAK_SYSTEM_PACKAGES=1 into agent_env - Add provider-specific env var injection for direct Gemini/Anthropic models - Update README and config to reflect benchflow harness Co-Authored-By: Claude Sonnet 4.6 --- .gitignore | 1 + benchmarks/skillsbench/README.md | 42 +- benchmarks/skillsbench/config.py | 11 +- benchmarks/skillsbench/run_infer.py | 655 ++++++++++++++++++--------- tests/test_skillsbench_eval_infer.py | 17 - tests/test_skillsbench_run_infer.py | 442 +++++++++++++----- uv.lock | 50 +- vendor/software-agent-sdk | 2 +- 8 files changed, 824 insertions(+), 396 deletions(-) diff --git a/.gitignore b/.gitignore index 459fad588..9164fd12b 100644 --- a/.gitignore +++ b/.gitignore @@ -216,4 +216,5 @@ workspace/ # Evaluation outputs eval_outputs/ +evaluation_outputs/ builds/ diff --git a/benchmarks/skillsbench/README.md b/benchmarks/skillsbench/README.md index 60ff73652..21339842c 100644 --- a/benchmarks/skillsbench/README.md +++ b/benchmarks/skillsbench/README.md @@ -1,10 +1,10 @@ # SkillsBench Evaluation -This module provides integration with [SkillsBench](https://www.skillsbench.ai/), a benchmark for evaluating AI agents on real-world skill-based tasks. The integration uses [Harbor](https://harborframework.com) as the evaluation harness with the `openhands-sdk` agent. +This module provides integration with [SkillsBench](https://www.skillsbench.ai/), a benchmark for evaluating AI agents on real-world skill-based tasks. The integration uses [benchflow](https://github.com/benchflow-ai/benchflow) as the evaluation harness with the `openhands` agent. ## Overview -SkillsBench comprises tasks across 11 domains, evaluating the efficacy of Skills augmentation in LLM-based agents.Domains contain +SkillsBench comprises tasks across 11 domains, evaluating the efficacy of Skills augmentation in LLM-based agents. Domains include: - Software engineering - Office & white collar @@ -20,23 +20,25 @@ SkillsBench comprises tasks across 11 domains, evaluating the efficacy of Skills ## Prerequisites -1. **Install Harbor**: Harbor is the official harness for running SkillsBench. +1. **Install benchflow**: benchflow is the official harness for running SkillsBench. ```bash - pip install harbor + uv tool install benchflow==0.3.0 # or - uv pip install harbor + pip install benchflow==0.3.0 + # or + uv pip install benchflow==0.3.0 ``` -2. **Docker**: Harbor requires Docker to be installed and running. +2. **Docker**: benchflow requires Docker to be installed and running. -3. **LLM API Key**: Configure your LLM provider credentials. +3. **LLM API Key**: Configure your LLM provider credentials. The benchflow `openhands` agent reads `LLM_API_KEY` and optional `LLM_BASE_URL` from the environment. 
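   An illustrative sketch of those variables (the `skillsbench-infer` wrapper sets them automatically from the LLM config file):

   ```bash
   export LLM_API_KEY="YOUR_API_KEY"
   export LLM_BASE_URL="https://your-proxy.example.com"  # optional, e.g. for a LiteLLM proxy
   ```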
## Usage ### Running Inference -Run the SkillsBench evaluation using the OpenHands SDK agent: +Run the SkillsBench evaluation using the `openhands` agent: ```bash # Run full evaluation @@ -62,7 +64,7 @@ Create an LLM configuration file (e.g., `.llm_config/claude.json`): ```json { "model": "anthropic/claude-sonnet-4-20250514", - "api_key": "YOUR_API_KEY" + "api_key": "YOUR_ANTHROPIC_API_KEY" } ``` @@ -99,8 +101,6 @@ Each line contains: { "instance_id": "benchflow/task-name", "test_result": { - "trial_name": "...", - "trial_uri": "...", "rewards": {"reward": 1.0}, "passed": true }, @@ -134,22 +134,21 @@ Each line contains: ## Architecture -The integration follows the Harbor agent adapter pattern: +The integration uses the benchflow CLI as the evaluation harness: -1. **Harbor Harness**: Manages task containers and lifecycle -2. **OpenHands SDK Agent**: Runs inside containers to solve tasks -3. **ATIF Trajectories**: Results stored in Agent Trajectory Interchange Format +1. **Task download**: the integration clones the SkillsBench task repo locally when the task cache is empty +2. **benchflow job**: Runs all tasks concurrently with `openhands` +3. **Result conversion**: Trial `result.json` files are converted to the standard `output.jsonl` format ```text ┌──────────────────────────────────────────────────┐ -│ Harbor Harness │ +│ benchflow job │ │ ┌────────────────────────────────────────────┐ │ -│ │ Task Container │ │ +│ │ Task Container (Docker) │ │ │ │ ┌──────────────────────────────────────┐ │ │ -│ │ │ OpenHands SDK Agent │ │ │ +│ │ │ openhands │ │ │ │ │ │ - Terminal tool │ │ │ │ │ │ - File editor tool │ │ │ -│ │ │ - Task tracker tool │ │ │ │ │ └──────────────────────────────────────┘ │ │ │ └────────────────────────────────────────────┘ │ └──────────────────────────────────────────────────┘ @@ -158,6 +157,5 @@ The integration follows the Harbor agent adapter pattern: ## References - [SkillsBench](https://www.skillsbench.ai/) - The benchmark -- [Harbor](https://harborframework.com) - The evaluation harness -- [OpenHands SDK](https://github.com/OpenHands/software-agent-sdk) - The agent SDK -- [ATIF Specification](https://github.com/laude-institute/harbor/blob/main/docs/rfcs/0001-trajectory-format.md) - Trajectory format +- [benchflow](https://github.com/benchflow-ai/benchflow) - The evaluation harness +- [benchflow CLI reference](https://github.com/benchflow-ai/benchflow/blob/main/docs/cli-reference.md) - CLI documentation diff --git a/benchmarks/skillsbench/config.py b/benchmarks/skillsbench/config.py index 8b55a92b0..4ed541ab9 100644 --- a/benchmarks/skillsbench/config.py +++ b/benchmarks/skillsbench/config.py @@ -1,16 +1,13 @@ """SkillsBench configuration defaults.""" -# Default inference settings (only include values actually used by argparse) +# Default inference settings INFER_DEFAULTS = { "dataset": "benchflow/skillsbench", "output_dir": "./evaluation_outputs", "num_workers": 1, } -# Harbor configuration defaults -HARBOR_DEFAULTS = { - # Harbor executable - "harbor_executable": "harbor", - # Default agent name for openhands-sdk - "agent_name": "openhands-sdk", +# benchflow configuration defaults +BENCHFLOW_DEFAULTS = { + "agent_name": "openhands", } diff --git a/benchmarks/skillsbench/run_infer.py b/benchmarks/skillsbench/run_infer.py index a8afa7281..2e11a100a 100644 --- a/benchmarks/skillsbench/run_infer.py +++ b/benchmarks/skillsbench/run_infer.py @@ -1,24 +1,31 @@ -"""SkillsBench inference script using Harbor with openhands-sdk agent. 
+"""SkillsBench inference script using the benchflow SDK. -This script runs SkillsBench evaluation using Harbor as the harness -and openhands-sdk as the agent. Results are saved in a format compatible +This script runs SkillsBench evaluation using `benchflow job` as the harness +and `openhands` as the default agent. Results are saved in a format compatible with the standard evaluation pipeline. Usage: - uv run skillsbench-infer --dataset benchflow/skillsbench + uv run skillsbench-infer + + # Run specific tasks + uv run skillsbench-infer --select tasks.txt """ import argparse import json import os +import re +import shutil import subprocess import sys +import tempfile from datetime import datetime, timezone from pathlib import Path +import yaml from pydantic import SecretStr -from benchmarks.skillsbench.config import HARBOR_DEFAULTS, INFER_DEFAULTS +from benchmarks.skillsbench.config import BENCHFLOW_DEFAULTS, INFER_DEFAULTS from benchmarks.utils.evaluation_utils import construct_eval_output_dir from benchmarks.utils.report_costs import generate_cost_report from openhands.sdk import LLM, get_logger @@ -26,256 +33,469 @@ logger = get_logger(__name__) -# Output filename for results +# Matches benchflow 0.3.0 job directory names: YYYY-MM-DD__HH-MM-SS +_TIMESTAMP_RE = re.compile(r"^\d{4}-\d{2}-\d{2}__\d{2}-\d{2}-\d{2}$") + +# "Total cost: $0.0487" +_COST_RE = re.compile(r"Total cost:\s*\$([0-9]+(?:\.[0-9]+)?)") +# "Tokens: ↑ input 404.21K • ... • ↓ output 7.83K" +_TOKENS_RE = re.compile(r"↑ input\s+([\d.]+)([KMB]?)\b.*?↓ output\s+([\d.]+)([KMB]?)\b") + OUTPUT_FILENAME = "output.jsonl" +TASK_REPOS = { + "skillsbench": { + "repo": "https://github.com/benchflow-ai/skillsbench.git", + "subdir": "tasks", + } +} + +_DIRECT_PROVIDER_ENV_VARS: dict[str, tuple[tuple[str, ...], str | None]] = { + "anthropic": (("ANTHROPIC_API_KEY",), "ANTHROPIC_BASE_URL"), + "gemini": (("GEMINI_API_KEY", "GOOGLE_API_KEY"), "GEMINI_BASE_URL"), + "google": (("GEMINI_API_KEY", "GOOGLE_API_KEY"), "GEMINI_BASE_URL"), + "openai": (("OPENAI_API_KEY",), "OPENAI_BASE_URL"), +} + + +def _infer_direct_provider(model: str) -> str | None: + """Infer the provider prefix for direct model names. + + Examples: + - gemini/gemini-2.5-pro -> gemini + - anthropic/claude-sonnet-4-5 -> anthropic + - litellm_proxy/anthropic/... -> None (proxy config uses LLM_* vars) + """ + if not model or model.startswith("litellm_proxy/"): + return None + if "/" in model: + provider = model.split("/", 1)[0].lower() + if provider in _DIRECT_PROVIDER_ENV_VARS: + return provider + return None + + +def _build_benchflow_agent_env(llm: LLM) -> dict[str, str]: + """Build the sandbox environment for benchflow's openhands agent. + + Only LLM-specific variables are returned — these go INTO the sandbox + container via the ``agent_env`` YAML key. The calling process inherits + the host environment normally; dumping ``os.environ`` here would leak + the entire host env into every container. 
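    Example (hypothetical key): for ``model="gemini/gemini-2.5-pro"`` with
    ``api_key="sk-test"``, the returned mapping sets ``LLM_API_KEY``,
    ``GEMINI_API_KEY`` and ``GOOGLE_API_KEY`` to that key.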
+ """ + env: dict[str, str] = {} + api_key: str | None = None + if llm.api_key: + api_key = ( + llm.api_key.get_secret_value() + if isinstance(llm.api_key, SecretStr) + else llm.api_key + ) + env["LLM_API_KEY"] = api_key + if llm.base_url: + env["LLM_BASE_URL"] = llm.base_url + + provider = _infer_direct_provider(llm.model) + if provider and api_key: + key_vars, base_url_var = _DIRECT_PROVIDER_ENV_VARS[provider] + for var_name in key_vars: + env[var_name] = api_key + if llm.base_url and base_url_var: + env[base_url_var] = llm.base_url + + return env + + +def check_benchflow_installed() -> bool: + """Check if benchflow CLI is installed and available. + + Tries ``bench`` first (current name), then falls back to the legacy + ``benchflow`` binary. + """ + for cmd in ("bench", "benchflow"): + try: + result = subprocess.run( + [cmd, "--help"], + capture_output=True, + text=True, + timeout=10, + ) + if result.returncode == 0: + return True + except (FileNotFoundError, subprocess.TimeoutExpired): + continue + return False + -def check_harbor_installed() -> bool: - """Check if harbor CLI is installed and available.""" - harbor_exe = HARBOR_DEFAULTS["harbor_executable"] +def _resolve_task_repo(dataset: str) -> tuple[str, dict[str, str]]: + """Map a benchflow dataset name to its task repository metadata.""" + dataset_name = dataset.split("@", 1)[0].split("/")[-1] try: - result = subprocess.run( - [harbor_exe, "--version"], + return dataset_name, TASK_REPOS[dataset_name] + except KeyError as exc: + raise ValueError( + f"Unsupported SkillsBench dataset: {dataset!r}. " + f"Known datasets: {sorted(TASK_REPOS)}" + ) from exc + + +def ensure_tasks( + dataset: str, + tasks_dir: Path, + task_ids: list[str] | None = None, +) -> None: + """Download tasks for a benchflow dataset into tasks_dir. + + BenchFlow 0.3.0 does not expose ``benchflow tasks pull``, so we clone the + benchmark task repository directly when the local tasks directory is empty. + + When *task_ids* is provided, a sparse checkout is used so only the + requested task subdirectories are downloaded — much faster than a full + clone for large repos. 
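    For example, requesting only ``benchflow/weighted-gdp-calc`` would check out
    just the ``tasks/weighted-gdp-calc`` subtree of the task repository.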
+ """ + if tasks_dir.exists() and any(tasks_dir.iterdir()): + logger.info(f"Tasks already present in {tasks_dir}, skipping download") + return + + _, repo_info = _resolve_task_repo(dataset) + tasks_dir.mkdir(parents=True, exist_ok=True) + clone_dir = tasks_dir.parent / "_clone" + if clone_dir.exists(): + shutil.rmtree(clone_dir, ignore_errors=True) + + subdir = repo_info.get("subdir", "") + + if task_ids: + # Sparse checkout: only download the specific task directories + short_names = [tid.split("/")[-1] for tid in task_ids] + + cmd_clone = [ + "git", + "clone", + "--no-checkout", + "--depth", + "1", + repo_info["repo"], + str(clone_dir), + ] + logger.info(f"Sparse clone: {' '.join(cmd_clone)}") + result = subprocess.run(cmd_clone, capture_output=True, text=True) + if result.returncode != 0: + raise RuntimeError(f"task download failed: {result.stderr}") + + # Init sparse-checkout and set the desired paths + subprocess.run( + ["git", "-C", str(clone_dir), "sparse-checkout", "init", "--cone"], capture_output=True, text=True, - timeout=10, + check=True, ) - return result.returncode == 0 - except (FileNotFoundError, subprocess.TimeoutExpired): - return False + sparse_paths = [f"{subdir}/{name}" if subdir else name for name in short_names] + subprocess.run( + ["git", "-C", str(clone_dir), "sparse-checkout", "set", *sparse_paths], + capture_output=True, + text=True, + check=True, + ) + subprocess.run( + ["git", "-C", str(clone_dir), "checkout"], + capture_output=True, + text=True, + check=True, + ) + else: + # Full shallow clone + cmd = ["git", "clone", "--depth", "1", repo_info["repo"], str(clone_dir)] + logger.info(f"Downloading tasks: {' '.join(cmd)}") + result = subprocess.run(cmd, capture_output=True, text=True) + if result.returncode != 0: + logger.error(f"Failed to clone tasks: {result.stderr}") + raise RuntimeError(f"task download failed: {result.stderr}") + + try: + source_dir = clone_dir / subdir if subdir else clone_dir + + for entry in source_dir.iterdir(): + target = tasks_dir / entry.name + if entry.is_dir(): + shutil.copytree(entry, target, dirs_exist_ok=True) + else: + shutil.copy2(entry, target) + finally: + shutil.rmtree(clone_dir, ignore_errors=True) + logger.info(f"Tasks downloaded to {tasks_dir}") -def run_harbor_evaluation( + +def run_benchflow_job( llm: LLM, - dataset: str, - output_dir: str, + tasks_dir: Path, + jobs_dir: Path, num_workers: int = 1, task_ids: list[str] | None = None, - n_limit: int | None = None, ) -> Path: - """Run harbor evaluation with openhands-sdk agent. + """Run benchflow job command. Args: llm: LLM configuration for the agent. - dataset: Harbor dataset name (e.g., benchflow/skillsbench). - output_dir: Directory to store output files. - num_workers: Number of parallel workers. - task_ids: Optional list of specific task IDs to run. - n_limit: Optional maximum number of dataset tasks to run. + tasks_dir: Path to directory containing task subdirectories. + jobs_dir: Directory for benchflow job output. + num_workers: Number of parallel workers (concurrency). + task_ids: Optional list of task IDs to filter (short names, not full paths). Returns: - Path to the harbor output directory. + Path to jobs_dir. """ - harbor_output_dir = Path(output_dir) / "harbor_output" - harbor_output_dir.mkdir(parents=True, exist_ok=True) - harbor_exe = HARBOR_DEFAULTS["harbor_executable"] - - # Build harbor command using harbor CLI flags. - # Use absolute path for --jobs-dir to avoid CWD-relative path issues. 
- cmd = [ - harbor_exe, - "run", - "-d", - dataset, - "-a", - HARBOR_DEFAULTS["agent_name"], - "-m", - llm.model, - "--jobs-dir", - str(harbor_output_dir.resolve()), - "--n-concurrent", - str(num_workers), - ] - - # Pass LLM credentials as agent environment variables - if llm.api_key: - api_key = ( - llm.api_key.get_secret_value() - if isinstance(llm.api_key, SecretStr) - else llm.api_key - ) - cmd.extend(["--ae", f"LLM_API_KEY={api_key}"]) - if llm.base_url: - cmd.extend(["--ae", f"LLM_BASE_URL={llm.base_url}"]) + jobs_dir.mkdir(parents=True, exist_ok=True) + + agent_env = _build_benchflow_agent_env(llm) + # Ubuntu 24.04 enforces PEP 668 and blocks bare `pip install` without + # --break-system-packages. benchflow's openhands install_cmd uses plain + # `pip install openhands`, which silently fails (exit 0) on Ubuntu 24.04, + # causing "Agent openhands install failed (rc=1)". Setting this env var + # makes pip skip the restriction without modifying the install_cmd. + agent_env.setdefault("PIP_BREAK_SYSTEM_PACKAGES", "1") + config = { + "tasks_dir": str(tasks_dir), + "jobs_dir": str(jobs_dir.resolve()), + "agent": BENCHFLOW_DEFAULTS["agent_name"], + "model": llm.model, + "environment": "docker", + "concurrency": num_workers, + # OpenHands is installed inside the sandbox as root by benchflow's + # registry install command. Running as the default "agent" user can + # lose access to that binary on some task images. + "sandbox_user": None, + "agent_env": agent_env, + } - # Add specific task names if provided - if task_ids: - for task_id in task_ids: - cmd.extend(["--include-task-name", task_id]) + with tempfile.NamedTemporaryFile( + mode="w", suffix=".yaml", prefix="benchflow-job-", delete=False + ) as tmp: + yaml.safe_dump(config, tmp, sort_keys=False) + config_path = tmp.name - if n_limit is not None: - cmd.extend(["--n-tasks", str(n_limit)]) + # Prefer `bench eval create` (current), fall back to legacy `benchflow job` + bench_bin = shutil.which("bench") or shutil.which("benchflow") or "bench" + if "benchflow" in bench_bin: + cmd = [bench_bin, "job", "--config", config_path] + else: + cmd = [bench_bin, "eval", "create", "-f", config_path] - logger.info(f"Running harbor command: {' '.join(cmd)}") - logger.info(f"Output directory: {harbor_output_dir}") + logger.info(f"Running: {' '.join(cmd)}") - # harbor's openhands-sdk agent reads LLM credentials from the host process - # environment (os.environ), not from --ae flags which go to the sandbox. - env = os.environ.copy() - if llm.api_key: - api_key = ( - llm.api_key.get_secret_value() - if isinstance(llm.api_key, SecretStr) - else llm.api_key - ) - env["LLM_API_KEY"] = api_key - if llm.base_url: - env["LLM_BASE_URL"] = llm.base_url + # Inject LLM vars into the host process env so benchflow's provider + # resolution can pick them up; the subprocess inherits normally (env=None). 
+ host_env = os.environ.copy() + host_env.update(agent_env) + result = subprocess.run(cmd, capture_output=True, text=True, env=host_env) + Path(config_path).unlink(missing_ok=True) - try: - result = subprocess.run( - cmd, - capture_output=True, - text=True, - env=env, - ) + if result.returncode != 0: + logger.error(f"benchflow job failed (code {result.returncode})") + logger.error(f"stdout: {result.stdout}") + logger.error(f"stderr: {result.stderr}") + raise RuntimeError(f"benchflow job failed: {result.stderr}") - if result.returncode != 0: - logger.error(f"Harbor command failed with code {result.returncode}") - logger.error(f"stdout: {result.stdout}") - logger.error(f"stderr: {result.stderr}") - raise RuntimeError(f"Harbor evaluation failed: {result.stderr}") + logger.info("benchflow job completed") + logger.info(f"stdout: {result.stdout}") - logger.info("Harbor evaluation completed successfully") - logger.info(f"stdout: {result.stdout}") + return jobs_dir - except FileNotFoundError: - raise RuntimeError( - "Harbor CLI not found. Please install harbor: pip install harbor" - ) - return harbor_output_dir - - -def _find_job_dir(harbor_output_dir: Path) -> Path: - """Find the harbor job directory (timestamp-named) inside the output dir.""" - # Harbor creates a timestamp-named subdirectory (e.g., 2026-03-07__16-08-47) - # containing result.json and trial subdirectories - candidates = [ - d - for d in harbor_output_dir.iterdir() - if d.is_dir() and (d / "result.json").exists() - ] - if not candidates: - raise RuntimeError( - f"No harbor job directory found in {harbor_output_dir}. " - f"Expected a timestamp-named directory containing result.json." +def _extract_trial_metrics(trial_dir: Path) -> dict: + """Extract token/cost metrics from benchflow 0.3.0 trial output files. + + benchflow 0.3.0 does not write cost/token fields to result.json. + Instead, metrics are read from: + 1. agent/trajectory.json → final_metrics (harbor-format agent) + 2. agent/openhands.txt → "Total cost:" and "Tokens:" lines (ACP agent) + """ + # 1. Harbor-format trajectory.json written by openhands-sdk agent + traj_file = trial_dir / "agent" / "trajectory.json" + if traj_file.exists(): + try: + with open(traj_file) as f: + traj = json.load(f) + fm = traj.get("final_metrics") or {} + if fm: + return { + "total_prompt_tokens": int(fm.get("total_prompt_tokens") or 0), + "total_completion_tokens": int( + fm.get("total_completion_tokens") or 0 + ), + "total_cost_usd": float(fm.get("total_cost_usd") or 0.0), + } + except (json.JSONDecodeError, OSError): + pass + + # 2. 
ACP agent log written by openhands acp (benchflow 0.3.0 native) + def _parse_token_count(value: str, suffix: str) -> int: + n = float(value) + return int( + n * {"K": 1_000, "M": 1_000_000, "B": 1_000_000_000}.get(suffix.upper(), 1) ) - # Use the most recent job directory if multiple exist - return sorted(candidates)[-1] + for log_name in ("openhands.txt", "openhands_sdk.txt"): + log_file = trial_dir / "agent" / log_name + if not log_file.exists(): + continue + try: + text = log_file.read_text(errors="replace") + cost_usd = 0.0 + prompt_tokens = 0 + completion_tokens = 0 + m = _COST_RE.search(text) + if m: + cost_usd = float(m.group(1)) + m = _TOKENS_RE.search(text) + if m: + prompt_tokens = _parse_token_count(m.group(1), m.group(2)) + completion_tokens = _parse_token_count(m.group(3), m.group(4)) + if cost_usd or prompt_tokens: + return { + "total_prompt_tokens": prompt_tokens, + "total_completion_tokens": completion_tokens, + "total_cost_usd": cost_usd, + } + except OSError: + pass -def convert_harbor_to_eval_output( - harbor_output_dir: Path, + return { + "total_prompt_tokens": 0, + "total_completion_tokens": 0, + "total_cost_usd": 0.0, + } + + +def convert_benchflow_to_eval_output( + jobs_dir: Path, eval_output_path: Path, + task_ids: list[str] | None = None, ) -> None: - """Convert harbor output to evaluation output format. + """Convert benchflow job output to standard evaluation output format. - Harbor stores trial results in a job directory structured as: - harbor_output/TIMESTAMP/TRIAL_NAME/result.json + benchflow 0.3.0 stores trial results as: + jobs_dir/YYYY-MM-DD__HH-MM-SS/TASK_NAME__UUID8/result.json - Each trial's result.json contains task_name, verifier_result, agent_result, - timing info, and exception details. + Each result.json contains task_name, rewards, error, verifier_error, and timing. Args: - harbor_output_dir: Path to harbor output directory. - eval_output_path: Path to write the converted output.jsonl. + jobs_dir: Path to benchflow jobs directory. + eval_output_path: Path to write output.jsonl. + task_ids: Optional filter for specific task IDs (short names). """ - logger.info(f"Converting harbor output from {harbor_output_dir}") - - job_dir = _find_job_dir(harbor_output_dir) - logger.info(f"Using harbor job directory: {job_dir}") + logger.info(f"Converting benchflow output from {jobs_dir}") + + # benchflow 0.3.0 writes: + # jobs/summary.json + # jobs/TIMESTAMP/TRIAL_NAME/result.json + # while older local outputs may place results directly under jobs/. + job_dirs = [d for d in jobs_dir.iterdir() if d.is_dir()] + timestamp_job_dirs = [d for d in job_dirs if _TIMESTAMP_RE.match(d.name)] + + if timestamp_job_dirs: + selected_job_dir = sorted(timestamp_job_dirs)[-1] + logger.info(f"Using benchflow job directory: {selected_job_dir}") + task_dirs = [d for d in selected_job_dir.iterdir() if d.is_dir()] + else: + task_dirs = job_dirs - # Find trial result files (each trial dir has a result.json) - result_files = list(job_dir.glob("*/result.json")) - # Exclude the job-level result.json - result_files = [f for f in result_files if f.parent != job_dir] + if not task_dirs: + raise RuntimeError(f"No task directories found in {jobs_dir}") - if not result_files: - raise RuntimeError( - f"No trial result files found in {job_dir}. " - f"Expected result.json files in trial subdirectories." 
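For orientation, a minimal, self-contained sketch (not part of the patch) of the `openhands.txt` parsing that `_extract_trial_metrics` performs above. It reuses the `_COST_RE`/`_TOKENS_RE` patterns and suffix handling added in this patch; the sample line is the one the test fixture later in this patch writes into `agent/openhands.txt`.

```python
# Standalone sketch of the ACP log parsing shown above; mirrors the regexes
# and suffix multipliers defined in run_infer.py. Sample line taken from the
# test_metrics_from_acp_agent_log fixture in this patch.
import re

_COST_RE = re.compile(r"Total cost:\s*\$([0-9]+(?:\.[0-9]+)?)")
_TOKENS_RE = re.compile(r"↑ input\s+([\d.]+)([KMB]?)\b.*?↓ output\s+([\d.]+)([KMB]?)\b")


def parse_token_count(value: str, suffix: str) -> int:
    # "404.21", "K" -> 404210; an empty suffix means a multiplier of 1
    multipliers = {"K": 1_000, "M": 1_000_000, "B": 1_000_000_000}
    return int(float(value) * multipliers.get(suffix.upper(), 1))


text = (
    "Tokens: ↑ input 404.21K • cache hit 70.47% • reasoning 579 "
    "• ↓ output 7.83K • $0.0487\n"
    "Total cost: $0.0487\n"
)

cost_usd, prompt_tokens, completion_tokens = 0.0, 0, 0
m = _COST_RE.search(text)
if m:
    cost_usd = float(m.group(1))
m = _TOKENS_RE.search(text)
if m:
    prompt_tokens = parse_token_count(m.group(1), m.group(2))
    completion_tokens = parse_token_count(m.group(3), m.group(4))

# For this line the test in this patch asserts:
# cost_usd == 0.0487, prompt_tokens == 404210, completion_tokens == 7830
```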
- ) + if task_ids: + short_ids = {tid.split("/")[-1] for tid in task_ids} + task_dirs = [d for d in task_dirs if d.name.split("__")[0] in short_ids] - logger.info(f"Found {len(result_files)} trial results in {job_dir}") + logger.info(f"Processing {len(task_dirs)} task directories") results: list[dict] = [] errors: list[dict] = [] - for result_file in result_files: + for task_dir in sorted(task_dirs): + # Find the trial result — benchflow writes trial-0/result.json + trial_results = list(task_dir.glob("trial-*/result.json")) + if not trial_results: + # Fall back to a direct result.json + direct = task_dir / "result.json" + if direct.exists(): + trial_results = [direct] + + if not trial_results: + logger.warning(f"No result.json found in {task_dir}, skipping") + errors.append( + { + "instance_id": f"benchflow/{task_dir.name}", + "error": "No result.json found", + "test_result": {}, + } + ) + continue + + # Use the last trial (highest retry index) + result_file = sorted(trial_results)[-1] + try: with open(result_file) as f: trial = json.load(f) - instance_id = trial.get("task_name", result_file.parent.name) + task_basename = task_dir.name.split("__")[0] + task_name = trial.get("task_name") or f"benchflow/{task_basename}" + # Normalise to benchflow/ form + if "/" not in task_name: + task_name = f"benchflow/{task_name}" - # Check for exceptions - if trial.get("exception_info"): + error = trial.get("error") + verifier_error = trial.get("verifier_error") + + if error or verifier_error: errors.append( { - "instance_id": instance_id, - "error": str(trial["exception_info"]), + "instance_id": task_name, + "error": str(error or verifier_error), "test_result": {}, } ) continue - # Extract verifier results - verifier_result = trial.get("verifier_result", {}) - rewards = verifier_result.get("rewards", {}) - passed = rewards.get("reward", 0.0) > 0 - - # Extract agent metrics - agent_result = trial.get("agent_result", {}) + rewards = trial.get("rewards") or {} + passed = bool(rewards.get("reward", 0.0)) eval_entry = { - "instance_id": instance_id, + "instance_id": task_name, "test_result": { - "trial_name": trial.get("trial_name"), - "trial_uri": trial.get("trial_uri"), "rewards": rewards, "passed": passed, }, "instruction": "", "error": None, "history": [], - "metrics": { - "total_prompt_tokens": agent_result.get("n_input_tokens") or 0, - "total_completion_tokens": ( - agent_result.get("n_output_tokens") or 0 - ), - "total_cost_usd": agent_result.get("cost_usd") or 0.0, - }, + "metrics": _extract_trial_metrics(result_file.parent), } results.append(eval_entry) - logger.info( - f"Processed trial {instance_id}: reward={rewards.get('reward', 'N/A')}" - ) + logger.info(f"Processed {task_name}: reward={rewards.get('reward', 'N/A')}") except (json.JSONDecodeError, OSError) as e: - logger.error(f"Failed to process result file {result_file}: {e}") + logger.error(f"Failed to read {result_file}: {e}") errors.append( { - "instance_id": result_file.parent.name, + "instance_id": f"benchflow/{task_dir.name}", "error": str(e), "test_result": {}, } ) if not results and not errors: - raise RuntimeError(f"No trials processed from {harbor_output_dir}") + raise RuntimeError(f"No trials processed from {jobs_dir}") if not results: logger.warning( - f"All {len(errors)} trials failed in {harbor_output_dir}; " - "writing error entries for downstream reporting" + f"All {len(errors)} trials failed; writing error entries for reporting" ) - # Write results to output.jsonl with open(eval_output_path, "w") as f: - for entry in results: 
- f.write(json.dumps(entry) + "\n") - for entry in errors: + for entry in results + errors: f.write(json.dumps(entry) + "\n") logger.info( @@ -298,18 +518,18 @@ def load_task_ids_from_file(filepath: str) -> list[str]: def main() -> None: """Main entry point for skillsbench inference.""" parser = argparse.ArgumentParser( - description="Run SkillsBench evaluation with openhands-sdk via Harbor", + description="Run SkillsBench evaluation with benchflow and openhands", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" Examples: # Run full skillsbench evaluation uv run skillsbench-infer .llm_config/claude.json - # Run specific tasks + # Run specific tasks from a file uv run skillsbench-infer .llm_config/claude.json --select tasks.txt - # Run with custom dataset version - uv run skillsbench-infer .llm_config/claude.json --dataset benchflow/skillsbench@1.0 + # Run with more concurrency + uv run skillsbench-infer .llm_config/claude.json --num-workers 4 """, ) @@ -322,7 +542,7 @@ def main() -> None: "--dataset", type=str, default=INFER_DEFAULTS["dataset"], - help="Harbor dataset name (e.g., benchflow/skillsbench)", + help="benchflow dataset name (e.g., benchflow/skillsbench)", ) parser.add_argument( "--output-dir", @@ -334,12 +554,12 @@ def main() -> None: "--num-workers", type=int, default=INFER_DEFAULTS["num_workers"], - help="Number of parallel workers", + help="Number of parallel workers (concurrency)", ) parser.add_argument( "--n-limit", type=int, - help="Maximum number of dataset tasks to run after Harbor filtering", + help="Maximum number of tasks to run", ) parser.add_argument( "--select", @@ -358,14 +578,13 @@ def main() -> None: help="Optional note for the evaluation run", ) parser.add_argument( - "--skip-harbor", + "--skip-run", action="store_true", - help="Skip running harbor and only convert existing results", + help="Skip running benchflow and only convert existing results", ) args = parser.parse_args() - # Validate LLM config if not os.path.isfile(args.llm_config_path): logger.error(f"LLM config file does not exist: {args.llm_config_path}") sys.exit(1) @@ -375,87 +594,103 @@ def main() -> None: llm = LLM.model_validate_json(llm_config) logger.info(f"Using LLM: {llm.model}") - # Check harbor installation - if not args.skip_harbor and not check_harbor_installed(): + if not args.skip_run and not check_benchflow_installed(): logger.error( - "Harbor CLI is not installed. Please install it:\n" - " pip install harbor\n" + "benchflow CLI is not installed. 
Please install it:\n" + " uv tool install benchflow==0.3.0\n" " # or\n" - " uv pip install harbor" + " pip install benchflow==0.3.0\n" + " # or\n" + " uv pip install benchflow==0.3.0" ) sys.exit(1) - # Construct output directory dataset_description = args.dataset.replace("/", "__").replace("@", "-") structured_output_dir = construct_eval_output_dir( base_dir=args.output_dir, dataset_name=dataset_description, model_name=llm.model, - max_iterations=100, # Not directly used but required for path construction + max_iterations=100, eval_note=args.note, ) logger.info(f"Output directory: {structured_output_dir}") os.makedirs(structured_output_dir, exist_ok=True) - # Save metadata metadata = { "llm": llm.model_dump_json(), "dataset": args.dataset, "timestamp": datetime.now(timezone.utc).isoformat(), - "harbor_agent": HARBOR_DEFAULTS["agent_name"], + "benchflow_agent": BENCHFLOW_DEFAULTS["agent_name"], "note": args.note, } metadata_path = Path(structured_output_dir) / "metadata.json" with open(metadata_path, "w") as f: json.dump(metadata, f, indent=2) - # Collect task IDs if specified task_ids: list[str] | None = None if args.select: - loaded_ids = load_task_ids_from_file(args.select) - task_ids = loaded_ids - logger.info(f"Loaded {len(loaded_ids)} task IDs from {args.select}") + task_ids = load_task_ids_from_file(args.select) + logger.info(f"Loaded {len(task_ids)} task IDs from {args.select}") elif args.task_id: task_ids = list(args.task_id) logger.info(f"Running {len(task_ids)} specified task IDs") + tasks_dir = Path(structured_output_dir) / "tasks" + jobs_dir = Path(structured_output_dir) / "jobs" output_path = Path(structured_output_dir) / OUTPUT_FILENAME - if not args.skip_harbor: - # Run harbor evaluation + if not args.skip_run: try: - harbor_output_dir = run_harbor_evaluation( + ensure_tasks(args.dataset, tasks_dir, task_ids=task_ids) + + # Apply n_limit by slicing available task directories + effective_task_dirs = tasks_dir + if args.n_limit is not None or task_ids is not None: + all_dirs = sorted(d for d in tasks_dir.iterdir() if d.is_dir()) + if task_ids: + short_ids = {tid.split("/")[-1] for tid in task_ids} + all_dirs = [d for d in all_dirs if d.name in short_ids] + if args.n_limit is not None: + all_dirs = all_dirs[: args.n_limit] + + # Write a filtered tasks dir symlink tree + filtered_tasks_dir = Path(structured_output_dir) / "tasks_filtered" + filtered_tasks_dir.mkdir(exist_ok=True) + for d in all_dirs: + link = filtered_tasks_dir / d.name + if not link.exists(): + link.symlink_to(d.resolve()) + effective_task_dirs = filtered_tasks_dir + + run_benchflow_job( llm=llm, - dataset=args.dataset, - output_dir=structured_output_dir, + tasks_dir=effective_task_dirs, + jobs_dir=jobs_dir, num_workers=args.num_workers, task_ids=task_ids, - n_limit=args.n_limit, ) - # Convert harbor output to standard format - convert_harbor_to_eval_output( - harbor_output_dir=harbor_output_dir, + convert_benchflow_to_eval_output( + jobs_dir=jobs_dir, eval_output_path=output_path, + task_ids=task_ids, ) except Exception as e: logger.error(f"Evaluation failed: {e}") sys.exit(1) else: - # Skip harbor, just convert existing results - harbor_output_dir = Path(structured_output_dir) / "harbor_output" - if harbor_output_dir.exists(): - convert_harbor_to_eval_output( - harbor_output_dir=harbor_output_dir, + if jobs_dir.exists(): + convert_benchflow_to_eval_output( + jobs_dir=jobs_dir, eval_output_path=output_path, + task_ids=task_ids, ) else: - logger.error(f"No harbor output found at {harbor_output_dir}") + 
logger.error(f"No jobs output found at {jobs_dir}") sys.exit(1) - # Generate cost report if output_path.exists(): generate_cost_report(str(output_path)) diff --git a/tests/test_skillsbench_eval_infer.py b/tests/test_skillsbench_eval_infer.py index 56d54f27a..1334da297 100644 --- a/tests/test_skillsbench_eval_infer.py +++ b/tests/test_skillsbench_eval_infer.py @@ -39,23 +39,6 @@ def test_resolved_instance(self, tmp_path: Path) -> None: assert result["unresolved_instances"] == 0 assert "benchflow/weighted-gdp-calc" in result["resolved_ids"] - def test_unresolved_instance(self, tmp_path: Path) -> None: - """Test processing an unresolved (passed=False) instance.""" - input_file = tmp_path / "unresolved.jsonl" - output_file = tmp_path / "unresolved.report.json" - - entry = { - "instance_id": "benchflow/task-1", - "test_result": {"passed": False, "rewards": {"reward": 0.0}}, - "error": None, - } - input_file.write_text(json.dumps(entry) + "\n") - - result = process_skillsbench_results(str(input_file), str(output_file)) - - assert result["resolved_instances"] == 0 - assert result["unresolved_instances"] == 1 - def test_instance_with_error(self, tmp_path: Path) -> None: """Test processing an instance that errored.""" input_file = tmp_path / "error.jsonl" diff --git a/tests/test_skillsbench_run_infer.py b/tests/test_skillsbench_run_infer.py index 5f8452cb3..784b4d1cc 100644 --- a/tests/test_skillsbench_run_infer.py +++ b/tests/test_skillsbench_run_infer.py @@ -4,82 +4,102 @@ from pathlib import Path import pytest +import yaml -from benchmarks.skillsbench.config import INFER_DEFAULTS +from benchmarks.skillsbench.config import BENCHFLOW_DEFAULTS, INFER_DEFAULTS from benchmarks.skillsbench.run_infer import ( - convert_harbor_to_eval_output, - run_harbor_evaluation, + _build_benchflow_agent_env, + convert_benchflow_to_eval_output, + run_benchflow_job, ) from openhands.sdk import LLM -class TestRunHarborEvaluation: - """Tests for building Harbor invocation arguments.""" +class TestRunBenchflowJob: + """Tests for building benchflow job invocation arguments.""" - def test_default_dataset_matches_harbor_registry(self) -> None: - """Test that the default dataset name matches Harbor's published registry.""" + def test_default_dataset_matches_benchflow_registry(self) -> None: + """Test that the default dataset name matches benchflow's published registry.""" assert INFER_DEFAULTS["dataset"] == "benchflow/skillsbench" - def test_run_harbor_evaluation_passes_filters_and_limits( + def test_default_agent_is_openhands(self) -> None: + """Test that the default agent is openhands.""" + assert BENCHFLOW_DEFAULTS["agent_name"] == "openhands" + + def test_run_benchflow_job_passes_model_and_concurrency( self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch ) -> None: - """Test Harbor command includes task filters and n-limit.""" - captured: dict[str, list[str]] = {} + """Test benchflow job command writes the expected YAML config.""" + captured_cmd: list[str] = [] + captured_env: dict[str, str] = {} + captured_config: dict = {} + + # Force legacy benchflow binary path so the command format is deterministic + monkeypatch.setattr( + "benchmarks.skillsbench.run_infer.shutil.which", + lambda name: "/usr/local/bin/benchflow" if name == "benchflow" else None, + ) def fake_run(cmd: list[str], capture_output: bool, text: bool, env: dict): - captured["cmd"] = cmd + captured_cmd[:] = cmd + captured_env.clear() + captured_env.update(env) + with open(cmd[3]) as f: + captured_config.update(yaml.safe_load(f)) return type( "Completed", 
(), - {"returncode": 0, "stdout": "ok", "stderr": ""}, + {"returncode": 0, "stdout": "Score: 1/1 (100%)", "stderr": ""}, )() monkeypatch.setattr("benchmarks.skillsbench.run_infer.subprocess.run", fake_run) - harbor_output_dir = run_harbor_evaluation( + tasks_dir = tmp_path / "tasks" + tasks_dir.mkdir() + jobs_dir = tmp_path / "jobs" + + run_benchflow_job( llm=LLM( - model="litellm_proxy/test-model", + model="anthropic/claude-sonnet-4-5", api_key="test-key", base_url="https://proxy.example.com", ), - dataset=INFER_DEFAULTS["dataset"], - output_dir=str(tmp_path), - num_workers=2, - task_ids=["benchflow/task-a", "benchflow/task-b"], - n_limit=3, - ) - - expected_output_dir = tmp_path / "harbor_output" - assert harbor_output_dir == expected_output_dir - - cmd = captured["cmd"] - assert cmd[:8] == [ - "harbor", - "run", - "-d", - "benchflow/skillsbench", - "-a", - "openhands-sdk", - "-m", - "litellm_proxy/test-model", - ] - assert "--jobs-dir" in cmd - assert str(expected_output_dir.resolve()) in cmd - assert cmd.count("--include-task-name") == 2 - assert "benchflow/task-a" in cmd - assert "benchflow/task-b" in cmd - assert cmd[cmd.index("--n-concurrent") + 1] == "2" - assert cmd[cmd.index("--n-tasks") + 1] == "3" - - def test_llm_credentials_passed_via_env( + tasks_dir=tasks_dir, + jobs_dir=jobs_dir, + num_workers=4, + ) + + cmd = captured_cmd + assert cmd[0] == "/usr/local/bin/benchflow" + assert cmd[1] == "job" + assert cmd[2] == "--config" + assert captured_config["tasks_dir"] == str(tasks_dir) + assert captured_config["jobs_dir"] == str(jobs_dir.resolve()) + assert captured_config["agent"] == "openhands" + assert captured_config["model"] == "anthropic/claude-sonnet-4-5" + assert captured_config["concurrency"] == 4 + assert captured_config["sandbox_user"] is None + + def test_llm_credentials_passed_via_subprocess_env( self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch ) -> None: - """Test that LLM credentials are passed via subprocess env, not --ae flags.""" - captured: dict = {} + """Test that LLM credentials are passed via subprocess env and YAML.""" + captured_cmd: list[str] = [] + captured_env: dict[str, str] = {} + captured_config: dict = {} + + # Force legacy benchflow binary path so the command format is deterministic + monkeypatch.setattr( + "benchmarks.skillsbench.run_infer.shutil.which", + lambda name: "/usr/local/bin/benchflow" if name == "benchflow" else None, + ) def fake_run(cmd: list[str], capture_output: bool, text: bool, env: dict): - captured["cmd"] = cmd - captured["env"] = env + captured_cmd[:] = cmd + captured_env.clear() + captured_env.update(env) + with open(cmd[3]) as f: + captured_config.update(yaml.safe_load(f)) return type( "Completed", (), @@ -88,60 +108,127 @@ def fake_run(cmd: list[str], capture_output: bool, text: bool, env: dict): monkeypatch.setattr("benchmarks.skillsbench.run_infer.subprocess.run", fake_run) - run_harbor_evaluation( + tasks_dir = tmp_path / "tasks" + tasks_dir.mkdir() + + run_benchflow_job( llm=LLM( model="test-model", api_key="my-secret-key", base_url="https://my-proxy.example.com", ), - dataset=INFER_DEFAULTS["dataset"], - output_dir=str(tmp_path), + tasks_dir=tasks_dir, + jobs_dir=tmp_path / "jobs", + ) + + # Credentials in subprocess env + assert captured_env["LLM_API_KEY"] == "my-secret-key" + assert captured_env["LLM_BASE_URL"] == "https://my-proxy.example.com" + assert "--ae" not in captured_cmd + assert captured_config["agent_env"]["LLM_API_KEY"] == "my-secret-key" + assert ( + captured_config["agent_env"]["LLM_BASE_URL"] + == 
"https://my-proxy.example.com" ) - assert captured["env"]["LLM_API_KEY"] == "my-secret-key" - assert captured["env"]["LLM_BASE_URL"] == "https://my-proxy.example.com" + def test_direct_gemini_model_sets_provider_env_vars(self) -> None: + """Direct provider models need provider-specific env vars.""" + env = _build_benchflow_agent_env( + LLM( + model="gemini/gemini-3.1-flash-lite-preview", + api_key="gemini-test-key", + ) + ) + assert env["LLM_API_KEY"] == "gemini-test-key" + assert env["GEMINI_API_KEY"] == "gemini-test-key" + assert env["GOOGLE_API_KEY"] == "gemini-test-key" -class TestConvertHarborToEvalOutput: - """Tests for convert_harbor_to_eval_output function.""" + def test_proxy_model_does_not_set_provider_env_vars(self) -> None: + """LiteLLM proxy configs should keep using generic LLM_* vars only.""" + env = _build_benchflow_agent_env( + LLM( + model="litellm_proxy/anthropic/claude-sonnet-4-20250514", + api_key="proxy-key", + base_url="https://proxy.example.com", + ) + ) - def _create_harbor_structure( - self, tmp_path: Path, trials: list[tuple[str, dict]] - ) -> Path: - """Create a mock Harbor output structure.""" - harbor_dir = tmp_path / "harbor_output" - job_dir = harbor_dir / "2026-01-01__00-00-00" - job_dir.mkdir(parents=True) - (job_dir / "result.json").write_text(json.dumps({"id": "test-job"})) + assert env["LLM_API_KEY"] == "proxy-key" + assert env["LLM_BASE_URL"] == "https://proxy.example.com" + assert "ANTHROPIC_API_KEY" not in env + assert "ANTHROPIC_BASE_URL" not in env - for trial_name, trial_result in trials: - trial_dir = job_dir / trial_name - trial_dir.mkdir() - (trial_dir / "result.json").write_text(json.dumps(trial_result)) - return harbor_dir +class TestConvertBenchflowToEvalOutput: + """Tests for convert_benchflow_to_eval_output function.""" + + def _create_benchflow_structure( + self, tmp_path: Path, tasks: list[tuple[str, dict]] + ) -> Path: + """Create a mock benchflow jobs directory structure. + + benchflow writes: jobs_dir/TASK_NAME/trial-0/result.json + """ + jobs_dir = tmp_path / "jobs" + for task_name, result in tasks: + trial_dir = jobs_dir / task_name / "trial-0" + trial_dir.mkdir(parents=True) + (trial_dir / "result.json").write_text(json.dumps(result)) + return jobs_dir + + def _create_benchflow_timestamped_job( + self, tmp_path: Path, tasks: list[tuple[str, dict]] + ) -> Path: + """Create a mock benchflow 0.3.0 jobs directory structure. + + benchflow writes: jobs/TIMESTAMP/TASK_NAME__RUNID/result.json + """ + jobs_dir = tmp_path / "jobs" + job_dir = jobs_dir / "2026-04-21__23-12-35" + job_dir.mkdir(parents=True) + (jobs_dir / "summary.json").write_text(json.dumps({"total": len(tasks)})) + for task_name, result in tasks: + trial_dir = job_dir / f"{task_name}__abc123" + trial_dir.mkdir(parents=True) + (trial_dir / "result.json").write_text(json.dumps(result)) + return jobs_dir def test_successful_trial_parsing(self, tmp_path: Path) -> None: - """Test successful parsing of harbor trial result.""" + """Test successful parsing of a benchflow trial result. + + benchflow 0.3.0 does not write cost/token fields to result.json. + Metrics are read from agent/trajectory.json (harbor-format agent) + or parsed from agent/openhands.txt (ACP agent stdout). 
+ """ trial_result = { "task_name": "benchflow/weighted-gdp-calc", - "trial_name": "weighted-gdp-calc__abc123", - "trial_uri": "file:///path/to/trial", - "agent_result": { - "n_input_tokens": 1000, - "n_output_tokens": 200, - "cost_usd": 0.05, - }, - "verifier_result": {"rewards": {"reward": 1.0}}, - "exception_info": None, + "rewards": {"reward": 1.0}, + "error": None, } - harbor_dir = self._create_harbor_structure( - tmp_path, [("weighted-gdp-calc__abc123", trial_result)] + jobs_dir = self._create_benchflow_structure( + tmp_path, [("weighted-gdp-calc", trial_result)] + ) + # Write agent/trajectory.json with final_metrics (harbor-format agent output). + # agent/ sits next to result.json, inside the trial-0 subdirectory. + trial_dir = jobs_dir / "weighted-gdp-calc" / "trial-0" + agent_dir = trial_dir / "agent" + agent_dir.mkdir(parents=True, exist_ok=True) + (agent_dir / "trajectory.json").write_text( + json.dumps( + { + "final_metrics": { + "total_prompt_tokens": 1000, + "total_completion_tokens": 200, + "total_cost_usd": 0.05, + } + } + ) ) output_file = tmp_path / "output.jsonl" - convert_harbor_to_eval_output(harbor_dir, output_file) + convert_benchflow_to_eval_output(jobs_dir, output_file) assert output_file.exists() with open(output_file) as f: @@ -151,26 +238,52 @@ def test_successful_trial_parsing(self, tmp_path: Path) -> None: assert entries[0]["instance_id"] == "benchflow/weighted-gdp-calc" assert entries[0]["test_result"]["passed"] is True assert entries[0]["metrics"]["total_cost_usd"] == 0.05 + assert entries[0]["metrics"]["total_prompt_tokens"] == 1000 + assert entries[0]["metrics"]["total_completion_tokens"] == 200 + + def test_metrics_from_acp_agent_log(self, tmp_path: Path) -> None: + """Test that metrics are extracted from agent/openhands.txt (ACP agent).""" + trial_result = { + "task_name": "benchflow/acp-task", + "rewards": {"reward": 1.0}, + "error": None, + } + jobs_dir = self._create_benchflow_timestamped_job( + tmp_path, [("acp-task", trial_result)] + ) + # Write agent/openhands.txt simulating openhands ACP stdout + trial_dir = jobs_dir / "2026-04-21__23-12-35" / "acp-task__abc123" + agent_dir = trial_dir / "agent" + agent_dir.mkdir(parents=True, exist_ok=True) + (agent_dir / "openhands.txt").write_text( + "OpenHands SDK v1.16.0\n" + "Tokens: ↑ input 404.21K • cache hit 70.47% • reasoning 579 • ↓ output 7.83K • $0.0487\n" + "Total cost: $0.0487\n" + ) + output_file = tmp_path / "output.jsonl" + convert_benchflow_to_eval_output(jobs_dir, output_file) + + with open(output_file) as f: + entries = [json.loads(line) for line in f] + + assert len(entries) == 1 + assert entries[0]["metrics"]["total_cost_usd"] == pytest.approx(0.0487) + assert entries[0]["metrics"]["total_prompt_tokens"] == 404210 + assert entries[0]["metrics"]["total_completion_tokens"] == 7830 def test_failed_trial(self, tmp_path: Path) -> None: """Test parsing of a trial with reward 0.""" trial_result = { "task_name": "benchflow/task-1", - "trial_name": "task-1__xyz", - "agent_result": { - "n_input_tokens": None, - "n_output_tokens": None, - "cost_usd": None, - }, - "verifier_result": {"rewards": {"reward": 0.0}}, - "exception_info": None, + "rewards": {"reward": 0.0}, + "error": None, } - harbor_dir = self._create_harbor_structure( - tmp_path, [("task-1__xyz", trial_result)] + jobs_dir = self._create_benchflow_structure( + tmp_path, [("task-1", trial_result)] ) output_file = tmp_path / "output.jsonl" - convert_harbor_to_eval_output(harbor_dir, output_file) + convert_benchflow_to_eval_output(jobs_dir, 
output_file) with open(output_file) as f: entries = [json.loads(line) for line in f] @@ -178,21 +291,19 @@ def test_failed_trial(self, tmp_path: Path) -> None: assert entries[0]["test_result"]["passed"] is False assert entries[0]["metrics"]["total_cost_usd"] == 0.0 - def test_trial_with_exception(self, tmp_path: Path) -> None: - """Test that exception trials are written as error entries.""" + def test_trial_with_error(self, tmp_path: Path) -> None: + """Test that errored trials are written as error entries.""" trial_result = { "task_name": "benchflow/error-task", - "trial_name": "error-task__err", - "agent_result": {}, - "verifier_result": {}, - "exception_info": {"type": "ValueError", "message": "LLM_API_KEY not set"}, + "rewards": {}, + "error": "LLM_API_KEY not set", } - harbor_dir = self._create_harbor_structure( - tmp_path, [("error-task__err", trial_result)] + jobs_dir = self._create_benchflow_structure( + tmp_path, [("error-task", trial_result)] ) output_file = tmp_path / "output.jsonl" - convert_harbor_to_eval_output(harbor_dir, output_file) + convert_benchflow_to_eval_output(jobs_dir, output_file) with open(output_file) as f: entries = [json.loads(line) for line in f] @@ -202,20 +313,121 @@ def test_trial_with_exception(self, tmp_path: Path) -> None: assert entries[0]["error"] is not None assert entries[0]["test_result"] == {} - def test_missing_job_directory(self, tmp_path: Path) -> None: - """Test handling when no job directory exists.""" - harbor_dir = tmp_path / "harbor_output" - harbor_dir.mkdir() + def test_missing_jobs_directory(self, tmp_path: Path) -> None: + """Test handling when jobs directory is empty.""" + jobs_dir = tmp_path / "jobs" + jobs_dir.mkdir() + + with pytest.raises(RuntimeError, match="No task directories found"): + convert_benchflow_to_eval_output(jobs_dir, tmp_path / "output.jsonl") + + def test_task_id_filtering(self, tmp_path: Path) -> None: + """Test that only specified task IDs are converted.""" + trials = [ + ( + "task-a", + { + "task_name": "benchflow/task-a", + "rewards": {"reward": 1.0}, + "error": None, + }, + ), + ( + "task-b", + { + "task_name": "benchflow/task-b", + "rewards": {"reward": 0.0}, + "error": None, + }, + ), + ] + jobs_dir = self._create_benchflow_structure(tmp_path, trials) + output_file = tmp_path / "output.jsonl" - with pytest.raises(RuntimeError, match="No harbor job directory found"): - convert_harbor_to_eval_output(harbor_dir, tmp_path / "output.jsonl") + convert_benchflow_to_eval_output( + jobs_dir, output_file, task_ids=["benchflow/task-a"] + ) - def test_empty_job_directory(self, tmp_path: Path) -> None: - """Test handling of harbor job dir with no trial subdirs.""" - harbor_dir = tmp_path / "harbor_output" - job_dir = harbor_dir / "2026-01-01__00-00-00" - job_dir.mkdir(parents=True) - (job_dir / "result.json").write_text(json.dumps({"id": "test"})) + with open(output_file) as f: + entries = [json.loads(line) for line in f] + + assert len(entries) == 1 + assert entries[0]["instance_id"] == "benchflow/task-a" + + def test_task_name_normalised_to_benchflow_prefix(self, tmp_path: Path) -> None: + """Test that task names without prefix get benchflow/ prepended.""" + trial_result = { + "task_name": "weighted-gdp-calc", # no benchflow/ prefix + "rewards": {"reward": 1.0}, + "error": None, + } + jobs_dir = self._create_benchflow_structure( + tmp_path, [("weighted-gdp-calc", trial_result)] + ) + output_file = tmp_path / "output.jsonl" + convert_benchflow_to_eval_output(jobs_dir, output_file) + + with open(output_file) as f: + 
entries = [json.loads(line) for line in f] + + assert entries[0]["instance_id"] == "benchflow/weighted-gdp-calc" + + def test_timestamped_job_directory_is_processed(self, tmp_path: Path) -> None: + """Test benchflow 0.3.0 timestamped jobs directory layout.""" + trial_result = { + "task_name": "weighted-gdp-calc", + "rewards": {"reward": 1.0}, + "error": None, + "n_input_tokens": 42, + "n_output_tokens": 7, + "cost_usd": 0.01, + } + + jobs_dir = self._create_benchflow_timestamped_job( + tmp_path, [("weighted-gdp-calc", trial_result)] + ) + output_file = tmp_path / "output.jsonl" + + convert_benchflow_to_eval_output(jobs_dir, output_file) + + with open(output_file) as f: + entries = [json.loads(line) for line in f] + + assert len(entries) == 1 + assert entries[0]["instance_id"] == "benchflow/weighted-gdp-calc" + assert entries[0]["test_result"]["passed"] is True - with pytest.raises(RuntimeError, match="No trial result files found"): - convert_harbor_to_eval_output(harbor_dir, tmp_path / "output.jsonl") + def test_task_id_filter_matches_timestamped_trial_dir(self, tmp_path: Path) -> None: + """Test filtering strips the run suffix from trial directory names.""" + jobs_dir = self._create_benchflow_timestamped_job( + tmp_path, + [ + ( + "task-a", + { + "task_name": "task-a", + "rewards": {"reward": 1.0}, + "error": None, + }, + ), + ( + "task-b", + { + "task_name": "task-b", + "rewards": {"reward": 0.0}, + "error": None, + }, + ), + ], + ) + output_file = tmp_path / "output.jsonl" + + convert_benchflow_to_eval_output( + jobs_dir, output_file, task_ids=["benchflow/task-a"] + ) + + with open(output_file) as f: + entries = [json.loads(line) for line in f] + + assert len(entries) == 1 + assert entries[0]["instance_id"] == "benchflow/task-a" diff --git a/uv.lock b/uv.lock index 2cd0b3640..147abedc9 100644 --- a/uv.lock +++ b/uv.lock @@ -1282,6 +1282,7 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/44/69/9b804adb5fd0671f367781560eb5eb586c4d495277c93bde4307b9e28068/greenlet-3.2.4-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:3b67ca49f54cede0186854a008109d6ee71f66bd57bb36abd6d0a0267b540cdd", size = 274079, upload-time = "2025-08-07T13:15:45.033Z" }, { url = "https://files.pythonhosted.org/packages/46/e9/d2a80c99f19a153eff70bc451ab78615583b8dac0754cfb942223d2c1a0d/greenlet-3.2.4-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ddf9164e7a5b08e9d22511526865780a576f19ddd00d62f8a665949327fde8bb", size = 640997, upload-time = "2025-08-07T13:42:56.234Z" }, { url = "https://files.pythonhosted.org/packages/3b/16/035dcfcc48715ccd345f3a93183267167cdd162ad123cd93067d86f27ce4/greenlet-3.2.4-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:f28588772bb5fb869a8eb331374ec06f24a83a9c25bfa1f38b6993afe9c1e968", size = 655185, upload-time = "2025-08-07T13:45:27.624Z" }, + { url = "https://files.pythonhosted.org/packages/31/da/0386695eef69ffae1ad726881571dfe28b41970173947e7c558d9998de0f/greenlet-3.2.4-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:5c9320971821a7cb77cfab8d956fa8e39cd07ca44b6070db358ceb7f8797c8c9", size = 649926, upload-time = "2025-08-07T13:53:15.251Z" }, { url = "https://files.pythonhosted.org/packages/68/88/69bf19fd4dc19981928ceacbc5fd4bb6bc2215d53199e367832e98d1d8fe/greenlet-3.2.4-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:c60a6d84229b271d44b70fb6e5fa23781abb5d742af7b808ae3f6efd7c9c60f6", size = 651839, upload-time = "2025-08-07T13:18:30.281Z" }, { url = 
"https://files.pythonhosted.org/packages/19/0d/6660d55f7373b2ff8152401a83e02084956da23ae58cddbfb0b330978fe9/greenlet-3.2.4-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3b3812d8d0c9579967815af437d96623f45c0f2ae5f04e366de62a12d83a8fb0", size = 607586, upload-time = "2025-08-07T13:18:28.544Z" }, { url = "https://files.pythonhosted.org/packages/8e/1a/c953fdedd22d81ee4629afbb38d2f9d71e37d23caace44775a3a969147d4/greenlet-3.2.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:abbf57b5a870d30c4675928c37278493044d7c14378350b3aa5d484fa65575f0", size = 1123281, upload-time = "2025-08-07T13:42:39.858Z" }, @@ -1292,6 +1293,7 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/49/e8/58c7f85958bda41dafea50497cbd59738c5c43dbbea5ee83d651234398f4/greenlet-3.2.4-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:1a921e542453fe531144e91e1feedf12e07351b1cf6c9e8a3325ea600a715a31", size = 272814, upload-time = "2025-08-07T13:15:50.011Z" }, { url = "https://files.pythonhosted.org/packages/62/dd/b9f59862e9e257a16e4e610480cfffd29e3fae018a68c2332090b53aac3d/greenlet-3.2.4-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cd3c8e693bff0fff6ba55f140bf390fa92c994083f838fece0f63be121334945", size = 641073, upload-time = "2025-08-07T13:42:57.23Z" }, { url = "https://files.pythonhosted.org/packages/f7/0b/bc13f787394920b23073ca3b6c4a7a21396301ed75a655bcb47196b50e6e/greenlet-3.2.4-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:710638eb93b1fa52823aa91bf75326f9ecdfd5e0466f00789246a5280f4ba0fc", size = 655191, upload-time = "2025-08-07T13:45:29.752Z" }, + { url = "https://files.pythonhosted.org/packages/f2/d6/6adde57d1345a8d0f14d31e4ab9c23cfe8e2cd39c3baf7674b4b0338d266/greenlet-3.2.4-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:c5111ccdc9c88f423426df3fd1811bfc40ed66264d35aa373420a34377efc98a", size = 649516, upload-time = "2025-08-07T13:53:16.314Z" }, { url = "https://files.pythonhosted.org/packages/7f/3b/3a3328a788d4a473889a2d403199932be55b1b0060f4ddd96ee7cdfcad10/greenlet-3.2.4-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:d76383238584e9711e20ebe14db6c88ddcedc1829a9ad31a584389463b5aa504", size = 652169, upload-time = "2025-08-07T13:18:32.861Z" }, { url = "https://files.pythonhosted.org/packages/ee/43/3cecdc0349359e1a527cbf2e3e28e5f8f06d3343aaf82ca13437a9aa290f/greenlet-3.2.4-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:23768528f2911bcd7e475210822ffb5254ed10d71f4028387e5a99b4c6699671", size = 610497, upload-time = "2025-08-07T13:18:31.636Z" }, { url = "https://files.pythonhosted.org/packages/b8/19/06b6cf5d604e2c382a6f31cafafd6f33d5dea706f4db7bdab184bad2b21d/greenlet-3.2.4-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:00fadb3fedccc447f517ee0d3fd8fe49eae949e1cd0f6a611818f4f6fb7dc83b", size = 1121662, upload-time = "2025-08-07T13:42:41.117Z" }, @@ -1302,6 +1304,7 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/22/5c/85273fd7cc388285632b0498dbbab97596e04b154933dfe0f3e68156c68c/greenlet-3.2.4-cp314-cp314-macosx_11_0_universal2.whl", hash = "sha256:49a30d5fda2507ae77be16479bdb62a660fa51b1eb4928b524975b3bde77b3c0", size = 273586, upload-time = "2025-08-07T13:16:08.004Z" }, { url = "https://files.pythonhosted.org/packages/d1/75/10aeeaa3da9332c2e761e4c50d4c3556c21113ee3f0afa2cf5769946f7a3/greenlet-3.2.4-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = 
"sha256:299fd615cd8fc86267b47597123e3f43ad79c9d8a22bebdce535e53550763e2f", size = 686346, upload-time = "2025-08-07T13:42:59.944Z" }, { url = "https://files.pythonhosted.org/packages/c0/aa/687d6b12ffb505a4447567d1f3abea23bd20e73a5bed63871178e0831b7a/greenlet-3.2.4-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:c17b6b34111ea72fc5a4e4beec9711d2226285f0386ea83477cbb97c30a3f3a5", size = 699218, upload-time = "2025-08-07T13:45:30.969Z" }, + { url = "https://files.pythonhosted.org/packages/dc/8b/29aae55436521f1d6f8ff4e12fb676f3400de7fcf27fccd1d4d17fd8fecd/greenlet-3.2.4-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:b4a1870c51720687af7fa3e7cda6d08d801dae660f75a76f3845b642b4da6ee1", size = 694659, upload-time = "2025-08-07T13:53:17.759Z" }, { url = "https://files.pythonhosted.org/packages/92/2e/ea25914b1ebfde93b6fc4ff46d6864564fba59024e928bdc7de475affc25/greenlet-3.2.4-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:061dc4cf2c34852b052a8620d40f36324554bc192be474b9e9770e8c042fd735", size = 695355, upload-time = "2025-08-07T13:18:34.517Z" }, { url = "https://files.pythonhosted.org/packages/72/60/fc56c62046ec17f6b0d3060564562c64c862948c9d4bc8aa807cf5bd74f4/greenlet-3.2.4-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:44358b9bf66c8576a9f57a590d5f5d6e72fa4228b763d0e43fee6d3b06d3a337", size = 657512, upload-time = "2025-08-07T13:18:33.969Z" }, { url = "https://files.pythonhosted.org/packages/23/6e/74407aed965a4ab6ddd93a7ded3180b730d281c77b765788419484cdfeef/greenlet-3.2.4-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:2917bdf657f5859fbf3386b12d68ede4cf1f04c90c3a6bc1f013dd68a22e2269", size = 1612508, upload-time = "2025-11-04T12:42:23.427Z" }, @@ -1516,11 +1519,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/2a/39/e50c7c3a983047577ee07d2a9e53faf5a69493943ec3f6a384bdc792deb2/httpx-0.28.1-py3-none-any.whl", hash = "sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad", size = 73517, upload-time = "2024-12-06T15:37:21.509Z" }, ] -[package.optional-dependencies] -socks = [ - { name = "socksio" }, -] - [[package]] name = "httpx-sse" version = "0.4.2" @@ -1816,12 +1814,14 @@ wheels = [ [[package]] name = "litellm" -version = "1.83.0" +version = "1.80.10" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "aiohttp" }, { name = "click" }, { name = "fastuuid" }, + { name = "grpcio", version = "1.67.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14'" }, + { name = "grpcio", version = "1.76.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.14'" }, { name = "httpx" }, { name = "importlib-metadata" }, { name = "jinja2" }, @@ -1832,9 +1832,9 @@ dependencies = [ { name = "tiktoken" }, { name = "tokenizers" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/22/92/6ce9737554994ca8e536e5f4f6a87cc7c4774b656c9eb9add071caf7d54b/litellm-1.83.0.tar.gz", hash = "sha256:860bebc76c4bb27b4cf90b4a77acd66dba25aced37e3db98750de8a1766bfb7a", size = 17333062, upload-time = "2026-03-31T05:08:25.331Z" } +sdist = { url = "https://files.pythonhosted.org/packages/dd/44/0aaa7449e7c4aa05668ec03f1f68a01b1e476591071d9659a68db19371a2/litellm-1.80.10.tar.gz", hash = "sha256:4a4aff7558945c2f7e5c6523e67c1b5525a46b10b0e1ad6b8f847cb13b16779e", size = 12764777, upload-time = "2025-12-14T02:07:05.362Z" } wheels = [ - { url = 
"https://files.pythonhosted.org/packages/19/2c/a670cc050fcd6f45c6199eb99e259c73aea92edba8d5c2fc1b3686d36217/litellm-1.83.0-py3-none-any.whl", hash = "sha256:88c536d339248f3987571493015784671ba3f193a328e1ea6780dbebaa2094a8", size = 15610306, upload-time = "2026-03-31T05:08:21.987Z" }, + { url = "https://files.pythonhosted.org/packages/8c/a9/4814b6aa58f6705df2831eaadeb5bc8240684c8c9d5964245212f85049d1/litellm-1.80.10-py3-none-any.whl", hash = "sha256:9b3e561efaba0eb1291cb1555d3dcb7283cf7f3cb65aadbcdb42e2a8765898c8", size = 11264240, upload-time = "2025-12-14T02:07:02.414Z" }, ] [[package]] @@ -2402,7 +2402,7 @@ wheels = [ [[package]] name = "openhands-agent-server" -version = "1.16.1" +version = "1.16.0" source = { editable = "vendor/software-agent-sdk/openhands-agent-server" } dependencies = [ { name = "aiosqlite" }, @@ -2467,6 +2467,7 @@ dependencies = [ { name = "python-json-logger" }, { name = "requests" }, { name = "swebench" }, + { name = "swesmith" }, { name = "swt-bench" }, { name = "tenacity" }, { name = "toml" }, @@ -2521,6 +2522,7 @@ requires-dist = [ { name = "python-json-logger", specifier = ">=3.3.0" }, { name = "requests" }, { name = "swebench", specifier = "==4.1.0" }, + { name = "swesmith", specifier = ">=0.0.9" }, { name = "swt-bench", git = "https://github.com/logic-star-ai/swt-bench.git?rev=5fdcd446ff05e248ecfffc19d560a210699f71f8" }, { name = "tenacity", specifier = ">=9.1.2" }, { name = "toml" }, @@ -2544,7 +2546,7 @@ dev = [ [[package]] name = "openhands-sdk" -version = "1.16.1" +version = "1.16.0" source = { editable = "vendor/software-agent-sdk/openhands-sdk" } dependencies = [ { name = "agent-client-protocol" }, @@ -2552,7 +2554,7 @@ dependencies = [ { name = "fakeredis", extra = ["lua"] }, { name = "fastmcp" }, { name = "filelock" }, - { name = "httpx", extra = ["socks"] }, + { name = "httpx" }, { name = "litellm" }, { name = "lmnr" }, { name = "pydantic" }, @@ -2575,8 +2577,8 @@ requires-dist = [ { name = "fakeredis", extras = ["lua"], specifier = ">=2.32.1" }, { name = "fastmcp", specifier = ">=3.0.0" }, { name = "filelock", specifier = ">=3.20.1" }, - { name = "httpx", extras = ["socks"], specifier = ">=0.27.0" }, - { name = "litellm", specifier = ">=1.82.6,!=1.82.7,!=1.82.8" }, + { name = "httpx", specifier = ">=0.27.0" }, + { name = "litellm", specifier = "==1.80.10" }, { name = "lmnr", specifier = ">=0.7.24" }, { name = "pydantic", specifier = ">=2.12.5" }, { name = "python-frontmatter", specifier = ">=1.1.0" }, @@ -2588,7 +2590,7 @@ provides-extras = ["boto3"] [[package]] name = "openhands-tools" -version = "1.16.1" +version = "1.16.0" source = { editable = "vendor/software-agent-sdk/openhands-tools" } dependencies = [ { name = "bashlex" }, @@ -2617,7 +2619,7 @@ requires-dist = [ [[package]] name = "openhands-workspace" -version = "1.16.1" +version = "1.16.0" source = { editable = "vendor/software-agent-sdk/openhands-workspace" } dependencies = [ { name = "openhands-agent-server" }, @@ -6707,15 +6709,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e9/44/75a9c9421471a6c4805dbf2356f7c181a29c1879239abab1ea2cc8f38b40/sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2", size = 10235, upload-time = "2024-02-25T23:20:01.196Z" }, ] -[[package]] -name = "socksio" -version = "1.0.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/f8/5c/48a7d9495be3d1c651198fd99dbb6ce190e2274d0f28b9051307bdec6b85/socksio-1.0.0.tar.gz", hash = 
"sha256:f88beb3da5b5c38b9890469de67d0cb0f9d494b78b106ca1845f96c10b91c4ac", size = 19055, upload-time = "2020-04-17T15:50:34.664Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/37/c3/6eeb6034408dac0fa653d126c9204ade96b819c936e136c5e8a6897eee9c/socksio-1.0.0-py3-none-any.whl", hash = "sha256:95dc1f15f9b34e8d7b16f06d74b8ccf48f609af32ab33c608d08761c5dcbb1f3", size = 12763, upload-time = "2020-04-17T15:50:31.878Z" }, -] - [[package]] name = "sortedcontainers" version = "2.4.0" @@ -6841,6 +6834,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/36/67/981d8b642ac3eac7c8a7b7832ff8b2fb74f96b28b5fcd9a8979879e5c46d/swebench-4.1.0-py3-none-any.whl", hash = "sha256:1243776f720047cc9e20a427f7a52b75c13a07abda6154fb60fe77f82ec8af57", size = 157231, upload-time = "2025-09-11T02:57:58.953Z" }, ] +[[package]] +name = "swesmith" +version = "0.0.9" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/07/97/e506b20fa59debc66e4660a86b0e98b45d32c87f23b994ad739e9c5d542a/swesmith-0.0.9.tar.gz", hash = "sha256:1726124ea43577853c6efb0a5a0db5fa3ce5c340e1bed479afa5bab85d8a69da", size = 214830, upload-time = "2026-02-27T01:06:13.455Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/80/2d/71b6ac5dadbe7199085de3815624775744d51b6c554efeeddfb12dc45ce1/swesmith-0.0.9-py3-none-any.whl", hash = "sha256:cbb98a52fc573b38032cde1179b6ce5f5862ce7c31d6931cfd5b8ad4969ce900", size = 275800, upload-time = "2026-02-27T01:06:11.864Z" }, +] + [[package]] name = "swt-bench" version = "1.0.1" diff --git a/vendor/software-agent-sdk b/vendor/software-agent-sdk index 3e0a3a091..acd5adc96 160000 --- a/vendor/software-agent-sdk +++ b/vendor/software-agent-sdk @@ -1 +1 @@ -Subproject commit 3e0a3a0915b369c7e2057c77722e98585855d30a +Subproject commit acd5adc965c08a0f815cf8e5f3166d1d090034d6 From 3963e9c50c4c47bbe7a47f7ace44e1d1cf57dcef Mon Sep 17 00:00:00 2001 From: Chujun Tao Date: Wed, 22 Apr 2026 20:57:53 -0400 Subject: [PATCH 04/12] Revert "feat(skillsbench): migrate harness from Harbor to benchflow 0.3.0" This reverts commit 4d31c87c8fb0c7ff8341bc37debe06851ad2a67b. --- .gitignore | 1 - benchmarks/skillsbench/README.md | 42 +- benchmarks/skillsbench/config.py | 11 +- benchmarks/skillsbench/run_infer.py | 655 +++++++++------------------ tests/test_skillsbench_eval_infer.py | 17 + tests/test_skillsbench_run_infer.py | 442 +++++------------- uv.lock | 50 +- vendor/software-agent-sdk | 2 +- 8 files changed, 396 insertions(+), 824 deletions(-) diff --git a/.gitignore b/.gitignore index 9164fd12b..459fad588 100644 --- a/.gitignore +++ b/.gitignore @@ -216,5 +216,4 @@ workspace/ # Evaluation outputs eval_outputs/ -evaluation_outputs/ builds/ diff --git a/benchmarks/skillsbench/README.md b/benchmarks/skillsbench/README.md index 21339842c..60ff73652 100644 --- a/benchmarks/skillsbench/README.md +++ b/benchmarks/skillsbench/README.md @@ -1,10 +1,10 @@ # SkillsBench Evaluation -This module provides integration with [SkillsBench](https://www.skillsbench.ai/), a benchmark for evaluating AI agents on real-world skill-based tasks. The integration uses [benchflow](https://github.com/benchflow-ai/benchflow) as the evaluation harness with the `openhands` agent. +This module provides integration with [SkillsBench](https://www.skillsbench.ai/), a benchmark for evaluating AI agents on real-world skill-based tasks. The integration uses [Harbor](https://harborframework.com) as the evaluation harness with the `openhands-sdk` agent. 
## Overview -SkillsBench comprises tasks across 11 domains, evaluating the efficacy of Skills augmentation in LLM-based agents. Domains include: +SkillsBench comprises tasks across 11 domains, evaluating the efficacy of Skills augmentation in LLM-based agents.Domains contain - Software engineering - Office & white collar @@ -20,25 +20,23 @@ SkillsBench comprises tasks across 11 domains, evaluating the efficacy of Skills ## Prerequisites -1. **Install benchflow**: benchflow is the official harness for running SkillsBench. +1. **Install Harbor**: Harbor is the official harness for running SkillsBench. ```bash - uv tool install benchflow==0.3.0 + pip install harbor # or - pip install benchflow==0.3.0 - # or - uv pip install benchflow==0.3.0 + uv pip install harbor ``` -2. **Docker**: benchflow requires Docker to be installed and running. +2. **Docker**: Harbor requires Docker to be installed and running. -3. **LLM API Key**: Configure your LLM provider credentials. The benchflow `openhands` agent reads `LLM_API_KEY` and optional `LLM_BASE_URL` from the environment. +3. **LLM API Key**: Configure your LLM provider credentials. ## Usage ### Running Inference -Run the SkillsBench evaluation using the `openhands` agent: +Run the SkillsBench evaluation using the OpenHands SDK agent: ```bash # Run full evaluation @@ -64,7 +62,7 @@ Create an LLM configuration file (e.g., `.llm_config/claude.json`): ```json { "model": "anthropic/claude-sonnet-4-20250514", - "api_key": "YOUR_ANTHROPIC_API_KEY" + "api_key": "YOUR_API_KEY" } ``` @@ -101,6 +99,8 @@ Each line contains: { "instance_id": "benchflow/task-name", "test_result": { + "trial_name": "...", + "trial_uri": "...", "rewards": {"reward": 1.0}, "passed": true }, @@ -134,21 +134,22 @@ Each line contains: ## Architecture -The integration uses the benchflow CLI as the evaluation harness: +The integration follows the Harbor agent adapter pattern: -1. **Task download**: the integration clones the SkillsBench task repo locally when the task cache is empty -2. **benchflow job**: Runs all tasks concurrently with `openhands` -3. **Result conversion**: Trial `result.json` files are converted to the standard `output.jsonl` format +1. **Harbor Harness**: Manages task containers and lifecycle +2. **OpenHands SDK Agent**: Runs inside containers to solve tasks +3. 
**ATIF Trajectories**: Results stored in Agent Trajectory Interchange Format ```text ┌──────────────────────────────────────────────────┐ -│ benchflow job │ +│ Harbor Harness │ │ ┌────────────────────────────────────────────┐ │ -│ │ Task Container (Docker) │ │ +│ │ Task Container │ │ │ │ ┌──────────────────────────────────────┐ │ │ -│ │ │ openhands │ │ │ +│ │ │ OpenHands SDK Agent │ │ │ │ │ │ - Terminal tool │ │ │ │ │ │ - File editor tool │ │ │ +│ │ │ - Task tracker tool │ │ │ │ │ └──────────────────────────────────────┘ │ │ │ └────────────────────────────────────────────┘ │ └──────────────────────────────────────────────────┘ @@ -157,5 +158,6 @@ The integration uses the benchflow CLI as the evaluation harness: ## References - [SkillsBench](https://www.skillsbench.ai/) - The benchmark -- [benchflow](https://github.com/benchflow-ai/benchflow) - The evaluation harness -- [benchflow CLI reference](https://github.com/benchflow-ai/benchflow/blob/main/docs/cli-reference.md) - CLI documentation +- [Harbor](https://harborframework.com) - The evaluation harness +- [OpenHands SDK](https://github.com/OpenHands/software-agent-sdk) - The agent SDK +- [ATIF Specification](https://github.com/laude-institute/harbor/blob/main/docs/rfcs/0001-trajectory-format.md) - Trajectory format diff --git a/benchmarks/skillsbench/config.py b/benchmarks/skillsbench/config.py index 4ed541ab9..8b55a92b0 100644 --- a/benchmarks/skillsbench/config.py +++ b/benchmarks/skillsbench/config.py @@ -1,13 +1,16 @@ """SkillsBench configuration defaults.""" -# Default inference settings +# Default inference settings (only include values actually used by argparse) INFER_DEFAULTS = { "dataset": "benchflow/skillsbench", "output_dir": "./evaluation_outputs", "num_workers": 1, } -# benchflow configuration defaults -BENCHFLOW_DEFAULTS = { - "agent_name": "openhands", +# Harbor configuration defaults +HARBOR_DEFAULTS = { + # Harbor executable + "harbor_executable": "harbor", + # Default agent name for openhands-sdk + "agent_name": "openhands-sdk", } diff --git a/benchmarks/skillsbench/run_infer.py b/benchmarks/skillsbench/run_infer.py index 2e11a100a..a8afa7281 100644 --- a/benchmarks/skillsbench/run_infer.py +++ b/benchmarks/skillsbench/run_infer.py @@ -1,31 +1,24 @@ -"""SkillsBench inference script using the benchflow SDK. +"""SkillsBench inference script using Harbor with openhands-sdk agent. -This script runs SkillsBench evaluation using `benchflow job` as the harness -and `openhands` as the default agent. Results are saved in a format compatible +This script runs SkillsBench evaluation using Harbor as the harness +and openhands-sdk as the agent. Results are saved in a format compatible with the standard evaluation pipeline. 
Usage: - uv run skillsbench-infer - - # Run specific tasks - uv run skillsbench-infer --select tasks.txt + uv run skillsbench-infer --dataset benchflow/skillsbench """ import argparse import json import os -import re -import shutil import subprocess import sys -import tempfile from datetime import datetime, timezone from pathlib import Path -import yaml from pydantic import SecretStr -from benchmarks.skillsbench.config import BENCHFLOW_DEFAULTS, INFER_DEFAULTS +from benchmarks.skillsbench.config import HARBOR_DEFAULTS, INFER_DEFAULTS from benchmarks.utils.evaluation_utils import construct_eval_output_dir from benchmarks.utils.report_costs import generate_cost_report from openhands.sdk import LLM, get_logger @@ -33,469 +26,256 @@ logger = get_logger(__name__) -# Matches benchflow 0.3.0 job directory names: YYYY-MM-DD__HH-MM-SS -_TIMESTAMP_RE = re.compile(r"^\d{4}-\d{2}-\d{2}__\d{2}-\d{2}-\d{2}$") - -# "Total cost: $0.0487" -_COST_RE = re.compile(r"Total cost:\s*\$([0-9]+(?:\.[0-9]+)?)") -# "Tokens: ↑ input 404.21K • ... • ↓ output 7.83K" -_TOKENS_RE = re.compile(r"↑ input\s+([\d.]+)([KMB]?)\b.*?↓ output\s+([\d.]+)([KMB]?)\b") - +# Output filename for results OUTPUT_FILENAME = "output.jsonl" -TASK_REPOS = { - "skillsbench": { - "repo": "https://github.com/benchflow-ai/skillsbench.git", - "subdir": "tasks", - } -} - -_DIRECT_PROVIDER_ENV_VARS: dict[str, tuple[tuple[str, ...], str | None]] = { - "anthropic": (("ANTHROPIC_API_KEY",), "ANTHROPIC_BASE_URL"), - "gemini": (("GEMINI_API_KEY", "GOOGLE_API_KEY"), "GEMINI_BASE_URL"), - "google": (("GEMINI_API_KEY", "GOOGLE_API_KEY"), "GEMINI_BASE_URL"), - "openai": (("OPENAI_API_KEY",), "OPENAI_BASE_URL"), -} - - -def _infer_direct_provider(model: str) -> str | None: - """Infer the provider prefix for direct model names. - - Examples: - - gemini/gemini-2.5-pro -> gemini - - anthropic/claude-sonnet-4-5 -> anthropic - - litellm_proxy/anthropic/... -> None (proxy config uses LLM_* vars) - """ - if not model or model.startswith("litellm_proxy/"): - return None - if "/" in model: - provider = model.split("/", 1)[0].lower() - if provider in _DIRECT_PROVIDER_ENV_VARS: - return provider - return None - - -def _build_benchflow_agent_env(llm: LLM) -> dict[str, str]: - """Build the sandbox environment for benchflow's openhands agent. - - Only LLM-specific variables are returned — these go INTO the sandbox - container via the ``agent_env`` YAML key. The calling process inherits - the host environment normally; dumping ``os.environ`` here would leak - the entire host env into every container. - """ - env: dict[str, str] = {} - api_key: str | None = None - if llm.api_key: - api_key = ( - llm.api_key.get_secret_value() - if isinstance(llm.api_key, SecretStr) - else llm.api_key - ) - env["LLM_API_KEY"] = api_key - if llm.base_url: - env["LLM_BASE_URL"] = llm.base_url - - provider = _infer_direct_provider(llm.model) - if provider and api_key: - key_vars, base_url_var = _DIRECT_PROVIDER_ENV_VARS[provider] - for var_name in key_vars: - env[var_name] = api_key - if llm.base_url and base_url_var: - env[base_url_var] = llm.base_url - - return env - - -def check_benchflow_installed() -> bool: - """Check if benchflow CLI is installed and available. - - Tries ``bench`` first (current name), then falls back to the legacy - ``benchflow`` binary. 
- """ - for cmd in ("bench", "benchflow"): - try: - result = subprocess.run( - [cmd, "--help"], - capture_output=True, - text=True, - timeout=10, - ) - if result.returncode == 0: - return True - except (FileNotFoundError, subprocess.TimeoutExpired): - continue - return False - -def _resolve_task_repo(dataset: str) -> tuple[str, dict[str, str]]: - """Map a benchflow dataset name to its task repository metadata.""" - dataset_name = dataset.split("@", 1)[0].split("/")[-1] +def check_harbor_installed() -> bool: + """Check if harbor CLI is installed and available.""" + harbor_exe = HARBOR_DEFAULTS["harbor_executable"] try: - return dataset_name, TASK_REPOS[dataset_name] - except KeyError as exc: - raise ValueError( - f"Unsupported SkillsBench dataset: {dataset!r}. " - f"Known datasets: {sorted(TASK_REPOS)}" - ) from exc - - -def ensure_tasks( - dataset: str, - tasks_dir: Path, - task_ids: list[str] | None = None, -) -> None: - """Download tasks for a benchflow dataset into tasks_dir. - - BenchFlow 0.3.0 does not expose ``benchflow tasks pull``, so we clone the - benchmark task repository directly when the local tasks directory is empty. - - When *task_ids* is provided, a sparse checkout is used so only the - requested task subdirectories are downloaded — much faster than a full - clone for large repos. - """ - if tasks_dir.exists() and any(tasks_dir.iterdir()): - logger.info(f"Tasks already present in {tasks_dir}, skipping download") - return - - _, repo_info = _resolve_task_repo(dataset) - tasks_dir.mkdir(parents=True, exist_ok=True) - clone_dir = tasks_dir.parent / "_clone" - if clone_dir.exists(): - shutil.rmtree(clone_dir, ignore_errors=True) - - subdir = repo_info.get("subdir", "") - - if task_ids: - # Sparse checkout: only download the specific task directories - short_names = [tid.split("/")[-1] for tid in task_ids] - - cmd_clone = [ - "git", - "clone", - "--no-checkout", - "--depth", - "1", - repo_info["repo"], - str(clone_dir), - ] - logger.info(f"Sparse clone: {' '.join(cmd_clone)}") - result = subprocess.run(cmd_clone, capture_output=True, text=True) - if result.returncode != 0: - raise RuntimeError(f"task download failed: {result.stderr}") - - # Init sparse-checkout and set the desired paths - subprocess.run( - ["git", "-C", str(clone_dir), "sparse-checkout", "init", "--cone"], + result = subprocess.run( + [harbor_exe, "--version"], capture_output=True, text=True, - check=True, + timeout=10, ) - sparse_paths = [f"{subdir}/{name}" if subdir else name for name in short_names] - subprocess.run( - ["git", "-C", str(clone_dir), "sparse-checkout", "set", *sparse_paths], - capture_output=True, - text=True, - check=True, - ) - subprocess.run( - ["git", "-C", str(clone_dir), "checkout"], - capture_output=True, - text=True, - check=True, - ) - else: - # Full shallow clone - cmd = ["git", "clone", "--depth", "1", repo_info["repo"], str(clone_dir)] - logger.info(f"Downloading tasks: {' '.join(cmd)}") - result = subprocess.run(cmd, capture_output=True, text=True) - if result.returncode != 0: - logger.error(f"Failed to clone tasks: {result.stderr}") - raise RuntimeError(f"task download failed: {result.stderr}") - - try: - source_dir = clone_dir / subdir if subdir else clone_dir - - for entry in source_dir.iterdir(): - target = tasks_dir / entry.name - if entry.is_dir(): - shutil.copytree(entry, target, dirs_exist_ok=True) - else: - shutil.copy2(entry, target) - finally: - shutil.rmtree(clone_dir, ignore_errors=True) + return result.returncode == 0 + except (FileNotFoundError, 
subprocess.TimeoutExpired): + return False - logger.info(f"Tasks downloaded to {tasks_dir}") - -def run_benchflow_job( +def run_harbor_evaluation( llm: LLM, - tasks_dir: Path, - jobs_dir: Path, + dataset: str, + output_dir: str, num_workers: int = 1, task_ids: list[str] | None = None, + n_limit: int | None = None, ) -> Path: - """Run benchflow job command. + """Run harbor evaluation with openhands-sdk agent. Args: llm: LLM configuration for the agent. - tasks_dir: Path to directory containing task subdirectories. - jobs_dir: Directory for benchflow job output. - num_workers: Number of parallel workers (concurrency). - task_ids: Optional list of task IDs to filter (short names, not full paths). + dataset: Harbor dataset name (e.g., benchflow/skillsbench). + output_dir: Directory to store output files. + num_workers: Number of parallel workers. + task_ids: Optional list of specific task IDs to run. + n_limit: Optional maximum number of dataset tasks to run. Returns: - Path to jobs_dir. + Path to the harbor output directory. """ - jobs_dir.mkdir(parents=True, exist_ok=True) - - agent_env = _build_benchflow_agent_env(llm) - # Ubuntu 24.04 enforces PEP 668 and blocks bare `pip install` without - # --break-system-packages. benchflow's openhands install_cmd uses plain - # `pip install openhands`, which silently fails (exit 0) on Ubuntu 24.04, - # causing "Agent openhands install failed (rc=1)". Setting this env var - # makes pip skip the restriction without modifying the install_cmd. - agent_env.setdefault("PIP_BREAK_SYSTEM_PACKAGES", "1") - config = { - "tasks_dir": str(tasks_dir), - "jobs_dir": str(jobs_dir.resolve()), - "agent": BENCHFLOW_DEFAULTS["agent_name"], - "model": llm.model, - "environment": "docker", - "concurrency": num_workers, - # OpenHands is installed inside the sandbox as root by benchflow's - # registry install command. Running as the default "agent" user can - # lose access to that binary on some task images. - "sandbox_user": None, - "agent_env": agent_env, - } - - with tempfile.NamedTemporaryFile( - mode="w", suffix=".yaml", prefix="benchflow-job-", delete=False - ) as tmp: - yaml.safe_dump(config, tmp, sort_keys=False) - config_path = tmp.name - - # Prefer `bench eval create` (current), fall back to legacy `benchflow job` - bench_bin = shutil.which("bench") or shutil.which("benchflow") or "bench" - if "benchflow" in bench_bin: - cmd = [bench_bin, "job", "--config", config_path] - else: - cmd = [bench_bin, "eval", "create", "-f", config_path] + harbor_output_dir = Path(output_dir) / "harbor_output" + harbor_output_dir.mkdir(parents=True, exist_ok=True) + harbor_exe = HARBOR_DEFAULTS["harbor_executable"] + + # Build harbor command using harbor CLI flags. + # Use absolute path for --jobs-dir to avoid CWD-relative path issues. 
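+    # Illustrative shape of the assembled command (placeholder values; extended
+    # below with --ae, --include-task-name, and --n-tasks when applicable):
+    #   harbor run -d benchflow/skillsbench -a openhands-sdk -m <model> \
+    #     --jobs-dir /abs/path/to/harbor_output --n-concurrent 4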
+ cmd = [ + harbor_exe, + "run", + "-d", + dataset, + "-a", + HARBOR_DEFAULTS["agent_name"], + "-m", + llm.model, + "--jobs-dir", + str(harbor_output_dir.resolve()), + "--n-concurrent", + str(num_workers), + ] + + # Pass LLM credentials as agent environment variables + if llm.api_key: + api_key = ( + llm.api_key.get_secret_value() + if isinstance(llm.api_key, SecretStr) + else llm.api_key + ) + cmd.extend(["--ae", f"LLM_API_KEY={api_key}"]) + if llm.base_url: + cmd.extend(["--ae", f"LLM_BASE_URL={llm.base_url}"]) - logger.info(f"Running: {' '.join(cmd)}") + # Add specific task names if provided + if task_ids: + for task_id in task_ids: + cmd.extend(["--include-task-name", task_id]) - # Inject LLM vars into the host process env so benchflow's provider - # resolution can pick them up; the subprocess inherits normally (env=None). - host_env = os.environ.copy() - host_env.update(agent_env) - result = subprocess.run(cmd, capture_output=True, text=True, env=host_env) - Path(config_path).unlink(missing_ok=True) + if n_limit is not None: + cmd.extend(["--n-tasks", str(n_limit)]) - if result.returncode != 0: - logger.error(f"benchflow job failed (code {result.returncode})") - logger.error(f"stdout: {result.stdout}") - logger.error(f"stderr: {result.stderr}") - raise RuntimeError(f"benchflow job failed: {result.stderr}") + logger.info(f"Running harbor command: {' '.join(cmd)}") + logger.info(f"Output directory: {harbor_output_dir}") - logger.info("benchflow job completed") - logger.info(f"stdout: {result.stdout}") + # harbor's openhands-sdk agent reads LLM credentials from the host process + # environment (os.environ), not from --ae flags which go to the sandbox. + env = os.environ.copy() + if llm.api_key: + api_key = ( + llm.api_key.get_secret_value() + if isinstance(llm.api_key, SecretStr) + else llm.api_key + ) + env["LLM_API_KEY"] = api_key + if llm.base_url: + env["LLM_BASE_URL"] = llm.base_url - return jobs_dir + try: + result = subprocess.run( + cmd, + capture_output=True, + text=True, + env=env, + ) + if result.returncode != 0: + logger.error(f"Harbor command failed with code {result.returncode}") + logger.error(f"stdout: {result.stdout}") + logger.error(f"stderr: {result.stderr}") + raise RuntimeError(f"Harbor evaluation failed: {result.stderr}") -def _extract_trial_metrics(trial_dir: Path) -> dict: - """Extract token/cost metrics from benchflow 0.3.0 trial output files. + logger.info("Harbor evaluation completed successfully") + logger.info(f"stdout: {result.stdout}") - benchflow 0.3.0 does not write cost/token fields to result.json. - Instead, metrics are read from: - 1. agent/trajectory.json → final_metrics (harbor-format agent) - 2. agent/openhands.txt → "Total cost:" and "Tokens:" lines (ACP agent) - """ - # 1. Harbor-format trajectory.json written by openhands-sdk agent - traj_file = trial_dir / "agent" / "trajectory.json" - if traj_file.exists(): - try: - with open(traj_file) as f: - traj = json.load(f) - fm = traj.get("final_metrics") or {} - if fm: - return { - "total_prompt_tokens": int(fm.get("total_prompt_tokens") or 0), - "total_completion_tokens": int( - fm.get("total_completion_tokens") or 0 - ), - "total_cost_usd": float(fm.get("total_cost_usd") or 0.0), - } - except (json.JSONDecodeError, OSError): - pass - - # 2. 
ACP agent log written by openhands acp (benchflow 0.3.0 native) - def _parse_token_count(value: str, suffix: str) -> int: - n = float(value) - return int( - n * {"K": 1_000, "M": 1_000_000, "B": 1_000_000_000}.get(suffix.upper(), 1) + except FileNotFoundError: + raise RuntimeError( + "Harbor CLI not found. Please install harbor: pip install harbor" ) - for log_name in ("openhands.txt", "openhands_sdk.txt"): - log_file = trial_dir / "agent" / log_name - if not log_file.exists(): - continue - try: - text = log_file.read_text(errors="replace") - cost_usd = 0.0 - prompt_tokens = 0 - completion_tokens = 0 - m = _COST_RE.search(text) - if m: - cost_usd = float(m.group(1)) - m = _TOKENS_RE.search(text) - if m: - prompt_tokens = _parse_token_count(m.group(1), m.group(2)) - completion_tokens = _parse_token_count(m.group(3), m.group(4)) - if cost_usd or prompt_tokens: - return { - "total_prompt_tokens": prompt_tokens, - "total_completion_tokens": completion_tokens, - "total_cost_usd": cost_usd, - } - except OSError: - pass - - return { - "total_prompt_tokens": 0, - "total_completion_tokens": 0, - "total_cost_usd": 0.0, - } + return harbor_output_dir + + +def _find_job_dir(harbor_output_dir: Path) -> Path: + """Find the harbor job directory (timestamp-named) inside the output dir.""" + # Harbor creates a timestamp-named subdirectory (e.g., 2026-03-07__16-08-47) + # containing result.json and trial subdirectories + candidates = [ + d + for d in harbor_output_dir.iterdir() + if d.is_dir() and (d / "result.json").exists() + ] + if not candidates: + raise RuntimeError( + f"No harbor job directory found in {harbor_output_dir}. " + f"Expected a timestamp-named directory containing result.json." + ) + # Use the most recent job directory if multiple exist + return sorted(candidates)[-1] -def convert_benchflow_to_eval_output( - jobs_dir: Path, +def convert_harbor_to_eval_output( + harbor_output_dir: Path, eval_output_path: Path, - task_ids: list[str] | None = None, ) -> None: - """Convert benchflow job output to standard evaluation output format. + """Convert harbor output to evaluation output format. - benchflow 0.3.0 stores trial results as: - jobs_dir/YYYY-MM-DD__HH-MM-SS/TASK_NAME__UUID8/result.json + Harbor stores trial results in a job directory structured as: + harbor_output/TIMESTAMP/TRIAL_NAME/result.json - Each result.json contains task_name, rewards, error, verifier_error, and timing. + Each trial's result.json contains task_name, verifier_result, agent_result, + timing info, and exception details. Args: - jobs_dir: Path to benchflow jobs directory. - eval_output_path: Path to write output.jsonl. - task_ids: Optional filter for specific task IDs (short names). + harbor_output_dir: Path to harbor output directory. + eval_output_path: Path to write the converted output.jsonl. """ - logger.info(f"Converting benchflow output from {jobs_dir}") - - # benchflow 0.3.0 writes: - # jobs/summary.json - # jobs/TIMESTAMP/TRIAL_NAME/result.json - # while older local outputs may place results directly under jobs/. 
- job_dirs = [d for d in jobs_dir.iterdir() if d.is_dir()] - timestamp_job_dirs = [d for d in job_dirs if _TIMESTAMP_RE.match(d.name)] - - if timestamp_job_dirs: - selected_job_dir = sorted(timestamp_job_dirs)[-1] - logger.info(f"Using benchflow job directory: {selected_job_dir}") - task_dirs = [d for d in selected_job_dir.iterdir() if d.is_dir()] - else: - task_dirs = job_dirs + logger.info(f"Converting harbor output from {harbor_output_dir}") - if not task_dirs: - raise RuntimeError(f"No task directories found in {jobs_dir}") + job_dir = _find_job_dir(harbor_output_dir) + logger.info(f"Using harbor job directory: {job_dir}") - if task_ids: - short_ids = {tid.split("/")[-1] for tid in task_ids} - task_dirs = [d for d in task_dirs if d.name.split("__")[0] in short_ids] + # Find trial result files (each trial dir has a result.json) + result_files = list(job_dir.glob("*/result.json")) + # Exclude the job-level result.json + result_files = [f for f in result_files if f.parent != job_dir] + + if not result_files: + raise RuntimeError( + f"No trial result files found in {job_dir}. " + f"Expected result.json files in trial subdirectories." + ) - logger.info(f"Processing {len(task_dirs)} task directories") + logger.info(f"Found {len(result_files)} trial results in {job_dir}") results: list[dict] = [] errors: list[dict] = [] - for task_dir in sorted(task_dirs): - # Find the trial result — benchflow writes trial-0/result.json - trial_results = list(task_dir.glob("trial-*/result.json")) - if not trial_results: - # Fall back to a direct result.json - direct = task_dir / "result.json" - if direct.exists(): - trial_results = [direct] - - if not trial_results: - logger.warning(f"No result.json found in {task_dir}, skipping") - errors.append( - { - "instance_id": f"benchflow/{task_dir.name}", - "error": "No result.json found", - "test_result": {}, - } - ) - continue - - # Use the last trial (highest retry index) - result_file = sorted(trial_results)[-1] - + for result_file in result_files: try: with open(result_file) as f: trial = json.load(f) - task_basename = task_dir.name.split("__")[0] - task_name = trial.get("task_name") or f"benchflow/{task_basename}" - # Normalise to benchflow/ form - if "/" not in task_name: - task_name = f"benchflow/{task_name}" + instance_id = trial.get("task_name", result_file.parent.name) - error = trial.get("error") - verifier_error = trial.get("verifier_error") - - if error or verifier_error: + # Check for exceptions + if trial.get("exception_info"): errors.append( { - "instance_id": task_name, - "error": str(error or verifier_error), + "instance_id": instance_id, + "error": str(trial["exception_info"]), "test_result": {}, } ) continue - rewards = trial.get("rewards") or {} - passed = bool(rewards.get("reward", 0.0)) + # Extract verifier results + verifier_result = trial.get("verifier_result", {}) + rewards = verifier_result.get("rewards", {}) + passed = rewards.get("reward", 0.0) > 0 + + # Extract agent metrics + agent_result = trial.get("agent_result", {}) eval_entry = { - "instance_id": task_name, + "instance_id": instance_id, "test_result": { + "trial_name": trial.get("trial_name"), + "trial_uri": trial.get("trial_uri"), "rewards": rewards, "passed": passed, }, "instruction": "", "error": None, "history": [], - "metrics": _extract_trial_metrics(result_file.parent), + "metrics": { + "total_prompt_tokens": agent_result.get("n_input_tokens") or 0, + "total_completion_tokens": ( + agent_result.get("n_output_tokens") or 0 + ), + "total_cost_usd": agent_result.get("cost_usd") 
or 0.0, + }, } results.append(eval_entry) - logger.info(f"Processed {task_name}: reward={rewards.get('reward', 'N/A')}") + logger.info( + f"Processed trial {instance_id}: reward={rewards.get('reward', 'N/A')}" + ) except (json.JSONDecodeError, OSError) as e: - logger.error(f"Failed to read {result_file}: {e}") + logger.error(f"Failed to process result file {result_file}: {e}") errors.append( { - "instance_id": f"benchflow/{task_dir.name}", + "instance_id": result_file.parent.name, "error": str(e), "test_result": {}, } ) if not results and not errors: - raise RuntimeError(f"No trials processed from {jobs_dir}") + raise RuntimeError(f"No trials processed from {harbor_output_dir}") if not results: logger.warning( - f"All {len(errors)} trials failed; writing error entries for reporting" + f"All {len(errors)} trials failed in {harbor_output_dir}; " + "writing error entries for downstream reporting" ) + # Write results to output.jsonl with open(eval_output_path, "w") as f: - for entry in results + errors: + for entry in results: + f.write(json.dumps(entry) + "\n") + for entry in errors: f.write(json.dumps(entry) + "\n") logger.info( @@ -518,18 +298,18 @@ def load_task_ids_from_file(filepath: str) -> list[str]: def main() -> None: """Main entry point for skillsbench inference.""" parser = argparse.ArgumentParser( - description="Run SkillsBench evaluation with benchflow and openhands", + description="Run SkillsBench evaluation with openhands-sdk via Harbor", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" Examples: # Run full skillsbench evaluation uv run skillsbench-infer .llm_config/claude.json - # Run specific tasks from a file + # Run specific tasks uv run skillsbench-infer .llm_config/claude.json --select tasks.txt - # Run with more concurrency - uv run skillsbench-infer .llm_config/claude.json --num-workers 4 + # Run with custom dataset version + uv run skillsbench-infer .llm_config/claude.json --dataset benchflow/skillsbench@1.0 """, ) @@ -542,7 +322,7 @@ def main() -> None: "--dataset", type=str, default=INFER_DEFAULTS["dataset"], - help="benchflow dataset name (e.g., benchflow/skillsbench)", + help="Harbor dataset name (e.g., benchflow/skillsbench)", ) parser.add_argument( "--output-dir", @@ -554,12 +334,12 @@ def main() -> None: "--num-workers", type=int, default=INFER_DEFAULTS["num_workers"], - help="Number of parallel workers (concurrency)", + help="Number of parallel workers", ) parser.add_argument( "--n-limit", type=int, - help="Maximum number of tasks to run", + help="Maximum number of dataset tasks to run after Harbor filtering", ) parser.add_argument( "--select", @@ -578,13 +358,14 @@ def main() -> None: help="Optional note for the evaluation run", ) parser.add_argument( - "--skip-run", + "--skip-harbor", action="store_true", - help="Skip running benchflow and only convert existing results", + help="Skip running harbor and only convert existing results", ) args = parser.parse_args() + # Validate LLM config if not os.path.isfile(args.llm_config_path): logger.error(f"LLM config file does not exist: {args.llm_config_path}") sys.exit(1) @@ -594,103 +375,87 @@ def main() -> None: llm = LLM.model_validate_json(llm_config) logger.info(f"Using LLM: {llm.model}") - if not args.skip_run and not check_benchflow_installed(): + # Check harbor installation + if not args.skip_harbor and not check_harbor_installed(): logger.error( - "benchflow CLI is not installed. Please install it:\n" - " uv tool install benchflow==0.3.0\n" + "Harbor CLI is not installed. 
Please install it:\n" + " pip install harbor\n" " # or\n" - " pip install benchflow==0.3.0\n" - " # or\n" - " uv pip install benchflow==0.3.0" + " uv pip install harbor" ) sys.exit(1) + # Construct output directory dataset_description = args.dataset.replace("/", "__").replace("@", "-") structured_output_dir = construct_eval_output_dir( base_dir=args.output_dir, dataset_name=dataset_description, model_name=llm.model, - max_iterations=100, + max_iterations=100, # Not directly used but required for path construction eval_note=args.note, ) logger.info(f"Output directory: {structured_output_dir}") os.makedirs(structured_output_dir, exist_ok=True) + # Save metadata metadata = { "llm": llm.model_dump_json(), "dataset": args.dataset, "timestamp": datetime.now(timezone.utc).isoformat(), - "benchflow_agent": BENCHFLOW_DEFAULTS["agent_name"], + "harbor_agent": HARBOR_DEFAULTS["agent_name"], "note": args.note, } metadata_path = Path(structured_output_dir) / "metadata.json" with open(metadata_path, "w") as f: json.dump(metadata, f, indent=2) + # Collect task IDs if specified task_ids: list[str] | None = None if args.select: - task_ids = load_task_ids_from_file(args.select) - logger.info(f"Loaded {len(task_ids)} task IDs from {args.select}") + loaded_ids = load_task_ids_from_file(args.select) + task_ids = loaded_ids + logger.info(f"Loaded {len(loaded_ids)} task IDs from {args.select}") elif args.task_id: task_ids = list(args.task_id) logger.info(f"Running {len(task_ids)} specified task IDs") - tasks_dir = Path(structured_output_dir) / "tasks" - jobs_dir = Path(structured_output_dir) / "jobs" output_path = Path(structured_output_dir) / OUTPUT_FILENAME - if not args.skip_run: + if not args.skip_harbor: + # Run harbor evaluation try: - ensure_tasks(args.dataset, tasks_dir, task_ids=task_ids) - - # Apply n_limit by slicing available task directories - effective_task_dirs = tasks_dir - if args.n_limit is not None or task_ids is not None: - all_dirs = sorted(d for d in tasks_dir.iterdir() if d.is_dir()) - if task_ids: - short_ids = {tid.split("/")[-1] for tid in task_ids} - all_dirs = [d for d in all_dirs if d.name in short_ids] - if args.n_limit is not None: - all_dirs = all_dirs[: args.n_limit] - - # Write a filtered tasks dir symlink tree - filtered_tasks_dir = Path(structured_output_dir) / "tasks_filtered" - filtered_tasks_dir.mkdir(exist_ok=True) - for d in all_dirs: - link = filtered_tasks_dir / d.name - if not link.exists(): - link.symlink_to(d.resolve()) - effective_task_dirs = filtered_tasks_dir - - run_benchflow_job( + harbor_output_dir = run_harbor_evaluation( llm=llm, - tasks_dir=effective_task_dirs, - jobs_dir=jobs_dir, + dataset=args.dataset, + output_dir=structured_output_dir, num_workers=args.num_workers, task_ids=task_ids, + n_limit=args.n_limit, ) - convert_benchflow_to_eval_output( - jobs_dir=jobs_dir, + # Convert harbor output to standard format + convert_harbor_to_eval_output( + harbor_output_dir=harbor_output_dir, eval_output_path=output_path, - task_ids=task_ids, ) except Exception as e: logger.error(f"Evaluation failed: {e}") sys.exit(1) else: - if jobs_dir.exists(): - convert_benchflow_to_eval_output( - jobs_dir=jobs_dir, + # Skip harbor, just convert existing results + harbor_output_dir = Path(structured_output_dir) / "harbor_output" + if harbor_output_dir.exists(): + convert_harbor_to_eval_output( + harbor_output_dir=harbor_output_dir, eval_output_path=output_path, - task_ids=task_ids, ) else: - logger.error(f"No jobs output found at {jobs_dir}") + logger.error(f"No harbor output 
found at {harbor_output_dir}") sys.exit(1) + # Generate cost report if output_path.exists(): generate_cost_report(str(output_path)) diff --git a/tests/test_skillsbench_eval_infer.py b/tests/test_skillsbench_eval_infer.py index 1334da297..56d54f27a 100644 --- a/tests/test_skillsbench_eval_infer.py +++ b/tests/test_skillsbench_eval_infer.py @@ -39,6 +39,23 @@ def test_resolved_instance(self, tmp_path: Path) -> None: assert result["unresolved_instances"] == 0 assert "benchflow/weighted-gdp-calc" in result["resolved_ids"] + def test_unresolved_instance(self, tmp_path: Path) -> None: + """Test processing an unresolved (passed=False) instance.""" + input_file = tmp_path / "unresolved.jsonl" + output_file = tmp_path / "unresolved.report.json" + + entry = { + "instance_id": "benchflow/task-1", + "test_result": {"passed": False, "rewards": {"reward": 0.0}}, + "error": None, + } + input_file.write_text(json.dumps(entry) + "\n") + + result = process_skillsbench_results(str(input_file), str(output_file)) + + assert result["resolved_instances"] == 0 + assert result["unresolved_instances"] == 1 + def test_instance_with_error(self, tmp_path: Path) -> None: """Test processing an instance that errored.""" input_file = tmp_path / "error.jsonl" diff --git a/tests/test_skillsbench_run_infer.py b/tests/test_skillsbench_run_infer.py index 784b4d1cc..5f8452cb3 100644 --- a/tests/test_skillsbench_run_infer.py +++ b/tests/test_skillsbench_run_infer.py @@ -4,102 +4,82 @@ from pathlib import Path import pytest -import yaml -from benchmarks.skillsbench.config import BENCHFLOW_DEFAULTS, INFER_DEFAULTS +from benchmarks.skillsbench.config import INFER_DEFAULTS from benchmarks.skillsbench.run_infer import ( - _build_benchflow_agent_env, - convert_benchflow_to_eval_output, - run_benchflow_job, + convert_harbor_to_eval_output, + run_harbor_evaluation, ) from openhands.sdk import LLM -class TestRunBenchflowJob: - """Tests for building benchflow job invocation arguments.""" +class TestRunHarborEvaluation: + """Tests for building Harbor invocation arguments.""" - def test_default_dataset_matches_benchflow_registry(self) -> None: - """Test that the default dataset name matches benchflow's published registry.""" + def test_default_dataset_matches_harbor_registry(self) -> None: + """Test that the default dataset name matches Harbor's published registry.""" assert INFER_DEFAULTS["dataset"] == "benchflow/skillsbench" - def test_default_agent_is_openhands(self) -> None: - """Test that the default agent is openhands.""" - assert BENCHFLOW_DEFAULTS["agent_name"] == "openhands" - - def test_run_benchflow_job_passes_model_and_concurrency( + def test_run_harbor_evaluation_passes_filters_and_limits( self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch ) -> None: - """Test benchflow job command writes the expected YAML config.""" - captured_cmd: list[str] = [] - captured_env: dict[str, str] = {} - captured_config: dict = {} - - # Force legacy benchflow binary path so the command format is deterministic - monkeypatch.setattr( - "benchmarks.skillsbench.run_infer.shutil.which", - lambda name: "/usr/local/bin/benchflow" if name == "benchflow" else None, - ) + """Test Harbor command includes task filters and n-limit.""" + captured: dict[str, list[str]] = {} def fake_run(cmd: list[str], capture_output: bool, text: bool, env: dict): - captured_cmd[:] = cmd - captured_env.clear() - captured_env.update(env) - with open(cmd[3]) as f: - captured_config.update(yaml.safe_load(f)) + captured["cmd"] = cmd return type( "Completed", (), - {"returncode": 
0, "stdout": "Score: 1/1 (100%)", "stderr": ""}, + {"returncode": 0, "stdout": "ok", "stderr": ""}, )() monkeypatch.setattr("benchmarks.skillsbench.run_infer.subprocess.run", fake_run) - tasks_dir = tmp_path / "tasks" - tasks_dir.mkdir() - jobs_dir = tmp_path / "jobs" - - run_benchflow_job( + harbor_output_dir = run_harbor_evaluation( llm=LLM( - model="anthropic/claude-sonnet-4-5", + model="litellm_proxy/test-model", api_key="test-key", base_url="https://proxy.example.com", ), - tasks_dir=tasks_dir, - jobs_dir=jobs_dir, - num_workers=4, - ) - - cmd = captured_cmd - assert cmd[0] == "/usr/local/bin/benchflow" - assert cmd[1] == "job" - assert cmd[2] == "--config" - assert captured_config["tasks_dir"] == str(tasks_dir) - assert captured_config["jobs_dir"] == str(jobs_dir.resolve()) - assert captured_config["agent"] == "openhands" - assert captured_config["model"] == "anthropic/claude-sonnet-4-5" - assert captured_config["concurrency"] == 4 - assert captured_config["sandbox_user"] is None - - def test_llm_credentials_passed_via_subprocess_env( + dataset=INFER_DEFAULTS["dataset"], + output_dir=str(tmp_path), + num_workers=2, + task_ids=["benchflow/task-a", "benchflow/task-b"], + n_limit=3, + ) + + expected_output_dir = tmp_path / "harbor_output" + assert harbor_output_dir == expected_output_dir + + cmd = captured["cmd"] + assert cmd[:8] == [ + "harbor", + "run", + "-d", + "benchflow/skillsbench", + "-a", + "openhands-sdk", + "-m", + "litellm_proxy/test-model", + ] + assert "--jobs-dir" in cmd + assert str(expected_output_dir.resolve()) in cmd + assert cmd.count("--include-task-name") == 2 + assert "benchflow/task-a" in cmd + assert "benchflow/task-b" in cmd + assert cmd[cmd.index("--n-concurrent") + 1] == "2" + assert cmd[cmd.index("--n-tasks") + 1] == "3" + + def test_llm_credentials_passed_via_env( self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch ) -> None: - """Test that LLM credentials are passed via subprocess env and YAML.""" - captured_cmd: list[str] = [] - captured_env: dict[str, str] = {} - captured_config: dict = {} - - # Force legacy benchflow binary path so the command format is deterministic - monkeypatch.setattr( - "benchmarks.skillsbench.run_infer.shutil.which", - lambda name: "/usr/local/bin/benchflow" if name == "benchflow" else None, - ) + """Test that LLM credentials are passed via subprocess env, not --ae flags.""" + captured: dict = {} def fake_run(cmd: list[str], capture_output: bool, text: bool, env: dict): - captured_cmd[:] = cmd - captured_env.clear() - captured_env.update(env) - with open(cmd[3]) as f: - captured_config.update(yaml.safe_load(f)) + captured["cmd"] = cmd + captured["env"] = env return type( "Completed", (), @@ -108,127 +88,60 @@ def fake_run(cmd: list[str], capture_output: bool, text: bool, env: dict): monkeypatch.setattr("benchmarks.skillsbench.run_infer.subprocess.run", fake_run) - tasks_dir = tmp_path / "tasks" - tasks_dir.mkdir() - - run_benchflow_job( + run_harbor_evaluation( llm=LLM( model="test-model", api_key="my-secret-key", base_url="https://my-proxy.example.com", ), - tasks_dir=tasks_dir, - jobs_dir=tmp_path / "jobs", - ) - - # Credentials in subprocess env - assert captured_env["LLM_API_KEY"] == "my-secret-key" - assert captured_env["LLM_BASE_URL"] == "https://my-proxy.example.com" - assert "--ae" not in captured_cmd - assert captured_config["agent_env"]["LLM_API_KEY"] == "my-secret-key" - assert ( - captured_config["agent_env"]["LLM_BASE_URL"] - == "https://my-proxy.example.com" - ) - - def 
test_direct_gemini_model_sets_provider_env_vars(self) -> None: - """Direct provider models need provider-specific env vars.""" - env = _build_benchflow_agent_env( - LLM( - model="gemini/gemini-3.1-flash-lite-preview", - api_key="gemini-test-key", - ) - ) - - assert env["LLM_API_KEY"] == "gemini-test-key" - assert env["GEMINI_API_KEY"] == "gemini-test-key" - assert env["GOOGLE_API_KEY"] == "gemini-test-key" - - def test_proxy_model_does_not_set_provider_env_vars(self) -> None: - """LiteLLM proxy configs should keep using generic LLM_* vars only.""" - env = _build_benchflow_agent_env( - LLM( - model="litellm_proxy/anthropic/claude-sonnet-4-20250514", - api_key="proxy-key", - base_url="https://proxy.example.com", - ) + dataset=INFER_DEFAULTS["dataset"], + output_dir=str(tmp_path), ) - assert env["LLM_API_KEY"] == "proxy-key" - assert env["LLM_BASE_URL"] == "https://proxy.example.com" - assert "ANTHROPIC_API_KEY" not in env - assert "ANTHROPIC_BASE_URL" not in env + assert captured["env"]["LLM_API_KEY"] == "my-secret-key" + assert captured["env"]["LLM_BASE_URL"] == "https://my-proxy.example.com" -class TestConvertBenchflowToEvalOutput: - """Tests for convert_benchflow_to_eval_output function.""" +class TestConvertHarborToEvalOutput: + """Tests for convert_harbor_to_eval_output function.""" - def _create_benchflow_structure( - self, tmp_path: Path, tasks: list[tuple[str, dict]] - ) -> Path: - """Create a mock benchflow jobs directory structure. - - benchflow writes: jobs_dir/TASK_NAME/trial-0/result.json - """ - jobs_dir = tmp_path / "jobs" - for task_name, result in tasks: - trial_dir = jobs_dir / task_name / "trial-0" - trial_dir.mkdir(parents=True) - (trial_dir / "result.json").write_text(json.dumps(result)) - return jobs_dir - - def _create_benchflow_timestamped_job( - self, tmp_path: Path, tasks: list[tuple[str, dict]] + def _create_harbor_structure( + self, tmp_path: Path, trials: list[tuple[str, dict]] ) -> Path: - """Create a mock benchflow 0.3.0 jobs directory structure. - - benchflow writes: jobs/TIMESTAMP/TASK_NAME__RUNID/result.json - """ - jobs_dir = tmp_path / "jobs" - job_dir = jobs_dir / "2026-04-21__23-12-35" + """Create a mock Harbor output structure.""" + harbor_dir = tmp_path / "harbor_output" + job_dir = harbor_dir / "2026-01-01__00-00-00" job_dir.mkdir(parents=True) - (jobs_dir / "summary.json").write_text(json.dumps({"total": len(tasks)})) - for task_name, result in tasks: - trial_dir = job_dir / f"{task_name}__abc123" - trial_dir.mkdir(parents=True) - (trial_dir / "result.json").write_text(json.dumps(result)) - return jobs_dir + (job_dir / "result.json").write_text(json.dumps({"id": "test-job"})) - def test_successful_trial_parsing(self, tmp_path: Path) -> None: - """Test successful parsing of a benchflow trial result. + for trial_name, trial_result in trials: + trial_dir = job_dir / trial_name + trial_dir.mkdir() + (trial_dir / "result.json").write_text(json.dumps(trial_result)) - benchflow 0.3.0 does not write cost/token fields to result.json. - Metrics are read from agent/trajectory.json (harbor-format agent) - or parsed from agent/openhands.txt (ACP agent stdout). 
- """ + return harbor_dir + + def test_successful_trial_parsing(self, tmp_path: Path) -> None: + """Test successful parsing of harbor trial result.""" trial_result = { "task_name": "benchflow/weighted-gdp-calc", - "rewards": {"reward": 1.0}, - "error": None, + "trial_name": "weighted-gdp-calc__abc123", + "trial_uri": "file:///path/to/trial", + "agent_result": { + "n_input_tokens": 1000, + "n_output_tokens": 200, + "cost_usd": 0.05, + }, + "verifier_result": {"rewards": {"reward": 1.0}}, + "exception_info": None, } - jobs_dir = self._create_benchflow_structure( - tmp_path, [("weighted-gdp-calc", trial_result)] - ) - # Write agent/trajectory.json with final_metrics (harbor-format agent output). - # agent/ sits next to result.json, inside the trial-0 subdirectory. - trial_dir = jobs_dir / "weighted-gdp-calc" / "trial-0" - agent_dir = trial_dir / "agent" - agent_dir.mkdir(parents=True, exist_ok=True) - (agent_dir / "trajectory.json").write_text( - json.dumps( - { - "final_metrics": { - "total_prompt_tokens": 1000, - "total_completion_tokens": 200, - "total_cost_usd": 0.05, - } - } - ) + harbor_dir = self._create_harbor_structure( + tmp_path, [("weighted-gdp-calc__abc123", trial_result)] ) output_file = tmp_path / "output.jsonl" - convert_benchflow_to_eval_output(jobs_dir, output_file) + convert_harbor_to_eval_output(harbor_dir, output_file) assert output_file.exists() with open(output_file) as f: @@ -238,52 +151,26 @@ def test_successful_trial_parsing(self, tmp_path: Path) -> None: assert entries[0]["instance_id"] == "benchflow/weighted-gdp-calc" assert entries[0]["test_result"]["passed"] is True assert entries[0]["metrics"]["total_cost_usd"] == 0.05 - assert entries[0]["metrics"]["total_prompt_tokens"] == 1000 - assert entries[0]["metrics"]["total_completion_tokens"] == 200 - - def test_metrics_from_acp_agent_log(self, tmp_path: Path) -> None: - """Test that metrics are extracted from agent/openhands.txt (ACP agent).""" - trial_result = { - "task_name": "benchflow/acp-task", - "rewards": {"reward": 1.0}, - "error": None, - } - jobs_dir = self._create_benchflow_timestamped_job( - tmp_path, [("acp-task", trial_result)] - ) - # Write agent/openhands.txt simulating openhands ACP stdout - trial_dir = jobs_dir / "2026-04-21__23-12-35" / "acp-task__abc123" - agent_dir = trial_dir / "agent" - agent_dir.mkdir(parents=True, exist_ok=True) - (agent_dir / "openhands.txt").write_text( - "OpenHands SDK v1.16.0\n" - "Tokens: ↑ input 404.21K • cache hit 70.47% • reasoning 579 • ↓ output 7.83K • $0.0487\n" - "Total cost: $0.0487\n" - ) - output_file = tmp_path / "output.jsonl" - convert_benchflow_to_eval_output(jobs_dir, output_file) - - with open(output_file) as f: - entries = [json.loads(line) for line in f] - - assert len(entries) == 1 - assert entries[0]["metrics"]["total_cost_usd"] == pytest.approx(0.0487) - assert entries[0]["metrics"]["total_prompt_tokens"] == 404210 - assert entries[0]["metrics"]["total_completion_tokens"] == 7830 def test_failed_trial(self, tmp_path: Path) -> None: """Test parsing of a trial with reward 0.""" trial_result = { "task_name": "benchflow/task-1", - "rewards": {"reward": 0.0}, - "error": None, + "trial_name": "task-1__xyz", + "agent_result": { + "n_input_tokens": None, + "n_output_tokens": None, + "cost_usd": None, + }, + "verifier_result": {"rewards": {"reward": 0.0}}, + "exception_info": None, } - jobs_dir = self._create_benchflow_structure( - tmp_path, [("task-1", trial_result)] + harbor_dir = self._create_harbor_structure( + tmp_path, [("task-1__xyz", trial_result)] ) 
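+        # agent_result values of None should be coerced to 0 / 0.0 by the converter.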
output_file = tmp_path / "output.jsonl" - convert_benchflow_to_eval_output(jobs_dir, output_file) + convert_harbor_to_eval_output(harbor_dir, output_file) with open(output_file) as f: entries = [json.loads(line) for line in f] @@ -291,19 +178,21 @@ def test_failed_trial(self, tmp_path: Path) -> None: assert entries[0]["test_result"]["passed"] is False assert entries[0]["metrics"]["total_cost_usd"] == 0.0 - def test_trial_with_error(self, tmp_path: Path) -> None: - """Test that errored trials are written as error entries.""" + def test_trial_with_exception(self, tmp_path: Path) -> None: + """Test that exception trials are written as error entries.""" trial_result = { "task_name": "benchflow/error-task", - "rewards": {}, - "error": "LLM_API_KEY not set", + "trial_name": "error-task__err", + "agent_result": {}, + "verifier_result": {}, + "exception_info": {"type": "ValueError", "message": "LLM_API_KEY not set"}, } - jobs_dir = self._create_benchflow_structure( - tmp_path, [("error-task", trial_result)] + harbor_dir = self._create_harbor_structure( + tmp_path, [("error-task__err", trial_result)] ) output_file = tmp_path / "output.jsonl" - convert_benchflow_to_eval_output(jobs_dir, output_file) + convert_harbor_to_eval_output(harbor_dir, output_file) with open(output_file) as f: entries = [json.loads(line) for line in f] @@ -313,121 +202,20 @@ def test_trial_with_error(self, tmp_path: Path) -> None: assert entries[0]["error"] is not None assert entries[0]["test_result"] == {} - def test_missing_jobs_directory(self, tmp_path: Path) -> None: - """Test handling when jobs directory is empty.""" - jobs_dir = tmp_path / "jobs" - jobs_dir.mkdir() - - with pytest.raises(RuntimeError, match="No task directories found"): - convert_benchflow_to_eval_output(jobs_dir, tmp_path / "output.jsonl") - - def test_task_id_filtering(self, tmp_path: Path) -> None: - """Test that only specified task IDs are converted.""" - trials = [ - ( - "task-a", - { - "task_name": "benchflow/task-a", - "rewards": {"reward": 1.0}, - "error": None, - }, - ), - ( - "task-b", - { - "task_name": "benchflow/task-b", - "rewards": {"reward": 0.0}, - "error": None, - }, - ), - ] - jobs_dir = self._create_benchflow_structure(tmp_path, trials) - output_file = tmp_path / "output.jsonl" + def test_missing_job_directory(self, tmp_path: Path) -> None: + """Test handling when no job directory exists.""" + harbor_dir = tmp_path / "harbor_output" + harbor_dir.mkdir() - convert_benchflow_to_eval_output( - jobs_dir, output_file, task_ids=["benchflow/task-a"] - ) + with pytest.raises(RuntimeError, match="No harbor job directory found"): + convert_harbor_to_eval_output(harbor_dir, tmp_path / "output.jsonl") - with open(output_file) as f: - entries = [json.loads(line) for line in f] - - assert len(entries) == 1 - assert entries[0]["instance_id"] == "benchflow/task-a" - - def test_task_name_normalised_to_benchflow_prefix(self, tmp_path: Path) -> None: - """Test that task names without prefix get benchflow/ prepended.""" - trial_result = { - "task_name": "weighted-gdp-calc", # no benchflow/ prefix - "rewards": {"reward": 1.0}, - "error": None, - } - jobs_dir = self._create_benchflow_structure( - tmp_path, [("weighted-gdp-calc", trial_result)] - ) - output_file = tmp_path / "output.jsonl" - convert_benchflow_to_eval_output(jobs_dir, output_file) - - with open(output_file) as f: - entries = [json.loads(line) for line in f] - - assert entries[0]["instance_id"] == "benchflow/weighted-gdp-calc" - - def test_timestamped_job_directory_is_processed(self, 
tmp_path: Path) -> None: - """Test benchflow 0.3.0 timestamped jobs directory layout.""" - trial_result = { - "task_name": "weighted-gdp-calc", - "rewards": {"reward": 1.0}, - "error": None, - "n_input_tokens": 42, - "n_output_tokens": 7, - "cost_usd": 0.01, - } - - jobs_dir = self._create_benchflow_timestamped_job( - tmp_path, [("weighted-gdp-calc", trial_result)] - ) - output_file = tmp_path / "output.jsonl" - - convert_benchflow_to_eval_output(jobs_dir, output_file) - - with open(output_file) as f: - entries = [json.loads(line) for line in f] - - assert len(entries) == 1 - assert entries[0]["instance_id"] == "benchflow/weighted-gdp-calc" - assert entries[0]["test_result"]["passed"] is True - - def test_task_id_filter_matches_timestamped_trial_dir(self, tmp_path: Path) -> None: - """Test filtering strips the run suffix from trial directory names.""" - jobs_dir = self._create_benchflow_timestamped_job( - tmp_path, - [ - ( - "task-a", - { - "task_name": "task-a", - "rewards": {"reward": 1.0}, - "error": None, - }, - ), - ( - "task-b", - { - "task_name": "task-b", - "rewards": {"reward": 0.0}, - "error": None, - }, - ), - ], - ) - output_file = tmp_path / "output.jsonl" - - convert_benchflow_to_eval_output( - jobs_dir, output_file, task_ids=["benchflow/task-a"] - ) - - with open(output_file) as f: - entries = [json.loads(line) for line in f] + def test_empty_job_directory(self, tmp_path: Path) -> None: + """Test handling of harbor job dir with no trial subdirs.""" + harbor_dir = tmp_path / "harbor_output" + job_dir = harbor_dir / "2026-01-01__00-00-00" + job_dir.mkdir(parents=True) + (job_dir / "result.json").write_text(json.dumps({"id": "test"})) - assert len(entries) == 1 - assert entries[0]["instance_id"] == "benchflow/task-a" + with pytest.raises(RuntimeError, match="No trial result files found"): + convert_harbor_to_eval_output(harbor_dir, tmp_path / "output.jsonl") diff --git a/uv.lock b/uv.lock index 147abedc9..2cd0b3640 100644 --- a/uv.lock +++ b/uv.lock @@ -1282,7 +1282,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/44/69/9b804adb5fd0671f367781560eb5eb586c4d495277c93bde4307b9e28068/greenlet-3.2.4-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:3b67ca49f54cede0186854a008109d6ee71f66bd57bb36abd6d0a0267b540cdd", size = 274079, upload-time = "2025-08-07T13:15:45.033Z" }, { url = "https://files.pythonhosted.org/packages/46/e9/d2a80c99f19a153eff70bc451ab78615583b8dac0754cfb942223d2c1a0d/greenlet-3.2.4-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ddf9164e7a5b08e9d22511526865780a576f19ddd00d62f8a665949327fde8bb", size = 640997, upload-time = "2025-08-07T13:42:56.234Z" }, { url = "https://files.pythonhosted.org/packages/3b/16/035dcfcc48715ccd345f3a93183267167cdd162ad123cd93067d86f27ce4/greenlet-3.2.4-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:f28588772bb5fb869a8eb331374ec06f24a83a9c25bfa1f38b6993afe9c1e968", size = 655185, upload-time = "2025-08-07T13:45:27.624Z" }, - { url = "https://files.pythonhosted.org/packages/31/da/0386695eef69ffae1ad726881571dfe28b41970173947e7c558d9998de0f/greenlet-3.2.4-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:5c9320971821a7cb77cfab8d956fa8e39cd07ca44b6070db358ceb7f8797c8c9", size = 649926, upload-time = "2025-08-07T13:53:15.251Z" }, { url = "https://files.pythonhosted.org/packages/68/88/69bf19fd4dc19981928ceacbc5fd4bb6bc2215d53199e367832e98d1d8fe/greenlet-3.2.4-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = 
"sha256:c60a6d84229b271d44b70fb6e5fa23781abb5d742af7b808ae3f6efd7c9c60f6", size = 651839, upload-time = "2025-08-07T13:18:30.281Z" }, { url = "https://files.pythonhosted.org/packages/19/0d/6660d55f7373b2ff8152401a83e02084956da23ae58cddbfb0b330978fe9/greenlet-3.2.4-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3b3812d8d0c9579967815af437d96623f45c0f2ae5f04e366de62a12d83a8fb0", size = 607586, upload-time = "2025-08-07T13:18:28.544Z" }, { url = "https://files.pythonhosted.org/packages/8e/1a/c953fdedd22d81ee4629afbb38d2f9d71e37d23caace44775a3a969147d4/greenlet-3.2.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:abbf57b5a870d30c4675928c37278493044d7c14378350b3aa5d484fa65575f0", size = 1123281, upload-time = "2025-08-07T13:42:39.858Z" }, @@ -1293,7 +1292,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/49/e8/58c7f85958bda41dafea50497cbd59738c5c43dbbea5ee83d651234398f4/greenlet-3.2.4-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:1a921e542453fe531144e91e1feedf12e07351b1cf6c9e8a3325ea600a715a31", size = 272814, upload-time = "2025-08-07T13:15:50.011Z" }, { url = "https://files.pythonhosted.org/packages/62/dd/b9f59862e9e257a16e4e610480cfffd29e3fae018a68c2332090b53aac3d/greenlet-3.2.4-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cd3c8e693bff0fff6ba55f140bf390fa92c994083f838fece0f63be121334945", size = 641073, upload-time = "2025-08-07T13:42:57.23Z" }, { url = "https://files.pythonhosted.org/packages/f7/0b/bc13f787394920b23073ca3b6c4a7a21396301ed75a655bcb47196b50e6e/greenlet-3.2.4-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:710638eb93b1fa52823aa91bf75326f9ecdfd5e0466f00789246a5280f4ba0fc", size = 655191, upload-time = "2025-08-07T13:45:29.752Z" }, - { url = "https://files.pythonhosted.org/packages/f2/d6/6adde57d1345a8d0f14d31e4ab9c23cfe8e2cd39c3baf7674b4b0338d266/greenlet-3.2.4-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:c5111ccdc9c88f423426df3fd1811bfc40ed66264d35aa373420a34377efc98a", size = 649516, upload-time = "2025-08-07T13:53:16.314Z" }, { url = "https://files.pythonhosted.org/packages/7f/3b/3a3328a788d4a473889a2d403199932be55b1b0060f4ddd96ee7cdfcad10/greenlet-3.2.4-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:d76383238584e9711e20ebe14db6c88ddcedc1829a9ad31a584389463b5aa504", size = 652169, upload-time = "2025-08-07T13:18:32.861Z" }, { url = "https://files.pythonhosted.org/packages/ee/43/3cecdc0349359e1a527cbf2e3e28e5f8f06d3343aaf82ca13437a9aa290f/greenlet-3.2.4-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:23768528f2911bcd7e475210822ffb5254ed10d71f4028387e5a99b4c6699671", size = 610497, upload-time = "2025-08-07T13:18:31.636Z" }, { url = "https://files.pythonhosted.org/packages/b8/19/06b6cf5d604e2c382a6f31cafafd6f33d5dea706f4db7bdab184bad2b21d/greenlet-3.2.4-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:00fadb3fedccc447f517ee0d3fd8fe49eae949e1cd0f6a611818f4f6fb7dc83b", size = 1121662, upload-time = "2025-08-07T13:42:41.117Z" }, @@ -1304,7 +1302,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/22/5c/85273fd7cc388285632b0498dbbab97596e04b154933dfe0f3e68156c68c/greenlet-3.2.4-cp314-cp314-macosx_11_0_universal2.whl", hash = "sha256:49a30d5fda2507ae77be16479bdb62a660fa51b1eb4928b524975b3bde77b3c0", size = 273586, upload-time = "2025-08-07T13:16:08.004Z" }, { url = 
"https://files.pythonhosted.org/packages/d1/75/10aeeaa3da9332c2e761e4c50d4c3556c21113ee3f0afa2cf5769946f7a3/greenlet-3.2.4-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:299fd615cd8fc86267b47597123e3f43ad79c9d8a22bebdce535e53550763e2f", size = 686346, upload-time = "2025-08-07T13:42:59.944Z" }, { url = "https://files.pythonhosted.org/packages/c0/aa/687d6b12ffb505a4447567d1f3abea23bd20e73a5bed63871178e0831b7a/greenlet-3.2.4-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:c17b6b34111ea72fc5a4e4beec9711d2226285f0386ea83477cbb97c30a3f3a5", size = 699218, upload-time = "2025-08-07T13:45:30.969Z" }, - { url = "https://files.pythonhosted.org/packages/dc/8b/29aae55436521f1d6f8ff4e12fb676f3400de7fcf27fccd1d4d17fd8fecd/greenlet-3.2.4-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:b4a1870c51720687af7fa3e7cda6d08d801dae660f75a76f3845b642b4da6ee1", size = 694659, upload-time = "2025-08-07T13:53:17.759Z" }, { url = "https://files.pythonhosted.org/packages/92/2e/ea25914b1ebfde93b6fc4ff46d6864564fba59024e928bdc7de475affc25/greenlet-3.2.4-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:061dc4cf2c34852b052a8620d40f36324554bc192be474b9e9770e8c042fd735", size = 695355, upload-time = "2025-08-07T13:18:34.517Z" }, { url = "https://files.pythonhosted.org/packages/72/60/fc56c62046ec17f6b0d3060564562c64c862948c9d4bc8aa807cf5bd74f4/greenlet-3.2.4-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:44358b9bf66c8576a9f57a590d5f5d6e72fa4228b763d0e43fee6d3b06d3a337", size = 657512, upload-time = "2025-08-07T13:18:33.969Z" }, { url = "https://files.pythonhosted.org/packages/23/6e/74407aed965a4ab6ddd93a7ded3180b730d281c77b765788419484cdfeef/greenlet-3.2.4-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:2917bdf657f5859fbf3386b12d68ede4cf1f04c90c3a6bc1f013dd68a22e2269", size = 1612508, upload-time = "2025-11-04T12:42:23.427Z" }, @@ -1519,6 +1516,11 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/2a/39/e50c7c3a983047577ee07d2a9e53faf5a69493943ec3f6a384bdc792deb2/httpx-0.28.1-py3-none-any.whl", hash = "sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad", size = 73517, upload-time = "2024-12-06T15:37:21.509Z" }, ] +[package.optional-dependencies] +socks = [ + { name = "socksio" }, +] + [[package]] name = "httpx-sse" version = "0.4.2" @@ -1814,14 +1816,12 @@ wheels = [ [[package]] name = "litellm" -version = "1.80.10" +version = "1.83.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "aiohttp" }, { name = "click" }, { name = "fastuuid" }, - { name = "grpcio", version = "1.67.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14'" }, - { name = "grpcio", version = "1.76.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.14'" }, { name = "httpx" }, { name = "importlib-metadata" }, { name = "jinja2" }, @@ -1832,9 +1832,9 @@ dependencies = [ { name = "tiktoken" }, { name = "tokenizers" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/dd/44/0aaa7449e7c4aa05668ec03f1f68a01b1e476591071d9659a68db19371a2/litellm-1.80.10.tar.gz", hash = "sha256:4a4aff7558945c2f7e5c6523e67c1b5525a46b10b0e1ad6b8f847cb13b16779e", size = 12764777, upload-time = "2025-12-14T02:07:05.362Z" } +sdist = { url = "https://files.pythonhosted.org/packages/22/92/6ce9737554994ca8e536e5f4f6a87cc7c4774b656c9eb9add071caf7d54b/litellm-1.83.0.tar.gz", hash = 
"sha256:860bebc76c4bb27b4cf90b4a77acd66dba25aced37e3db98750de8a1766bfb7a", size = 17333062, upload-time = "2026-03-31T05:08:25.331Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/8c/a9/4814b6aa58f6705df2831eaadeb5bc8240684c8c9d5964245212f85049d1/litellm-1.80.10-py3-none-any.whl", hash = "sha256:9b3e561efaba0eb1291cb1555d3dcb7283cf7f3cb65aadbcdb42e2a8765898c8", size = 11264240, upload-time = "2025-12-14T02:07:02.414Z" }, + { url = "https://files.pythonhosted.org/packages/19/2c/a670cc050fcd6f45c6199eb99e259c73aea92edba8d5c2fc1b3686d36217/litellm-1.83.0-py3-none-any.whl", hash = "sha256:88c536d339248f3987571493015784671ba3f193a328e1ea6780dbebaa2094a8", size = 15610306, upload-time = "2026-03-31T05:08:21.987Z" }, ] [[package]] @@ -2402,7 +2402,7 @@ wheels = [ [[package]] name = "openhands-agent-server" -version = "1.16.0" +version = "1.16.1" source = { editable = "vendor/software-agent-sdk/openhands-agent-server" } dependencies = [ { name = "aiosqlite" }, @@ -2467,7 +2467,6 @@ dependencies = [ { name = "python-json-logger" }, { name = "requests" }, { name = "swebench" }, - { name = "swesmith" }, { name = "swt-bench" }, { name = "tenacity" }, { name = "toml" }, @@ -2522,7 +2521,6 @@ requires-dist = [ { name = "python-json-logger", specifier = ">=3.3.0" }, { name = "requests" }, { name = "swebench", specifier = "==4.1.0" }, - { name = "swesmith", specifier = ">=0.0.9" }, { name = "swt-bench", git = "https://github.com/logic-star-ai/swt-bench.git?rev=5fdcd446ff05e248ecfffc19d560a210699f71f8" }, { name = "tenacity", specifier = ">=9.1.2" }, { name = "toml" }, @@ -2546,7 +2544,7 @@ dev = [ [[package]] name = "openhands-sdk" -version = "1.16.0" +version = "1.16.1" source = { editable = "vendor/software-agent-sdk/openhands-sdk" } dependencies = [ { name = "agent-client-protocol" }, @@ -2554,7 +2552,7 @@ dependencies = [ { name = "fakeredis", extra = ["lua"] }, { name = "fastmcp" }, { name = "filelock" }, - { name = "httpx" }, + { name = "httpx", extra = ["socks"] }, { name = "litellm" }, { name = "lmnr" }, { name = "pydantic" }, @@ -2577,8 +2575,8 @@ requires-dist = [ { name = "fakeredis", extras = ["lua"], specifier = ">=2.32.1" }, { name = "fastmcp", specifier = ">=3.0.0" }, { name = "filelock", specifier = ">=3.20.1" }, - { name = "httpx", specifier = ">=0.27.0" }, - { name = "litellm", specifier = "==1.80.10" }, + { name = "httpx", extras = ["socks"], specifier = ">=0.27.0" }, + { name = "litellm", specifier = ">=1.82.6,!=1.82.7,!=1.82.8" }, { name = "lmnr", specifier = ">=0.7.24" }, { name = "pydantic", specifier = ">=2.12.5" }, { name = "python-frontmatter", specifier = ">=1.1.0" }, @@ -2590,7 +2588,7 @@ provides-extras = ["boto3"] [[package]] name = "openhands-tools" -version = "1.16.0" +version = "1.16.1" source = { editable = "vendor/software-agent-sdk/openhands-tools" } dependencies = [ { name = "bashlex" }, @@ -2619,7 +2617,7 @@ requires-dist = [ [[package]] name = "openhands-workspace" -version = "1.16.0" +version = "1.16.1" source = { editable = "vendor/software-agent-sdk/openhands-workspace" } dependencies = [ { name = "openhands-agent-server" }, @@ -6709,6 +6707,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e9/44/75a9c9421471a6c4805dbf2356f7c181a29c1879239abab1ea2cc8f38b40/sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2", size = 10235, upload-time = "2024-02-25T23:20:01.196Z" }, ] +[[package]] +name = "socksio" +version = "1.0.0" +source = { registry = "https://pypi.org/simple" } 
+sdist = { url = "https://files.pythonhosted.org/packages/f8/5c/48a7d9495be3d1c651198fd99dbb6ce190e2274d0f28b9051307bdec6b85/socksio-1.0.0.tar.gz", hash = "sha256:f88beb3da5b5c38b9890469de67d0cb0f9d494b78b106ca1845f96c10b91c4ac", size = 19055, upload-time = "2020-04-17T15:50:34.664Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/37/c3/6eeb6034408dac0fa653d126c9204ade96b819c936e136c5e8a6897eee9c/socksio-1.0.0-py3-none-any.whl", hash = "sha256:95dc1f15f9b34e8d7b16f06d74b8ccf48f609af32ab33c608d08761c5dcbb1f3", size = 12763, upload-time = "2020-04-17T15:50:31.878Z" }, +] + [[package]] name = "sortedcontainers" version = "2.4.0" @@ -6834,15 +6841,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/36/67/981d8b642ac3eac7c8a7b7832ff8b2fb74f96b28b5fcd9a8979879e5c46d/swebench-4.1.0-py3-none-any.whl", hash = "sha256:1243776f720047cc9e20a427f7a52b75c13a07abda6154fb60fe77f82ec8af57", size = 157231, upload-time = "2025-09-11T02:57:58.953Z" }, ] -[[package]] -name = "swesmith" -version = "0.0.9" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/07/97/e506b20fa59debc66e4660a86b0e98b45d32c87f23b994ad739e9c5d542a/swesmith-0.0.9.tar.gz", hash = "sha256:1726124ea43577853c6efb0a5a0db5fa3ce5c340e1bed479afa5bab85d8a69da", size = 214830, upload-time = "2026-02-27T01:06:13.455Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/80/2d/71b6ac5dadbe7199085de3815624775744d51b6c554efeeddfb12dc45ce1/swesmith-0.0.9-py3-none-any.whl", hash = "sha256:cbb98a52fc573b38032cde1179b6ce5f5862ce7c31d6931cfd5b8ad4969ce900", size = 275800, upload-time = "2026-02-27T01:06:11.864Z" }, -] - [[package]] name = "swt-bench" version = "1.0.1" diff --git a/vendor/software-agent-sdk b/vendor/software-agent-sdk index acd5adc96..3e0a3a091 160000 --- a/vendor/software-agent-sdk +++ b/vendor/software-agent-sdk @@ -1 +1 @@ -Subproject commit acd5adc965c08a0f815cf8e5f3166d1d090034d6 +Subproject commit 3e0a3a0915b369c7e2057c77722e98585855d30a From 8ff7bbab21af4ae2b884981867a6d7302ec5d26f Mon Sep 17 00:00:00 2001 From: Chujun Tao Date: Thu, 23 Apr 2026 16:56:57 -0400 Subject: [PATCH 05/12] Update skillsbench dataset handling Co-authored-by: openhands --- benchmarks/skillsbench/README.md | 16 +- benchmarks/skillsbench/run_infer.py | 351 +++++++++++++++++++++++++--- tests/test_skillsbench_run_infer.py | 231 +++++++++++++++++- 3 files changed, 558 insertions(+), 40 deletions(-) diff --git a/benchmarks/skillsbench/README.md b/benchmarks/skillsbench/README.md index 60ff73652..c2c11928b 100644 --- a/benchmarks/skillsbench/README.md +++ b/benchmarks/skillsbench/README.md @@ -21,11 +21,12 @@ SkillsBench comprises tasks across 11 domains, evaluating the efficacy of Skills ## Prerequisites 1. **Install Harbor**: Harbor is the official harness for running SkillsBench. + This integration is currently validated against `harbor==0.1.33`. ```bash - pip install harbor + pip install harbor==0.1.33 # or - uv pip install harbor + uv pip install harbor==0.1.33 ``` 2. **Docker**: Harbor requires Docker to be installed and running. @@ -34,12 +35,18 @@ SkillsBench comprises tasks across 11 domains, evaluating the efficacy of Skills ## Usage +By default, `skillsbench-infer` keeps a local copy of `tasks/` from +`https://github.com/benchflow-ai/skillsbench` on the `main` branch under +`benchmarks/skillsbench/data/tasks`. 
It stores the synced upstream commit hash in +`benchmarks/skillsbench/data/source.json` and refreshes the local snapshot when the +upstream `main` commit changes. The only supported dataset sources are this synced +SkillsBench snapshot and Harbor registry ids matching `benchflow/skillsbench@...`. + ### Running Inference Run the SkillsBench evaluation using the OpenHands SDK agent: ```bash -# Run full evaluation uv run skillsbench-infer .llm_config/claude.json # Run specific tasks @@ -53,6 +60,9 @@ uv run skillsbench-infer .llm_config/claude.json --n-limit 5 # Run with multiple workers uv run skillsbench-infer .llm_config/claude.json --num-workers 4 + +# Run against a Harbor registry dataset instead of the synced GitHub tasks +uv run skillsbench-infer .llm_config/claude.json --dataset benchflow/skillsbench@1.0 ``` ### LLM Configuration diff --git a/benchmarks/skillsbench/run_infer.py b/benchmarks/skillsbench/run_infer.py index a8afa7281..c8e06eee5 100644 --- a/benchmarks/skillsbench/run_infer.py +++ b/benchmarks/skillsbench/run_infer.py @@ -11,8 +11,11 @@ import argparse import json import os +import re +import shutil import subprocess import sys +import tempfile from datetime import datetime, timezone from pathlib import Path @@ -29,13 +32,21 @@ # Output filename for results OUTPUT_FILENAME = "output.jsonl" +SKILLSBENCH_REPO_URL = "https://github.com/benchflow-ai/skillsbench.git" +SKILLSBENCH_REPO_BRANCH = "main" +DATASET_CACHE_DIR = Path(__file__).parent / "data" +TASKS_CACHE_DIR = DATASET_CACHE_DIR / "tasks" +TASKS_METADATA_PATH = DATASET_CACHE_DIR / "source.json" +REGISTRY_DATASET_PREFIX = "benchflow/skillsbench" +INSTANCE_ID_PREFIX = "benchflow" + def check_harbor_installed() -> bool: """Check if harbor CLI is installed and available.""" harbor_exe = HARBOR_DEFAULTS["harbor_executable"] try: result = subprocess.run( - [harbor_exe, "--version"], + [harbor_exe, "--help"], capture_output=True, text=True, timeout=10, @@ -45,9 +56,252 @@ def check_harbor_installed() -> bool: return False +def _run_command(cmd: list[str], error_message: str) -> str: + """Run a subprocess command and return stdout.""" + result = subprocess.run( + cmd, + capture_output=True, + text=True, + ) + if result.returncode != 0: + stderr = result.stderr.strip() or result.stdout.strip() + raise RuntimeError(f"{error_message}: {stderr}") + return result.stdout.strip() + + +def _get_supported_task_filter_flag(harbor_exe: str) -> str: + """Detect whether Harbor expects --task-name or --include-task-name.""" + try: + result = subprocess.run( + [harbor_exe, "run", "--help"], + capture_output=True, + text=True, + ) + except FileNotFoundError: + return "--include-task-name" + + help_text = f"{result.stdout}\n{result.stderr}" + supported_flags = set(re.findall(r"(? 
str: + """Detect whether Harbor exposes the OpenHands agent as openhands or openhands-sdk.""" + try: + result = subprocess.run( + [harbor_exe, "run", "--help"], + capture_output=True, + text=True, + ) + except FileNotFoundError: + return HARBOR_DEFAULTS["agent_name"] + + help_text = f"{result.stdout}\n{result.stderr}" + compact_help_text = re.sub(r"[^a-z0-9-]+", "", help_text.lower()) + if "openhands-sdk" in compact_help_text: + return "openhands-sdk" + if "openhands" in compact_help_text: + return "openhands" + return HARBOR_DEFAULTS["agent_name"] + + +def get_skillsbench_main_commit( + repo_url: str = SKILLSBENCH_REPO_URL, + branch: str = SKILLSBENCH_REPO_BRANCH, +) -> str: + """Resolve the latest commit hash for the upstream SkillsBench branch.""" + stdout = _run_command( + ["git", "ls-remote", repo_url, f"refs/heads/{branch}"], + "Failed to resolve SkillsBench upstream commit", + ) + commit_hash, _, ref = stdout.partition("\t") + if not commit_hash or ref != f"refs/heads/{branch}": + raise RuntimeError( + f"Unexpected git ls-remote output for {repo_url} {branch}: {stdout}" + ) + return commit_hash + + +def _load_cached_commit(metadata_path: Path = TASKS_METADATA_PATH) -> str | None: + """Load the cached upstream commit hash for the local task snapshot.""" + if not metadata_path.is_file(): + return None + + try: + with open(metadata_path, encoding="utf-8") as f: + metadata = json.load(f) + except (OSError, json.JSONDecodeError) as e: + logger.warning( + "Ignoring unreadable SkillsBench dataset metadata at %s: %s", + metadata_path, + e, + ) + return None + + commit_hash = metadata.get("commit_hash") + return commit_hash if isinstance(commit_hash, str) and commit_hash else None + + +def download_skillsbench_tasks( + commit_hash: str, + tasks_dir: Path = TASKS_CACHE_DIR, + metadata_path: Path = TASKS_METADATA_PATH, + repo_url: str = SKILLSBENCH_REPO_URL, + branch: str = SKILLSBENCH_REPO_BRANCH, +) -> None: + """Download only the SkillsBench tasks directory for a specific commit.""" + data_dir = tasks_dir.parent + data_dir.mkdir(parents=True, exist_ok=True) + + logger.info( + "Downloading SkillsBench tasks from %s@%s into %s", + repo_url, + commit_hash, + tasks_dir, + ) + + with tempfile.TemporaryDirectory(dir=data_dir) as temp_dir: + clone_dir = Path(temp_dir) / "skillsbench" + _run_command( + [ + "git", + "clone", + "--depth", + "1", + "--branch", + branch, + "--filter=blob:none", + "--sparse", + repo_url, + str(clone_dir), + ], + "Failed to clone SkillsBench repository", + ) + _run_command( + ["git", "-C", str(clone_dir), "sparse-checkout", "set", "tasks"], + "Failed to sparsely checkout SkillsBench tasks", + ) + checked_out_commit = _run_command( + ["git", "-C", str(clone_dir), "rev-parse", "HEAD"], + "Failed to read cloned SkillsBench commit", + ) + if checked_out_commit != commit_hash: + raise RuntimeError( + "Cloned SkillsBench commit does not match upstream HEAD: " + f"expected {commit_hash}, got {checked_out_commit}" + ) + + source_tasks_dir = clone_dir / "tasks" + if not source_tasks_dir.is_dir(): + raise RuntimeError( + f"SkillsBench clone at {clone_dir} does not contain a tasks/ directory" + ) + + if tasks_dir.exists(): + shutil.rmtree(tasks_dir) + shutil.copytree(source_tasks_dir, tasks_dir) + + metadata = { + "repo_url": repo_url, + "branch": branch, + "commit_hash": commit_hash, + "synced_at": datetime.now(timezone.utc).isoformat(), + } + with open(metadata_path, "w", encoding="utf-8") as f: + json.dump(metadata, f, indent=2) + + +def ensure_skillsbench_tasks( + tasks_dir: 
Path = TASKS_CACHE_DIR, + metadata_path: Path = TASKS_METADATA_PATH, + repo_url: str = SKILLSBENCH_REPO_URL, + branch: str = SKILLSBENCH_REPO_BRANCH, +) -> Path: + """Ensure a local SkillsBench task snapshot exists and matches upstream HEAD.""" + cached_commit = _load_cached_commit(metadata_path) + has_cached_tasks = tasks_dir.is_dir() and any(tasks_dir.iterdir()) + + try: + upstream_commit = get_skillsbench_main_commit(repo_url=repo_url, branch=branch) + except RuntimeError as e: + if has_cached_tasks and cached_commit: + logger.warning( + "Failed to check SkillsBench upstream HEAD; using cached tasks from " + "%s (%s): %s", + tasks_dir, + cached_commit, + e, + ) + return tasks_dir + raise + + if has_cached_tasks and cached_commit == upstream_commit: + logger.info( + "Using cached SkillsBench tasks at %s (commit %s)", + tasks_dir, + upstream_commit, + ) + return tasks_dir + + if has_cached_tasks: + logger.info( + "Refreshing SkillsBench tasks in %s from commit %s to %s", + tasks_dir, + cached_commit or "", + upstream_commit, + ) + else: + logger.info("No cached SkillsBench tasks found at %s; downloading", tasks_dir) + + download_skillsbench_tasks( + commit_hash=upstream_commit, + tasks_dir=tasks_dir, + metadata_path=metadata_path, + repo_url=repo_url, + branch=branch, + ) + return tasks_dir + + +def resolve_skillsbench_dataset(dataset: str) -> tuple[str, bool]: + """Resolve the dataset argument to a synced local snapshot or registry id.""" + if dataset == INFER_DEFAULTS["dataset"]: + local_tasks_dir = ensure_skillsbench_tasks() + return str(local_tasks_dir.resolve()), True + if dataset == REGISTRY_DATASET_PREFIX or dataset.startswith( + f"{REGISTRY_DATASET_PREFIX}@" + ): + return dataset, False + raise ValueError( + "Unsupported SkillsBench dataset source. Use the default synced " + "SkillsBench snapshot or a Harbor registry id matching " + "'benchflow/skillsbench@'." + ) + + +def _normalize_task_filter_value(task_id: str, *, dataset_is_path: bool) -> str: + """Normalize task filter values for Harbor's local-path dataset handling.""" + if dataset_is_path: + return task_id.rsplit("/", 1)[-1] + return task_id + + +def _canonicalize_instance_id(task_name: str) -> str: + """Normalize SkillsBench task names to stable benchflow/ ids.""" + if "/" in task_name: + return task_name + return f"{INSTANCE_ID_PREFIX}/{task_name}" + + def run_harbor_evaluation( llm: LLM, dataset: str, + *, + dataset_is_path: bool, output_dir: str, num_workers: int = 1, task_ids: list[str] | None = None, @@ -57,7 +311,8 @@ def run_harbor_evaluation( Args: llm: LLM configuration for the agent. - dataset: Harbor dataset name (e.g., benchflow/skillsbench). + dataset: Synced SkillsBench task snapshot path or Harbor registry id. + dataset_is_path: Whether ``dataset`` should be passed via ``--path``. output_dir: Directory to store output files. num_workers: Number of parallel workers. task_ids: Optional list of specific task IDs to run. @@ -69,16 +324,18 @@ def run_harbor_evaluation( harbor_output_dir = Path(output_dir) / "harbor_output" harbor_output_dir.mkdir(parents=True, exist_ok=True) harbor_exe = HARBOR_DEFAULTS["harbor_executable"] + agent_name = _get_supported_agent_name(harbor_exe) + task_filter_flag = _get_supported_task_filter_flag(harbor_exe) # Build harbor command using harbor CLI flags. # Use absolute path for --jobs-dir to avoid CWD-relative path issues. 
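+    # Illustrative shape of the resulting invocation (sketch only, derived from
+    # the list built below; exact flags depend on the detected Harbor version):
+    #   harbor run --path <tasks_dir> -a <agent> -m <model> \
+    #       --jobs-dir <abs_jobs_dir> --n-concurrent <N> [<task filters>] [--n-tasks <K>]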
cmd = [ harbor_exe, "run", - "-d", + "--path" if dataset_is_path else "-d", dataset, "-a", - HARBOR_DEFAULTS["agent_name"], + agent_name, "-m", llm.model, "--jobs-dir", @@ -87,21 +344,17 @@ def run_harbor_evaluation( str(num_workers), ] - # Pass LLM credentials as agent environment variables - if llm.api_key: - api_key = ( - llm.api_key.get_secret_value() - if isinstance(llm.api_key, SecretStr) - else llm.api_key - ) - cmd.extend(["--ae", f"LLM_API_KEY={api_key}"]) - if llm.base_url: - cmd.extend(["--ae", f"LLM_BASE_URL={llm.base_url}"]) - # Add specific task names if provided if task_ids: for task_id in task_ids: - cmd.extend(["--include-task-name", task_id]) + cmd.extend( + [ + task_filter_flag, + _normalize_task_filter_value( + task_id, dataset_is_path=dataset_is_path + ), + ] + ) if n_limit is not None: cmd.extend(["--n-tasks", str(n_limit)]) @@ -131,10 +384,31 @@ def run_harbor_evaluation( ) if result.returncode != 0: - logger.error(f"Harbor command failed with code {result.returncode}") - logger.error(f"stdout: {result.stdout}") - logger.error(f"stderr: {result.stderr}") - raise RuntimeError(f"Harbor evaluation failed: {result.stderr}") + if ( + task_ids + and task_filter_flag == "--task-name" + and "No such option: --task-name" in result.stderr + ): + fallback_cmd = [ + "--include-task-name" if part == "--task-name" else part + for part in cmd + ] + logger.warning( + "Harbor does not support --task-name; retrying with " + "--include-task-name" + ) + result = subprocess.run( + fallback_cmd, + capture_output=True, + text=True, + env=env, + ) + + if result.returncode != 0: + logger.error(f"Harbor command failed with code {result.returncode}") + logger.error(f"stdout: {result.stdout}") + logger.error(f"stderr: {result.stderr}") + raise RuntimeError(f"Harbor evaluation failed: {result.stderr}") logger.info("Harbor evaluation completed successfully") logger.info(f"stdout: {result.stdout}") @@ -207,7 +481,9 @@ def convert_harbor_to_eval_output( with open(result_file) as f: trial = json.load(f) - instance_id = trial.get("task_name", result_file.parent.name) + instance_id = _canonicalize_instance_id( + trial.get("task_name", result_file.parent.name) + ) # Check for exceptions if trial.get("exception_info"): @@ -256,7 +532,7 @@ def convert_harbor_to_eval_output( logger.error(f"Failed to process result file {result_file}: {e}") errors.append( { - "instance_id": result_file.parent.name, + "instance_id": _canonicalize_instance_id(result_file.parent.name), "error": str(e), "test_result": {}, } @@ -302,13 +578,14 @@ def main() -> None: formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" Examples: - # Run full skillsbench evaluation + # Run full skillsbench evaluation using a local tasks/ snapshot synced from + # https://github.com/benchflow-ai/skillsbench main uv run skillsbench-infer .llm_config/claude.json # Run specific tasks uv run skillsbench-infer .llm_config/claude.json --select tasks.txt - # Run with custom dataset version + # Run against a Harbor registry dataset instead of the synced GitHub tasks uv run skillsbench-infer .llm_config/claude.json --dataset benchflow/skillsbench@1.0 """, ) @@ -322,7 +599,11 @@ def main() -> None: "--dataset", type=str, default=INFER_DEFAULTS["dataset"], - help="Harbor dataset name (e.g., benchflow/skillsbench)", + help=( + "SkillsBench dataset source. The default value syncs tasks/ from the " + "benchflow-ai/skillsbench main branch. You can also pass a Harbor " + "registry id like benchflow/skillsbench@1.0." 
+ ), ) parser.add_argument( "--output-dir", @@ -385,6 +666,20 @@ def main() -> None: ) sys.exit(1) + resolved_dataset = args.dataset + dataset_is_path = False + dataset_commit_hash: str | None = None + if not args.skip_harbor: + try: + resolved_dataset, dataset_is_path = resolve_skillsbench_dataset( + args.dataset + ) + except ValueError as e: + logger.error(str(e)) + sys.exit(1) + if dataset_is_path and args.dataset == INFER_DEFAULTS["dataset"]: + dataset_commit_hash = _load_cached_commit() + # Construct output directory dataset_description = args.dataset.replace("/", "__").replace("@", "-") structured_output_dir = construct_eval_output_dir( @@ -402,6 +697,9 @@ def main() -> None: metadata = { "llm": llm.model_dump_json(), "dataset": args.dataset, + "resolved_dataset": resolved_dataset, + "dataset_is_path": dataset_is_path, + "dataset_commit_hash": dataset_commit_hash, "timestamp": datetime.now(timezone.utc).isoformat(), "harbor_agent": HARBOR_DEFAULTS["agent_name"], "note": args.note, @@ -427,7 +725,8 @@ def main() -> None: try: harbor_output_dir = run_harbor_evaluation( llm=llm, - dataset=args.dataset, + dataset=resolved_dataset, + dataset_is_path=dataset_is_path, output_dir=structured_output_dir, num_workers=args.num_workers, task_ids=task_ids, diff --git a/tests/test_skillsbench_run_infer.py b/tests/test_skillsbench_run_infer.py index 5f8452cb3..0632a6a46 100644 --- a/tests/test_skillsbench_run_infer.py +++ b/tests/test_skillsbench_run_infer.py @@ -8,22 +8,135 @@ from benchmarks.skillsbench.config import INFER_DEFAULTS from benchmarks.skillsbench.run_infer import ( convert_harbor_to_eval_output, + ensure_skillsbench_tasks, + resolve_skillsbench_dataset, run_harbor_evaluation, ) from openhands.sdk import LLM +class TestDatasetSync: + """Tests for syncing the local SkillsBench task snapshot.""" + + def test_ensure_skillsbench_tasks_reuses_matching_cache( + self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch + ) -> None: + """Test that an up-to-date cached tasks directory is reused.""" + tasks_dir = tmp_path / "tasks" + tasks_dir.mkdir() + (tasks_dir / "task-a").mkdir() + metadata_path = tmp_path / "source.json" + metadata_path.write_text(json.dumps({"commit_hash": "abc123"})) + + monkeypatch.setattr( + "benchmarks.skillsbench.run_infer.get_skillsbench_main_commit", + lambda repo_url, branch: "abc123", + ) + + called = False + + def fake_download(**kwargs) -> None: + nonlocal called + called = True + + monkeypatch.setattr( + "benchmarks.skillsbench.run_infer.download_skillsbench_tasks", + fake_download, + ) + + resolved = ensure_skillsbench_tasks( + tasks_dir=tasks_dir, + metadata_path=metadata_path, + ) + + assert resolved == tasks_dir + assert called is False + + def test_ensure_skillsbench_tasks_refreshes_stale_cache( + self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch + ) -> None: + """Test that a stale cached commit triggers a redownload.""" + tasks_dir = tmp_path / "tasks" + tasks_dir.mkdir() + metadata_path = tmp_path / "source.json" + metadata_path.write_text(json.dumps({"commit_hash": "old-commit"})) + + monkeypatch.setattr( + "benchmarks.skillsbench.run_infer.get_skillsbench_main_commit", + lambda repo_url, branch: "new-commit", + ) + + captured: dict[str, str] = {} + + def fake_download( + *, + commit_hash: str, + tasks_dir: Path, + metadata_path: Path, + repo_url: str, + branch: str, + ) -> None: + captured["commit_hash"] = commit_hash + captured["tasks_dir"] = str(tasks_dir) + captured["metadata_path"] = str(metadata_path) + tasks_dir.mkdir(exist_ok=True) + + 
monkeypatch.setattr( + "benchmarks.skillsbench.run_infer.download_skillsbench_tasks", + fake_download, + ) + + ensure_skillsbench_tasks( + tasks_dir=tasks_dir, + metadata_path=metadata_path, + ) + + assert captured["commit_hash"] == "new-commit" + assert captured["tasks_dir"] == str(tasks_dir) + assert captured["metadata_path"] == str(metadata_path) + + def test_ensure_skillsbench_tasks_uses_cache_if_remote_check_fails( + self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch + ) -> None: + """Test that a usable cache is kept when the upstream HEAD check fails.""" + tasks_dir = tmp_path / "tasks" + tasks_dir.mkdir() + (tasks_dir / "task-a").mkdir() + metadata_path = tmp_path / "source.json" + metadata_path.write_text(json.dumps({"commit_hash": "cached-commit"})) + + def fake_head(repo_url: str, branch: str) -> str: + raise RuntimeError("network unavailable") + + monkeypatch.setattr( + "benchmarks.skillsbench.run_infer.get_skillsbench_main_commit", + fake_head, + ) + + resolved = ensure_skillsbench_tasks( + tasks_dir=tasks_dir, + metadata_path=metadata_path, + ) + + assert resolved == tasks_dir + + def test_resolve_skillsbench_dataset_preserves_remote_registry_ids(self) -> None: + """Test that explicit Harbor dataset ids are passed through unchanged.""" + resolved_dataset, dataset_is_path = resolve_skillsbench_dataset( + "benchflow/skillsbench@1.0" + ) + + assert resolved_dataset == "benchflow/skillsbench@1.0" + assert dataset_is_path is False + + class TestRunHarborEvaluation: """Tests for building Harbor invocation arguments.""" - def test_default_dataset_matches_harbor_registry(self) -> None: - """Test that the default dataset name matches Harbor's published registry.""" - assert INFER_DEFAULTS["dataset"] == "benchflow/skillsbench" - def test_run_harbor_evaluation_passes_filters_and_limits( self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch ) -> None: - """Test Harbor command includes task filters and n-limit.""" + """Test Harbor command normalizes local task ids and includes main flags.""" captured: dict[str, list[str]] = {} def fake_run(cmd: list[str], capture_output: bool, text: bool, env: dict): @@ -35,6 +148,14 @@ def fake_run(cmd: list[str], capture_output: bool, text: bool, env: dict): )() monkeypatch.setattr("benchmarks.skillsbench.run_infer.subprocess.run", fake_run) + monkeypatch.setattr( + "benchmarks.skillsbench.run_infer._get_supported_task_filter_flag", + lambda harbor_exe: "--include-task-name", + ) + monkeypatch.setattr( + "benchmarks.skillsbench.run_infer._get_supported_agent_name", + lambda harbor_exe: "openhands", + ) harbor_output_dir = run_harbor_evaluation( llm=LLM( @@ -42,7 +163,8 @@ def fake_run(cmd: list[str], capture_output: bool, text: bool, env: dict): api_key="test-key", base_url="https://proxy.example.com", ), - dataset=INFER_DEFAULTS["dataset"], + dataset=str(tmp_path / "tasks"), + dataset_is_path=True, output_dir=str(tmp_path), num_workers=2, task_ids=["benchflow/task-a", "benchflow/task-b"], @@ -56,21 +178,69 @@ def fake_run(cmd: list[str], capture_output: bool, text: bool, env: dict): assert cmd[:8] == [ "harbor", "run", - "-d", - "benchflow/skillsbench", + "--path", + str(tmp_path / "tasks"), "-a", - "openhands-sdk", + "openhands", "-m", "litellm_proxy/test-model", ] assert "--jobs-dir" in cmd assert str(expected_output_dir.resolve()) in cmd assert cmd.count("--include-task-name") == 2 - assert "benchflow/task-a" in cmd - assert "benchflow/task-b" in cmd + assert "task-a" in cmd + assert "task-b" in cmd + assert "benchflow/task-a" not in cmd + assert 
"--ae" not in cmd assert cmd[cmd.index("--n-concurrent") + 1] == "2" assert cmd[cmd.index("--n-tasks") + 1] == "3" + def test_run_harbor_evaluation_retries_with_legacy_task_flag( + self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch + ) -> None: + """Test Harbor falls back to --include-task-name when --task-name fails.""" + captured_cmds: list[list[str]] = [] + + def fake_run(cmd: list[str], capture_output: bool, text: bool, env: dict): + captured_cmds.append(cmd) + if "--task-name" in cmd: + return type( + "Completed", + (), + { + "returncode": 2, + "stdout": "", + "stderr": "No such option: --task-name", + }, + )() + return type( + "Completed", + (), + {"returncode": 0, "stdout": "ok", "stderr": ""}, + )() + + monkeypatch.setattr("benchmarks.skillsbench.run_infer.subprocess.run", fake_run) + monkeypatch.setattr( + "benchmarks.skillsbench.run_infer._get_supported_task_filter_flag", + lambda harbor_exe: "--task-name", + ) + monkeypatch.setattr( + "benchmarks.skillsbench.run_infer._get_supported_agent_name", + lambda harbor_exe: "openhands", + ) + + run_harbor_evaluation( + llm=LLM(model="test-model"), + dataset=str(tmp_path / "tasks"), + dataset_is_path=True, + output_dir=str(tmp_path), + task_ids=["benchflow/task-a"], + ) + + assert len(captured_cmds) == 2 + assert "--task-name" in captured_cmds[0] + assert "--include-task-name" in captured_cmds[1] + def test_llm_credentials_passed_via_env( self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch ) -> None: @@ -87,6 +257,14 @@ def fake_run(cmd: list[str], capture_output: bool, text: bool, env: dict): )() monkeypatch.setattr("benchmarks.skillsbench.run_infer.subprocess.run", fake_run) + monkeypatch.setattr( + "benchmarks.skillsbench.run_infer._get_supported_task_filter_flag", + lambda harbor_exe: "--include-task-name", + ) + monkeypatch.setattr( + "benchmarks.skillsbench.run_infer._get_supported_agent_name", + lambda harbor_exe: "openhands", + ) run_harbor_evaluation( llm=LLM( @@ -95,11 +273,13 @@ def fake_run(cmd: list[str], capture_output: bool, text: bool, env: dict): base_url="https://my-proxy.example.com", ), dataset=INFER_DEFAULTS["dataset"], + dataset_is_path=False, output_dir=str(tmp_path), ) assert captured["env"]["LLM_API_KEY"] == "my-secret-key" assert captured["env"]["LLM_BASE_URL"] == "https://my-proxy.example.com" + assert "--ae" not in captured["cmd"] class TestConvertHarborToEvalOutput: @@ -152,6 +332,35 @@ def test_successful_trial_parsing(self, tmp_path: Path) -> None: assert entries[0]["test_result"]["passed"] is True assert entries[0]["metrics"]["total_cost_usd"] == 0.05 + def test_local_trial_names_are_normalized_to_canonical_instance_ids( + self, tmp_path: Path + ) -> None: + """Test local Harbor task names without namespace keep benchflow ids.""" + trial_result = { + "task_name": "weighted-gdp-calc", + "trial_name": "weighted-gdp-calc__abc123", + "trial_uri": "file:///path/to/trial", + "agent_result": { + "n_input_tokens": 1000, + "n_output_tokens": 200, + "cost_usd": 0.05, + }, + "verifier_result": {"rewards": {"reward": 1.0}}, + "exception_info": None, + } + + harbor_dir = self._create_harbor_structure( + tmp_path, [("weighted-gdp-calc__abc123", trial_result)] + ) + output_file = tmp_path / "output.jsonl" + + convert_harbor_to_eval_output(harbor_dir, output_file) + + with open(output_file) as f: + entries = [json.loads(line) for line in f] + + assert entries[0]["instance_id"] == "benchflow/weighted-gdp-calc" + def test_failed_trial(self, tmp_path: Path) -> None: """Test parsing of a trial with reward 0.""" 
trial_result = { From eb2015016ac2b3508025ea2d4b5ebd482e4ea854 Mon Sep 17 00:00:00 2001 From: Chujun Tao Date: Sun, 5 Apr 2026 19:33:34 -0400 Subject: [PATCH 06/12] integrate skillsbench --- benchmarks/skillsbench/README.md | 163 ++++++++++ benchmarks/skillsbench/__init__.py | 1 + benchmarks/skillsbench/config.py | 16 + benchmarks/skillsbench/eval_infer.py | 280 ++++++++++++++++ benchmarks/skillsbench/run_infer.py | 467 +++++++++++++++++++++++++++ benchmarks/utils/report_costs.py | 4 +- pyproject.toml | 2 + 7 files changed, 932 insertions(+), 1 deletion(-) create mode 100644 benchmarks/skillsbench/README.md create mode 100644 benchmarks/skillsbench/__init__.py create mode 100644 benchmarks/skillsbench/config.py create mode 100644 benchmarks/skillsbench/eval_infer.py create mode 100644 benchmarks/skillsbench/run_infer.py diff --git a/benchmarks/skillsbench/README.md b/benchmarks/skillsbench/README.md new file mode 100644 index 000000000..60ff73652 --- /dev/null +++ b/benchmarks/skillsbench/README.md @@ -0,0 +1,163 @@ +# SkillsBench Evaluation + +This module provides integration with [SkillsBench](https://www.skillsbench.ai/), a benchmark for evaluating AI agents on real-world skill-based tasks. The integration uses [Harbor](https://harborframework.com) as the evaluation harness with the `openhands-sdk` agent. + +## Overview + +SkillsBench comprises tasks across 11 domains, evaluating the efficacy of Skills augmentation in LLM-based agents.Domains contain + +- Software engineering +- Office & white collar +- Natural science +- Media & content production +- Cybersecurity +- Finance +- Robotics +- Manufacturing +- Energy +- Mathematics +- Healthcare + +## Prerequisites + +1. **Install Harbor**: Harbor is the official harness for running SkillsBench. + + ```bash + pip install harbor + # or + uv pip install harbor + ``` + +2. **Docker**: Harbor requires Docker to be installed and running. + +3. **LLM API Key**: Configure your LLM provider credentials. 
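+
+As a rough sketch of how these credentials flow through the runner (illustration
+only; the config path below is an assumption, and the JSON format is described
+under "LLM Configuration" further down), `skillsbench-infer` deserializes the
+file into an `LLM` object and exports the key and base URL to the Harbor process
+environment:
+
+```python
+# Sketch: loading the LLM config and exposing credentials the way run_infer.py does.
+import os
+
+from pydantic import SecretStr
+
+from openhands.sdk import LLM
+
+with open(".llm_config/claude.json") as f:  # hypothetical path
+    llm = LLM.model_validate_json(f.read())
+
+env = os.environ.copy()
+if llm.api_key:
+    key = (
+        llm.api_key.get_secret_value()
+        if isinstance(llm.api_key, SecretStr)
+        else llm.api_key
+    )
+    env["LLM_API_KEY"] = key
+if llm.base_url:
+    env["LLM_BASE_URL"] = llm.base_url
+
+print(llm.model)
+```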
+ +## Usage + +### Running Inference + +Run the SkillsBench evaluation using the OpenHands SDK agent: + +```bash +# Run full evaluation +uv run skillsbench-infer .llm_config/claude.json + +# Run specific tasks +uv run skillsbench-infer .llm_config/claude.json --task-id benchflow/weighted-gdp-calc + +# Run tasks from a file +uv run skillsbench-infer .llm_config/claude.json --select tasks.txt + +# Limit the run to 5 tasks (useful for smoke tests) +uv run skillsbench-infer .llm_config/claude.json --n-limit 5 + +# Run with multiple workers +uv run skillsbench-infer .llm_config/claude.json --num-workers 4 +``` + +### LLM Configuration + +Create an LLM configuration file (e.g., `.llm_config/claude.json`): + +```json +{ + "model": "anthropic/claude-sonnet-4-20250514", + "api_key": "YOUR_API_KEY" +} +``` + +Or use a LiteLLM proxy: + +```json +{ + "model": "litellm_proxy/anthropic/claude-sonnet-4-20250514", + "base_url": "https://your-proxy.example.com", + "api_key": "YOUR_API_KEY" +} +``` + +### Evaluating Results + +After running inference, evaluate the results: + +```bash +uv run skillsbench-eval ./evaluation_outputs/.../output.jsonl +``` + +This generates a report file (`output.report.json`) with: +- Total/completed/resolved instance counts +- Success rate +- Aggregate metrics (cost, tokens) + +## Output Format + +### Inference Output (`output.jsonl`) + +Each line contains: + +```json +{ + "instance_id": "benchflow/task-name", + "test_result": { + "trial_name": "...", + "trial_uri": "...", + "rewards": {"reward": 1.0}, + "passed": true + }, + "instruction": "", + "error": null, + "history": [], + "metrics": { + "total_prompt_tokens": 5000, + "total_completion_tokens": 1000, + "total_cost_usd": 0.05 + } +} +``` + +### Evaluation Report (`output.report.json`) + +```json +{ + "total_instances": 100, + "completed_instances": 95, + "resolved_instances": 80, + "unresolved_instances": 15, + "error_instances": 5, + "aggregate_metrics": { + "total_cost_usd": 5.25, + "total_prompt_tokens": 500000, + "total_completion_tokens": 100000 + } +} +``` + +## Architecture + +The integration follows the Harbor agent adapter pattern: + +1. **Harbor Harness**: Manages task containers and lifecycle +2. **OpenHands SDK Agent**: Runs inside containers to solve tasks +3. 
**ATIF Trajectories**: Results stored in Agent Trajectory Interchange Format + +```text +┌──────────────────────────────────────────────────┐ +│ Harbor Harness │ +│ ┌────────────────────────────────────────────┐ │ +│ │ Task Container │ │ +│ │ ┌──────────────────────────────────────┐ │ │ +│ │ │ OpenHands SDK Agent │ │ │ +│ │ │ - Terminal tool │ │ │ +│ │ │ - File editor tool │ │ │ +│ │ │ - Task tracker tool │ │ │ +│ │ └──────────────────────────────────────┘ │ │ +│ └────────────────────────────────────────────┘ │ +└──────────────────────────────────────────────────┘ +``` + +## References + +- [SkillsBench](https://www.skillsbench.ai/) - The benchmark +- [Harbor](https://harborframework.com) - The evaluation harness +- [OpenHands SDK](https://github.com/OpenHands/software-agent-sdk) - The agent SDK +- [ATIF Specification](https://github.com/laude-institute/harbor/blob/main/docs/rfcs/0001-trajectory-format.md) - Trajectory format diff --git a/benchmarks/skillsbench/__init__.py b/benchmarks/skillsbench/__init__.py new file mode 100644 index 000000000..c02f7bafb --- /dev/null +++ b/benchmarks/skillsbench/__init__.py @@ -0,0 +1 @@ +# SkillsBench evaluation benchmark diff --git a/benchmarks/skillsbench/config.py b/benchmarks/skillsbench/config.py new file mode 100644 index 000000000..8b55a92b0 --- /dev/null +++ b/benchmarks/skillsbench/config.py @@ -0,0 +1,16 @@ +"""SkillsBench configuration defaults.""" + +# Default inference settings (only include values actually used by argparse) +INFER_DEFAULTS = { + "dataset": "benchflow/skillsbench", + "output_dir": "./evaluation_outputs", + "num_workers": 1, +} + +# Harbor configuration defaults +HARBOR_DEFAULTS = { + # Harbor executable + "harbor_executable": "harbor", + # Default agent name for openhands-sdk + "agent_name": "openhands-sdk", +} diff --git a/benchmarks/skillsbench/eval_infer.py b/benchmarks/skillsbench/eval_infer.py new file mode 100644 index 000000000..f55a91736 --- /dev/null +++ b/benchmarks/skillsbench/eval_infer.py @@ -0,0 +1,280 @@ +#!/usr/bin/env python3 +"""SkillsBench Evaluation Script. + +This script processes SkillsBench output and generates evaluation reports. +It reads the output.jsonl produced by run_infer, aggregates results, +and writes a summary report. + +Usage: + uv run skillsbench-eval +""" + +import argparse +import json +import sys +from pathlib import Path + +from benchmarks.utils.laminar import LaminarService +from benchmarks.utils.report_costs import generate_cost_report +from openhands.sdk import get_logger + + +logger = get_logger(__name__) + + +def process_skillsbench_results( + input_file: str, + output_file: str, +) -> dict: + """Process SkillsBench output.jsonl and generate evaluation report. + + SkillsBench format (from harbor conversion): + { + "instance_id": "task_id", + "test_result": { + "trajectory_path": "...", + "total_steps": N, + "final_metrics": {...}, + "passed": true/false # May be populated by harbor grading + }, + "instruction": "...", + "history": [...] + } + + Report format (similar to SWE-Bench): + { + "total_instances": N, + "submitted_instances": N, + "completed_instances": N, + "incomplete_instances": N, + "resolved_instances": N, + "unresolved_instances": N, + "error_instances": N, + "submitted_ids": [...], + "completed_ids": [...], + "incomplete_ids": [...], + "resolved_ids": [...], + "unresolved_ids": [...] 
+ } + """ + logger.info(f"Processing {input_file} to generate report: {output_file}") + + # Use sets for O(1) lookup and automatic deduplication + # Convert to sorted lists only when building final report + completed_ids: set[str] = set() + resolved_ids: set[str] = set() + unresolved_ids: set[str] = set() + incomplete_ids: set[str] = set() + error_ids: set[str] = set() + + # Aggregate metrics + total_cost_usd = 0.0 + total_prompt_tokens = 0 + total_completion_tokens = 0 + + with open(input_file) as infile: + for line_num, line in enumerate(infile, 1): + try: + line = line.strip() + if not line: + continue + + data = json.loads(line) + + # Extract required fields + instance_id = data.get("instance_id") + if not instance_id: + logger.warning(f"Line {line_num}: Missing instance_id") + continue + + if instance_id in completed_ids: + logger.warning( + f"Line {line_num}: Duplicate instance_id {instance_id}" + ) + continue + + # Check for errors + error = data.get("error") + if error: + error_ids.add(instance_id) + incomplete_ids.add(instance_id) + continue + + # Extract test result + test_result = data.get("test_result", {}) + + # Check if task passed (harbor may include this) + passed = test_result.get("passed") + # If not explicitly set, we mark as completed but ungraded + is_resolved = passed is True + + # Add to completed instances + completed_ids.add(instance_id) + + if is_resolved: + resolved_ids.add(instance_id) + else: + unresolved_ids.add(instance_id) + + # Aggregate metrics + # Use explicit None check to handle zero values correctly + # (using `or` would incorrectly fallback when value is 0) + metrics = data.get("metrics", {}) + final_metrics = test_result.get("final_metrics", {}) + + cost = metrics.get("total_cost_usd") + if cost is None: + cost = final_metrics.get("total_cost_usd", 0.0) + + prompt_tokens = metrics.get("total_prompt_tokens") + if prompt_tokens is None: + prompt_tokens = final_metrics.get("total_prompt_tokens", 0) + + completion_tokens = metrics.get("total_completion_tokens") + if completion_tokens is None: + completion_tokens = final_metrics.get("total_completion_tokens", 0) + + # After the None checks above, these values are guaranteed to be non-None + total_cost_usd += cost + total_prompt_tokens += prompt_tokens + total_completion_tokens += completion_tokens + + except json.JSONDecodeError as e: + logger.error(f"Line {line_num}: Invalid JSON - {e}") + except Exception as e: + logger.error(f"Line {line_num}: Unexpected error - {e}") + + # Check for separate error file (used in manual workflows where errors + # are extracted to a separate file for analysis/retry) + error_path = Path(input_file).with_name(f"{Path(input_file).stem}_errors.jsonl") + if error_path.exists(): + with open(error_path) as error_file: + for line_num, line in enumerate(error_file, 1): + try: + line = line.strip() + if not line: + continue + + data = json.loads(line) + instance_id = data.get("instance_id") + if not instance_id: + continue + if instance_id in completed_ids or instance_id in incomplete_ids: + continue + + incomplete_ids.add(instance_id) + error_ids.add(instance_id) + except (json.JSONDecodeError, Exception) as e: + logger.error(f"Error file line {line_num}: {e}") + + submitted_ids = completed_ids | incomplete_ids + + # Generate report - convert sets to sorted lists for consistent output + report = { + "total_instances": len(submitted_ids), + "submitted_instances": len(submitted_ids), + "completed_instances": len(completed_ids), + "incomplete_instances": len(incomplete_ids), + 
"resolved_instances": len(resolved_ids), + "unresolved_instances": len(unresolved_ids), + "error_instances": len(error_ids), + "submitted_ids": sorted(submitted_ids), + "completed_ids": sorted(completed_ids), + "incomplete_ids": sorted(incomplete_ids), + "resolved_ids": sorted(resolved_ids), + "unresolved_ids": sorted(unresolved_ids), + "error_ids": sorted(error_ids), + # Aggregate metrics + "aggregate_metrics": { + "total_cost_usd": total_cost_usd, + "total_prompt_tokens": total_prompt_tokens, + "total_completion_tokens": total_completion_tokens, + }, + } + + # Write report + with open(output_file, "w") as outfile: + json.dump(report, outfile, indent=4) + + logger.info("Report generated successfully:") + logger.info(f" Total instances: {report['total_instances']}") + logger.info(f" Completed instances: {report['completed_instances']}") + logger.info(f" Resolved instances: {report['resolved_instances']}") + logger.info(f" Unresolved instances: {report['unresolved_instances']}") + logger.info(f" Error instances: {report['error_instances']}") + if report["completed_instances"] > 0: + logger.info( + f" Success rate: " + f"{report['resolved_instances'] / report['completed_instances'] * 100:.1f}%" + ) + logger.info(f" Total cost: ${total_cost_usd:.4f}") + + return report + + +def main() -> None: + """Main entry point for the script.""" + parser = argparse.ArgumentParser( + description="Process SkillsBench output and generate evaluation report", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + uv run skillsbench-eval output.jsonl + uv run skillsbench-eval /path/to/output.jsonl + """, + ) + + parser.add_argument("input_file", help="Path to the SkillsBench output.jsonl file") + parser.add_argument( + "--output-file", + help="Output file for report (default: input_file with .report.json extension)", + ) + + args = parser.parse_args() + + # Validate input file + input_file = Path(args.input_file) + if not input_file.exists(): + logger.error(f"Input file does not exist: {input_file}") + sys.exit(1) + + if not input_file.suffix == ".jsonl": + logger.warning(f"Input file does not have .jsonl extension: {input_file}") + + # Determine output file + if args.output_file: + output_file = Path(args.output_file) + else: + output_file = input_file.with_suffix(".report.json") + + logger.info(f"Input file: {input_file}") + logger.info(f"Output file: {output_file}") + + try: + # Process results and generate report + process_skillsbench_results( + str(input_file), + str(output_file), + ) + except Exception as e: + logger.error(f"Script failed: {e}") + sys.exit(1) + + # Non-critical telemetry and reporting - wrap in try/except so expensive + # multi-hour evaluations don't fail at the telemetry step after completing + try: + LaminarService.get().update_evaluation_scores(str(input_file), str(output_file)) + except Exception as e: + logger.warning(f"Laminar update failed (non-critical): {e}") + + try: + generate_cost_report(str(input_file)) + except Exception as e: + logger.warning(f"Cost report generation failed (non-critical): {e}") + + logger.info("Script completed successfully!") + print(json.dumps({"report_json": str(output_file)})) + + +if __name__ == "__main__": + main() diff --git a/benchmarks/skillsbench/run_infer.py b/benchmarks/skillsbench/run_infer.py new file mode 100644 index 000000000..a8afa7281 --- /dev/null +++ b/benchmarks/skillsbench/run_infer.py @@ -0,0 +1,467 @@ +"""SkillsBench inference script using Harbor with openhands-sdk agent. 
+ +This script runs SkillsBench evaluation using Harbor as the harness +and openhands-sdk as the agent. Results are saved in a format compatible +with the standard evaluation pipeline. + +Usage: + uv run skillsbench-infer --dataset benchflow/skillsbench +""" + +import argparse +import json +import os +import subprocess +import sys +from datetime import datetime, timezone +from pathlib import Path + +from pydantic import SecretStr + +from benchmarks.skillsbench.config import HARBOR_DEFAULTS, INFER_DEFAULTS +from benchmarks.utils.evaluation_utils import construct_eval_output_dir +from benchmarks.utils.report_costs import generate_cost_report +from openhands.sdk import LLM, get_logger + + +logger = get_logger(__name__) + +# Output filename for results +OUTPUT_FILENAME = "output.jsonl" + + +def check_harbor_installed() -> bool: + """Check if harbor CLI is installed and available.""" + harbor_exe = HARBOR_DEFAULTS["harbor_executable"] + try: + result = subprocess.run( + [harbor_exe, "--version"], + capture_output=True, + text=True, + timeout=10, + ) + return result.returncode == 0 + except (FileNotFoundError, subprocess.TimeoutExpired): + return False + + +def run_harbor_evaluation( + llm: LLM, + dataset: str, + output_dir: str, + num_workers: int = 1, + task_ids: list[str] | None = None, + n_limit: int | None = None, +) -> Path: + """Run harbor evaluation with openhands-sdk agent. + + Args: + llm: LLM configuration for the agent. + dataset: Harbor dataset name (e.g., benchflow/skillsbench). + output_dir: Directory to store output files. + num_workers: Number of parallel workers. + task_ids: Optional list of specific task IDs to run. + n_limit: Optional maximum number of dataset tasks to run. + + Returns: + Path to the harbor output directory. + """ + harbor_output_dir = Path(output_dir) / "harbor_output" + harbor_output_dir.mkdir(parents=True, exist_ok=True) + harbor_exe = HARBOR_DEFAULTS["harbor_executable"] + + # Build harbor command using harbor CLI flags. + # Use absolute path for --jobs-dir to avoid CWD-relative path issues. + cmd = [ + harbor_exe, + "run", + "-d", + dataset, + "-a", + HARBOR_DEFAULTS["agent_name"], + "-m", + llm.model, + "--jobs-dir", + str(harbor_output_dir.resolve()), + "--n-concurrent", + str(num_workers), + ] + + # Pass LLM credentials as agent environment variables + if llm.api_key: + api_key = ( + llm.api_key.get_secret_value() + if isinstance(llm.api_key, SecretStr) + else llm.api_key + ) + cmd.extend(["--ae", f"LLM_API_KEY={api_key}"]) + if llm.base_url: + cmd.extend(["--ae", f"LLM_BASE_URL={llm.base_url}"]) + + # Add specific task names if provided + if task_ids: + for task_id in task_ids: + cmd.extend(["--include-task-name", task_id]) + + if n_limit is not None: + cmd.extend(["--n-tasks", str(n_limit)]) + + logger.info(f"Running harbor command: {' '.join(cmd)}") + logger.info(f"Output directory: {harbor_output_dir}") + + # harbor's openhands-sdk agent reads LLM credentials from the host process + # environment (os.environ), not from --ae flags which go to the sandbox. 
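+    # For reference (illustration only), the child process therefore sees the
+    # equivalent of:
+    #   LLM_API_KEY=<key> LLM_BASE_URL=<base_url> harbor run ...
+    # with the values taken from the loaded LLM config.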
+ env = os.environ.copy() + if llm.api_key: + api_key = ( + llm.api_key.get_secret_value() + if isinstance(llm.api_key, SecretStr) + else llm.api_key + ) + env["LLM_API_KEY"] = api_key + if llm.base_url: + env["LLM_BASE_URL"] = llm.base_url + + try: + result = subprocess.run( + cmd, + capture_output=True, + text=True, + env=env, + ) + + if result.returncode != 0: + logger.error(f"Harbor command failed with code {result.returncode}") + logger.error(f"stdout: {result.stdout}") + logger.error(f"stderr: {result.stderr}") + raise RuntimeError(f"Harbor evaluation failed: {result.stderr}") + + logger.info("Harbor evaluation completed successfully") + logger.info(f"stdout: {result.stdout}") + + except FileNotFoundError: + raise RuntimeError( + "Harbor CLI not found. Please install harbor: pip install harbor" + ) + + return harbor_output_dir + + +def _find_job_dir(harbor_output_dir: Path) -> Path: + """Find the harbor job directory (timestamp-named) inside the output dir.""" + # Harbor creates a timestamp-named subdirectory (e.g., 2026-03-07__16-08-47) + # containing result.json and trial subdirectories + candidates = [ + d + for d in harbor_output_dir.iterdir() + if d.is_dir() and (d / "result.json").exists() + ] + if not candidates: + raise RuntimeError( + f"No harbor job directory found in {harbor_output_dir}. " + f"Expected a timestamp-named directory containing result.json." + ) + # Use the most recent job directory if multiple exist + return sorted(candidates)[-1] + + +def convert_harbor_to_eval_output( + harbor_output_dir: Path, + eval_output_path: Path, +) -> None: + """Convert harbor output to evaluation output format. + + Harbor stores trial results in a job directory structured as: + harbor_output/TIMESTAMP/TRIAL_NAME/result.json + + Each trial's result.json contains task_name, verifier_result, agent_result, + timing info, and exception details. + + Args: + harbor_output_dir: Path to harbor output directory. + eval_output_path: Path to write the converted output.jsonl. + """ + logger.info(f"Converting harbor output from {harbor_output_dir}") + + job_dir = _find_job_dir(harbor_output_dir) + logger.info(f"Using harbor job directory: {job_dir}") + + # Find trial result files (each trial dir has a result.json) + result_files = list(job_dir.glob("*/result.json")) + # Exclude the job-level result.json + result_files = [f for f in result_files if f.parent != job_dir] + + if not result_files: + raise RuntimeError( + f"No trial result files found in {job_dir}. " + f"Expected result.json files in trial subdirectories." 
+ ) + + logger.info(f"Found {len(result_files)} trial results in {job_dir}") + + results: list[dict] = [] + errors: list[dict] = [] + + for result_file in result_files: + try: + with open(result_file) as f: + trial = json.load(f) + + instance_id = trial.get("task_name", result_file.parent.name) + + # Check for exceptions + if trial.get("exception_info"): + errors.append( + { + "instance_id": instance_id, + "error": str(trial["exception_info"]), + "test_result": {}, + } + ) + continue + + # Extract verifier results + verifier_result = trial.get("verifier_result", {}) + rewards = verifier_result.get("rewards", {}) + passed = rewards.get("reward", 0.0) > 0 + + # Extract agent metrics + agent_result = trial.get("agent_result", {}) + + eval_entry = { + "instance_id": instance_id, + "test_result": { + "trial_name": trial.get("trial_name"), + "trial_uri": trial.get("trial_uri"), + "rewards": rewards, + "passed": passed, + }, + "instruction": "", + "error": None, + "history": [], + "metrics": { + "total_prompt_tokens": agent_result.get("n_input_tokens") or 0, + "total_completion_tokens": ( + agent_result.get("n_output_tokens") or 0 + ), + "total_cost_usd": agent_result.get("cost_usd") or 0.0, + }, + } + results.append(eval_entry) + logger.info( + f"Processed trial {instance_id}: reward={rewards.get('reward', 'N/A')}" + ) + + except (json.JSONDecodeError, OSError) as e: + logger.error(f"Failed to process result file {result_file}: {e}") + errors.append( + { + "instance_id": result_file.parent.name, + "error": str(e), + "test_result": {}, + } + ) + + if not results and not errors: + raise RuntimeError(f"No trials processed from {harbor_output_dir}") + + if not results: + logger.warning( + f"All {len(errors)} trials failed in {harbor_output_dir}; " + "writing error entries for downstream reporting" + ) + + # Write results to output.jsonl + with open(eval_output_path, "w") as f: + for entry in results: + f.write(json.dumps(entry) + "\n") + for entry in errors: + f.write(json.dumps(entry) + "\n") + + logger.info( + f"Wrote {len(results)} successful + {len(errors)} failed entries " + f"to {eval_output_path}" + ) + + +def load_task_ids_from_file(filepath: str) -> list[str]: + """Load task IDs from a text file (one per line).""" + task_ids = [] + with open(filepath) as f: + for line in f: + line = line.strip() + if line and not line.startswith("#"): + task_ids.append(line) + return task_ids + + +def main() -> None: + """Main entry point for skillsbench inference.""" + parser = argparse.ArgumentParser( + description="Run SkillsBench evaluation with openhands-sdk via Harbor", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Run full skillsbench evaluation + uv run skillsbench-infer .llm_config/claude.json + + # Run specific tasks + uv run skillsbench-infer .llm_config/claude.json --select tasks.txt + + # Run with custom dataset version + uv run skillsbench-infer .llm_config/claude.json --dataset benchflow/skillsbench@1.0 + """, + ) + + parser.add_argument( + "llm_config_path", + type=str, + help="Path to JSON LLM configuration file", + ) + parser.add_argument( + "--dataset", + type=str, + default=INFER_DEFAULTS["dataset"], + help="Harbor dataset name (e.g., benchflow/skillsbench)", + ) + parser.add_argument( + "--output-dir", + type=str, + default=INFER_DEFAULTS["output_dir"], + help="Base output directory for evaluation results", + ) + parser.add_argument( + "--num-workers", + type=int, + default=INFER_DEFAULTS["num_workers"], + help="Number of parallel workers", + ) + 
parser.add_argument( + "--n-limit", + type=int, + help="Maximum number of dataset tasks to run after Harbor filtering", + ) + parser.add_argument( + "--select", + type=str, + help="Path to text file containing task IDs to run (one per line)", + ) + parser.add_argument( + "--task-id", + type=str, + action="append", + help="Specific task ID to run (can be specified multiple times)", + ) + parser.add_argument( + "--note", + type=str, + help="Optional note for the evaluation run", + ) + parser.add_argument( + "--skip-harbor", + action="store_true", + help="Skip running harbor and only convert existing results", + ) + + args = parser.parse_args() + + # Validate LLM config + if not os.path.isfile(args.llm_config_path): + logger.error(f"LLM config file does not exist: {args.llm_config_path}") + sys.exit(1) + + with open(args.llm_config_path) as f: + llm_config = f.read() + llm = LLM.model_validate_json(llm_config) + logger.info(f"Using LLM: {llm.model}") + + # Check harbor installation + if not args.skip_harbor and not check_harbor_installed(): + logger.error( + "Harbor CLI is not installed. Please install it:\n" + " pip install harbor\n" + " # or\n" + " uv pip install harbor" + ) + sys.exit(1) + + # Construct output directory + dataset_description = args.dataset.replace("/", "__").replace("@", "-") + structured_output_dir = construct_eval_output_dir( + base_dir=args.output_dir, + dataset_name=dataset_description, + model_name=llm.model, + max_iterations=100, # Not directly used but required for path construction + eval_note=args.note, + ) + + logger.info(f"Output directory: {structured_output_dir}") + os.makedirs(structured_output_dir, exist_ok=True) + + # Save metadata + metadata = { + "llm": llm.model_dump_json(), + "dataset": args.dataset, + "timestamp": datetime.now(timezone.utc).isoformat(), + "harbor_agent": HARBOR_DEFAULTS["agent_name"], + "note": args.note, + } + metadata_path = Path(structured_output_dir) / "metadata.json" + with open(metadata_path, "w") as f: + json.dump(metadata, f, indent=2) + + # Collect task IDs if specified + task_ids: list[str] | None = None + if args.select: + loaded_ids = load_task_ids_from_file(args.select) + task_ids = loaded_ids + logger.info(f"Loaded {len(loaded_ids)} task IDs from {args.select}") + elif args.task_id: + task_ids = list(args.task_id) + logger.info(f"Running {len(task_ids)} specified task IDs") + + output_path = Path(structured_output_dir) / OUTPUT_FILENAME + + if not args.skip_harbor: + # Run harbor evaluation + try: + harbor_output_dir = run_harbor_evaluation( + llm=llm, + dataset=args.dataset, + output_dir=structured_output_dir, + num_workers=args.num_workers, + task_ids=task_ids, + n_limit=args.n_limit, + ) + + # Convert harbor output to standard format + convert_harbor_to_eval_output( + harbor_output_dir=harbor_output_dir, + eval_output_path=output_path, + ) + + except Exception as e: + logger.error(f"Evaluation failed: {e}") + sys.exit(1) + else: + # Skip harbor, just convert existing results + harbor_output_dir = Path(structured_output_dir) / "harbor_output" + if harbor_output_dir.exists(): + convert_harbor_to_eval_output( + harbor_output_dir=harbor_output_dir, + eval_output_path=output_path, + ) + else: + logger.error(f"No harbor output found at {harbor_output_dir}") + sys.exit(1) + + # Generate cost report + if output_path.exists(): + generate_cost_report(str(output_path)) + + logger.info("SkillsBench inference completed!") + print(json.dumps({"output_json": str(output_path)})) + + +if __name__ == "__main__": + main() diff --git 
a/benchmarks/utils/report_costs.py b/benchmarks/utils/report_costs.py index 8f38909f3..7a21a3831 100755 --- a/benchmarks/utils/report_costs.py +++ b/benchmarks/utils/report_costs.py @@ -48,7 +48,9 @@ def extract_accumulated_cost(jsonl_data: List[Optional[Dict]]) -> float: if entry is None: continue metrics = entry.get("metrics") or {} - accumulated_cost = metrics.get("accumulated_cost") + accumulated_cost = metrics.get("accumulated_cost") or metrics.get( + "total_cost_usd" + ) if accumulated_cost is not None: total_cost += float(accumulated_cost) diff --git a/pyproject.toml b/pyproject.toml index 11773729a..3fdcfcbdd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -81,6 +81,8 @@ hybridgym-issuelocalize-infer = "benchmarks.hybridgym_issuelocalize.run_infer:ma hybridgym-issuelocalize-eval = "benchmarks.hybridgym_issuelocalize.eval_infer:main" swesmith-infer = "benchmarks.swesmith.run_infer:main" swesmith-eval = "benchmarks.swesmith.eval_infer:main" +skillsbench-infer = "benchmarks.skillsbench.run_infer:main" +skillsbench-eval = "benchmarks.skillsbench.eval_infer:main" [build-system] requires = ["setuptools>=61.0", "wheel"] From 90e8c9f63f8460f8b3ca46cd87467ec62383fb6c Mon Sep 17 00:00:00 2001 From: Chujun Tao Date: Sun, 5 Apr 2026 19:44:33 -0400 Subject: [PATCH 07/12] add skillsbench tests --- tests/test_skillsbench_eval_infer.py | 125 +++++++++++++++ tests/test_skillsbench_run_infer.py | 221 +++++++++++++++++++++++++++ 2 files changed, 346 insertions(+) create mode 100644 tests/test_skillsbench_eval_infer.py create mode 100644 tests/test_skillsbench_run_infer.py diff --git a/tests/test_skillsbench_eval_infer.py b/tests/test_skillsbench_eval_infer.py new file mode 100644 index 000000000..56d54f27a --- /dev/null +++ b/tests/test_skillsbench_eval_infer.py @@ -0,0 +1,125 @@ +"""Tests for SkillsBench eval_infer module.""" + +import json +from pathlib import Path + +from benchmarks.skillsbench.eval_infer import process_skillsbench_results + + +class TestProcessSkillsbenchResults: + """Tests for the process_skillsbench_results function.""" + + def test_empty_input(self, tmp_path: Path) -> None: + """Test processing empty input file.""" + input_file = tmp_path / "empty.jsonl" + output_file = tmp_path / "empty.report.json" + input_file.write_text("") + + result = process_skillsbench_results(str(input_file), str(output_file)) + + assert result["total_instances"] == 0 + assert result["completed_instances"] == 0 + assert result["resolved_instances"] == 0 + + def test_resolved_instance(self, tmp_path: Path) -> None: + """Test processing a resolved (passed=True) instance.""" + input_file = tmp_path / "resolved.jsonl" + output_file = tmp_path / "resolved.report.json" + + entry = { + "instance_id": "benchflow/weighted-gdp-calc", + "test_result": {"passed": True, "rewards": {"reward": 1.0}}, + "error": None, + } + input_file.write_text(json.dumps(entry) + "\n") + + result = process_skillsbench_results(str(input_file), str(output_file)) + + assert result["resolved_instances"] == 1 + assert result["unresolved_instances"] == 0 + assert "benchflow/weighted-gdp-calc" in result["resolved_ids"] + + def test_unresolved_instance(self, tmp_path: Path) -> None: + """Test processing an unresolved (passed=False) instance.""" + input_file = tmp_path / "unresolved.jsonl" + output_file = tmp_path / "unresolved.report.json" + + entry = { + "instance_id": "benchflow/task-1", + "test_result": {"passed": False, "rewards": {"reward": 0.0}}, + "error": None, + } + input_file.write_text(json.dumps(entry) + "\n") + + result = 
process_skillsbench_results(str(input_file), str(output_file)) + + assert result["resolved_instances"] == 0 + assert result["unresolved_instances"] == 1 + + def test_instance_with_error(self, tmp_path: Path) -> None: + """Test processing an instance that errored.""" + input_file = tmp_path / "error.jsonl" + output_file = tmp_path / "error.report.json" + + entry = { + "instance_id": "benchflow/error-task", + "test_result": {}, + "error": "ValueError: LLM_API_KEY environment variable must be set", + } + input_file.write_text(json.dumps(entry) + "\n") + + result = process_skillsbench_results(str(input_file), str(output_file)) + + assert result["error_instances"] == 1 + assert result["incomplete_instances"] == 1 + assert result["completed_instances"] == 0 + assert "benchflow/error-task" in result["error_ids"] + + def test_multiple_instances(self, tmp_path: Path) -> None: + """Test processing multiple instances with mixed results.""" + input_file = tmp_path / "multi.jsonl" + output_file = tmp_path / "multi.report.json" + + entries = [ + { + "instance_id": "benchflow/task-1", + "test_result": {"passed": True}, + "error": None, + }, + { + "instance_id": "benchflow/task-2", + "test_result": {"passed": False}, + "error": None, + }, + {"instance_id": "benchflow/task-3", "test_result": {}, "error": "Timeout"}, + ] + input_file.write_text("\n".join(json.dumps(e) for e in entries) + "\n") + + result = process_skillsbench_results(str(input_file), str(output_file)) + + assert result["total_instances"] == 3 + assert result["completed_instances"] == 2 + assert result["resolved_instances"] == 1 + assert result["unresolved_instances"] == 1 + assert result["error_instances"] == 1 + + def test_report_file_written(self, tmp_path: Path) -> None: + """Test that report file is written correctly.""" + input_file = tmp_path / "input.jsonl" + output_file = tmp_path / "output.report.json" + + entry = { + "instance_id": "benchflow/task-1", + "test_result": {"passed": True}, + "error": None, + } + input_file.write_text(json.dumps(entry) + "\n") + + process_skillsbench_results(str(input_file), str(output_file)) + + assert output_file.exists() + with open(output_file) as f: + report = json.load(f) + assert "total_instances" in report + assert "resolved_ids" in report + assert "aggregate_metrics" in report diff --git a/tests/test_skillsbench_run_infer.py b/tests/test_skillsbench_run_infer.py new file mode 100644 index 000000000..5f8452cb3 --- /dev/null +++ b/tests/test_skillsbench_run_infer.py @@ -0,0 +1,221 @@ +"""Tests for SkillsBench run_infer module.""" + +import json +from pathlib import Path + +import pytest + +from benchmarks.skillsbench.config import INFER_DEFAULTS +from benchmarks.skillsbench.run_infer import ( + convert_harbor_to_eval_output, + run_harbor_evaluation, +) +from openhands.sdk import LLM + + +class TestRunHarborEvaluation: + """Tests for building Harbor invocation arguments.""" + + def test_default_dataset_matches_harbor_registry(self) -> None: + """Test that the default dataset name matches Harbor's published registry.""" + assert INFER_DEFAULTS["dataset"] == "benchflow/skillsbench" + + def test_run_harbor_evaluation_passes_filters_and_limits( + self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch + ) -> None: + """Test Harbor command includes task filters and n-limit.""" + captured: dict[str, list[str]] = {} + + def fake_run(cmd: list[str], capture_output: bool, text: bool, env: dict): + captured["cmd"] = cmd + return type( + "Completed", + (), + {"returncode": 0, "stdout": "ok", "stderr": ""}, + 
)() + + monkeypatch.setattr("benchmarks.skillsbench.run_infer.subprocess.run", fake_run) + + harbor_output_dir = run_harbor_evaluation( + llm=LLM( + model="litellm_proxy/test-model", + api_key="test-key", + base_url="https://proxy.example.com", + ), + dataset=INFER_DEFAULTS["dataset"], + output_dir=str(tmp_path), + num_workers=2, + task_ids=["benchflow/task-a", "benchflow/task-b"], + n_limit=3, + ) + + expected_output_dir = tmp_path / "harbor_output" + assert harbor_output_dir == expected_output_dir + + cmd = captured["cmd"] + assert cmd[:8] == [ + "harbor", + "run", + "-d", + "benchflow/skillsbench", + "-a", + "openhands-sdk", + "-m", + "litellm_proxy/test-model", + ] + assert "--jobs-dir" in cmd + assert str(expected_output_dir.resolve()) in cmd + assert cmd.count("--include-task-name") == 2 + assert "benchflow/task-a" in cmd + assert "benchflow/task-b" in cmd + assert cmd[cmd.index("--n-concurrent") + 1] == "2" + assert cmd[cmd.index("--n-tasks") + 1] == "3" + + def test_llm_credentials_passed_via_env( + self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch + ) -> None: + """Test that LLM credentials are passed via subprocess env, not --ae flags.""" + captured: dict = {} + + def fake_run(cmd: list[str], capture_output: bool, text: bool, env: dict): + captured["cmd"] = cmd + captured["env"] = env + return type( + "Completed", + (), + {"returncode": 0, "stdout": "ok", "stderr": ""}, + )() + + monkeypatch.setattr("benchmarks.skillsbench.run_infer.subprocess.run", fake_run) + + run_harbor_evaluation( + llm=LLM( + model="test-model", + api_key="my-secret-key", + base_url="https://my-proxy.example.com", + ), + dataset=INFER_DEFAULTS["dataset"], + output_dir=str(tmp_path), + ) + + assert captured["env"]["LLM_API_KEY"] == "my-secret-key" + assert captured["env"]["LLM_BASE_URL"] == "https://my-proxy.example.com" + + +class TestConvertHarborToEvalOutput: + """Tests for convert_harbor_to_eval_output function.""" + + def _create_harbor_structure( + self, tmp_path: Path, trials: list[tuple[str, dict]] + ) -> Path: + """Create a mock Harbor output structure.""" + harbor_dir = tmp_path / "harbor_output" + job_dir = harbor_dir / "2026-01-01__00-00-00" + job_dir.mkdir(parents=True) + (job_dir / "result.json").write_text(json.dumps({"id": "test-job"})) + + for trial_name, trial_result in trials: + trial_dir = job_dir / trial_name + trial_dir.mkdir() + (trial_dir / "result.json").write_text(json.dumps(trial_result)) + + return harbor_dir + + def test_successful_trial_parsing(self, tmp_path: Path) -> None: + """Test successful parsing of harbor trial result.""" + trial_result = { + "task_name": "benchflow/weighted-gdp-calc", + "trial_name": "weighted-gdp-calc__abc123", + "trial_uri": "file:///path/to/trial", + "agent_result": { + "n_input_tokens": 1000, + "n_output_tokens": 200, + "cost_usd": 0.05, + }, + "verifier_result": {"rewards": {"reward": 1.0}}, + "exception_info": None, + } + + harbor_dir = self._create_harbor_structure( + tmp_path, [("weighted-gdp-calc__abc123", trial_result)] + ) + output_file = tmp_path / "output.jsonl" + + convert_harbor_to_eval_output(harbor_dir, output_file) + + assert output_file.exists() + with open(output_file) as f: + entries = [json.loads(line) for line in f] + + assert len(entries) == 1 + assert entries[0]["instance_id"] == "benchflow/weighted-gdp-calc" + assert entries[0]["test_result"]["passed"] is True + assert entries[0]["metrics"]["total_cost_usd"] == 0.05 + + def test_failed_trial(self, tmp_path: Path) -> None: + """Test parsing of a trial with reward 0.""" + 
trial_result = { + "task_name": "benchflow/task-1", + "trial_name": "task-1__xyz", + "agent_result": { + "n_input_tokens": None, + "n_output_tokens": None, + "cost_usd": None, + }, + "verifier_result": {"rewards": {"reward": 0.0}}, + "exception_info": None, + } + + harbor_dir = self._create_harbor_structure( + tmp_path, [("task-1__xyz", trial_result)] + ) + output_file = tmp_path / "output.jsonl" + convert_harbor_to_eval_output(harbor_dir, output_file) + + with open(output_file) as f: + entries = [json.loads(line) for line in f] + + assert entries[0]["test_result"]["passed"] is False + assert entries[0]["metrics"]["total_cost_usd"] == 0.0 + + def test_trial_with_exception(self, tmp_path: Path) -> None: + """Test that exception trials are written as error entries.""" + trial_result = { + "task_name": "benchflow/error-task", + "trial_name": "error-task__err", + "agent_result": {}, + "verifier_result": {}, + "exception_info": {"type": "ValueError", "message": "LLM_API_KEY not set"}, + } + + harbor_dir = self._create_harbor_structure( + tmp_path, [("error-task__err", trial_result)] + ) + output_file = tmp_path / "output.jsonl" + convert_harbor_to_eval_output(harbor_dir, output_file) + + with open(output_file) as f: + entries = [json.loads(line) for line in f] + + assert len(entries) == 1 + assert entries[0]["instance_id"] == "benchflow/error-task" + assert entries[0]["error"] is not None + assert entries[0]["test_result"] == {} + + def test_missing_job_directory(self, tmp_path: Path) -> None: + """Test handling when no job directory exists.""" + harbor_dir = tmp_path / "harbor_output" + harbor_dir.mkdir() + + with pytest.raises(RuntimeError, match="No harbor job directory found"): + convert_harbor_to_eval_output(harbor_dir, tmp_path / "output.jsonl") + + def test_empty_job_directory(self, tmp_path: Path) -> None: + """Test handling of harbor job dir with no trial subdirs.""" + harbor_dir = tmp_path / "harbor_output" + job_dir = harbor_dir / "2026-01-01__00-00-00" + job_dir.mkdir(parents=True) + (job_dir / "result.json").write_text(json.dumps({"id": "test"})) + + with pytest.raises(RuntimeError, match="No trial result files found"): + convert_harbor_to_eval_output(harbor_dir, tmp_path / "output.jsonl") From 4d69c04ac4943f099ad823682edf852f61719fcb Mon Sep 17 00:00:00 2001 From: Chujun Tao Date: Wed, 22 Apr 2026 20:47:30 -0400 Subject: [PATCH 08/12] feat(skillsbench): migrate harness from Harbor to benchflow 0.3.0 Switch the SkillsBench evaluation harness from Harbor/openhands-sdk to benchflow 0.3.0 with the native openhands ACP agent. 
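For reference, a minimal sketch of the two CLI forms the new wrapper can emit
(flags as named in the change list below; `config.yaml` stands in for the
generated job config file):

    bench eval create -f config.yaml        # current `bench` CLI
    benchflow job --config config.yaml      # legacy `benchflow` binary fallback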
Key changes: - Replace Harbor-specific logic with benchflow CLI invocation (`bench eval create -f config.yaml` / legacy `benchflow job --config`) - Add sparse-checkout task download to avoid cloning the full skillsbench repo - Fix metrics extraction: benchflow 0.3.0 result.json omits cost/token fields; now reads from agent/trajectory.json (harbor-format) or parses agent/openhands.txt stdout (ACP agent) - Fix timestamp detection with regex (_TIMESTAMP_RE) to correctly identify benchflow 0.3.0 job dirs (YYYY-MM-DD__HH-MM-SS) vs plain task dirs - Fix openhands install failure on Ubuntu 24.04 (PEP 668) by injecting PIP_BREAK_SYSTEM_PACKAGES=1 into agent_env - Add provider-specific env var injection for direct Gemini/Anthropic models - Update README and config to reflect benchflow harness Co-Authored-By: Claude Sonnet 4.6 --- .gitignore | 1 + benchmarks/skillsbench/README.md | 42 +- benchmarks/skillsbench/config.py | 11 +- benchmarks/skillsbench/run_infer.py | 655 ++++++++++++++++++--------- tests/test_skillsbench_eval_infer.py | 17 - tests/test_skillsbench_run_infer.py | 442 +++++++++++++----- uv.lock | 50 +- vendor/software-agent-sdk | 2 +- 8 files changed, 824 insertions(+), 396 deletions(-) diff --git a/.gitignore b/.gitignore index 459fad588..9164fd12b 100644 --- a/.gitignore +++ b/.gitignore @@ -216,4 +216,5 @@ workspace/ # Evaluation outputs eval_outputs/ +evaluation_outputs/ builds/ diff --git a/benchmarks/skillsbench/README.md b/benchmarks/skillsbench/README.md index 60ff73652..21339842c 100644 --- a/benchmarks/skillsbench/README.md +++ b/benchmarks/skillsbench/README.md @@ -1,10 +1,10 @@ # SkillsBench Evaluation -This module provides integration with [SkillsBench](https://www.skillsbench.ai/), a benchmark for evaluating AI agents on real-world skill-based tasks. The integration uses [Harbor](https://harborframework.com) as the evaluation harness with the `openhands-sdk` agent. +This module provides integration with [SkillsBench](https://www.skillsbench.ai/), a benchmark for evaluating AI agents on real-world skill-based tasks. The integration uses [benchflow](https://github.com/benchflow-ai/benchflow) as the evaluation harness with the `openhands` agent. ## Overview -SkillsBench comprises tasks across 11 domains, evaluating the efficacy of Skills augmentation in LLM-based agents.Domains contain +SkillsBench comprises tasks across 11 domains, evaluating the efficacy of Skills augmentation in LLM-based agents. Domains include: - Software engineering - Office & white collar @@ -20,23 +20,25 @@ SkillsBench comprises tasks across 11 domains, evaluating the efficacy of Skills ## Prerequisites -1. **Install Harbor**: Harbor is the official harness for running SkillsBench. +1. **Install benchflow**: benchflow is the official harness for running SkillsBench. ```bash - pip install harbor + uv tool install benchflow==0.3.0 # or - uv pip install harbor + pip install benchflow==0.3.0 + # or + uv pip install benchflow==0.3.0 ``` -2. **Docker**: Harbor requires Docker to be installed and running. +2. **Docker**: benchflow requires Docker to be installed and running. -3. **LLM API Key**: Configure your LLM provider credentials. +3. **LLM API Key**: Configure your LLM provider credentials. The benchflow `openhands` agent reads `LLM_API_KEY` and optional `LLM_BASE_URL` from the environment. 
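+   For example, when invoking benchflow directly (outside `skillsbench-infer`) the
+   variables can be exported in the shell; `skillsbench-infer` itself injects them
+   from the LLM config file described below (values here are placeholders):
+
+   ```bash
+   export LLM_API_KEY="YOUR_API_KEY"
+   export LLM_BASE_URL="https://your-proxy.example.com"  # optional, proxy setups only
+   ```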
## Usage ### Running Inference -Run the SkillsBench evaluation using the OpenHands SDK agent: +Run the SkillsBench evaluation using the `openhands` agent: ```bash # Run full evaluation @@ -62,7 +64,7 @@ Create an LLM configuration file (e.g., `.llm_config/claude.json`): ```json { "model": "anthropic/claude-sonnet-4-20250514", - "api_key": "YOUR_API_KEY" + "api_key": "YOUR_ANTHROPIC_API_KEY" } ``` @@ -99,8 +101,6 @@ Each line contains: { "instance_id": "benchflow/task-name", "test_result": { - "trial_name": "...", - "trial_uri": "...", "rewards": {"reward": 1.0}, "passed": true }, @@ -134,22 +134,21 @@ Each line contains: ## Architecture -The integration follows the Harbor agent adapter pattern: +The integration uses the benchflow CLI as the evaluation harness: -1. **Harbor Harness**: Manages task containers and lifecycle -2. **OpenHands SDK Agent**: Runs inside containers to solve tasks -3. **ATIF Trajectories**: Results stored in Agent Trajectory Interchange Format +1. **Task download**: the integration clones the SkillsBench task repo locally when the task cache is empty +2. **benchflow job**: Runs all tasks concurrently with `openhands` +3. **Result conversion**: Trial `result.json` files are converted to the standard `output.jsonl` format ```text ┌──────────────────────────────────────────────────┐ -│ Harbor Harness │ +│ benchflow job │ │ ┌────────────────────────────────────────────┐ │ -│ │ Task Container │ │ +│ │ Task Container (Docker) │ │ │ │ ┌──────────────────────────────────────┐ │ │ -│ │ │ OpenHands SDK Agent │ │ │ +│ │ │ openhands │ │ │ │ │ │ - Terminal tool │ │ │ │ │ │ - File editor tool │ │ │ -│ │ │ - Task tracker tool │ │ │ │ │ └──────────────────────────────────────┘ │ │ │ └────────────────────────────────────────────┘ │ └──────────────────────────────────────────────────┘ @@ -158,6 +157,5 @@ The integration follows the Harbor agent adapter pattern: ## References - [SkillsBench](https://www.skillsbench.ai/) - The benchmark -- [Harbor](https://harborframework.com) - The evaluation harness -- [OpenHands SDK](https://github.com/OpenHands/software-agent-sdk) - The agent SDK -- [ATIF Specification](https://github.com/laude-institute/harbor/blob/main/docs/rfcs/0001-trajectory-format.md) - Trajectory format +- [benchflow](https://github.com/benchflow-ai/benchflow) - The evaluation harness +- [benchflow CLI reference](https://github.com/benchflow-ai/benchflow/blob/main/docs/cli-reference.md) - CLI documentation diff --git a/benchmarks/skillsbench/config.py b/benchmarks/skillsbench/config.py index 8b55a92b0..4ed541ab9 100644 --- a/benchmarks/skillsbench/config.py +++ b/benchmarks/skillsbench/config.py @@ -1,16 +1,13 @@ """SkillsBench configuration defaults.""" -# Default inference settings (only include values actually used by argparse) +# Default inference settings INFER_DEFAULTS = { "dataset": "benchflow/skillsbench", "output_dir": "./evaluation_outputs", "num_workers": 1, } -# Harbor configuration defaults -HARBOR_DEFAULTS = { - # Harbor executable - "harbor_executable": "harbor", - # Default agent name for openhands-sdk - "agent_name": "openhands-sdk", +# benchflow configuration defaults +BENCHFLOW_DEFAULTS = { + "agent_name": "openhands", } diff --git a/benchmarks/skillsbench/run_infer.py b/benchmarks/skillsbench/run_infer.py index a8afa7281..2e11a100a 100644 --- a/benchmarks/skillsbench/run_infer.py +++ b/benchmarks/skillsbench/run_infer.py @@ -1,24 +1,31 @@ -"""SkillsBench inference script using Harbor with openhands-sdk agent. 
+"""SkillsBench inference script using the benchflow SDK. -This script runs SkillsBench evaluation using Harbor as the harness -and openhands-sdk as the agent. Results are saved in a format compatible +This script runs SkillsBench evaluation using `benchflow job` as the harness +and `openhands` as the default agent. Results are saved in a format compatible with the standard evaluation pipeline. Usage: - uv run skillsbench-infer --dataset benchflow/skillsbench + uv run skillsbench-infer + + # Run specific tasks + uv run skillsbench-infer --select tasks.txt """ import argparse import json import os +import re +import shutil import subprocess import sys +import tempfile from datetime import datetime, timezone from pathlib import Path +import yaml from pydantic import SecretStr -from benchmarks.skillsbench.config import HARBOR_DEFAULTS, INFER_DEFAULTS +from benchmarks.skillsbench.config import BENCHFLOW_DEFAULTS, INFER_DEFAULTS from benchmarks.utils.evaluation_utils import construct_eval_output_dir from benchmarks.utils.report_costs import generate_cost_report from openhands.sdk import LLM, get_logger @@ -26,256 +33,469 @@ logger = get_logger(__name__) -# Output filename for results +# Matches benchflow 0.3.0 job directory names: YYYY-MM-DD__HH-MM-SS +_TIMESTAMP_RE = re.compile(r"^\d{4}-\d{2}-\d{2}__\d{2}-\d{2}-\d{2}$") + +# "Total cost: $0.0487" +_COST_RE = re.compile(r"Total cost:\s*\$([0-9]+(?:\.[0-9]+)?)") +# "Tokens: ↑ input 404.21K • ... • ↓ output 7.83K" +_TOKENS_RE = re.compile(r"↑ input\s+([\d.]+)([KMB]?)\b.*?↓ output\s+([\d.]+)([KMB]?)\b") + OUTPUT_FILENAME = "output.jsonl" +TASK_REPOS = { + "skillsbench": { + "repo": "https://github.com/benchflow-ai/skillsbench.git", + "subdir": "tasks", + } +} + +_DIRECT_PROVIDER_ENV_VARS: dict[str, tuple[tuple[str, ...], str | None]] = { + "anthropic": (("ANTHROPIC_API_KEY",), "ANTHROPIC_BASE_URL"), + "gemini": (("GEMINI_API_KEY", "GOOGLE_API_KEY"), "GEMINI_BASE_URL"), + "google": (("GEMINI_API_KEY", "GOOGLE_API_KEY"), "GEMINI_BASE_URL"), + "openai": (("OPENAI_API_KEY",), "OPENAI_BASE_URL"), +} + + +def _infer_direct_provider(model: str) -> str | None: + """Infer the provider prefix for direct model names. + + Examples: + - gemini/gemini-2.5-pro -> gemini + - anthropic/claude-sonnet-4-5 -> anthropic + - litellm_proxy/anthropic/... -> None (proxy config uses LLM_* vars) + """ + if not model or model.startswith("litellm_proxy/"): + return None + if "/" in model: + provider = model.split("/", 1)[0].lower() + if provider in _DIRECT_PROVIDER_ENV_VARS: + return provider + return None + + +def _build_benchflow_agent_env(llm: LLM) -> dict[str, str]: + """Build the sandbox environment for benchflow's openhands agent. + + Only LLM-specific variables are returned — these go INTO the sandbox + container via the ``agent_env`` YAML key. The calling process inherits + the host environment normally; dumping ``os.environ`` here would leak + the entire host env into every container. 
+ """ + env: dict[str, str] = {} + api_key: str | None = None + if llm.api_key: + api_key = ( + llm.api_key.get_secret_value() + if isinstance(llm.api_key, SecretStr) + else llm.api_key + ) + env["LLM_API_KEY"] = api_key + if llm.base_url: + env["LLM_BASE_URL"] = llm.base_url + + provider = _infer_direct_provider(llm.model) + if provider and api_key: + key_vars, base_url_var = _DIRECT_PROVIDER_ENV_VARS[provider] + for var_name in key_vars: + env[var_name] = api_key + if llm.base_url and base_url_var: + env[base_url_var] = llm.base_url + + return env + + +def check_benchflow_installed() -> bool: + """Check if benchflow CLI is installed and available. + + Tries ``bench`` first (current name), then falls back to the legacy + ``benchflow`` binary. + """ + for cmd in ("bench", "benchflow"): + try: + result = subprocess.run( + [cmd, "--help"], + capture_output=True, + text=True, + timeout=10, + ) + if result.returncode == 0: + return True + except (FileNotFoundError, subprocess.TimeoutExpired): + continue + return False + -def check_harbor_installed() -> bool: - """Check if harbor CLI is installed and available.""" - harbor_exe = HARBOR_DEFAULTS["harbor_executable"] +def _resolve_task_repo(dataset: str) -> tuple[str, dict[str, str]]: + """Map a benchflow dataset name to its task repository metadata.""" + dataset_name = dataset.split("@", 1)[0].split("/")[-1] try: - result = subprocess.run( - [harbor_exe, "--version"], + return dataset_name, TASK_REPOS[dataset_name] + except KeyError as exc: + raise ValueError( + f"Unsupported SkillsBench dataset: {dataset!r}. " + f"Known datasets: {sorted(TASK_REPOS)}" + ) from exc + + +def ensure_tasks( + dataset: str, + tasks_dir: Path, + task_ids: list[str] | None = None, +) -> None: + """Download tasks for a benchflow dataset into tasks_dir. + + BenchFlow 0.3.0 does not expose ``benchflow tasks pull``, so we clone the + benchmark task repository directly when the local tasks directory is empty. + + When *task_ids* is provided, a sparse checkout is used so only the + requested task subdirectories are downloaded — much faster than a full + clone for large repos. 
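+
+    Illustrative call (a sketch; the output path and task id are examples drawn
+    from this module's defaults and tests):
+
+        ensure_tasks(
+            "benchflow/skillsbench",
+            Path("./evaluation_outputs/tasks"),
+            task_ids=["benchflow/weighted-gdp-calc"],
+        )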
+ """ + if tasks_dir.exists() and any(tasks_dir.iterdir()): + logger.info(f"Tasks already present in {tasks_dir}, skipping download") + return + + _, repo_info = _resolve_task_repo(dataset) + tasks_dir.mkdir(parents=True, exist_ok=True) + clone_dir = tasks_dir.parent / "_clone" + if clone_dir.exists(): + shutil.rmtree(clone_dir, ignore_errors=True) + + subdir = repo_info.get("subdir", "") + + if task_ids: + # Sparse checkout: only download the specific task directories + short_names = [tid.split("/")[-1] for tid in task_ids] + + cmd_clone = [ + "git", + "clone", + "--no-checkout", + "--depth", + "1", + repo_info["repo"], + str(clone_dir), + ] + logger.info(f"Sparse clone: {' '.join(cmd_clone)}") + result = subprocess.run(cmd_clone, capture_output=True, text=True) + if result.returncode != 0: + raise RuntimeError(f"task download failed: {result.stderr}") + + # Init sparse-checkout and set the desired paths + subprocess.run( + ["git", "-C", str(clone_dir), "sparse-checkout", "init", "--cone"], capture_output=True, text=True, - timeout=10, + check=True, ) - return result.returncode == 0 - except (FileNotFoundError, subprocess.TimeoutExpired): - return False + sparse_paths = [f"{subdir}/{name}" if subdir else name for name in short_names] + subprocess.run( + ["git", "-C", str(clone_dir), "sparse-checkout", "set", *sparse_paths], + capture_output=True, + text=True, + check=True, + ) + subprocess.run( + ["git", "-C", str(clone_dir), "checkout"], + capture_output=True, + text=True, + check=True, + ) + else: + # Full shallow clone + cmd = ["git", "clone", "--depth", "1", repo_info["repo"], str(clone_dir)] + logger.info(f"Downloading tasks: {' '.join(cmd)}") + result = subprocess.run(cmd, capture_output=True, text=True) + if result.returncode != 0: + logger.error(f"Failed to clone tasks: {result.stderr}") + raise RuntimeError(f"task download failed: {result.stderr}") + + try: + source_dir = clone_dir / subdir if subdir else clone_dir + + for entry in source_dir.iterdir(): + target = tasks_dir / entry.name + if entry.is_dir(): + shutil.copytree(entry, target, dirs_exist_ok=True) + else: + shutil.copy2(entry, target) + finally: + shutil.rmtree(clone_dir, ignore_errors=True) + logger.info(f"Tasks downloaded to {tasks_dir}") -def run_harbor_evaluation( + +def run_benchflow_job( llm: LLM, - dataset: str, - output_dir: str, + tasks_dir: Path, + jobs_dir: Path, num_workers: int = 1, task_ids: list[str] | None = None, - n_limit: int | None = None, ) -> Path: - """Run harbor evaluation with openhands-sdk agent. + """Run benchflow job command. Args: llm: LLM configuration for the agent. - dataset: Harbor dataset name (e.g., benchflow/skillsbench). - output_dir: Directory to store output files. - num_workers: Number of parallel workers. - task_ids: Optional list of specific task IDs to run. - n_limit: Optional maximum number of dataset tasks to run. + tasks_dir: Path to directory containing task subdirectories. + jobs_dir: Directory for benchflow job output. + num_workers: Number of parallel workers (concurrency). + task_ids: Optional list of task IDs to filter (short names, not full paths). Returns: - Path to the harbor output directory. + Path to jobs_dir. """ - harbor_output_dir = Path(output_dir) / "harbor_output" - harbor_output_dir.mkdir(parents=True, exist_ok=True) - harbor_exe = HARBOR_DEFAULTS["harbor_executable"] - - # Build harbor command using harbor CLI flags. - # Use absolute path for --jobs-dir to avoid CWD-relative path issues. 
- cmd = [ - harbor_exe, - "run", - "-d", - dataset, - "-a", - HARBOR_DEFAULTS["agent_name"], - "-m", - llm.model, - "--jobs-dir", - str(harbor_output_dir.resolve()), - "--n-concurrent", - str(num_workers), - ] - - # Pass LLM credentials as agent environment variables - if llm.api_key: - api_key = ( - llm.api_key.get_secret_value() - if isinstance(llm.api_key, SecretStr) - else llm.api_key - ) - cmd.extend(["--ae", f"LLM_API_KEY={api_key}"]) - if llm.base_url: - cmd.extend(["--ae", f"LLM_BASE_URL={llm.base_url}"]) + jobs_dir.mkdir(parents=True, exist_ok=True) + + agent_env = _build_benchflow_agent_env(llm) + # Ubuntu 24.04 enforces PEP 668 and blocks bare `pip install` without + # --break-system-packages. benchflow's openhands install_cmd uses plain + # `pip install openhands`, which silently fails (exit 0) on Ubuntu 24.04, + # causing "Agent openhands install failed (rc=1)". Setting this env var + # makes pip skip the restriction without modifying the install_cmd. + agent_env.setdefault("PIP_BREAK_SYSTEM_PACKAGES", "1") + config = { + "tasks_dir": str(tasks_dir), + "jobs_dir": str(jobs_dir.resolve()), + "agent": BENCHFLOW_DEFAULTS["agent_name"], + "model": llm.model, + "environment": "docker", + "concurrency": num_workers, + # OpenHands is installed inside the sandbox as root by benchflow's + # registry install command. Running as the default "agent" user can + # lose access to that binary on some task images. + "sandbox_user": None, + "agent_env": agent_env, + } - # Add specific task names if provided - if task_ids: - for task_id in task_ids: - cmd.extend(["--include-task-name", task_id]) + with tempfile.NamedTemporaryFile( + mode="w", suffix=".yaml", prefix="benchflow-job-", delete=False + ) as tmp: + yaml.safe_dump(config, tmp, sort_keys=False) + config_path = tmp.name - if n_limit is not None: - cmd.extend(["--n-tasks", str(n_limit)]) + # Prefer `bench eval create` (current), fall back to legacy `benchflow job` + bench_bin = shutil.which("bench") or shutil.which("benchflow") or "bench" + if "benchflow" in bench_bin: + cmd = [bench_bin, "job", "--config", config_path] + else: + cmd = [bench_bin, "eval", "create", "-f", config_path] - logger.info(f"Running harbor command: {' '.join(cmd)}") - logger.info(f"Output directory: {harbor_output_dir}") + logger.info(f"Running: {' '.join(cmd)}") - # harbor's openhands-sdk agent reads LLM credentials from the host process - # environment (os.environ), not from --ae flags which go to the sandbox. - env = os.environ.copy() - if llm.api_key: - api_key = ( - llm.api_key.get_secret_value() - if isinstance(llm.api_key, SecretStr) - else llm.api_key - ) - env["LLM_API_KEY"] = api_key - if llm.base_url: - env["LLM_BASE_URL"] = llm.base_url + # Inject LLM vars into the host process env so benchflow's provider + # resolution can pick them up; the subprocess inherits normally (env=None). 
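+    # (agent_env holds e.g. LLM_API_KEY, LLM_BASE_URL and, for direct models,
+    # provider keys such as ANTHROPIC_API_KEY or GEMINI_API_KEY.)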
+ host_env = os.environ.copy() + host_env.update(agent_env) + result = subprocess.run(cmd, capture_output=True, text=True, env=host_env) + Path(config_path).unlink(missing_ok=True) - try: - result = subprocess.run( - cmd, - capture_output=True, - text=True, - env=env, - ) + if result.returncode != 0: + logger.error(f"benchflow job failed (code {result.returncode})") + logger.error(f"stdout: {result.stdout}") + logger.error(f"stderr: {result.stderr}") + raise RuntimeError(f"benchflow job failed: {result.stderr}") - if result.returncode != 0: - logger.error(f"Harbor command failed with code {result.returncode}") - logger.error(f"stdout: {result.stdout}") - logger.error(f"stderr: {result.stderr}") - raise RuntimeError(f"Harbor evaluation failed: {result.stderr}") + logger.info("benchflow job completed") + logger.info(f"stdout: {result.stdout}") - logger.info("Harbor evaluation completed successfully") - logger.info(f"stdout: {result.stdout}") + return jobs_dir - except FileNotFoundError: - raise RuntimeError( - "Harbor CLI not found. Please install harbor: pip install harbor" - ) - return harbor_output_dir - - -def _find_job_dir(harbor_output_dir: Path) -> Path: - """Find the harbor job directory (timestamp-named) inside the output dir.""" - # Harbor creates a timestamp-named subdirectory (e.g., 2026-03-07__16-08-47) - # containing result.json and trial subdirectories - candidates = [ - d - for d in harbor_output_dir.iterdir() - if d.is_dir() and (d / "result.json").exists() - ] - if not candidates: - raise RuntimeError( - f"No harbor job directory found in {harbor_output_dir}. " - f"Expected a timestamp-named directory containing result.json." +def _extract_trial_metrics(trial_dir: Path) -> dict: + """Extract token/cost metrics from benchflow 0.3.0 trial output files. + + benchflow 0.3.0 does not write cost/token fields to result.json. + Instead, metrics are read from: + 1. agent/trajectory.json → final_metrics (harbor-format agent) + 2. agent/openhands.txt → "Total cost:" and "Tokens:" lines (ACP agent) + """ + # 1. Harbor-format trajectory.json written by openhands-sdk agent + traj_file = trial_dir / "agent" / "trajectory.json" + if traj_file.exists(): + try: + with open(traj_file) as f: + traj = json.load(f) + fm = traj.get("final_metrics") or {} + if fm: + return { + "total_prompt_tokens": int(fm.get("total_prompt_tokens") or 0), + "total_completion_tokens": int( + fm.get("total_completion_tokens") or 0 + ), + "total_cost_usd": float(fm.get("total_cost_usd") or 0.0), + } + except (json.JSONDecodeError, OSError): + pass + + # 2. 
ACP agent log written by openhands acp (benchflow 0.3.0 native) + def _parse_token_count(value: str, suffix: str) -> int: + n = float(value) + return int( + n * {"K": 1_000, "M": 1_000_000, "B": 1_000_000_000}.get(suffix.upper(), 1) ) - # Use the most recent job directory if multiple exist - return sorted(candidates)[-1] + for log_name in ("openhands.txt", "openhands_sdk.txt"): + log_file = trial_dir / "agent" / log_name + if not log_file.exists(): + continue + try: + text = log_file.read_text(errors="replace") + cost_usd = 0.0 + prompt_tokens = 0 + completion_tokens = 0 + m = _COST_RE.search(text) + if m: + cost_usd = float(m.group(1)) + m = _TOKENS_RE.search(text) + if m: + prompt_tokens = _parse_token_count(m.group(1), m.group(2)) + completion_tokens = _parse_token_count(m.group(3), m.group(4)) + if cost_usd or prompt_tokens: + return { + "total_prompt_tokens": prompt_tokens, + "total_completion_tokens": completion_tokens, + "total_cost_usd": cost_usd, + } + except OSError: + pass -def convert_harbor_to_eval_output( - harbor_output_dir: Path, + return { + "total_prompt_tokens": 0, + "total_completion_tokens": 0, + "total_cost_usd": 0.0, + } + + +def convert_benchflow_to_eval_output( + jobs_dir: Path, eval_output_path: Path, + task_ids: list[str] | None = None, ) -> None: - """Convert harbor output to evaluation output format. + """Convert benchflow job output to standard evaluation output format. - Harbor stores trial results in a job directory structured as: - harbor_output/TIMESTAMP/TRIAL_NAME/result.json + benchflow 0.3.0 stores trial results as: + jobs_dir/YYYY-MM-DD__HH-MM-SS/TASK_NAME__UUID8/result.json - Each trial's result.json contains task_name, verifier_result, agent_result, - timing info, and exception details. + Each result.json contains task_name, rewards, error, verifier_error, and timing. Args: - harbor_output_dir: Path to harbor output directory. - eval_output_path: Path to write the converted output.jsonl. + jobs_dir: Path to benchflow jobs directory. + eval_output_path: Path to write output.jsonl. + task_ids: Optional filter for specific task IDs (short names). """ - logger.info(f"Converting harbor output from {harbor_output_dir}") - - job_dir = _find_job_dir(harbor_output_dir) - logger.info(f"Using harbor job directory: {job_dir}") + logger.info(f"Converting benchflow output from {jobs_dir}") + + # benchflow 0.3.0 writes: + # jobs/summary.json + # jobs/TIMESTAMP/TRIAL_NAME/result.json + # while older local outputs may place results directly under jobs/. + job_dirs = [d for d in jobs_dir.iterdir() if d.is_dir()] + timestamp_job_dirs = [d for d in job_dirs if _TIMESTAMP_RE.match(d.name)] + + if timestamp_job_dirs: + selected_job_dir = sorted(timestamp_job_dirs)[-1] + logger.info(f"Using benchflow job directory: {selected_job_dir}") + task_dirs = [d for d in selected_job_dir.iterdir() if d.is_dir()] + else: + task_dirs = job_dirs - # Find trial result files (each trial dir has a result.json) - result_files = list(job_dir.glob("*/result.json")) - # Exclude the job-level result.json - result_files = [f for f in result_files if f.parent != job_dir] + if not task_dirs: + raise RuntimeError(f"No task directories found in {jobs_dir}") - if not result_files: - raise RuntimeError( - f"No trial result files found in {job_dir}. " - f"Expected result.json files in trial subdirectories." 
- ) + if task_ids: + short_ids = {tid.split("/")[-1] for tid in task_ids} + task_dirs = [d for d in task_dirs if d.name.split("__")[0] in short_ids] - logger.info(f"Found {len(result_files)} trial results in {job_dir}") + logger.info(f"Processing {len(task_dirs)} task directories") results: list[dict] = [] errors: list[dict] = [] - for result_file in result_files: + for task_dir in sorted(task_dirs): + # Find the trial result — benchflow writes trial-0/result.json + trial_results = list(task_dir.glob("trial-*/result.json")) + if not trial_results: + # Fall back to a direct result.json + direct = task_dir / "result.json" + if direct.exists(): + trial_results = [direct] + + if not trial_results: + logger.warning(f"No result.json found in {task_dir}, skipping") + errors.append( + { + "instance_id": f"benchflow/{task_dir.name}", + "error": "No result.json found", + "test_result": {}, + } + ) + continue + + # Use the last trial (highest retry index) + result_file = sorted(trial_results)[-1] + try: with open(result_file) as f: trial = json.load(f) - instance_id = trial.get("task_name", result_file.parent.name) + task_basename = task_dir.name.split("__")[0] + task_name = trial.get("task_name") or f"benchflow/{task_basename}" + # Normalise to benchflow/ form + if "/" not in task_name: + task_name = f"benchflow/{task_name}" - # Check for exceptions - if trial.get("exception_info"): + error = trial.get("error") + verifier_error = trial.get("verifier_error") + + if error or verifier_error: errors.append( { - "instance_id": instance_id, - "error": str(trial["exception_info"]), + "instance_id": task_name, + "error": str(error or verifier_error), "test_result": {}, } ) continue - # Extract verifier results - verifier_result = trial.get("verifier_result", {}) - rewards = verifier_result.get("rewards", {}) - passed = rewards.get("reward", 0.0) > 0 - - # Extract agent metrics - agent_result = trial.get("agent_result", {}) + rewards = trial.get("rewards") or {} + passed = bool(rewards.get("reward", 0.0)) eval_entry = { - "instance_id": instance_id, + "instance_id": task_name, "test_result": { - "trial_name": trial.get("trial_name"), - "trial_uri": trial.get("trial_uri"), "rewards": rewards, "passed": passed, }, "instruction": "", "error": None, "history": [], - "metrics": { - "total_prompt_tokens": agent_result.get("n_input_tokens") or 0, - "total_completion_tokens": ( - agent_result.get("n_output_tokens") or 0 - ), - "total_cost_usd": agent_result.get("cost_usd") or 0.0, - }, + "metrics": _extract_trial_metrics(result_file.parent), } results.append(eval_entry) - logger.info( - f"Processed trial {instance_id}: reward={rewards.get('reward', 'N/A')}" - ) + logger.info(f"Processed {task_name}: reward={rewards.get('reward', 'N/A')}") except (json.JSONDecodeError, OSError) as e: - logger.error(f"Failed to process result file {result_file}: {e}") + logger.error(f"Failed to read {result_file}: {e}") errors.append( { - "instance_id": result_file.parent.name, + "instance_id": f"benchflow/{task_dir.name}", "error": str(e), "test_result": {}, } ) if not results and not errors: - raise RuntimeError(f"No trials processed from {harbor_output_dir}") + raise RuntimeError(f"No trials processed from {jobs_dir}") if not results: logger.warning( - f"All {len(errors)} trials failed in {harbor_output_dir}; " - "writing error entries for downstream reporting" + f"All {len(errors)} trials failed; writing error entries for reporting" ) - # Write results to output.jsonl with open(eval_output_path, "w") as f: - for entry in results: 
- f.write(json.dumps(entry) + "\n") - for entry in errors: + for entry in results + errors: f.write(json.dumps(entry) + "\n") logger.info( @@ -298,18 +518,18 @@ def load_task_ids_from_file(filepath: str) -> list[str]: def main() -> None: """Main entry point for skillsbench inference.""" parser = argparse.ArgumentParser( - description="Run SkillsBench evaluation with openhands-sdk via Harbor", + description="Run SkillsBench evaluation with benchflow and openhands", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" Examples: # Run full skillsbench evaluation uv run skillsbench-infer .llm_config/claude.json - # Run specific tasks + # Run specific tasks from a file uv run skillsbench-infer .llm_config/claude.json --select tasks.txt - # Run with custom dataset version - uv run skillsbench-infer .llm_config/claude.json --dataset benchflow/skillsbench@1.0 + # Run with more concurrency + uv run skillsbench-infer .llm_config/claude.json --num-workers 4 """, ) @@ -322,7 +542,7 @@ def main() -> None: "--dataset", type=str, default=INFER_DEFAULTS["dataset"], - help="Harbor dataset name (e.g., benchflow/skillsbench)", + help="benchflow dataset name (e.g., benchflow/skillsbench)", ) parser.add_argument( "--output-dir", @@ -334,12 +554,12 @@ def main() -> None: "--num-workers", type=int, default=INFER_DEFAULTS["num_workers"], - help="Number of parallel workers", + help="Number of parallel workers (concurrency)", ) parser.add_argument( "--n-limit", type=int, - help="Maximum number of dataset tasks to run after Harbor filtering", + help="Maximum number of tasks to run", ) parser.add_argument( "--select", @@ -358,14 +578,13 @@ def main() -> None: help="Optional note for the evaluation run", ) parser.add_argument( - "--skip-harbor", + "--skip-run", action="store_true", - help="Skip running harbor and only convert existing results", + help="Skip running benchflow and only convert existing results", ) args = parser.parse_args() - # Validate LLM config if not os.path.isfile(args.llm_config_path): logger.error(f"LLM config file does not exist: {args.llm_config_path}") sys.exit(1) @@ -375,87 +594,103 @@ def main() -> None: llm = LLM.model_validate_json(llm_config) logger.info(f"Using LLM: {llm.model}") - # Check harbor installation - if not args.skip_harbor and not check_harbor_installed(): + if not args.skip_run and not check_benchflow_installed(): logger.error( - "Harbor CLI is not installed. Please install it:\n" - " pip install harbor\n" + "benchflow CLI is not installed. 
Please install it:\n" + " uv tool install benchflow==0.3.0\n" " # or\n" - " uv pip install harbor" + " pip install benchflow==0.3.0\n" + " # or\n" + " uv pip install benchflow==0.3.0" ) sys.exit(1) - # Construct output directory dataset_description = args.dataset.replace("/", "__").replace("@", "-") structured_output_dir = construct_eval_output_dir( base_dir=args.output_dir, dataset_name=dataset_description, model_name=llm.model, - max_iterations=100, # Not directly used but required for path construction + max_iterations=100, eval_note=args.note, ) logger.info(f"Output directory: {structured_output_dir}") os.makedirs(structured_output_dir, exist_ok=True) - # Save metadata metadata = { "llm": llm.model_dump_json(), "dataset": args.dataset, "timestamp": datetime.now(timezone.utc).isoformat(), - "harbor_agent": HARBOR_DEFAULTS["agent_name"], + "benchflow_agent": BENCHFLOW_DEFAULTS["agent_name"], "note": args.note, } metadata_path = Path(structured_output_dir) / "metadata.json" with open(metadata_path, "w") as f: json.dump(metadata, f, indent=2) - # Collect task IDs if specified task_ids: list[str] | None = None if args.select: - loaded_ids = load_task_ids_from_file(args.select) - task_ids = loaded_ids - logger.info(f"Loaded {len(loaded_ids)} task IDs from {args.select}") + task_ids = load_task_ids_from_file(args.select) + logger.info(f"Loaded {len(task_ids)} task IDs from {args.select}") elif args.task_id: task_ids = list(args.task_id) logger.info(f"Running {len(task_ids)} specified task IDs") + tasks_dir = Path(structured_output_dir) / "tasks" + jobs_dir = Path(structured_output_dir) / "jobs" output_path = Path(structured_output_dir) / OUTPUT_FILENAME - if not args.skip_harbor: - # Run harbor evaluation + if not args.skip_run: try: - harbor_output_dir = run_harbor_evaluation( + ensure_tasks(args.dataset, tasks_dir, task_ids=task_ids) + + # Apply n_limit by slicing available task directories + effective_task_dirs = tasks_dir + if args.n_limit is not None or task_ids is not None: + all_dirs = sorted(d for d in tasks_dir.iterdir() if d.is_dir()) + if task_ids: + short_ids = {tid.split("/")[-1] for tid in task_ids} + all_dirs = [d for d in all_dirs if d.name in short_ids] + if args.n_limit is not None: + all_dirs = all_dirs[: args.n_limit] + + # Write a filtered tasks dir symlink tree + filtered_tasks_dir = Path(structured_output_dir) / "tasks_filtered" + filtered_tasks_dir.mkdir(exist_ok=True) + for d in all_dirs: + link = filtered_tasks_dir / d.name + if not link.exists(): + link.symlink_to(d.resolve()) + effective_task_dirs = filtered_tasks_dir + + run_benchflow_job( llm=llm, - dataset=args.dataset, - output_dir=structured_output_dir, + tasks_dir=effective_task_dirs, + jobs_dir=jobs_dir, num_workers=args.num_workers, task_ids=task_ids, - n_limit=args.n_limit, ) - # Convert harbor output to standard format - convert_harbor_to_eval_output( - harbor_output_dir=harbor_output_dir, + convert_benchflow_to_eval_output( + jobs_dir=jobs_dir, eval_output_path=output_path, + task_ids=task_ids, ) except Exception as e: logger.error(f"Evaluation failed: {e}") sys.exit(1) else: - # Skip harbor, just convert existing results - harbor_output_dir = Path(structured_output_dir) / "harbor_output" - if harbor_output_dir.exists(): - convert_harbor_to_eval_output( - harbor_output_dir=harbor_output_dir, + if jobs_dir.exists(): + convert_benchflow_to_eval_output( + jobs_dir=jobs_dir, eval_output_path=output_path, + task_ids=task_ids, ) else: - logger.error(f"No harbor output found at {harbor_output_dir}") + 
logger.error(f"No jobs output found at {jobs_dir}") sys.exit(1) - # Generate cost report if output_path.exists(): generate_cost_report(str(output_path)) diff --git a/tests/test_skillsbench_eval_infer.py b/tests/test_skillsbench_eval_infer.py index 56d54f27a..1334da297 100644 --- a/tests/test_skillsbench_eval_infer.py +++ b/tests/test_skillsbench_eval_infer.py @@ -39,23 +39,6 @@ def test_resolved_instance(self, tmp_path: Path) -> None: assert result["unresolved_instances"] == 0 assert "benchflow/weighted-gdp-calc" in result["resolved_ids"] - def test_unresolved_instance(self, tmp_path: Path) -> None: - """Test processing an unresolved (passed=False) instance.""" - input_file = tmp_path / "unresolved.jsonl" - output_file = tmp_path / "unresolved.report.json" - - entry = { - "instance_id": "benchflow/task-1", - "test_result": {"passed": False, "rewards": {"reward": 0.0}}, - "error": None, - } - input_file.write_text(json.dumps(entry) + "\n") - - result = process_skillsbench_results(str(input_file), str(output_file)) - - assert result["resolved_instances"] == 0 - assert result["unresolved_instances"] == 1 - def test_instance_with_error(self, tmp_path: Path) -> None: """Test processing an instance that errored.""" input_file = tmp_path / "error.jsonl" diff --git a/tests/test_skillsbench_run_infer.py b/tests/test_skillsbench_run_infer.py index 5f8452cb3..784b4d1cc 100644 --- a/tests/test_skillsbench_run_infer.py +++ b/tests/test_skillsbench_run_infer.py @@ -4,82 +4,102 @@ from pathlib import Path import pytest +import yaml -from benchmarks.skillsbench.config import INFER_DEFAULTS +from benchmarks.skillsbench.config import BENCHFLOW_DEFAULTS, INFER_DEFAULTS from benchmarks.skillsbench.run_infer import ( - convert_harbor_to_eval_output, - run_harbor_evaluation, + _build_benchflow_agent_env, + convert_benchflow_to_eval_output, + run_benchflow_job, ) from openhands.sdk import LLM -class TestRunHarborEvaluation: - """Tests for building Harbor invocation arguments.""" +class TestRunBenchflowJob: + """Tests for building benchflow job invocation arguments.""" - def test_default_dataset_matches_harbor_registry(self) -> None: - """Test that the default dataset name matches Harbor's published registry.""" + def test_default_dataset_matches_benchflow_registry(self) -> None: + """Test that the default dataset name matches benchflow's published registry.""" assert INFER_DEFAULTS["dataset"] == "benchflow/skillsbench" - def test_run_harbor_evaluation_passes_filters_and_limits( + def test_default_agent_is_openhands(self) -> None: + """Test that the default agent is openhands.""" + assert BENCHFLOW_DEFAULTS["agent_name"] == "openhands" + + def test_run_benchflow_job_passes_model_and_concurrency( self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch ) -> None: - """Test Harbor command includes task filters and n-limit.""" - captured: dict[str, list[str]] = {} + """Test benchflow job command writes the expected YAML config.""" + captured_cmd: list[str] = [] + captured_env: dict[str, str] = {} + captured_config: dict = {} + + # Force legacy benchflow binary path so the command format is deterministic + monkeypatch.setattr( + "benchmarks.skillsbench.run_infer.shutil.which", + lambda name: "/usr/local/bin/benchflow" if name == "benchflow" else None, + ) def fake_run(cmd: list[str], capture_output: bool, text: bool, env: dict): - captured["cmd"] = cmd + captured_cmd[:] = cmd + captured_env.clear() + captured_env.update(env) + with open(cmd[3]) as f: + captured_config.update(yaml.safe_load(f)) return type( "Completed", 
(), - {"returncode": 0, "stdout": "ok", "stderr": ""}, + {"returncode": 0, "stdout": "Score: 1/1 (100%)", "stderr": ""}, )() monkeypatch.setattr("benchmarks.skillsbench.run_infer.subprocess.run", fake_run) - harbor_output_dir = run_harbor_evaluation( + tasks_dir = tmp_path / "tasks" + tasks_dir.mkdir() + jobs_dir = tmp_path / "jobs" + + run_benchflow_job( llm=LLM( - model="litellm_proxy/test-model", + model="anthropic/claude-sonnet-4-5", api_key="test-key", base_url="https://proxy.example.com", ), - dataset=INFER_DEFAULTS["dataset"], - output_dir=str(tmp_path), - num_workers=2, - task_ids=["benchflow/task-a", "benchflow/task-b"], - n_limit=3, - ) - - expected_output_dir = tmp_path / "harbor_output" - assert harbor_output_dir == expected_output_dir - - cmd = captured["cmd"] - assert cmd[:8] == [ - "harbor", - "run", - "-d", - "benchflow/skillsbench", - "-a", - "openhands-sdk", - "-m", - "litellm_proxy/test-model", - ] - assert "--jobs-dir" in cmd - assert str(expected_output_dir.resolve()) in cmd - assert cmd.count("--include-task-name") == 2 - assert "benchflow/task-a" in cmd - assert "benchflow/task-b" in cmd - assert cmd[cmd.index("--n-concurrent") + 1] == "2" - assert cmd[cmd.index("--n-tasks") + 1] == "3" - - def test_llm_credentials_passed_via_env( + tasks_dir=tasks_dir, + jobs_dir=jobs_dir, + num_workers=4, + ) + + cmd = captured_cmd + assert cmd[0] == "/usr/local/bin/benchflow" + assert cmd[1] == "job" + assert cmd[2] == "--config" + assert captured_config["tasks_dir"] == str(tasks_dir) + assert captured_config["jobs_dir"] == str(jobs_dir.resolve()) + assert captured_config["agent"] == "openhands" + assert captured_config["model"] == "anthropic/claude-sonnet-4-5" + assert captured_config["concurrency"] == 4 + assert captured_config["sandbox_user"] is None + + def test_llm_credentials_passed_via_subprocess_env( self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch ) -> None: - """Test that LLM credentials are passed via subprocess env, not --ae flags.""" - captured: dict = {} + """Test that LLM credentials are passed via subprocess env and YAML.""" + captured_cmd: list[str] = [] + captured_env: dict[str, str] = {} + captured_config: dict = {} + + # Force legacy benchflow binary path so the command format is deterministic + monkeypatch.setattr( + "benchmarks.skillsbench.run_infer.shutil.which", + lambda name: "/usr/local/bin/benchflow" if name == "benchflow" else None, + ) def fake_run(cmd: list[str], capture_output: bool, text: bool, env: dict): - captured["cmd"] = cmd - captured["env"] = env + captured_cmd[:] = cmd + captured_env.clear() + captured_env.update(env) + with open(cmd[3]) as f: + captured_config.update(yaml.safe_load(f)) return type( "Completed", (), @@ -88,60 +108,127 @@ def fake_run(cmd: list[str], capture_output: bool, text: bool, env: dict): monkeypatch.setattr("benchmarks.skillsbench.run_infer.subprocess.run", fake_run) - run_harbor_evaluation( + tasks_dir = tmp_path / "tasks" + tasks_dir.mkdir() + + run_benchflow_job( llm=LLM( model="test-model", api_key="my-secret-key", base_url="https://my-proxy.example.com", ), - dataset=INFER_DEFAULTS["dataset"], - output_dir=str(tmp_path), + tasks_dir=tasks_dir, + jobs_dir=tmp_path / "jobs", + ) + + # Credentials in subprocess env + assert captured_env["LLM_API_KEY"] == "my-secret-key" + assert captured_env["LLM_BASE_URL"] == "https://my-proxy.example.com" + assert "--ae" not in captured_cmd + assert captured_config["agent_env"]["LLM_API_KEY"] == "my-secret-key" + assert ( + captured_config["agent_env"]["LLM_BASE_URL"] + == 
"https://my-proxy.example.com" ) - assert captured["env"]["LLM_API_KEY"] == "my-secret-key" - assert captured["env"]["LLM_BASE_URL"] == "https://my-proxy.example.com" + def test_direct_gemini_model_sets_provider_env_vars(self) -> None: + """Direct provider models need provider-specific env vars.""" + env = _build_benchflow_agent_env( + LLM( + model="gemini/gemini-3.1-flash-lite-preview", + api_key="gemini-test-key", + ) + ) + assert env["LLM_API_KEY"] == "gemini-test-key" + assert env["GEMINI_API_KEY"] == "gemini-test-key" + assert env["GOOGLE_API_KEY"] == "gemini-test-key" -class TestConvertHarborToEvalOutput: - """Tests for convert_harbor_to_eval_output function.""" + def test_proxy_model_does_not_set_provider_env_vars(self) -> None: + """LiteLLM proxy configs should keep using generic LLM_* vars only.""" + env = _build_benchflow_agent_env( + LLM( + model="litellm_proxy/anthropic/claude-sonnet-4-20250514", + api_key="proxy-key", + base_url="https://proxy.example.com", + ) + ) - def _create_harbor_structure( - self, tmp_path: Path, trials: list[tuple[str, dict]] - ) -> Path: - """Create a mock Harbor output structure.""" - harbor_dir = tmp_path / "harbor_output" - job_dir = harbor_dir / "2026-01-01__00-00-00" - job_dir.mkdir(parents=True) - (job_dir / "result.json").write_text(json.dumps({"id": "test-job"})) + assert env["LLM_API_KEY"] == "proxy-key" + assert env["LLM_BASE_URL"] == "https://proxy.example.com" + assert "ANTHROPIC_API_KEY" not in env + assert "ANTHROPIC_BASE_URL" not in env - for trial_name, trial_result in trials: - trial_dir = job_dir / trial_name - trial_dir.mkdir() - (trial_dir / "result.json").write_text(json.dumps(trial_result)) - return harbor_dir +class TestConvertBenchflowToEvalOutput: + """Tests for convert_benchflow_to_eval_output function.""" + + def _create_benchflow_structure( + self, tmp_path: Path, tasks: list[tuple[str, dict]] + ) -> Path: + """Create a mock benchflow jobs directory structure. + + benchflow writes: jobs_dir/TASK_NAME/trial-0/result.json + """ + jobs_dir = tmp_path / "jobs" + for task_name, result in tasks: + trial_dir = jobs_dir / task_name / "trial-0" + trial_dir.mkdir(parents=True) + (trial_dir / "result.json").write_text(json.dumps(result)) + return jobs_dir + + def _create_benchflow_timestamped_job( + self, tmp_path: Path, tasks: list[tuple[str, dict]] + ) -> Path: + """Create a mock benchflow 0.3.0 jobs directory structure. + + benchflow writes: jobs/TIMESTAMP/TASK_NAME__RUNID/result.json + """ + jobs_dir = tmp_path / "jobs" + job_dir = jobs_dir / "2026-04-21__23-12-35" + job_dir.mkdir(parents=True) + (jobs_dir / "summary.json").write_text(json.dumps({"total": len(tasks)})) + for task_name, result in tasks: + trial_dir = job_dir / f"{task_name}__abc123" + trial_dir.mkdir(parents=True) + (trial_dir / "result.json").write_text(json.dumps(result)) + return jobs_dir def test_successful_trial_parsing(self, tmp_path: Path) -> None: - """Test successful parsing of harbor trial result.""" + """Test successful parsing of a benchflow trial result. + + benchflow 0.3.0 does not write cost/token fields to result.json. + Metrics are read from agent/trajectory.json (harbor-format agent) + or parsed from agent/openhands.txt (ACP agent stdout). 
+ """ trial_result = { "task_name": "benchflow/weighted-gdp-calc", - "trial_name": "weighted-gdp-calc__abc123", - "trial_uri": "file:///path/to/trial", - "agent_result": { - "n_input_tokens": 1000, - "n_output_tokens": 200, - "cost_usd": 0.05, - }, - "verifier_result": {"rewards": {"reward": 1.0}}, - "exception_info": None, + "rewards": {"reward": 1.0}, + "error": None, } - harbor_dir = self._create_harbor_structure( - tmp_path, [("weighted-gdp-calc__abc123", trial_result)] + jobs_dir = self._create_benchflow_structure( + tmp_path, [("weighted-gdp-calc", trial_result)] + ) + # Write agent/trajectory.json with final_metrics (harbor-format agent output). + # agent/ sits next to result.json, inside the trial-0 subdirectory. + trial_dir = jobs_dir / "weighted-gdp-calc" / "trial-0" + agent_dir = trial_dir / "agent" + agent_dir.mkdir(parents=True, exist_ok=True) + (agent_dir / "trajectory.json").write_text( + json.dumps( + { + "final_metrics": { + "total_prompt_tokens": 1000, + "total_completion_tokens": 200, + "total_cost_usd": 0.05, + } + } + ) ) output_file = tmp_path / "output.jsonl" - convert_harbor_to_eval_output(harbor_dir, output_file) + convert_benchflow_to_eval_output(jobs_dir, output_file) assert output_file.exists() with open(output_file) as f: @@ -151,26 +238,52 @@ def test_successful_trial_parsing(self, tmp_path: Path) -> None: assert entries[0]["instance_id"] == "benchflow/weighted-gdp-calc" assert entries[0]["test_result"]["passed"] is True assert entries[0]["metrics"]["total_cost_usd"] == 0.05 + assert entries[0]["metrics"]["total_prompt_tokens"] == 1000 + assert entries[0]["metrics"]["total_completion_tokens"] == 200 + + def test_metrics_from_acp_agent_log(self, tmp_path: Path) -> None: + """Test that metrics are extracted from agent/openhands.txt (ACP agent).""" + trial_result = { + "task_name": "benchflow/acp-task", + "rewards": {"reward": 1.0}, + "error": None, + } + jobs_dir = self._create_benchflow_timestamped_job( + tmp_path, [("acp-task", trial_result)] + ) + # Write agent/openhands.txt simulating openhands ACP stdout + trial_dir = jobs_dir / "2026-04-21__23-12-35" / "acp-task__abc123" + agent_dir = trial_dir / "agent" + agent_dir.mkdir(parents=True, exist_ok=True) + (agent_dir / "openhands.txt").write_text( + "OpenHands SDK v1.16.0\n" + "Tokens: ↑ input 404.21K • cache hit 70.47% • reasoning 579 • ↓ output 7.83K • $0.0487\n" + "Total cost: $0.0487\n" + ) + output_file = tmp_path / "output.jsonl" + convert_benchflow_to_eval_output(jobs_dir, output_file) + + with open(output_file) as f: + entries = [json.loads(line) for line in f] + + assert len(entries) == 1 + assert entries[0]["metrics"]["total_cost_usd"] == pytest.approx(0.0487) + assert entries[0]["metrics"]["total_prompt_tokens"] == 404210 + assert entries[0]["metrics"]["total_completion_tokens"] == 7830 def test_failed_trial(self, tmp_path: Path) -> None: """Test parsing of a trial with reward 0.""" trial_result = { "task_name": "benchflow/task-1", - "trial_name": "task-1__xyz", - "agent_result": { - "n_input_tokens": None, - "n_output_tokens": None, - "cost_usd": None, - }, - "verifier_result": {"rewards": {"reward": 0.0}}, - "exception_info": None, + "rewards": {"reward": 0.0}, + "error": None, } - harbor_dir = self._create_harbor_structure( - tmp_path, [("task-1__xyz", trial_result)] + jobs_dir = self._create_benchflow_structure( + tmp_path, [("task-1", trial_result)] ) output_file = tmp_path / "output.jsonl" - convert_harbor_to_eval_output(harbor_dir, output_file) + convert_benchflow_to_eval_output(jobs_dir, 
output_file) with open(output_file) as f: entries = [json.loads(line) for line in f] @@ -178,21 +291,19 @@ def test_failed_trial(self, tmp_path: Path) -> None: assert entries[0]["test_result"]["passed"] is False assert entries[0]["metrics"]["total_cost_usd"] == 0.0 - def test_trial_with_exception(self, tmp_path: Path) -> None: - """Test that exception trials are written as error entries.""" + def test_trial_with_error(self, tmp_path: Path) -> None: + """Test that errored trials are written as error entries.""" trial_result = { "task_name": "benchflow/error-task", - "trial_name": "error-task__err", - "agent_result": {}, - "verifier_result": {}, - "exception_info": {"type": "ValueError", "message": "LLM_API_KEY not set"}, + "rewards": {}, + "error": "LLM_API_KEY not set", } - harbor_dir = self._create_harbor_structure( - tmp_path, [("error-task__err", trial_result)] + jobs_dir = self._create_benchflow_structure( + tmp_path, [("error-task", trial_result)] ) output_file = tmp_path / "output.jsonl" - convert_harbor_to_eval_output(harbor_dir, output_file) + convert_benchflow_to_eval_output(jobs_dir, output_file) with open(output_file) as f: entries = [json.loads(line) for line in f] @@ -202,20 +313,121 @@ def test_trial_with_exception(self, tmp_path: Path) -> None: assert entries[0]["error"] is not None assert entries[0]["test_result"] == {} - def test_missing_job_directory(self, tmp_path: Path) -> None: - """Test handling when no job directory exists.""" - harbor_dir = tmp_path / "harbor_output" - harbor_dir.mkdir() + def test_missing_jobs_directory(self, tmp_path: Path) -> None: + """Test handling when jobs directory is empty.""" + jobs_dir = tmp_path / "jobs" + jobs_dir.mkdir() + + with pytest.raises(RuntimeError, match="No task directories found"): + convert_benchflow_to_eval_output(jobs_dir, tmp_path / "output.jsonl") + + def test_task_id_filtering(self, tmp_path: Path) -> None: + """Test that only specified task IDs are converted.""" + trials = [ + ( + "task-a", + { + "task_name": "benchflow/task-a", + "rewards": {"reward": 1.0}, + "error": None, + }, + ), + ( + "task-b", + { + "task_name": "benchflow/task-b", + "rewards": {"reward": 0.0}, + "error": None, + }, + ), + ] + jobs_dir = self._create_benchflow_structure(tmp_path, trials) + output_file = tmp_path / "output.jsonl" - with pytest.raises(RuntimeError, match="No harbor job directory found"): - convert_harbor_to_eval_output(harbor_dir, tmp_path / "output.jsonl") + convert_benchflow_to_eval_output( + jobs_dir, output_file, task_ids=["benchflow/task-a"] + ) - def test_empty_job_directory(self, tmp_path: Path) -> None: - """Test handling of harbor job dir with no trial subdirs.""" - harbor_dir = tmp_path / "harbor_output" - job_dir = harbor_dir / "2026-01-01__00-00-00" - job_dir.mkdir(parents=True) - (job_dir / "result.json").write_text(json.dumps({"id": "test"})) + with open(output_file) as f: + entries = [json.loads(line) for line in f] + + assert len(entries) == 1 + assert entries[0]["instance_id"] == "benchflow/task-a" + + def test_task_name_normalised_to_benchflow_prefix(self, tmp_path: Path) -> None: + """Test that task names without prefix get benchflow/ prepended.""" + trial_result = { + "task_name": "weighted-gdp-calc", # no benchflow/ prefix + "rewards": {"reward": 1.0}, + "error": None, + } + jobs_dir = self._create_benchflow_structure( + tmp_path, [("weighted-gdp-calc", trial_result)] + ) + output_file = tmp_path / "output.jsonl" + convert_benchflow_to_eval_output(jobs_dir, output_file) + + with open(output_file) as f: + 
entries = [json.loads(line) for line in f] + + assert entries[0]["instance_id"] == "benchflow/weighted-gdp-calc" + + def test_timestamped_job_directory_is_processed(self, tmp_path: Path) -> None: + """Test benchflow 0.3.0 timestamped jobs directory layout.""" + trial_result = { + "task_name": "weighted-gdp-calc", + "rewards": {"reward": 1.0}, + "error": None, + "n_input_tokens": 42, + "n_output_tokens": 7, + "cost_usd": 0.01, + } + + jobs_dir = self._create_benchflow_timestamped_job( + tmp_path, [("weighted-gdp-calc", trial_result)] + ) + output_file = tmp_path / "output.jsonl" + + convert_benchflow_to_eval_output(jobs_dir, output_file) + + with open(output_file) as f: + entries = [json.loads(line) for line in f] + + assert len(entries) == 1 + assert entries[0]["instance_id"] == "benchflow/weighted-gdp-calc" + assert entries[0]["test_result"]["passed"] is True - with pytest.raises(RuntimeError, match="No trial result files found"): - convert_harbor_to_eval_output(harbor_dir, tmp_path / "output.jsonl") + def test_task_id_filter_matches_timestamped_trial_dir(self, tmp_path: Path) -> None: + """Test filtering strips the run suffix from trial directory names.""" + jobs_dir = self._create_benchflow_timestamped_job( + tmp_path, + [ + ( + "task-a", + { + "task_name": "task-a", + "rewards": {"reward": 1.0}, + "error": None, + }, + ), + ( + "task-b", + { + "task_name": "task-b", + "rewards": {"reward": 0.0}, + "error": None, + }, + ), + ], + ) + output_file = tmp_path / "output.jsonl" + + convert_benchflow_to_eval_output( + jobs_dir, output_file, task_ids=["benchflow/task-a"] + ) + + with open(output_file) as f: + entries = [json.loads(line) for line in f] + + assert len(entries) == 1 + assert entries[0]["instance_id"] == "benchflow/task-a" diff --git a/uv.lock b/uv.lock index 2cd0b3640..147abedc9 100644 --- a/uv.lock +++ b/uv.lock @@ -1282,6 +1282,7 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/44/69/9b804adb5fd0671f367781560eb5eb586c4d495277c93bde4307b9e28068/greenlet-3.2.4-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:3b67ca49f54cede0186854a008109d6ee71f66bd57bb36abd6d0a0267b540cdd", size = 274079, upload-time = "2025-08-07T13:15:45.033Z" }, { url = "https://files.pythonhosted.org/packages/46/e9/d2a80c99f19a153eff70bc451ab78615583b8dac0754cfb942223d2c1a0d/greenlet-3.2.4-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ddf9164e7a5b08e9d22511526865780a576f19ddd00d62f8a665949327fde8bb", size = 640997, upload-time = "2025-08-07T13:42:56.234Z" }, { url = "https://files.pythonhosted.org/packages/3b/16/035dcfcc48715ccd345f3a93183267167cdd162ad123cd93067d86f27ce4/greenlet-3.2.4-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:f28588772bb5fb869a8eb331374ec06f24a83a9c25bfa1f38b6993afe9c1e968", size = 655185, upload-time = "2025-08-07T13:45:27.624Z" }, + { url = "https://files.pythonhosted.org/packages/31/da/0386695eef69ffae1ad726881571dfe28b41970173947e7c558d9998de0f/greenlet-3.2.4-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:5c9320971821a7cb77cfab8d956fa8e39cd07ca44b6070db358ceb7f8797c8c9", size = 649926, upload-time = "2025-08-07T13:53:15.251Z" }, { url = "https://files.pythonhosted.org/packages/68/88/69bf19fd4dc19981928ceacbc5fd4bb6bc2215d53199e367832e98d1d8fe/greenlet-3.2.4-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:c60a6d84229b271d44b70fb6e5fa23781abb5d742af7b808ae3f6efd7c9c60f6", size = 651839, upload-time = "2025-08-07T13:18:30.281Z" }, { url = 
"https://files.pythonhosted.org/packages/19/0d/6660d55f7373b2ff8152401a83e02084956da23ae58cddbfb0b330978fe9/greenlet-3.2.4-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3b3812d8d0c9579967815af437d96623f45c0f2ae5f04e366de62a12d83a8fb0", size = 607586, upload-time = "2025-08-07T13:18:28.544Z" }, { url = "https://files.pythonhosted.org/packages/8e/1a/c953fdedd22d81ee4629afbb38d2f9d71e37d23caace44775a3a969147d4/greenlet-3.2.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:abbf57b5a870d30c4675928c37278493044d7c14378350b3aa5d484fa65575f0", size = 1123281, upload-time = "2025-08-07T13:42:39.858Z" }, @@ -1292,6 +1293,7 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/49/e8/58c7f85958bda41dafea50497cbd59738c5c43dbbea5ee83d651234398f4/greenlet-3.2.4-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:1a921e542453fe531144e91e1feedf12e07351b1cf6c9e8a3325ea600a715a31", size = 272814, upload-time = "2025-08-07T13:15:50.011Z" }, { url = "https://files.pythonhosted.org/packages/62/dd/b9f59862e9e257a16e4e610480cfffd29e3fae018a68c2332090b53aac3d/greenlet-3.2.4-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cd3c8e693bff0fff6ba55f140bf390fa92c994083f838fece0f63be121334945", size = 641073, upload-time = "2025-08-07T13:42:57.23Z" }, { url = "https://files.pythonhosted.org/packages/f7/0b/bc13f787394920b23073ca3b6c4a7a21396301ed75a655bcb47196b50e6e/greenlet-3.2.4-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:710638eb93b1fa52823aa91bf75326f9ecdfd5e0466f00789246a5280f4ba0fc", size = 655191, upload-time = "2025-08-07T13:45:29.752Z" }, + { url = "https://files.pythonhosted.org/packages/f2/d6/6adde57d1345a8d0f14d31e4ab9c23cfe8e2cd39c3baf7674b4b0338d266/greenlet-3.2.4-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:c5111ccdc9c88f423426df3fd1811bfc40ed66264d35aa373420a34377efc98a", size = 649516, upload-time = "2025-08-07T13:53:16.314Z" }, { url = "https://files.pythonhosted.org/packages/7f/3b/3a3328a788d4a473889a2d403199932be55b1b0060f4ddd96ee7cdfcad10/greenlet-3.2.4-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:d76383238584e9711e20ebe14db6c88ddcedc1829a9ad31a584389463b5aa504", size = 652169, upload-time = "2025-08-07T13:18:32.861Z" }, { url = "https://files.pythonhosted.org/packages/ee/43/3cecdc0349359e1a527cbf2e3e28e5f8f06d3343aaf82ca13437a9aa290f/greenlet-3.2.4-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:23768528f2911bcd7e475210822ffb5254ed10d71f4028387e5a99b4c6699671", size = 610497, upload-time = "2025-08-07T13:18:31.636Z" }, { url = "https://files.pythonhosted.org/packages/b8/19/06b6cf5d604e2c382a6f31cafafd6f33d5dea706f4db7bdab184bad2b21d/greenlet-3.2.4-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:00fadb3fedccc447f517ee0d3fd8fe49eae949e1cd0f6a611818f4f6fb7dc83b", size = 1121662, upload-time = "2025-08-07T13:42:41.117Z" }, @@ -1302,6 +1304,7 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/22/5c/85273fd7cc388285632b0498dbbab97596e04b154933dfe0f3e68156c68c/greenlet-3.2.4-cp314-cp314-macosx_11_0_universal2.whl", hash = "sha256:49a30d5fda2507ae77be16479bdb62a660fa51b1eb4928b524975b3bde77b3c0", size = 273586, upload-time = "2025-08-07T13:16:08.004Z" }, { url = "https://files.pythonhosted.org/packages/d1/75/10aeeaa3da9332c2e761e4c50d4c3556c21113ee3f0afa2cf5769946f7a3/greenlet-3.2.4-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = 
"sha256:299fd615cd8fc86267b47597123e3f43ad79c9d8a22bebdce535e53550763e2f", size = 686346, upload-time = "2025-08-07T13:42:59.944Z" }, { url = "https://files.pythonhosted.org/packages/c0/aa/687d6b12ffb505a4447567d1f3abea23bd20e73a5bed63871178e0831b7a/greenlet-3.2.4-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:c17b6b34111ea72fc5a4e4beec9711d2226285f0386ea83477cbb97c30a3f3a5", size = 699218, upload-time = "2025-08-07T13:45:30.969Z" }, + { url = "https://files.pythonhosted.org/packages/dc/8b/29aae55436521f1d6f8ff4e12fb676f3400de7fcf27fccd1d4d17fd8fecd/greenlet-3.2.4-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:b4a1870c51720687af7fa3e7cda6d08d801dae660f75a76f3845b642b4da6ee1", size = 694659, upload-time = "2025-08-07T13:53:17.759Z" }, { url = "https://files.pythonhosted.org/packages/92/2e/ea25914b1ebfde93b6fc4ff46d6864564fba59024e928bdc7de475affc25/greenlet-3.2.4-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:061dc4cf2c34852b052a8620d40f36324554bc192be474b9e9770e8c042fd735", size = 695355, upload-time = "2025-08-07T13:18:34.517Z" }, { url = "https://files.pythonhosted.org/packages/72/60/fc56c62046ec17f6b0d3060564562c64c862948c9d4bc8aa807cf5bd74f4/greenlet-3.2.4-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:44358b9bf66c8576a9f57a590d5f5d6e72fa4228b763d0e43fee6d3b06d3a337", size = 657512, upload-time = "2025-08-07T13:18:33.969Z" }, { url = "https://files.pythonhosted.org/packages/23/6e/74407aed965a4ab6ddd93a7ded3180b730d281c77b765788419484cdfeef/greenlet-3.2.4-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:2917bdf657f5859fbf3386b12d68ede4cf1f04c90c3a6bc1f013dd68a22e2269", size = 1612508, upload-time = "2025-11-04T12:42:23.427Z" }, @@ -1516,11 +1519,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/2a/39/e50c7c3a983047577ee07d2a9e53faf5a69493943ec3f6a384bdc792deb2/httpx-0.28.1-py3-none-any.whl", hash = "sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad", size = 73517, upload-time = "2024-12-06T15:37:21.509Z" }, ] -[package.optional-dependencies] -socks = [ - { name = "socksio" }, -] - [[package]] name = "httpx-sse" version = "0.4.2" @@ -1816,12 +1814,14 @@ wheels = [ [[package]] name = "litellm" -version = "1.83.0" +version = "1.80.10" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "aiohttp" }, { name = "click" }, { name = "fastuuid" }, + { name = "grpcio", version = "1.67.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14'" }, + { name = "grpcio", version = "1.76.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.14'" }, { name = "httpx" }, { name = "importlib-metadata" }, { name = "jinja2" }, @@ -1832,9 +1832,9 @@ dependencies = [ { name = "tiktoken" }, { name = "tokenizers" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/22/92/6ce9737554994ca8e536e5f4f6a87cc7c4774b656c9eb9add071caf7d54b/litellm-1.83.0.tar.gz", hash = "sha256:860bebc76c4bb27b4cf90b4a77acd66dba25aced37e3db98750de8a1766bfb7a", size = 17333062, upload-time = "2026-03-31T05:08:25.331Z" } +sdist = { url = "https://files.pythonhosted.org/packages/dd/44/0aaa7449e7c4aa05668ec03f1f68a01b1e476591071d9659a68db19371a2/litellm-1.80.10.tar.gz", hash = "sha256:4a4aff7558945c2f7e5c6523e67c1b5525a46b10b0e1ad6b8f847cb13b16779e", size = 12764777, upload-time = "2025-12-14T02:07:05.362Z" } wheels = [ - { url = 
"https://files.pythonhosted.org/packages/19/2c/a670cc050fcd6f45c6199eb99e259c73aea92edba8d5c2fc1b3686d36217/litellm-1.83.0-py3-none-any.whl", hash = "sha256:88c536d339248f3987571493015784671ba3f193a328e1ea6780dbebaa2094a8", size = 15610306, upload-time = "2026-03-31T05:08:21.987Z" }, + { url = "https://files.pythonhosted.org/packages/8c/a9/4814b6aa58f6705df2831eaadeb5bc8240684c8c9d5964245212f85049d1/litellm-1.80.10-py3-none-any.whl", hash = "sha256:9b3e561efaba0eb1291cb1555d3dcb7283cf7f3cb65aadbcdb42e2a8765898c8", size = 11264240, upload-time = "2025-12-14T02:07:02.414Z" }, ] [[package]] @@ -2402,7 +2402,7 @@ wheels = [ [[package]] name = "openhands-agent-server" -version = "1.16.1" +version = "1.16.0" source = { editable = "vendor/software-agent-sdk/openhands-agent-server" } dependencies = [ { name = "aiosqlite" }, @@ -2467,6 +2467,7 @@ dependencies = [ { name = "python-json-logger" }, { name = "requests" }, { name = "swebench" }, + { name = "swesmith" }, { name = "swt-bench" }, { name = "tenacity" }, { name = "toml" }, @@ -2521,6 +2522,7 @@ requires-dist = [ { name = "python-json-logger", specifier = ">=3.3.0" }, { name = "requests" }, { name = "swebench", specifier = "==4.1.0" }, + { name = "swesmith", specifier = ">=0.0.9" }, { name = "swt-bench", git = "https://github.com/logic-star-ai/swt-bench.git?rev=5fdcd446ff05e248ecfffc19d560a210699f71f8" }, { name = "tenacity", specifier = ">=9.1.2" }, { name = "toml" }, @@ -2544,7 +2546,7 @@ dev = [ [[package]] name = "openhands-sdk" -version = "1.16.1" +version = "1.16.0" source = { editable = "vendor/software-agent-sdk/openhands-sdk" } dependencies = [ { name = "agent-client-protocol" }, @@ -2552,7 +2554,7 @@ dependencies = [ { name = "fakeredis", extra = ["lua"] }, { name = "fastmcp" }, { name = "filelock" }, - { name = "httpx", extra = ["socks"] }, + { name = "httpx" }, { name = "litellm" }, { name = "lmnr" }, { name = "pydantic" }, @@ -2575,8 +2577,8 @@ requires-dist = [ { name = "fakeredis", extras = ["lua"], specifier = ">=2.32.1" }, { name = "fastmcp", specifier = ">=3.0.0" }, { name = "filelock", specifier = ">=3.20.1" }, - { name = "httpx", extras = ["socks"], specifier = ">=0.27.0" }, - { name = "litellm", specifier = ">=1.82.6,!=1.82.7,!=1.82.8" }, + { name = "httpx", specifier = ">=0.27.0" }, + { name = "litellm", specifier = "==1.80.10" }, { name = "lmnr", specifier = ">=0.7.24" }, { name = "pydantic", specifier = ">=2.12.5" }, { name = "python-frontmatter", specifier = ">=1.1.0" }, @@ -2588,7 +2590,7 @@ provides-extras = ["boto3"] [[package]] name = "openhands-tools" -version = "1.16.1" +version = "1.16.0" source = { editable = "vendor/software-agent-sdk/openhands-tools" } dependencies = [ { name = "bashlex" }, @@ -2617,7 +2619,7 @@ requires-dist = [ [[package]] name = "openhands-workspace" -version = "1.16.1" +version = "1.16.0" source = { editable = "vendor/software-agent-sdk/openhands-workspace" } dependencies = [ { name = "openhands-agent-server" }, @@ -6707,15 +6709,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e9/44/75a9c9421471a6c4805dbf2356f7c181a29c1879239abab1ea2cc8f38b40/sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2", size = 10235, upload-time = "2024-02-25T23:20:01.196Z" }, ] -[[package]] -name = "socksio" -version = "1.0.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/f8/5c/48a7d9495be3d1c651198fd99dbb6ce190e2274d0f28b9051307bdec6b85/socksio-1.0.0.tar.gz", hash = 
"sha256:f88beb3da5b5c38b9890469de67d0cb0f9d494b78b106ca1845f96c10b91c4ac", size = 19055, upload-time = "2020-04-17T15:50:34.664Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/37/c3/6eeb6034408dac0fa653d126c9204ade96b819c936e136c5e8a6897eee9c/socksio-1.0.0-py3-none-any.whl", hash = "sha256:95dc1f15f9b34e8d7b16f06d74b8ccf48f609af32ab33c608d08761c5dcbb1f3", size = 12763, upload-time = "2020-04-17T15:50:31.878Z" }, -] - [[package]] name = "sortedcontainers" version = "2.4.0" @@ -6841,6 +6834,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/36/67/981d8b642ac3eac7c8a7b7832ff8b2fb74f96b28b5fcd9a8979879e5c46d/swebench-4.1.0-py3-none-any.whl", hash = "sha256:1243776f720047cc9e20a427f7a52b75c13a07abda6154fb60fe77f82ec8af57", size = 157231, upload-time = "2025-09-11T02:57:58.953Z" }, ] +[[package]] +name = "swesmith" +version = "0.0.9" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/07/97/e506b20fa59debc66e4660a86b0e98b45d32c87f23b994ad739e9c5d542a/swesmith-0.0.9.tar.gz", hash = "sha256:1726124ea43577853c6efb0a5a0db5fa3ce5c340e1bed479afa5bab85d8a69da", size = 214830, upload-time = "2026-02-27T01:06:13.455Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/80/2d/71b6ac5dadbe7199085de3815624775744d51b6c554efeeddfb12dc45ce1/swesmith-0.0.9-py3-none-any.whl", hash = "sha256:cbb98a52fc573b38032cde1179b6ce5f5862ce7c31d6931cfd5b8ad4969ce900", size = 275800, upload-time = "2026-02-27T01:06:11.864Z" }, +] + [[package]] name = "swt-bench" version = "1.0.1" diff --git a/vendor/software-agent-sdk b/vendor/software-agent-sdk index 3e0a3a091..acd5adc96 160000 --- a/vendor/software-agent-sdk +++ b/vendor/software-agent-sdk @@ -1 +1 @@ -Subproject commit 3e0a3a0915b369c7e2057c77722e98585855d30a +Subproject commit acd5adc965c08a0f815cf8e5f3166d1d090034d6 From 935f489e5233d919fb9db6ccfa63a39f89b68511 Mon Sep 17 00:00:00 2001 From: Chujun Tao Date: Wed, 22 Apr 2026 20:57:53 -0400 Subject: [PATCH 09/12] Revert "feat(skillsbench): migrate harness from Harbor to benchflow 0.3.0" This reverts commit 4d31c87c8fb0c7ff8341bc37debe06851ad2a67b. --- .gitignore | 1 - benchmarks/skillsbench/README.md | 42 +- benchmarks/skillsbench/config.py | 11 +- benchmarks/skillsbench/run_infer.py | 655 +++++++++------------------ tests/test_skillsbench_eval_infer.py | 17 + tests/test_skillsbench_run_infer.py | 442 +++++------------- uv.lock | 50 +- vendor/software-agent-sdk | 2 +- 8 files changed, 396 insertions(+), 824 deletions(-) diff --git a/.gitignore b/.gitignore index 9164fd12b..459fad588 100644 --- a/.gitignore +++ b/.gitignore @@ -216,5 +216,4 @@ workspace/ # Evaluation outputs eval_outputs/ -evaluation_outputs/ builds/ diff --git a/benchmarks/skillsbench/README.md b/benchmarks/skillsbench/README.md index 21339842c..60ff73652 100644 --- a/benchmarks/skillsbench/README.md +++ b/benchmarks/skillsbench/README.md @@ -1,10 +1,10 @@ # SkillsBench Evaluation -This module provides integration with [SkillsBench](https://www.skillsbench.ai/), a benchmark for evaluating AI agents on real-world skill-based tasks. The integration uses [benchflow](https://github.com/benchflow-ai/benchflow) as the evaluation harness with the `openhands` agent. +This module provides integration with [SkillsBench](https://www.skillsbench.ai/), a benchmark for evaluating AI agents on real-world skill-based tasks. The integration uses [Harbor](https://harborframework.com) as the evaluation harness with the `openhands-sdk` agent. 
## Overview -SkillsBench comprises tasks across 11 domains, evaluating the efficacy of Skills augmentation in LLM-based agents. Domains include: +SkillsBench comprises tasks across 11 domains, evaluating the efficacy of Skills augmentation in LLM-based agents.Domains contain - Software engineering - Office & white collar @@ -20,25 +20,23 @@ SkillsBench comprises tasks across 11 domains, evaluating the efficacy of Skills ## Prerequisites -1. **Install benchflow**: benchflow is the official harness for running SkillsBench. +1. **Install Harbor**: Harbor is the official harness for running SkillsBench. ```bash - uv tool install benchflow==0.3.0 + pip install harbor # or - pip install benchflow==0.3.0 - # or - uv pip install benchflow==0.3.0 + uv pip install harbor ``` -2. **Docker**: benchflow requires Docker to be installed and running. +2. **Docker**: Harbor requires Docker to be installed and running. -3. **LLM API Key**: Configure your LLM provider credentials. The benchflow `openhands` agent reads `LLM_API_KEY` and optional `LLM_BASE_URL` from the environment. +3. **LLM API Key**: Configure your LLM provider credentials. ## Usage ### Running Inference -Run the SkillsBench evaluation using the `openhands` agent: +Run the SkillsBench evaluation using the OpenHands SDK agent: ```bash # Run full evaluation @@ -64,7 +62,7 @@ Create an LLM configuration file (e.g., `.llm_config/claude.json`): ```json { "model": "anthropic/claude-sonnet-4-20250514", - "api_key": "YOUR_ANTHROPIC_API_KEY" + "api_key": "YOUR_API_KEY" } ``` @@ -101,6 +99,8 @@ Each line contains: { "instance_id": "benchflow/task-name", "test_result": { + "trial_name": "...", + "trial_uri": "...", "rewards": {"reward": 1.0}, "passed": true }, @@ -134,21 +134,22 @@ Each line contains: ## Architecture -The integration uses the benchflow CLI as the evaluation harness: +The integration follows the Harbor agent adapter pattern: -1. **Task download**: the integration clones the SkillsBench task repo locally when the task cache is empty -2. **benchflow job**: Runs all tasks concurrently with `openhands` -3. **Result conversion**: Trial `result.json` files are converted to the standard `output.jsonl` format +1. **Harbor Harness**: Manages task containers and lifecycle +2. **OpenHands SDK Agent**: Runs inside containers to solve tasks +3. 
**ATIF Trajectories**: Results stored in Agent Trajectory Interchange Format ```text ┌──────────────────────────────────────────────────┐ -│ benchflow job │ +│ Harbor Harness │ │ ┌────────────────────────────────────────────┐ │ -│ │ Task Container (Docker) │ │ +│ │ Task Container │ │ │ │ ┌──────────────────────────────────────┐ │ │ -│ │ │ openhands │ │ │ +│ │ │ OpenHands SDK Agent │ │ │ │ │ │ - Terminal tool │ │ │ │ │ │ - File editor tool │ │ │ +│ │ │ - Task tracker tool │ │ │ │ │ └──────────────────────────────────────┘ │ │ │ └────────────────────────────────────────────┘ │ └──────────────────────────────────────────────────┘ @@ -157,5 +158,6 @@ The integration uses the benchflow CLI as the evaluation harness: ## References - [SkillsBench](https://www.skillsbench.ai/) - The benchmark -- [benchflow](https://github.com/benchflow-ai/benchflow) - The evaluation harness -- [benchflow CLI reference](https://github.com/benchflow-ai/benchflow/blob/main/docs/cli-reference.md) - CLI documentation +- [Harbor](https://harborframework.com) - The evaluation harness +- [OpenHands SDK](https://github.com/OpenHands/software-agent-sdk) - The agent SDK +- [ATIF Specification](https://github.com/laude-institute/harbor/blob/main/docs/rfcs/0001-trajectory-format.md) - Trajectory format diff --git a/benchmarks/skillsbench/config.py b/benchmarks/skillsbench/config.py index 4ed541ab9..8b55a92b0 100644 --- a/benchmarks/skillsbench/config.py +++ b/benchmarks/skillsbench/config.py @@ -1,13 +1,16 @@ """SkillsBench configuration defaults.""" -# Default inference settings +# Default inference settings (only include values actually used by argparse) INFER_DEFAULTS = { "dataset": "benchflow/skillsbench", "output_dir": "./evaluation_outputs", "num_workers": 1, } -# benchflow configuration defaults -BENCHFLOW_DEFAULTS = { - "agent_name": "openhands", +# Harbor configuration defaults +HARBOR_DEFAULTS = { + # Harbor executable + "harbor_executable": "harbor", + # Default agent name for openhands-sdk + "agent_name": "openhands-sdk", } diff --git a/benchmarks/skillsbench/run_infer.py b/benchmarks/skillsbench/run_infer.py index 2e11a100a..a8afa7281 100644 --- a/benchmarks/skillsbench/run_infer.py +++ b/benchmarks/skillsbench/run_infer.py @@ -1,31 +1,24 @@ -"""SkillsBench inference script using the benchflow SDK. +"""SkillsBench inference script using Harbor with openhands-sdk agent. -This script runs SkillsBench evaluation using `benchflow job` as the harness -and `openhands` as the default agent. Results are saved in a format compatible +This script runs SkillsBench evaluation using Harbor as the harness +and openhands-sdk as the agent. Results are saved in a format compatible with the standard evaluation pipeline. 
Usage: - uv run skillsbench-infer - - # Run specific tasks - uv run skillsbench-infer --select tasks.txt + uv run skillsbench-infer --dataset benchflow/skillsbench """ import argparse import json import os -import re -import shutil import subprocess import sys -import tempfile from datetime import datetime, timezone from pathlib import Path -import yaml from pydantic import SecretStr -from benchmarks.skillsbench.config import BENCHFLOW_DEFAULTS, INFER_DEFAULTS +from benchmarks.skillsbench.config import HARBOR_DEFAULTS, INFER_DEFAULTS from benchmarks.utils.evaluation_utils import construct_eval_output_dir from benchmarks.utils.report_costs import generate_cost_report from openhands.sdk import LLM, get_logger @@ -33,469 +26,256 @@ logger = get_logger(__name__) -# Matches benchflow 0.3.0 job directory names: YYYY-MM-DD__HH-MM-SS -_TIMESTAMP_RE = re.compile(r"^\d{4}-\d{2}-\d{2}__\d{2}-\d{2}-\d{2}$") - -# "Total cost: $0.0487" -_COST_RE = re.compile(r"Total cost:\s*\$([0-9]+(?:\.[0-9]+)?)") -# "Tokens: ↑ input 404.21K • ... • ↓ output 7.83K" -_TOKENS_RE = re.compile(r"↑ input\s+([\d.]+)([KMB]?)\b.*?↓ output\s+([\d.]+)([KMB]?)\b") - +# Output filename for results OUTPUT_FILENAME = "output.jsonl" -TASK_REPOS = { - "skillsbench": { - "repo": "https://github.com/benchflow-ai/skillsbench.git", - "subdir": "tasks", - } -} - -_DIRECT_PROVIDER_ENV_VARS: dict[str, tuple[tuple[str, ...], str | None]] = { - "anthropic": (("ANTHROPIC_API_KEY",), "ANTHROPIC_BASE_URL"), - "gemini": (("GEMINI_API_KEY", "GOOGLE_API_KEY"), "GEMINI_BASE_URL"), - "google": (("GEMINI_API_KEY", "GOOGLE_API_KEY"), "GEMINI_BASE_URL"), - "openai": (("OPENAI_API_KEY",), "OPENAI_BASE_URL"), -} - - -def _infer_direct_provider(model: str) -> str | None: - """Infer the provider prefix for direct model names. - - Examples: - - gemini/gemini-2.5-pro -> gemini - - anthropic/claude-sonnet-4-5 -> anthropic - - litellm_proxy/anthropic/... -> None (proxy config uses LLM_* vars) - """ - if not model or model.startswith("litellm_proxy/"): - return None - if "/" in model: - provider = model.split("/", 1)[0].lower() - if provider in _DIRECT_PROVIDER_ENV_VARS: - return provider - return None - - -def _build_benchflow_agent_env(llm: LLM) -> dict[str, str]: - """Build the sandbox environment for benchflow's openhands agent. - - Only LLM-specific variables are returned — these go INTO the sandbox - container via the ``agent_env`` YAML key. The calling process inherits - the host environment normally; dumping ``os.environ`` here would leak - the entire host env into every container. - """ - env: dict[str, str] = {} - api_key: str | None = None - if llm.api_key: - api_key = ( - llm.api_key.get_secret_value() - if isinstance(llm.api_key, SecretStr) - else llm.api_key - ) - env["LLM_API_KEY"] = api_key - if llm.base_url: - env["LLM_BASE_URL"] = llm.base_url - - provider = _infer_direct_provider(llm.model) - if provider and api_key: - key_vars, base_url_var = _DIRECT_PROVIDER_ENV_VARS[provider] - for var_name in key_vars: - env[var_name] = api_key - if llm.base_url and base_url_var: - env[base_url_var] = llm.base_url - - return env - - -def check_benchflow_installed() -> bool: - """Check if benchflow CLI is installed and available. - - Tries ``bench`` first (current name), then falls back to the legacy - ``benchflow`` binary. 
- """ - for cmd in ("bench", "benchflow"): - try: - result = subprocess.run( - [cmd, "--help"], - capture_output=True, - text=True, - timeout=10, - ) - if result.returncode == 0: - return True - except (FileNotFoundError, subprocess.TimeoutExpired): - continue - return False - -def _resolve_task_repo(dataset: str) -> tuple[str, dict[str, str]]: - """Map a benchflow dataset name to its task repository metadata.""" - dataset_name = dataset.split("@", 1)[0].split("/")[-1] +def check_harbor_installed() -> bool: + """Check if harbor CLI is installed and available.""" + harbor_exe = HARBOR_DEFAULTS["harbor_executable"] try: - return dataset_name, TASK_REPOS[dataset_name] - except KeyError as exc: - raise ValueError( - f"Unsupported SkillsBench dataset: {dataset!r}. " - f"Known datasets: {sorted(TASK_REPOS)}" - ) from exc - - -def ensure_tasks( - dataset: str, - tasks_dir: Path, - task_ids: list[str] | None = None, -) -> None: - """Download tasks for a benchflow dataset into tasks_dir. - - BenchFlow 0.3.0 does not expose ``benchflow tasks pull``, so we clone the - benchmark task repository directly when the local tasks directory is empty. - - When *task_ids* is provided, a sparse checkout is used so only the - requested task subdirectories are downloaded — much faster than a full - clone for large repos. - """ - if tasks_dir.exists() and any(tasks_dir.iterdir()): - logger.info(f"Tasks already present in {tasks_dir}, skipping download") - return - - _, repo_info = _resolve_task_repo(dataset) - tasks_dir.mkdir(parents=True, exist_ok=True) - clone_dir = tasks_dir.parent / "_clone" - if clone_dir.exists(): - shutil.rmtree(clone_dir, ignore_errors=True) - - subdir = repo_info.get("subdir", "") - - if task_ids: - # Sparse checkout: only download the specific task directories - short_names = [tid.split("/")[-1] for tid in task_ids] - - cmd_clone = [ - "git", - "clone", - "--no-checkout", - "--depth", - "1", - repo_info["repo"], - str(clone_dir), - ] - logger.info(f"Sparse clone: {' '.join(cmd_clone)}") - result = subprocess.run(cmd_clone, capture_output=True, text=True) - if result.returncode != 0: - raise RuntimeError(f"task download failed: {result.stderr}") - - # Init sparse-checkout and set the desired paths - subprocess.run( - ["git", "-C", str(clone_dir), "sparse-checkout", "init", "--cone"], + result = subprocess.run( + [harbor_exe, "--version"], capture_output=True, text=True, - check=True, + timeout=10, ) - sparse_paths = [f"{subdir}/{name}" if subdir else name for name in short_names] - subprocess.run( - ["git", "-C", str(clone_dir), "sparse-checkout", "set", *sparse_paths], - capture_output=True, - text=True, - check=True, - ) - subprocess.run( - ["git", "-C", str(clone_dir), "checkout"], - capture_output=True, - text=True, - check=True, - ) - else: - # Full shallow clone - cmd = ["git", "clone", "--depth", "1", repo_info["repo"], str(clone_dir)] - logger.info(f"Downloading tasks: {' '.join(cmd)}") - result = subprocess.run(cmd, capture_output=True, text=True) - if result.returncode != 0: - logger.error(f"Failed to clone tasks: {result.stderr}") - raise RuntimeError(f"task download failed: {result.stderr}") - - try: - source_dir = clone_dir / subdir if subdir else clone_dir - - for entry in source_dir.iterdir(): - target = tasks_dir / entry.name - if entry.is_dir(): - shutil.copytree(entry, target, dirs_exist_ok=True) - else: - shutil.copy2(entry, target) - finally: - shutil.rmtree(clone_dir, ignore_errors=True) + return result.returncode == 0 + except (FileNotFoundError, 
subprocess.TimeoutExpired): + return False - logger.info(f"Tasks downloaded to {tasks_dir}") - -def run_benchflow_job( +def run_harbor_evaluation( llm: LLM, - tasks_dir: Path, - jobs_dir: Path, + dataset: str, + output_dir: str, num_workers: int = 1, task_ids: list[str] | None = None, + n_limit: int | None = None, ) -> Path: - """Run benchflow job command. + """Run harbor evaluation with openhands-sdk agent. Args: llm: LLM configuration for the agent. - tasks_dir: Path to directory containing task subdirectories. - jobs_dir: Directory for benchflow job output. - num_workers: Number of parallel workers (concurrency). - task_ids: Optional list of task IDs to filter (short names, not full paths). + dataset: Harbor dataset name (e.g., benchflow/skillsbench). + output_dir: Directory to store output files. + num_workers: Number of parallel workers. + task_ids: Optional list of specific task IDs to run. + n_limit: Optional maximum number of dataset tasks to run. Returns: - Path to jobs_dir. + Path to the harbor output directory. """ - jobs_dir.mkdir(parents=True, exist_ok=True) - - agent_env = _build_benchflow_agent_env(llm) - # Ubuntu 24.04 enforces PEP 668 and blocks bare `pip install` without - # --break-system-packages. benchflow's openhands install_cmd uses plain - # `pip install openhands`, which silently fails (exit 0) on Ubuntu 24.04, - # causing "Agent openhands install failed (rc=1)". Setting this env var - # makes pip skip the restriction without modifying the install_cmd. - agent_env.setdefault("PIP_BREAK_SYSTEM_PACKAGES", "1") - config = { - "tasks_dir": str(tasks_dir), - "jobs_dir": str(jobs_dir.resolve()), - "agent": BENCHFLOW_DEFAULTS["agent_name"], - "model": llm.model, - "environment": "docker", - "concurrency": num_workers, - # OpenHands is installed inside the sandbox as root by benchflow's - # registry install command. Running as the default "agent" user can - # lose access to that binary on some task images. - "sandbox_user": None, - "agent_env": agent_env, - } - - with tempfile.NamedTemporaryFile( - mode="w", suffix=".yaml", prefix="benchflow-job-", delete=False - ) as tmp: - yaml.safe_dump(config, tmp, sort_keys=False) - config_path = tmp.name - - # Prefer `bench eval create` (current), fall back to legacy `benchflow job` - bench_bin = shutil.which("bench") or shutil.which("benchflow") or "bench" - if "benchflow" in bench_bin: - cmd = [bench_bin, "job", "--config", config_path] - else: - cmd = [bench_bin, "eval", "create", "-f", config_path] + harbor_output_dir = Path(output_dir) / "harbor_output" + harbor_output_dir.mkdir(parents=True, exist_ok=True) + harbor_exe = HARBOR_DEFAULTS["harbor_executable"] + + # Build harbor command using harbor CLI flags. + # Use absolute path for --jobs-dir to avoid CWD-relative path issues. 
+ cmd = [ + harbor_exe, + "run", + "-d", + dataset, + "-a", + HARBOR_DEFAULTS["agent_name"], + "-m", + llm.model, + "--jobs-dir", + str(harbor_output_dir.resolve()), + "--n-concurrent", + str(num_workers), + ] + + # Pass LLM credentials as agent environment variables + if llm.api_key: + api_key = ( + llm.api_key.get_secret_value() + if isinstance(llm.api_key, SecretStr) + else llm.api_key + ) + cmd.extend(["--ae", f"LLM_API_KEY={api_key}"]) + if llm.base_url: + cmd.extend(["--ae", f"LLM_BASE_URL={llm.base_url}"]) - logger.info(f"Running: {' '.join(cmd)}") + # Add specific task names if provided + if task_ids: + for task_id in task_ids: + cmd.extend(["--include-task-name", task_id]) - # Inject LLM vars into the host process env so benchflow's provider - # resolution can pick them up; the subprocess inherits normally (env=None). - host_env = os.environ.copy() - host_env.update(agent_env) - result = subprocess.run(cmd, capture_output=True, text=True, env=host_env) - Path(config_path).unlink(missing_ok=True) + if n_limit is not None: + cmd.extend(["--n-tasks", str(n_limit)]) - if result.returncode != 0: - logger.error(f"benchflow job failed (code {result.returncode})") - logger.error(f"stdout: {result.stdout}") - logger.error(f"stderr: {result.stderr}") - raise RuntimeError(f"benchflow job failed: {result.stderr}") + logger.info(f"Running harbor command: {' '.join(cmd)}") + logger.info(f"Output directory: {harbor_output_dir}") - logger.info("benchflow job completed") - logger.info(f"stdout: {result.stdout}") + # harbor's openhands-sdk agent reads LLM credentials from the host process + # environment (os.environ), not from --ae flags which go to the sandbox. + env = os.environ.copy() + if llm.api_key: + api_key = ( + llm.api_key.get_secret_value() + if isinstance(llm.api_key, SecretStr) + else llm.api_key + ) + env["LLM_API_KEY"] = api_key + if llm.base_url: + env["LLM_BASE_URL"] = llm.base_url - return jobs_dir + try: + result = subprocess.run( + cmd, + capture_output=True, + text=True, + env=env, + ) + if result.returncode != 0: + logger.error(f"Harbor command failed with code {result.returncode}") + logger.error(f"stdout: {result.stdout}") + logger.error(f"stderr: {result.stderr}") + raise RuntimeError(f"Harbor evaluation failed: {result.stderr}") -def _extract_trial_metrics(trial_dir: Path) -> dict: - """Extract token/cost metrics from benchflow 0.3.0 trial output files. + logger.info("Harbor evaluation completed successfully") + logger.info(f"stdout: {result.stdout}") - benchflow 0.3.0 does not write cost/token fields to result.json. - Instead, metrics are read from: - 1. agent/trajectory.json → final_metrics (harbor-format agent) - 2. agent/openhands.txt → "Total cost:" and "Tokens:" lines (ACP agent) - """ - # 1. Harbor-format trajectory.json written by openhands-sdk agent - traj_file = trial_dir / "agent" / "trajectory.json" - if traj_file.exists(): - try: - with open(traj_file) as f: - traj = json.load(f) - fm = traj.get("final_metrics") or {} - if fm: - return { - "total_prompt_tokens": int(fm.get("total_prompt_tokens") or 0), - "total_completion_tokens": int( - fm.get("total_completion_tokens") or 0 - ), - "total_cost_usd": float(fm.get("total_cost_usd") or 0.0), - } - except (json.JSONDecodeError, OSError): - pass - - # 2. 
ACP agent log written by openhands acp (benchflow 0.3.0 native) - def _parse_token_count(value: str, suffix: str) -> int: - n = float(value) - return int( - n * {"K": 1_000, "M": 1_000_000, "B": 1_000_000_000}.get(suffix.upper(), 1) + except FileNotFoundError: + raise RuntimeError( + "Harbor CLI not found. Please install harbor: pip install harbor" ) - for log_name in ("openhands.txt", "openhands_sdk.txt"): - log_file = trial_dir / "agent" / log_name - if not log_file.exists(): - continue - try: - text = log_file.read_text(errors="replace") - cost_usd = 0.0 - prompt_tokens = 0 - completion_tokens = 0 - m = _COST_RE.search(text) - if m: - cost_usd = float(m.group(1)) - m = _TOKENS_RE.search(text) - if m: - prompt_tokens = _parse_token_count(m.group(1), m.group(2)) - completion_tokens = _parse_token_count(m.group(3), m.group(4)) - if cost_usd or prompt_tokens: - return { - "total_prompt_tokens": prompt_tokens, - "total_completion_tokens": completion_tokens, - "total_cost_usd": cost_usd, - } - except OSError: - pass - - return { - "total_prompt_tokens": 0, - "total_completion_tokens": 0, - "total_cost_usd": 0.0, - } + return harbor_output_dir + + +def _find_job_dir(harbor_output_dir: Path) -> Path: + """Find the harbor job directory (timestamp-named) inside the output dir.""" + # Harbor creates a timestamp-named subdirectory (e.g., 2026-03-07__16-08-47) + # containing result.json and trial subdirectories + candidates = [ + d + for d in harbor_output_dir.iterdir() + if d.is_dir() and (d / "result.json").exists() + ] + if not candidates: + raise RuntimeError( + f"No harbor job directory found in {harbor_output_dir}. " + f"Expected a timestamp-named directory containing result.json." + ) + # Use the most recent job directory if multiple exist + return sorted(candidates)[-1] -def convert_benchflow_to_eval_output( - jobs_dir: Path, +def convert_harbor_to_eval_output( + harbor_output_dir: Path, eval_output_path: Path, - task_ids: list[str] | None = None, ) -> None: - """Convert benchflow job output to standard evaluation output format. + """Convert harbor output to evaluation output format. - benchflow 0.3.0 stores trial results as: - jobs_dir/YYYY-MM-DD__HH-MM-SS/TASK_NAME__UUID8/result.json + Harbor stores trial results in a job directory structured as: + harbor_output/TIMESTAMP/TRIAL_NAME/result.json - Each result.json contains task_name, rewards, error, verifier_error, and timing. + Each trial's result.json contains task_name, verifier_result, agent_result, + timing info, and exception details. Args: - jobs_dir: Path to benchflow jobs directory. - eval_output_path: Path to write output.jsonl. - task_ids: Optional filter for specific task IDs (short names). + harbor_output_dir: Path to harbor output directory. + eval_output_path: Path to write the converted output.jsonl. """ - logger.info(f"Converting benchflow output from {jobs_dir}") - - # benchflow 0.3.0 writes: - # jobs/summary.json - # jobs/TIMESTAMP/TRIAL_NAME/result.json - # while older local outputs may place results directly under jobs/. 
- job_dirs = [d for d in jobs_dir.iterdir() if d.is_dir()] - timestamp_job_dirs = [d for d in job_dirs if _TIMESTAMP_RE.match(d.name)] - - if timestamp_job_dirs: - selected_job_dir = sorted(timestamp_job_dirs)[-1] - logger.info(f"Using benchflow job directory: {selected_job_dir}") - task_dirs = [d for d in selected_job_dir.iterdir() if d.is_dir()] - else: - task_dirs = job_dirs + logger.info(f"Converting harbor output from {harbor_output_dir}") - if not task_dirs: - raise RuntimeError(f"No task directories found in {jobs_dir}") + job_dir = _find_job_dir(harbor_output_dir) + logger.info(f"Using harbor job directory: {job_dir}") - if task_ids: - short_ids = {tid.split("/")[-1] for tid in task_ids} - task_dirs = [d for d in task_dirs if d.name.split("__")[0] in short_ids] + # Find trial result files (each trial dir has a result.json) + result_files = list(job_dir.glob("*/result.json")) + # Exclude the job-level result.json + result_files = [f for f in result_files if f.parent != job_dir] + + if not result_files: + raise RuntimeError( + f"No trial result files found in {job_dir}. " + f"Expected result.json files in trial subdirectories." + ) - logger.info(f"Processing {len(task_dirs)} task directories") + logger.info(f"Found {len(result_files)} trial results in {job_dir}") results: list[dict] = [] errors: list[dict] = [] - for task_dir in sorted(task_dirs): - # Find the trial result — benchflow writes trial-0/result.json - trial_results = list(task_dir.glob("trial-*/result.json")) - if not trial_results: - # Fall back to a direct result.json - direct = task_dir / "result.json" - if direct.exists(): - trial_results = [direct] - - if not trial_results: - logger.warning(f"No result.json found in {task_dir}, skipping") - errors.append( - { - "instance_id": f"benchflow/{task_dir.name}", - "error": "No result.json found", - "test_result": {}, - } - ) - continue - - # Use the last trial (highest retry index) - result_file = sorted(trial_results)[-1] - + for result_file in result_files: try: with open(result_file) as f: trial = json.load(f) - task_basename = task_dir.name.split("__")[0] - task_name = trial.get("task_name") or f"benchflow/{task_basename}" - # Normalise to benchflow/ form - if "/" not in task_name: - task_name = f"benchflow/{task_name}" + instance_id = trial.get("task_name", result_file.parent.name) - error = trial.get("error") - verifier_error = trial.get("verifier_error") - - if error or verifier_error: + # Check for exceptions + if trial.get("exception_info"): errors.append( { - "instance_id": task_name, - "error": str(error or verifier_error), + "instance_id": instance_id, + "error": str(trial["exception_info"]), "test_result": {}, } ) continue - rewards = trial.get("rewards") or {} - passed = bool(rewards.get("reward", 0.0)) + # Extract verifier results + verifier_result = trial.get("verifier_result", {}) + rewards = verifier_result.get("rewards", {}) + passed = rewards.get("reward", 0.0) > 0 + + # Extract agent metrics + agent_result = trial.get("agent_result", {}) eval_entry = { - "instance_id": task_name, + "instance_id": instance_id, "test_result": { + "trial_name": trial.get("trial_name"), + "trial_uri": trial.get("trial_uri"), "rewards": rewards, "passed": passed, }, "instruction": "", "error": None, "history": [], - "metrics": _extract_trial_metrics(result_file.parent), + "metrics": { + "total_prompt_tokens": agent_result.get("n_input_tokens") or 0, + "total_completion_tokens": ( + agent_result.get("n_output_tokens") or 0 + ), + "total_cost_usd": agent_result.get("cost_usd") 
or 0.0, + }, } results.append(eval_entry) - logger.info(f"Processed {task_name}: reward={rewards.get('reward', 'N/A')}") + logger.info( + f"Processed trial {instance_id}: reward={rewards.get('reward', 'N/A')}" + ) except (json.JSONDecodeError, OSError) as e: - logger.error(f"Failed to read {result_file}: {e}") + logger.error(f"Failed to process result file {result_file}: {e}") errors.append( { - "instance_id": f"benchflow/{task_dir.name}", + "instance_id": result_file.parent.name, "error": str(e), "test_result": {}, } ) if not results and not errors: - raise RuntimeError(f"No trials processed from {jobs_dir}") + raise RuntimeError(f"No trials processed from {harbor_output_dir}") if not results: logger.warning( - f"All {len(errors)} trials failed; writing error entries for reporting" + f"All {len(errors)} trials failed in {harbor_output_dir}; " + "writing error entries for downstream reporting" ) + # Write results to output.jsonl with open(eval_output_path, "w") as f: - for entry in results + errors: + for entry in results: + f.write(json.dumps(entry) + "\n") + for entry in errors: f.write(json.dumps(entry) + "\n") logger.info( @@ -518,18 +298,18 @@ def load_task_ids_from_file(filepath: str) -> list[str]: def main() -> None: """Main entry point for skillsbench inference.""" parser = argparse.ArgumentParser( - description="Run SkillsBench evaluation with benchflow and openhands", + description="Run SkillsBench evaluation with openhands-sdk via Harbor", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" Examples: # Run full skillsbench evaluation uv run skillsbench-infer .llm_config/claude.json - # Run specific tasks from a file + # Run specific tasks uv run skillsbench-infer .llm_config/claude.json --select tasks.txt - # Run with more concurrency - uv run skillsbench-infer .llm_config/claude.json --num-workers 4 + # Run with custom dataset version + uv run skillsbench-infer .llm_config/claude.json --dataset benchflow/skillsbench@1.0 """, ) @@ -542,7 +322,7 @@ def main() -> None: "--dataset", type=str, default=INFER_DEFAULTS["dataset"], - help="benchflow dataset name (e.g., benchflow/skillsbench)", + help="Harbor dataset name (e.g., benchflow/skillsbench)", ) parser.add_argument( "--output-dir", @@ -554,12 +334,12 @@ def main() -> None: "--num-workers", type=int, default=INFER_DEFAULTS["num_workers"], - help="Number of parallel workers (concurrency)", + help="Number of parallel workers", ) parser.add_argument( "--n-limit", type=int, - help="Maximum number of tasks to run", + help="Maximum number of dataset tasks to run after Harbor filtering", ) parser.add_argument( "--select", @@ -578,13 +358,14 @@ def main() -> None: help="Optional note for the evaluation run", ) parser.add_argument( - "--skip-run", + "--skip-harbor", action="store_true", - help="Skip running benchflow and only convert existing results", + help="Skip running harbor and only convert existing results", ) args = parser.parse_args() + # Validate LLM config if not os.path.isfile(args.llm_config_path): logger.error(f"LLM config file does not exist: {args.llm_config_path}") sys.exit(1) @@ -594,103 +375,87 @@ def main() -> None: llm = LLM.model_validate_json(llm_config) logger.info(f"Using LLM: {llm.model}") - if not args.skip_run and not check_benchflow_installed(): + # Check harbor installation + if not args.skip_harbor and not check_harbor_installed(): logger.error( - "benchflow CLI is not installed. Please install it:\n" - " uv tool install benchflow==0.3.0\n" + "Harbor CLI is not installed. 
Please install it:\n" + " pip install harbor\n" " # or\n" - " pip install benchflow==0.3.0\n" - " # or\n" - " uv pip install benchflow==0.3.0" + " uv pip install harbor" ) sys.exit(1) + # Construct output directory dataset_description = args.dataset.replace("/", "__").replace("@", "-") structured_output_dir = construct_eval_output_dir( base_dir=args.output_dir, dataset_name=dataset_description, model_name=llm.model, - max_iterations=100, + max_iterations=100, # Not directly used but required for path construction eval_note=args.note, ) logger.info(f"Output directory: {structured_output_dir}") os.makedirs(structured_output_dir, exist_ok=True) + # Save metadata metadata = { "llm": llm.model_dump_json(), "dataset": args.dataset, "timestamp": datetime.now(timezone.utc).isoformat(), - "benchflow_agent": BENCHFLOW_DEFAULTS["agent_name"], + "harbor_agent": HARBOR_DEFAULTS["agent_name"], "note": args.note, } metadata_path = Path(structured_output_dir) / "metadata.json" with open(metadata_path, "w") as f: json.dump(metadata, f, indent=2) + # Collect task IDs if specified task_ids: list[str] | None = None if args.select: - task_ids = load_task_ids_from_file(args.select) - logger.info(f"Loaded {len(task_ids)} task IDs from {args.select}") + loaded_ids = load_task_ids_from_file(args.select) + task_ids = loaded_ids + logger.info(f"Loaded {len(loaded_ids)} task IDs from {args.select}") elif args.task_id: task_ids = list(args.task_id) logger.info(f"Running {len(task_ids)} specified task IDs") - tasks_dir = Path(structured_output_dir) / "tasks" - jobs_dir = Path(structured_output_dir) / "jobs" output_path = Path(structured_output_dir) / OUTPUT_FILENAME - if not args.skip_run: + if not args.skip_harbor: + # Run harbor evaluation try: - ensure_tasks(args.dataset, tasks_dir, task_ids=task_ids) - - # Apply n_limit by slicing available task directories - effective_task_dirs = tasks_dir - if args.n_limit is not None or task_ids is not None: - all_dirs = sorted(d for d in tasks_dir.iterdir() if d.is_dir()) - if task_ids: - short_ids = {tid.split("/")[-1] for tid in task_ids} - all_dirs = [d for d in all_dirs if d.name in short_ids] - if args.n_limit is not None: - all_dirs = all_dirs[: args.n_limit] - - # Write a filtered tasks dir symlink tree - filtered_tasks_dir = Path(structured_output_dir) / "tasks_filtered" - filtered_tasks_dir.mkdir(exist_ok=True) - for d in all_dirs: - link = filtered_tasks_dir / d.name - if not link.exists(): - link.symlink_to(d.resolve()) - effective_task_dirs = filtered_tasks_dir - - run_benchflow_job( + harbor_output_dir = run_harbor_evaluation( llm=llm, - tasks_dir=effective_task_dirs, - jobs_dir=jobs_dir, + dataset=args.dataset, + output_dir=structured_output_dir, num_workers=args.num_workers, task_ids=task_ids, + n_limit=args.n_limit, ) - convert_benchflow_to_eval_output( - jobs_dir=jobs_dir, + # Convert harbor output to standard format + convert_harbor_to_eval_output( + harbor_output_dir=harbor_output_dir, eval_output_path=output_path, - task_ids=task_ids, ) except Exception as e: logger.error(f"Evaluation failed: {e}") sys.exit(1) else: - if jobs_dir.exists(): - convert_benchflow_to_eval_output( - jobs_dir=jobs_dir, + # Skip harbor, just convert existing results + harbor_output_dir = Path(structured_output_dir) / "harbor_output" + if harbor_output_dir.exists(): + convert_harbor_to_eval_output( + harbor_output_dir=harbor_output_dir, eval_output_path=output_path, - task_ids=task_ids, ) else: - logger.error(f"No jobs output found at {jobs_dir}") + logger.error(f"No harbor output 
found at {harbor_output_dir}") sys.exit(1) + # Generate cost report if output_path.exists(): generate_cost_report(str(output_path)) diff --git a/tests/test_skillsbench_eval_infer.py b/tests/test_skillsbench_eval_infer.py index 1334da297..56d54f27a 100644 --- a/tests/test_skillsbench_eval_infer.py +++ b/tests/test_skillsbench_eval_infer.py @@ -39,6 +39,23 @@ def test_resolved_instance(self, tmp_path: Path) -> None: assert result["unresolved_instances"] == 0 assert "benchflow/weighted-gdp-calc" in result["resolved_ids"] + def test_unresolved_instance(self, tmp_path: Path) -> None: + """Test processing an unresolved (passed=False) instance.""" + input_file = tmp_path / "unresolved.jsonl" + output_file = tmp_path / "unresolved.report.json" + + entry = { + "instance_id": "benchflow/task-1", + "test_result": {"passed": False, "rewards": {"reward": 0.0}}, + "error": None, + } + input_file.write_text(json.dumps(entry) + "\n") + + result = process_skillsbench_results(str(input_file), str(output_file)) + + assert result["resolved_instances"] == 0 + assert result["unresolved_instances"] == 1 + def test_instance_with_error(self, tmp_path: Path) -> None: """Test processing an instance that errored.""" input_file = tmp_path / "error.jsonl" diff --git a/tests/test_skillsbench_run_infer.py b/tests/test_skillsbench_run_infer.py index 784b4d1cc..5f8452cb3 100644 --- a/tests/test_skillsbench_run_infer.py +++ b/tests/test_skillsbench_run_infer.py @@ -4,102 +4,82 @@ from pathlib import Path import pytest -import yaml -from benchmarks.skillsbench.config import BENCHFLOW_DEFAULTS, INFER_DEFAULTS +from benchmarks.skillsbench.config import INFER_DEFAULTS from benchmarks.skillsbench.run_infer import ( - _build_benchflow_agent_env, - convert_benchflow_to_eval_output, - run_benchflow_job, + convert_harbor_to_eval_output, + run_harbor_evaluation, ) from openhands.sdk import LLM -class TestRunBenchflowJob: - """Tests for building benchflow job invocation arguments.""" +class TestRunHarborEvaluation: + """Tests for building Harbor invocation arguments.""" - def test_default_dataset_matches_benchflow_registry(self) -> None: - """Test that the default dataset name matches benchflow's published registry.""" + def test_default_dataset_matches_harbor_registry(self) -> None: + """Test that the default dataset name matches Harbor's published registry.""" assert INFER_DEFAULTS["dataset"] == "benchflow/skillsbench" - def test_default_agent_is_openhands(self) -> None: - """Test that the default agent is openhands.""" - assert BENCHFLOW_DEFAULTS["agent_name"] == "openhands" - - def test_run_benchflow_job_passes_model_and_concurrency( + def test_run_harbor_evaluation_passes_filters_and_limits( self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch ) -> None: - """Test benchflow job command writes the expected YAML config.""" - captured_cmd: list[str] = [] - captured_env: dict[str, str] = {} - captured_config: dict = {} - - # Force legacy benchflow binary path so the command format is deterministic - monkeypatch.setattr( - "benchmarks.skillsbench.run_infer.shutil.which", - lambda name: "/usr/local/bin/benchflow" if name == "benchflow" else None, - ) + """Test Harbor command includes task filters and n-limit.""" + captured: dict[str, list[str]] = {} def fake_run(cmd: list[str], capture_output: bool, text: bool, env: dict): - captured_cmd[:] = cmd - captured_env.clear() - captured_env.update(env) - with open(cmd[3]) as f: - captured_config.update(yaml.safe_load(f)) + captured["cmd"] = cmd return type( "Completed", (), - {"returncode": 
0, "stdout": "Score: 1/1 (100%)", "stderr": ""}, + {"returncode": 0, "stdout": "ok", "stderr": ""}, )() monkeypatch.setattr("benchmarks.skillsbench.run_infer.subprocess.run", fake_run) - tasks_dir = tmp_path / "tasks" - tasks_dir.mkdir() - jobs_dir = tmp_path / "jobs" - - run_benchflow_job( + harbor_output_dir = run_harbor_evaluation( llm=LLM( - model="anthropic/claude-sonnet-4-5", + model="litellm_proxy/test-model", api_key="test-key", base_url="https://proxy.example.com", ), - tasks_dir=tasks_dir, - jobs_dir=jobs_dir, - num_workers=4, - ) - - cmd = captured_cmd - assert cmd[0] == "/usr/local/bin/benchflow" - assert cmd[1] == "job" - assert cmd[2] == "--config" - assert captured_config["tasks_dir"] == str(tasks_dir) - assert captured_config["jobs_dir"] == str(jobs_dir.resolve()) - assert captured_config["agent"] == "openhands" - assert captured_config["model"] == "anthropic/claude-sonnet-4-5" - assert captured_config["concurrency"] == 4 - assert captured_config["sandbox_user"] is None - - def test_llm_credentials_passed_via_subprocess_env( + dataset=INFER_DEFAULTS["dataset"], + output_dir=str(tmp_path), + num_workers=2, + task_ids=["benchflow/task-a", "benchflow/task-b"], + n_limit=3, + ) + + expected_output_dir = tmp_path / "harbor_output" + assert harbor_output_dir == expected_output_dir + + cmd = captured["cmd"] + assert cmd[:8] == [ + "harbor", + "run", + "-d", + "benchflow/skillsbench", + "-a", + "openhands-sdk", + "-m", + "litellm_proxy/test-model", + ] + assert "--jobs-dir" in cmd + assert str(expected_output_dir.resolve()) in cmd + assert cmd.count("--include-task-name") == 2 + assert "benchflow/task-a" in cmd + assert "benchflow/task-b" in cmd + assert cmd[cmd.index("--n-concurrent") + 1] == "2" + assert cmd[cmd.index("--n-tasks") + 1] == "3" + + def test_llm_credentials_passed_via_env( self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch ) -> None: - """Test that LLM credentials are passed via subprocess env and YAML.""" - captured_cmd: list[str] = [] - captured_env: dict[str, str] = {} - captured_config: dict = {} - - # Force legacy benchflow binary path so the command format is deterministic - monkeypatch.setattr( - "benchmarks.skillsbench.run_infer.shutil.which", - lambda name: "/usr/local/bin/benchflow" if name == "benchflow" else None, - ) + """Test that LLM credentials are passed via subprocess env, not --ae flags.""" + captured: dict = {} def fake_run(cmd: list[str], capture_output: bool, text: bool, env: dict): - captured_cmd[:] = cmd - captured_env.clear() - captured_env.update(env) - with open(cmd[3]) as f: - captured_config.update(yaml.safe_load(f)) + captured["cmd"] = cmd + captured["env"] = env return type( "Completed", (), @@ -108,127 +88,60 @@ def fake_run(cmd: list[str], capture_output: bool, text: bool, env: dict): monkeypatch.setattr("benchmarks.skillsbench.run_infer.subprocess.run", fake_run) - tasks_dir = tmp_path / "tasks" - tasks_dir.mkdir() - - run_benchflow_job( + run_harbor_evaluation( llm=LLM( model="test-model", api_key="my-secret-key", base_url="https://my-proxy.example.com", ), - tasks_dir=tasks_dir, - jobs_dir=tmp_path / "jobs", - ) - - # Credentials in subprocess env - assert captured_env["LLM_API_KEY"] == "my-secret-key" - assert captured_env["LLM_BASE_URL"] == "https://my-proxy.example.com" - assert "--ae" not in captured_cmd - assert captured_config["agent_env"]["LLM_API_KEY"] == "my-secret-key" - assert ( - captured_config["agent_env"]["LLM_BASE_URL"] - == "https://my-proxy.example.com" - ) - - def 
test_direct_gemini_model_sets_provider_env_vars(self) -> None: - """Direct provider models need provider-specific env vars.""" - env = _build_benchflow_agent_env( - LLM( - model="gemini/gemini-3.1-flash-lite-preview", - api_key="gemini-test-key", - ) - ) - - assert env["LLM_API_KEY"] == "gemini-test-key" - assert env["GEMINI_API_KEY"] == "gemini-test-key" - assert env["GOOGLE_API_KEY"] == "gemini-test-key" - - def test_proxy_model_does_not_set_provider_env_vars(self) -> None: - """LiteLLM proxy configs should keep using generic LLM_* vars only.""" - env = _build_benchflow_agent_env( - LLM( - model="litellm_proxy/anthropic/claude-sonnet-4-20250514", - api_key="proxy-key", - base_url="https://proxy.example.com", - ) + dataset=INFER_DEFAULTS["dataset"], + output_dir=str(tmp_path), ) - assert env["LLM_API_KEY"] == "proxy-key" - assert env["LLM_BASE_URL"] == "https://proxy.example.com" - assert "ANTHROPIC_API_KEY" not in env - assert "ANTHROPIC_BASE_URL" not in env + assert captured["env"]["LLM_API_KEY"] == "my-secret-key" + assert captured["env"]["LLM_BASE_URL"] == "https://my-proxy.example.com" -class TestConvertBenchflowToEvalOutput: - """Tests for convert_benchflow_to_eval_output function.""" +class TestConvertHarborToEvalOutput: + """Tests for convert_harbor_to_eval_output function.""" - def _create_benchflow_structure( - self, tmp_path: Path, tasks: list[tuple[str, dict]] - ) -> Path: - """Create a mock benchflow jobs directory structure. - - benchflow writes: jobs_dir/TASK_NAME/trial-0/result.json - """ - jobs_dir = tmp_path / "jobs" - for task_name, result in tasks: - trial_dir = jobs_dir / task_name / "trial-0" - trial_dir.mkdir(parents=True) - (trial_dir / "result.json").write_text(json.dumps(result)) - return jobs_dir - - def _create_benchflow_timestamped_job( - self, tmp_path: Path, tasks: list[tuple[str, dict]] + def _create_harbor_structure( + self, tmp_path: Path, trials: list[tuple[str, dict]] ) -> Path: - """Create a mock benchflow 0.3.0 jobs directory structure. - - benchflow writes: jobs/TIMESTAMP/TASK_NAME__RUNID/result.json - """ - jobs_dir = tmp_path / "jobs" - job_dir = jobs_dir / "2026-04-21__23-12-35" + """Create a mock Harbor output structure.""" + harbor_dir = tmp_path / "harbor_output" + job_dir = harbor_dir / "2026-01-01__00-00-00" job_dir.mkdir(parents=True) - (jobs_dir / "summary.json").write_text(json.dumps({"total": len(tasks)})) - for task_name, result in tasks: - trial_dir = job_dir / f"{task_name}__abc123" - trial_dir.mkdir(parents=True) - (trial_dir / "result.json").write_text(json.dumps(result)) - return jobs_dir + (job_dir / "result.json").write_text(json.dumps({"id": "test-job"})) - def test_successful_trial_parsing(self, tmp_path: Path) -> None: - """Test successful parsing of a benchflow trial result. + for trial_name, trial_result in trials: + trial_dir = job_dir / trial_name + trial_dir.mkdir() + (trial_dir / "result.json").write_text(json.dumps(trial_result)) - benchflow 0.3.0 does not write cost/token fields to result.json. - Metrics are read from agent/trajectory.json (harbor-format agent) - or parsed from agent/openhands.txt (ACP agent stdout). 
- """ + return harbor_dir + + def test_successful_trial_parsing(self, tmp_path: Path) -> None: + """Test successful parsing of harbor trial result.""" trial_result = { "task_name": "benchflow/weighted-gdp-calc", - "rewards": {"reward": 1.0}, - "error": None, + "trial_name": "weighted-gdp-calc__abc123", + "trial_uri": "file:///path/to/trial", + "agent_result": { + "n_input_tokens": 1000, + "n_output_tokens": 200, + "cost_usd": 0.05, + }, + "verifier_result": {"rewards": {"reward": 1.0}}, + "exception_info": None, } - jobs_dir = self._create_benchflow_structure( - tmp_path, [("weighted-gdp-calc", trial_result)] - ) - # Write agent/trajectory.json with final_metrics (harbor-format agent output). - # agent/ sits next to result.json, inside the trial-0 subdirectory. - trial_dir = jobs_dir / "weighted-gdp-calc" / "trial-0" - agent_dir = trial_dir / "agent" - agent_dir.mkdir(parents=True, exist_ok=True) - (agent_dir / "trajectory.json").write_text( - json.dumps( - { - "final_metrics": { - "total_prompt_tokens": 1000, - "total_completion_tokens": 200, - "total_cost_usd": 0.05, - } - } - ) + harbor_dir = self._create_harbor_structure( + tmp_path, [("weighted-gdp-calc__abc123", trial_result)] ) output_file = tmp_path / "output.jsonl" - convert_benchflow_to_eval_output(jobs_dir, output_file) + convert_harbor_to_eval_output(harbor_dir, output_file) assert output_file.exists() with open(output_file) as f: @@ -238,52 +151,26 @@ def test_successful_trial_parsing(self, tmp_path: Path) -> None: assert entries[0]["instance_id"] == "benchflow/weighted-gdp-calc" assert entries[0]["test_result"]["passed"] is True assert entries[0]["metrics"]["total_cost_usd"] == 0.05 - assert entries[0]["metrics"]["total_prompt_tokens"] == 1000 - assert entries[0]["metrics"]["total_completion_tokens"] == 200 - - def test_metrics_from_acp_agent_log(self, tmp_path: Path) -> None: - """Test that metrics are extracted from agent/openhands.txt (ACP agent).""" - trial_result = { - "task_name": "benchflow/acp-task", - "rewards": {"reward": 1.0}, - "error": None, - } - jobs_dir = self._create_benchflow_timestamped_job( - tmp_path, [("acp-task", trial_result)] - ) - # Write agent/openhands.txt simulating openhands ACP stdout - trial_dir = jobs_dir / "2026-04-21__23-12-35" / "acp-task__abc123" - agent_dir = trial_dir / "agent" - agent_dir.mkdir(parents=True, exist_ok=True) - (agent_dir / "openhands.txt").write_text( - "OpenHands SDK v1.16.0\n" - "Tokens: ↑ input 404.21K • cache hit 70.47% • reasoning 579 • ↓ output 7.83K • $0.0487\n" - "Total cost: $0.0487\n" - ) - output_file = tmp_path / "output.jsonl" - convert_benchflow_to_eval_output(jobs_dir, output_file) - - with open(output_file) as f: - entries = [json.loads(line) for line in f] - - assert len(entries) == 1 - assert entries[0]["metrics"]["total_cost_usd"] == pytest.approx(0.0487) - assert entries[0]["metrics"]["total_prompt_tokens"] == 404210 - assert entries[0]["metrics"]["total_completion_tokens"] == 7830 def test_failed_trial(self, tmp_path: Path) -> None: """Test parsing of a trial with reward 0.""" trial_result = { "task_name": "benchflow/task-1", - "rewards": {"reward": 0.0}, - "error": None, + "trial_name": "task-1__xyz", + "agent_result": { + "n_input_tokens": None, + "n_output_tokens": None, + "cost_usd": None, + }, + "verifier_result": {"rewards": {"reward": 0.0}}, + "exception_info": None, } - jobs_dir = self._create_benchflow_structure( - tmp_path, [("task-1", trial_result)] + harbor_dir = self._create_harbor_structure( + tmp_path, [("task-1__xyz", trial_result)] ) 
output_file = tmp_path / "output.jsonl" - convert_benchflow_to_eval_output(jobs_dir, output_file) + convert_harbor_to_eval_output(harbor_dir, output_file) with open(output_file) as f: entries = [json.loads(line) for line in f] @@ -291,19 +178,21 @@ def test_failed_trial(self, tmp_path: Path) -> None: assert entries[0]["test_result"]["passed"] is False assert entries[0]["metrics"]["total_cost_usd"] == 0.0 - def test_trial_with_error(self, tmp_path: Path) -> None: - """Test that errored trials are written as error entries.""" + def test_trial_with_exception(self, tmp_path: Path) -> None: + """Test that exception trials are written as error entries.""" trial_result = { "task_name": "benchflow/error-task", - "rewards": {}, - "error": "LLM_API_KEY not set", + "trial_name": "error-task__err", + "agent_result": {}, + "verifier_result": {}, + "exception_info": {"type": "ValueError", "message": "LLM_API_KEY not set"}, } - jobs_dir = self._create_benchflow_structure( - tmp_path, [("error-task", trial_result)] + harbor_dir = self._create_harbor_structure( + tmp_path, [("error-task__err", trial_result)] ) output_file = tmp_path / "output.jsonl" - convert_benchflow_to_eval_output(jobs_dir, output_file) + convert_harbor_to_eval_output(harbor_dir, output_file) with open(output_file) as f: entries = [json.loads(line) for line in f] @@ -313,121 +202,20 @@ def test_trial_with_error(self, tmp_path: Path) -> None: assert entries[0]["error"] is not None assert entries[0]["test_result"] == {} - def test_missing_jobs_directory(self, tmp_path: Path) -> None: - """Test handling when jobs directory is empty.""" - jobs_dir = tmp_path / "jobs" - jobs_dir.mkdir() - - with pytest.raises(RuntimeError, match="No task directories found"): - convert_benchflow_to_eval_output(jobs_dir, tmp_path / "output.jsonl") - - def test_task_id_filtering(self, tmp_path: Path) -> None: - """Test that only specified task IDs are converted.""" - trials = [ - ( - "task-a", - { - "task_name": "benchflow/task-a", - "rewards": {"reward": 1.0}, - "error": None, - }, - ), - ( - "task-b", - { - "task_name": "benchflow/task-b", - "rewards": {"reward": 0.0}, - "error": None, - }, - ), - ] - jobs_dir = self._create_benchflow_structure(tmp_path, trials) - output_file = tmp_path / "output.jsonl" + def test_missing_job_directory(self, tmp_path: Path) -> None: + """Test handling when no job directory exists.""" + harbor_dir = tmp_path / "harbor_output" + harbor_dir.mkdir() - convert_benchflow_to_eval_output( - jobs_dir, output_file, task_ids=["benchflow/task-a"] - ) + with pytest.raises(RuntimeError, match="No harbor job directory found"): + convert_harbor_to_eval_output(harbor_dir, tmp_path / "output.jsonl") - with open(output_file) as f: - entries = [json.loads(line) for line in f] - - assert len(entries) == 1 - assert entries[0]["instance_id"] == "benchflow/task-a" - - def test_task_name_normalised_to_benchflow_prefix(self, tmp_path: Path) -> None: - """Test that task names without prefix get benchflow/ prepended.""" - trial_result = { - "task_name": "weighted-gdp-calc", # no benchflow/ prefix - "rewards": {"reward": 1.0}, - "error": None, - } - jobs_dir = self._create_benchflow_structure( - tmp_path, [("weighted-gdp-calc", trial_result)] - ) - output_file = tmp_path / "output.jsonl" - convert_benchflow_to_eval_output(jobs_dir, output_file) - - with open(output_file) as f: - entries = [json.loads(line) for line in f] - - assert entries[0]["instance_id"] == "benchflow/weighted-gdp-calc" - - def test_timestamped_job_directory_is_processed(self, 
tmp_path: Path) -> None: - """Test benchflow 0.3.0 timestamped jobs directory layout.""" - trial_result = { - "task_name": "weighted-gdp-calc", - "rewards": {"reward": 1.0}, - "error": None, - "n_input_tokens": 42, - "n_output_tokens": 7, - "cost_usd": 0.01, - } - - jobs_dir = self._create_benchflow_timestamped_job( - tmp_path, [("weighted-gdp-calc", trial_result)] - ) - output_file = tmp_path / "output.jsonl" - - convert_benchflow_to_eval_output(jobs_dir, output_file) - - with open(output_file) as f: - entries = [json.loads(line) for line in f] - - assert len(entries) == 1 - assert entries[0]["instance_id"] == "benchflow/weighted-gdp-calc" - assert entries[0]["test_result"]["passed"] is True - - def test_task_id_filter_matches_timestamped_trial_dir(self, tmp_path: Path) -> None: - """Test filtering strips the run suffix from trial directory names.""" - jobs_dir = self._create_benchflow_timestamped_job( - tmp_path, - [ - ( - "task-a", - { - "task_name": "task-a", - "rewards": {"reward": 1.0}, - "error": None, - }, - ), - ( - "task-b", - { - "task_name": "task-b", - "rewards": {"reward": 0.0}, - "error": None, - }, - ), - ], - ) - output_file = tmp_path / "output.jsonl" - - convert_benchflow_to_eval_output( - jobs_dir, output_file, task_ids=["benchflow/task-a"] - ) - - with open(output_file) as f: - entries = [json.loads(line) for line in f] + def test_empty_job_directory(self, tmp_path: Path) -> None: + """Test handling of harbor job dir with no trial subdirs.""" + harbor_dir = tmp_path / "harbor_output" + job_dir = harbor_dir / "2026-01-01__00-00-00" + job_dir.mkdir(parents=True) + (job_dir / "result.json").write_text(json.dumps({"id": "test"})) - assert len(entries) == 1 - assert entries[0]["instance_id"] == "benchflow/task-a" + with pytest.raises(RuntimeError, match="No trial result files found"): + convert_harbor_to_eval_output(harbor_dir, tmp_path / "output.jsonl") diff --git a/uv.lock b/uv.lock index 147abedc9..2cd0b3640 100644 --- a/uv.lock +++ b/uv.lock @@ -1282,7 +1282,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/44/69/9b804adb5fd0671f367781560eb5eb586c4d495277c93bde4307b9e28068/greenlet-3.2.4-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:3b67ca49f54cede0186854a008109d6ee71f66bd57bb36abd6d0a0267b540cdd", size = 274079, upload-time = "2025-08-07T13:15:45.033Z" }, { url = "https://files.pythonhosted.org/packages/46/e9/d2a80c99f19a153eff70bc451ab78615583b8dac0754cfb942223d2c1a0d/greenlet-3.2.4-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ddf9164e7a5b08e9d22511526865780a576f19ddd00d62f8a665949327fde8bb", size = 640997, upload-time = "2025-08-07T13:42:56.234Z" }, { url = "https://files.pythonhosted.org/packages/3b/16/035dcfcc48715ccd345f3a93183267167cdd162ad123cd93067d86f27ce4/greenlet-3.2.4-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:f28588772bb5fb869a8eb331374ec06f24a83a9c25bfa1f38b6993afe9c1e968", size = 655185, upload-time = "2025-08-07T13:45:27.624Z" }, - { url = "https://files.pythonhosted.org/packages/31/da/0386695eef69ffae1ad726881571dfe28b41970173947e7c558d9998de0f/greenlet-3.2.4-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:5c9320971821a7cb77cfab8d956fa8e39cd07ca44b6070db358ceb7f8797c8c9", size = 649926, upload-time = "2025-08-07T13:53:15.251Z" }, { url = "https://files.pythonhosted.org/packages/68/88/69bf19fd4dc19981928ceacbc5fd4bb6bc2215d53199e367832e98d1d8fe/greenlet-3.2.4-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = 
"sha256:c60a6d84229b271d44b70fb6e5fa23781abb5d742af7b808ae3f6efd7c9c60f6", size = 651839, upload-time = "2025-08-07T13:18:30.281Z" }, { url = "https://files.pythonhosted.org/packages/19/0d/6660d55f7373b2ff8152401a83e02084956da23ae58cddbfb0b330978fe9/greenlet-3.2.4-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3b3812d8d0c9579967815af437d96623f45c0f2ae5f04e366de62a12d83a8fb0", size = 607586, upload-time = "2025-08-07T13:18:28.544Z" }, { url = "https://files.pythonhosted.org/packages/8e/1a/c953fdedd22d81ee4629afbb38d2f9d71e37d23caace44775a3a969147d4/greenlet-3.2.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:abbf57b5a870d30c4675928c37278493044d7c14378350b3aa5d484fa65575f0", size = 1123281, upload-time = "2025-08-07T13:42:39.858Z" }, @@ -1293,7 +1292,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/49/e8/58c7f85958bda41dafea50497cbd59738c5c43dbbea5ee83d651234398f4/greenlet-3.2.4-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:1a921e542453fe531144e91e1feedf12e07351b1cf6c9e8a3325ea600a715a31", size = 272814, upload-time = "2025-08-07T13:15:50.011Z" }, { url = "https://files.pythonhosted.org/packages/62/dd/b9f59862e9e257a16e4e610480cfffd29e3fae018a68c2332090b53aac3d/greenlet-3.2.4-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cd3c8e693bff0fff6ba55f140bf390fa92c994083f838fece0f63be121334945", size = 641073, upload-time = "2025-08-07T13:42:57.23Z" }, { url = "https://files.pythonhosted.org/packages/f7/0b/bc13f787394920b23073ca3b6c4a7a21396301ed75a655bcb47196b50e6e/greenlet-3.2.4-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:710638eb93b1fa52823aa91bf75326f9ecdfd5e0466f00789246a5280f4ba0fc", size = 655191, upload-time = "2025-08-07T13:45:29.752Z" }, - { url = "https://files.pythonhosted.org/packages/f2/d6/6adde57d1345a8d0f14d31e4ab9c23cfe8e2cd39c3baf7674b4b0338d266/greenlet-3.2.4-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:c5111ccdc9c88f423426df3fd1811bfc40ed66264d35aa373420a34377efc98a", size = 649516, upload-time = "2025-08-07T13:53:16.314Z" }, { url = "https://files.pythonhosted.org/packages/7f/3b/3a3328a788d4a473889a2d403199932be55b1b0060f4ddd96ee7cdfcad10/greenlet-3.2.4-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:d76383238584e9711e20ebe14db6c88ddcedc1829a9ad31a584389463b5aa504", size = 652169, upload-time = "2025-08-07T13:18:32.861Z" }, { url = "https://files.pythonhosted.org/packages/ee/43/3cecdc0349359e1a527cbf2e3e28e5f8f06d3343aaf82ca13437a9aa290f/greenlet-3.2.4-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:23768528f2911bcd7e475210822ffb5254ed10d71f4028387e5a99b4c6699671", size = 610497, upload-time = "2025-08-07T13:18:31.636Z" }, { url = "https://files.pythonhosted.org/packages/b8/19/06b6cf5d604e2c382a6f31cafafd6f33d5dea706f4db7bdab184bad2b21d/greenlet-3.2.4-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:00fadb3fedccc447f517ee0d3fd8fe49eae949e1cd0f6a611818f4f6fb7dc83b", size = 1121662, upload-time = "2025-08-07T13:42:41.117Z" }, @@ -1304,7 +1302,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/22/5c/85273fd7cc388285632b0498dbbab97596e04b154933dfe0f3e68156c68c/greenlet-3.2.4-cp314-cp314-macosx_11_0_universal2.whl", hash = "sha256:49a30d5fda2507ae77be16479bdb62a660fa51b1eb4928b524975b3bde77b3c0", size = 273586, upload-time = "2025-08-07T13:16:08.004Z" }, { url = 
"https://files.pythonhosted.org/packages/d1/75/10aeeaa3da9332c2e761e4c50d4c3556c21113ee3f0afa2cf5769946f7a3/greenlet-3.2.4-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:299fd615cd8fc86267b47597123e3f43ad79c9d8a22bebdce535e53550763e2f", size = 686346, upload-time = "2025-08-07T13:42:59.944Z" }, { url = "https://files.pythonhosted.org/packages/c0/aa/687d6b12ffb505a4447567d1f3abea23bd20e73a5bed63871178e0831b7a/greenlet-3.2.4-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:c17b6b34111ea72fc5a4e4beec9711d2226285f0386ea83477cbb97c30a3f3a5", size = 699218, upload-time = "2025-08-07T13:45:30.969Z" }, - { url = "https://files.pythonhosted.org/packages/dc/8b/29aae55436521f1d6f8ff4e12fb676f3400de7fcf27fccd1d4d17fd8fecd/greenlet-3.2.4-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:b4a1870c51720687af7fa3e7cda6d08d801dae660f75a76f3845b642b4da6ee1", size = 694659, upload-time = "2025-08-07T13:53:17.759Z" }, { url = "https://files.pythonhosted.org/packages/92/2e/ea25914b1ebfde93b6fc4ff46d6864564fba59024e928bdc7de475affc25/greenlet-3.2.4-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:061dc4cf2c34852b052a8620d40f36324554bc192be474b9e9770e8c042fd735", size = 695355, upload-time = "2025-08-07T13:18:34.517Z" }, { url = "https://files.pythonhosted.org/packages/72/60/fc56c62046ec17f6b0d3060564562c64c862948c9d4bc8aa807cf5bd74f4/greenlet-3.2.4-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:44358b9bf66c8576a9f57a590d5f5d6e72fa4228b763d0e43fee6d3b06d3a337", size = 657512, upload-time = "2025-08-07T13:18:33.969Z" }, { url = "https://files.pythonhosted.org/packages/23/6e/74407aed965a4ab6ddd93a7ded3180b730d281c77b765788419484cdfeef/greenlet-3.2.4-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:2917bdf657f5859fbf3386b12d68ede4cf1f04c90c3a6bc1f013dd68a22e2269", size = 1612508, upload-time = "2025-11-04T12:42:23.427Z" }, @@ -1519,6 +1516,11 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/2a/39/e50c7c3a983047577ee07d2a9e53faf5a69493943ec3f6a384bdc792deb2/httpx-0.28.1-py3-none-any.whl", hash = "sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad", size = 73517, upload-time = "2024-12-06T15:37:21.509Z" }, ] +[package.optional-dependencies] +socks = [ + { name = "socksio" }, +] + [[package]] name = "httpx-sse" version = "0.4.2" @@ -1814,14 +1816,12 @@ wheels = [ [[package]] name = "litellm" -version = "1.80.10" +version = "1.83.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "aiohttp" }, { name = "click" }, { name = "fastuuid" }, - { name = "grpcio", version = "1.67.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14'" }, - { name = "grpcio", version = "1.76.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.14'" }, { name = "httpx" }, { name = "importlib-metadata" }, { name = "jinja2" }, @@ -1832,9 +1832,9 @@ dependencies = [ { name = "tiktoken" }, { name = "tokenizers" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/dd/44/0aaa7449e7c4aa05668ec03f1f68a01b1e476591071d9659a68db19371a2/litellm-1.80.10.tar.gz", hash = "sha256:4a4aff7558945c2f7e5c6523e67c1b5525a46b10b0e1ad6b8f847cb13b16779e", size = 12764777, upload-time = "2025-12-14T02:07:05.362Z" } +sdist = { url = "https://files.pythonhosted.org/packages/22/92/6ce9737554994ca8e536e5f4f6a87cc7c4774b656c9eb9add071caf7d54b/litellm-1.83.0.tar.gz", hash = 
"sha256:860bebc76c4bb27b4cf90b4a77acd66dba25aced37e3db98750de8a1766bfb7a", size = 17333062, upload-time = "2026-03-31T05:08:25.331Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/8c/a9/4814b6aa58f6705df2831eaadeb5bc8240684c8c9d5964245212f85049d1/litellm-1.80.10-py3-none-any.whl", hash = "sha256:9b3e561efaba0eb1291cb1555d3dcb7283cf7f3cb65aadbcdb42e2a8765898c8", size = 11264240, upload-time = "2025-12-14T02:07:02.414Z" }, + { url = "https://files.pythonhosted.org/packages/19/2c/a670cc050fcd6f45c6199eb99e259c73aea92edba8d5c2fc1b3686d36217/litellm-1.83.0-py3-none-any.whl", hash = "sha256:88c536d339248f3987571493015784671ba3f193a328e1ea6780dbebaa2094a8", size = 15610306, upload-time = "2026-03-31T05:08:21.987Z" }, ] [[package]] @@ -2402,7 +2402,7 @@ wheels = [ [[package]] name = "openhands-agent-server" -version = "1.16.0" +version = "1.16.1" source = { editable = "vendor/software-agent-sdk/openhands-agent-server" } dependencies = [ { name = "aiosqlite" }, @@ -2467,7 +2467,6 @@ dependencies = [ { name = "python-json-logger" }, { name = "requests" }, { name = "swebench" }, - { name = "swesmith" }, { name = "swt-bench" }, { name = "tenacity" }, { name = "toml" }, @@ -2522,7 +2521,6 @@ requires-dist = [ { name = "python-json-logger", specifier = ">=3.3.0" }, { name = "requests" }, { name = "swebench", specifier = "==4.1.0" }, - { name = "swesmith", specifier = ">=0.0.9" }, { name = "swt-bench", git = "https://github.com/logic-star-ai/swt-bench.git?rev=5fdcd446ff05e248ecfffc19d560a210699f71f8" }, { name = "tenacity", specifier = ">=9.1.2" }, { name = "toml" }, @@ -2546,7 +2544,7 @@ dev = [ [[package]] name = "openhands-sdk" -version = "1.16.0" +version = "1.16.1" source = { editable = "vendor/software-agent-sdk/openhands-sdk" } dependencies = [ { name = "agent-client-protocol" }, @@ -2554,7 +2552,7 @@ dependencies = [ { name = "fakeredis", extra = ["lua"] }, { name = "fastmcp" }, { name = "filelock" }, - { name = "httpx" }, + { name = "httpx", extra = ["socks"] }, { name = "litellm" }, { name = "lmnr" }, { name = "pydantic" }, @@ -2577,8 +2575,8 @@ requires-dist = [ { name = "fakeredis", extras = ["lua"], specifier = ">=2.32.1" }, { name = "fastmcp", specifier = ">=3.0.0" }, { name = "filelock", specifier = ">=3.20.1" }, - { name = "httpx", specifier = ">=0.27.0" }, - { name = "litellm", specifier = "==1.80.10" }, + { name = "httpx", extras = ["socks"], specifier = ">=0.27.0" }, + { name = "litellm", specifier = ">=1.82.6,!=1.82.7,!=1.82.8" }, { name = "lmnr", specifier = ">=0.7.24" }, { name = "pydantic", specifier = ">=2.12.5" }, { name = "python-frontmatter", specifier = ">=1.1.0" }, @@ -2590,7 +2588,7 @@ provides-extras = ["boto3"] [[package]] name = "openhands-tools" -version = "1.16.0" +version = "1.16.1" source = { editable = "vendor/software-agent-sdk/openhands-tools" } dependencies = [ { name = "bashlex" }, @@ -2619,7 +2617,7 @@ requires-dist = [ [[package]] name = "openhands-workspace" -version = "1.16.0" +version = "1.16.1" source = { editable = "vendor/software-agent-sdk/openhands-workspace" } dependencies = [ { name = "openhands-agent-server" }, @@ -6709,6 +6707,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e9/44/75a9c9421471a6c4805dbf2356f7c181a29c1879239abab1ea2cc8f38b40/sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2", size = 10235, upload-time = "2024-02-25T23:20:01.196Z" }, ] +[[package]] +name = "socksio" +version = "1.0.0" +source = { registry = "https://pypi.org/simple" } 
+sdist = { url = "https://files.pythonhosted.org/packages/f8/5c/48a7d9495be3d1c651198fd99dbb6ce190e2274d0f28b9051307bdec6b85/socksio-1.0.0.tar.gz", hash = "sha256:f88beb3da5b5c38b9890469de67d0cb0f9d494b78b106ca1845f96c10b91c4ac", size = 19055, upload-time = "2020-04-17T15:50:34.664Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/37/c3/6eeb6034408dac0fa653d126c9204ade96b819c936e136c5e8a6897eee9c/socksio-1.0.0-py3-none-any.whl", hash = "sha256:95dc1f15f9b34e8d7b16f06d74b8ccf48f609af32ab33c608d08761c5dcbb1f3", size = 12763, upload-time = "2020-04-17T15:50:31.878Z" }, +] + [[package]] name = "sortedcontainers" version = "2.4.0" @@ -6834,15 +6841,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/36/67/981d8b642ac3eac7c8a7b7832ff8b2fb74f96b28b5fcd9a8979879e5c46d/swebench-4.1.0-py3-none-any.whl", hash = "sha256:1243776f720047cc9e20a427f7a52b75c13a07abda6154fb60fe77f82ec8af57", size = 157231, upload-time = "2025-09-11T02:57:58.953Z" }, ] -[[package]] -name = "swesmith" -version = "0.0.9" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/07/97/e506b20fa59debc66e4660a86b0e98b45d32c87f23b994ad739e9c5d542a/swesmith-0.0.9.tar.gz", hash = "sha256:1726124ea43577853c6efb0a5a0db5fa3ce5c340e1bed479afa5bab85d8a69da", size = 214830, upload-time = "2026-02-27T01:06:13.455Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/80/2d/71b6ac5dadbe7199085de3815624775744d51b6c554efeeddfb12dc45ce1/swesmith-0.0.9-py3-none-any.whl", hash = "sha256:cbb98a52fc573b38032cde1179b6ce5f5862ce7c31d6931cfd5b8ad4969ce900", size = 275800, upload-time = "2026-02-27T01:06:11.864Z" }, -] - [[package]] name = "swt-bench" version = "1.0.1" diff --git a/vendor/software-agent-sdk b/vendor/software-agent-sdk index acd5adc96..3e0a3a091 160000 --- a/vendor/software-agent-sdk +++ b/vendor/software-agent-sdk @@ -1 +1 @@ -Subproject commit acd5adc965c08a0f815cf8e5f3166d1d090034d6 +Subproject commit 3e0a3a0915b369c7e2057c77722e98585855d30a From 908e8519d7156b416dc732ca33420082f7c308ff Mon Sep 17 00:00:00 2001 From: Chujun Tao Date: Thu, 23 Apr 2026 16:56:57 -0400 Subject: [PATCH 10/12] Update skillsbench dataset handling Co-authored-by: openhands --- benchmarks/skillsbench/README.md | 16 +- benchmarks/skillsbench/run_infer.py | 351 +++++++++++++++++++++++++--- tests/test_skillsbench_run_infer.py | 231 +++++++++++++++++- 3 files changed, 558 insertions(+), 40 deletions(-) diff --git a/benchmarks/skillsbench/README.md b/benchmarks/skillsbench/README.md index 60ff73652..c2c11928b 100644 --- a/benchmarks/skillsbench/README.md +++ b/benchmarks/skillsbench/README.md @@ -21,11 +21,12 @@ SkillsBench comprises tasks across 11 domains, evaluating the efficacy of Skills ## Prerequisites 1. **Install Harbor**: Harbor is the official harness for running SkillsBench. + This integration is currently validated against `harbor==0.1.33`. ```bash - pip install harbor + pip install harbor==0.1.33 # or - uv pip install harbor + uv pip install harbor==0.1.33 ``` 2. **Docker**: Harbor requires Docker to be installed and running. @@ -34,12 +35,18 @@ SkillsBench comprises tasks across 11 domains, evaluating the efficacy of Skills ## Usage +By default, `skillsbench-infer` keeps a local copy of `tasks/` from +`https://github.com/benchflow-ai/skillsbench` on the `main` branch under +`benchmarks/skillsbench/data/tasks`. 
It stores the synced upstream commit hash in +`benchmarks/skillsbench/data/source.json` and refreshes the local snapshot when the +upstream `main` commit changes. The only supported dataset sources are this synced +SkillsBench snapshot and Harbor registry ids matching `benchflow/skillsbench@...`. + ### Running Inference Run the SkillsBench evaluation using the OpenHands SDK agent: ```bash -# Run full evaluation uv run skillsbench-infer .llm_config/claude.json # Run specific tasks @@ -53,6 +60,9 @@ uv run skillsbench-infer .llm_config/claude.json --n-limit 5 # Run with multiple workers uv run skillsbench-infer .llm_config/claude.json --num-workers 4 + +# Run against a Harbor registry dataset instead of the synced GitHub tasks +uv run skillsbench-infer .llm_config/claude.json --dataset benchflow/skillsbench@1.0 ``` ### LLM Configuration diff --git a/benchmarks/skillsbench/run_infer.py b/benchmarks/skillsbench/run_infer.py index a8afa7281..c8e06eee5 100644 --- a/benchmarks/skillsbench/run_infer.py +++ b/benchmarks/skillsbench/run_infer.py @@ -11,8 +11,11 @@ import argparse import json import os +import re +import shutil import subprocess import sys +import tempfile from datetime import datetime, timezone from pathlib import Path @@ -29,13 +32,21 @@ # Output filename for results OUTPUT_FILENAME = "output.jsonl" +SKILLSBENCH_REPO_URL = "https://github.com/benchflow-ai/skillsbench.git" +SKILLSBENCH_REPO_BRANCH = "main" +DATASET_CACHE_DIR = Path(__file__).parent / "data" +TASKS_CACHE_DIR = DATASET_CACHE_DIR / "tasks" +TASKS_METADATA_PATH = DATASET_CACHE_DIR / "source.json" +REGISTRY_DATASET_PREFIX = "benchflow/skillsbench" +INSTANCE_ID_PREFIX = "benchflow" + def check_harbor_installed() -> bool: """Check if harbor CLI is installed and available.""" harbor_exe = HARBOR_DEFAULTS["harbor_executable"] try: result = subprocess.run( - [harbor_exe, "--version"], + [harbor_exe, "--help"], capture_output=True, text=True, timeout=10, @@ -45,9 +56,252 @@ def check_harbor_installed() -> bool: return False +def _run_command(cmd: list[str], error_message: str) -> str: + """Run a subprocess command and return stdout.""" + result = subprocess.run( + cmd, + capture_output=True, + text=True, + ) + if result.returncode != 0: + stderr = result.stderr.strip() or result.stdout.strip() + raise RuntimeError(f"{error_message}: {stderr}") + return result.stdout.strip() + + +def _get_supported_task_filter_flag(harbor_exe: str) -> str: + """Detect whether Harbor expects --task-name or --include-task-name.""" + try: + result = subprocess.run( + [harbor_exe, "run", "--help"], + capture_output=True, + text=True, + ) + except FileNotFoundError: + return "--include-task-name" + + help_text = f"{result.stdout}\n{result.stderr}" + supported_flags = set(re.findall(r"(? 
str: + """Detect whether Harbor exposes the OpenHands agent as openhands or openhands-sdk.""" + try: + result = subprocess.run( + [harbor_exe, "run", "--help"], + capture_output=True, + text=True, + ) + except FileNotFoundError: + return HARBOR_DEFAULTS["agent_name"] + + help_text = f"{result.stdout}\n{result.stderr}" + compact_help_text = re.sub(r"[^a-z0-9-]+", "", help_text.lower()) + if "openhands-sdk" in compact_help_text: + return "openhands-sdk" + if "openhands" in compact_help_text: + return "openhands" + return HARBOR_DEFAULTS["agent_name"] + + +def get_skillsbench_main_commit( + repo_url: str = SKILLSBENCH_REPO_URL, + branch: str = SKILLSBENCH_REPO_BRANCH, +) -> str: + """Resolve the latest commit hash for the upstream SkillsBench branch.""" + stdout = _run_command( + ["git", "ls-remote", repo_url, f"refs/heads/{branch}"], + "Failed to resolve SkillsBench upstream commit", + ) + commit_hash, _, ref = stdout.partition("\t") + if not commit_hash or ref != f"refs/heads/{branch}": + raise RuntimeError( + f"Unexpected git ls-remote output for {repo_url} {branch}: {stdout}" + ) + return commit_hash + + +def _load_cached_commit(metadata_path: Path = TASKS_METADATA_PATH) -> str | None: + """Load the cached upstream commit hash for the local task snapshot.""" + if not metadata_path.is_file(): + return None + + try: + with open(metadata_path, encoding="utf-8") as f: + metadata = json.load(f) + except (OSError, json.JSONDecodeError) as e: + logger.warning( + "Ignoring unreadable SkillsBench dataset metadata at %s: %s", + metadata_path, + e, + ) + return None + + commit_hash = metadata.get("commit_hash") + return commit_hash if isinstance(commit_hash, str) and commit_hash else None + + +def download_skillsbench_tasks( + commit_hash: str, + tasks_dir: Path = TASKS_CACHE_DIR, + metadata_path: Path = TASKS_METADATA_PATH, + repo_url: str = SKILLSBENCH_REPO_URL, + branch: str = SKILLSBENCH_REPO_BRANCH, +) -> None: + """Download only the SkillsBench tasks directory for a specific commit.""" + data_dir = tasks_dir.parent + data_dir.mkdir(parents=True, exist_ok=True) + + logger.info( + "Downloading SkillsBench tasks from %s@%s into %s", + repo_url, + commit_hash, + tasks_dir, + ) + + with tempfile.TemporaryDirectory(dir=data_dir) as temp_dir: + clone_dir = Path(temp_dir) / "skillsbench" + _run_command( + [ + "git", + "clone", + "--depth", + "1", + "--branch", + branch, + "--filter=blob:none", + "--sparse", + repo_url, + str(clone_dir), + ], + "Failed to clone SkillsBench repository", + ) + _run_command( + ["git", "-C", str(clone_dir), "sparse-checkout", "set", "tasks"], + "Failed to sparsely checkout SkillsBench tasks", + ) + checked_out_commit = _run_command( + ["git", "-C", str(clone_dir), "rev-parse", "HEAD"], + "Failed to read cloned SkillsBench commit", + ) + if checked_out_commit != commit_hash: + raise RuntimeError( + "Cloned SkillsBench commit does not match upstream HEAD: " + f"expected {commit_hash}, got {checked_out_commit}" + ) + + source_tasks_dir = clone_dir / "tasks" + if not source_tasks_dir.is_dir(): + raise RuntimeError( + f"SkillsBench clone at {clone_dir} does not contain a tasks/ directory" + ) + + if tasks_dir.exists(): + shutil.rmtree(tasks_dir) + shutil.copytree(source_tasks_dir, tasks_dir) + + metadata = { + "repo_url": repo_url, + "branch": branch, + "commit_hash": commit_hash, + "synced_at": datetime.now(timezone.utc).isoformat(), + } + with open(metadata_path, "w", encoding="utf-8") as f: + json.dump(metadata, f, indent=2) + + +def ensure_skillsbench_tasks( + tasks_dir: 
Path = TASKS_CACHE_DIR, + metadata_path: Path = TASKS_METADATA_PATH, + repo_url: str = SKILLSBENCH_REPO_URL, + branch: str = SKILLSBENCH_REPO_BRANCH, +) -> Path: + """Ensure a local SkillsBench task snapshot exists and matches upstream HEAD.""" + cached_commit = _load_cached_commit(metadata_path) + has_cached_tasks = tasks_dir.is_dir() and any(tasks_dir.iterdir()) + + try: + upstream_commit = get_skillsbench_main_commit(repo_url=repo_url, branch=branch) + except RuntimeError as e: + if has_cached_tasks and cached_commit: + logger.warning( + "Failed to check SkillsBench upstream HEAD; using cached tasks from " + "%s (%s): %s", + tasks_dir, + cached_commit, + e, + ) + return tasks_dir + raise + + if has_cached_tasks and cached_commit == upstream_commit: + logger.info( + "Using cached SkillsBench tasks at %s (commit %s)", + tasks_dir, + upstream_commit, + ) + return tasks_dir + + if has_cached_tasks: + logger.info( + "Refreshing SkillsBench tasks in %s from commit %s to %s", + tasks_dir, + cached_commit or "", + upstream_commit, + ) + else: + logger.info("No cached SkillsBench tasks found at %s; downloading", tasks_dir) + + download_skillsbench_tasks( + commit_hash=upstream_commit, + tasks_dir=tasks_dir, + metadata_path=metadata_path, + repo_url=repo_url, + branch=branch, + ) + return tasks_dir + + +def resolve_skillsbench_dataset(dataset: str) -> tuple[str, bool]: + """Resolve the dataset argument to a synced local snapshot or registry id.""" + if dataset == INFER_DEFAULTS["dataset"]: + local_tasks_dir = ensure_skillsbench_tasks() + return str(local_tasks_dir.resolve()), True + if dataset == REGISTRY_DATASET_PREFIX or dataset.startswith( + f"{REGISTRY_DATASET_PREFIX}@" + ): + return dataset, False + raise ValueError( + "Unsupported SkillsBench dataset source. Use the default synced " + "SkillsBench snapshot or a Harbor registry id matching " + "'benchflow/skillsbench@'." + ) + + +def _normalize_task_filter_value(task_id: str, *, dataset_is_path: bool) -> str: + """Normalize task filter values for Harbor's local-path dataset handling.""" + if dataset_is_path: + return task_id.rsplit("/", 1)[-1] + return task_id + + +def _canonicalize_instance_id(task_name: str) -> str: + """Normalize SkillsBench task names to stable benchflow/ ids.""" + if "/" in task_name: + return task_name + return f"{INSTANCE_ID_PREFIX}/{task_name}" + + def run_harbor_evaluation( llm: LLM, dataset: str, + *, + dataset_is_path: bool, output_dir: str, num_workers: int = 1, task_ids: list[str] | None = None, @@ -57,7 +311,8 @@ def run_harbor_evaluation( Args: llm: LLM configuration for the agent. - dataset: Harbor dataset name (e.g., benchflow/skillsbench). + dataset: Synced SkillsBench task snapshot path or Harbor registry id. + dataset_is_path: Whether ``dataset`` should be passed via ``--path``. output_dir: Directory to store output files. num_workers: Number of parallel workers. task_ids: Optional list of specific task IDs to run. @@ -69,16 +324,18 @@ def run_harbor_evaluation( harbor_output_dir = Path(output_dir) / "harbor_output" harbor_output_dir.mkdir(parents=True, exist_ok=True) harbor_exe = HARBOR_DEFAULTS["harbor_executable"] + agent_name = _get_supported_agent_name(harbor_exe) + task_filter_flag = _get_supported_task_filter_flag(harbor_exe) # Build harbor command using harbor CLI flags. # Use absolute path for --jobs-dir to avoid CWD-relative path issues. 
cmd = [ harbor_exe, "run", - "-d", + "--path" if dataset_is_path else "-d", dataset, "-a", - HARBOR_DEFAULTS["agent_name"], + agent_name, "-m", llm.model, "--jobs-dir", @@ -87,21 +344,17 @@ def run_harbor_evaluation( str(num_workers), ] - # Pass LLM credentials as agent environment variables - if llm.api_key: - api_key = ( - llm.api_key.get_secret_value() - if isinstance(llm.api_key, SecretStr) - else llm.api_key - ) - cmd.extend(["--ae", f"LLM_API_KEY={api_key}"]) - if llm.base_url: - cmd.extend(["--ae", f"LLM_BASE_URL={llm.base_url}"]) - # Add specific task names if provided if task_ids: for task_id in task_ids: - cmd.extend(["--include-task-name", task_id]) + cmd.extend( + [ + task_filter_flag, + _normalize_task_filter_value( + task_id, dataset_is_path=dataset_is_path + ), + ] + ) if n_limit is not None: cmd.extend(["--n-tasks", str(n_limit)]) @@ -131,10 +384,31 @@ def run_harbor_evaluation( ) if result.returncode != 0: - logger.error(f"Harbor command failed with code {result.returncode}") - logger.error(f"stdout: {result.stdout}") - logger.error(f"stderr: {result.stderr}") - raise RuntimeError(f"Harbor evaluation failed: {result.stderr}") + if ( + task_ids + and task_filter_flag == "--task-name" + and "No such option: --task-name" in result.stderr + ): + fallback_cmd = [ + "--include-task-name" if part == "--task-name" else part + for part in cmd + ] + logger.warning( + "Harbor does not support --task-name; retrying with " + "--include-task-name" + ) + result = subprocess.run( + fallback_cmd, + capture_output=True, + text=True, + env=env, + ) + + if result.returncode != 0: + logger.error(f"Harbor command failed with code {result.returncode}") + logger.error(f"stdout: {result.stdout}") + logger.error(f"stderr: {result.stderr}") + raise RuntimeError(f"Harbor evaluation failed: {result.stderr}") logger.info("Harbor evaluation completed successfully") logger.info(f"stdout: {result.stdout}") @@ -207,7 +481,9 @@ def convert_harbor_to_eval_output( with open(result_file) as f: trial = json.load(f) - instance_id = trial.get("task_name", result_file.parent.name) + instance_id = _canonicalize_instance_id( + trial.get("task_name", result_file.parent.name) + ) # Check for exceptions if trial.get("exception_info"): @@ -256,7 +532,7 @@ def convert_harbor_to_eval_output( logger.error(f"Failed to process result file {result_file}: {e}") errors.append( { - "instance_id": result_file.parent.name, + "instance_id": _canonicalize_instance_id(result_file.parent.name), "error": str(e), "test_result": {}, } @@ -302,13 +578,14 @@ def main() -> None: formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" Examples: - # Run full skillsbench evaluation + # Run full skillsbench evaluation using a local tasks/ snapshot synced from + # https://github.com/benchflow-ai/skillsbench main uv run skillsbench-infer .llm_config/claude.json # Run specific tasks uv run skillsbench-infer .llm_config/claude.json --select tasks.txt - # Run with custom dataset version + # Run against a Harbor registry dataset instead of the synced GitHub tasks uv run skillsbench-infer .llm_config/claude.json --dataset benchflow/skillsbench@1.0 """, ) @@ -322,7 +599,11 @@ def main() -> None: "--dataset", type=str, default=INFER_DEFAULTS["dataset"], - help="Harbor dataset name (e.g., benchflow/skillsbench)", + help=( + "SkillsBench dataset source. The default value syncs tasks/ from the " + "benchflow-ai/skillsbench main branch. You can also pass a Harbor " + "registry id like benchflow/skillsbench@1.0." 
+ ), ) parser.add_argument( "--output-dir", @@ -385,6 +666,20 @@ def main() -> None: ) sys.exit(1) + resolved_dataset = args.dataset + dataset_is_path = False + dataset_commit_hash: str | None = None + if not args.skip_harbor: + try: + resolved_dataset, dataset_is_path = resolve_skillsbench_dataset( + args.dataset + ) + except ValueError as e: + logger.error(str(e)) + sys.exit(1) + if dataset_is_path and args.dataset == INFER_DEFAULTS["dataset"]: + dataset_commit_hash = _load_cached_commit() + # Construct output directory dataset_description = args.dataset.replace("/", "__").replace("@", "-") structured_output_dir = construct_eval_output_dir( @@ -402,6 +697,9 @@ def main() -> None: metadata = { "llm": llm.model_dump_json(), "dataset": args.dataset, + "resolved_dataset": resolved_dataset, + "dataset_is_path": dataset_is_path, + "dataset_commit_hash": dataset_commit_hash, "timestamp": datetime.now(timezone.utc).isoformat(), "harbor_agent": HARBOR_DEFAULTS["agent_name"], "note": args.note, @@ -427,7 +725,8 @@ def main() -> None: try: harbor_output_dir = run_harbor_evaluation( llm=llm, - dataset=args.dataset, + dataset=resolved_dataset, + dataset_is_path=dataset_is_path, output_dir=structured_output_dir, num_workers=args.num_workers, task_ids=task_ids, diff --git a/tests/test_skillsbench_run_infer.py b/tests/test_skillsbench_run_infer.py index 5f8452cb3..0632a6a46 100644 --- a/tests/test_skillsbench_run_infer.py +++ b/tests/test_skillsbench_run_infer.py @@ -8,22 +8,135 @@ from benchmarks.skillsbench.config import INFER_DEFAULTS from benchmarks.skillsbench.run_infer import ( convert_harbor_to_eval_output, + ensure_skillsbench_tasks, + resolve_skillsbench_dataset, run_harbor_evaluation, ) from openhands.sdk import LLM +class TestDatasetSync: + """Tests for syncing the local SkillsBench task snapshot.""" + + def test_ensure_skillsbench_tasks_reuses_matching_cache( + self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch + ) -> None: + """Test that an up-to-date cached tasks directory is reused.""" + tasks_dir = tmp_path / "tasks" + tasks_dir.mkdir() + (tasks_dir / "task-a").mkdir() + metadata_path = tmp_path / "source.json" + metadata_path.write_text(json.dumps({"commit_hash": "abc123"})) + + monkeypatch.setattr( + "benchmarks.skillsbench.run_infer.get_skillsbench_main_commit", + lambda repo_url, branch: "abc123", + ) + + called = False + + def fake_download(**kwargs) -> None: + nonlocal called + called = True + + monkeypatch.setattr( + "benchmarks.skillsbench.run_infer.download_skillsbench_tasks", + fake_download, + ) + + resolved = ensure_skillsbench_tasks( + tasks_dir=tasks_dir, + metadata_path=metadata_path, + ) + + assert resolved == tasks_dir + assert called is False + + def test_ensure_skillsbench_tasks_refreshes_stale_cache( + self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch + ) -> None: + """Test that a stale cached commit triggers a redownload.""" + tasks_dir = tmp_path / "tasks" + tasks_dir.mkdir() + metadata_path = tmp_path / "source.json" + metadata_path.write_text(json.dumps({"commit_hash": "old-commit"})) + + monkeypatch.setattr( + "benchmarks.skillsbench.run_infer.get_skillsbench_main_commit", + lambda repo_url, branch: "new-commit", + ) + + captured: dict[str, str] = {} + + def fake_download( + *, + commit_hash: str, + tasks_dir: Path, + metadata_path: Path, + repo_url: str, + branch: str, + ) -> None: + captured["commit_hash"] = commit_hash + captured["tasks_dir"] = str(tasks_dir) + captured["metadata_path"] = str(metadata_path) + tasks_dir.mkdir(exist_ok=True) + + 
monkeypatch.setattr( + "benchmarks.skillsbench.run_infer.download_skillsbench_tasks", + fake_download, + ) + + ensure_skillsbench_tasks( + tasks_dir=tasks_dir, + metadata_path=metadata_path, + ) + + assert captured["commit_hash"] == "new-commit" + assert captured["tasks_dir"] == str(tasks_dir) + assert captured["metadata_path"] == str(metadata_path) + + def test_ensure_skillsbench_tasks_uses_cache_if_remote_check_fails( + self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch + ) -> None: + """Test that a usable cache is kept when the upstream HEAD check fails.""" + tasks_dir = tmp_path / "tasks" + tasks_dir.mkdir() + (tasks_dir / "task-a").mkdir() + metadata_path = tmp_path / "source.json" + metadata_path.write_text(json.dumps({"commit_hash": "cached-commit"})) + + def fake_head(repo_url: str, branch: str) -> str: + raise RuntimeError("network unavailable") + + monkeypatch.setattr( + "benchmarks.skillsbench.run_infer.get_skillsbench_main_commit", + fake_head, + ) + + resolved = ensure_skillsbench_tasks( + tasks_dir=tasks_dir, + metadata_path=metadata_path, + ) + + assert resolved == tasks_dir + + def test_resolve_skillsbench_dataset_preserves_remote_registry_ids(self) -> None: + """Test that explicit Harbor dataset ids are passed through unchanged.""" + resolved_dataset, dataset_is_path = resolve_skillsbench_dataset( + "benchflow/skillsbench@1.0" + ) + + assert resolved_dataset == "benchflow/skillsbench@1.0" + assert dataset_is_path is False + + class TestRunHarborEvaluation: """Tests for building Harbor invocation arguments.""" - def test_default_dataset_matches_harbor_registry(self) -> None: - """Test that the default dataset name matches Harbor's published registry.""" - assert INFER_DEFAULTS["dataset"] == "benchflow/skillsbench" - def test_run_harbor_evaluation_passes_filters_and_limits( self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch ) -> None: - """Test Harbor command includes task filters and n-limit.""" + """Test Harbor command normalizes local task ids and includes main flags.""" captured: dict[str, list[str]] = {} def fake_run(cmd: list[str], capture_output: bool, text: bool, env: dict): @@ -35,6 +148,14 @@ def fake_run(cmd: list[str], capture_output: bool, text: bool, env: dict): )() monkeypatch.setattr("benchmarks.skillsbench.run_infer.subprocess.run", fake_run) + monkeypatch.setattr( + "benchmarks.skillsbench.run_infer._get_supported_task_filter_flag", + lambda harbor_exe: "--include-task-name", + ) + monkeypatch.setattr( + "benchmarks.skillsbench.run_infer._get_supported_agent_name", + lambda harbor_exe: "openhands", + ) harbor_output_dir = run_harbor_evaluation( llm=LLM( @@ -42,7 +163,8 @@ def fake_run(cmd: list[str], capture_output: bool, text: bool, env: dict): api_key="test-key", base_url="https://proxy.example.com", ), - dataset=INFER_DEFAULTS["dataset"], + dataset=str(tmp_path / "tasks"), + dataset_is_path=True, output_dir=str(tmp_path), num_workers=2, task_ids=["benchflow/task-a", "benchflow/task-b"], @@ -56,21 +178,69 @@ def fake_run(cmd: list[str], capture_output: bool, text: bool, env: dict): assert cmd[:8] == [ "harbor", "run", - "-d", - "benchflow/skillsbench", + "--path", + str(tmp_path / "tasks"), "-a", - "openhands-sdk", + "openhands", "-m", "litellm_proxy/test-model", ] assert "--jobs-dir" in cmd assert str(expected_output_dir.resolve()) in cmd assert cmd.count("--include-task-name") == 2 - assert "benchflow/task-a" in cmd - assert "benchflow/task-b" in cmd + assert "task-a" in cmd + assert "task-b" in cmd + assert "benchflow/task-a" not in cmd + assert 
"--ae" not in cmd assert cmd[cmd.index("--n-concurrent") + 1] == "2" assert cmd[cmd.index("--n-tasks") + 1] == "3" + def test_run_harbor_evaluation_retries_with_legacy_task_flag( + self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch + ) -> None: + """Test Harbor falls back to --include-task-name when --task-name fails.""" + captured_cmds: list[list[str]] = [] + + def fake_run(cmd: list[str], capture_output: bool, text: bool, env: dict): + captured_cmds.append(cmd) + if "--task-name" in cmd: + return type( + "Completed", + (), + { + "returncode": 2, + "stdout": "", + "stderr": "No such option: --task-name", + }, + )() + return type( + "Completed", + (), + {"returncode": 0, "stdout": "ok", "stderr": ""}, + )() + + monkeypatch.setattr("benchmarks.skillsbench.run_infer.subprocess.run", fake_run) + monkeypatch.setattr( + "benchmarks.skillsbench.run_infer._get_supported_task_filter_flag", + lambda harbor_exe: "--task-name", + ) + monkeypatch.setattr( + "benchmarks.skillsbench.run_infer._get_supported_agent_name", + lambda harbor_exe: "openhands", + ) + + run_harbor_evaluation( + llm=LLM(model="test-model"), + dataset=str(tmp_path / "tasks"), + dataset_is_path=True, + output_dir=str(tmp_path), + task_ids=["benchflow/task-a"], + ) + + assert len(captured_cmds) == 2 + assert "--task-name" in captured_cmds[0] + assert "--include-task-name" in captured_cmds[1] + def test_llm_credentials_passed_via_env( self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch ) -> None: @@ -87,6 +257,14 @@ def fake_run(cmd: list[str], capture_output: bool, text: bool, env: dict): )() monkeypatch.setattr("benchmarks.skillsbench.run_infer.subprocess.run", fake_run) + monkeypatch.setattr( + "benchmarks.skillsbench.run_infer._get_supported_task_filter_flag", + lambda harbor_exe: "--include-task-name", + ) + monkeypatch.setattr( + "benchmarks.skillsbench.run_infer._get_supported_agent_name", + lambda harbor_exe: "openhands", + ) run_harbor_evaluation( llm=LLM( @@ -95,11 +273,13 @@ def fake_run(cmd: list[str], capture_output: bool, text: bool, env: dict): base_url="https://my-proxy.example.com", ), dataset=INFER_DEFAULTS["dataset"], + dataset_is_path=False, output_dir=str(tmp_path), ) assert captured["env"]["LLM_API_KEY"] == "my-secret-key" assert captured["env"]["LLM_BASE_URL"] == "https://my-proxy.example.com" + assert "--ae" not in captured["cmd"] class TestConvertHarborToEvalOutput: @@ -152,6 +332,35 @@ def test_successful_trial_parsing(self, tmp_path: Path) -> None: assert entries[0]["test_result"]["passed"] is True assert entries[0]["metrics"]["total_cost_usd"] == 0.05 + def test_local_trial_names_are_normalized_to_canonical_instance_ids( + self, tmp_path: Path + ) -> None: + """Test local Harbor task names without namespace keep benchflow ids.""" + trial_result = { + "task_name": "weighted-gdp-calc", + "trial_name": "weighted-gdp-calc__abc123", + "trial_uri": "file:///path/to/trial", + "agent_result": { + "n_input_tokens": 1000, + "n_output_tokens": 200, + "cost_usd": 0.05, + }, + "verifier_result": {"rewards": {"reward": 1.0}}, + "exception_info": None, + } + + harbor_dir = self._create_harbor_structure( + tmp_path, [("weighted-gdp-calc__abc123", trial_result)] + ) + output_file = tmp_path / "output.jsonl" + + convert_harbor_to_eval_output(harbor_dir, output_file) + + with open(output_file) as f: + entries = [json.loads(line) for line in f] + + assert entries[0]["instance_id"] == "benchflow/weighted-gdp-calc" + def test_failed_trial(self, tmp_path: Path) -> None: """Test parsing of a trial with reward 0.""" 
trial_result = { From c1a62a28fac0c999c06408580ba4046c76fb00e5 Mon Sep 17 00:00:00 2001 From: Chujun Tao Date: Tue, 28 Apr 2026 22:35:41 -0400 Subject: [PATCH 11/12] fix: benchflow dataset loading --- benchmarks/skillsbench/README.md | 7 ++++--- benchmarks/skillsbench/run_infer.py | 28 ++++++++++++++++++---------- tests/test_skillsbench_run_infer.py | 18 ++++++++++++++---- uv.lock | 14 ++++++++++++++ 4 files changed, 50 insertions(+), 17 deletions(-) diff --git a/benchmarks/skillsbench/README.md b/benchmarks/skillsbench/README.md index c2c11928b..382c51f02 100644 --- a/benchmarks/skillsbench/README.md +++ b/benchmarks/skillsbench/README.md @@ -39,8 +39,9 @@ By default, `skillsbench-infer` keeps a local copy of `tasks/` from `https://github.com/benchflow-ai/skillsbench` on the `main` branch under `benchmarks/skillsbench/data/tasks`. It stores the synced upstream commit hash in `benchmarks/skillsbench/data/source.json` and refreshes the local snapshot when the -upstream `main` commit changes. The only supported dataset sources are this synced -SkillsBench snapshot and Harbor registry ids matching `benchflow/skillsbench@...`. +upstream `main` commit changes. Dataset aliases matching +`benchflow/skillsbench@...` resolve to this same local Harbor task dataset because +SkillsBench is not yet published in the public Harbor registry. ### Running Inference @@ -61,7 +62,7 @@ uv run skillsbench-infer .llm_config/claude.json --n-limit 5 # Run with multiple workers uv run skillsbench-infer .llm_config/claude.json --num-workers 4 -# Run against a Harbor registry dataset instead of the synced GitHub tasks +# Versioned SkillsBench aliases also resolve to the synced local dataset uv run skillsbench-infer .llm_config/claude.json --dataset benchflow/skillsbench@1.0 ``` diff --git a/benchmarks/skillsbench/run_infer.py b/benchmarks/skillsbench/run_infer.py index c8e06eee5..9e8c8496a 100644 --- a/benchmarks/skillsbench/run_infer.py +++ b/benchmarks/skillsbench/run_infer.py @@ -268,17 +268,22 @@ def ensure_skillsbench_tasks( def resolve_skillsbench_dataset(dataset: str) -> tuple[str, bool]: - """Resolve the dataset argument to a synced local snapshot or registry id.""" - if dataset == INFER_DEFAULTS["dataset"]: - local_tasks_dir = ensure_skillsbench_tasks() - return str(local_tasks_dir.resolve()), True + """Resolve the dataset argument to a synced local SkillsBench snapshot. + + Harbor 0.5.x validates ``--dataset`` values against the registry before + starting a job. SkillsBench is not yet published in the public registry, so + ``benchflow/skillsbench`` and versioned aliases like + ``benchflow/skillsbench@1.0`` must be resolved to the locally synced Harbor + task dataset generated by the SkillsBench adapter. + """ if dataset == REGISTRY_DATASET_PREFIX or dataset.startswith( f"{REGISTRY_DATASET_PREFIX}@" ): - return dataset, False + local_tasks_dir = ensure_skillsbench_tasks() + return str(local_tasks_dir.resolve()), True raise ValueError( "Unsupported SkillsBench dataset source. Use the default synced " - "SkillsBench snapshot or a Harbor registry id matching " + "SkillsBench snapshot or a SkillsBench dataset alias matching " "'benchflow/skillsbench@'." 
) @@ -579,13 +584,14 @@ def main() -> None: epilog=""" Examples: # Run full skillsbench evaluation using a local tasks/ snapshot synced from - # https://github.com/benchflow-ai/skillsbench main + # https://github.com/benchflow-ai/skillsbench main (adapter-generated + # Harbor tasks stored under benchmarks/skillsbench/data/tasks) uv run skillsbench-infer .llm_config/claude.json # Run specific tasks uv run skillsbench-infer .llm_config/claude.json --select tasks.txt - # Run against a Harbor registry dataset instead of the synced GitHub tasks + # Versioned SkillsBench aliases also resolve to the synced local dataset uv run skillsbench-infer .llm_config/claude.json --dataset benchflow/skillsbench@1.0 """, ) @@ -601,8 +607,10 @@ def main() -> None: default=INFER_DEFAULTS["dataset"], help=( "SkillsBench dataset source. The default value syncs tasks/ from the " - "benchflow-ai/skillsbench main branch. You can also pass a Harbor " - "registry id like benchflow/skillsbench@1.0." + "benchflow-ai/skillsbench main branch. Versioned aliases like " + "benchflow/skillsbench@1.0 also resolve to the same local Harbor " + "dataset because SkillsBench is not published in the public Harbor " + "registry yet." ), ) parser.add_argument( diff --git a/tests/test_skillsbench_run_infer.py b/tests/test_skillsbench_run_infer.py index 0632a6a46..ae97989e8 100644 --- a/tests/test_skillsbench_run_infer.py +++ b/tests/test_skillsbench_run_infer.py @@ -120,14 +120,24 @@ def fake_head(repo_url: str, branch: str) -> str: assert resolved == tasks_dir - def test_resolve_skillsbench_dataset_preserves_remote_registry_ids(self) -> None: - """Test that explicit Harbor dataset ids are passed through unchanged.""" + def test_resolve_skillsbench_dataset_maps_aliases_to_local_snapshot( + self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch + ) -> None: + """Test SkillsBench dataset aliases resolve to the local Harbor dataset.""" + tasks_dir = tmp_path / "tasks" + tasks_dir.mkdir() + + monkeypatch.setattr( + "benchmarks.skillsbench.run_infer.ensure_skillsbench_tasks", + lambda: tasks_dir, + ) + resolved_dataset, dataset_is_path = resolve_skillsbench_dataset( "benchflow/skillsbench@1.0" ) - assert resolved_dataset == "benchflow/skillsbench@1.0" - assert dataset_is_path is False + assert resolved_dataset == str(tasks_dir.resolve()) + assert dataset_is_path is True class TestRunHarborEvaluation: diff --git a/uv.lock b/uv.lock index 2cd0b3640..ec4350755 100644 --- a/uv.lock +++ b/uv.lock @@ -1282,6 +1282,7 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/44/69/9b804adb5fd0671f367781560eb5eb586c4d495277c93bde4307b9e28068/greenlet-3.2.4-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:3b67ca49f54cede0186854a008109d6ee71f66bd57bb36abd6d0a0267b540cdd", size = 274079, upload-time = "2025-08-07T13:15:45.033Z" }, { url = "https://files.pythonhosted.org/packages/46/e9/d2a80c99f19a153eff70bc451ab78615583b8dac0754cfb942223d2c1a0d/greenlet-3.2.4-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ddf9164e7a5b08e9d22511526865780a576f19ddd00d62f8a665949327fde8bb", size = 640997, upload-time = "2025-08-07T13:42:56.234Z" }, { url = "https://files.pythonhosted.org/packages/3b/16/035dcfcc48715ccd345f3a93183267167cdd162ad123cd93067d86f27ce4/greenlet-3.2.4-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:f28588772bb5fb869a8eb331374ec06f24a83a9c25bfa1f38b6993afe9c1e968", size = 655185, upload-time = "2025-08-07T13:45:27.624Z" }, + { url = 
"https://files.pythonhosted.org/packages/31/da/0386695eef69ffae1ad726881571dfe28b41970173947e7c558d9998de0f/greenlet-3.2.4-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:5c9320971821a7cb77cfab8d956fa8e39cd07ca44b6070db358ceb7f8797c8c9", size = 649926, upload-time = "2025-08-07T13:53:15.251Z" }, { url = "https://files.pythonhosted.org/packages/68/88/69bf19fd4dc19981928ceacbc5fd4bb6bc2215d53199e367832e98d1d8fe/greenlet-3.2.4-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:c60a6d84229b271d44b70fb6e5fa23781abb5d742af7b808ae3f6efd7c9c60f6", size = 651839, upload-time = "2025-08-07T13:18:30.281Z" }, { url = "https://files.pythonhosted.org/packages/19/0d/6660d55f7373b2ff8152401a83e02084956da23ae58cddbfb0b330978fe9/greenlet-3.2.4-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3b3812d8d0c9579967815af437d96623f45c0f2ae5f04e366de62a12d83a8fb0", size = 607586, upload-time = "2025-08-07T13:18:28.544Z" }, { url = "https://files.pythonhosted.org/packages/8e/1a/c953fdedd22d81ee4629afbb38d2f9d71e37d23caace44775a3a969147d4/greenlet-3.2.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:abbf57b5a870d30c4675928c37278493044d7c14378350b3aa5d484fa65575f0", size = 1123281, upload-time = "2025-08-07T13:42:39.858Z" }, @@ -1292,6 +1293,7 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/49/e8/58c7f85958bda41dafea50497cbd59738c5c43dbbea5ee83d651234398f4/greenlet-3.2.4-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:1a921e542453fe531144e91e1feedf12e07351b1cf6c9e8a3325ea600a715a31", size = 272814, upload-time = "2025-08-07T13:15:50.011Z" }, { url = "https://files.pythonhosted.org/packages/62/dd/b9f59862e9e257a16e4e610480cfffd29e3fae018a68c2332090b53aac3d/greenlet-3.2.4-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cd3c8e693bff0fff6ba55f140bf390fa92c994083f838fece0f63be121334945", size = 641073, upload-time = "2025-08-07T13:42:57.23Z" }, { url = "https://files.pythonhosted.org/packages/f7/0b/bc13f787394920b23073ca3b6c4a7a21396301ed75a655bcb47196b50e6e/greenlet-3.2.4-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:710638eb93b1fa52823aa91bf75326f9ecdfd5e0466f00789246a5280f4ba0fc", size = 655191, upload-time = "2025-08-07T13:45:29.752Z" }, + { url = "https://files.pythonhosted.org/packages/f2/d6/6adde57d1345a8d0f14d31e4ab9c23cfe8e2cd39c3baf7674b4b0338d266/greenlet-3.2.4-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:c5111ccdc9c88f423426df3fd1811bfc40ed66264d35aa373420a34377efc98a", size = 649516, upload-time = "2025-08-07T13:53:16.314Z" }, { url = "https://files.pythonhosted.org/packages/7f/3b/3a3328a788d4a473889a2d403199932be55b1b0060f4ddd96ee7cdfcad10/greenlet-3.2.4-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:d76383238584e9711e20ebe14db6c88ddcedc1829a9ad31a584389463b5aa504", size = 652169, upload-time = "2025-08-07T13:18:32.861Z" }, { url = "https://files.pythonhosted.org/packages/ee/43/3cecdc0349359e1a527cbf2e3e28e5f8f06d3343aaf82ca13437a9aa290f/greenlet-3.2.4-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:23768528f2911bcd7e475210822ffb5254ed10d71f4028387e5a99b4c6699671", size = 610497, upload-time = "2025-08-07T13:18:31.636Z" }, { url = "https://files.pythonhosted.org/packages/b8/19/06b6cf5d604e2c382a6f31cafafd6f33d5dea706f4db7bdab184bad2b21d/greenlet-3.2.4-cp313-cp313-musllinux_1_1_aarch64.whl", hash = 
"sha256:00fadb3fedccc447f517ee0d3fd8fe49eae949e1cd0f6a611818f4f6fb7dc83b", size = 1121662, upload-time = "2025-08-07T13:42:41.117Z" }, @@ -1302,6 +1304,7 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/22/5c/85273fd7cc388285632b0498dbbab97596e04b154933dfe0f3e68156c68c/greenlet-3.2.4-cp314-cp314-macosx_11_0_universal2.whl", hash = "sha256:49a30d5fda2507ae77be16479bdb62a660fa51b1eb4928b524975b3bde77b3c0", size = 273586, upload-time = "2025-08-07T13:16:08.004Z" }, { url = "https://files.pythonhosted.org/packages/d1/75/10aeeaa3da9332c2e761e4c50d4c3556c21113ee3f0afa2cf5769946f7a3/greenlet-3.2.4-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:299fd615cd8fc86267b47597123e3f43ad79c9d8a22bebdce535e53550763e2f", size = 686346, upload-time = "2025-08-07T13:42:59.944Z" }, { url = "https://files.pythonhosted.org/packages/c0/aa/687d6b12ffb505a4447567d1f3abea23bd20e73a5bed63871178e0831b7a/greenlet-3.2.4-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:c17b6b34111ea72fc5a4e4beec9711d2226285f0386ea83477cbb97c30a3f3a5", size = 699218, upload-time = "2025-08-07T13:45:30.969Z" }, + { url = "https://files.pythonhosted.org/packages/dc/8b/29aae55436521f1d6f8ff4e12fb676f3400de7fcf27fccd1d4d17fd8fecd/greenlet-3.2.4-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:b4a1870c51720687af7fa3e7cda6d08d801dae660f75a76f3845b642b4da6ee1", size = 694659, upload-time = "2025-08-07T13:53:17.759Z" }, { url = "https://files.pythonhosted.org/packages/92/2e/ea25914b1ebfde93b6fc4ff46d6864564fba59024e928bdc7de475affc25/greenlet-3.2.4-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:061dc4cf2c34852b052a8620d40f36324554bc192be474b9e9770e8c042fd735", size = 695355, upload-time = "2025-08-07T13:18:34.517Z" }, { url = "https://files.pythonhosted.org/packages/72/60/fc56c62046ec17f6b0d3060564562c64c862948c9d4bc8aa807cf5bd74f4/greenlet-3.2.4-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:44358b9bf66c8576a9f57a590d5f5d6e72fa4228b763d0e43fee6d3b06d3a337", size = 657512, upload-time = "2025-08-07T13:18:33.969Z" }, { url = "https://files.pythonhosted.org/packages/23/6e/74407aed965a4ab6ddd93a7ded3180b730d281c77b765788419484cdfeef/greenlet-3.2.4-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:2917bdf657f5859fbf3386b12d68ede4cf1f04c90c3a6bc1f013dd68a22e2269", size = 1612508, upload-time = "2025-11-04T12:42:23.427Z" }, @@ -2467,6 +2470,7 @@ dependencies = [ { name = "python-json-logger" }, { name = "requests" }, { name = "swebench" }, + { name = "swesmith" }, { name = "swt-bench" }, { name = "tenacity" }, { name = "toml" }, @@ -2521,6 +2525,7 @@ requires-dist = [ { name = "python-json-logger", specifier = ">=3.3.0" }, { name = "requests" }, { name = "swebench", specifier = "==4.1.0" }, + { name = "swesmith", specifier = ">=0.0.9" }, { name = "swt-bench", git = "https://github.com/logic-star-ai/swt-bench.git?rev=5fdcd446ff05e248ecfffc19d560a210699f71f8" }, { name = "tenacity", specifier = ">=9.1.2" }, { name = "toml" }, @@ -6841,6 +6846,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/36/67/981d8b642ac3eac7c8a7b7832ff8b2fb74f96b28b5fcd9a8979879e5c46d/swebench-4.1.0-py3-none-any.whl", hash = "sha256:1243776f720047cc9e20a427f7a52b75c13a07abda6154fb60fe77f82ec8af57", size = 157231, upload-time = "2025-09-11T02:57:58.953Z" }, ] +[[package]] +name = "swesmith" +version = "0.0.9" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/07/97/e506b20fa59debc66e4660a86b0e98b45d32c87f23b994ad739e9c5d542a/swesmith-0.0.9.tar.gz", hash = "sha256:1726124ea43577853c6efb0a5a0db5fa3ce5c340e1bed479afa5bab85d8a69da", size = 214830, upload-time = "2026-02-27T01:06:13.455Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/80/2d/71b6ac5dadbe7199085de3815624775744d51b6c554efeeddfb12dc45ce1/swesmith-0.0.9-py3-none-any.whl", hash = "sha256:cbb98a52fc573b38032cde1179b6ce5f5862ce7c31d6931cfd5b8ad4969ce900", size = 275800, upload-time = "2026-02-27T01:06:11.864Z" }, +] + [[package]] name = "swt-bench" version = "1.0.1" From 87c3bd3f4b29224c45f7126e1d9b11612483ce2a Mon Sep 17 00:00:00 2001 From: Chujun Tao Date: Wed, 29 Apr 2026 10:47:31 -0400 Subject: [PATCH 12/12] enhance: skill loading and readme update --- benchmarks/skillsbench/README.md | 43 ++++++++++- benchmarks/skillsbench/run_infer.py | 110 ++++++++++++++++++++++++++++ 2 files changed, 152 insertions(+), 1 deletion(-) diff --git a/benchmarks/skillsbench/README.md b/benchmarks/skillsbench/README.md index 382c51f02..a67829193 100644 --- a/benchmarks/skillsbench/README.md +++ b/benchmarks/skillsbench/README.md @@ -31,7 +31,14 @@ SkillsBench comprises tasks across 11 domains, evaluating the efficacy of Skills 2. **Docker**: Harbor requires Docker to be installed and running. -3. **LLM API Key**: Configure your LLM provider credentials. +3. **Modal Credentials**: Some tasks (e.g., `mhc-implementation`, `diff-transformer`) run workloads on [Modal](https://modal.com) and require a Modal token. Set the following environment variables before running: + + ```bash + export MODAL_TOKEN_ID=your_token_id + export MODAL_TOKEN_SECRET=your_token_secret + ``` + +4. **LLM API Key**: Configure your LLM provider credentials. ## Usage @@ -64,8 +71,42 @@ uv run skillsbench-infer .llm_config/claude.json --num-workers 4 # Versioned SkillsBench aliases also resolve to the synced local dataset uv run skillsbench-infer .llm_config/claude.json --dataset benchflow/skillsbench@1.0 + +# Run with agent skill definitions injected into task environments +uv run skillsbench-infer .llm_config/claude.json --with-skills + +# Combine task selection with skills injection +uv run skillsbench-infer .llm_config/claude.json --task-id benchflow/weighted-gdp-calc --with-skills +uv run skillsbench-infer .llm_config/claude.json --select tasks.txt --with-skills +uv run skillsbench-infer .llm_config/claude.json --n-limit 5 --with-skills +``` + +### Skills Injection (`--with-skills`) + +The `--with-skills` flag injects agent skill definitions into the Docker environment of each evaluated task. When enabled, the following `COPY` instructions are added to each task's Dockerfile before building: + +```dockerfile +# Claude Code +COPY skills /root/.claude/skills +# Claude Code (Harbor compatibility) +COPY skills /etc/claude-code/.claude/skills +# Codex +COPY skills /root/.codex/skills +# OpenCode +COPY skills /root/.opencode/skill +# Goose +COPY skills /root/.goose/skills +# Factory +COPY skills /root/.factory/skills +# Portable agents format (Goose, Amp) +COPY skills /root/.agents/skills ``` +This makes any skills bundled in the task's `environment/skills/` directory available to the agent at the standard skill lookup paths for each supported agent framework. + +- Dockerfiles are automatically restored to their original content after Harbor finishes, regardless of success or failure. +- The `with_skills` flag is recorded in `metadata.json` alongside each evaluation run. 
+ ### LLM Configuration Create an LLM configuration file (e.g., `.llm_config/claude.json`): diff --git a/benchmarks/skillsbench/run_infer.py b/benchmarks/skillsbench/run_infer.py index 9e8c8496a..535a27d62 100644 --- a/benchmarks/skillsbench/run_infer.py +++ b/benchmarks/skillsbench/run_infer.py @@ -40,6 +40,25 @@ REGISTRY_DATASET_PREFIX = "benchflow/skillsbench" INSTANCE_ID_PREFIX = "benchflow" +# Skills COPY block injected into Dockerfiles when --with-skills is set. +# RUN mkdir -p lines ensure parent directories exist before COPY. +SKILLS_COPY_BLOCK = """\ +# Claude Code +COPY skills /root/.claude/skills +# Claude Code (Harbor compatibility) +COPY skills /etc/claude-code/.claude/skills +# Codex +COPY skills /root/.codex/skills +# OpenCode +COPY skills /root/.opencode/skill +# Goose +COPY skills /root/.goose/skills +# Factory +COPY skills /root/.factory/skills +# Portable agents format (Goose, Amp) +COPY skills /root/.agents/skills +""" + def check_harbor_installed() -> bool: """Check if harbor CLI is installed and available.""" @@ -302,6 +321,66 @@ def _canonicalize_instance_id(task_name: str) -> str: return f"{INSTANCE_ID_PREFIX}/{task_name}" +def get_target_dockerfiles( + tasks_dir: Path, + task_ids: list[str] | None, +) -> list[Path]: + """Return Dockerfile paths for the selected tasks (or all tasks if none specified).""" + if task_ids: + names = [tid.rsplit("/", 1)[-1] for tid in task_ids] + candidates = [tasks_dir / name / "environment" / "Dockerfile" for name in names] + else: + candidates = list(tasks_dir.glob("*/environment/Dockerfile")) + + found = [p for p in candidates if p.is_file()] + missing = [p for p in candidates if not p.is_file()] + for p in missing: + logger.warning("Dockerfile not found (skipping skills injection): %s", p) + return found + + +def inject_skills_into_dockerfiles( + dockerfiles: list[Path], +) -> list[tuple[Path, str]]: + """Inject SKILLS_COPY_BLOCK into Dockerfiles that don't already contain it. + + Returns a list of (path, original_content) for every file that was modified, + so callers can revert with revert_dockerfiles(). + """ + reverts: list[tuple[Path, str]] = [] + for dockerfile in dockerfiles: + original = dockerfile.read_text(encoding="utf-8") + if "COPY skills" in original: + logger.debug("Skills already present in %s, skipping injection", dockerfile) + continue + + # Insert the block after the last WORKDIR directive, or at end of file. 
+ lines = original.splitlines(keepends=True) + insert_at = len(lines) + for i, line in enumerate(lines): + if line.strip().upper().startswith("WORKDIR"): + insert_at = i + 1 + + injected_lines = ( + lines[:insert_at] + ["\n", SKILLS_COPY_BLOCK] + lines[insert_at:] + ) + dockerfile.write_text("".join(injected_lines), encoding="utf-8") + reverts.append((dockerfile, original)) + logger.info("Injected skills COPY block into %s", dockerfile) + + return reverts + + +def revert_dockerfiles(reverts: list[tuple[Path, str]]) -> None: + """Restore Dockerfiles to their original content after skills injection.""" + for dockerfile, original in reverts: + try: + dockerfile.write_text(original, encoding="utf-8") + logger.info("Reverted %s", dockerfile) + except OSError as e: + logger.error("Failed to revert %s: %s", dockerfile, e) + + def run_harbor_evaluation( llm: LLM, dataset: str, @@ -651,6 +730,17 @@ def main() -> None: action="store_true", help="Skip running harbor and only convert existing results", ) + parser.add_argument( + "--with-skills", + action="store_true", + default=False, + help=( + "Inject agent skill definitions into the selected task Dockerfiles before " + "running evaluation. Adds COPY instructions for Claude Code, Codex, " + "OpenCode, Goose, Factory, and portable-agents skill directories. " + "Dockerfiles are restored to their original state after Harbor completes." + ), + ) args = parser.parse_args() @@ -711,6 +801,7 @@ def main() -> None: "timestamp": datetime.now(timezone.utc).isoformat(), "harbor_agent": HARBOR_DEFAULTS["agent_name"], "note": args.note, + "with_skills": args.with_skills, } metadata_path = Path(structured_output_dir) / "metadata.json" with open(metadata_path, "w") as f: @@ -729,6 +820,18 @@ def main() -> None: output_path = Path(structured_output_dir) / OUTPUT_FILENAME if not args.skip_harbor: + # Optionally inject skill definitions into task Dockerfiles + dockerfile_reverts: list[tuple[Path, str]] = [] + if args.with_skills and dataset_is_path: + target_dockerfiles = get_target_dockerfiles( + tasks_dir=Path(resolved_dataset), + task_ids=task_ids, + ) + dockerfile_reverts = inject_skills_into_dockerfiles(target_dockerfiles) + logger.info( + "Injected skills into %d Dockerfile(s)", len(dockerfile_reverts) + ) + # Run harbor evaluation try: harbor_output_dir = run_harbor_evaluation( @@ -750,6 +853,13 @@ def main() -> None: except Exception as e: logger.error(f"Evaluation failed: {e}") sys.exit(1) + finally: + if dockerfile_reverts: + revert_dockerfiles(dockerfile_reverts) + logger.info( + "Reverted %d Dockerfile(s) after evaluation", + len(dockerfile_reverts), + ) else: # Skip harbor, just convert existing results harbor_output_dir = Path(structured_output_dir) / "harbor_output"