diff --git a/benchmarks/skillsbench/README.md b/benchmarks/skillsbench/README.md
new file mode 100644
index 00000000..a6782919
--- /dev/null
+++ b/benchmarks/skillsbench/README.md
@@ -0,0 +1,215 @@
+# SkillsBench Evaluation
+
+This module provides integration with [SkillsBench](https://www.skillsbench.ai/), a benchmark for evaluating AI agents on real-world skill-based tasks. The integration uses [Harbor](https://harborframework.com) as the evaluation harness with the `openhands-sdk` agent.
+
+## Overview
+
+SkillsBench comprises tasks across 11 domains and evaluates the efficacy of Skills augmentation in LLM-based agents. The domains are:
+
+- Software engineering
+- Office & white collar
+- Natural science
+- Media & content production
+- Cybersecurity
+- Finance
+- Robotics
+- Manufacturing
+- Energy
+- Mathematics
+- Healthcare
+
+## Prerequisites
+
+1. **Install Harbor**: Harbor is the official harness for running SkillsBench.
+   This integration is currently validated against `harbor==0.1.33`.
+
+   ```bash
+   pip install harbor==0.1.33
+   # or
+   uv pip install harbor==0.1.33
+   ```
+
+2. **Docker**: Harbor requires Docker to be installed and running.
+
+3. **Modal Credentials**: Some tasks (e.g., `mhc-implementation`, `diff-transformer`) run workloads on [Modal](https://modal.com) and require a Modal token. Set the following environment variables before running:
+
+   ```bash
+   export MODAL_TOKEN_ID=your_token_id
+   export MODAL_TOKEN_SECRET=your_token_secret
+   ```
+
+4. **LLM API Key**: Configure your LLM provider credentials.
+
+## Usage
+
+By default, `skillsbench-infer` keeps a local copy of `tasks/` from
+`https://github.com/benchflow-ai/skillsbench` on the `main` branch under
+`benchmarks/skillsbench/data/tasks`. It stores the synced upstream commit hash in
+`benchmarks/skillsbench/data/source.json` and refreshes the local snapshot when the
+upstream `main` commit changes. Dataset aliases matching
+`benchflow/skillsbench@...` resolve to this same local Harbor task dataset because
+SkillsBench is not yet published in the public Harbor registry.
+
+### Running Inference
+
+Run the SkillsBench evaluation using the OpenHands SDK agent:
+
+```bash
+uv run skillsbench-infer .llm_config/claude.json
+
+# Run specific tasks
+uv run skillsbench-infer .llm_config/claude.json --task-id benchflow/weighted-gdp-calc
+
+# Run tasks from a file
+uv run skillsbench-infer .llm_config/claude.json --select tasks.txt
+
+# Limit the run to 5 tasks (useful for smoke tests)
+uv run skillsbench-infer .llm_config/claude.json --n-limit 5
+
+# Run with multiple workers
+uv run skillsbench-infer .llm_config/claude.json --num-workers 4
+
+# Versioned SkillsBench aliases also resolve to the synced local dataset
+uv run skillsbench-infer .llm_config/claude.json --dataset benchflow/skillsbench@1.0
+
+# Run with agent skill definitions injected into task environments
+uv run skillsbench-infer .llm_config/claude.json --with-skills
+
+# Combine task selection with skills injection
+uv run skillsbench-infer .llm_config/claude.json --task-id benchflow/weighted-gdp-calc --with-skills
+uv run skillsbench-infer .llm_config/claude.json --select tasks.txt --with-skills
+uv run skillsbench-infer .llm_config/claude.json --n-limit 5 --with-skills
+```
+
+### Skills Injection (`--with-skills`)
+
+The `--with-skills` flag injects agent skill definitions into the Docker environment of each evaluated task. 
When enabled, the following `COPY` instructions are added to each task's Dockerfile before building: + +```dockerfile +# Claude Code +COPY skills /root/.claude/skills +# Claude Code (Harbor compatibility) +COPY skills /etc/claude-code/.claude/skills +# Codex +COPY skills /root/.codex/skills +# OpenCode +COPY skills /root/.opencode/skill +# Goose +COPY skills /root/.goose/skills +# Factory +COPY skills /root/.factory/skills +# Portable agents format (Goose, Amp) +COPY skills /root/.agents/skills +``` + +This makes any skills bundled in the task's `environment/skills/` directory available to the agent at the standard skill lookup paths for each supported agent framework. + +- Dockerfiles are automatically restored to their original content after Harbor finishes, regardless of success or failure. +- The `with_skills` flag is recorded in `metadata.json` alongside each evaluation run. + +### LLM Configuration + +Create an LLM configuration file (e.g., `.llm_config/claude.json`): + +```json +{ + "model": "anthropic/claude-sonnet-4-20250514", + "api_key": "YOUR_API_KEY" +} +``` + +Or use a LiteLLM proxy: + +```json +{ + "model": "litellm_proxy/anthropic/claude-sonnet-4-20250514", + "base_url": "https://your-proxy.example.com", + "api_key": "YOUR_API_KEY" +} +``` + +### Evaluating Results + +After running inference, evaluate the results: + +```bash +uv run skillsbench-eval ./evaluation_outputs/.../output.jsonl +``` + +This generates a report file (`output.report.json`) with: +- Total/completed/resolved instance counts +- Success rate +- Aggregate metrics (cost, tokens) + +## Output Format + +### Inference Output (`output.jsonl`) + +Each line contains: + +```json +{ + "instance_id": "benchflow/task-name", + "test_result": { + "trial_name": "...", + "trial_uri": "...", + "rewards": {"reward": 1.0}, + "passed": true + }, + "instruction": "", + "error": null, + "history": [], + "metrics": { + "total_prompt_tokens": 5000, + "total_completion_tokens": 1000, + "total_cost_usd": 0.05 + } +} +``` + +### Evaluation Report (`output.report.json`) + +```json +{ + "total_instances": 100, + "completed_instances": 95, + "resolved_instances": 80, + "unresolved_instances": 15, + "error_instances": 5, + "aggregate_metrics": { + "total_cost_usd": 5.25, + "total_prompt_tokens": 500000, + "total_completion_tokens": 100000 + } +} +``` + +## Architecture + +The integration follows the Harbor agent adapter pattern: + +1. **Harbor Harness**: Manages task containers and lifecycle +2. **OpenHands SDK Agent**: Runs inside containers to solve tasks +3. 
**ATIF Trajectories**: Results stored in Agent Trajectory Interchange Format + +```text +┌──────────────────────────────────────────────────┐ +│ Harbor Harness │ +│ ┌────────────────────────────────────────────┐ │ +│ │ Task Container │ │ +│ │ ┌──────────────────────────────────────┐ │ │ +│ │ │ OpenHands SDK Agent │ │ │ +│ │ │ - Terminal tool │ │ │ +│ │ │ - File editor tool │ │ │ +│ │ │ - Task tracker tool │ │ │ +│ │ └──────────────────────────────────────┘ │ │ +│ └────────────────────────────────────────────┘ │ +└──────────────────────────────────────────────────┘ +``` + +## References + +- [SkillsBench](https://www.skillsbench.ai/) - The benchmark +- [Harbor](https://harborframework.com) - The evaluation harness +- [OpenHands SDK](https://github.com/OpenHands/software-agent-sdk) - The agent SDK +- [ATIF Specification](https://github.com/laude-institute/harbor/blob/main/docs/rfcs/0001-trajectory-format.md) - Trajectory format diff --git a/benchmarks/skillsbench/__init__.py b/benchmarks/skillsbench/__init__.py new file mode 100644 index 00000000..c02f7baf --- /dev/null +++ b/benchmarks/skillsbench/__init__.py @@ -0,0 +1 @@ +# SkillsBench evaluation benchmark diff --git a/benchmarks/skillsbench/config.py b/benchmarks/skillsbench/config.py new file mode 100644 index 00000000..8b55a92b --- /dev/null +++ b/benchmarks/skillsbench/config.py @@ -0,0 +1,16 @@ +"""SkillsBench configuration defaults.""" + +# Default inference settings (only include values actually used by argparse) +INFER_DEFAULTS = { + "dataset": "benchflow/skillsbench", + "output_dir": "./evaluation_outputs", + "num_workers": 1, +} + +# Harbor configuration defaults +HARBOR_DEFAULTS = { + # Harbor executable + "harbor_executable": "harbor", + # Default agent name for openhands-sdk + "agent_name": "openhands-sdk", +} diff --git a/benchmarks/skillsbench/eval_infer.py b/benchmarks/skillsbench/eval_infer.py new file mode 100644 index 00000000..f55a9173 --- /dev/null +++ b/benchmarks/skillsbench/eval_infer.py @@ -0,0 +1,280 @@ +#!/usr/bin/env python3 +"""SkillsBench Evaluation Script. + +This script processes SkillsBench output and generates evaluation reports. +It reads the output.jsonl produced by run_infer, aggregates results, +and writes a summary report. + +Usage: + uv run skillsbench-eval +""" + +import argparse +import json +import sys +from pathlib import Path + +from benchmarks.utils.laminar import LaminarService +from benchmarks.utils.report_costs import generate_cost_report +from openhands.sdk import get_logger + + +logger = get_logger(__name__) + + +def process_skillsbench_results( + input_file: str, + output_file: str, +) -> dict: + """Process SkillsBench output.jsonl and generate evaluation report. + + SkillsBench format (from harbor conversion): + { + "instance_id": "task_id", + "test_result": { + "trajectory_path": "...", + "total_steps": N, + "final_metrics": {...}, + "passed": true/false # May be populated by harbor grading + }, + "instruction": "...", + "history": [...] + } + + Report format (similar to SWE-Bench): + { + "total_instances": N, + "submitted_instances": N, + "completed_instances": N, + "incomplete_instances": N, + "resolved_instances": N, + "unresolved_instances": N, + "error_instances": N, + "submitted_ids": [...], + "completed_ids": [...], + "incomplete_ids": [...], + "resolved_ids": [...], + "unresolved_ids": [...] 
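+        "error_ids": [...],
+        "aggregate_metrics": {...}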
+ } + """ + logger.info(f"Processing {input_file} to generate report: {output_file}") + + # Use sets for O(1) lookup and automatic deduplication + # Convert to sorted lists only when building final report + completed_ids: set[str] = set() + resolved_ids: set[str] = set() + unresolved_ids: set[str] = set() + incomplete_ids: set[str] = set() + error_ids: set[str] = set() + + # Aggregate metrics + total_cost_usd = 0.0 + total_prompt_tokens = 0 + total_completion_tokens = 0 + + with open(input_file) as infile: + for line_num, line in enumerate(infile, 1): + try: + line = line.strip() + if not line: + continue + + data = json.loads(line) + + # Extract required fields + instance_id = data.get("instance_id") + if not instance_id: + logger.warning(f"Line {line_num}: Missing instance_id") + continue + + if instance_id in completed_ids: + logger.warning( + f"Line {line_num}: Duplicate instance_id {instance_id}" + ) + continue + + # Check for errors + error = data.get("error") + if error: + error_ids.add(instance_id) + incomplete_ids.add(instance_id) + continue + + # Extract test result + test_result = data.get("test_result", {}) + + # Check if task passed (harbor may include this) + passed = test_result.get("passed") + # If not explicitly set, we mark as completed but ungraded + is_resolved = passed is True + + # Add to completed instances + completed_ids.add(instance_id) + + if is_resolved: + resolved_ids.add(instance_id) + else: + unresolved_ids.add(instance_id) + + # Aggregate metrics + # Use explicit None check to handle zero values correctly + # (using `or` would incorrectly fallback when value is 0) + metrics = data.get("metrics", {}) + final_metrics = test_result.get("final_metrics", {}) + + cost = metrics.get("total_cost_usd") + if cost is None: + cost = final_metrics.get("total_cost_usd", 0.0) + + prompt_tokens = metrics.get("total_prompt_tokens") + if prompt_tokens is None: + prompt_tokens = final_metrics.get("total_prompt_tokens", 0) + + completion_tokens = metrics.get("total_completion_tokens") + if completion_tokens is None: + completion_tokens = final_metrics.get("total_completion_tokens", 0) + + # After the None checks above, these values are guaranteed to be non-None + total_cost_usd += cost + total_prompt_tokens += prompt_tokens + total_completion_tokens += completion_tokens + + except json.JSONDecodeError as e: + logger.error(f"Line {line_num}: Invalid JSON - {e}") + except Exception as e: + logger.error(f"Line {line_num}: Unexpected error - {e}") + + # Check for separate error file (used in manual workflows where errors + # are extracted to a separate file for analysis/retry) + error_path = Path(input_file).with_name(f"{Path(input_file).stem}_errors.jsonl") + if error_path.exists(): + with open(error_path) as error_file: + for line_num, line in enumerate(error_file, 1): + try: + line = line.strip() + if not line: + continue + + data = json.loads(line) + instance_id = data.get("instance_id") + if not instance_id: + continue + if instance_id in completed_ids or instance_id in incomplete_ids: + continue + + incomplete_ids.add(instance_id) + error_ids.add(instance_id) + except (json.JSONDecodeError, Exception) as e: + logger.error(f"Error file line {line_num}: {e}") + + submitted_ids = completed_ids | incomplete_ids + + # Generate report - convert sets to sorted lists for consistent output + report = { + "total_instances": len(submitted_ids), + "submitted_instances": len(submitted_ids), + "completed_instances": len(completed_ids), + "incomplete_instances": len(incomplete_ids), + 
"resolved_instances": len(resolved_ids), + "unresolved_instances": len(unresolved_ids), + "error_instances": len(error_ids), + "submitted_ids": sorted(submitted_ids), + "completed_ids": sorted(completed_ids), + "incomplete_ids": sorted(incomplete_ids), + "resolved_ids": sorted(resolved_ids), + "unresolved_ids": sorted(unresolved_ids), + "error_ids": sorted(error_ids), + # Aggregate metrics + "aggregate_metrics": { + "total_cost_usd": total_cost_usd, + "total_prompt_tokens": total_prompt_tokens, + "total_completion_tokens": total_completion_tokens, + }, + } + + # Write report + with open(output_file, "w") as outfile: + json.dump(report, outfile, indent=4) + + logger.info("Report generated successfully:") + logger.info(f" Total instances: {report['total_instances']}") + logger.info(f" Completed instances: {report['completed_instances']}") + logger.info(f" Resolved instances: {report['resolved_instances']}") + logger.info(f" Unresolved instances: {report['unresolved_instances']}") + logger.info(f" Error instances: {report['error_instances']}") + if report["completed_instances"] > 0: + logger.info( + f" Success rate: " + f"{report['resolved_instances'] / report['completed_instances'] * 100:.1f}%" + ) + logger.info(f" Total cost: ${total_cost_usd:.4f}") + + return report + + +def main() -> None: + """Main entry point for the script.""" + parser = argparse.ArgumentParser( + description="Process SkillsBench output and generate evaluation report", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + uv run skillsbench-eval output.jsonl + uv run skillsbench-eval /path/to/output.jsonl + """, + ) + + parser.add_argument("input_file", help="Path to the SkillsBench output.jsonl file") + parser.add_argument( + "--output-file", + help="Output file for report (default: input_file with .report.json extension)", + ) + + args = parser.parse_args() + + # Validate input file + input_file = Path(args.input_file) + if not input_file.exists(): + logger.error(f"Input file does not exist: {input_file}") + sys.exit(1) + + if not input_file.suffix == ".jsonl": + logger.warning(f"Input file does not have .jsonl extension: {input_file}") + + # Determine output file + if args.output_file: + output_file = Path(args.output_file) + else: + output_file = input_file.with_suffix(".report.json") + + logger.info(f"Input file: {input_file}") + logger.info(f"Output file: {output_file}") + + try: + # Process results and generate report + process_skillsbench_results( + str(input_file), + str(output_file), + ) + except Exception as e: + logger.error(f"Script failed: {e}") + sys.exit(1) + + # Non-critical telemetry and reporting - wrap in try/except so expensive + # multi-hour evaluations don't fail at the telemetry step after completing + try: + LaminarService.get().update_evaluation_scores(str(input_file), str(output_file)) + except Exception as e: + logger.warning(f"Laminar update failed (non-critical): {e}") + + try: + generate_cost_report(str(input_file)) + except Exception as e: + logger.warning(f"Cost report generation failed (non-critical): {e}") + + logger.info("Script completed successfully!") + print(json.dumps({"report_json": str(output_file)})) + + +if __name__ == "__main__": + main() diff --git a/benchmarks/skillsbench/run_infer.py b/benchmarks/skillsbench/run_infer.py new file mode 100644 index 00000000..535a27d6 --- /dev/null +++ b/benchmarks/skillsbench/run_infer.py @@ -0,0 +1,884 @@ +"""SkillsBench inference script using Harbor with openhands-sdk agent. 
+
+This script runs SkillsBench evaluation using Harbor as the harness
+and openhands-sdk as the agent. Results are saved in a format compatible
+with the standard evaluation pipeline.
+
+Usage:
+    uv run skillsbench-infer <llm_config_path> --dataset benchflow/skillsbench
+"""
+
+import argparse
+import json
+import os
+import re
+import shutil
+import subprocess
+import sys
+import tempfile
+from datetime import datetime, timezone
+from pathlib import Path
+
+from pydantic import SecretStr
+
+from benchmarks.skillsbench.config import HARBOR_DEFAULTS, INFER_DEFAULTS
+from benchmarks.utils.evaluation_utils import construct_eval_output_dir
+from benchmarks.utils.report_costs import generate_cost_report
+from openhands.sdk import LLM, get_logger
+
+
+logger = get_logger(__name__)
+
+# Output filename for results
+OUTPUT_FILENAME = "output.jsonl"
+
+SKILLSBENCH_REPO_URL = "https://github.com/benchflow-ai/skillsbench.git"
+SKILLSBENCH_REPO_BRANCH = "main"
+DATASET_CACHE_DIR = Path(__file__).parent / "data"
+TASKS_CACHE_DIR = DATASET_CACHE_DIR / "tasks"
+TASKS_METADATA_PATH = DATASET_CACHE_DIR / "source.json"
+REGISTRY_DATASET_PREFIX = "benchflow/skillsbench"
+INSTANCE_ID_PREFIX = "benchflow"
+
+# Skills COPY block injected into Dockerfiles when --with-skills is set.
+# Docker's COPY creates missing destination directories, so no RUN mkdir -p
+# lines are needed before these instructions.
+SKILLS_COPY_BLOCK = """\
+# Claude Code
+COPY skills /root/.claude/skills
+# Claude Code (Harbor compatibility)
+COPY skills /etc/claude-code/.claude/skills
+# Codex
+COPY skills /root/.codex/skills
+# OpenCode
+COPY skills /root/.opencode/skill
+# Goose
+COPY skills /root/.goose/skills
+# Factory
+COPY skills /root/.factory/skills
+# Portable agents format (Goose, Amp)
+COPY skills /root/.agents/skills
+"""
+
+
+def check_harbor_installed() -> bool:
+    """Check if harbor CLI is installed and available."""
+    harbor_exe = HARBOR_DEFAULTS["harbor_executable"]
+    try:
+        result = subprocess.run(
+            [harbor_exe, "--help"],
+            capture_output=True,
+            text=True,
+            timeout=10,
+        )
+        return result.returncode == 0
+    except (FileNotFoundError, subprocess.TimeoutExpired):
+        return False
+
+
+def _run_command(cmd: list[str], error_message: str) -> str:
+    """Run a subprocess command and return stdout."""
+    result = subprocess.run(
+        cmd,
+        capture_output=True,
+        text=True,
+    )
+    if result.returncode != 0:
+        stderr = result.stderr.strip() or result.stdout.strip()
+        raise RuntimeError(f"{error_message}: {stderr}")
+    return result.stdout.strip()
+
+
+def _get_supported_task_filter_flag(harbor_exe: str) -> str:
+    """Detect whether Harbor expects --task-name or --include-task-name."""
+    try:
+        result = subprocess.run(
+            [harbor_exe, "run", "--help"],
+            capture_output=True,
+            text=True,
+        )
+    except FileNotFoundError:
+        return "--include-task-name"
+
+    help_text = f"{result.stdout}\n{result.stderr}"
+    supported_flags = set(re.findall(r"(?<!\S)--[a-z][a-z-]*", help_text))
+    if "--include-task-name" in supported_flags:
+        return "--include-task-name"
+    if "--task-name" in supported_flags:
+        return "--task-name"
+    return "--include-task-name"
+
+
+def _get_supported_agent_name(harbor_exe: str) -> 
str: + """Detect whether Harbor exposes the OpenHands agent as openhands or openhands-sdk.""" + try: + result = subprocess.run( + [harbor_exe, "run", "--help"], + capture_output=True, + text=True, + ) + except FileNotFoundError: + return HARBOR_DEFAULTS["agent_name"] + + help_text = f"{result.stdout}\n{result.stderr}" + compact_help_text = re.sub(r"[^a-z0-9-]+", "", help_text.lower()) + if "openhands-sdk" in compact_help_text: + return "openhands-sdk" + if "openhands" in compact_help_text: + return "openhands" + return HARBOR_DEFAULTS["agent_name"] + + +def get_skillsbench_main_commit( + repo_url: str = SKILLSBENCH_REPO_URL, + branch: str = SKILLSBENCH_REPO_BRANCH, +) -> str: + """Resolve the latest commit hash for the upstream SkillsBench branch.""" + stdout = _run_command( + ["git", "ls-remote", repo_url, f"refs/heads/{branch}"], + "Failed to resolve SkillsBench upstream commit", + ) + commit_hash, _, ref = stdout.partition("\t") + if not commit_hash or ref != f"refs/heads/{branch}": + raise RuntimeError( + f"Unexpected git ls-remote output for {repo_url} {branch}: {stdout}" + ) + return commit_hash + + +def _load_cached_commit(metadata_path: Path = TASKS_METADATA_PATH) -> str | None: + """Load the cached upstream commit hash for the local task snapshot.""" + if not metadata_path.is_file(): + return None + + try: + with open(metadata_path, encoding="utf-8") as f: + metadata = json.load(f) + except (OSError, json.JSONDecodeError) as e: + logger.warning( + "Ignoring unreadable SkillsBench dataset metadata at %s: %s", + metadata_path, + e, + ) + return None + + commit_hash = metadata.get("commit_hash") + return commit_hash if isinstance(commit_hash, str) and commit_hash else None + + +def download_skillsbench_tasks( + commit_hash: str, + tasks_dir: Path = TASKS_CACHE_DIR, + metadata_path: Path = TASKS_METADATA_PATH, + repo_url: str = SKILLSBENCH_REPO_URL, + branch: str = SKILLSBENCH_REPO_BRANCH, +) -> None: + """Download only the SkillsBench tasks directory for a specific commit.""" + data_dir = tasks_dir.parent + data_dir.mkdir(parents=True, exist_ok=True) + + logger.info( + "Downloading SkillsBench tasks from %s@%s into %s", + repo_url, + commit_hash, + tasks_dir, + ) + + with tempfile.TemporaryDirectory(dir=data_dir) as temp_dir: + clone_dir = Path(temp_dir) / "skillsbench" + _run_command( + [ + "git", + "clone", + "--depth", + "1", + "--branch", + branch, + "--filter=blob:none", + "--sparse", + repo_url, + str(clone_dir), + ], + "Failed to clone SkillsBench repository", + ) + _run_command( + ["git", "-C", str(clone_dir), "sparse-checkout", "set", "tasks"], + "Failed to sparsely checkout SkillsBench tasks", + ) + checked_out_commit = _run_command( + ["git", "-C", str(clone_dir), "rev-parse", "HEAD"], + "Failed to read cloned SkillsBench commit", + ) + if checked_out_commit != commit_hash: + raise RuntimeError( + "Cloned SkillsBench commit does not match upstream HEAD: " + f"expected {commit_hash}, got {checked_out_commit}" + ) + + source_tasks_dir = clone_dir / "tasks" + if not source_tasks_dir.is_dir(): + raise RuntimeError( + f"SkillsBench clone at {clone_dir} does not contain a tasks/ directory" + ) + + if tasks_dir.exists(): + shutil.rmtree(tasks_dir) + shutil.copytree(source_tasks_dir, tasks_dir) + + metadata = { + "repo_url": repo_url, + "branch": branch, + "commit_hash": commit_hash, + "synced_at": datetime.now(timezone.utc).isoformat(), + } + with open(metadata_path, "w", encoding="utf-8") as f: + json.dump(metadata, f, indent=2) + + +def ensure_skillsbench_tasks( + tasks_dir: 
Path = TASKS_CACHE_DIR, + metadata_path: Path = TASKS_METADATA_PATH, + repo_url: str = SKILLSBENCH_REPO_URL, + branch: str = SKILLSBENCH_REPO_BRANCH, +) -> Path: + """Ensure a local SkillsBench task snapshot exists and matches upstream HEAD.""" + cached_commit = _load_cached_commit(metadata_path) + has_cached_tasks = tasks_dir.is_dir() and any(tasks_dir.iterdir()) + + try: + upstream_commit = get_skillsbench_main_commit(repo_url=repo_url, branch=branch) + except RuntimeError as e: + if has_cached_tasks and cached_commit: + logger.warning( + "Failed to check SkillsBench upstream HEAD; using cached tasks from " + "%s (%s): %s", + tasks_dir, + cached_commit, + e, + ) + return tasks_dir + raise + + if has_cached_tasks and cached_commit == upstream_commit: + logger.info( + "Using cached SkillsBench tasks at %s (commit %s)", + tasks_dir, + upstream_commit, + ) + return tasks_dir + + if has_cached_tasks: + logger.info( + "Refreshing SkillsBench tasks in %s from commit %s to %s", + tasks_dir, + cached_commit or "", + upstream_commit, + ) + else: + logger.info("No cached SkillsBench tasks found at %s; downloading", tasks_dir) + + download_skillsbench_tasks( + commit_hash=upstream_commit, + tasks_dir=tasks_dir, + metadata_path=metadata_path, + repo_url=repo_url, + branch=branch, + ) + return tasks_dir + + +def resolve_skillsbench_dataset(dataset: str) -> tuple[str, bool]: + """Resolve the dataset argument to a synced local SkillsBench snapshot. + + Harbor 0.5.x validates ``--dataset`` values against the registry before + starting a job. SkillsBench is not yet published in the public registry, so + ``benchflow/skillsbench`` and versioned aliases like + ``benchflow/skillsbench@1.0`` must be resolved to the locally synced Harbor + task dataset generated by the SkillsBench adapter. + """ + if dataset == REGISTRY_DATASET_PREFIX or dataset.startswith( + f"{REGISTRY_DATASET_PREFIX}@" + ): + local_tasks_dir = ensure_skillsbench_tasks() + return str(local_tasks_dir.resolve()), True + raise ValueError( + "Unsupported SkillsBench dataset source. Use the default synced " + "SkillsBench snapshot or a SkillsBench dataset alias matching " + "'benchflow/skillsbench@'." + ) + + +def _normalize_task_filter_value(task_id: str, *, dataset_is_path: bool) -> str: + """Normalize task filter values for Harbor's local-path dataset handling.""" + if dataset_is_path: + return task_id.rsplit("/", 1)[-1] + return task_id + + +def _canonicalize_instance_id(task_name: str) -> str: + """Normalize SkillsBench task names to stable benchflow/ ids.""" + if "/" in task_name: + return task_name + return f"{INSTANCE_ID_PREFIX}/{task_name}" + + +def get_target_dockerfiles( + tasks_dir: Path, + task_ids: list[str] | None, +) -> list[Path]: + """Return Dockerfile paths for the selected tasks (or all tasks if none specified).""" + if task_ids: + names = [tid.rsplit("/", 1)[-1] for tid in task_ids] + candidates = [tasks_dir / name / "environment" / "Dockerfile" for name in names] + else: + candidates = list(tasks_dir.glob("*/environment/Dockerfile")) + + found = [p for p in candidates if p.is_file()] + missing = [p for p in candidates if not p.is_file()] + for p in missing: + logger.warning("Dockerfile not found (skipping skills injection): %s", p) + return found + + +def inject_skills_into_dockerfiles( + dockerfiles: list[Path], +) -> list[tuple[Path, str]]: + """Inject SKILLS_COPY_BLOCK into Dockerfiles that don't already contain it. 
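+
+    The block is inserted immediately after the last WORKDIR directive, or at the
+    end of the Dockerfile when no WORKDIR is present.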
+ + Returns a list of (path, original_content) for every file that was modified, + so callers can revert with revert_dockerfiles(). + """ + reverts: list[tuple[Path, str]] = [] + for dockerfile in dockerfiles: + original = dockerfile.read_text(encoding="utf-8") + if "COPY skills" in original: + logger.debug("Skills already present in %s, skipping injection", dockerfile) + continue + + # Insert the block after the last WORKDIR directive, or at end of file. + lines = original.splitlines(keepends=True) + insert_at = len(lines) + for i, line in enumerate(lines): + if line.strip().upper().startswith("WORKDIR"): + insert_at = i + 1 + + injected_lines = ( + lines[:insert_at] + ["\n", SKILLS_COPY_BLOCK] + lines[insert_at:] + ) + dockerfile.write_text("".join(injected_lines), encoding="utf-8") + reverts.append((dockerfile, original)) + logger.info("Injected skills COPY block into %s", dockerfile) + + return reverts + + +def revert_dockerfiles(reverts: list[tuple[Path, str]]) -> None: + """Restore Dockerfiles to their original content after skills injection.""" + for dockerfile, original in reverts: + try: + dockerfile.write_text(original, encoding="utf-8") + logger.info("Reverted %s", dockerfile) + except OSError as e: + logger.error("Failed to revert %s: %s", dockerfile, e) + + +def run_harbor_evaluation( + llm: LLM, + dataset: str, + *, + dataset_is_path: bool, + output_dir: str, + num_workers: int = 1, + task_ids: list[str] | None = None, + n_limit: int | None = None, +) -> Path: + """Run harbor evaluation with openhands-sdk agent. + + Args: + llm: LLM configuration for the agent. + dataset: Synced SkillsBench task snapshot path or Harbor registry id. + dataset_is_path: Whether ``dataset`` should be passed via ``--path``. + output_dir: Directory to store output files. + num_workers: Number of parallel workers. + task_ids: Optional list of specific task IDs to run. + n_limit: Optional maximum number of dataset tasks to run. + + Returns: + Path to the harbor output directory. + """ + harbor_output_dir = Path(output_dir) / "harbor_output" + harbor_output_dir.mkdir(parents=True, exist_ok=True) + harbor_exe = HARBOR_DEFAULTS["harbor_executable"] + agent_name = _get_supported_agent_name(harbor_exe) + task_filter_flag = _get_supported_task_filter_flag(harbor_exe) + + # Build harbor command using harbor CLI flags. + # Use absolute path for --jobs-dir to avoid CWD-relative path issues. + cmd = [ + harbor_exe, + "run", + "--path" if dataset_is_path else "-d", + dataset, + "-a", + agent_name, + "-m", + llm.model, + "--jobs-dir", + str(harbor_output_dir.resolve()), + "--n-concurrent", + str(num_workers), + ] + + # Add specific task names if provided + if task_ids: + for task_id in task_ids: + cmd.extend( + [ + task_filter_flag, + _normalize_task_filter_value( + task_id, dataset_is_path=dataset_is_path + ), + ] + ) + + if n_limit is not None: + cmd.extend(["--n-tasks", str(n_limit)]) + + logger.info(f"Running harbor command: {' '.join(cmd)}") + logger.info(f"Output directory: {harbor_output_dir}") + + # harbor's openhands-sdk agent reads LLM credentials from the host process + # environment (os.environ), not from --ae flags which go to the sandbox. 
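+    # Only LLM_API_KEY and LLM_BASE_URL are set explicitly below; everything else
+    # (for example MODAL_TOKEN_ID / MODAL_TOKEN_SECRET for Modal-backed tasks) is
+    # inherited unchanged from the parent process environment.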
+ env = os.environ.copy() + if llm.api_key: + api_key = ( + llm.api_key.get_secret_value() + if isinstance(llm.api_key, SecretStr) + else llm.api_key + ) + env["LLM_API_KEY"] = api_key + if llm.base_url: + env["LLM_BASE_URL"] = llm.base_url + + try: + result = subprocess.run( + cmd, + capture_output=True, + text=True, + env=env, + ) + + if result.returncode != 0: + if ( + task_ids + and task_filter_flag == "--task-name" + and "No such option: --task-name" in result.stderr + ): + fallback_cmd = [ + "--include-task-name" if part == "--task-name" else part + for part in cmd + ] + logger.warning( + "Harbor does not support --task-name; retrying with " + "--include-task-name" + ) + result = subprocess.run( + fallback_cmd, + capture_output=True, + text=True, + env=env, + ) + + if result.returncode != 0: + logger.error(f"Harbor command failed with code {result.returncode}") + logger.error(f"stdout: {result.stdout}") + logger.error(f"stderr: {result.stderr}") + raise RuntimeError(f"Harbor evaluation failed: {result.stderr}") + + logger.info("Harbor evaluation completed successfully") + logger.info(f"stdout: {result.stdout}") + + except FileNotFoundError: + raise RuntimeError( + "Harbor CLI not found. Please install harbor: pip install harbor" + ) + + return harbor_output_dir + + +def _find_job_dir(harbor_output_dir: Path) -> Path: + """Find the harbor job directory (timestamp-named) inside the output dir.""" + # Harbor creates a timestamp-named subdirectory (e.g., 2026-03-07__16-08-47) + # containing result.json and trial subdirectories + candidates = [ + d + for d in harbor_output_dir.iterdir() + if d.is_dir() and (d / "result.json").exists() + ] + if not candidates: + raise RuntimeError( + f"No harbor job directory found in {harbor_output_dir}. " + f"Expected a timestamp-named directory containing result.json." + ) + # Use the most recent job directory if multiple exist + return sorted(candidates)[-1] + + +def convert_harbor_to_eval_output( + harbor_output_dir: Path, + eval_output_path: Path, +) -> None: + """Convert harbor output to evaluation output format. + + Harbor stores trial results in a job directory structured as: + harbor_output/TIMESTAMP/TRIAL_NAME/result.json + + Each trial's result.json contains task_name, verifier_result, agent_result, + timing info, and exception details. + + Args: + harbor_output_dir: Path to harbor output directory. + eval_output_path: Path to write the converted output.jsonl. + """ + logger.info(f"Converting harbor output from {harbor_output_dir}") + + job_dir = _find_job_dir(harbor_output_dir) + logger.info(f"Using harbor job directory: {job_dir}") + + # Find trial result files (each trial dir has a result.json) + result_files = list(job_dir.glob("*/result.json")) + # Exclude the job-level result.json + result_files = [f for f in result_files if f.parent != job_dir] + + if not result_files: + raise RuntimeError( + f"No trial result files found in {job_dir}. " + f"Expected result.json files in trial subdirectories." 
+ ) + + logger.info(f"Found {len(result_files)} trial results in {job_dir}") + + results: list[dict] = [] + errors: list[dict] = [] + + for result_file in result_files: + try: + with open(result_file) as f: + trial = json.load(f) + + instance_id = _canonicalize_instance_id( + trial.get("task_name", result_file.parent.name) + ) + + # Check for exceptions + if trial.get("exception_info"): + errors.append( + { + "instance_id": instance_id, + "error": str(trial["exception_info"]), + "test_result": {}, + } + ) + continue + + # Extract verifier results + verifier_result = trial.get("verifier_result", {}) + rewards = verifier_result.get("rewards", {}) + passed = rewards.get("reward", 0.0) > 0 + + # Extract agent metrics + agent_result = trial.get("agent_result", {}) + + eval_entry = { + "instance_id": instance_id, + "test_result": { + "trial_name": trial.get("trial_name"), + "trial_uri": trial.get("trial_uri"), + "rewards": rewards, + "passed": passed, + }, + "instruction": "", + "error": None, + "history": [], + "metrics": { + "total_prompt_tokens": agent_result.get("n_input_tokens") or 0, + "total_completion_tokens": ( + agent_result.get("n_output_tokens") or 0 + ), + "total_cost_usd": agent_result.get("cost_usd") or 0.0, + }, + } + results.append(eval_entry) + logger.info( + f"Processed trial {instance_id}: reward={rewards.get('reward', 'N/A')}" + ) + + except (json.JSONDecodeError, OSError) as e: + logger.error(f"Failed to process result file {result_file}: {e}") + errors.append( + { + "instance_id": _canonicalize_instance_id(result_file.parent.name), + "error": str(e), + "test_result": {}, + } + ) + + if not results and not errors: + raise RuntimeError(f"No trials processed from {harbor_output_dir}") + + if not results: + logger.warning( + f"All {len(errors)} trials failed in {harbor_output_dir}; " + "writing error entries for downstream reporting" + ) + + # Write results to output.jsonl + with open(eval_output_path, "w") as f: + for entry in results: + f.write(json.dumps(entry) + "\n") + for entry in errors: + f.write(json.dumps(entry) + "\n") + + logger.info( + f"Wrote {len(results)} successful + {len(errors)} failed entries " + f"to {eval_output_path}" + ) + + +def load_task_ids_from_file(filepath: str) -> list[str]: + """Load task IDs from a text file (one per line).""" + task_ids = [] + with open(filepath) as f: + for line in f: + line = line.strip() + if line and not line.startswith("#"): + task_ids.append(line) + return task_ids + + +def main() -> None: + """Main entry point for skillsbench inference.""" + parser = argparse.ArgumentParser( + description="Run SkillsBench evaluation with openhands-sdk via Harbor", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Run full skillsbench evaluation using a local tasks/ snapshot synced from + # https://github.com/benchflow-ai/skillsbench main (adapter-generated + # Harbor tasks stored under benchmarks/skillsbench/data/tasks) + uv run skillsbench-infer .llm_config/claude.json + + # Run specific tasks + uv run skillsbench-infer .llm_config/claude.json --select tasks.txt + + # Versioned SkillsBench aliases also resolve to the synced local dataset + uv run skillsbench-infer .llm_config/claude.json --dataset benchflow/skillsbench@1.0 + """, + ) + + parser.add_argument( + "llm_config_path", + type=str, + help="Path to JSON LLM configuration file", + ) + parser.add_argument( + "--dataset", + type=str, + default=INFER_DEFAULTS["dataset"], + help=( + "SkillsBench dataset source. 
The default value syncs tasks/ from the " + "benchflow-ai/skillsbench main branch. Versioned aliases like " + "benchflow/skillsbench@1.0 also resolve to the same local Harbor " + "dataset because SkillsBench is not published in the public Harbor " + "registry yet." + ), + ) + parser.add_argument( + "--output-dir", + type=str, + default=INFER_DEFAULTS["output_dir"], + help="Base output directory for evaluation results", + ) + parser.add_argument( + "--num-workers", + type=int, + default=INFER_DEFAULTS["num_workers"], + help="Number of parallel workers", + ) + parser.add_argument( + "--n-limit", + type=int, + help="Maximum number of dataset tasks to run after Harbor filtering", + ) + parser.add_argument( + "--select", + type=str, + help="Path to text file containing task IDs to run (one per line)", + ) + parser.add_argument( + "--task-id", + type=str, + action="append", + help="Specific task ID to run (can be specified multiple times)", + ) + parser.add_argument( + "--note", + type=str, + help="Optional note for the evaluation run", + ) + parser.add_argument( + "--skip-harbor", + action="store_true", + help="Skip running harbor and only convert existing results", + ) + parser.add_argument( + "--with-skills", + action="store_true", + default=False, + help=( + "Inject agent skill definitions into the selected task Dockerfiles before " + "running evaluation. Adds COPY instructions for Claude Code, Codex, " + "OpenCode, Goose, Factory, and portable-agents skill directories. " + "Dockerfiles are restored to their original state after Harbor completes." + ), + ) + + args = parser.parse_args() + + # Validate LLM config + if not os.path.isfile(args.llm_config_path): + logger.error(f"LLM config file does not exist: {args.llm_config_path}") + sys.exit(1) + + with open(args.llm_config_path) as f: + llm_config = f.read() + llm = LLM.model_validate_json(llm_config) + logger.info(f"Using LLM: {llm.model}") + + # Check harbor installation + if not args.skip_harbor and not check_harbor_installed(): + logger.error( + "Harbor CLI is not installed. 
Please install it:\n" + " pip install harbor\n" + " # or\n" + " uv pip install harbor" + ) + sys.exit(1) + + resolved_dataset = args.dataset + dataset_is_path = False + dataset_commit_hash: str | None = None + if not args.skip_harbor: + try: + resolved_dataset, dataset_is_path = resolve_skillsbench_dataset( + args.dataset + ) + except ValueError as e: + logger.error(str(e)) + sys.exit(1) + if dataset_is_path and args.dataset == INFER_DEFAULTS["dataset"]: + dataset_commit_hash = _load_cached_commit() + + # Construct output directory + dataset_description = args.dataset.replace("/", "__").replace("@", "-") + structured_output_dir = construct_eval_output_dir( + base_dir=args.output_dir, + dataset_name=dataset_description, + model_name=llm.model, + max_iterations=100, # Not directly used but required for path construction + eval_note=args.note, + ) + + logger.info(f"Output directory: {structured_output_dir}") + os.makedirs(structured_output_dir, exist_ok=True) + + # Save metadata + metadata = { + "llm": llm.model_dump_json(), + "dataset": args.dataset, + "resolved_dataset": resolved_dataset, + "dataset_is_path": dataset_is_path, + "dataset_commit_hash": dataset_commit_hash, + "timestamp": datetime.now(timezone.utc).isoformat(), + "harbor_agent": HARBOR_DEFAULTS["agent_name"], + "note": args.note, + "with_skills": args.with_skills, + } + metadata_path = Path(structured_output_dir) / "metadata.json" + with open(metadata_path, "w") as f: + json.dump(metadata, f, indent=2) + + # Collect task IDs if specified + task_ids: list[str] | None = None + if args.select: + loaded_ids = load_task_ids_from_file(args.select) + task_ids = loaded_ids + logger.info(f"Loaded {len(loaded_ids)} task IDs from {args.select}") + elif args.task_id: + task_ids = list(args.task_id) + logger.info(f"Running {len(task_ids)} specified task IDs") + + output_path = Path(structured_output_dir) / OUTPUT_FILENAME + + if not args.skip_harbor: + # Optionally inject skill definitions into task Dockerfiles + dockerfile_reverts: list[tuple[Path, str]] = [] + if args.with_skills and dataset_is_path: + target_dockerfiles = get_target_dockerfiles( + tasks_dir=Path(resolved_dataset), + task_ids=task_ids, + ) + dockerfile_reverts = inject_skills_into_dockerfiles(target_dockerfiles) + logger.info( + "Injected skills into %d Dockerfile(s)", len(dockerfile_reverts) + ) + + # Run harbor evaluation + try: + harbor_output_dir = run_harbor_evaluation( + llm=llm, + dataset=resolved_dataset, + dataset_is_path=dataset_is_path, + output_dir=structured_output_dir, + num_workers=args.num_workers, + task_ids=task_ids, + n_limit=args.n_limit, + ) + + # Convert harbor output to standard format + convert_harbor_to_eval_output( + harbor_output_dir=harbor_output_dir, + eval_output_path=output_path, + ) + + except Exception as e: + logger.error(f"Evaluation failed: {e}") + sys.exit(1) + finally: + if dockerfile_reverts: + revert_dockerfiles(dockerfile_reverts) + logger.info( + "Reverted %d Dockerfile(s) after evaluation", + len(dockerfile_reverts), + ) + else: + # Skip harbor, just convert existing results + harbor_output_dir = Path(structured_output_dir) / "harbor_output" + if harbor_output_dir.exists(): + convert_harbor_to_eval_output( + harbor_output_dir=harbor_output_dir, + eval_output_path=output_path, + ) + else: + logger.error(f"No harbor output found at {harbor_output_dir}") + sys.exit(1) + + # Generate cost report + if output_path.exists(): + generate_cost_report(str(output_path)) + + logger.info("SkillsBench inference completed!") + 
print(json.dumps({"output_json": str(output_path)})) + + +if __name__ == "__main__": + main() diff --git a/benchmarks/utils/report_costs.py b/benchmarks/utils/report_costs.py index 8f38909f..7a21a383 100755 --- a/benchmarks/utils/report_costs.py +++ b/benchmarks/utils/report_costs.py @@ -48,7 +48,9 @@ def extract_accumulated_cost(jsonl_data: List[Optional[Dict]]) -> float: if entry is None: continue metrics = entry.get("metrics") or {} - accumulated_cost = metrics.get("accumulated_cost") + accumulated_cost = metrics.get("accumulated_cost") or metrics.get( + "total_cost_usd" + ) if accumulated_cost is not None: total_cost += float(accumulated_cost) diff --git a/pyproject.toml b/pyproject.toml index 11773729..34ecaf33 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -71,6 +71,8 @@ swebenchmultilingual-eval = "benchmarks.swebenchmultilingual.eval_infer:main" swefficiency-infer = "benchmarks.swefficiency.run_infer:main" terminalbench-infer = "benchmarks.terminalbench.run_infer:main" terminalbench-eval = "benchmarks.terminalbench.eval_infer:main" +skillsbench-infer = "benchmarks.skillsbench.run_infer:main" +skillsbench-eval = "benchmarks.skillsbench.eval_infer:main" hybridgym-funclocalize-infer = "benchmarks.hybridgym_funclocalize.run_infer:main" hybridgym-funclocalize-eval = "benchmarks.hybridgym_funclocalize.eval_infer:main" hybridgym-depsearch-infer = "benchmarks.hybridgym_depsearch.run_infer:main" diff --git a/tests/test_skillsbench_eval_infer.py b/tests/test_skillsbench_eval_infer.py new file mode 100644 index 00000000..56d54f27 --- /dev/null +++ b/tests/test_skillsbench_eval_infer.py @@ -0,0 +1,125 @@ +"""Tests for SkillsBench eval_infer module.""" + +import json +from pathlib import Path + +from benchmarks.skillsbench.eval_infer import process_skillsbench_results + + +class TestProcessSkillsbenchResults: + """Tests for the process_skillsbench_results function.""" + + def test_empty_input(self, tmp_path: Path) -> None: + """Test processing empty input file.""" + input_file = tmp_path / "empty.jsonl" + output_file = tmp_path / "empty.report.json" + input_file.write_text("") + + result = process_skillsbench_results(str(input_file), str(output_file)) + + assert result["total_instances"] == 0 + assert result["completed_instances"] == 0 + assert result["resolved_instances"] == 0 + + def test_resolved_instance(self, tmp_path: Path) -> None: + """Test processing a resolved (passed=True) instance.""" + input_file = tmp_path / "resolved.jsonl" + output_file = tmp_path / "resolved.report.json" + + entry = { + "instance_id": "benchflow/weighted-gdp-calc", + "test_result": {"passed": True, "rewards": {"reward": 1.0}}, + "error": None, + } + input_file.write_text(json.dumps(entry) + "\n") + + result = process_skillsbench_results(str(input_file), str(output_file)) + + assert result["resolved_instances"] == 1 + assert result["unresolved_instances"] == 0 + assert "benchflow/weighted-gdp-calc" in result["resolved_ids"] + + def test_unresolved_instance(self, tmp_path: Path) -> None: + """Test processing an unresolved (passed=False) instance.""" + input_file = tmp_path / "unresolved.jsonl" + output_file = tmp_path / "unresolved.report.json" + + entry = { + "instance_id": "benchflow/task-1", + "test_result": {"passed": False, "rewards": {"reward": 0.0}}, + "error": None, + } + input_file.write_text(json.dumps(entry) + "\n") + + result = process_skillsbench_results(str(input_file), str(output_file)) + + assert result["resolved_instances"] == 0 + assert result["unresolved_instances"] == 1 + + def 
test_instance_with_error(self, tmp_path: Path) -> None: + """Test processing an instance that errored.""" + input_file = tmp_path / "error.jsonl" + output_file = tmp_path / "error.report.json" + + entry = { + "instance_id": "benchflow/error-task", + "test_result": {}, + "error": "ValueError: LLM_API_KEY environment variable must be set", + } + input_file.write_text(json.dumps(entry) + "\n") + + result = process_skillsbench_results(str(input_file), str(output_file)) + + assert result["error_instances"] == 1 + assert result["incomplete_instances"] == 1 + assert result["completed_instances"] == 0 + assert "benchflow/error-task" in result["error_ids"] + + def test_multiple_instances(self, tmp_path: Path) -> None: + """Test processing multiple instances with mixed results.""" + input_file = tmp_path / "multi.jsonl" + output_file = tmp_path / "multi.report.json" + + entries = [ + { + "instance_id": "benchflow/task-1", + "test_result": {"passed": True}, + "error": None, + }, + { + "instance_id": "benchflow/task-2", + "test_result": {"passed": False}, + "error": None, + }, + {"instance_id": "benchflow/task-3", "test_result": {}, "error": "Timeout"}, + ] + input_file.write_text("\n".join(json.dumps(e) for e in entries) + "\n") + + result = process_skillsbench_results(str(input_file), str(output_file)) + + assert result["total_instances"] == 3 + assert result["completed_instances"] == 2 + assert result["resolved_instances"] == 1 + assert result["unresolved_instances"] == 1 + assert result["error_instances"] == 1 + + def test_report_file_written(self, tmp_path: Path) -> None: + """Test that report file is written correctly.""" + input_file = tmp_path / "input.jsonl" + output_file = tmp_path / "output.report.json" + + entry = { + "instance_id": "benchflow/task-1", + "test_result": {"passed": True}, + "error": None, + } + input_file.write_text(json.dumps(entry) + "\n") + + process_skillsbench_results(str(input_file), str(output_file)) + + assert output_file.exists() + with open(output_file) as f: + report = json.load(f) + assert "total_instances" in report + assert "resolved_ids" in report + assert "aggregate_metrics" in report diff --git a/tests/test_skillsbench_run_infer.py b/tests/test_skillsbench_run_infer.py new file mode 100644 index 00000000..ae97989e --- /dev/null +++ b/tests/test_skillsbench_run_infer.py @@ -0,0 +1,440 @@ +"""Tests for SkillsBench run_infer module.""" + +import json +from pathlib import Path + +import pytest + +from benchmarks.skillsbench.config import INFER_DEFAULTS +from benchmarks.skillsbench.run_infer import ( + convert_harbor_to_eval_output, + ensure_skillsbench_tasks, + resolve_skillsbench_dataset, + run_harbor_evaluation, +) +from openhands.sdk import LLM + + +class TestDatasetSync: + """Tests for syncing the local SkillsBench task snapshot.""" + + def test_ensure_skillsbench_tasks_reuses_matching_cache( + self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch + ) -> None: + """Test that an up-to-date cached tasks directory is reused.""" + tasks_dir = tmp_path / "tasks" + tasks_dir.mkdir() + (tasks_dir / "task-a").mkdir() + metadata_path = tmp_path / "source.json" + metadata_path.write_text(json.dumps({"commit_hash": "abc123"})) + + monkeypatch.setattr( + "benchmarks.skillsbench.run_infer.get_skillsbench_main_commit", + lambda repo_url, branch: "abc123", + ) + + called = False + + def fake_download(**kwargs) -> None: + nonlocal called + called = True + + monkeypatch.setattr( + "benchmarks.skillsbench.run_infer.download_skillsbench_tasks", + fake_download, + ) + + 
resolved = ensure_skillsbench_tasks( + tasks_dir=tasks_dir, + metadata_path=metadata_path, + ) + + assert resolved == tasks_dir + assert called is False + + def test_ensure_skillsbench_tasks_refreshes_stale_cache( + self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch + ) -> None: + """Test that a stale cached commit triggers a redownload.""" + tasks_dir = tmp_path / "tasks" + tasks_dir.mkdir() + metadata_path = tmp_path / "source.json" + metadata_path.write_text(json.dumps({"commit_hash": "old-commit"})) + + monkeypatch.setattr( + "benchmarks.skillsbench.run_infer.get_skillsbench_main_commit", + lambda repo_url, branch: "new-commit", + ) + + captured: dict[str, str] = {} + + def fake_download( + *, + commit_hash: str, + tasks_dir: Path, + metadata_path: Path, + repo_url: str, + branch: str, + ) -> None: + captured["commit_hash"] = commit_hash + captured["tasks_dir"] = str(tasks_dir) + captured["metadata_path"] = str(metadata_path) + tasks_dir.mkdir(exist_ok=True) + + monkeypatch.setattr( + "benchmarks.skillsbench.run_infer.download_skillsbench_tasks", + fake_download, + ) + + ensure_skillsbench_tasks( + tasks_dir=tasks_dir, + metadata_path=metadata_path, + ) + + assert captured["commit_hash"] == "new-commit" + assert captured["tasks_dir"] == str(tasks_dir) + assert captured["metadata_path"] == str(metadata_path) + + def test_ensure_skillsbench_tasks_uses_cache_if_remote_check_fails( + self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch + ) -> None: + """Test that a usable cache is kept when the upstream HEAD check fails.""" + tasks_dir = tmp_path / "tasks" + tasks_dir.mkdir() + (tasks_dir / "task-a").mkdir() + metadata_path = tmp_path / "source.json" + metadata_path.write_text(json.dumps({"commit_hash": "cached-commit"})) + + def fake_head(repo_url: str, branch: str) -> str: + raise RuntimeError("network unavailable") + + monkeypatch.setattr( + "benchmarks.skillsbench.run_infer.get_skillsbench_main_commit", + fake_head, + ) + + resolved = ensure_skillsbench_tasks( + tasks_dir=tasks_dir, + metadata_path=metadata_path, + ) + + assert resolved == tasks_dir + + def test_resolve_skillsbench_dataset_maps_aliases_to_local_snapshot( + self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch + ) -> None: + """Test SkillsBench dataset aliases resolve to the local Harbor dataset.""" + tasks_dir = tmp_path / "tasks" + tasks_dir.mkdir() + + monkeypatch.setattr( + "benchmarks.skillsbench.run_infer.ensure_skillsbench_tasks", + lambda: tasks_dir, + ) + + resolved_dataset, dataset_is_path = resolve_skillsbench_dataset( + "benchflow/skillsbench@1.0" + ) + + assert resolved_dataset == str(tasks_dir.resolve()) + assert dataset_is_path is True + + +class TestRunHarborEvaluation: + """Tests for building Harbor invocation arguments.""" + + def test_run_harbor_evaluation_passes_filters_and_limits( + self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch + ) -> None: + """Test Harbor command normalizes local task ids and includes main flags.""" + captured: dict[str, list[str]] = {} + + def fake_run(cmd: list[str], capture_output: bool, text: bool, env: dict): + captured["cmd"] = cmd + return type( + "Completed", + (), + {"returncode": 0, "stdout": "ok", "stderr": ""}, + )() + + monkeypatch.setattr("benchmarks.skillsbench.run_infer.subprocess.run", fake_run) + monkeypatch.setattr( + "benchmarks.skillsbench.run_infer._get_supported_task_filter_flag", + lambda harbor_exe: "--include-task-name", + ) + monkeypatch.setattr( + "benchmarks.skillsbench.run_infer._get_supported_agent_name", + lambda harbor_exe: "openhands", 
+ ) + + harbor_output_dir = run_harbor_evaluation( + llm=LLM( + model="litellm_proxy/test-model", + api_key="test-key", + base_url="https://proxy.example.com", + ), + dataset=str(tmp_path / "tasks"), + dataset_is_path=True, + output_dir=str(tmp_path), + num_workers=2, + task_ids=["benchflow/task-a", "benchflow/task-b"], + n_limit=3, + ) + + expected_output_dir = tmp_path / "harbor_output" + assert harbor_output_dir == expected_output_dir + + cmd = captured["cmd"] + assert cmd[:8] == [ + "harbor", + "run", + "--path", + str(tmp_path / "tasks"), + "-a", + "openhands", + "-m", + "litellm_proxy/test-model", + ] + assert "--jobs-dir" in cmd + assert str(expected_output_dir.resolve()) in cmd + assert cmd.count("--include-task-name") == 2 + assert "task-a" in cmd + assert "task-b" in cmd + assert "benchflow/task-a" not in cmd + assert "--ae" not in cmd + assert cmd[cmd.index("--n-concurrent") + 1] == "2" + assert cmd[cmd.index("--n-tasks") + 1] == "3" + + def test_run_harbor_evaluation_retries_with_legacy_task_flag( + self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch + ) -> None: + """Test Harbor falls back to --include-task-name when --task-name fails.""" + captured_cmds: list[list[str]] = [] + + def fake_run(cmd: list[str], capture_output: bool, text: bool, env: dict): + captured_cmds.append(cmd) + if "--task-name" in cmd: + return type( + "Completed", + (), + { + "returncode": 2, + "stdout": "", + "stderr": "No such option: --task-name", + }, + )() + return type( + "Completed", + (), + {"returncode": 0, "stdout": "ok", "stderr": ""}, + )() + + monkeypatch.setattr("benchmarks.skillsbench.run_infer.subprocess.run", fake_run) + monkeypatch.setattr( + "benchmarks.skillsbench.run_infer._get_supported_task_filter_flag", + lambda harbor_exe: "--task-name", + ) + monkeypatch.setattr( + "benchmarks.skillsbench.run_infer._get_supported_agent_name", + lambda harbor_exe: "openhands", + ) + + run_harbor_evaluation( + llm=LLM(model="test-model"), + dataset=str(tmp_path / "tasks"), + dataset_is_path=True, + output_dir=str(tmp_path), + task_ids=["benchflow/task-a"], + ) + + assert len(captured_cmds) == 2 + assert "--task-name" in captured_cmds[0] + assert "--include-task-name" in captured_cmds[1] + + def test_llm_credentials_passed_via_env( + self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch + ) -> None: + """Test that LLM credentials are passed via subprocess env, not --ae flags.""" + captured: dict = {} + + def fake_run(cmd: list[str], capture_output: bool, text: bool, env: dict): + captured["cmd"] = cmd + captured["env"] = env + return type( + "Completed", + (), + {"returncode": 0, "stdout": "ok", "stderr": ""}, + )() + + monkeypatch.setattr("benchmarks.skillsbench.run_infer.subprocess.run", fake_run) + monkeypatch.setattr( + "benchmarks.skillsbench.run_infer._get_supported_task_filter_flag", + lambda harbor_exe: "--include-task-name", + ) + monkeypatch.setattr( + "benchmarks.skillsbench.run_infer._get_supported_agent_name", + lambda harbor_exe: "openhands", + ) + + run_harbor_evaluation( + llm=LLM( + model="test-model", + api_key="my-secret-key", + base_url="https://my-proxy.example.com", + ), + dataset=INFER_DEFAULTS["dataset"], + dataset_is_path=False, + output_dir=str(tmp_path), + ) + + assert captured["env"]["LLM_API_KEY"] == "my-secret-key" + assert captured["env"]["LLM_BASE_URL"] == "https://my-proxy.example.com" + assert "--ae" not in captured["cmd"] + + +class TestConvertHarborToEvalOutput: + """Tests for convert_harbor_to_eval_output function.""" + + def _create_harbor_structure( + self, 
tmp_path: Path, trials: list[tuple[str, dict]] + ) -> Path: + """Create a mock Harbor output structure.""" + harbor_dir = tmp_path / "harbor_output" + job_dir = harbor_dir / "2026-01-01__00-00-00" + job_dir.mkdir(parents=True) + (job_dir / "result.json").write_text(json.dumps({"id": "test-job"})) + + for trial_name, trial_result in trials: + trial_dir = job_dir / trial_name + trial_dir.mkdir() + (trial_dir / "result.json").write_text(json.dumps(trial_result)) + + return harbor_dir + + def test_successful_trial_parsing(self, tmp_path: Path) -> None: + """Test successful parsing of harbor trial result.""" + trial_result = { + "task_name": "benchflow/weighted-gdp-calc", + "trial_name": "weighted-gdp-calc__abc123", + "trial_uri": "file:///path/to/trial", + "agent_result": { + "n_input_tokens": 1000, + "n_output_tokens": 200, + "cost_usd": 0.05, + }, + "verifier_result": {"rewards": {"reward": 1.0}}, + "exception_info": None, + } + + harbor_dir = self._create_harbor_structure( + tmp_path, [("weighted-gdp-calc__abc123", trial_result)] + ) + output_file = tmp_path / "output.jsonl" + + convert_harbor_to_eval_output(harbor_dir, output_file) + + assert output_file.exists() + with open(output_file) as f: + entries = [json.loads(line) for line in f] + + assert len(entries) == 1 + assert entries[0]["instance_id"] == "benchflow/weighted-gdp-calc" + assert entries[0]["test_result"]["passed"] is True + assert entries[0]["metrics"]["total_cost_usd"] == 0.05 + + def test_local_trial_names_are_normalized_to_canonical_instance_ids( + self, tmp_path: Path + ) -> None: + """Test local Harbor task names without namespace keep benchflow ids.""" + trial_result = { + "task_name": "weighted-gdp-calc", + "trial_name": "weighted-gdp-calc__abc123", + "trial_uri": "file:///path/to/trial", + "agent_result": { + "n_input_tokens": 1000, + "n_output_tokens": 200, + "cost_usd": 0.05, + }, + "verifier_result": {"rewards": {"reward": 1.0}}, + "exception_info": None, + } + + harbor_dir = self._create_harbor_structure( + tmp_path, [("weighted-gdp-calc__abc123", trial_result)] + ) + output_file = tmp_path / "output.jsonl" + + convert_harbor_to_eval_output(harbor_dir, output_file) + + with open(output_file) as f: + entries = [json.loads(line) for line in f] + + assert entries[0]["instance_id"] == "benchflow/weighted-gdp-calc" + + def test_failed_trial(self, tmp_path: Path) -> None: + """Test parsing of a trial with reward 0.""" + trial_result = { + "task_name": "benchflow/task-1", + "trial_name": "task-1__xyz", + "agent_result": { + "n_input_tokens": None, + "n_output_tokens": None, + "cost_usd": None, + }, + "verifier_result": {"rewards": {"reward": 0.0}}, + "exception_info": None, + } + + harbor_dir = self._create_harbor_structure( + tmp_path, [("task-1__xyz", trial_result)] + ) + output_file = tmp_path / "output.jsonl" + convert_harbor_to_eval_output(harbor_dir, output_file) + + with open(output_file) as f: + entries = [json.loads(line) for line in f] + + assert entries[0]["test_result"]["passed"] is False + assert entries[0]["metrics"]["total_cost_usd"] == 0.0 + + def test_trial_with_exception(self, tmp_path: Path) -> None: + """Test that exception trials are written as error entries.""" + trial_result = { + "task_name": "benchflow/error-task", + "trial_name": "error-task__err", + "agent_result": {}, + "verifier_result": {}, + "exception_info": {"type": "ValueError", "message": "LLM_API_KEY not set"}, + } + + harbor_dir = self._create_harbor_structure( + tmp_path, [("error-task__err", trial_result)] + ) + output_file = 
+        convert_harbor_to_eval_output(harbor_dir, output_file)
+
+        with open(output_file) as f:
+            entries = [json.loads(line) for line in f]
+
+        assert len(entries) == 1
+        assert entries[0]["instance_id"] == "benchflow/error-task"
+        assert entries[0]["error"] is not None
+        assert entries[0]["test_result"] == {}
+
+    def test_missing_job_directory(self, tmp_path: Path) -> None:
+        """Test handling when no job directory exists."""
+        harbor_dir = tmp_path / "harbor_output"
+        harbor_dir.mkdir()
+
+        with pytest.raises(RuntimeError, match="No harbor job directory found"):
+            convert_harbor_to_eval_output(harbor_dir, tmp_path / "output.jsonl")
+
+    def test_empty_job_directory(self, tmp_path: Path) -> None:
+        """Test handling of harbor job dir with no trial subdirs."""
+        harbor_dir = tmp_path / "harbor_output"
+        job_dir = harbor_dir / "2026-01-01__00-00-00"
+        job_dir.mkdir(parents=True)
+        (job_dir / "result.json").write_text(json.dumps({"id": "test"}))
+
+        with pytest.raises(RuntimeError, match="No trial result files found"):
+            convert_harbor_to_eval_output(harbor_dir, tmp_path / "output.jsonl")
diff --git a/uv.lock b/uv.lock
index 2cd0b364..ec435075 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1282,6 +1282,7 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/44/69/9b804adb5fd0671f367781560eb5eb586c4d495277c93bde4307b9e28068/greenlet-3.2.4-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:3b67ca49f54cede0186854a008109d6ee71f66bd57bb36abd6d0a0267b540cdd", size = 274079, upload-time = "2025-08-07T13:15:45.033Z" },
     { url = "https://files.pythonhosted.org/packages/46/e9/d2a80c99f19a153eff70bc451ab78615583b8dac0754cfb942223d2c1a0d/greenlet-3.2.4-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ddf9164e7a5b08e9d22511526865780a576f19ddd00d62f8a665949327fde8bb", size = 640997, upload-time = "2025-08-07T13:42:56.234Z" },
     { url = "https://files.pythonhosted.org/packages/3b/16/035dcfcc48715ccd345f3a93183267167cdd162ad123cd93067d86f27ce4/greenlet-3.2.4-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:f28588772bb5fb869a8eb331374ec06f24a83a9c25bfa1f38b6993afe9c1e968", size = 655185, upload-time = "2025-08-07T13:45:27.624Z" },
+    { url = "https://files.pythonhosted.org/packages/31/da/0386695eef69ffae1ad726881571dfe28b41970173947e7c558d9998de0f/greenlet-3.2.4-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:5c9320971821a7cb77cfab8d956fa8e39cd07ca44b6070db358ceb7f8797c8c9", size = 649926, upload-time = "2025-08-07T13:53:15.251Z" },
     { url = "https://files.pythonhosted.org/packages/68/88/69bf19fd4dc19981928ceacbc5fd4bb6bc2215d53199e367832e98d1d8fe/greenlet-3.2.4-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:c60a6d84229b271d44b70fb6e5fa23781abb5d742af7b808ae3f6efd7c9c60f6", size = 651839, upload-time = "2025-08-07T13:18:30.281Z" },
     { url = "https://files.pythonhosted.org/packages/19/0d/6660d55f7373b2ff8152401a83e02084956da23ae58cddbfb0b330978fe9/greenlet-3.2.4-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3b3812d8d0c9579967815af437d96623f45c0f2ae5f04e366de62a12d83a8fb0", size = 607586, upload-time = "2025-08-07T13:18:28.544Z" },
     { url = "https://files.pythonhosted.org/packages/8e/1a/c953fdedd22d81ee4629afbb38d2f9d71e37d23caace44775a3a969147d4/greenlet-3.2.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:abbf57b5a870d30c4675928c37278493044d7c14378350b3aa5d484fa65575f0", size = 1123281, upload-time = "2025-08-07T13:42:39.858Z" },
@@ -1292,6 +1293,7 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/49/e8/58c7f85958bda41dafea50497cbd59738c5c43dbbea5ee83d651234398f4/greenlet-3.2.4-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:1a921e542453fe531144e91e1feedf12e07351b1cf6c9e8a3325ea600a715a31", size = 272814, upload-time = "2025-08-07T13:15:50.011Z" },
     { url = "https://files.pythonhosted.org/packages/62/dd/b9f59862e9e257a16e4e610480cfffd29e3fae018a68c2332090b53aac3d/greenlet-3.2.4-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cd3c8e693bff0fff6ba55f140bf390fa92c994083f838fece0f63be121334945", size = 641073, upload-time = "2025-08-07T13:42:57.23Z" },
     { url = "https://files.pythonhosted.org/packages/f7/0b/bc13f787394920b23073ca3b6c4a7a21396301ed75a655bcb47196b50e6e/greenlet-3.2.4-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:710638eb93b1fa52823aa91bf75326f9ecdfd5e0466f00789246a5280f4ba0fc", size = 655191, upload-time = "2025-08-07T13:45:29.752Z" },
+    { url = "https://files.pythonhosted.org/packages/f2/d6/6adde57d1345a8d0f14d31e4ab9c23cfe8e2cd39c3baf7674b4b0338d266/greenlet-3.2.4-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:c5111ccdc9c88f423426df3fd1811bfc40ed66264d35aa373420a34377efc98a", size = 649516, upload-time = "2025-08-07T13:53:16.314Z" },
     { url = "https://files.pythonhosted.org/packages/7f/3b/3a3328a788d4a473889a2d403199932be55b1b0060f4ddd96ee7cdfcad10/greenlet-3.2.4-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:d76383238584e9711e20ebe14db6c88ddcedc1829a9ad31a584389463b5aa504", size = 652169, upload-time = "2025-08-07T13:18:32.861Z" },
     { url = "https://files.pythonhosted.org/packages/ee/43/3cecdc0349359e1a527cbf2e3e28e5f8f06d3343aaf82ca13437a9aa290f/greenlet-3.2.4-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:23768528f2911bcd7e475210822ffb5254ed10d71f4028387e5a99b4c6699671", size = 610497, upload-time = "2025-08-07T13:18:31.636Z" },
     { url = "https://files.pythonhosted.org/packages/b8/19/06b6cf5d604e2c382a6f31cafafd6f33d5dea706f4db7bdab184bad2b21d/greenlet-3.2.4-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:00fadb3fedccc447f517ee0d3fd8fe49eae949e1cd0f6a611818f4f6fb7dc83b", size = 1121662, upload-time = "2025-08-07T13:42:41.117Z" },
@@ -1302,6 +1304,7 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/22/5c/85273fd7cc388285632b0498dbbab97596e04b154933dfe0f3e68156c68c/greenlet-3.2.4-cp314-cp314-macosx_11_0_universal2.whl", hash = "sha256:49a30d5fda2507ae77be16479bdb62a660fa51b1eb4928b524975b3bde77b3c0", size = 273586, upload-time = "2025-08-07T13:16:08.004Z" },
     { url = "https://files.pythonhosted.org/packages/d1/75/10aeeaa3da9332c2e761e4c50d4c3556c21113ee3f0afa2cf5769946f7a3/greenlet-3.2.4-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:299fd615cd8fc86267b47597123e3f43ad79c9d8a22bebdce535e53550763e2f", size = 686346, upload-time = "2025-08-07T13:42:59.944Z" },
     { url = "https://files.pythonhosted.org/packages/c0/aa/687d6b12ffb505a4447567d1f3abea23bd20e73a5bed63871178e0831b7a/greenlet-3.2.4-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:c17b6b34111ea72fc5a4e4beec9711d2226285f0386ea83477cbb97c30a3f3a5", size = 699218, upload-time = "2025-08-07T13:45:30.969Z" },
+    { url = "https://files.pythonhosted.org/packages/dc/8b/29aae55436521f1d6f8ff4e12fb676f3400de7fcf27fccd1d4d17fd8fecd/greenlet-3.2.4-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:b4a1870c51720687af7fa3e7cda6d08d801dae660f75a76f3845b642b4da6ee1", size = 694659, upload-time = "2025-08-07T13:53:17.759Z" },
     { url = "https://files.pythonhosted.org/packages/92/2e/ea25914b1ebfde93b6fc4ff46d6864564fba59024e928bdc7de475affc25/greenlet-3.2.4-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:061dc4cf2c34852b052a8620d40f36324554bc192be474b9e9770e8c042fd735", size = 695355, upload-time = "2025-08-07T13:18:34.517Z" },
     { url = "https://files.pythonhosted.org/packages/72/60/fc56c62046ec17f6b0d3060564562c64c862948c9d4bc8aa807cf5bd74f4/greenlet-3.2.4-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:44358b9bf66c8576a9f57a590d5f5d6e72fa4228b763d0e43fee6d3b06d3a337", size = 657512, upload-time = "2025-08-07T13:18:33.969Z" },
     { url = "https://files.pythonhosted.org/packages/23/6e/74407aed965a4ab6ddd93a7ded3180b730d281c77b765788419484cdfeef/greenlet-3.2.4-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:2917bdf657f5859fbf3386b12d68ede4cf1f04c90c3a6bc1f013dd68a22e2269", size = 1612508, upload-time = "2025-11-04T12:42:23.427Z" },
@@ -2467,6 +2470,7 @@ dependencies = [
     { name = "python-json-logger" },
     { name = "requests" },
     { name = "swebench" },
+    { name = "swesmith" },
     { name = "swt-bench" },
     { name = "tenacity" },
     { name = "toml" },
@@ -2521,6 +2525,7 @@ requires-dist = [
     { name = "python-json-logger", specifier = ">=3.3.0" },
     { name = "requests" },
     { name = "swebench", specifier = "==4.1.0" },
+    { name = "swesmith", specifier = ">=0.0.9" },
     { name = "swt-bench", git = "https://github.com/logic-star-ai/swt-bench.git?rev=5fdcd446ff05e248ecfffc19d560a210699f71f8" },
     { name = "tenacity", specifier = ">=9.1.2" },
     { name = "toml" },
@@ -6841,6 +6846,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/36/67/981d8b642ac3eac7c8a7b7832ff8b2fb74f96b28b5fcd9a8979879e5c46d/swebench-4.1.0-py3-none-any.whl", hash = "sha256:1243776f720047cc9e20a427f7a52b75c13a07abda6154fb60fe77f82ec8af57", size = 157231, upload-time = "2025-09-11T02:57:58.953Z" },
 ]
 
+[[package]]
+name = "swesmith"
+version = "0.0.9"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/07/97/e506b20fa59debc66e4660a86b0e98b45d32c87f23b994ad739e9c5d542a/swesmith-0.0.9.tar.gz", hash = "sha256:1726124ea43577853c6efb0a5a0db5fa3ce5c340e1bed479afa5bab85d8a69da", size = 214830, upload-time = "2026-02-27T01:06:13.455Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/80/2d/71b6ac5dadbe7199085de3815624775744d51b6c554efeeddfb12dc45ce1/swesmith-0.0.9-py3-none-any.whl", hash = "sha256:cbb98a52fc573b38032cde1179b6ce5f5862ce7c31d6931cfd5b8ad4969ce900", size = 275800, upload-time = "2026-02-27T01:06:11.864Z" },
+]
+
 [[package]]
 name = "swt-bench"
 version = "1.0.1"