Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion benchmarks/skillsbench/config.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
"""SkillsBench configuration defaults."""

from benchmarks.utils.harbor_compat import get_harbor_dataset


# Default inference settings (only include values actually used by argparse).
# The dataset name comes from get_harbor_dataset(); the superseded hard-coded
# "benchflow/skillsbench" entry is removed because repeating the "dataset" key
# in one dict literal silently discards the first value.
INFER_DEFAULTS = {
    "dataset": get_harbor_dataset("skillsbench"),
    "output_dir": "./evaluation_outputs",
    "num_workers": 1,
}
Expand Down
5 changes: 4 additions & 1 deletion benchmarks/terminalbench/config.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
"""Terminal-Bench configuration defaults."""

from benchmarks.utils.harbor_compat import get_harbor_dataset


# Default inference settings (only include values actually used by argparse).
# The dataset name comes from get_harbor_dataset(); the superseded hard-coded
# "terminal-bench@2.0" entry is removed because repeating the "dataset" key
# in one dict literal silently discards the first value.
INFER_DEFAULTS = {
    "dataset": get_harbor_dataset("terminalbench"),
    "output_dir": "./evaluation_outputs",
    "num_workers": 1,
}
Expand Down
46 changes: 46 additions & 0 deletions benchmarks/utils/harbor_compat.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
"""Harbor dataset names for benchmarks with Harbor-backed wrappers."""

from __future__ import annotations


# Benchmark key -> Harbor dataset name.
# Lookups go through normalize_benchmark_name(), so every key here must already
# be in that normalized form: lowercase, with no "-", "_" or "/".
HARBOR_DATASET_BY_BENCHMARK: dict[str, str] = {
    "gaia": "gaia",
    "multiswebench": "multi-swe-bench",
    "skillsbench": "benchflow/skillsbench",
    "swebench": "swebench-verified",
    "swebenchmultilingual": "swebench_multilingual",
    "swebenchpro": "swebenchpro",
    "swesmith": "swesmith",
    "swtbench": "swtbench-verified",
    "swegym": "swegym",
    "terminalbench": "terminal-bench@2.0",
}


# Deletion table for the separator characters ignored in benchmark names.
_SEPARATOR_DELETIONS = str.maketrans("", "", "-_/")


def normalize_benchmark_name(benchmark_name: str) -> str:
    """Return the package/module key used in this repo for a benchmark name.

    Surrounding whitespace is trimmed, the text is lowercased, and every
    ``-``, ``_`` and ``/`` character is removed. Interior whitespace and any
    other characters are left as they are.
    """
    trimmed_lower = benchmark_name.strip().lower()
    return trimmed_lower.translate(_SEPARATOR_DELETIONS)


def is_harbor_covered(benchmark_name: str) -> bool:
    """Tell whether the normalized benchmark name has a Harbor dataset entry."""
    lookup_key = normalize_benchmark_name(benchmark_name)
    return lookup_key in HARBOR_DATASET_BY_BENCHMARK


def get_harbor_dataset(benchmark_name: str) -> str:
    """Look up the Harbor dataset name registered for a benchmark.

    The name is normalized with ``normalize_benchmark_name`` before the lookup.

    Raises:
        KeyError: If the normalized name has no entry. The message quotes the
            name as given and lists the supported keys in sorted order; the
            original lookup error is chained as the cause.
    """
    lookup_key = normalize_benchmark_name(benchmark_name)
    try:
        dataset = HARBOR_DATASET_BY_BENCHMARK[lookup_key]
    except KeyError as exc:
        known_keys = sorted(HARBOR_DATASET_BY_BENCHMARK)
        message = (
            f"No Harbor dataset mapping for {benchmark_name!r}. "
            f"Supported benchmarks: {', '.join(known_keys)}"
        )
        raise KeyError(message) from exc
    return dataset
43 changes: 43 additions & 0 deletions tests/test_harbor_compat.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
"""Tests for Harbor benchmark compatibility mappings."""

import pytest

from benchmarks.terminalbench.config import INFER_DEFAULTS as TERMINAL_DEFAULTS
from benchmarks.utils.harbor_compat import (
HARBOR_DATASET_BY_BENCHMARK,
get_harbor_dataset,
is_harbor_covered,
normalize_benchmark_name,
)


def test_known_harbor_dataset_mappings() -> None:
    """Test the Harbor dataset name returned for each covered benchmark key.

    Includes ``skillsbench``, whose config default is resolved through
    ``get_harbor_dataset`` as well.
    """
    expected_datasets = {
        "swebench": "swebench-verified",
        "swebenchpro": "swebenchpro",
        "swebenchmultilingual": "swebench_multilingual",
        "swtbench": "swtbench-verified",
        "swesmith": "swesmith",
        "multiswebench": "multi-swe-bench",
        "gaia": "gaia",
        "terminalbench": "terminal-bench@2.0",
        "swegym": "swegym",
        "skillsbench": "benchflow/skillsbench",
    }
    for benchmark, dataset in expected_datasets.items():
        # Name the failing key so a mismatch is identifiable from the report.
        assert get_harbor_dataset(benchmark) == dataset, benchmark


def test_name_normalization_accepts_cli_style_names() -> None:
    """Check that hyphen/underscore spellings resolve like the plain key."""
    normalized = normalize_benchmark_name("SWE-Bench_Pro")
    assert normalized == "swebenchpro"

    pro_dataset = get_harbor_dataset("swe-bench-pro")
    assert pro_dataset == "swebenchpro"

    terminal_dataset = get_harbor_dataset("terminal-bench")
    assert terminal_dataset == "terminal-bench@2.0"


def test_uncovered_benchmark_is_explicit() -> None:
    """Check that a benchmark absent from the mapping is reported, not guessed."""
    unmapped = "commit0"
    assert is_harbor_covered(unmapped) is False
    with pytest.raises(KeyError, match="No Harbor dataset mapping"):
        get_harbor_dataset(unmapped)


def test_terminalbench_default_uses_mapping() -> None:
    """Check that the Terminal-Bench default dataset equals the mapping entry."""
    mapped_dataset = HARBOR_DATASET_BY_BENCHMARK["terminalbench"]
    assert TERMINAL_DEFAULTS["dataset"] == mapped_dataset
Loading