Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions benchmarks/hybridgym_depsearch/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -265,6 +265,10 @@ def _get_tools(self, preset: ToolPresetType = "default") -> list[Tool]:
from openhands.tools.preset.gemini import get_gemini_tools

return get_gemini_tools(enable_browser=False)
elif preset == "gpt5":
from openhands.tools.preset.gpt5 import get_gpt5_tools

return get_gpt5_tools(enable_browser=False)
elif preset == "planning":
from openhands.tools.preset.planning import get_planning_tools

Expand Down
4 changes: 4 additions & 0 deletions benchmarks/hybridgym_funcgen/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -366,6 +366,10 @@ def _get_tools(self, preset: ToolPresetType = "default") -> list[Tool]:
from openhands.tools.preset.gemini import get_gemini_tools

return get_gemini_tools(enable_browser=False)
elif preset == "gpt5":
from openhands.tools.preset.gpt5 import get_gpt5_tools

return get_gpt5_tools(enable_browser=False)
elif preset == "planning":
from openhands.tools.preset.planning import get_planning_tools

Expand Down
4 changes: 4 additions & 0 deletions benchmarks/hybridgym_funclocalize/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -351,6 +351,10 @@ def _get_tools(self, preset: ToolPresetType = "default") -> list[Tool]:
from openhands.tools.preset.gemini import get_gemini_tools

return get_gemini_tools(enable_browser=False)
elif preset == "gpt5":
from openhands.tools.preset.gpt5 import get_gpt5_tools

return get_gpt5_tools(enable_browser=False)
elif preset == "planning":
from openhands.tools.preset.planning import get_planning_tools

Expand Down
4 changes: 4 additions & 0 deletions benchmarks/hybridgym_issuelocalize/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -260,6 +260,10 @@ def _get_tools(self, preset: ToolPresetType = "default") -> list[Tool]:
from openhands.tools.preset.gemini import get_gemini_tools

return get_gemini_tools(enable_browser=False)
elif preset == "gpt5":
from openhands.tools.preset.gpt5 import get_gpt5_tools

return get_gpt5_tools(enable_browser=False)
elif preset == "planning":
from openhands.tools.preset.planning import get_planning_tools

Expand Down
6 changes: 5 additions & 1 deletion benchmarks/swebench/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ def get_tools_for_preset(
"""Get the list of tools for the given preset.
Args:
preset: The tool preset to use (default, gemini, or planning).
preset: The tool preset to use (default, gemini, gpt5, or planning).
enable_browser: Whether to include browser tools.
Returns:
Expand All @@ -71,6 +71,10 @@ def get_tools_for_preset(
from openhands.tools.preset.gemini import get_gemini_tools

return get_gemini_tools(enable_browser=enable_browser)
elif preset == "gpt5":
from openhands.tools.preset.gpt5 import get_gpt5_tools

return get_gpt5_tools(enable_browser=enable_browser)
elif preset == "planning":
from openhands.tools.preset.planning import get_planning_tools

Expand Down
6 changes: 5 additions & 1 deletion benchmarks/swebenchmultilingual/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ def get_tools_for_preset(
"""Get the list of tools for the given preset.

Args:
preset: The tool preset to use (default, gemini, or planning).
preset: The tool preset to use (default, gemini, gpt5, or planning).
enable_browser: Whether to include browser tools.

Returns:
Expand All @@ -61,6 +61,10 @@ def get_tools_for_preset(
from openhands.tools.preset.gemini import get_gemini_tools

return get_gemini_tools(enable_browser=enable_browser)
elif preset == "gpt5":
from openhands.tools.preset.gpt5 import get_gpt5_tools

return get_gpt5_tools(enable_browser=enable_browser)
elif preset == "planning":
from openhands.tools.preset.planning import get_planning_tools

Expand Down
5 changes: 3 additions & 2 deletions benchmarks/utils/args_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,10 +89,11 @@ def get_parser(add_llm_config: bool = True) -> argparse.ArgumentParser:
"--tool-preset",
type=str,
default="default",
choices=["default", "gemini", "planning"],
choices=["default", "gemini", "gpt5", "planning"],
help=(
"Tool preset for file editing. 'default' uses FileEditorTool, "
"'gemini' uses read_file/write_file/edit/list_directory (default: default)"
"'gemini' uses read_file/write_file/edit/list_directory, "
"'gpt5' uses apply_patch tool (default: default)"
),
)
parser.add_argument(
Expand Down
3 changes: 2 additions & 1 deletion benchmarks/utils/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@


# Tool preset type for selecting which file editing toolset to use
ToolPresetType = Literal["default", "gemini", "planning"]
ToolPresetType = Literal["default", "gemini", "gpt5", "planning"]


class EvalMetadata(BaseModel):
Expand Down Expand Up @@ -101,6 +101,7 @@ class EvalMetadata(BaseModel):
description=(
"Tool preset for file editing. 'default' uses FileEditorTool, "
"'gemini' uses read_file/write_file/edit/list_directory, "
"'gpt5' uses apply_patch tool, "
"'planning' uses planning-mode tools."
),
)
Expand Down
3 changes: 2 additions & 1 deletion tests/test_multimodal_phased_build.py
Original file line number Diff line number Diff line change
Expand Up @@ -278,6 +278,7 @@ def test_force_build_forwarded(self, _collect, mock_builder, _bases, mock_assemb
class TestMultimodalParser:
def test_defaults(self):
from benchmarks.swebenchmultimodal.build_images import get_parser
from benchmarks.swebenchmultimodal.config import BUILD_DEFAULTS

parser = get_parser()
args = parser.parse_args([])
Expand All @@ -287,4 +288,4 @@ def test_defaults(self):
assert args.push is False
assert args.force_build is False
assert args.n_limit == 0
assert args.select is None
assert args.select == BUILD_DEFAULTS["select"]
50 changes: 50 additions & 0 deletions tests/test_tool_presets.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
import pytest

from benchmarks.hybridgym_depsearch.run_infer import DepSearchEvaluation
from benchmarks.hybridgym_funcgen.run_infer import FuncGenEvaluation
from benchmarks.hybridgym_funclocalize.run_infer import FuncLocalizeEvaluation
from benchmarks.hybridgym_issuelocalize.run_infer import IssueLocalizeEvaluation
from benchmarks.swebench.run_infer import get_tools_for_preset as get_swebench_tools
from benchmarks.swebenchmultilingual.run_infer import (
get_tools_for_preset as get_swebenchmultilingual_tools,
)
from benchmarks.utils.args_parser import get_parser
from openhands.tools.preset.gpt5 import get_gpt5_tools


@pytest.mark.parametrize(
    "getter",
    [get_swebench_tools, get_swebenchmultilingual_tools],
)
def test_swebench_tool_helpers_support_gpt5(getter):
    """Check that a module-level preset helper resolves the "gpt5" preset.

    The helper under test is called with ``"gpt5"`` and the browser disabled;
    the names of the tools it returns must equal, in order, the names of the
    tools returned by ``get_gpt5_tools(enable_browser=False)``.
    """
    # Reference list comes straight from the preset factory.
    reference_names = [t.name for t in get_gpt5_tools(enable_browser=False)]
    resolved_names = [t.name for t in getter("gpt5", enable_browser=False)]

    assert resolved_names == reference_names


@pytest.mark.parametrize(
    "evaluation_cls",
    [
        DepSearchEvaluation,
        FuncGenEvaluation,
        FuncLocalizeEvaluation,
        IssueLocalizeEvaluation,
    ],
)
def test_hybridgym_tool_helpers_support_gpt5(evaluation_cls):
    """Check that an evaluation class's ``_get_tools`` resolves "gpt5".

    ``_get_tools`` is invoked through the class with ``None`` in place of
    ``self``, so no evaluation instance is constructed. The returned tool
    names must equal, in order, those of
    ``get_gpt5_tools(enable_browser=False)``.
    """
    reference_names = [t.name for t in get_gpt5_tools(enable_browser=False)]

    # Unbound call: None stands in for the instance.
    resolved_tools = evaluation_cls._get_tools(None, preset="gpt5")
    resolved_names = [t.name for t in resolved_tools]

    assert resolved_names == reference_names


def test_common_parser_accepts_gpt5_tool_preset():
    """Check that the shared CLI parser accepts ``--tool-preset gpt5``.

    The parser is built with ``add_llm_config=False`` and the parsed value
    must be stored on the namespace as ``tool_preset == "gpt5"``.
    """
    cli_argv = ["--tool-preset", "gpt5"]
    namespace = get_parser(add_llm_config=False).parse_args(cli_argv)

    assert namespace.tool_preset == "gpt5"
Loading