From 1962a039f8fc869ee662cbc72f5df9190872043c Mon Sep 17 00:00:00 2001 From: enyst Date: Tue, 21 Apr 2026 14:35:16 +0000 Subject: [PATCH 1/2] feat: add gpt5 benchmark tool preset support Co-authored-by: openhands --- benchmarks/hybridgym_depsearch/run_infer.py | 4 ++ benchmarks/hybridgym_funcgen/run_infer.py | 4 ++ .../hybridgym_funclocalize/run_infer.py | 4 ++ .../hybridgym_issuelocalize/run_infer.py | 4 ++ benchmarks/swebench/run_infer.py | 6 ++- benchmarks/swebenchmultilingual/run_infer.py | 6 ++- benchmarks/utils/args_parser.py | 5 +- benchmarks/utils/models.py | 3 +- tests/test_tool_presets.py | 50 +++++++++++++++++++ 9 files changed, 81 insertions(+), 5 deletions(-) create mode 100644 tests/test_tool_presets.py diff --git a/benchmarks/hybridgym_depsearch/run_infer.py b/benchmarks/hybridgym_depsearch/run_infer.py index 51fe2d00a..444c7f97c 100644 --- a/benchmarks/hybridgym_depsearch/run_infer.py +++ b/benchmarks/hybridgym_depsearch/run_infer.py @@ -265,6 +265,10 @@ def _get_tools(self, preset: ToolPresetType = "default") -> list[Tool]: from openhands.tools.preset.gemini import get_gemini_tools return get_gemini_tools(enable_browser=False) + elif preset == "gpt5": + from openhands.tools.preset.gpt5 import get_gpt5_tools + + return get_gpt5_tools(enable_browser=False) elif preset == "planning": from openhands.tools.preset.planning import get_planning_tools diff --git a/benchmarks/hybridgym_funcgen/run_infer.py b/benchmarks/hybridgym_funcgen/run_infer.py index 9152bfe23..2afaaf7e9 100644 --- a/benchmarks/hybridgym_funcgen/run_infer.py +++ b/benchmarks/hybridgym_funcgen/run_infer.py @@ -366,6 +366,10 @@ def _get_tools(self, preset: ToolPresetType = "default") -> list[Tool]: from openhands.tools.preset.gemini import get_gemini_tools return get_gemini_tools(enable_browser=False) + elif preset == "gpt5": + from openhands.tools.preset.gpt5 import get_gpt5_tools + + return get_gpt5_tools(enable_browser=False) elif preset == "planning": from openhands.tools.preset.planning import get_planning_tools diff --git a/benchmarks/hybridgym_funclocalize/run_infer.py b/benchmarks/hybridgym_funclocalize/run_infer.py index 465d0ddb9..a1878fa2d 100644 --- a/benchmarks/hybridgym_funclocalize/run_infer.py +++ b/benchmarks/hybridgym_funclocalize/run_infer.py @@ -351,6 +351,10 @@ def _get_tools(self, preset: ToolPresetType = "default") -> list[Tool]: from openhands.tools.preset.gemini import get_gemini_tools return get_gemini_tools(enable_browser=False) + elif preset == "gpt5": + from openhands.tools.preset.gpt5 import get_gpt5_tools + + return get_gpt5_tools(enable_browser=False) elif preset == "planning": from openhands.tools.preset.planning import get_planning_tools diff --git a/benchmarks/hybridgym_issuelocalize/run_infer.py b/benchmarks/hybridgym_issuelocalize/run_infer.py index 0ff3abf6b..e7b02c81c 100644 --- a/benchmarks/hybridgym_issuelocalize/run_infer.py +++ b/benchmarks/hybridgym_issuelocalize/run_infer.py @@ -260,6 +260,10 @@ def _get_tools(self, preset: ToolPresetType = "default") -> list[Tool]: from openhands.tools.preset.gemini import get_gemini_tools return get_gemini_tools(enable_browser=False) + elif preset == "gpt5": + from openhands.tools.preset.gpt5 import get_gpt5_tools + + return get_gpt5_tools(enable_browser=False) elif preset == "planning": from openhands.tools.preset.planning import get_planning_tools diff --git a/benchmarks/swebench/run_infer.py b/benchmarks/swebench/run_infer.py index 9482efd7b..b410700f6 100644 --- a/benchmarks/swebench/run_infer.py +++ b/benchmarks/swebench/run_infer.py @@ -61,7 +61,7 @@ def get_tools_for_preset( """Get the list of tools for the given preset. Args: - preset: The tool preset to use (default, gemini, or planning). + preset: The tool preset to use (default, gemini, gpt5, or planning). enable_browser: Whether to include browser tools. Returns: @@ -71,6 +71,10 @@ def get_tools_for_preset( from openhands.tools.preset.gemini import get_gemini_tools return get_gemini_tools(enable_browser=enable_browser) + elif preset == "gpt5": + from openhands.tools.preset.gpt5 import get_gpt5_tools + + return get_gpt5_tools(enable_browser=enable_browser) elif preset == "planning": from openhands.tools.preset.planning import get_planning_tools diff --git a/benchmarks/swebenchmultilingual/run_infer.py b/benchmarks/swebenchmultilingual/run_infer.py index 91a3732d9..ae87daa8c 100644 --- a/benchmarks/swebenchmultilingual/run_infer.py +++ b/benchmarks/swebenchmultilingual/run_infer.py @@ -51,7 +51,7 @@ def get_tools_for_preset( """Get the list of tools for the given preset. Args: - preset: The tool preset to use (default, gemini, or planning). + preset: The tool preset to use (default, gemini, gpt5, or planning). enable_browser: Whether to include browser tools. Returns: @@ -61,6 +61,10 @@ def get_tools_for_preset( from openhands.tools.preset.gemini import get_gemini_tools return get_gemini_tools(enable_browser=enable_browser) + elif preset == "gpt5": + from openhands.tools.preset.gpt5 import get_gpt5_tools + + return get_gpt5_tools(enable_browser=enable_browser) elif preset == "planning": from openhands.tools.preset.planning import get_planning_tools diff --git a/benchmarks/utils/args_parser.py b/benchmarks/utils/args_parser.py index 921f01eb0..4df17499b 100644 --- a/benchmarks/utils/args_parser.py +++ b/benchmarks/utils/args_parser.py @@ -89,10 +89,11 @@ def get_parser(add_llm_config: bool = True) -> argparse.ArgumentParser: "--tool-preset", type=str, default="default", - choices=["default", "gemini", "planning"], + choices=["default", "gemini", "gpt5", "planning"], help=( "Tool preset for file editing. 'default' uses FileEditorTool, " - "'gemini' uses read_file/write_file/edit/list_directory (default: default)" + "'gemini' uses read_file/write_file/edit/list_directory, " + "'gpt5' uses apply_patch tool (default: default)" ), ) parser.add_argument( diff --git a/benchmarks/utils/models.py b/benchmarks/utils/models.py index 45c6dab88..9dd471383 100644 --- a/benchmarks/utils/models.py +++ b/benchmarks/utils/models.py @@ -14,7 +14,7 @@ # Tool preset type for selecting which file editing toolset to use -ToolPresetType = Literal["default", "gemini", "planning"] +ToolPresetType = Literal["default", "gemini", "gpt5", "planning"] class EvalMetadata(BaseModel): @@ -101,6 +101,7 @@ class EvalMetadata(BaseModel): description=( "Tool preset for file editing. 'default' uses FileEditorTool, " "'gemini' uses read_file/write_file/edit/list_directory, " + "'gpt5' uses apply_patch tool, " "'planning' uses planning-mode tools." ), ) diff --git a/tests/test_tool_presets.py b/tests/test_tool_presets.py new file mode 100644 index 000000000..0cab3a7db --- /dev/null +++ b/tests/test_tool_presets.py @@ -0,0 +1,50 @@ +import pytest + +from benchmarks.hybridgym_depsearch.run_infer import DepSearchEvaluation +from benchmarks.hybridgym_funcgen.run_infer import FuncGenEvaluation +from benchmarks.hybridgym_funclocalize.run_infer import FuncLocalizeEvaluation +from benchmarks.hybridgym_issuelocalize.run_infer import IssueLocalizeEvaluation +from benchmarks.swebench.run_infer import get_tools_for_preset as get_swebench_tools +from benchmarks.swebenchmultilingual.run_infer import ( + get_tools_for_preset as get_swebenchmultilingual_tools, +) +from benchmarks.utils.args_parser import get_parser +from openhands.tools.preset.gpt5 import get_gpt5_tools + + +@pytest.mark.parametrize( + "getter", + [ + get_swebench_tools, + get_swebenchmultilingual_tools, + ], +) +def test_swebench_tool_helpers_support_gpt5(getter): + expected = [tool.name for tool in get_gpt5_tools(enable_browser=False)] + + assert [tool.name for tool in getter("gpt5", enable_browser=False)] == expected + + +@pytest.mark.parametrize( + "evaluation_cls", + [ + DepSearchEvaluation, + FuncGenEvaluation, + FuncLocalizeEvaluation, + IssueLocalizeEvaluation, + ], +) +def test_hybridgym_tool_helpers_support_gpt5(evaluation_cls): + expected = [tool.name for tool in get_gpt5_tools(enable_browser=False)] + + assert [ + tool.name for tool in evaluation_cls._get_tools(None, preset="gpt5") + ] == expected + + +def test_common_parser_accepts_gpt5_tool_preset(): + parser = get_parser(add_llm_config=False) + + args = parser.parse_args(["--tool-preset", "gpt5"]) + + assert args.tool_preset == "gpt5" From f91c93b61663992bf68e0c13a48dd05cad233c5f Mon Sep 17 00:00:00 2001 From: enyst Date: Tue, 21 Apr 2026 14:45:08 +0000 Subject: [PATCH 2/2] test: fix stale multimodal parser default expectation Co-authored-by: openhands --- tests/test_multimodal_phased_build.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_multimodal_phased_build.py b/tests/test_multimodal_phased_build.py index 84c833bf7..54c44a565 100644 --- a/tests/test_multimodal_phased_build.py +++ b/tests/test_multimodal_phased_build.py @@ -278,6 +278,7 @@ def test_force_build_forwarded(self, _collect, mock_builder, _bases, mock_assemb class TestMultimodalParser: def test_defaults(self): from benchmarks.swebenchmultimodal.build_images import get_parser + from benchmarks.swebenchmultimodal.config import BUILD_DEFAULTS parser = get_parser() args = parser.parse_args([]) @@ -287,4 +288,4 @@ def test_defaults(self): assert args.push is False assert args.force_build is False assert args.n_limit == 0 - assert args.select is None + assert args.select == BUILD_DEFAULTS["select"]