Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions benchmarks/commit0/eval_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
import sys
from pathlib import Path

from benchmarks.utils.laminar import LaminarService
from benchmarks.utils.report_costs import generate_cost_report


Expand Down Expand Up @@ -199,6 +200,9 @@ def main() -> None:
# Process results and generate report
process_commit0_results(str(input_file), str(output_file), args.model_name)

# Update Laminar datapoints with evaluation scores
LaminarService.get().update_evaluation_scores(str(input_file), str(output_file))

# Generate cost report as final step
generate_cost_report(str(input_file))

Expand Down
6 changes: 5 additions & 1 deletion benchmarks/commit0/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,9 @@ def prepare_instances(self) -> List[EvalInstance]:
logger.info("Total instances to process: %d", len(instances))
return instances

def prepare_workspace(self, instance: EvalInstance) -> RemoteWorkspace:
def prepare_workspace(
self, instance: EvalInstance, forward_env: list[str] | None = None
) -> RemoteWorkspace:
Comment thread
Rainhunter13 marked this conversation as resolved.
"""
Create workspace and set up the commit0 repository.
"""
Expand All @@ -167,6 +169,7 @@ def prepare_workspace(self, instance: EvalInstance) -> RemoteWorkspace:
base_image=base_docker_image,
working_dir="/workspace",
target=build_target,
forward_env=forward_env or [],
)
logger.info(
f"Building workspace from {base_docker_image}. This may take a while..."
Expand Down Expand Up @@ -201,6 +204,7 @@ def prepare_workspace(self, instance: EvalInstance) -> RemoteWorkspace:
runtime_api_key=runtime_api_key,
server_image=agent_server_image,
target_type="source" if "source" in build_target else "binary",
forward_env=forward_env or [],
)
else:
raise ValueError(
Expand Down
4 changes: 4 additions & 0 deletions benchmarks/gaia/eval_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
import sys
from pathlib import Path

from benchmarks.utils.laminar import LaminarService
from benchmarks.utils.report_costs import generate_cost_report
from openhands.sdk import get_logger

Expand Down Expand Up @@ -226,6 +227,9 @@ def main() -> None:
args.model_name,
)

# Update Laminar datapoints with evaluation scores
LaminarService.get().update_evaluation_scores(str(input_file), str(output_file))

# Generate cost report as final step
generate_cost_report(str(input_file))

Expand Down
6 changes: 5 additions & 1 deletion benchmarks/gaia/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,9 @@ def prepare_instances(self) -> List[EvalInstance]:
logger.info(f"Total instances to process: {len(instances)}")
return instances

def prepare_workspace(self, instance: EvalInstance) -> RemoteWorkspace:
def prepare_workspace(
self, instance: EvalInstance, forward_env: list[str] | None = None
) -> RemoteWorkspace:
"""Create workspace and copy necessary files."""
logger.info(f"Preparing workspace for instance {instance.id}")

Expand All @@ -125,6 +127,7 @@ def prepare_workspace(self, instance: EvalInstance) -> RemoteWorkspace:
workspace = DockerDevWorkspace(
base_image="nikolaik/python-nodejs:python3.12-nodejs22",
working_dir="/workspace",
forward_env=forward_env or [],
)
elif self.metadata.workspace_type == "remote":
# For workflow, use APIRemoteWorkspace with pre-built GAIA image
Expand Down Expand Up @@ -159,6 +162,7 @@ def prepare_workspace(self, instance: EvalInstance) -> RemoteWorkspace:
runtime_api_key=runtime_api_key,
server_image=agent_server_image,
target_type="binary", # GAIA images use binary target
forward_env=forward_env or [],
)
else:
raise ValueError(
Expand Down
6 changes: 6 additions & 0 deletions benchmarks/multiswebench/eval_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from benchmarks.multiswebench.scripts.eval.update_multi_swe_bench_config import (
update_multi_swe_config,
)
from benchmarks.utils.laminar import LaminarService
from openhands.sdk import get_logger


Expand Down Expand Up @@ -143,6 +144,11 @@ def main():
shutil.move(str(results_file), str(output_report_path))
logger.info(f"Report moved to {output_report_path}")

# Update Laminar datapoints with evaluation scores
LaminarService.get().update_evaluation_scores(
str(args.input_file), str(output_report_path)
)


if __name__ == "__main__":
main()
6 changes: 5 additions & 1 deletion benchmarks/multiswebench/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,9 @@ def prepare_instances(self) -> List[EvalInstance]:
return instances

# ---- Hook: prepare a workspace per instance ----------------------------------
def prepare_workspace(self, instance: EvalInstance) -> RemoteWorkspace:
def prepare_workspace(
self, instance: EvalInstance, forward_env: list[str] | None = None
) -> RemoteWorkspace:
"""
Use DockerWorkspace by default.
"""
Expand Down Expand Up @@ -228,6 +230,7 @@ def prepare_workspace(self, instance: EvalInstance) -> RemoteWorkspace:
workspace = DockerWorkspace(
server_image=agent_server_image,
working_dir="/workspace",
forward_env=forward_env or [],
)
elif self.metadata.workspace_type == "remote":
runtime_api_key = os.getenv("RUNTIME_API_KEY")
Expand Down Expand Up @@ -255,6 +258,7 @@ def prepare_workspace(self, instance: EvalInstance) -> RemoteWorkspace:
runtime_api_key=runtime_api_key,
server_image=agent_server_image,
target_type="source" if "source" in build_target else "binary",
forward_env=forward_env or [],
)
else:
raise ValueError(
Expand Down
5 changes: 4 additions & 1 deletion benchmarks/openagentsafety/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -359,14 +359,17 @@ def prepare_instances(self) -> List[EvalInstance]:
logger.info("Total instances to process: %d", len(instances))
return instances

def prepare_workspace(self, instance: EvalInstance) -> RemoteWorkspace:
def prepare_workspace(
self, instance: EvalInstance, forward_env: list[str] | None = None
) -> RemoteWorkspace:
"""Create a fresh Docker workspace for this instance."""
server_image = build_workspace_image()

workspace = DockerWorkspace(
server_image=server_image,
platform="linux/amd64",
extra_ports=True,
forward_env=forward_env or [],
)

# Setup host mapping for The Agent Company services
Expand Down
6 changes: 6 additions & 0 deletions benchmarks/swebench/eval_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
import sys
from pathlib import Path

from benchmarks.utils.laminar import LaminarService
from benchmarks.utils.patch_utils import remove_files_from_patch
from benchmarks.utils.report_costs import generate_cost_report
from openhands.sdk import get_logger
Expand Down Expand Up @@ -265,6 +266,11 @@ def main() -> None:
shutil.move(str(report_path), str(dest_report_path))
logger.info(f"Moved report file to: {dest_report_path}")

# Update Laminar datapoints with evaluation scores
LaminarService.get().update_evaluation_scores(
str(input_file), str(dest_report_path)
)

# Generate cost report as final step
generate_cost_report(str(input_file))

Expand Down
6 changes: 5 additions & 1 deletion benchmarks/swebench/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,9 @@ def prepare_instances(self) -> List[EvalInstance]:
return instances

# ---- Hook: prepare a workspace per instance ----------------------------------
def prepare_workspace(self, instance: EvalInstance) -> RemoteWorkspace:
def prepare_workspace(
self, instance: EvalInstance, forward_env: list[str] | None = None
) -> RemoteWorkspace:
"""
Use DockerWorkspace by default.
"""
Expand Down Expand Up @@ -148,6 +150,7 @@ def prepare_workspace(self, instance: EvalInstance) -> RemoteWorkspace:
workspace = DockerWorkspace(
server_image=agent_server_image,
working_dir="/workspace",
forward_env=forward_env or [],
)
elif self.metadata.workspace_type == "remote":
runtime_api_key = os.getenv("RUNTIME_API_KEY")
Expand Down Expand Up @@ -175,6 +178,7 @@ def prepare_workspace(self, instance: EvalInstance) -> RemoteWorkspace:
runtime_api_key=runtime_api_key,
server_image=agent_server_image,
target_type="source" if "source" in build_target else "binary",
forward_env=forward_env or [],
)
else:
raise ValueError(
Expand Down
6 changes: 5 additions & 1 deletion benchmarks/swebenchmultimodal/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,9 @@ def prepare_instances(self) -> List[EvalInstance]:
return instances

# ---- Hook: prepare a workspace per instance ----------------------------------
def prepare_workspace(self, instance: EvalInstance) -> RemoteWorkspace:
def prepare_workspace(
self, instance: EvalInstance, forward_env: list[str] | None = None
) -> RemoteWorkspace:
"""
Use DockerWorkspace by default.
"""
Expand Down Expand Up @@ -148,6 +150,7 @@ def prepare_workspace(self, instance: EvalInstance) -> RemoteWorkspace:
workspace = DockerWorkspace(
server_image=agent_server_image,
working_dir="/workspace",
forward_env=forward_env or [],
)
elif self.metadata.workspace_type == "remote":
runtime_api_key = os.getenv("RUNTIME_API_KEY")
Expand Down Expand Up @@ -175,6 +178,7 @@ def prepare_workspace(self, instance: EvalInstance) -> RemoteWorkspace:
runtime_api_key=runtime_api_key,
server_image=agent_server_image,
target_type="source" if "source" in build_target else "binary",
forward_env=forward_env or [],
)
else:
raise ValueError(
Expand Down
6 changes: 6 additions & 0 deletions benchmarks/swtbench/eval_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
import sys
from pathlib import Path

from benchmarks.utils.laminar import LaminarService
from benchmarks.utils.patch_utils import remove_files_from_patch
from benchmarks.utils.report_costs import generate_cost_report
from openhands.sdk import get_logger
Expand Down Expand Up @@ -380,6 +381,11 @@ def main() -> None:
logger.info(f"Moved evaluation report to: {target_file}")
update_report_with_submitted_instances(target_file, output_file)

# Update Laminar datapoints with evaluation scores
LaminarService.get().update_evaluation_scores(
str(input_file), str(target_file)
)

# Generate cost report as final step
generate_cost_report(str(input_file))

Expand Down
7 changes: 6 additions & 1 deletion benchmarks/swtbench/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,9 @@ def prepare_instances(self) -> List[EvalInstance]:
return instances

# ---- Hook: prepare a workspace per instance ----------------------------------
def prepare_workspace(self, instance: EvalInstance) -> RemoteWorkspace:
def prepare_workspace(
self, instance: EvalInstance, forward_env: list[str] | None = None
) -> RemoteWorkspace:
"""
Create workspace based on workspace_type (docker or remote).
"""
Expand Down Expand Up @@ -168,11 +170,13 @@ def prepare_workspace(self, instance: EvalInstance) -> RemoteWorkspace:
base_image=official_docker_image,
working_dir="/workspace",
target=build_target,
forward_env=forward_env or [],
)
else:
workspace = DockerWorkspace(
server_image=agent_server_image,
working_dir="/workspace",
forward_env=forward_env or [],
)
elif self.metadata.workspace_type == "remote":
runtime_api_key = os.getenv("RUNTIME_API_KEY")
Expand Down Expand Up @@ -200,6 +204,7 @@ def prepare_workspace(self, instance: EvalInstance) -> RemoteWorkspace:
runtime_api_key=runtime_api_key,
server_image=agent_server_image,
target_type="source" if "source" in build_target else "binary",
forward_env=forward_env or [],
)
else:
raise ValueError(
Expand Down
Loading