diff --git a/benchmarks/swe_bench/run_infer.py b/benchmarks/swe_bench/run_infer.py index 32d5ed63..f7e840f3 100644 --- a/benchmarks/swe_bench/run_infer.py +++ b/benchmarks/swe_bench/run_infer.py @@ -17,6 +17,7 @@ construct_eval_output_dir, get_default_on_result_writer, ) +from benchmarks.utils.image_utils import image_exists from benchmarks.utils.models import ( EvalInstance, EvalMetadata, @@ -26,7 +27,7 @@ from openhands.sdk import LLM, Agent, Conversation, get_logger from openhands.sdk.workspace import RemoteWorkspace from openhands.tools.preset.default import get_default_tools -from openhands.workspace import DockerWorkspace +from openhands.workspace import APIRemoteWorkspace, DockerWorkspace logger = get_logger(__name__) @@ -96,45 +97,78 @@ def prepare_workspace(self, instance: EvalInstance) -> RemoteWorkspace: """ Use DockerWorkspace by default. """ - SKIP_BUILD = os.getenv("SKIP_BUILD", "1").lower() in ("1", "true", "yes") - logger.info(f"SKIP_BUILD={SKIP_BUILD}") official_docker_image = get_official_docker_image(instance.id) build_target = "source-minimal" custom_tag = extract_custom_tag(official_docker_image) - # For non-binary targets, append target suffix suffix = f"-{build_target}" if build_target != "binary" else "" - agent_server_image = ( - f"{EVAL_AGENT_SERVER_IMAGE}:{SDK_SHORT_SHA}-{custom_tag}{suffix}" - ) - if not SKIP_BUILD: - logger.info( - f"Building workspace from {official_docker_image} " - f"for instance {instance.id}. " - "This may take a while...\n" - "You can run benchmarks/swe_bench/build_images.py and set " - "SWE_BENCH_SKIP_BUILD=1 to skip building and use pre-built " - "agent-server image." + + if self.metadata.workspace_type == "docker": + agent_server_image = ( + f"{EVAL_AGENT_SERVER_IMAGE}:{SDK_SHORT_SHA}-{custom_tag}{suffix}" + ) + SKIP_BUILD = os.getenv("SKIP_BUILD", "1").lower() in ("1", "true", "yes") + logger.info(f"SKIP_BUILD={SKIP_BUILD}") + if not SKIP_BUILD: + logger.info( + f"Building workspace from {official_docker_image} " + f"for instance {instance.id}. " + "This may take a while...\n" + "You can run benchmarks/swe_bench/build_images.py and set " + "SWE_BENCH_SKIP_BUILD=1 to skip building and use pre-built " + "agent-server image." + ) + output = build_image( + base_image=official_docker_image, + target_image=EVAL_AGENT_SERVER_IMAGE, + custom_tag=custom_tag, + target=build_target, + push=False, + ) + logger.info(f"Image build output: {output}") + assert output.error is None, f"Image build failed: {output.error}" + if agent_server_image not in output.tags: + raise RuntimeError( + f"Built image tags {output.tags} do not include expected tag " + f"{agent_server_image}" + ) + + workspace = DockerWorkspace( + server_image=agent_server_image, + working_dir="/workspace", ) - output = build_image( - base_image=official_docker_image, - target_image=EVAL_AGENT_SERVER_IMAGE, - custom_tag=custom_tag, - target=build_target, - push=False, + elif self.metadata.workspace_type == "remote": + runtime_api_key = os.getenv("RUNTIME_API_KEY") + sdk_short_sha = os.getenv("SDK_SHORT_SHA", SDK_SHORT_SHA) + if not runtime_api_key: + raise ValueError( + "RUNTIME_API_KEY environment variable is not set for remote workspace" + ) + + agent_server_image = ( + f"{EVAL_AGENT_SERVER_IMAGE}:{sdk_short_sha}-{custom_tag}{suffix}" ) - logger.info(f"Image build output: {output}") - assert output.error is None, f"Image build failed: {output.error}" - if agent_server_image not in output.tags: + if not image_exists(agent_server_image): raise RuntimeError( - f"Built image tags {output.tags} do not include expected tag " - f"{agent_server_image}" + f"Agent server image {agent_server_image} does not exist in container registry, " + "make sure to build, push it, and make it public accessible before using remote workspace." ) + logger.info( + f"Using remote workspace with image {agent_server_image} (sdk sha: {sdk_short_sha})" + ) + workspace = APIRemoteWorkspace( + runtime_api_url=os.getenv( + "RUNTIME_API_URL", "https://runtime.eval.all-hands.dev" + ), + runtime_api_key=runtime_api_key, + server_image=agent_server_image, + target_type="source" if "source" in build_target else "binary", + ) + else: + raise ValueError( + f"Unsupported workspace_type: {self.metadata.workspace_type}" + ) - workspace = DockerWorkspace( - server_image=agent_server_image, - working_dir="/workspace", - ) for cmd in self.metadata.env_setup_commands or []: res = workspace.execute_command(cmd) if res.exit_code != 0: @@ -297,6 +331,7 @@ def main() -> None: critic_name=args.critic, selected_instances_file=args.select, max_retries=args.max_retries, + workspace_type=args.workspace, ) # Run orchestrator with a simple JSONL writer diff --git a/benchmarks/utils/args_parser.py b/benchmarks/utils/args_parser.py index cb1584e5..56f950ad 100644 --- a/benchmarks/utils/args_parser.py +++ b/benchmarks/utils/args_parser.py @@ -25,6 +25,13 @@ def get_parser(add_llm_config: bool = True) -> argparse.ArgumentParser: help="Dataset name", ) parser.add_argument("--split", type=str, default="test", help="Dataset split") + parser.add_argument( + "--workspace", + type=str, + default="docker", + choices=["docker", "remote"], + help="Type of workspace to use (default: docker)", + ) parser.add_argument( "--max-iterations", type=int, default=100, help="Maximum iterations" ) diff --git a/benchmarks/utils/build_utils.py b/benchmarks/utils/build_utils.py index cdcb9e61..e55b3273 100644 --- a/benchmarks/utils/build_utils.py +++ b/benchmarks/utils/build_utils.py @@ -7,6 +7,7 @@ import contextlib import io import subprocess +import time import tomllib from concurrent.futures import ProcessPoolExecutor, as_completed from datetime import UTC, datetime @@ -19,6 +20,7 @@ from benchmarks.utils.args_parser import get_parser from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE +from benchmarks.utils.image_utils import image_exists from openhands.agent_server.docker.build import BuildOptions, TargetType, build from openhands.sdk import get_logger @@ -195,6 +197,11 @@ def build_image( git_sha=git_sha, sdk_version=sdk_version, ) + for t in opts.all_tags[0]: + # Check if image exists or not + if image_exists(t): + logger.info(f"Image {t} already exists. Skipping build.") + return BuildOutput(base_image=base_image, tags=[t], error=None) tags = build(opts) return BuildOutput(base_image=base_image, tags=tags, error=None) @@ -224,6 +231,7 @@ def _build_with_logging( logger.info( f"Retrying build for {base_image} (attempt {attempt + 1}/{max_retries})" ) + time.sleep(2 + attempt * 2) result = build_image(base_image, target_image, custom_tag, target, push) result.log_path = str(log_path) if not result.error: diff --git a/benchmarks/utils/image_utils.py b/benchmarks/utils/image_utils.py new file mode 100644 index 00000000..a463f3b4 --- /dev/null +++ b/benchmarks/utils/image_utils.py @@ -0,0 +1,105 @@ +#!/usr/bin/env python3 +import base64 +import sys + +import requests + + +ACCEPT = ",".join( + [ + "application/vnd.oci.image.index.v1+json", + "application/vnd.oci.image.manifest.v1+json", + "application/vnd.docker.distribution.manifest.v2+json", + "application/vnd.docker.distribution.manifest.list.v2+json", + ] +) + + +def _parse(image: str): + digest = None + if "@" in image: + image, digest = image.split("@", 1) + tag = None + last = image.rsplit("/", 1)[-1] + if ":" in last: # tag after last slash (not registry:port) + image, tag = image.rsplit(":", 1) + parts = image.split("/") + if "." in parts[0] or ":" in parts[0] or parts[0] == "localhost": + registry, repo = parts[0], "/".join(parts[1:]) + else: + registry, repo = "registry-1.docker.io", "/".join(parts) + ref = digest or tag or "latest" + return registry, repo, ref + + +def _dockerhub_token(repo: str) -> str | None: + url = f"https://auth.docker.io/token?service=registry.docker.io&scope=repository:{repo}:pull" + r = requests.get(url, timeout=10) + if r.ok: + return r.json().get("token") + return None + + +def _ghcr_token(repo: str, username: str | None, pat: str | None) -> str | None: + # Public: anonymous works; Private: Basic auth with PAT (read:packages) to get bearer + url = f"https://ghcr.io/token?service=ghcr.io&scope=repository:{repo}:pull" + headers = {} + if username and pat: + headers["Authorization"] = ( + "Basic " + base64.b64encode(f"{username}:{pat}".encode()).decode() + ) + r = requests.get(url, headers=headers, timeout=10) + if r.ok: + return r.json().get("token") + return None + + +def image_exists( + image_ref: str, + gh_username: str | None = None, + gh_pat: str | None = None, # GitHub PAT with read:packages for private GHCR + docker_token: str | None = None, # Docker Hub JWT if you already have one +) -> bool: + registry, repo, ref = _parse(image_ref) + headers = {"Accept": ACCEPT} + + if registry in ("docker.io", "index.docker.io", "registry-1.docker.io"): + base = "https://registry-1.docker.io" + token = docker_token or _dockerhub_token(repo) + if token: + headers["Authorization"] = f"Bearer {token}" + elif registry == "ghcr.io": + base = "https://ghcr.io" + token = _ghcr_token(repo, gh_username, gh_pat) + if token: + headers["Authorization"] = f"Bearer {token}" + else: + base = f"https://{registry}" + + url = f"{base}/v2/{repo}/manifests/{ref}" + try: + r = requests.head(url, headers=headers, timeout=10) + if r.status_code in ( + 405, + 406, + ): # some registries disallow HEAD or need GET for content-negotiation + r = requests.get(url, headers=headers, timeout=10) + # 200 -> exists; 401/403 -> exists but unauthorized; 404 -> not found + return r.status_code == 200 + except requests.RequestException: + return False + + +if __name__ == "__main__": + if len(sys.argv) < 2: + print( + "Usage: python image_check.py [gh_user] [gh_pat]" + ) + sys.exit(1) + + image = sys.argv[1] + gh_user = sys.argv[2] if len(sys.argv) > 2 else None + gh_pat = sys.argv[3] if len(sys.argv) > 3 else None + + ok = image_exists(image, gh_username=gh_user, gh_pat=gh_pat) + print(f"{image} -> {'✅ exists' if ok else '❌ not found or unauthorized'}") diff --git a/benchmarks/utils/models.py b/benchmarks/utils/models.py index b10df1f3..d3599772 100644 --- a/benchmarks/utils/models.py +++ b/benchmarks/utils/models.py @@ -1,4 +1,4 @@ -from typing import Any +from typing import Any, Literal from pydantic import BaseModel, Field @@ -45,6 +45,10 @@ class EvalMetadata(BaseModel): ge=0, description="Maximum number of retries for instances that throw exceptions", ) + workspace_type: Literal["docker", "remote"] = Field( + default="docker", + description="Type of workspace to use, e.g., 'docker' or 'remote'", + ) EvalInstanceID = str diff --git a/pyproject.toml b/pyproject.toml index 5e924d58..8561951d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,6 +19,7 @@ dependencies = [ "openhands-workspace", "modal>=1.1.4", "swebench", + "docker-registry-client>=0.5.2", ] [project.scripts] diff --git a/uv.lock b/uv.lock index ab6872cf..7c233247 100644 --- a/uv.lock +++ b/uv.lock @@ -719,6 +719,20 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e3/26/57c6fb270950d476074c087527a558ccb6f4436657314bfb6cdf484114c4/docker-7.1.0-py3-none-any.whl", hash = "sha256:c96b93b7f0a746f9e77d325bcfb87422a3d8bd4f03136ae8a85b37f1898d5fc0", size = 147774, upload-time = "2024-05-23T11:13:55.01Z" }, ] +[[package]] +name = "docker-registry-client" +version = "0.5.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "ecdsa" }, + { name = "jws" }, + { name = "requests" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/82/3c/287104dcdbd6fd3d367b8bc50f1387f8326fb8026312af61b2bcf5c09387/docker-registry-client-0.5.2.tar.gz", hash = "sha256:8482efc9ec9ec708dfb74193cdfa530eee23c93596c63d704c5a3702b049e58f", size = 8037, upload-time = "2017-06-16T16:05:24.387Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c7/b4/f1b3b2da3024fc20fe1e359871dc3c4f8e0ade1b0bbd85294f244c6a29d7/docker_registry_client-0.5.2-py2.py3-none-any.whl", hash = "sha256:cb6c1c5e72e091ada9b32499c8529850e247bafb2202bc31bbe45e9710bf9038", size = 11731, upload-time = "2017-06-16T16:05:26.057Z" }, +] + [[package]] name = "docstring-parser" version = "0.17.0" @@ -737,6 +751,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/66/dd/f95350e853a4468ec37478414fc04ae2d61dad7a947b3015c3dcc51a09b9/docutils-0.22.2-py3-none-any.whl", hash = "sha256:b0e98d679283fc3bb0ead8a5da7f501baa632654e7056e9c5846842213d674d8", size = 632667, upload-time = "2025-09-20T17:55:43.052Z" }, ] +[[package]] +name = "ecdsa" +version = "0.13.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/8c/d8/9c3596fd0f18ae0a76333492a119c00183323d8e64de1a4f4bd642856963/ecdsa-0.13.3.tar.gz", hash = "sha256:163c80b064a763ea733870feb96f9dd9b92216cfcacd374837af18e4e8ec3d4d", size = 60477, upload-time = "2019-10-07T14:05:24.318Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a6/81/2b170b460c84fdc8700cf08aa077ac6a9ff41f4ad3f05d0b3a64ba9f8f2e/ecdsa-0.13.3-py2.py3-none-any.whl", hash = "sha256:9814e700890991abeceeb2242586024d4758c8fc18445b194a49bd62d85861db", size = 52113, upload-time = "2019-10-07T14:05:22.583Z" }, +] + [[package]] name = "email-validator" version = "2.3.0" @@ -1470,6 +1493,12 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/41/45/1a4ed80516f02155c51f51e8cedb3c1902296743db0bbc66608a0db2814f/jsonschema_specifications-2025.9.1-py3-none-any.whl", hash = "sha256:98802fee3a11ee76ecaca44429fda8a41bff98b00a0f2838151b113f210cc6fe", size = 18437, upload-time = "2025-09-08T01:34:57.871Z" }, ] +[[package]] +name = "jws" +version = "0.1.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/01/9e/1536d578ed50f5fe8196310ddcc921a3cd8e973312d60ac74488b805d395/jws-0.1.3.tar.gz", hash = "sha256:0e3d4cb06ae7c5c1d16d357b4e7acb5c5ecab0cccb3a4b998035b85052488053", size = 8104, upload-time = "2015-03-10T15:53:37.844Z" } + [[package]] name = "lazy-object-proxy" version = "1.12.0" @@ -1975,6 +2004,7 @@ version = "0.1.0" source = { editable = "." } dependencies = [ { name = "datasets" }, + { name = "docker-registry-client" }, { name = "huggingface-hub" }, { name = "jinja2" }, { name = "modal" }, @@ -2005,6 +2035,7 @@ dev = [ [package.metadata] requires-dist = [ { name = "datasets" }, + { name = "docker-registry-client", specifier = ">=0.5.2" }, { name = "huggingface-hub" }, { name = "jinja2" }, { name = "modal", specifier = ">=1.1.4" },