OpenHands · xingyaoww · Nov 13, 2025 · Oct 27, 2025 · Nov 3, 2025 · Nov 4, 2025
diff --git a/benchmarks/swe_bench/run_infer.py b/benchmarks/swe_bench/run_infer.py
@@ -17,6 +17,7 @@
     construct_eval_output_dir,
     get_default_on_result_writer,
 )
+from benchmarks.utils.image_utils import image_exists
 from benchmarks.utils.models import (
     EvalInstance,
     EvalMetadata,
@@ -26,7 +27,7 @@
 from openhands.sdk import LLM, Agent, Conversation, get_logger
 from openhands.sdk.workspace import RemoteWorkspace
 from openhands.tools.preset.default import get_default_tools
-from openhands.workspace import DockerWorkspace
+from openhands.workspace import APIRemoteWorkspace, DockerWorkspace
 
 
 logger = get_logger(__name__)
@@ -96,45 +97,78 @@ def prepare_workspace(self, instance: EvalInstance) -> RemoteWorkspace:
         """
         Use DockerWorkspace by default.
         """
-        SKIP_BUILD = os.getenv("SKIP_BUILD", "1").lower() in ("1", "true", "yes")
-        logger.info(f"SKIP_BUILD={SKIP_BUILD}")
         official_docker_image = get_official_docker_image(instance.id)
         build_target = "source-minimal"
         custom_tag = extract_custom_tag(official_docker_image)
-
         # For non-binary targets, append target suffix
         suffix = f"-{build_target}" if build_target != "binary" else ""
-        agent_server_image = (
-            f"{EVAL_AGENT_SERVER_IMAGE}:{SDK_SHORT_SHA}-{custom_tag}{suffix}"
-        )
-        if not SKIP_BUILD:
-            logger.info(
-                f"Building workspace from {official_docker_image} "
-                f"for instance {instance.id}. "
-                "This may take a while...\n"
-                "You can run benchmarks/swe_bench/build_images.py and set "
-                "SWE_BENCH_SKIP_BUILD=1 to skip building and use pre-built "
-                "agent-server image."
+
+        if self.metadata.workspace_type == "docker":
+            agent_server_image = (
+                f"{EVAL_AGENT_SERVER_IMAGE}:{SDK_SHORT_SHA}-{custom_tag}{suffix}"
+            )
+            SKIP_BUILD = os.getenv("SKIP_BUILD", "1").lower() in ("1", "true", "yes")
+            logger.info(f"SKIP_BUILD={SKIP_BUILD}")
+            if not SKIP_BUILD:
+                logger.info(
+                    f"Building workspace from {official_docker_image} "
+                    f"for instance {instance.id}. "
+                    "This may take a while...\n"
+                    "You can run benchmarks/swe_bench/build_images.py and set "
+                    "SWE_BENCH_SKIP_BUILD=1 to skip building and use pre-built "
+                    "agent-server image."
+                )
+                output = build_image(
+                    base_image=official_docker_image,
+                    target_image=EVAL_AGENT_SERVER_IMAGE,
+                    custom_tag=custom_tag,
+                    target=build_target,
+                    push=False,
+                )
+                logger.info(f"Image build output: {output}")
+                assert output.error is None, f"Image build failed: {output.error}"
+                if agent_server_image not in output.tags:
+                    raise RuntimeError(
+                        f"Built image tags {output.tags} do not include expected tag "
+                        f"{agent_server_image}"
+                    )
+
+            workspace = DockerWorkspace(
+                server_image=agent_server_image,
+                working_dir="/workspace",
             )
-            output = build_image(
-                base_image=official_docker_image,
-                target_image=EVAL_AGENT_SERVER_IMAGE,
-                custom_tag=custom_tag,
-                target=build_target,
-                push=False,
+        elif self.metadata.workspace_type == "remote":
+            runtime_api_key = os.getenv("RUNTIME_API_KEY")
+            sdk_short_sha = os.getenv("SDK_SHORT_SHA", SDK_SHORT_SHA)
+            if not runtime_api_key:
+                raise ValueError(
+                    "RUNTIME_API_KEY environment variable is not set for remote workspace"
+                )
+
+            agent_server_image = (
+                f"{EVAL_AGENT_SERVER_IMAGE}:{sdk_short_sha}-{custom_tag}{suffix}"
             )
-            logger.info(f"Image build output: {output}")
-            assert output.error is None, f"Image build failed: {output.error}"
-            if agent_server_image not in output.tags:
+            if not image_exists(agent_server_image):
                 raise RuntimeError(
-                    f"Built image tags {output.tags} do not include expected tag "
-                    f"{agent_server_image}"
+                    f"Agent server image {agent_server_image} does not exist in container registry, "
+                    "make sure to build, push it, and make it public accessible before using remote workspace."
                 )
+            logger.info(
+                f"Using remote workspace with image {agent_server_image} (sdk sha: {sdk_short_sha})"
+            )
+            workspace = APIRemoteWorkspace(
+                runtime_api_url=os.getenv(
+                    "RUNTIME_API_URL", "https://runtime.eval.all-hands.dev"
+                ),
+                runtime_api_key=runtime_api_key,
+                server_image=agent_server_image,
+                target_type="source" if "source" in build_target else "binary",
+            )
+        else:
+            raise ValueError(
+                f"Unsupported workspace_type: {self.metadata.workspace_type}"
+            )
 
-        workspace = DockerWorkspace(
-            server_image=agent_server_image,
-            working_dir="/workspace",
-        )
         for cmd in self.metadata.env_setup_commands or []:
             res = workspace.execute_command(cmd)
             if res.exit_code != 0:
@@ -297,6 +331,7 @@ def main() -> None:
         critic_name=args.critic,
         selected_instances_file=args.select,
         max_retries=args.max_retries,
+        workspace_type=args.workspace,
     )
 
     # Run orchestrator with a simple JSONL writer

diff --git a/benchmarks/utils/args_parser.py b/benchmarks/utils/args_parser.py
@@ -25,6 +25,13 @@ def get_parser(add_llm_config: bool = True) -> argparse.ArgumentParser:
         help="Dataset name",
     )
     parser.add_argument("--split", type=str, default="test", help="Dataset split")
+    parser.add_argument(
+        "--workspace",
+        type=str,
+        default="docker",
+        choices=["docker", "remote"],
+        help="Type of workspace to use (default: docker)",
+    )
     parser.add_argument(
         "--max-iterations", type=int, default=100, help="Maximum iterations"
     )

diff --git a/benchmarks/utils/build_utils.py b/benchmarks/utils/build_utils.py
@@ -7,6 +7,7 @@
 import contextlib
 import io
 import subprocess
+import time
 import tomllib
 from concurrent.futures import ProcessPoolExecutor, as_completed
 from datetime import UTC, datetime
@@ -19,6 +20,7 @@
 
 from benchmarks.utils.args_parser import get_parser
 from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE
+from benchmarks.utils.image_utils import image_exists
 from openhands.agent_server.docker.build import BuildOptions, TargetType, build
 from openhands.sdk import get_logger
 
@@ -195,6 +197,11 @@ def build_image(
         git_sha=git_sha,
         sdk_version=sdk_version,
     )
+    for t in opts.all_tags[0]:
+        # Check if image exists or not
+        if image_exists(t):
+            logger.info(f"Image {t} already exists. Skipping build.")
+            return BuildOutput(base_image=base_image, tags=[t], error=None)
     tags = build(opts)
     return BuildOutput(base_image=base_image, tags=tags, error=None)
 
@@ -224,6 +231,7 @@ def _build_with_logging(
                 logger.info(
                     f"Retrying build for {base_image} (attempt {attempt + 1}/{max_retries})"
                 )
+                time.sleep(2 + attempt * 2)
             result = build_image(base_image, target_image, custom_tag, target, push)
             result.log_path = str(log_path)
             if not result.error:

diff --git a/benchmarks/utils/image_utils.py b/benchmarks/utils/image_utils.py
@@ -0,0 +1,105 @@
+#!/usr/bin/env python3
+import base64
+import sys
+
+import requests
+
+
+# Manifest media types offered to the registry in the Accept header:
+# OCI image index / image manifest and Docker v2 manifest / manifest list.
+_MANIFEST_MEDIA_TYPES = (
+    "application/vnd.oci.image.index.v1+json",
+    "application/vnd.oci.image.manifest.v1+json",
+    "application/vnd.docker.distribution.manifest.v2+json",
+    "application/vnd.docker.distribution.manifest.list.v2+json",
+)
+ACCEPT = ",".join(_MANIFEST_MEDIA_TYPES)
+
+
+def _parse(image: str):
+    """Split an image reference into ``(registry, repository, reference)``.
+
+    Args:
+        image: Reference such as ``repo``, ``org/repo:tag``,
+            ``host:port/org/repo:tag`` or ``host/org/repo@sha256:...``.
+
+    Returns:
+        A 3-tuple ``(registry, repo, ref)``. ``registry`` falls back to
+        ``registry-1.docker.io`` when the reference names no host. ``ref`` is
+        the digest if present, otherwise the tag, otherwise ``"latest"``.
+    """
+    digest = None
+    if "@" in image:
+        image, digest = image.split("@", 1)
+    tag = None
+    last = image.rsplit("/", 1)[-1]
+    if ":" in last:  # tag after last slash (not registry:port)
+        image, tag = image.rsplit(":", 1)
+    first, slash, rest = image.partition("/")
+    # Only a first component that is followed by "/" can be a registry host;
+    # a bare name such as "my.image" is a repository on the default registry.
+    if slash and ("." in first or ":" in first or first == "localhost"):
+        registry, repo = first, rest
+    else:
+        registry, repo = "registry-1.docker.io", image
+    # Docker Hub keeps single-component ("official") images under the
+    # "library/" namespace, so "ubuntu" has to be queried as "library/ubuntu".
+    on_docker_hub = registry in ("docker.io", "index.docker.io", "registry-1.docker.io")
+    if on_docker_hub and "/" not in repo:
+        repo = f"library/{repo}"
+    ref = digest or tag or "latest"
+    return registry, repo, ref
+
+
+def _dockerhub_token(repo: str) -> str | None:
+    """Fetch a pull-scoped token for ``repo`` from auth.docker.io.
+
+    The request carries no credentials. Returns the ``token`` field of the
+    JSON response, or None when the response status is not OK.
+    """
+    token_url = f"https://auth.docker.io/token?service=registry.docker.io&scope=repository:{repo}:pull"
+    response = requests.get(token_url, timeout=10)
+    return response.json().get("token") if response.ok else None
+
+
+def _ghcr_token(repo: str, username: str | None, pat: str | None) -> str | None:
+    """Request a pull-scoped bearer token for ``repo`` from the GHCR token endpoint.
+
+    HTTP Basic credentials are sent only when both ``username`` and ``pat``
+    are given; otherwise the request is anonymous. Returns the ``token``
+    field of the JSON response, or None when the response status is not OK.
+    """
+    # Public: anonymous works; Private: Basic auth with PAT (read:packages) to get bearer
+    auth_headers = {}
+    if username and pat:
+        credentials = base64.b64encode(f"{username}:{pat}".encode()).decode()
+        auth_headers["Authorization"] = "Basic " + credentials
+    response = requests.get(
+        f"https://ghcr.io/token?service=ghcr.io&scope=repository:{repo}:pull",
+        headers=auth_headers,
+        timeout=10,
+    )
+    if not response.ok:
+        return None
+    return response.json().get("token")
+
+
+def image_exists(
+    image_ref: str,
+    gh_username: str | None = None,
+    gh_pat: str | None = None,  # GitHub PAT with read:packages for private GHCR
+    docker_token: str | None = None,  # Docker Hub JWT if you already have one
+) -> bool:
+    """Return True if the registry answers 200 for the image's manifest.
+
+    Args:
+        image_ref: Image reference; split by ``_parse`` into registry,
+            repository and tag/digest.
+        gh_username: Username used together with ``gh_pat`` when requesting
+            a GHCR token.
+        gh_pat: GitHub personal access token used for the GHCR token request.
+        docker_token: Bearer token to use for Docker Hub instead of
+            requesting a new one.
+
+    Returns:
+        True only when the manifest request returns HTTP 200. Any other
+        status (including 401/403/404) and any request or token-parsing
+        failure yields False.
+    """
+    registry, repo, ref = _parse(image_ref)
+    headers = {"Accept": ACCEPT}
+
+    try:
+        # Token acquisition is a network call as well, so it lives inside the
+        # try block: a failure there is reported as False, like a failed probe.
+        if registry in ("docker.io", "index.docker.io", "registry-1.docker.io"):
+            base = "https://registry-1.docker.io"
+            token = docker_token or _dockerhub_token(repo)
+        elif registry == "ghcr.io":
+            base = "https://ghcr.io"
+            token = _ghcr_token(repo, gh_username, gh_pat)
+        else:
+            # Other registries are probed without any Authorization header.
+            base = f"https://{registry}"
+            token = None
+        if token:
+            headers["Authorization"] = f"Bearer {token}"
+
+        url = f"{base}/v2/{repo}/manifests/{ref}"
+        r = requests.head(url, headers=headers, timeout=10)
+        if r.status_code in (
+            405,
+            406,
+        ):  # some registries disallow HEAD or need GET for content-negotiation
+            r = requests.get(url, headers=headers, timeout=10)
+        # Only 200 counts as "exists". 401/403 may mean the image exists but is
+        # not accessible with these credentials; that is still reported as False.
+        return r.status_code == 200
+    except (requests.RequestException, ValueError):
+        # ValueError covers a token endpoint that returns a non-JSON body.
+        return False
+
+
+if __name__ == "__main__":
+    # Manual check from the command line: the image reference comes first,
+    # followed by an optional GHCR username and personal access token.
+    if len(sys.argv) < 2:
+        print(
+            "Usage: python image_utils.py <image[:tag]|image@sha256:...> [gh_user] [gh_pat]"
+        )
+        sys.exit(1)
+
+    image = sys.argv[1]
+    gh_user = sys.argv[2] if len(sys.argv) > 2 else None
+    gh_pat = sys.argv[3] if len(sys.argv) > 3 else None
+
+    ok = image_exists(image, gh_username=gh_user, gh_pat=gh_pat)
+    print(f"{image} -> {'✅ exists' if ok else '❌ not found or unauthorized'}")
diff --git a/benchmarks/utils/models.py b/benchmarks/utils/models.py
@@ -1,4 +1,4 @@
-from typing import Any
+from typing import Any, Literal
 
 from pydantic import BaseModel, Field
 
@@ -45,6 +45,10 @@ class EvalMetadata(BaseModel):
         ge=0,
         description="Maximum number of retries for instances that throw exceptions",
     )
+    workspace_type: Literal["docker", "remote"] = Field(
+        default="docker",
+        description="Type of workspace to use, e.g., 'docker' or 'remote'",
+    )
 
 
 EvalInstanceID = str

diff --git a/pyproject.toml b/pyproject.toml
@@ -19,6 +19,7 @@ dependencies = [
     "openhands-workspace",
     "modal>=1.1.4",
     "swebench",
+    "docker-registry-client>=0.5.2",
 ]
 
 [project.scripts]

diff --git a/uv.lock b/uv.lock