From c9b86c1f69a01fc0d9d0353150ebb94e6034470b Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Mon, 27 Oct 2025 21:39:45 +0000
Subject: [PATCH 01/66] Add GitHub workflow for building SWE-Bench images with
 Blacksmith caching

- Create workflow that can be manually triggered via workflow_dispatch
- Integrate Blacksmith caching for faster Docker builds
- Configure workflow to push images to ghcr.io/openhands/eval-agent-server
- Make --critic parameter optional in build_images.py for build-only usage
- Fix .gitignore patterns for eval_outputs and builds directories

This workflow follows Blacksmith documentation for Docker builds and allows
building SWE-Bench evaluation images with configurable parameters like dataset,
split, target, platforms, and concurrent workers.

Closes #37
---
 .github/workflows/build-swe-bench-images.yml | 141 +++++++++++++++++++
 .gitignore                                   |   4 +-
 benchmarks/swe_bench/build_images.py         |   7 +
 3 files changed, 150 insertions(+), 2 deletions(-)
 create mode 100644 .github/workflows/build-swe-bench-images.yml

diff --git a/.github/workflows/build-swe-bench-images.yml b/.github/workflows/build-swe-bench-images.yml
new file mode 100644
index 00000000..a412e653
--- /dev/null
+++ b/.github/workflows/build-swe-bench-images.yml
@@ -0,0 +1,141 @@
+name: Build SWE-Bench Images
+
+on:
+  workflow_dispatch:
+    inputs:
+      dataset:
+        description: 'Dataset name (e.g., princeton-nlp/SWE-bench_Verified)'
+        required: true
+        default: 'princeton-nlp/SWE-bench_Verified'
+        type: string
+      split:
+        description: 'Dataset split (e.g., test, dev)'
+        required: true
+        default: 'test'
+        type: string
+      target:
+        description: 'Build target (source | source-minimal | binary | binary-minimal)'
+        required: false
+        default: 'source-minimal'
+        type: choice
+        options:
+          - source
+          - source-minimal
+          - binary
+          - binary-minimal
+      platforms:
+        description: 'Comma-separated platforms (e.g., linux/amd64,linux/arm64)'
+        required: false
+        default: 'linux/amd64'
+        type: string
+      max-workers:
+        description: 'Number of concurrent builds'
+        required: false
+        default: '1'
+        type: string
+      n-limit:
+        description: 'Limit number of images to build (for testing)'
+        required: false
+        default: ''
+        type: string
+
+jobs:
+  build-and-push:
+    runs-on:
+      labels: blacksmith-32vcpu-ubuntu-2204
+    
+    permissions:
+      contents: read
+      packages: write
+    
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+        with:
+          submodules: recursive
+      
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+        with:
+          driver-opts: |
+            image=moby/buildkit:latest
+            network=host
+      
+      - name: Cache Docker layers
+        uses: useblacksmith/cache@v6
+        with:
+          path: /tmp/.buildx-cache
+          key: ${{ runner.os }}-buildx-${{ github.sha }}
+          restore-keys: |
+            ${{ runner.os }}-buildx-
+      
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+      
+      - name: Install uv
+        uses: astral-sh/setup-uv@v7
+        with:
+          enable-cache: true
+          version: "0.8.13"
+      
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.12'
+      
+      - name: Install dependencies
+        run: |
+          uv sync --dev
+      
+      - name: Build and push SWE-Bench images
+        run: |
+          # Construct the command with required arguments
+          CMD="uv run benchmarks/swe_bench/build_images.py \
+            --dataset ${{ inputs.dataset }} \
+            --split ${{ inputs.split }} \
+            --image ghcr.io/openhands/eval-agent-server \
+            --target ${{ inputs.target }} \
+            --platforms ${{ inputs.platforms }} \
+            --push \
+            --max-workers ${{ inputs.max-workers }}"
+          
+          # Add optional n-limit if provided
+          if [ -n "${{ inputs.n-limit }}" ]; then
+            CMD="$CMD --n-limit ${{ inputs.n-limit }}"
+          fi
+          
+          # Execute the build command
+          eval $CMD
+        env:
+          DOCKER_BUILDKIT: 1
+          BUILDKIT_PROGRESS: plain
+      
+      - name: Upload build manifest
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: build-manifest-${{ inputs.dataset }}-${{ inputs.split }}
+          path: |
+            builds/**/manifest.jsonl
+            builds/**/summary.json
+          retention-days: 30
+      
+      - name: Upload build logs
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: build-logs-${{ inputs.dataset }}-${{ inputs.split }}
+          path: builds/**/logs/**/*.log
+          retention-days: 7
+      
+      - name: Display build summary
+        if: always()
+        run: |
+          if [ -f builds/*/summary.json ]; then
+            echo "## Build Summary" >> $GITHUB_STEP_SUMMARY
+            cat builds/*/summary.json | python -m json.tool >> $GITHUB_STEP_SUMMARY
+          fi
diff --git a/.gitignore b/.gitignore
index 43135338..89a2b5d6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -213,5 +213,5 @@ workspace/
 !.llm_config/example.json
 
 # Evaluation outputs
-./eval_outputs
-./builds
+eval_outputs/
+builds/
diff --git a/benchmarks/swe_bench/build_images.py b/benchmarks/swe_bench/build_images.py
index 6bbc739a..823aa176 100644
--- a/benchmarks/swe_bench/build_images.py
+++ b/benchmarks/swe_bench/build_images.py
@@ -86,6 +86,13 @@ def extend_parser() -> argparse.ArgumentParser:
     parser = get_parser(add_llm_config=False)
     parser.description = "Build all agent-server images for SWE-Bench base images."
 
+    # Make --critic optional for build_images use case
+    for action in parser._actions:
+        if action.dest == "critic":
+            action.required = False
+            action.default = "none"
+            break
+
     parser.add_argument(
         "--docker-image-prefix",
         default="docker.io/swebench/",

From 57520433a5902922bbbbaf02100d1c8ad94e21d4 Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Mon, 3 Nov 2025 21:02:28 +0000
Subject: [PATCH 02/66] Use Blacksmith's setup-docker-builder action for faster
 Docker layer caching

Following the pattern from https://github.com/OpenHands/software-agent-sdk/pull/990
and Blacksmith's official documentation (https://docs.blacksmith.sh/blacksmith-caching/docker-builds),
this change replaces the standard docker/setup-buildx-action with useblacksmith/setup-docker-builder@v1.

Key improvements:
- Replaces docker/setup-buildx-action@v3 with useblacksmith/setup-docker-builder@v1
- Removes manual cache configuration (useblacksmith/cache@v6)
- Blacksmith's Docker builder automatically manages Docker layer caching via NVMe-backed sticky disks
- Provides 2x to 40x improvements in build times according to Blacksmith's customers
- Since we only build amd64 images, we don't need the complex multi-platform matrix strategy

This approach is recommended for workflows that use Docker commands directly
(as opposed to using docker/build-push-action).

Co-authored-by: openhands <openhands@all-hands.dev>
---
 .github/workflows/build-swe-bench-images.yml | 16 ++--------------
 1 file changed, 2 insertions(+), 14 deletions(-)

diff --git a/.github/workflows/build-swe-bench-images.yml b/.github/workflows/build-swe-bench-images.yml
index a412e653..680869b7 100644
--- a/.github/workflows/build-swe-bench-images.yml
+++ b/.github/workflows/build-swe-bench-images.yml
@@ -54,20 +54,8 @@ jobs:
         with:
           submodules: recursive
       
-      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
-        with:
-          driver-opts: |
-            image=moby/buildkit:latest
-            network=host
-      
-      - name: Cache Docker layers
-        uses: useblacksmith/cache@v6
-        with:
-          path: /tmp/.buildx-cache
-          key: ${{ runner.os }}-buildx-${{ github.sha }}
-          restore-keys: |
-            ${{ runner.os }}-buildx-
+      - name: Set up Docker Buildx with Blacksmith
+        uses: useblacksmith/setup-docker-builder@v1
       
       - name: Log in to GitHub Container Registry
         uses: docker/login-action@v3

From 85080062806bf2ba90b1a0340140c6e634e80e0b Mon Sep 17 00:00:00 2001
From: Xingyao Wang <xingyao@all-hands.dev>
Date: Tue, 4 Nov 2025 20:41:17 +0000
Subject: [PATCH 03/66] revert unneeded stuff

---
 benchmarks/swe_bench/build_images.py | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/benchmarks/swe_bench/build_images.py b/benchmarks/swe_bench/build_images.py
index dcd6301e..fc7c22e1 100644
--- a/benchmarks/swe_bench/build_images.py
+++ b/benchmarks/swe_bench/build_images.py
@@ -86,13 +86,6 @@ def extend_parser() -> argparse.ArgumentParser:
     parser = get_parser(add_llm_config=False)
     parser.description = "Build all agent-server images for SWE-Bench base images."
 
-    # Make --critic optional for build_images use case
-    for action in parser._actions:
-        if action.dest == "critic":
-            action.required = False
-            action.default = "none"
-            break
-
     parser.add_argument(
         "--docker-image-prefix",
         default="docker.io/swebench/",

From a565e77a6bd53475ab97a5952ab246e3f41b69c1 Mon Sep 17 00:00:00 2001
From: Xingyao Wang <xingyao@all-hands.dev>
Date: Tue, 4 Nov 2025 20:46:24 +0000
Subject: [PATCH 04/66] simplify setup dependency

---
 .github/workflows/build-swe-bench-images.yml | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/build-swe-bench-images.yml b/.github/workflows/build-swe-bench-images.yml
index 680869b7..e2f78007 100644
--- a/.github/workflows/build-swe-bench-images.yml
+++ b/.github/workflows/build-swe-bench-images.yml
@@ -68,16 +68,10 @@ jobs:
         uses: astral-sh/setup-uv@v7
         with:
           enable-cache: true
-          version: "0.8.13"
-      
-      - name: Set up Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: '3.12'
-      
+
       - name: Install dependencies
         run: |
-          uv sync --dev
+          make build
       
       - name: Build and push SWE-Bench images
         run: |

From 9bbd7fbbd0448fba75ab6cfbbe2d644d4103568d Mon Sep 17 00:00:00 2001
From: Xingyao Wang <xingyao@all-hands.dev>
Date: Tue, 4 Nov 2025 20:56:01 +0000
Subject: [PATCH 05/66] set image name to eval-agent-server

---
 README.md                            | 4 ++--
 benchmarks/swe_bench/build_images.py | 4 ++--
 benchmarks/swe_bench/run_infer.py    | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index fb4695e6..ef2f86c2 100644
--- a/README.md
+++ b/README.md
@@ -95,8 +95,8 @@ Build ALL docker images for SWE-Bench.
 ```bash
 uv run benchmarks/swe_bench/build_images.py \
   --dataset princeton-nlp/SWE-bench_Verified --split test \
-  --critic pass \
-  --image ghcr.io/openhands/agent-server --target binary-minimal
+  --image ghcr.io/openhands/eval-
+  agent-server --target binary-minimal
 ```
 
 
diff --git a/benchmarks/swe_bench/build_images.py b/benchmarks/swe_bench/build_images.py
index fc7c22e1..08054c35 100644
--- a/benchmarks/swe_bench/build_images.py
+++ b/benchmarks/swe_bench/build_images.py
@@ -5,7 +5,7 @@
 Example:
   uv run benchmarks/swe_bench/build_images.py \
     --dataset princeton-nlp/SWE-bench_Verified --split test \
-    --image ghcr.io/openhands/agent-server --target source-minimal
+    --image ghcr.io/openhands/eval-agent-server --target source-minimal
 """
 
 import argparse
@@ -93,7 +93,7 @@ def extend_parser() -> argparse.ArgumentParser:
     )
     parser.add_argument(
         "--image",
-        default="ghcr.io/openhands/agent-server",
+        default="ghcr.io/openhands/eval-agent-server",
         help="Target repo/name for built image",
     )
     parser.add_argument(
diff --git a/benchmarks/swe_bench/run_infer.py b/benchmarks/swe_bench/run_infer.py
index 4bb67edc..98eb396f 100644
--- a/benchmarks/swe_bench/run_infer.py
+++ b/benchmarks/swe_bench/run_infer.py
@@ -46,7 +46,7 @@ def get_agent_server_docker_image(
 ) -> str:
     official_image_name = get_official_docker_image(instance_id, docker_image_prefix)
     return (
-        "ghcr.io/openhands/agent-server"
+        "ghcr.io/openhands/eval-agent-server"
         + f":v{SDK_VERSION}_{_base_slug(official_image_name)}_{target}"
     )
 

From c661b2cd02c08d13ecf27c87345cc271365a5c98 Mon Sep 17 00:00:00 2001
From: Xingyao Wang <xingyao@all-hands.dev>
Date: Tue, 4 Nov 2025 20:57:00 +0000
Subject: [PATCH 06/66] fix line break

---
 README.md | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/README.md b/README.md
index ef2f86c2..0fcc3725 100644
--- a/README.md
+++ b/README.md
@@ -95,8 +95,7 @@ Build ALL docker images for SWE-Bench.
 ```bash
 uv run benchmarks/swe_bench/build_images.py \
   --dataset princeton-nlp/SWE-bench_Verified --split test \
-  --image ghcr.io/openhands/eval-
-  agent-server --target binary-minimal
+  --image ghcr.io/openhands/eval-agent-server --target binary-minimal
 ```
 
 

From 632432e7f1330131ce8e80ec3dcbfe6014cb3bb2 Mon Sep 17 00:00:00 2001
From: Xingyao Wang <xingyao@all-hands.dev>
Date: Tue, 4 Nov 2025 20:58:28 +0000
Subject: [PATCH 07/66] default n-limit to 10 for testing

---
 .github/workflows/build-swe-bench-images.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build-swe-bench-images.yml b/.github/workflows/build-swe-bench-images.yml
index e2f78007..8dcbc234 100644
--- a/.github/workflows/build-swe-bench-images.yml
+++ b/.github/workflows/build-swe-bench-images.yml
@@ -36,7 +36,7 @@ on:
       n-limit:
         description: 'Limit number of images to build (for testing)'
         required: false
-        default: ''
+        default: '10'
         type: string
 
 jobs:

From c536903f65976f5cc648b43d6e3ded9a66468f9e Mon Sep 17 00:00:00 2001
From: Xingyao Wang <xingyao@all-hands.dev>
Date: Tue, 4 Nov 2025 21:08:25 +0000
Subject: [PATCH 08/66] run on all PRs for debugging

---
 .github/workflows/build-swe-bench-images.yml | 86 +++++++++++++-------
 1 file changed, 57 insertions(+), 29 deletions(-)

diff --git a/.github/workflows/build-swe-bench-images.yml b/.github/workflows/build-swe-bench-images.yml
index 8dcbc234..7cc3c68e 100644
--- a/.github/workflows/build-swe-bench-images.yml
+++ b/.github/workflows/build-swe-bench-images.yml
@@ -1,6 +1,7 @@
 name: Build SWE-Bench Images
 
 on:
+  pull_request:  # for debugging
   workflow_dispatch:
     inputs:
       dataset:
@@ -31,39 +32,65 @@ on:
       max-workers:
         description: 'Number of concurrent builds'
         required: false
-        default: '1'
+        default: '2'
         type: string
       n-limit:
-        description: 'Limit number of images to build (for testing)'
+        description: 'Limit number of images to build (for testing). Leave blank for no limit.'
         required: false
         default: '10'
         type: string
 
+# Reasonable defaults for automatic (push) runs; workflow_dispatch can override these.
+env:
+  DATASET: princeton-nlp/SWE-bench_Verified
+  SPLIT: test
+  TARGET: source-minimal
+  PLATFORMS: linux/amd64
+  MAX_WORKERS: '2'      # modest concurrency for reliability
+  N_LIMIT: '10'           # empty = no limit
+
+concurrency:
+  group: build-swe-bench-${{ github.ref }}
+  cancel-in-progress: false
+
 jobs:
   build-and-push:
     runs-on:
       labels: blacksmith-32vcpu-ubuntu-2204
-    
+
+    # Allow pushing to GHCR
     permissions:
       contents: read
       packages: write
-    
+
     steps:
       - name: Checkout repository
         uses: actions/checkout@v4
         with:
           submodules: recursive
-      
+
+      # If this was a manual dispatch, override defaults with provided inputs.
+      - name: Apply workflow_dispatch overrides (if any)
+        if: ${{ github.event_name == 'workflow_dispatch' }}
+        run: |
+          if [ -n "${{ inputs.dataset }}" ]; then echo "DATASET=${{ inputs.dataset }}" >> "$GITHUB_ENV"; fi
+          if [ -n "${{ inputs.split }}" ]; then echo "SPLIT=${{ inputs.split }}" >> "$GITHUB_ENV"; fi
+          if [ -n "${{ inputs.target }}" ]; then echo "TARGET=${{ inputs.target }}" >> "$GITHUB_ENV"; fi
+          if [ -n "${{ inputs.platforms }}" ]; then echo "PLATFORMS=${{ inputs.platforms }}" >> "$GITHUB_ENV"; fi
+          if [ -n "${{ inputs.max-workers }}" ]; then echo "MAX_WORKERS=${{ inputs.max-workers }}" >> "$GITHUB_ENV"; fi
+          # Empty string means "no limit"
+          if [ -n "${{ inputs.n-limit }}" ]; then echo "N_LIMIT=${{ inputs.n-limit }}" >> "$GITHUB_ENV"; else echo "N_LIMIT=" >> "$GITHUB_ENV"; fi
+
       - name: Set up Docker Buildx with Blacksmith
         uses: useblacksmith/setup-docker-builder@v1
-      
+
       - name: Log in to GitHub Container Registry
         uses: docker/login-action@v3
         with:
           registry: ghcr.io
           username: ${{ github.actor }}
           password: ${{ secrets.GITHUB_TOKEN }}
-      
+
       - name: Install uv
         uses: astral-sh/setup-uv@v7
         with:
@@ -72,52 +99,53 @@ jobs:
       - name: Install dependencies
         run: |
           make build
-      
+
       - name: Build and push SWE-Bench images
         run: |
-          # Construct the command with required arguments
+          set -euo pipefail
+
           CMD="uv run benchmarks/swe_bench/build_images.py \
-            --dataset ${{ inputs.dataset }} \
-            --split ${{ inputs.split }} \
+            --dataset '${DATASET}' \
+            --split '${SPLIT}' \
             --image ghcr.io/openhands/eval-agent-server \
-            --target ${{ inputs.target }} \
-            --platforms ${{ inputs.platforms }} \
+            --target '${TARGET}' \
+            --platforms '${PLATFORMS}' \
             --push \
-            --max-workers ${{ inputs.max-workers }}"
-          
-          # Add optional n-limit if provided
-          if [ -n "${{ inputs.n-limit }}" ]; then
-            CMD="$CMD --n-limit ${{ inputs.n-limit }}"
+            --max-workers '${MAX_WORKERS}'"
+
+          # Only include --n-limit if provided (non-empty)
+          if [ -n "${N_LIMIT}" ]; then
+            CMD="$CMD --n-limit '${N_LIMIT}'"
           fi
-          
-          # Execute the build command
-          eval $CMD
+
+          echo "Running: $CMD"
+          eval "$CMD"
         env:
           DOCKER_BUILDKIT: 1
           BUILDKIT_PROGRESS: plain
-      
+
       - name: Upload build manifest
         if: always()
         uses: actions/upload-artifact@v4
         with:
-          name: build-manifest-${{ inputs.dataset }}-${{ inputs.split }}
+          name: build-manifest-${{ env.DATASET }}-${{ env.SPLIT }}
           path: |
             builds/**/manifest.jsonl
             builds/**/summary.json
           retention-days: 30
-      
+
       - name: Upload build logs
         if: always()
         uses: actions/upload-artifact@v4
         with:
-          name: build-logs-${{ inputs.dataset }}-${{ inputs.split }}
+          name: build-logs-${{ env.DATASET }}-${{ env.SPLIT }}
           path: builds/**/logs/**/*.log
           retention-days: 7
-      
+
       - name: Display build summary
         if: always()
         run: |
-          if [ -f builds/*/summary.json ]; then
-            echo "## Build Summary" >> $GITHUB_STEP_SUMMARY
-            cat builds/*/summary.json | python -m json.tool >> $GITHUB_STEP_SUMMARY
+          if ls builds/*/summary.json >/dev/null 2>&1; then
+            echo "## Build Summary" >> "$GITHUB_STEP_SUMMARY"
+            cat builds/*/summary.json | python -m json.tool >> "$GITHUB_STEP_SUMMARY"
           fi

From efb731f3a28e5e752cf740afc35a7d8e572ba3b2 Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Tue, 4 Nov 2025 21:24:02 +0000
Subject: [PATCH 09/66] Fix pyarrow build issue by forcing binary wheel
 installation

The GitHub Actions workflow was failing because uv was trying to build
pyarrow from source, which requires the Arrow C++ library and CMake.
This change adds the --no-build-package pyarrow flag to force uv to use
the pre-built binary wheel instead of attempting to build from source.

Co-authored-by: openhands <openhands@all-hands.dev>
---
 .github/workflows/build-swe-bench-images.yml | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/build-swe-bench-images.yml b/.github/workflows/build-swe-bench-images.yml
index 7cc3c68e..b563b80b 100644
--- a/.github/workflows/build-swe-bench-images.yml
+++ b/.github/workflows/build-swe-bench-images.yml
@@ -98,7 +98,10 @@ jobs:
 
       - name: Install dependencies
         run: |
-          make build
+          # Install dependencies, preferring binary wheels for problematic packages
+          git submodule update --init --recursive
+          uv sync --dev --no-build-package pyarrow
+          uv run pre-commit install
 
       - name: Build and push SWE-Bench images
         run: |

From 29084f237af47ca229f08e4a5a4982974a82d743 Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Tue, 4 Nov 2025 21:26:36 +0000
Subject: [PATCH 10/66] Pin Python version to 3.12 to fix pyarrow compatibility

The root cause of the build failure was that uv was installing Python 3.14.0,
which doesn't have binary wheels for pyarrow 21.0.0 yet. This caused uv to
attempt building from source, which failed due to missing Arrow C++ libraries.

Solution: Added .python-version file to pin Python to 3.12, which matches
the project's target-version in pyproject.toml and has full binary wheel
support for all dependencies.

Co-authored-by: openhands <openhands@all-hands.dev>
---
 .github/workflows/build-swe-bench-images.yml | 5 +----
 .python-version                              | 1 +
 2 files changed, 2 insertions(+), 4 deletions(-)
 create mode 100644 .python-version

diff --git a/.github/workflows/build-swe-bench-images.yml b/.github/workflows/build-swe-bench-images.yml
index b563b80b..7cc3c68e 100644
--- a/.github/workflows/build-swe-bench-images.yml
+++ b/.github/workflows/build-swe-bench-images.yml
@@ -98,10 +98,7 @@ jobs:
 
       - name: Install dependencies
         run: |
-          # Install dependencies, preferring binary wheels for problematic packages
-          git submodule update --init --recursive
-          uv sync --dev --no-build-package pyarrow
-          uv run pre-commit install
+          make build
 
       - name: Build and push SWE-Bench images
         run: |
diff --git a/.python-version b/.python-version
new file mode 100644
index 00000000..e4fba218
--- /dev/null
+++ b/.python-version
@@ -0,0 +1 @@
+3.12

From 551405b2a6a96a1ece0ca4443afd9dc42289064c Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Tue, 4 Nov 2025 21:36:02 +0000
Subject: [PATCH 11/66] Fix artifact upload naming to avoid invalid characters

Use github.run_id instead of dataset/split names which contain slashes
that are invalid in artifact names. Also added if-no-files-found: warn
to provide better feedback if logs are missing.

Co-authored-by: openhands <openhands@all-hands.dev>
---
 .github/workflows/build-swe-bench-images.yml | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/build-swe-bench-images.yml b/.github/workflows/build-swe-bench-images.yml
index 7cc3c68e..9d6f6f20 100644
--- a/.github/workflows/build-swe-bench-images.yml
+++ b/.github/workflows/build-swe-bench-images.yml
@@ -128,7 +128,7 @@ jobs:
         if: always()
         uses: actions/upload-artifact@v4
         with:
-          name: build-manifest-${{ env.DATASET }}-${{ env.SPLIT }}
+          name: build-manifest-${{ github.run_id }}
           path: |
             builds/**/manifest.jsonl
             builds/**/summary.json
@@ -138,9 +138,10 @@ jobs:
         if: always()
         uses: actions/upload-artifact@v4
         with:
-          name: build-logs-${{ env.DATASET }}-${{ env.SPLIT }}
+          name: build-logs-${{ github.run_id }}
           path: builds/**/logs/**/*.log
           retention-days: 7
+          if-no-files-found: warn
 
       - name: Display build summary
         if: always()

From 90b6ed6bd9a7f5ea0300fa727ef4fba5910f8616 Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Tue, 4 Nov 2025 21:40:42 +0000
Subject: [PATCH 12/66] Fix artifact upload by archiving logs to avoid invalid
 filename characters

GitHub Actions artifact upload doesn't allow colons in filenames, but our
log paths contain colons from Docker image tags (e.g., 'django-11999:latest').
Archive the entire builds directory into a tar.gz before upload to work
around this restriction.

Co-authored-by: openhands <openhands@all-hands.dev>
---
 .github/workflows/build-swe-bench-images.yml | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/build-swe-bench-images.yml b/.github/workflows/build-swe-bench-images.yml
index 9d6f6f20..bd4d741a 100644
--- a/.github/workflows/build-swe-bench-images.yml
+++ b/.github/workflows/build-swe-bench-images.yml
@@ -134,12 +134,23 @@ jobs:
             builds/**/summary.json
           retention-days: 30
 
+      - name: Archive build logs
+        if: always()
+        run: |
+          if [ -d builds ]; then
+            # Create tar archive to avoid filename restrictions (colons, etc.)
+            tar -czf build-logs.tar.gz builds/
+            echo "Build logs archived successfully"
+          else
+            echo "No builds directory found"
+          fi
+
       - name: Upload build logs
         if: always()
         uses: actions/upload-artifact@v4
         with:
           name: build-logs-${{ github.run_id }}
-          path: builds/**/logs/**/*.log
+          path: build-logs.tar.gz
           retention-days: 7
           if-no-files-found: warn
 

From 3ba1e46f29b01376d90c23087562373eb2f8e5d8 Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Tue, 4 Nov 2025 21:48:35 +0000
Subject: [PATCH 13/66] Fix Docker cache tag length exceeding 128 character
 limit

Docker image tags have a maximum length of 128 characters. When building
SWE-Bench images with long base image names (e.g., scikit-learn), the
generated cache tags exceed this limit and cause build failures with:
'ERROR: failed to configure registry cache exporter: invalid reference format'

Solution: Apply a patch to vendor/software-agent-sdk that hashes the
base_image_slug when it would cause the final tag to exceed 128 characters.
Uses SHA256 hash (first 12 chars) to create a shorter unique identifier
while maintaining cache efficiency.

The patch is applied during the workflow setup before installing dependencies.

Co-authored-by: openhands <openhands@all-hands.dev>
---
 .github/workflows/build-swe-bench-images.yml |  6 +++
 .github/workflows/fix-cache-tag-length.patch | 44 ++++++++++++++++++++
 2 files changed, 50 insertions(+)
 create mode 100644 .github/workflows/fix-cache-tag-length.patch

diff --git a/.github/workflows/build-swe-bench-images.yml b/.github/workflows/build-swe-bench-images.yml
index bd4d741a..e56d3343 100644
--- a/.github/workflows/build-swe-bench-images.yml
+++ b/.github/workflows/build-swe-bench-images.yml
@@ -96,6 +96,12 @@ jobs:
         with:
           enable-cache: true
 
+      - name: Apply fix for Docker cache tag length limit
+        run: |
+          cd vendor/software-agent-sdk
+          git apply ../../.github/workflows/fix-cache-tag-length.patch
+          echo "Applied patch to fix cache tag length limit"
+
       - name: Install dependencies
         run: |
           make build
diff --git a/.github/workflows/fix-cache-tag-length.patch b/.github/workflows/fix-cache-tag-length.patch
new file mode 100644
index 00000000..f6679aaa
--- /dev/null
+++ b/.github/workflows/fix-cache-tag-length.patch
@@ -0,0 +1,44 @@
+diff --git a/openhands-agent-server/openhands/agent_server/docker/build.py b/openhands-agent-server/openhands/agent_server/docker/build.py
+index 1cc9cd4d..a1add62e 100755
+--- a/openhands-agent-server/openhands/agent_server/docker/build.py
++++ b/openhands-agent-server/openhands/agent_server/docker/build.py
+@@ -14,6 +14,7 @@ Single-entry build helper for agent-server images.
+ """
+ 
+ import argparse
++import hashlib
+ import os
+ import re
+ import shutil
+@@ -284,7 +285,30 @@ class BuildOptions(BaseModel):
+ 
+     @property
+     def cache_tags(self) -> tuple[str, str]:
+-        base = f"buildcache-{self.target}-{self.base_image_slug}"
++        # Docker image tags have a 128-character limit.
++        # If the base slug is too long, hash it to create a shorter unique identifier.
++        MAX_TAG_LENGTH = 128
++        base_slug = self.base_image_slug
++        
++        # Reserve space for prefix, branch, and separators
++        prefix = f"buildcache-{self.target}-"
++        branch_suffix = f"-{_sanitize_branch(GIT_REF)}" if GIT_REF not in ("main", "refs/heads/main", "unknown") else ""
++        main_suffix = "-main" if GIT_REF in ("main", "refs/heads/main") else ""
++        
++        # Calculate available space for base_slug
++        reserved = len(prefix) + max(len(branch_suffix), len(main_suffix))
++        available = MAX_TAG_LENGTH - reserved
++        
++        # If base_slug is too long, use a hash
++        if len(base_slug) > available:
++            # Use first 8 chars of SHA256 hash for uniqueness while keeping it short
++            hash_digest = hashlib.sha256(base_slug.encode()).hexdigest()[:12]
++            base_slug_short = hash_digest
++            logger.debug(f"[build] Base image slug too long ({len(base_slug)} chars), using hash: {base_slug_short}")
++        else:
++            base_slug_short = base_slug
++        
++        base = f"{prefix}{base_slug_short}"
+         if GIT_REF in ("main", "refs/heads/main"):
+             return f"{base}-main", base
+         elif GIT_REF != "unknown":

From 21bb22616be7a54f8c8060866365f98184ae4b55 Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Tue, 4 Nov 2025 21:57:18 +0000
Subject: [PATCH 14/66] Update patch with pre-commit formatting fixes

Updated the patch to match the formatting requirements from ruff and
other pre-commit checks. This ensures the patch applies cleanly and
passes all linting/formatting checks.

Co-authored-by: openhands <openhands@all-hands.dev>
---
 .github/workflows/fix-cache-tag-length.patch | 23 +++++++++++++-------
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/fix-cache-tag-length.patch b/.github/workflows/fix-cache-tag-length.patch
index f6679aaa..72306970 100644
--- a/.github/workflows/fix-cache-tag-length.patch
+++ b/.github/workflows/fix-cache-tag-length.patch
@@ -1,5 +1,5 @@
 diff --git a/openhands-agent-server/openhands/agent_server/docker/build.py b/openhands-agent-server/openhands/agent_server/docker/build.py
-index 1cc9cd4d..a1add62e 100755
+index 1cc9cd4d..c5023d79 100755
 --- a/openhands-agent-server/openhands/agent_server/docker/build.py
 +++ b/openhands-agent-server/openhands/agent_server/docker/build.py
 @@ -14,6 +14,7 @@ Single-entry build helper for agent-server images.
@@ -10,7 +10,7 @@ index 1cc9cd4d..a1add62e 100755
  import os
  import re
  import shutil
-@@ -284,7 +285,30 @@ class BuildOptions(BaseModel):
+@@ -284,7 +285,37 @@ class BuildOptions(BaseModel):
  
      @property
      def cache_tags(self) -> tuple[str, str]:
@@ -19,25 +19,32 @@ index 1cc9cd4d..a1add62e 100755
 +        # If the base slug is too long, hash it to create a shorter unique identifier.
 +        MAX_TAG_LENGTH = 128
 +        base_slug = self.base_image_slug
-+        
++
 +        # Reserve space for prefix, branch, and separators
 +        prefix = f"buildcache-{self.target}-"
-+        branch_suffix = f"-{_sanitize_branch(GIT_REF)}" if GIT_REF not in ("main", "refs/heads/main", "unknown") else ""
++        branch_suffix = (
++            f"-{_sanitize_branch(GIT_REF)}"
++            if GIT_REF not in ("main", "refs/heads/main", "unknown")
++            else ""
++        )
 +        main_suffix = "-main" if GIT_REF in ("main", "refs/heads/main") else ""
-+        
++
 +        # Calculate available space for base_slug
 +        reserved = len(prefix) + max(len(branch_suffix), len(main_suffix))
 +        available = MAX_TAG_LENGTH - reserved
-+        
++
 +        # If base_slug is too long, use a hash
 +        if len(base_slug) > available:
 +            # Use first 8 chars of SHA256 hash for uniqueness while keeping it short
 +            hash_digest = hashlib.sha256(base_slug.encode()).hexdigest()[:12]
 +            base_slug_short = hash_digest
-+            logger.debug(f"[build] Base image slug too long ({len(base_slug)} chars), using hash: {base_slug_short}")
++            logger.debug(
++                f"[build] Base image slug too long ({len(base_slug)} chars), "
++                f"using hash: {base_slug_short}"
++            )
 +        else:
 +            base_slug_short = base_slug
-+        
++
 +        base = f"{prefix}{base_slug_short}"
          if GIT_REF in ("main", "refs/heads/main"):
              return f"{base}-main", base

From 2f897757ebb00ab7ce3db9d0ff90c14bb6eb47f0 Mon Sep 17 00:00:00 2001
From: Xingyao Wang <xingyao@all-hands.dev>
Date: Thu, 6 Nov 2025 17:52:38 +0000
Subject: [PATCH 15/66] checkout to v1.0.0 of sdk

---
 vendor/software-agent-sdk | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vendor/software-agent-sdk b/vendor/software-agent-sdk
index 448af7af..a612c0a6 160000
--- a/vendor/software-agent-sdk
+++ b/vendor/software-agent-sdk
@@ -1 +1 @@
-Subproject commit 448af7af9c64d5b0d373dcf9c84e2dc7d7e57b19
+Subproject commit a612c0a685fa96bc725085ac81c59492d4a88974

From dfb966bd2d3e4d2086223cf4ff85d998d15354d4 Mon Sep 17 00:00:00 2001
From: Xingyao Wang <xingyao@all-hands.dev>
Date: Thu, 6 Nov 2025 17:52:59 +0000
Subject: [PATCH 16/66] update uv.lock

---
 uv.lock | 320 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 315 insertions(+), 5 deletions(-)

diff --git a/uv.lock b/uv.lock
index 38431b39..6ea29413 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1125,6 +1125,47 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/9d/08/24d62fccb01c4e86c59ba79073af7e5c8ab643846823c2fa3e957bde4b58/groq-0.32.0-py3-none-any.whl", hash = "sha256:0ed0be290042f8826f851f3a1defaac4f979dcfce86ec4a0681a23af00ec800b", size = 135387, upload-time = "2025-09-27T23:01:33.223Z" },
 ]
 
+[[package]]
+name = "grpcio"
+version = "1.76.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "typing-extensions" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/b6/e0/318c1ce3ae5a17894d5791e87aea147587c9e702f24122cc7a5c8bbaeeb1/grpcio-1.76.0.tar.gz", hash = "sha256:7be78388d6da1a25c0d5ec506523db58b18be22d9c37d8d3a32c08be4987bd73", size = 12785182, upload-time = "2025-10-21T16:23:12.106Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/bf/05/8e29121994b8d959ffa0afd28996d452f291b48cfc0875619de0bde2c50c/grpcio-1.76.0-cp312-cp312-linux_armv7l.whl", hash = "sha256:81fd9652b37b36f16138611c7e884eb82e0cec137c40d3ef7c3f9b3ed00f6ed8", size = 5799718, upload-time = "2025-10-21T16:21:17.939Z" },
+    { url = "https://files.pythonhosted.org/packages/d9/75/11d0e66b3cdf998c996489581bdad8900db79ebd83513e45c19548f1cba4/grpcio-1.76.0-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:04bbe1bfe3a68bbfd4e52402ab7d4eb59d72d02647ae2042204326cf4bbad280", size = 11825627, upload-time = "2025-10-21T16:21:20.466Z" },
+    { url = "https://files.pythonhosted.org/packages/28/50/2f0aa0498bc188048f5d9504dcc5c2c24f2eb1a9337cd0fa09a61a2e75f0/grpcio-1.76.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d388087771c837cdb6515539f43b9d4bf0b0f23593a24054ac16f7a960be16f4", size = 6359167, upload-time = "2025-10-21T16:21:23.122Z" },
+    { url = "https://files.pythonhosted.org/packages/66/e5/bbf0bb97d29ede1d59d6588af40018cfc345b17ce979b7b45424628dc8bb/grpcio-1.76.0-cp312-cp312-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:9f8f757bebaaea112c00dba718fc0d3260052ce714e25804a03f93f5d1c6cc11", size = 7044267, upload-time = "2025-10-21T16:21:25.995Z" },
+    { url = "https://files.pythonhosted.org/packages/f5/86/f6ec2164f743d9609691115ae8ece098c76b894ebe4f7c94a655c6b03e98/grpcio-1.76.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:980a846182ce88c4f2f7e2c22c56aefd515daeb36149d1c897f83cf57999e0b6", size = 6573963, upload-time = "2025-10-21T16:21:28.631Z" },
+    { url = "https://files.pythonhosted.org/packages/60/bc/8d9d0d8505feccfdf38a766d262c71e73639c165b311c9457208b56d92ae/grpcio-1.76.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:f92f88e6c033db65a5ae3d97905c8fea9c725b63e28d5a75cb73b49bda5024d8", size = 7164484, upload-time = "2025-10-21T16:21:30.837Z" },
+    { url = "https://files.pythonhosted.org/packages/67/e6/5d6c2fc10b95edf6df9b8f19cf10a34263b7fd48493936fffd5085521292/grpcio-1.76.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:4baf3cbe2f0be3289eb68ac8ae771156971848bb8aaff60bad42005539431980", size = 8127777, upload-time = "2025-10-21T16:21:33.577Z" },
+    { url = "https://files.pythonhosted.org/packages/3f/c8/dce8ff21c86abe025efe304d9e31fdb0deaaa3b502b6a78141080f206da0/grpcio-1.76.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:615ba64c208aaceb5ec83bfdce7728b80bfeb8be97562944836a7a0a9647d882", size = 7594014, upload-time = "2025-10-21T16:21:41.882Z" },
+    { url = "https://files.pythonhosted.org/packages/e0/42/ad28191ebf983a5d0ecef90bab66baa5a6b18f2bfdef9d0a63b1973d9f75/grpcio-1.76.0-cp312-cp312-win32.whl", hash = "sha256:45d59a649a82df5718fd9527ce775fd66d1af35e6d31abdcdc906a49c6822958", size = 3984750, upload-time = "2025-10-21T16:21:44.006Z" },
+    { url = "https://files.pythonhosted.org/packages/9e/00/7bd478cbb851c04a48baccaa49b75abaa8e4122f7d86da797500cccdd771/grpcio-1.76.0-cp312-cp312-win_amd64.whl", hash = "sha256:c088e7a90b6017307f423efbb9d1ba97a22aa2170876223f9709e9d1de0b5347", size = 4704003, upload-time = "2025-10-21T16:21:46.244Z" },
+    { url = "https://files.pythonhosted.org/packages/fc/ed/71467ab770effc9e8cef5f2e7388beb2be26ed642d567697bb103a790c72/grpcio-1.76.0-cp313-cp313-linux_armv7l.whl", hash = "sha256:26ef06c73eb53267c2b319f43e6634c7556ea37672029241a056629af27c10e2", size = 5807716, upload-time = "2025-10-21T16:21:48.475Z" },
+    { url = "https://files.pythonhosted.org/packages/2c/85/c6ed56f9817fab03fa8a111ca91469941fb514e3e3ce6d793cb8f1e1347b/grpcio-1.76.0-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:45e0111e73f43f735d70786557dc38141185072d7ff8dc1829d6a77ac1471468", size = 11821522, upload-time = "2025-10-21T16:21:51.142Z" },
+    { url = "https://files.pythonhosted.org/packages/ac/31/2b8a235ab40c39cbc141ef647f8a6eb7b0028f023015a4842933bc0d6831/grpcio-1.76.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:83d57312a58dcfe2a3a0f9d1389b299438909a02db60e2f2ea2ae2d8034909d3", size = 6362558, upload-time = "2025-10-21T16:21:54.213Z" },
+    { url = "https://files.pythonhosted.org/packages/bd/64/9784eab483358e08847498ee56faf8ff6ea8e0a4592568d9f68edc97e9e9/grpcio-1.76.0-cp313-cp313-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:3e2a27c89eb9ac3d81ec8835e12414d73536c6e620355d65102503064a4ed6eb", size = 7049990, upload-time = "2025-10-21T16:21:56.476Z" },
+    { url = "https://files.pythonhosted.org/packages/2b/94/8c12319a6369434e7a184b987e8e9f3b49a114c489b8315f029e24de4837/grpcio-1.76.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:61f69297cba3950a524f61c7c8ee12e55c486cb5f7db47ff9dcee33da6f0d3ae", size = 6575387, upload-time = "2025-10-21T16:21:59.051Z" },
+    { url = "https://files.pythonhosted.org/packages/15/0f/f12c32b03f731f4a6242f771f63039df182c8b8e2cf8075b245b409259d4/grpcio-1.76.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:6a15c17af8839b6801d554263c546c69c4d7718ad4321e3166175b37eaacca77", size = 7166668, upload-time = "2025-10-21T16:22:02.049Z" },
+    { url = "https://files.pythonhosted.org/packages/ff/2d/3ec9ce0c2b1d92dd59d1c3264aaec9f0f7c817d6e8ac683b97198a36ed5a/grpcio-1.76.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:25a18e9810fbc7e7f03ec2516addc116a957f8cbb8cbc95ccc80faa072743d03", size = 8124928, upload-time = "2025-10-21T16:22:04.984Z" },
+    { url = "https://files.pythonhosted.org/packages/1a/74/fd3317be5672f4856bcdd1a9e7b5e17554692d3db9a3b273879dc02d657d/grpcio-1.76.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:931091142fd8cc14edccc0845a79248bc155425eee9a98b2db2ea4f00a235a42", size = 7589983, upload-time = "2025-10-21T16:22:07.881Z" },
+    { url = "https://files.pythonhosted.org/packages/45/bb/ca038cf420f405971f19821c8c15bcbc875505f6ffadafe9ffd77871dc4c/grpcio-1.76.0-cp313-cp313-win32.whl", hash = "sha256:5e8571632780e08526f118f74170ad8d50fb0a48c23a746bef2a6ebade3abd6f", size = 3984727, upload-time = "2025-10-21T16:22:10.032Z" },
+    { url = "https://files.pythonhosted.org/packages/41/80/84087dc56437ced7cdd4b13d7875e7439a52a261e3ab4e06488ba6173b0a/grpcio-1.76.0-cp313-cp313-win_amd64.whl", hash = "sha256:f9f7bd5faab55f47231ad8dba7787866b69f5e93bc306e3915606779bbfb4ba8", size = 4702799, upload-time = "2025-10-21T16:22:12.709Z" },
+    { url = "https://files.pythonhosted.org/packages/b4/46/39adac80de49d678e6e073b70204091e76631e03e94928b9ea4ecf0f6e0e/grpcio-1.76.0-cp314-cp314-linux_armv7l.whl", hash = "sha256:ff8a59ea85a1f2191a0ffcc61298c571bc566332f82e5f5be1b83c9d8e668a62", size = 5808417, upload-time = "2025-10-21T16:22:15.02Z" },
+    { url = "https://files.pythonhosted.org/packages/9c/f5/a4531f7fb8b4e2a60b94e39d5d924469b7a6988176b3422487be61fe2998/grpcio-1.76.0-cp314-cp314-macosx_11_0_universal2.whl", hash = "sha256:06c3d6b076e7b593905d04fdba6a0525711b3466f43b3400266f04ff735de0cd", size = 11828219, upload-time = "2025-10-21T16:22:17.954Z" },
+    { url = "https://files.pythonhosted.org/packages/4b/1c/de55d868ed7a8bd6acc6b1d6ddc4aa36d07a9f31d33c912c804adb1b971b/grpcio-1.76.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:fd5ef5932f6475c436c4a55e4336ebbe47bd3272be04964a03d316bbf4afbcbc", size = 6367826, upload-time = "2025-10-21T16:22:20.721Z" },
+    { url = "https://files.pythonhosted.org/packages/59/64/99e44c02b5adb0ad13ab3adc89cb33cb54bfa90c74770f2607eea629b86f/grpcio-1.76.0-cp314-cp314-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:b331680e46239e090f5b3cead313cc772f6caa7d0fc8de349337563125361a4a", size = 7049550, upload-time = "2025-10-21T16:22:23.637Z" },
+    { url = "https://files.pythonhosted.org/packages/43/28/40a5be3f9a86949b83e7d6a2ad6011d993cbe9b6bd27bea881f61c7788b6/grpcio-1.76.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:2229ae655ec4e8999599469559e97630185fdd53ae1e8997d147b7c9b2b72cba", size = 6575564, upload-time = "2025-10-21T16:22:26.016Z" },
+    { url = "https://files.pythonhosted.org/packages/4b/a9/1be18e6055b64467440208a8559afac243c66a8b904213af6f392dc2212f/grpcio-1.76.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:490fa6d203992c47c7b9e4a9d39003a0c2bcc1c9aa3c058730884bbbb0ee9f09", size = 7176236, upload-time = "2025-10-21T16:22:28.362Z" },
+    { url = "https://files.pythonhosted.org/packages/0f/55/dba05d3fcc151ce6e81327541d2cc8394f442f6b350fead67401661bf041/grpcio-1.76.0-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:479496325ce554792dba6548fae3df31a72cef7bad71ca2e12b0e58f9b336bfc", size = 8125795, upload-time = "2025-10-21T16:22:31.075Z" },
+    { url = "https://files.pythonhosted.org/packages/4a/45/122df922d05655f63930cf42c9e3f72ba20aadb26c100ee105cad4ce4257/grpcio-1.76.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:1c9b93f79f48b03ada57ea24725d83a30284a012ec27eab2cf7e50a550cbbbcc", size = 7592214, upload-time = "2025-10-21T16:22:33.831Z" },
+    { url = "https://files.pythonhosted.org/packages/4a/6e/0b899b7f6b66e5af39e377055fb4a6675c9ee28431df5708139df2e93233/grpcio-1.76.0-cp314-cp314-win32.whl", hash = "sha256:747fa73efa9b8b1488a95d0ba1039c8e2dca0f741612d80415b1e1c560febf4e", size = 4062961, upload-time = "2025-10-21T16:22:36.468Z" },
+    { url = "https://files.pythonhosted.org/packages/19/41/0b430b01a2eb38ee887f88c1f07644a1df8e289353b78e82b37ef988fb64/grpcio-1.76.0-cp314-cp314-win_amd64.whl", hash = "sha256:922fa70ba549fce362d2e2871ab542082d66e2aaf0c19480ea453905b01f384e", size = 4834462, upload-time = "2025-10-21T16:22:39.772Z" },
+]
+
 [[package]]
 name = "grpclib"
 version = "0.4.8"
@@ -1493,6 +1534,33 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/86/50/53df2244d4aca2af73d2f2c6ad21c731cf24bd0dbe89d896184a1eaa874f/litellm-1.77.7-py3-none-any.whl", hash = "sha256:1b3a1b17bd521a0ad25226fb62a912602c803922aabb4a16adf83834673be574", size = 9223061, upload-time = "2025-10-05T00:22:34.112Z" },
 ]
 
+[[package]]
+name = "lmnr"
+version = "0.7.20"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "grpcio" },
+    { name = "httpx" },
+    { name = "opentelemetry-api" },
+    { name = "opentelemetry-exporter-otlp-proto-grpc" },
+    { name = "opentelemetry-exporter-otlp-proto-http" },
+    { name = "opentelemetry-instrumentation" },
+    { name = "opentelemetry-instrumentation-threading" },
+    { name = "opentelemetry-sdk" },
+    { name = "opentelemetry-semantic-conventions" },
+    { name = "opentelemetry-semantic-conventions-ai" },
+    { name = "orjson" },
+    { name = "packaging" },
+    { name = "pydantic" },
+    { name = "python-dotenv" },
+    { name = "tenacity" },
+    { name = "tqdm" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/d4/c0/996403cc2f6967881a42af4b27ff8931956d57ab3ed2d8bf11e5b37aed40/lmnr-0.7.20.tar.gz", hash = "sha256:1f484cd618db2d71af65f90a0b8b36d20d80dc91a5138b811575c8677bf7c4fd", size = 194075, upload-time = "2025-11-04T16:53:34.49Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/b5/df/4665a3931b2fbc5f5b66e4906ffab106f3f65ab7e78732ecdaf3ba4a3076/lmnr-0.7.20-py3-none-any.whl", hash = "sha256:5f9fa7444e6f96c25e097f66484ff29e632bdd1de0e9346948bf5595f4a8af38", size = 247465, upload-time = "2025-11-04T16:53:32.713Z" },
+]
+
 [[package]]
 name = "mako"
 version = "1.3.10"
@@ -1874,7 +1942,7 @@ wheels = [
 
 [[package]]
 name = "openhands-agent-server"
-version = "1.0.0a3"
+version = "1.0.0"
 source = { editable = "vendor/software-agent-sdk/openhands-agent-server" }
 dependencies = [
     { name = "aiosqlite" },
@@ -1966,12 +2034,13 @@ dev = [
 
 [[package]]
 name = "openhands-sdk"
-version = "1.0.0a3"
+version = "1.0.0"
 source = { editable = "vendor/software-agent-sdk/openhands-sdk" }
 dependencies = [
     { name = "fastmcp" },
     { name = "httpx" },
     { name = "litellm" },
+    { name = "lmnr" },
     { name = "pydantic" },
     { name = "python-frontmatter" },
     { name = "python-json-logger" },
@@ -1990,6 +2059,7 @@ requires-dist = [
     { name = "fastmcp", specifier = ">=2.11.3" },
     { name = "httpx", specifier = ">=0.27.0" },
     { name = "litellm", specifier = ">=1.77.7.dev9" },
+    { name = "lmnr", specifier = ">=0.7.20" },
     { name = "pydantic", specifier = ">=2.11.7" },
     { name = "python-frontmatter", specifier = ">=1.1.0" },
     { name = "python-json-logger", specifier = ">=3.3.0" },
@@ -2000,7 +2070,7 @@ provides-extras = ["boto3"]
 
 [[package]]
 name = "openhands-tools"
-version = "1.0.0a3"
+version = "1.0.0"
 source = { editable = "vendor/software-agent-sdk/openhands-tools" }
 dependencies = [
     { name = "bashlex" },
@@ -2017,7 +2087,7 @@ dependencies = [
 requires-dist = [
     { name = "bashlex", specifier = ">=0.18" },
     { name = "binaryornot", specifier = ">=0.4.4" },
-    { name = "browser-use", specifier = ">=0.7.7" },
+    { name = "browser-use", specifier = ">=0.8.0" },
     { name = "cachetools" },
     { name = "func-timeout", specifier = ">=4.3.5" },
     { name = "libtmux", specifier = ">=0.46.2" },
@@ -2027,7 +2097,7 @@ requires-dist = [
 
 [[package]]
 name = "openhands-workspace"
-version = "1.0.0a3"
+version = "1.0.0"
 source = { editable = "vendor/software-agent-sdk/openhands-workspace" }
 dependencies = [
     { name = "openhands-sdk" },
@@ -2040,6 +2110,197 @@ requires-dist = [
     { name = "pydantic", specifier = ">=2.11.7" },
 ]
 
+[[package]]
+name = "opentelemetry-api"
+version = "1.38.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "importlib-metadata" },
+    { name = "typing-extensions" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/08/d8/0f354c375628e048bd0570645b310797299754730079853095bf000fba69/opentelemetry_api-1.38.0.tar.gz", hash = "sha256:f4c193b5e8acb0912b06ac5b16321908dd0843d75049c091487322284a3eea12", size = 65242, upload-time = "2025-10-16T08:35:50.25Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/ae/a2/d86e01c28300bd41bab8f18afd613676e2bd63515417b77636fc1add426f/opentelemetry_api-1.38.0-py3-none-any.whl", hash = "sha256:2891b0197f47124454ab9f0cf58f3be33faca394457ac3e09daba13ff50aa582", size = 65947, upload-time = "2025-10-16T08:35:30.23Z" },
+]
+
+[[package]]
+name = "opentelemetry-exporter-otlp-proto-common"
+version = "1.38.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "opentelemetry-proto" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/19/83/dd4660f2956ff88ed071e9e0e36e830df14b8c5dc06722dbde1841accbe8/opentelemetry_exporter_otlp_proto_common-1.38.0.tar.gz", hash = "sha256:e333278afab4695aa8114eeb7bf4e44e65c6607d54968271a249c180b2cb605c", size = 20431, upload-time = "2025-10-16T08:35:53.285Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/a7/9e/55a41c9601191e8cd8eb626b54ee6827b9c9d4a46d736f32abc80d8039fc/opentelemetry_exporter_otlp_proto_common-1.38.0-py3-none-any.whl", hash = "sha256:03cb76ab213300fe4f4c62b7d8f17d97fcfd21b89f0b5ce38ea156327ddda74a", size = 18359, upload-time = "2025-10-16T08:35:34.099Z" },
+]
+
+[[package]]
+name = "opentelemetry-exporter-otlp-proto-grpc"
+version = "1.38.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "googleapis-common-protos" },
+    { name = "grpcio" },
+    { name = "opentelemetry-api" },
+    { name = "opentelemetry-exporter-otlp-proto-common" },
+    { name = "opentelemetry-proto" },
+    { name = "opentelemetry-sdk" },
+    { name = "typing-extensions" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/a2/c0/43222f5b97dc10812bc4f0abc5dc7cd0a2525a91b5151d26c9e2e958f52e/opentelemetry_exporter_otlp_proto_grpc-1.38.0.tar.gz", hash = "sha256:2473935e9eac71f401de6101d37d6f3f0f1831db92b953c7dcc912536158ebd6", size = 24676, upload-time = "2025-10-16T08:35:53.83Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/28/f0/bd831afbdba74ca2ce3982142a2fad707f8c487e8a3b6fef01f1d5945d1b/opentelemetry_exporter_otlp_proto_grpc-1.38.0-py3-none-any.whl", hash = "sha256:7c49fd9b4bd0dbe9ba13d91f764c2d20b0025649a6e4ac35792fb8d84d764bc7", size = 19695, upload-time = "2025-10-16T08:35:35.053Z" },
+]
+
+[[package]]
+name = "opentelemetry-exporter-otlp-proto-http"
+version = "1.38.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "googleapis-common-protos" },
+    { name = "opentelemetry-api" },
+    { name = "opentelemetry-exporter-otlp-proto-common" },
+    { name = "opentelemetry-proto" },
+    { name = "opentelemetry-sdk" },
+    { name = "requests" },
+    { name = "typing-extensions" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/81/0a/debcdfb029fbd1ccd1563f7c287b89a6f7bef3b2902ade56797bfd020854/opentelemetry_exporter_otlp_proto_http-1.38.0.tar.gz", hash = "sha256:f16bd44baf15cbe07633c5112ffc68229d0edbeac7b37610be0b2def4e21e90b", size = 17282, upload-time = "2025-10-16T08:35:54.422Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/e5/77/154004c99fb9f291f74aa0822a2f5bbf565a72d8126b3a1b63ed8e5f83c7/opentelemetry_exporter_otlp_proto_http-1.38.0-py3-none-any.whl", hash = "sha256:84b937305edfc563f08ec69b9cb2298be8188371217e867c1854d77198d0825b", size = 19579, upload-time = "2025-10-16T08:35:36.269Z" },
+]
+
+[[package]]
+name = "opentelemetry-instrumentation"
+version = "0.59b0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "opentelemetry-api" },
+    { name = "opentelemetry-semantic-conventions" },
+    { name = "packaging" },
+    { name = "wrapt" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/04/ed/9c65cd209407fd807fa05be03ee30f159bdac8d59e7ea16a8fe5a1601222/opentelemetry_instrumentation-0.59b0.tar.gz", hash = "sha256:6010f0faaacdaf7c4dff8aac84e226d23437b331dcda7e70367f6d73a7db1adc", size = 31544, upload-time = "2025-10-16T08:39:31.959Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/10/f5/7a40ff3f62bfe715dad2f633d7f1174ba1a7dd74254c15b2558b3401262a/opentelemetry_instrumentation-0.59b0-py3-none-any.whl", hash = "sha256:44082cc8fe56b0186e87ee8f7c17c327c4c2ce93bdbe86496e600985d74368ee", size = 33020, upload-time = "2025-10-16T08:38:31.463Z" },
+]
+
+[[package]]
+name = "opentelemetry-instrumentation-threading"
+version = "0.59b0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "opentelemetry-api" },
+    { name = "opentelemetry-instrumentation" },
+    { name = "wrapt" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/82/7a/84e97d8992808197006e607ae410c2219bdbbc23d1289ba0c244d3220741/opentelemetry_instrumentation_threading-0.59b0.tar.gz", hash = "sha256:ce5658730b697dcbc0e0d6d13643a69fd8aeb1b32fa8db3bade8ce114c7975f3", size = 8770, upload-time = "2025-10-16T08:40:03.587Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/b8/50/32d29076aaa1c91983cdd3ca8c6bb4d344830cd7d87a7c0fdc2d98c58509/opentelemetry_instrumentation_threading-0.59b0-py3-none-any.whl", hash = "sha256:76da2fc01fe1dccebff6581080cff9e42ac7b27cc61eb563f3c4435c727e8eca", size = 9313, upload-time = "2025-10-16T08:39:15.876Z" },
+]
+
+[[package]]
+name = "opentelemetry-proto"
+version = "1.38.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "protobuf" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/51/14/f0c4f0f6371b9cb7f9fa9ee8918bfd59ac7040c7791f1e6da32a1839780d/opentelemetry_proto-1.38.0.tar.gz", hash = "sha256:88b161e89d9d372ce723da289b7da74c3a8354a8e5359992be813942969ed468", size = 46152, upload-time = "2025-10-16T08:36:01.612Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/b6/6a/82b68b14efca5150b2632f3692d627afa76b77378c4999f2648979409528/opentelemetry_proto-1.38.0-py3-none-any.whl", hash = "sha256:b6ebe54d3217c42e45462e2a1ae28c3e2bf2ec5a5645236a490f55f45f1a0a18", size = 72535, upload-time = "2025-10-16T08:35:45.749Z" },
+]
+
+[[package]]
+name = "opentelemetry-sdk"
+version = "1.38.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "opentelemetry-api" },
+    { name = "opentelemetry-semantic-conventions" },
+    { name = "typing-extensions" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/85/cb/f0eee1445161faf4c9af3ba7b848cc22a50a3d3e2515051ad8628c35ff80/opentelemetry_sdk-1.38.0.tar.gz", hash = "sha256:93df5d4d871ed09cb4272305be4d996236eedb232253e3ab864c8620f051cebe", size = 171942, upload-time = "2025-10-16T08:36:02.257Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/2f/2e/e93777a95d7d9c40d270a371392b6d6f1ff170c2a3cb32d6176741b5b723/opentelemetry_sdk-1.38.0-py3-none-any.whl", hash = "sha256:1c66af6564ecc1553d72d811a01df063ff097cdc82ce188da9951f93b8d10f6b", size = 132349, upload-time = "2025-10-16T08:35:46.995Z" },
+]
+
+[[package]]
+name = "opentelemetry-semantic-conventions"
+version = "0.59b0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "opentelemetry-api" },
+    { name = "typing-extensions" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/40/bc/8b9ad3802cd8ac6583a4eb7de7e5d7db004e89cb7efe7008f9c8a537ee75/opentelemetry_semantic_conventions-0.59b0.tar.gz", hash = "sha256:7a6db3f30d70202d5bf9fa4b69bc866ca6a30437287de6c510fb594878aed6b0", size = 129861, upload-time = "2025-10-16T08:36:03.346Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/24/7d/c88d7b15ba8fe5c6b8f93be50fc11795e9fc05386c44afaf6b76fe191f9b/opentelemetry_semantic_conventions-0.59b0-py3-none-any.whl", hash = "sha256:35d3b8833ef97d614136e253c1da9342b4c3c083bbaf29ce31d572a1c3825eed", size = 207954, upload-time = "2025-10-16T08:35:48.054Z" },
+]
+
+[[package]]
+name = "opentelemetry-semantic-conventions-ai"
+version = "0.4.13"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/ba/e6/40b59eda51ac47009fb47afcdf37c6938594a0bd7f3b9fadcbc6058248e3/opentelemetry_semantic_conventions_ai-0.4.13.tar.gz", hash = "sha256:94efa9fb4ffac18c45f54a3a338ffeb7eedb7e1bb4d147786e77202e159f0036", size = 5368, upload-time = "2025-08-22T10:14:17.387Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/35/b5/cf25da2218910f0d6cdf7f876a06bed118c4969eacaf60a887cbaef44f44/opentelemetry_semantic_conventions_ai-0.4.13-py3-none-any.whl", hash = "sha256:883a30a6bb5deaec0d646912b5f9f6dcbb9f6f72557b73d0f2560bf25d13e2d5", size = 6080, upload-time = "2025-08-22T10:14:16.477Z" },
+]
+
+[[package]]
+name = "orjson"
+version = "3.11.4"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/c6/fe/ed708782d6709cc60eb4c2d8a361a440661f74134675c72990f2c48c785f/orjson-3.11.4.tar.gz", hash = "sha256:39485f4ab4c9b30a3943cfe99e1a213c4776fb69e8abd68f66b83d5a0b0fdc6d", size = 5945188, upload-time = "2025-10-24T15:50:38.027Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/63/51/6b556192a04595b93e277a9ff71cd0cc06c21a7df98bcce5963fa0f5e36f/orjson-3.11.4-cp312-cp312-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:d4371de39319d05d3f482f372720b841c841b52f5385bd99c61ed69d55d9ab50", size = 243571, upload-time = "2025-10-24T15:49:10.008Z" },
+    { url = "https://files.pythonhosted.org/packages/1c/2c/2602392ddf2601d538ff11848b98621cd465d1a1ceb9db9e8043181f2f7b/orjson-3.11.4-cp312-cp312-macosx_15_0_arm64.whl", hash = "sha256:e41fd3b3cac850eaae78232f37325ed7d7436e11c471246b87b2cd294ec94853", size = 128891, upload-time = "2025-10-24T15:49:11.297Z" },
+    { url = "https://files.pythonhosted.org/packages/4e/47/bf85dcf95f7a3a12bf223394a4f849430acd82633848d52def09fa3f46ad/orjson-3.11.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:600e0e9ca042878c7fdf189cf1b028fe2c1418cc9195f6cb9824eb6ed99cb938", size = 130137, upload-time = "2025-10-24T15:49:12.544Z" },
+    { url = "https://files.pythonhosted.org/packages/b4/4d/a0cb31007f3ab6f1fd2a1b17057c7c349bc2baf8921a85c0180cc7be8011/orjson-3.11.4-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:7bbf9b333f1568ef5da42bc96e18bf30fd7f8d54e9ae066d711056add508e415", size = 129152, upload-time = "2025-10-24T15:49:13.754Z" },
+    { url = "https://files.pythonhosted.org/packages/f7/ef/2811def7ce3d8576b19e3929fff8f8f0d44bc5eb2e0fdecb2e6e6cc6c720/orjson-3.11.4-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4806363144bb6e7297b8e95870e78d30a649fdc4e23fc84daa80c8ebd366ce44", size = 136834, upload-time = "2025-10-24T15:49:15.307Z" },
+    { url = "https://files.pythonhosted.org/packages/00/d4/9aee9e54f1809cec8ed5abd9bc31e8a9631d19460e3b8470145d25140106/orjson-3.11.4-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ad355e8308493f527d41154e9053b86a5be892b3b359a5c6d5d95cda23601cb2", size = 137519, upload-time = "2025-10-24T15:49:16.557Z" },
+    { url = "https://files.pythonhosted.org/packages/db/ea/67bfdb5465d5679e8ae8d68c11753aaf4f47e3e7264bad66dc2f2249e643/orjson-3.11.4-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c8a7517482667fb9f0ff1b2f16fe5829296ed7a655d04d68cd9711a4d8a4e708", size = 136749, upload-time = "2025-10-24T15:49:17.796Z" },
+    { url = "https://files.pythonhosted.org/packages/01/7e/62517dddcfce6d53a39543cd74d0dccfcbdf53967017c58af68822100272/orjson-3.11.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:97eb5942c7395a171cbfecc4ef6701fc3c403e762194683772df4c54cfbb2210", size = 136325, upload-time = "2025-10-24T15:49:19.347Z" },
+    { url = "https://files.pythonhosted.org/packages/18/ae/40516739f99ab4c7ec3aaa5cc242d341fcb03a45d89edeeaabc5f69cb2cf/orjson-3.11.4-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:149d95d5e018bdd822e3f38c103b1a7c91f88d38a88aada5c4e9b3a73a244241", size = 140204, upload-time = "2025-10-24T15:49:20.545Z" },
+    { url = "https://files.pythonhosted.org/packages/82/18/ff5734365623a8916e3a4037fcef1cd1782bfc14cf0992afe7940c5320bf/orjson-3.11.4-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:624f3951181eb46fc47dea3d221554e98784c823e7069edb5dbd0dc826ac909b", size = 406242, upload-time = "2025-10-24T15:49:21.884Z" },
+    { url = "https://files.pythonhosted.org/packages/e1/43/96436041f0a0c8c8deca6a05ebeaf529bf1de04839f93ac5e7c479807aec/orjson-3.11.4-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:03bfa548cf35e3f8b3a96c4e8e41f753c686ff3d8e182ce275b1751deddab58c", size = 150013, upload-time = "2025-10-24T15:49:23.185Z" },
+    { url = "https://files.pythonhosted.org/packages/1b/48/78302d98423ed8780479a1e682b9aecb869e8404545d999d34fa486e573e/orjson-3.11.4-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:525021896afef44a68148f6ed8a8bf8375553d6066c7f48537657f64823565b9", size = 139951, upload-time = "2025-10-24T15:49:24.428Z" },
+    { url = "https://files.pythonhosted.org/packages/4a/7b/ad613fdcdaa812f075ec0875143c3d37f8654457d2af17703905425981bf/orjson-3.11.4-cp312-cp312-win32.whl", hash = "sha256:b58430396687ce0f7d9eeb3dd47761ca7d8fda8e9eb92b3077a7a353a75efefa", size = 136049, upload-time = "2025-10-24T15:49:25.973Z" },
+    { url = "https://files.pythonhosted.org/packages/b9/3c/9cf47c3ff5f39b8350fb21ba65d789b6a1129d4cbb3033ba36c8a9023520/orjson-3.11.4-cp312-cp312-win_amd64.whl", hash = "sha256:c6dbf422894e1e3c80a177133c0dda260f81428f9de16d61041949f6a2e5c140", size = 131461, upload-time = "2025-10-24T15:49:27.259Z" },
+    { url = "https://files.pythonhosted.org/packages/c6/3b/e2425f61e5825dc5b08c2a5a2b3af387eaaca22a12b9c8c01504f8614c36/orjson-3.11.4-cp312-cp312-win_arm64.whl", hash = "sha256:d38d2bc06d6415852224fcc9c0bfa834c25431e466dc319f0edd56cca81aa96e", size = 126167, upload-time = "2025-10-24T15:49:28.511Z" },
+    { url = "https://files.pythonhosted.org/packages/23/15/c52aa7112006b0f3d6180386c3a46ae057f932ab3425bc6f6ac50431cca1/orjson-3.11.4-cp313-cp313-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:2d6737d0e616a6e053c8b4acc9eccea6b6cce078533666f32d140e4f85002534", size = 243525, upload-time = "2025-10-24T15:49:29.737Z" },
+    { url = "https://files.pythonhosted.org/packages/ec/38/05340734c33b933fd114f161f25a04e651b0c7c33ab95e9416ade5cb44b8/orjson-3.11.4-cp313-cp313-macosx_15_0_arm64.whl", hash = "sha256:afb14052690aa328cc118a8e09f07c651d301a72e44920b887c519b313d892ff", size = 128871, upload-time = "2025-10-24T15:49:31.109Z" },
+    { url = "https://files.pythonhosted.org/packages/55/b9/ae8d34899ff0c012039b5a7cb96a389b2476e917733294e498586b45472d/orjson-3.11.4-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:38aa9e65c591febb1b0aed8da4d469eba239d434c218562df179885c94e1a3ad", size = 130055, upload-time = "2025-10-24T15:49:33.382Z" },
+    { url = "https://files.pythonhosted.org/packages/33/aa/6346dd5073730451bee3681d901e3c337e7ec17342fb79659ec9794fc023/orjson-3.11.4-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:f2cf4dfaf9163b0728d061bebc1e08631875c51cd30bf47cb9e3293bfbd7dcd5", size = 129061, upload-time = "2025-10-24T15:49:34.935Z" },
+    { url = "https://files.pythonhosted.org/packages/39/e4/8eea51598f66a6c853c380979912d17ec510e8e66b280d968602e680b942/orjson-3.11.4-cp313-cp313-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:89216ff3dfdde0e4070932e126320a1752c9d9a758d6a32ec54b3b9334991a6a", size = 136541, upload-time = "2025-10-24T15:49:36.923Z" },
+    { url = "https://files.pythonhosted.org/packages/9a/47/cb8c654fa9adcc60e99580e17c32b9e633290e6239a99efa6b885aba9dbc/orjson-3.11.4-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9daa26ca8e97fae0ce8aa5d80606ef8f7914e9b129b6b5df9104266f764ce436", size = 137535, upload-time = "2025-10-24T15:49:38.307Z" },
+    { url = "https://files.pythonhosted.org/packages/43/92/04b8cc5c2b729f3437ee013ce14a60ab3d3001465d95c184758f19362f23/orjson-3.11.4-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5c8b2769dc31883c44a9cd126560327767f848eb95f99c36c9932f51090bfce9", size = 136703, upload-time = "2025-10-24T15:49:40.795Z" },
+    { url = "https://files.pythonhosted.org/packages/aa/fd/d0733fcb9086b8be4ebcfcda2d0312865d17d0d9884378b7cffb29d0763f/orjson-3.11.4-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1469d254b9884f984026bd9b0fa5bbab477a4bfe558bba6848086f6d43eb5e73", size = 136293, upload-time = "2025-10-24T15:49:42.347Z" },
+    { url = "https://files.pythonhosted.org/packages/c2/d7/3c5514e806837c210492d72ae30ccf050ce3f940f45bf085bab272699ef4/orjson-3.11.4-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:68e44722541983614e37117209a194e8c3ad07838ccb3127d96863c95ec7f1e0", size = 140131, upload-time = "2025-10-24T15:49:43.638Z" },
+    { url = "https://files.pythonhosted.org/packages/9c/dd/ba9d32a53207babf65bd510ac4d0faaa818bd0df9a9c6f472fe7c254f2e3/orjson-3.11.4-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:8e7805fda9672c12be2f22ae124dcd7b03928d6c197544fe12174b86553f3196", size = 406164, upload-time = "2025-10-24T15:49:45.498Z" },
+    { url = "https://files.pythonhosted.org/packages/8e/f9/f68ad68f4af7c7bde57cd514eaa2c785e500477a8bc8f834838eb696a685/orjson-3.11.4-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:04b69c14615fb4434ab867bf6f38b2d649f6f300af30a6705397e895f7aec67a", size = 149859, upload-time = "2025-10-24T15:49:46.981Z" },
+    { url = "https://files.pythonhosted.org/packages/b6/d2/7f847761d0c26818395b3d6b21fb6bc2305d94612a35b0a30eae65a22728/orjson-3.11.4-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:639c3735b8ae7f970066930e58cf0ed39a852d417c24acd4a25fc0b3da3c39a6", size = 139926, upload-time = "2025-10-24T15:49:48.321Z" },
+    { url = "https://files.pythonhosted.org/packages/9f/37/acd14b12dc62db9a0e1d12386271b8661faae270b22492580d5258808975/orjson-3.11.4-cp313-cp313-win32.whl", hash = "sha256:6c13879c0d2964335491463302a6ca5ad98105fc5db3565499dcb80b1b4bd839", size = 136007, upload-time = "2025-10-24T15:49:49.938Z" },
+    { url = "https://files.pythonhosted.org/packages/c0/a9/967be009ddf0a1fffd7a67de9c36656b28c763659ef91352acc02cbe364c/orjson-3.11.4-cp313-cp313-win_amd64.whl", hash = "sha256:09bf242a4af98732db9f9a1ec57ca2604848e16f132e3f72edfd3c5c96de009a", size = 131314, upload-time = "2025-10-24T15:49:51.248Z" },
+    { url = "https://files.pythonhosted.org/packages/cb/db/399abd6950fbd94ce125cb8cd1a968def95174792e127b0642781e040ed4/orjson-3.11.4-cp313-cp313-win_arm64.whl", hash = "sha256:a85f0adf63319d6c1ba06fb0dbf997fced64a01179cf17939a6caca662bf92de", size = 126152, upload-time = "2025-10-24T15:49:52.922Z" },
+    { url = "https://files.pythonhosted.org/packages/25/e3/54ff63c093cc1697e758e4fceb53164dd2661a7d1bcd522260ba09f54533/orjson-3.11.4-cp314-cp314-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:42d43a1f552be1a112af0b21c10a5f553983c2a0938d2bbb8ecd8bc9fb572803", size = 243501, upload-time = "2025-10-24T15:49:54.288Z" },
+    { url = "https://files.pythonhosted.org/packages/ac/7d/e2d1076ed2e8e0ae9badca65bf7ef22710f93887b29eaa37f09850604e09/orjson-3.11.4-cp314-cp314-macosx_15_0_arm64.whl", hash = "sha256:26a20f3fbc6c7ff2cb8e89c4c5897762c9d88cf37330c6a117312365d6781d54", size = 128862, upload-time = "2025-10-24T15:49:55.961Z" },
+    { url = "https://files.pythonhosted.org/packages/9f/37/ca2eb40b90621faddfa9517dfe96e25f5ae4d8057a7c0cdd613c17e07b2c/orjson-3.11.4-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6e3f20be9048941c7ffa8fc523ccbd17f82e24df1549d1d1fe9317712d19938e", size = 130047, upload-time = "2025-10-24T15:49:57.406Z" },
+    { url = "https://files.pythonhosted.org/packages/c7/62/1021ed35a1f2bad9040f05fa4cc4f9893410df0ba3eaa323ccf899b1c90a/orjson-3.11.4-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:aac364c758dc87a52e68e349924d7e4ded348dedff553889e4d9f22f74785316", size = 129073, upload-time = "2025-10-24T15:49:58.782Z" },
+    { url = "https://files.pythonhosted.org/packages/e8/3f/f84d966ec2a6fd5f73b1a707e7cd876813422ae4bf9f0145c55c9c6a0f57/orjson-3.11.4-cp314-cp314-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d5c54a6d76e3d741dcc3f2707f8eeb9ba2a791d3adbf18f900219b62942803b1", size = 136597, upload-time = "2025-10-24T15:50:00.12Z" },
+    { url = "https://files.pythonhosted.org/packages/32/78/4fa0aeca65ee82bbabb49e055bd03fa4edea33f7c080c5c7b9601661ef72/orjson-3.11.4-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f28485bdca8617b79d44627f5fb04336897041dfd9fa66d383a49d09d86798bc", size = 137515, upload-time = "2025-10-24T15:50:01.57Z" },
+    { url = "https://files.pythonhosted.org/packages/c1/9d/0c102e26e7fde40c4c98470796d050a2ec1953897e2c8ab0cb95b0759fa2/orjson-3.11.4-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:bfc2a484cad3585e4ba61985a6062a4c2ed5c7925db6d39f1fa267c9d166487f", size = 136703, upload-time = "2025-10-24T15:50:02.944Z" },
+    { url = "https://files.pythonhosted.org/packages/df/ac/2de7188705b4cdfaf0b6c97d2f7849c17d2003232f6e70df98602173f788/orjson-3.11.4-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e34dbd508cb91c54f9c9788923daca129fe5b55c5b4eebe713bf5ed3791280cf", size = 136311, upload-time = "2025-10-24T15:50:04.441Z" },
+    { url = "https://files.pythonhosted.org/packages/e0/52/847fcd1a98407154e944feeb12e3b4d487a0e264c40191fb44d1269cbaa1/orjson-3.11.4-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:b13c478fa413d4b4ee606ec8e11c3b2e52683a640b006bb586b3041c2ca5f606", size = 140127, upload-time = "2025-10-24T15:50:07.398Z" },
+    { url = "https://files.pythonhosted.org/packages/c1/ae/21d208f58bdb847dd4d0d9407e2929862561841baa22bdab7aea10ca088e/orjson-3.11.4-cp314-cp314-musllinux_1_2_armv7l.whl", hash = "sha256:724ca721ecc8a831b319dcd72cfa370cc380db0bf94537f08f7edd0a7d4e1780", size = 406201, upload-time = "2025-10-24T15:50:08.796Z" },
+    { url = "https://files.pythonhosted.org/packages/8d/55/0789d6de386c8366059db098a628e2ad8798069e94409b0d8935934cbcb9/orjson-3.11.4-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:977c393f2e44845ce1b540e19a786e9643221b3323dae190668a98672d43fb23", size = 149872, upload-time = "2025-10-24T15:50:10.234Z" },
+    { url = "https://files.pythonhosted.org/packages/cc/1d/7ff81ea23310e086c17b41d78a72270d9de04481e6113dbe2ac19118f7fb/orjson-3.11.4-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:1e539e382cf46edec157ad66b0b0872a90d829a6b71f17cb633d6c160a223155", size = 139931, upload-time = "2025-10-24T15:50:11.623Z" },
+    { url = "https://files.pythonhosted.org/packages/77/92/25b886252c50ed64be68c937b562b2f2333b45afe72d53d719e46a565a50/orjson-3.11.4-cp314-cp314-win32.whl", hash = "sha256:d63076d625babab9db5e7836118bdfa086e60f37d8a174194ae720161eb12394", size = 136065, upload-time = "2025-10-24T15:50:13.025Z" },
+    { url = "https://files.pythonhosted.org/packages/63/b8/718eecf0bb7e9d64e4956afaafd23db9f04c776d445f59fe94f54bdae8f0/orjson-3.11.4-cp314-cp314-win_amd64.whl", hash = "sha256:0a54d6635fa3aaa438ae32e8570b9f0de36f3f6562c308d2a2a452e8b0592db1", size = 131310, upload-time = "2025-10-24T15:50:14.46Z" },
+    { url = "https://files.pythonhosted.org/packages/1a/bf/def5e25d4d8bfce296a9a7c8248109bf58622c21618b590678f945a2c59c/orjson-3.11.4-cp314-cp314-win_arm64.whl", hash = "sha256:78b999999039db3cf58f6d230f524f04f75f129ba3d1ca2ed121f8657e575d3d", size = 126151, upload-time = "2025-10-24T15:50:15.878Z" },
+]
+
 [[package]]
 name = "packaging"
 version = "25.0"
@@ -6135,6 +6396,55 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/ee/ea/c67e1dee1ba208ed22c06d1d547ae5e293374bfc43e0eb0ef5e262b68561/werkzeug-3.1.1-py3-none-any.whl", hash = "sha256:a71124d1ef06008baafa3d266c02f56e1836a5984afd6dd6c9230669d60d9fb5", size = 224371, upload-time = "2024-11-01T16:40:43.994Z" },
 ]
 
+[[package]]
+name = "wrapt"
+version = "1.17.3"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/95/8f/aeb76c5b46e273670962298c23e7ddde79916cb74db802131d49a85e4b7d/wrapt-1.17.3.tar.gz", hash = "sha256:f66eb08feaa410fe4eebd17f2a2c8e2e46d3476e9f8c783daa8e09e0faa666d0", size = 55547, upload-time = "2025-08-12T05:53:21.714Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/9f/41/cad1aba93e752f1f9268c77270da3c469883d56e2798e7df6240dcb2287b/wrapt-1.17.3-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:ab232e7fdb44cdfbf55fc3afa31bcdb0d8980b9b95c38b6405df2acb672af0e0", size = 53998, upload-time = "2025-08-12T05:51:47.138Z" },
+    { url = "https://files.pythonhosted.org/packages/60/f8/096a7cc13097a1869fe44efe68dace40d2a16ecb853141394047f0780b96/wrapt-1.17.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:9baa544e6acc91130e926e8c802a17f3b16fbea0fd441b5a60f5cf2cc5c3deba", size = 39020, upload-time = "2025-08-12T05:51:35.906Z" },
+    { url = "https://files.pythonhosted.org/packages/33/df/bdf864b8997aab4febb96a9ae5c124f700a5abd9b5e13d2a3214ec4be705/wrapt-1.17.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6b538e31eca1a7ea4605e44f81a48aa24c4632a277431a6ed3f328835901f4fd", size = 39098, upload-time = "2025-08-12T05:51:57.474Z" },
+    { url = "https://files.pythonhosted.org/packages/9f/81/5d931d78d0eb732b95dc3ddaeeb71c8bb572fb01356e9133916cd729ecdd/wrapt-1.17.3-cp312-cp312-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:042ec3bb8f319c147b1301f2393bc19dba6e176b7da446853406d041c36c7828", size = 88036, upload-time = "2025-08-12T05:52:34.784Z" },
+    { url = "https://files.pythonhosted.org/packages/ca/38/2e1785df03b3d72d34fc6252d91d9d12dc27a5c89caef3335a1bbb8908ca/wrapt-1.17.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3af60380ba0b7b5aeb329bc4e402acd25bd877e98b3727b0135cb5c2efdaefe9", size = 88156, upload-time = "2025-08-12T05:52:13.599Z" },
+    { url = "https://files.pythonhosted.org/packages/b3/8b/48cdb60fe0603e34e05cffda0b2a4adab81fd43718e11111a4b0100fd7c1/wrapt-1.17.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:0b02e424deef65c9f7326d8c19220a2c9040c51dc165cddb732f16198c168396", size = 87102, upload-time = "2025-08-12T05:52:14.56Z" },
+    { url = "https://files.pythonhosted.org/packages/3c/51/d81abca783b58f40a154f1b2c56db1d2d9e0d04fa2d4224e357529f57a57/wrapt-1.17.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:74afa28374a3c3a11b3b5e5fca0ae03bef8450d6aa3ab3a1e2c30e3a75d023dc", size = 87732, upload-time = "2025-08-12T05:52:36.165Z" },
+    { url = "https://files.pythonhosted.org/packages/9e/b1/43b286ca1392a006d5336412d41663eeef1ad57485f3e52c767376ba7e5a/wrapt-1.17.3-cp312-cp312-win32.whl", hash = "sha256:4da9f45279fff3543c371d5ababc57a0384f70be244de7759c85a7f989cb4ebe", size = 36705, upload-time = "2025-08-12T05:53:07.123Z" },
+    { url = "https://files.pythonhosted.org/packages/28/de/49493f962bd3c586ab4b88066e967aa2e0703d6ef2c43aa28cb83bf7b507/wrapt-1.17.3-cp312-cp312-win_amd64.whl", hash = "sha256:e71d5c6ebac14875668a1e90baf2ea0ef5b7ac7918355850c0908ae82bcb297c", size = 38877, upload-time = "2025-08-12T05:53:05.436Z" },
+    { url = "https://files.pythonhosted.org/packages/f1/48/0f7102fe9cb1e8a5a77f80d4f0956d62d97034bbe88d33e94699f99d181d/wrapt-1.17.3-cp312-cp312-win_arm64.whl", hash = "sha256:604d076c55e2fdd4c1c03d06dc1a31b95130010517b5019db15365ec4a405fc6", size = 36885, upload-time = "2025-08-12T05:52:54.367Z" },
+    { url = "https://files.pythonhosted.org/packages/fc/f6/759ece88472157acb55fc195e5b116e06730f1b651b5b314c66291729193/wrapt-1.17.3-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:a47681378a0439215912ef542c45a783484d4dd82bac412b71e59cf9c0e1cea0", size = 54003, upload-time = "2025-08-12T05:51:48.627Z" },
+    { url = "https://files.pythonhosted.org/packages/4f/a9/49940b9dc6d47027dc850c116d79b4155f15c08547d04db0f07121499347/wrapt-1.17.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:54a30837587c6ee3cd1a4d1c2ec5d24e77984d44e2f34547e2323ddb4e22eb77", size = 39025, upload-time = "2025-08-12T05:51:37.156Z" },
+    { url = "https://files.pythonhosted.org/packages/45/35/6a08de0f2c96dcdd7fe464d7420ddb9a7655a6561150e5fc4da9356aeaab/wrapt-1.17.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:16ecf15d6af39246fe33e507105d67e4b81d8f8d2c6598ff7e3ca1b8a37213f7", size = 39108, upload-time = "2025-08-12T05:51:58.425Z" },
+    { url = "https://files.pythonhosted.org/packages/0c/37/6faf15cfa41bf1f3dba80cd3f5ccc6622dfccb660ab26ed79f0178c7497f/wrapt-1.17.3-cp313-cp313-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:6fd1ad24dc235e4ab88cda009e19bf347aabb975e44fd5c2fb22a3f6e4141277", size = 88072, upload-time = "2025-08-12T05:52:37.53Z" },
+    { url = "https://files.pythonhosted.org/packages/78/f2/efe19ada4a38e4e15b6dff39c3e3f3f73f5decf901f66e6f72fe79623a06/wrapt-1.17.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0ed61b7c2d49cee3c027372df5809a59d60cf1b6c2f81ee980a091f3afed6a2d", size = 88214, upload-time = "2025-08-12T05:52:15.886Z" },
+    { url = "https://files.pythonhosted.org/packages/40/90/ca86701e9de1622b16e09689fc24b76f69b06bb0150990f6f4e8b0eeb576/wrapt-1.17.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:423ed5420ad5f5529db9ce89eac09c8a2f97da18eb1c870237e84c5a5c2d60aa", size = 87105, upload-time = "2025-08-12T05:52:17.914Z" },
+    { url = "https://files.pythonhosted.org/packages/fd/e0/d10bd257c9a3e15cbf5523025252cc14d77468e8ed644aafb2d6f54cb95d/wrapt-1.17.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:e01375f275f010fcbf7f643b4279896d04e571889b8a5b3f848423d91bf07050", size = 87766, upload-time = "2025-08-12T05:52:39.243Z" },
+    { url = "https://files.pythonhosted.org/packages/e8/cf/7d848740203c7b4b27eb55dbfede11aca974a51c3d894f6cc4b865f42f58/wrapt-1.17.3-cp313-cp313-win32.whl", hash = "sha256:53e5e39ff71b3fc484df8a522c933ea2b7cdd0d5d15ae82e5b23fde87d44cbd8", size = 36711, upload-time = "2025-08-12T05:53:10.074Z" },
+    { url = "https://files.pythonhosted.org/packages/57/54/35a84d0a4d23ea675994104e667ceff49227ce473ba6a59ba2c84f250b74/wrapt-1.17.3-cp313-cp313-win_amd64.whl", hash = "sha256:1f0b2f40cf341ee8cc1a97d51ff50dddb9fcc73241b9143ec74b30fc4f44f6cb", size = 38885, upload-time = "2025-08-12T05:53:08.695Z" },
+    { url = "https://files.pythonhosted.org/packages/01/77/66e54407c59d7b02a3c4e0af3783168fff8e5d61def52cda8728439d86bc/wrapt-1.17.3-cp313-cp313-win_arm64.whl", hash = "sha256:7425ac3c54430f5fc5e7b6f41d41e704db073309acfc09305816bc6a0b26bb16", size = 36896, upload-time = "2025-08-12T05:52:55.34Z" },
+    { url = "https://files.pythonhosted.org/packages/02/a2/cd864b2a14f20d14f4c496fab97802001560f9f41554eef6df201cd7f76c/wrapt-1.17.3-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:cf30f6e3c077c8e6a9a7809c94551203c8843e74ba0c960f4a98cd80d4665d39", size = 54132, upload-time = "2025-08-12T05:51:49.864Z" },
+    { url = "https://files.pythonhosted.org/packages/d5/46/d011725b0c89e853dc44cceb738a307cde5d240d023d6d40a82d1b4e1182/wrapt-1.17.3-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:e228514a06843cae89621384cfe3a80418f3c04aadf8a3b14e46a7be704e4235", size = 39091, upload-time = "2025-08-12T05:51:38.935Z" },
+    { url = "https://files.pythonhosted.org/packages/2e/9e/3ad852d77c35aae7ddebdbc3b6d35ec8013af7d7dddad0ad911f3d891dae/wrapt-1.17.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:5ea5eb3c0c071862997d6f3e02af1d055f381b1d25b286b9d6644b79db77657c", size = 39172, upload-time = "2025-08-12T05:51:59.365Z" },
+    { url = "https://files.pythonhosted.org/packages/c3/f7/c983d2762bcce2326c317c26a6a1e7016f7eb039c27cdf5c4e30f4160f31/wrapt-1.17.3-cp314-cp314-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:281262213373b6d5e4bb4353bc36d1ba4084e6d6b5d242863721ef2bf2c2930b", size = 87163, upload-time = "2025-08-12T05:52:40.965Z" },
+    { url = "https://files.pythonhosted.org/packages/e4/0f/f673f75d489c7f22d17fe0193e84b41540d962f75fce579cf6873167c29b/wrapt-1.17.3-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:dc4a8d2b25efb6681ecacad42fca8859f88092d8732b170de6a5dddd80a1c8fa", size = 87963, upload-time = "2025-08-12T05:52:20.326Z" },
+    { url = "https://files.pythonhosted.org/packages/df/61/515ad6caca68995da2fac7a6af97faab8f78ebe3bf4f761e1b77efbc47b5/wrapt-1.17.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:373342dd05b1d07d752cecbec0c41817231f29f3a89aa8b8843f7b95992ed0c7", size = 86945, upload-time = "2025-08-12T05:52:21.581Z" },
+    { url = "https://files.pythonhosted.org/packages/d3/bd/4e70162ce398462a467bc09e768bee112f1412e563620adc353de9055d33/wrapt-1.17.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:d40770d7c0fd5cbed9d84b2c3f2e156431a12c9a37dc6284060fb4bec0b7ffd4", size = 86857, upload-time = "2025-08-12T05:52:43.043Z" },
+    { url = "https://files.pythonhosted.org/packages/2b/b8/da8560695e9284810b8d3df8a19396a6e40e7518059584a1a394a2b35e0a/wrapt-1.17.3-cp314-cp314-win32.whl", hash = "sha256:fbd3c8319de8e1dc79d346929cd71d523622da527cca14e0c1d257e31c2b8b10", size = 37178, upload-time = "2025-08-12T05:53:12.605Z" },
+    { url = "https://files.pythonhosted.org/packages/db/c8/b71eeb192c440d67a5a0449aaee2310a1a1e8eca41676046f99ed2487e9f/wrapt-1.17.3-cp314-cp314-win_amd64.whl", hash = "sha256:e1a4120ae5705f673727d3253de3ed0e016f7cd78dc463db1b31e2463e1f3cf6", size = 39310, upload-time = "2025-08-12T05:53:11.106Z" },
+    { url = "https://files.pythonhosted.org/packages/45/20/2cda20fd4865fa40f86f6c46ed37a2a8356a7a2fde0773269311f2af56c7/wrapt-1.17.3-cp314-cp314-win_arm64.whl", hash = "sha256:507553480670cab08a800b9463bdb881b2edeed77dc677b0a5915e6106e91a58", size = 37266, upload-time = "2025-08-12T05:52:56.531Z" },
+    { url = "https://files.pythonhosted.org/packages/77/ed/dd5cf21aec36c80443c6f900449260b80e2a65cf963668eaef3b9accce36/wrapt-1.17.3-cp314-cp314t-macosx_10_13_universal2.whl", hash = "sha256:ed7c635ae45cfbc1a7371f708727bf74690daedc49b4dba310590ca0bd28aa8a", size = 56544, upload-time = "2025-08-12T05:51:51.109Z" },
+    { url = "https://files.pythonhosted.org/packages/8d/96/450c651cc753877ad100c7949ab4d2e2ecc4d97157e00fa8f45df682456a/wrapt-1.17.3-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:249f88ed15503f6492a71f01442abddd73856a0032ae860de6d75ca62eed8067", size = 40283, upload-time = "2025-08-12T05:51:39.912Z" },
+    { url = "https://files.pythonhosted.org/packages/d1/86/2fcad95994d9b572db57632acb6f900695a648c3e063f2cd344b3f5c5a37/wrapt-1.17.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:5a03a38adec8066d5a37bea22f2ba6bbf39fcdefbe2d91419ab864c3fb515454", size = 40366, upload-time = "2025-08-12T05:52:00.693Z" },
+    { url = "https://files.pythonhosted.org/packages/64/0e/f4472f2fdde2d4617975144311f8800ef73677a159be7fe61fa50997d6c0/wrapt-1.17.3-cp314-cp314t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:5d4478d72eb61c36e5b446e375bbc49ed002430d17cdec3cecb36993398e1a9e", size = 108571, upload-time = "2025-08-12T05:52:44.521Z" },
+    { url = "https://files.pythonhosted.org/packages/cc/01/9b85a99996b0a97c8a17484684f206cbb6ba73c1ce6890ac668bcf3838fb/wrapt-1.17.3-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:223db574bb38637e8230eb14b185565023ab624474df94d2af18f1cdb625216f", size = 113094, upload-time = "2025-08-12T05:52:22.618Z" },
+    { url = "https://files.pythonhosted.org/packages/25/02/78926c1efddcc7b3aa0bc3d6b33a822f7d898059f7cd9ace8c8318e559ef/wrapt-1.17.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:e405adefb53a435f01efa7ccdec012c016b5a1d3f35459990afc39b6be4d5056", size = 110659, upload-time = "2025-08-12T05:52:24.057Z" },
+    { url = "https://files.pythonhosted.org/packages/dc/ee/c414501ad518ac3e6fe184753632fe5e5ecacdcf0effc23f31c1e4f7bfcf/wrapt-1.17.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:88547535b787a6c9ce4086917b6e1d291aa8ed914fdd3a838b3539dc95c12804", size = 106946, upload-time = "2025-08-12T05:52:45.976Z" },
+    { url = "https://files.pythonhosted.org/packages/be/44/a1bd64b723d13bb151d6cc91b986146a1952385e0392a78567e12149c7b4/wrapt-1.17.3-cp314-cp314t-win32.whl", hash = "sha256:41b1d2bc74c2cac6f9074df52b2efbef2b30bdfe5f40cb78f8ca22963bc62977", size = 38717, upload-time = "2025-08-12T05:53:15.214Z" },
+    { url = "https://files.pythonhosted.org/packages/79/d9/7cfd5a312760ac4dd8bf0184a6ee9e43c33e47f3dadc303032ce012b8fa3/wrapt-1.17.3-cp314-cp314t-win_amd64.whl", hash = "sha256:73d496de46cd2cdbdbcce4ae4bcdb4afb6a11234a1df9c085249d55166b95116", size = 41334, upload-time = "2025-08-12T05:53:14.178Z" },
+    { url = "https://files.pythonhosted.org/packages/46/78/10ad9781128ed2f99dbc474f43283b13fea8ba58723e98844367531c18e9/wrapt-1.17.3-cp314-cp314t-win_arm64.whl", hash = "sha256:f38e60678850c42461d4202739f9bf1e3a737c7ad283638251e79cc49effb6b6", size = 38471, upload-time = "2025-08-12T05:52:57.784Z" },
+    { url = "https://files.pythonhosted.org/packages/1f/f6/a933bd70f98e9cf3e08167fc5cd7aaaca49147e48411c0bd5ae701bb2194/wrapt-1.17.3-py3-none-any.whl", hash = "sha256:7171ae35d2c33d326ac19dd8facb1e82e5fd04ef8c6c0e394d7af55a55051c22", size = 23591, upload-time = "2025-08-12T05:53:20.674Z" },
+]
+
 [[package]]
 name = "wsproto"
 version = "1.2.0"

From cdd72003423b0c1dbb8951f6aafb919675a54212 Mon Sep 17 00:00:00 2001
From: Xingyao Wang <xingyao@all-hands.dev>
Date: Thu, 6 Nov 2025 19:23:57 +0000
Subject: [PATCH 17/66] Revert "Fix Docker cache tag length exceeding 128
 character limit"

This reverts commit 3ba1e46f29b01376d90c23087562373eb2f8e5d8.
---
 .github/workflows/build-swe-bench-images.yml |  6 ---
 .github/workflows/fix-cache-tag-length.patch | 51 --------------------
 2 files changed, 57 deletions(-)
 delete mode 100644 .github/workflows/fix-cache-tag-length.patch

diff --git a/.github/workflows/build-swe-bench-images.yml b/.github/workflows/build-swe-bench-images.yml
index e56d3343..bd4d741a 100644
--- a/.github/workflows/build-swe-bench-images.yml
+++ b/.github/workflows/build-swe-bench-images.yml
@@ -96,12 +96,6 @@ jobs:
         with:
           enable-cache: true
 
-      - name: Apply fix for Docker cache tag length limit
-        run: |
-          cd vendor/software-agent-sdk
-          git apply ../../.github/workflows/fix-cache-tag-length.patch
-          echo "Applied patch to fix cache tag length limit"
-
       - name: Install dependencies
         run: |
           make build
diff --git a/.github/workflows/fix-cache-tag-length.patch b/.github/workflows/fix-cache-tag-length.patch
deleted file mode 100644
index 72306970..00000000
--- a/.github/workflows/fix-cache-tag-length.patch
+++ /dev/null
@@ -1,51 +0,0 @@
-diff --git a/openhands-agent-server/openhands/agent_server/docker/build.py b/openhands-agent-server/openhands/agent_server/docker/build.py
-index 1cc9cd4d..c5023d79 100755
---- a/openhands-agent-server/openhands/agent_server/docker/build.py
-+++ b/openhands-agent-server/openhands/agent_server/docker/build.py
-@@ -14,6 +14,7 @@ Single-entry build helper for agent-server images.
- """
- 
- import argparse
-+import hashlib
- import os
- import re
- import shutil
-@@ -284,7 +285,37 @@ class BuildOptions(BaseModel):
- 
-     @property
-     def cache_tags(self) -> tuple[str, str]:
--        base = f"buildcache-{self.target}-{self.base_image_slug}"
-+        # Docker image tags have a 128-character limit.
-+        # If the base slug is too long, hash it to create a shorter unique identifier.
-+        MAX_TAG_LENGTH = 128
-+        base_slug = self.base_image_slug
-+
-+        # Reserve space for prefix, branch, and separators
-+        prefix = f"buildcache-{self.target}-"
-+        branch_suffix = (
-+            f"-{_sanitize_branch(GIT_REF)}"
-+            if GIT_REF not in ("main", "refs/heads/main", "unknown")
-+            else ""
-+        )
-+        main_suffix = "-main" if GIT_REF in ("main", "refs/heads/main") else ""
-+
-+        # Calculate available space for base_slug
-+        reserved = len(prefix) + max(len(branch_suffix), len(main_suffix))
-+        available = MAX_TAG_LENGTH - reserved
-+
-+        # If base_slug is too long, use a hash
-+        if len(base_slug) > available:
-+            # Use first 8 chars of SHA256 hash for uniqueness while keeping it short
-+            hash_digest = hashlib.sha256(base_slug.encode()).hexdigest()[:12]
-+            base_slug_short = hash_digest
-+            logger.debug(
-+                f"[build] Base image slug too long ({len(base_slug)} chars), "
-+                f"using hash: {base_slug_short}"
-+            )
-+        else:
-+            base_slug_short = base_slug
-+
-+        base = f"{prefix}{base_slug_short}"
-         if GIT_REF in ("main", "refs/heads/main"):
-             return f"{base}-main", base
-         elif GIT_REF != "unknown":

From 001bcee9f4a93c1a0695045f364aad6ebcc99a0a Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Thu, 6 Nov 2025 19:39:52 +0000
Subject: [PATCH 18/66] Fix log file mixing issue by using ProcessPoolExecutor

The build workflow was experiencing log file corruption and I/O errors due to
concurrent builds writing to the wrong log files. This was caused by using
ThreadPoolExecutor with contextlib.redirect_stdout/stderr, which rebinds the
process-wide sys.stdout/sys.stderr rather than redirecting per thread.

The SDK's build() function spawns subprocesses and uses logger.info()/warning()
to output build logs. Because the redirected streams and the logging handlers
are shared by every thread in the process, output from concurrent builds could:
- Write to the wrong log files
- Attempt writing to closed file handles
- Result in ValueError('I/O operation on closed file.')

Solution: Replace ThreadPoolExecutor with ProcessPoolExecutor to provide
complete process-level isolation with separate stdout/stderr/logging per
build. The additional overhead is negligible compared to Docker build time.

Changes:
- Import ProcessPoolExecutor instead of ThreadPoolExecutor
- Move build_one_fn to module level (_build_with_logging) for pickle support
- Update executor initialization to use ProcessPoolExecutor
- Add explanatory comments about isolation requirements

Co-authored-by: openhands <openhands@all-hands.dev>
---
 benchmarks/swe_bench/build_images.py | 30 ++++++++++++++++++----------
 1 file changed, 19 insertions(+), 11 deletions(-)

diff --git a/benchmarks/swe_bench/build_images.py b/benchmarks/swe_bench/build_images.py
index 08054c35..48a408de 100644
--- a/benchmarks/swe_bench/build_images.py
+++ b/benchmarks/swe_bench/build_images.py
@@ -12,7 +12,7 @@
 import contextlib
 import io
 import sys
-from concurrent.futures import ThreadPoolExecutor, as_completed
+from concurrent.futures import ProcessPoolExecutor, as_completed
 from datetime import UTC, datetime
 from pathlib import Path
 from threading import Lock
@@ -162,6 +162,19 @@ def _default_build_output_dir(
     return root
 
 
+def _build_with_logging(
+    base: str, log_dir: Path, args: argparse.Namespace
+) -> BuildOutput:
+    """
+    Module-level function for building a single image with output capture.
+    Must be at module level to be picklable for ProcessPoolExecutor.
+    """
+    with capture_output(base, log_dir) as log_path:
+        result = build_one(base, args)
+        result.log_path = str(log_path)
+        return result
+
+
 def _update_pbar(
     pbar: tqdm,
     successes: int,
@@ -191,12 +204,6 @@ def main(argv: list[str]) -> int:
     manifest_path = BUILD_DIR / "manifest.jsonl"
     manifest_path.parent.mkdir(parents=True, exist_ok=True)
 
-    def build_one_fn(base: str, args) -> BuildOutput:
-        with capture_output(base, BUILD_LOG_DIR) as log_path:
-            result = build_one(base, args)
-            result.log_path = str(log_path)
-            return result
-
     if args.dry_run:
         print("\n".join(bases))
         return 0
@@ -212,13 +219,14 @@ def build_one_fn(base: str, args) -> BuildOutput:
     ):
         _update_pbar(pbar, successes, failures, 0, None, "Queueing")
 
-        # Single unified path: ThreadPoolExecutor( max_workers = args.max_workers ),
-        # even if it's 1
-        with ThreadPoolExecutor(max_workers=args.max_workers) as ex:
+        # Single unified path: ProcessPoolExecutor( max_workers = args.max_workers ),
+        # even if it's 1. Using processes instead of threads ensures proper isolation
+        # of stdout/stderr and logging handlers, preventing output mixing between builds.
+        with ProcessPoolExecutor(max_workers=args.max_workers) as ex:
             futures = {}
             for base in bases:
                 in_progress.add(base)
-                fut = ex.submit(build_one_fn, base, args)
+                fut = ex.submit(_build_with_logging, base, BUILD_LOG_DIR, args)
                 futures[fut] = base
 
             _update_pbar(

From 271b5271a1c22375a85f3b771c78189764439a8f Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Thu, 6 Nov 2025 20:33:36 +0000
Subject: [PATCH 19/66] Improve Docker image tagging for reproducibility

This commit improves the tagging system for SWE-Bench Docker images to enable
better reproducibility and clarity.

## Changes

### 1. Benchmarks Build System

**benchmarks/swe_bench/build_images.py:**
- Added `get_sdk_commit_hash()`: Extracts 7-char SDK submodule commit hash
- Added `extract_instance_id()`: Parses SWE-Bench base images to extract instance IDs
- Modified `main()`: Sets SDK_VERSION_OVERRIDE env var with SDK commit hash
- Modified `build_one()`:
  - Generates custom tags: `swebench-{instance_id}`
  - Disables versioned tags via `include_versioned_tag=False`

### 2. SDK Submodule Update

**vendor/software-agent-sdk:**
Updated to commit 77d50e61 which includes:
- `SDK_VERSION_OVERRIDE` environment variable support
- `include_versioned_tag` option in BuildOptions
- Target-based tag suffixes (replaces `-dev` suffix)
- See: https://github.com/OpenHands/software-agent-sdk/pull/1088

### 3. Documentation

**TAGGING_CHANGES.md:**
Comprehensive documentation explaining:
- Why these changes are needed (submodule git context issues)
- Tag format comparison (before/after)
- Benefits (reproducibility, usability, maintainability)
- Implementation details and examples

## Tag Format

### Before
```
v1.0.0_docker.io_s_swebench_s_sweb.eval.x86_64.django_1776_django-12155_tag_latest_source-minimal-dev
```
- 137 characters
- Package version (non-reproducible)
- Unclear `-dev` suffix

### After
```
a612c0a-swebench-django-12155-source-minimal
main-swebench-django-12155-source-minimal
```
- 84 characters (39% shorter)
- Exact commit hash (reproducible)
- Clear target indication

## Benefits

1. **Reproducibility**: Git commit hash ensures exact SDK version tracking
2. **Clarity**: Instance ID and target clearly visible in tag
3. **Consistency**: All builds use same suffix pattern
4. **Backward Compatible**: SDK changes only apply when explicitly enabled

## Related

- SDK PR: https://github.com/OpenHands/software-agent-sdk/pull/1088
- Issue: Improve SWE-Bench image build workflow

Co-authored-by: openhands <openhands@all-hands.dev>
---
 TAGGING_CHANGES.md                   | 185 +++++++++++++++++++++++++++
 benchmarks/swe_bench/build_images.py |  64 ++++++++-
 vendor/software-agent-sdk            |   2 +-
 3 files changed, 249 insertions(+), 2 deletions(-)
 create mode 100644 TAGGING_CHANGES.md

diff --git a/TAGGING_CHANGES.md b/TAGGING_CHANGES.md
new file mode 100644
index 00000000..79ee855a
--- /dev/null
+++ b/TAGGING_CHANGES.md
@@ -0,0 +1,185 @@
+# Docker Image Tagging Improvements
+
+## Summary
+
+This change replaces the long, auto-generated versioned tags with short, meaningful tags that include:
+- **SDK commit hash** (exact reproducibility)
+- **SWE-Bench instance ID** (clear identification)
+
+## Changes Made
+
+### 1. SDK Build System (`vendor/software-agent-sdk/.../docker/build.py`)
+
+**Added three features:**
+
+1. **`SDK_VERSION_OVERRIDE` environment variable**
+   - Allows overriding the package version with a commit hash
+   - Falls back to `importlib.metadata.version("openhands-sdk")` if not set
+   - Critical for git submodule contexts where package version != actual commit
+   - Follows existing pattern (SDK already uses `GITHUB_REF` env var)
+
+2. **`include_versioned_tag` option in BuildOptions**
+   - When `False`, skips the long versioned tag
+   - Defaults to `True` for backward compatibility
+   - Gives consumers control over tag format
+
+3. **Target-based tag suffixes** (replaces `-dev` suffix)
+   - All tags now include `-{target}` suffix: `-binary`, `-source`, `-binary-minimal`, `-source-minimal`
+   - More descriptive than previous `-dev` suffix (which only applied to source builds)
+   - Makes tag meaning immediately clear without needing to check build config
+   - Removed deprecated `is_dev` property
+
+### 2. Benchmarks Build Script (`benchmarks/swe_bench/build_images.py`)
+
+**Added two functions:**
+
+1. **`get_sdk_commit_hash()`**
+   - Extracts the 7-character commit hash from SDK submodule
+   - Returns "unknown" if git fails (with warning)
+
+2. **`extract_instance_id(base_image)`**
+   - Parses SWE-Bench base image name to extract instance ID
+   - Examples:
+     - `...django_1776_django-12155:latest` → `django-12155`
+     - `...sympy_1776_sympy-18189:latest` → `sympy-18189`
+     - `...scikit-learn_3742_scikit-learn-25973:latest` → `scikit-learn-25973`
+
+**Modified build flow:**
+
+1. At startup: Set `SDK_VERSION_OVERRIDE` env var to SDK commit hash
+2. Per image: Extract instance ID and create custom tag `swebench-{instance_id}`
+3. Pass `include_versioned_tag=False` to disable long tag
+
+## Tag Format Comparison
+
+### Before (Old Format)
+```
+ghcr.io/openhands/eval-agent-server:v1.0.0_docker.io_s_swebench_s_sweb.eval.x86_64.django_1776_django-12155_tag_latest_source-minimal-dev
+```
+- **Length**: 137 characters
+- **Includes**: Package version (v1.0.0), full base image path, target
+- **Problem**: No git commit info, hard to parse
+
+### After (New Format)
+```
+ghcr.io/openhands/eval-agent-server:a612c0a-swebench-django-12155-source-minimal
+ghcr.io/openhands/eval-agent-server:main-swebench-django-12155-source-minimal
+```
+- **Length**: 84 characters (**39% shorter**)
+- **Includes**: SDK commit hash, instance ID, build target
+- **Benefits**: 
+  - Exact reproducibility (commit hash)
+  - Easy to parse and filter
+  - Clear instance identification
+  - Explicit target indication (no more ambiguous `-dev` suffix)
+
+## Tag Generation Logic
+
+The SDK's `all_tags` property generates:
+
+1. **Commit-based tag**: `{image}:{SHORT_SHA}-{custom_tag}-{target}{arch_suffix}`
+   - `SHORT_SHA` = First 7 chars of SDK commit (from `SDK_VERSION_OVERRIDE`)
+   - `custom_tag` = `swebench-{instance_id}`
+   - `target` = Build target (`binary`, `source`, `binary-minimal`, `source-minimal`)
+   - Example: `a612c0a-swebench-django-12155-source-minimal`
+
+2. **Main branch tag** (if on main): `{image}:main-{custom_tag}-{target}{arch_suffix}`
+   - Example: `main-swebench-django-12155-source-minimal`
+
+3. **Versioned tag** (now disabled): `{image}:{versioned_tag}-{target}{arch_suffix}`
+   - Skipped when `include_versioned_tag=False`
+
+All tags now include `-{target}` suffix for clarity (replaces old `-dev` suffix pattern).
+
+## Benefits
+
+### 1. Reproducibility
+- Git commit hash ensures exact SDK version tracking
+- Can reconstruct exact build environment from tag alone
+- No ambiguity (version 1.0.0 could be many commits)
+
+### 2. Usability
+- **39% shorter tags** (137 → 84 chars)
+- Easy to filter: `docker images | grep a612c0a`
+- Easy to identify: `swebench-django-12155-source-minimal` is self-documenting
+- Explicit target indication (no more guessing what `-dev` means)
+- Fits in terminal/log output better
+
+### 3. Maintainability
+- SDK changes are backward compatible (env var is optional)
+- Benchmarks repo has full control over tag format
+- Can easily extend with more metadata later
+
+## Example Build Command
+
+```bash
+uv run benchmarks/swe_bench/build_images.py \
+  --dataset princeton-nlp/SWE-bench_Verified \
+  --split test \
+  --image ghcr.io/openhands/eval-agent-server \
+  --target source-minimal \
+  --platforms linux/amd64 \
+  --push \
+  --max-workers 2
+```
+
+## Testing
+
+To test the tagging logic without building:
+
+```python
+from benchmarks.swe_bench.build_images import extract_instance_id, get_sdk_commit_hash
+
+# Test instance ID extraction
+base = "docker.io/swebench/sweb.eval.x86_64.django_1776_django-12155:latest"
+print(extract_instance_id(base))  # → django-12155
+
+# Get SDK commit
+print(get_sdk_commit_hash())  # → a612c0a
+```
+
+## Migration Notes
+
+### For existing workflows:
+- No changes needed - SDK defaults to old behavior
+- Opt-in by setting `include_versioned_tag=False`
+
+### For CI/CD:
+- New tags will be generated automatically
+- Old tags (if any exist) remain unchanged
+- Can coexist during transition period
+
+### For consumers:
+- Update image references to use new tag format
+- Can filter by SDK version: `grep a612c0a`
+- Can filter by instance: `grep django-12155`
+
+## Future Enhancements
+
+Possible additions:
+1. **Docker labels** for metadata (see `docker inspect`)
+2. **Benchmarks commit** in tag or label
+3. **Build timestamp** in labels
+4. **Platform/architecture** in tag (already supported via `arch` param)
+
+## Files Changed
+
+1. `vendor/software-agent-sdk/openhands-agent-server/openhands/agent_server/docker/build.py`
+   - Added `SDK_VERSION_OVERRIDE` env var support to `_sdk_version()`
+   - Added `include_versioned_tag` field to `BuildOptions`
+   - Changed tag suffix logic: All tags get `-{target}` suffix (replaces `-dev`)
+   - Removed deprecated `is_dev` property
+   - Modified `all_tags` property to respect new flag and suffix logic
+
+2. `benchmarks/swe_bench/build_images.py`
+   - Added `get_sdk_commit_hash()` function
+   - Added `extract_instance_id()` function
+   - Modified `main()` to set `SDK_VERSION_OVERRIDE`
+   - Modified `build_one()` to use custom tags and disable versioned tag
+
+## Related PRs
+
+- **SDK Changes**: https://github.com/OpenHands/software-agent-sdk/pull/1088
+  - Adds `SDK_VERSION_OVERRIDE` support
+  - Changes `-dev` suffix to `-{target}` for all builds (more descriptive)
+  - Adds `include_versioned_tag` option
diff --git a/benchmarks/swe_bench/build_images.py b/benchmarks/swe_bench/build_images.py
index 48a408de..24c743cc 100644
--- a/benchmarks/swe_bench/build_images.py
+++ b/benchmarks/swe_bench/build_images.py
@@ -11,6 +11,8 @@
 import argparse
 import contextlib
 import io
+import os
+import subprocess
 import sys
 from concurrent.futures import ProcessPoolExecutor, as_completed
 from datetime import UTC, datetime
@@ -30,6 +32,52 @@
 logger = get_logger(__name__)
 
 
+def get_sdk_commit_hash() -> str:
+    """Get the short commit hash of the SDK submodule."""
+    sdk_path = Path(__file__).parent.parent.parent / "vendor" / "software-agent-sdk"
+    try:
+        result = subprocess.run(
+            ["git", "rev-parse", "--short=7", "HEAD"],
+            cwd=sdk_path,
+            capture_output=True,
+            text=True,
+            check=True,
+        )
+        return result.stdout.strip()
+    except subprocess.CalledProcessError:
+        logger.warning("Failed to get SDK commit hash, using 'unknown'")
+        return "unknown"
+
+
+def extract_instance_id(base_image: str) -> str:
+    """
+    Extract SWE-Bench instance ID from base image name.
+
+    Example:
+        docker.io/swebench/sweb.eval.x86_64.django_1776_django-12155:latest
+        -> django-12155
+
+        docker.io/swebench/sweb.eval.x86_64.sympy_1776_sympy-18189:latest
+        -> sympy-18189
+
+        docker.io/swebench/sweb.eval.x86_64.scikit-learn_3742_scikit-learn-25973:latest
+        -> scikit-learn-25973
+    """
+    # SWE-Bench images pattern: ..._{repo}_{version}_{instance_id}:tag
+    # We want to extract just the instance_id (last part before colon)
+    # Instance ID format: {repo}-{number} or {repo}_{number}
+
+    parts = base_image.split("_")
+    if len(parts) >= 2:
+        # Last part contains the instance ID and tag
+        last_part = parts[-1]  # e.g., "django-12155:latest"
+        instance_id = last_part.split(":")[0]  # Remove tag
+        return instance_id
+
+    logger.warning(f"Could not extract instance ID from: {base_image}")
+    return "unknown"
+
+
 @contextlib.contextmanager
 def capture_output(base_name: str, out_dir: Path):
     """
@@ -138,13 +186,22 @@ class BuildOutput(BaseModel):
 
 
 def build_one(base_image: str, args: argparse.Namespace) -> BuildOutput:
+    # Extract instance ID and build custom tag
+    instance_id = extract_instance_id(base_image)
+    custom_tag = f"swebench-{instance_id}"
+
+    # Combine with user-provided custom tags if any
+    if args.custom_tags:
+        custom_tag = f"{custom_tag},{args.custom_tags}"
+
     opts = BuildOptions(
         base_image=base_image,
-        custom_tags=args.custom_tags,
+        custom_tags=custom_tag,
         image=args.image,
         target=args.target,
         platforms=[p.strip() for p in args.platforms.split(",") if p.strip()],
         push=args.push,
+        include_versioned_tag=False,  # Disable long versioned tag
     )
     tags = build(opts)
     return BuildOutput(base_image=base_image, tags=tags, error=None)
@@ -195,6 +252,11 @@ def main(argv: list[str]) -> int:
     parser = extend_parser()
     args = parser.parse_args(argv)
 
+    # Set SDK commit hash as version override for image tags
+    sdk_commit = get_sdk_commit_hash()
+    os.environ["SDK_VERSION_OVERRIDE"] = sdk_commit
+    logger.info(f"Using SDK commit: {sdk_commit}")
+
     bases: list[str] = collect_unique_base_images(
         args.dataset, args.split, args.docker_image_prefix, args.n_limit
     )
diff --git a/vendor/software-agent-sdk b/vendor/software-agent-sdk
index a612c0a6..77d50e61 160000
--- a/vendor/software-agent-sdk
+++ b/vendor/software-agent-sdk
@@ -1 +1 @@
-Subproject commit a612c0a685fa96bc725085ac81c59492d4a88974
+Subproject commit 77d50e61093d7725893996fd1d6e528b9a6220a3

From 92f04c1fb3ecbd6f0b6183770c11be77d3c8b6e9 Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Thu, 6 Nov 2025 20:41:12 +0000
Subject: [PATCH 20/66] refactor: omit target suffix for binary builds (default
 case)

Updated SDK submodule to bc25aa0d which omits the target suffix for binary
builds since it's the default/common case. This keeps tags cleaner.

Tag examples:
- Binary: a612c0a-swebench-django-12155 (no suffix)
- Source: a612c0a-swebench-django-12155-source
- Source-minimal: a612c0a-swebench-django-12155-source-minimal

Updated TAGGING_CHANGES.md to reflect this behavior with updated examples
showing both binary and source-minimal formats.

Co-authored-by: openhands <openhands@all-hands.dev>
---
 TAGGING_CHANGES.md        | 41 ++++++++++++++++++++++++++-------------
 vendor/software-agent-sdk |  2 +-
 2 files changed, 29 insertions(+), 14 deletions(-)

diff --git a/TAGGING_CHANGES.md b/TAGGING_CHANGES.md
index 79ee855a..534189c1 100644
--- a/TAGGING_CHANGES.md
+++ b/TAGGING_CHANGES.md
@@ -24,7 +24,8 @@ This change replaces the long, auto-generated versioned tags with short, meaning
    - Gives consumers control over tag format
 
 3. **Target-based tag suffixes** (replaces `-dev` suffix)
-   - All tags now include `-{target}` suffix: `-binary`, `-source`, `-binary-minimal`, `-source-minimal`
+   - Non-binary builds include `-{target}` suffix: `-source`, `-binary-minimal`, `-source-minimal`
+   - Binary builds have no suffix (it's the default/common case)
    - More descriptive than previous `-dev` suffix (which only applied to source builds)
    - Makes tag meaning immediately clear without needing to check build config
    - Removed deprecated `is_dev` property
@@ -61,35 +62,49 @@ ghcr.io/openhands/eval-agent-server:v1.0.0_docker.io_s_swebench_s_sweb.eval.x86_
 - **Problem**: No git commit info, hard to parse
 
 ### After (New Format)
+
+For source-minimal (most common for SWE-Bench):
 ```
 ghcr.io/openhands/eval-agent-server:a612c0a-swebench-django-12155-source-minimal
 ghcr.io/openhands/eval-agent-server:main-swebench-django-12155-source-minimal
 ```
 - **Length**: 84 characters (**39% shorter**)
-- **Includes**: SDK commit hash, instance ID, build target
-- **Benefits**: 
+
+For binary (no suffix, it's the default):
+```
+ghcr.io/openhands/eval-agent-server:a612c0a-swebench-django-12155
+ghcr.io/openhands/eval-agent-server:main-swebench-django-12155
+```
+- **Length**: 69 characters (**50% shorter**)
+
+**Benefits**: 
   - Exact reproducibility (commit hash)
   - Easy to parse and filter
   - Clear instance identification
-  - Explicit target indication (no more ambiguous `-dev` suffix)
+  - Clean tags for common case (binary has no suffix)
 
 ## Tag Generation Logic
 
 The SDK's `all_tags` property generates:
 
-1. **Commit-based tag**: `{image}:{SHORT_SHA}-{custom_tag}-{target}{arch_suffix}`
+1. **Commit-based tag**: `{image}:{SHORT_SHA}-{custom_tag}[-{target}]{arch_suffix}`
    - `SHORT_SHA` = First 7 chars of SDK commit (from `SDK_VERSION_OVERRIDE`)
    - `custom_tag` = `swebench-{instance_id}`
-   - `target` = Build target (`binary`, `source`, `binary-minimal`, `source-minimal`)
-   - Example: `a612c0a-swebench-django-12155-source-minimal`
+   - `target` = Build target (omitted for `binary`, included for others)
+   - Examples: 
+     - Binary: `a612c0a-swebench-django-12155`
+     - Source: `a612c0a-swebench-django-12155-source`
+     - Source-minimal: `a612c0a-swebench-django-12155-source-minimal`
 
-2. **Main branch tag** (if on main): `{image}:main-{custom_tag}-{target}{arch_suffix}`
-   - Example: `main-swebench-django-12155-source-minimal`
+2. **Main branch tag** (if on main): `{image}:main-{custom_tag}[-{target}]{arch_suffix}`
+   - Examples:
+     - Binary: `main-swebench-django-12155`
+     - Source-minimal: `main-swebench-django-12155-source-minimal`
 
-3. **Versioned tag** (now disabled): `{image}:{versioned_tag}-{target}{arch_suffix}`
+3. **Versioned tag** (now disabled): `{image}:{versioned_tag}[-{target}]{arch_suffix}`
    - Skipped when `include_versioned_tag=False`
 
-All tags now include `-{target}` suffix for clarity (replaces old `-dev` suffix pattern).
+Non-binary targets include `-{target}` suffix for clarity. Binary has no suffix (default case).
 
 ## Benefits
 
@@ -167,7 +182,7 @@ Possible additions:
 1. `vendor/software-agent-sdk/openhands-agent-server/openhands/agent_server/docker/build.py`
    - Added `SDK_VERSION_OVERRIDE` env var support to `_sdk_version()`
    - Added `include_versioned_tag` field to `BuildOptions`
-   - Changed tag suffix logic: All tags get `-{target}` suffix (replaces `-dev`)
+   - Changed tag suffix logic: Non-binary targets get `-{target}` suffix, binary gets no suffix
    - Removed deprecated `is_dev` property
    - Modified `all_tags` property to respect new flag and suffix logic
 
@@ -181,5 +196,5 @@ Possible additions:
 
 - **SDK Changes**: https://github.com/OpenHands/software-agent-sdk/pull/1088
   - Adds `SDK_VERSION_OVERRIDE` support
-  - Changes `-dev` suffix to `-{target}` for all builds (more descriptive)
+  - Changes tag suffix: binary gets no suffix, non-binary gets `-{target}` (more descriptive)
   - Adds `include_versioned_tag` option
diff --git a/vendor/software-agent-sdk b/vendor/software-agent-sdk
index 77d50e61..bc25aa0d 160000
--- a/vendor/software-agent-sdk
+++ b/vendor/software-agent-sdk
@@ -1 +1 @@
-Subproject commit 77d50e61093d7725893996fd1d6e528b9a6220a3
+Subproject commit bc25aa0de519591b44047061f8f402a84d322c70

From 49d96678b8efc2f025da7038b61482cbcdee1e56 Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Thu, 6 Nov 2025 21:02:25 +0000
Subject: [PATCH 21/66] fix: update SDK to use SDK_VERSION for commit tags

Updates SDK submodule to 27f37dc0 which fixes an issue where SHORT_SHA
was using git info from the benchmarks repo instead of the SDK repo.

Now tags correctly use the SDK commit hash when SDK_VERSION_OVERRIDE
is set, ensuring proper versioning in vendored/submodule contexts.

Co-authored-by: openhands <openhands@all-hands.dev>
---
 vendor/software-agent-sdk | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vendor/software-agent-sdk b/vendor/software-agent-sdk
index bc25aa0d..27f37dc0 160000
--- a/vendor/software-agent-sdk
+++ b/vendor/software-agent-sdk
@@ -1 +1 @@
-Subproject commit bc25aa0de519591b44047061f8f402a84d322c70
+Subproject commit 27f37dc03543e9f41a07762e5bd120a0fb8d6f55

From c2711a331334744553c2ecf1e570cce5f69fe728 Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Thu, 6 Nov 2025 21:08:30 +0000
Subject: [PATCH 22/66] refactor: remove SDK_VERSION_OVERRIDE logic

SDK now automatically detects its own commit hash, so we don't need
to manually extract and override it. This simplifies the build script
significantly:

- Removed get_sdk_commit_hash() function
- Removed SDK_VERSION_OVERRIDE env var setting
- Removed unused imports (subprocess, os)
- Updated documentation to reflect simpler approach

The SDK's _sdk_version() now automatically finds the SDK repo root
and gets the commit hash directly, regardless of whether it's used
as a submodule or vendored dependency.

Co-authored-by: openhands <openhands@all-hands.dev>
---
 TAGGING_CHANGES.md                   | 44 ++++++++++++----------------
 benchmarks/swe_bench/build_images.py | 24 ---------------
 2 files changed, 19 insertions(+), 49 deletions(-)

diff --git a/TAGGING_CHANGES.md b/TAGGING_CHANGES.md
index 534189c1..1c362a36 100644
--- a/TAGGING_CHANGES.md
+++ b/TAGGING_CHANGES.md
@@ -10,13 +10,13 @@ This change replaces the long, auto-generated versioned tags with short, meaning
 
 ### 1. SDK Build System (`vendor/software-agent-sdk/.../docker/build.py`)
 
-**Added three features:**
+**Added two features:**
 
-1. **`SDK_VERSION_OVERRIDE` environment variable**
-   - Allows overriding the package version with a commit hash
-   - Falls back to `importlib.metadata.version("openhands-sdk")` if not set
-   - Critical for git submodule contexts where package version != actual commit
-   - Follows existing pattern (SDK already uses `GITHUB_REF` env var)
+1. **SDK_VERSION now uses git commit hash**
+   - `_sdk_version()` now automatically detects the SDK repo root and gets its commit hash
+   - Falls back to package version only if git info unavailable
+   - Works correctly in submodule contexts (uses SDK repo, not calling repo)
+   - No environment variable override needed - automatic and robust
 
 2. **`include_versioned_tag` option in BuildOptions**
    - When `False`, skips the long versioned tag
@@ -32,13 +32,9 @@ This change replaces the long, auto-generated versioned tags with short, meaning
 
 ### 2. Benchmarks Build Script (`benchmarks/swe_bench/build_images.py`)
 
-**Added two functions:**
+**Added one function:**
 
-1. **`get_sdk_commit_hash()`**
-   - Extracts the 7-character commit hash from SDK submodule
-   - Returns "unknown" if git fails (with warning)
-
-2. **`extract_instance_id(base_image)`**
+1. **`extract_instance_id(base_image)`**
    - Parses SWE-Bench base image name to extract instance ID
    - Examples:
      - `...django_1776_django-12155:latest` → `django-12155`
@@ -47,9 +43,9 @@ This change replaces the long, auto-generated versioned tags with short, meaning
 
 **Modified build flow:**
 
-1. At startup: Set `SDK_VERSION_OVERRIDE` env var to SDK commit hash
-2. Per image: Extract instance ID and create custom tag `swebench-{instance_id}`
-3. Pass `include_versioned_tag=False` to disable long tag
+1. Per image: Extract instance ID and create custom tag `swebench-{instance_id}`
+2. Pass `include_versioned_tag=False` to disable long tag
+3. SDK automatically uses its own commit hash (no manual override needed)
 
 ## Tag Format Comparison
 
@@ -87,8 +83,8 @@ ghcr.io/openhands/eval-agent-server:main-swebench-django-12155
 
 The SDK's `all_tags` property generates:
 
-1. **Commit-based tag**: `{image}:{SHORT_SHA}-{custom_tag}[-{target}]{arch_suffix}`
-   - `SHORT_SHA` = First 7 chars of SDK commit (from `SDK_VERSION_OVERRIDE`)
+1. **Commit-based tag**: `{image}:{SDK_VERSION[:7]}-{custom_tag}[-{target}]{arch_suffix}`
+   - `SDK_VERSION[:7]` = First 7 chars of SDK commit hash (automatically detected)
    - `custom_tag` = `swebench-{instance_id}`
    - `target` = Build target (omitted for `binary`, included for others)
    - Examples: 
@@ -143,14 +139,11 @@ uv run benchmarks/swe_bench/build_images.py \
 To test the tagging logic without building:
 
 ```python
-from benchmarks.swe_bench.build_images import extract_instance_id, get_sdk_commit_hash
+from benchmarks.swe_bench.build_images import extract_instance_id
 
 # Test instance ID extraction
 base = "docker.io/swebench/sweb.eval.x86_64.django_1776_django-12155:latest"
 print(extract_instance_id(base))  # → django-12155
-
-# Get SDK commit
-print(get_sdk_commit_hash())  # → a612c0a
 ```
 
 ## Migration Notes
@@ -180,21 +173,22 @@ Possible additions:
 ## Files Changed
 
 1. `vendor/software-agent-sdk/openhands-agent-server/openhands/agent_server/docker/build.py`
-   - Added `SDK_VERSION_OVERRIDE` env var support to `_sdk_version()`
+   - Refactored `_sdk_version()` to automatically use SDK repo commit hash
+   - Added `_git_info_for_repo()` to get git info from specific directories
    - Added `include_versioned_tag` field to `BuildOptions`
    - Changed tag suffix logic: Non-binary targets get `-{target}` suffix, binary gets no suffix
    - Removed deprecated `is_dev` property
    - Modified `all_tags` property to respect new flag and suffix logic
 
 2. `benchmarks/swe_bench/build_images.py`
-   - Added `get_sdk_commit_hash()` function
    - Added `extract_instance_id()` function
-   - Modified `main()` to set `SDK_VERSION_OVERRIDE`
    - Modified `build_one()` to use custom tags and disable versioned tag
+   - Removed unnecessary SDK_VERSION_OVERRIDE logic (now automatic)
 
 ## Related PRs
 
 - **SDK Changes**: https://github.com/OpenHands/software-agent-sdk/pull/1088
-  - Adds `SDK_VERSION_OVERRIDE` support
+  - SDK_VERSION now automatically uses commit hash from SDK repo
   - Changes tag suffix: binary gets no suffix, non-binary gets `-{target}` (more descriptive)
   - Adds `include_versioned_tag` option
+  - Works correctly in submodule/vendored contexts
diff --git a/benchmarks/swe_bench/build_images.py b/benchmarks/swe_bench/build_images.py
index 24c743cc..e80fc69b 100644
--- a/benchmarks/swe_bench/build_images.py
+++ b/benchmarks/swe_bench/build_images.py
@@ -11,8 +11,6 @@
 import argparse
 import contextlib
 import io
-import os
-import subprocess
 import sys
 from concurrent.futures import ProcessPoolExecutor, as_completed
 from datetime import UTC, datetime
@@ -32,23 +30,6 @@
 logger = get_logger(__name__)
 
 
-def get_sdk_commit_hash() -> str:
-    """Get the short commit hash of the SDK submodule."""
-    sdk_path = Path(__file__).parent.parent.parent / "vendor" / "software-agent-sdk"
-    try:
-        result = subprocess.run(
-            ["git", "rev-parse", "--short=7", "HEAD"],
-            cwd=sdk_path,
-            capture_output=True,
-            text=True,
-            check=True,
-        )
-        return result.stdout.strip()
-    except subprocess.CalledProcessError:
-        logger.warning("Failed to get SDK commit hash, using 'unknown'")
-        return "unknown"
-
-
 def extract_instance_id(base_image: str) -> str:
     """
     Extract SWE-Bench instance ID from base image name.
@@ -252,11 +233,6 @@ def main(argv: list[str]) -> int:
     parser = extend_parser()
     args = parser.parse_args(argv)
 
-    # Set SDK commit hash as version override for image tags
-    sdk_commit = get_sdk_commit_hash()
-    os.environ["SDK_VERSION_OVERRIDE"] = sdk_commit
-    logger.info(f"Using SDK commit: {sdk_commit}")
-
     bases: list[str] = collect_unique_base_images(
         args.dataset, args.split, args.docker_image_prefix, args.n_limit
     )

From 6d6845ee074980b90a62270f003b977aa8c3b8f3 Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Thu, 6 Nov 2025 21:08:39 +0000
Subject: [PATCH 23/66] chore: update SDK to commit 85e436df

Update SDK submodule to include automatic SDK_VERSION detection.
SDK now auto-detects its own commit hash without requiring external
override, making the tagging system fully automatic.

Co-authored-by: openhands <openhands@all-hands.dev>
---
 vendor/software-agent-sdk | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vendor/software-agent-sdk b/vendor/software-agent-sdk
index 27f37dc0..85e436df 160000
--- a/vendor/software-agent-sdk
+++ b/vendor/software-agent-sdk
@@ -1 +1 @@
-Subproject commit 27f37dc03543e9f41a07762e5bd120a0fb8d6f55
+Subproject commit 85e436df11d5636f673d79a45ab63f1684e85a1f

From 8d8ed8cbc62df048a5958cc3d84b9d52d27ca81e Mon Sep 17 00:00:00 2001
From: Xingyao Wang <xingyao@all-hands.dev>
Date: Fri, 7 Nov 2025 16:54:12 +0000
Subject: [PATCH 24/66] update agent-sdk version

---
 vendor/software-agent-sdk | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vendor/software-agent-sdk b/vendor/software-agent-sdk
index 85e436df..204d3a4b 160000
--- a/vendor/software-agent-sdk
+++ b/vendor/software-agent-sdk
@@ -1 +1 @@
-Subproject commit 85e436df11d5636f673d79a45ab63f1684e85a1f
+Subproject commit 204d3a4b262d47c7f9f0690636b4766a413a5715

From 8763fade1291693d0f6d0b72043371ecea90cc91 Mon Sep 17 00:00:00 2001
From: Xingyao Wang <xingyao@all-hands.dev>
Date: Fri, 7 Nov 2025 16:58:15 +0000
Subject: [PATCH 25/66] improve custom tags for swebench image

---
 benchmarks/swe_bench/build_images.py | 33 +++++++---------------------
 1 file changed, 8 insertions(+), 25 deletions(-)

diff --git a/benchmarks/swe_bench/build_images.py b/benchmarks/swe_bench/build_images.py
index e80fc69b..30d489a0 100644
--- a/benchmarks/swe_bench/build_images.py
+++ b/benchmarks/swe_bench/build_images.py
@@ -30,33 +30,23 @@
 logger = get_logger(__name__)
 
 
-def extract_instance_id(base_image: str) -> str:
+def extract_custom_tag(base_image: str) -> str:
     """
     Extract SWE-Bench instance ID from base image name.
 
     Example:
         docker.io/swebench/sweb.eval.x86_64.django_1776_django-12155:latest
-        -> django-12155
+        -> sweb.eval.x86_64.django_1776_django-12155
 
         docker.io/swebench/sweb.eval.x86_64.sympy_1776_sympy-18189:latest
-        -> sympy-18189
+        -> sweb.eval.x86_64.sympy_1776_sympy-18189
 
         docker.io/swebench/sweb.eval.x86_64.scikit-learn_3742_scikit-learn-25973:latest
-        -> scikit-learn-25973
+        -> sweb.eval.x86_64.scikit-learn_3742_scikit-learn-25973
     """
-    # SWE-Bench images pattern: ..._{repo}_{version}_{instance_id}:tag
-    # We want to extract just the instance_id (last part before colon)
-    # Instance ID format: {repo}-{number} or {repo}_{number}
-
-    parts = base_image.split("_")
-    if len(parts) >= 2:
-        # Last part contains the instance ID and tag
-        last_part = parts[-1]  # e.g., "django-12155:latest"
-        instance_id = last_part.split(":")[0]  # Remove tag
-        return instance_id
-
-    logger.warning(f"Could not extract instance ID from: {base_image}")
-    return "unknown"
+    name_tag = base_image.split("/")[-1]
+    name = name_tag.split(":")[0]
+    return name
 
 
 @contextlib.contextmanager
@@ -133,7 +123,6 @@ def extend_parser() -> argparse.ArgumentParser:
     parser.add_argument(
         "--platforms", default="linux/amd64", help="Comma-separated platforms"
     )
-    parser.add_argument("--custom-tags", default="", help="Comma-separated custom tags")
     parser.add_argument(
         "--push", action="store_true", help="Push via buildx instead of load locally"
     )
@@ -168,12 +157,7 @@ class BuildOutput(BaseModel):
 
 def build_one(base_image: str, args: argparse.Namespace) -> BuildOutput:
     # Extract instance ID and build custom tag
-    instance_id = extract_instance_id(base_image)
-    custom_tag = f"swebench-{instance_id}"
-
-    # Combine with user-provided custom tags if any
-    if args.custom_tags:
-        custom_tag = f"{custom_tag},{args.custom_tags}"
+    custom_tag = extract_custom_tag(base_image)
 
     opts = BuildOptions(
         base_image=base_image,
@@ -182,7 +166,6 @@ def build_one(base_image: str, args: argparse.Namespace) -> BuildOutput:
         target=args.target,
         platforms=[p.strip() for p in args.platforms.split(",") if p.strip()],
         push=args.push,
-        include_versioned_tag=False,  # Disable long versioned tag
     )
     tags = build(opts)
     return BuildOutput(base_image=base_image, tags=tags, error=None)

From 99927f8f6fd8857f72ba9061448c391a94c9bad8 Mon Sep 17 00:00:00 2001
From: Xingyao Wang <xingyao@all-hands.dev>
Date: Fri, 7 Nov 2025 17:04:10 +0000
Subject: [PATCH 26/66] Revert "update agent-sdk version"

This reverts commit 8d8ed8cbc62df048a5958cc3d84b9d52d27ca81e.
---
 vendor/software-agent-sdk | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vendor/software-agent-sdk b/vendor/software-agent-sdk
index 204d3a4b..85e436df 160000
--- a/vendor/software-agent-sdk
+++ b/vendor/software-agent-sdk
@@ -1 +1 @@
-Subproject commit 204d3a4b262d47c7f9f0690636b4766a413a5715
+Subproject commit 85e436df11d5636f673d79a45ab63f1684e85a1f

From 7e3c50ef56c8a78f481017b0dee945fd3ad195cf Mon Sep 17 00:00:00 2001
From: Xingyao Wang <xingyao@all-hands.dev>
Date: Fri, 7 Nov 2025 17:05:46 +0000
Subject: [PATCH 27/66] update sha

---
 vendor/software-agent-sdk | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vendor/software-agent-sdk b/vendor/software-agent-sdk
index 85e436df..204d3a4b 160000
--- a/vendor/software-agent-sdk
+++ b/vendor/software-agent-sdk
@@ -1 +1 @@
-Subproject commit 85e436df11d5636f673d79a45ab63f1684e85a1f
+Subproject commit 204d3a4b262d47c7f9f0690636b4766a413a5715

From c1182973aef5ff28d29f7d6ec04f1bb144b7c2ad Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Fri, 7 Nov 2025 18:32:08 +0000
Subject: [PATCH 28/66] fix: update run_infer.py to use new SDK tag format

- Replace SDK_VERSION with SHORT_SHA (renamed in SDK PR #1088)
- Add extract_custom_tag() function to avoid circular import
- Update get_agent_server_docker_image() to use new tag format:
  - Binary target: {SHORT_SHA}-{custom_tag}
  - Other targets: {SHORT_SHA}-{custom_tag}-{target}
- Aligns with SDK's git commit-based tagging strategy

Co-authored-by: openhands <openhands@all-hands.dev>
---
 benchmarks/swe_bench/run_infer.py | 27 ++++++++++++++++++++++-----
 1 file changed, 22 insertions(+), 5 deletions(-)

diff --git a/benchmarks/swe_bench/run_infer.py b/benchmarks/swe_bench/run_infer.py
index f9562ffb..782e2ca0 100644
--- a/benchmarks/swe_bench/run_infer.py
+++ b/benchmarks/swe_bench/run_infer.py
@@ -16,7 +16,7 @@
     EvalMetadata,
     EvalOutput,
 )
-from openhands.agent_server.docker.build import SDK_VERSION, _base_slug
+from openhands.agent_server.docker.build import SHORT_SHA
 from openhands.sdk import LLM, Agent, Conversation, get_logger
 from openhands.sdk.workspace import RemoteWorkspace
 from openhands.tools.preset.default import get_default_tools
@@ -26,6 +26,19 @@
 logger = get_logger(__name__)
 
 
+def extract_custom_tag(base_image: str) -> str:
+    """
+    Extract SWE-Bench instance ID from base image name.
+
+    Example:
+        docker.io/swebench/sweb.eval.x86_64.django_1776_django-12155:latest
+        -> sweb.eval.x86_64.django_1776_django-12155
+    """
+    name_tag = base_image.split("/")[-1]
+    name = name_tag.split(":")[0]
+    return name
+
+
 def get_official_docker_image(
     instance_id: str,
     docker_image_prefix="docker.io/swebench/",
@@ -45,10 +58,14 @@ def get_agent_server_docker_image(
     target: str = "source-minimal",
 ) -> str:
     official_image_name = get_official_docker_image(instance_id, docker_image_prefix)
-    return (
-        "ghcr.io/openhands/eval-agent-server"
-        + f":v{SDK_VERSION}_{_base_slug(official_image_name)}_{target}"
-    )
+    custom_tag = extract_custom_tag(official_image_name)
+
+    # New tag format: {SHORT_SHA}-{custom_tag}-{target}
+    # For non-binary targets, append target suffix
+    if target == "binary":
+        return f"ghcr.io/openhands/eval-agent-server:{SHORT_SHA}-{custom_tag}"
+    else:
+        return f"ghcr.io/openhands/eval-agent-server:{SHORT_SHA}-{custom_tag}-{target}"
 
 
 def get_instruction(

From 4f3f9b1ed3ee12ad591e20f9cf4258bd91d8a0eb Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Fri, 7 Nov 2025 18:58:54 +0000
Subject: [PATCH 29/66] refactor: deduplicate extract_custom_tag by importing
 from run_infer

Remove duplicate implementation of extract_custom_tag in build_images.py
and import it from run_infer.py instead. This avoids code duplication and
ensures both modules use the same implementation.

Co-authored-by: openhands <openhands@all-hands.dev>
---
 benchmarks/swe_bench/build_images.py | 21 +--------------------
 1 file changed, 1 insertion(+), 20 deletions(-)

diff --git a/benchmarks/swe_bench/build_images.py b/benchmarks/swe_bench/build_images.py
index 30d489a0..3951f559 100644
--- a/benchmarks/swe_bench/build_images.py
+++ b/benchmarks/swe_bench/build_images.py
@@ -20,7 +20,7 @@
 from pydantic import BaseModel, Field
 from tqdm.auto import tqdm
 
-from benchmarks.swe_bench.run_infer import get_official_docker_image
+from benchmarks.swe_bench.run_infer import extract_custom_tag, get_official_docker_image
 from benchmarks.utils.args_parser import get_parser
 from benchmarks.utils.dataset import get_dataset
 from openhands.agent_server.docker.build import BuildOptions, build
@@ -30,25 +30,6 @@
 logger = get_logger(__name__)
 
 
-def extract_custom_tag(base_image: str) -> str:
-    """
-    Extract SWE-Bench instance ID from base image name.
-
-    Example:
-        docker.io/swebench/sweb.eval.x86_64.django_1776_django-12155:latest
-        -> sweb.eval.x86_64.django_1776_django-12155
-
-        docker.io/swebench/sweb.eval.x86_64.sympy_1776_sympy-18189:latest
-        -> sweb.eval.x86_64.sympy_1776_sympy-18189
-
-        docker.io/swebench/sweb.eval.x86_64.scikit-learn_3742_scikit-learn-25973:latest
-        -> sweb.eval.x86_64.scikit-learn_3742_scikit-learn-25973
-    """
-    name_tag = base_image.split("/")[-1]
-    name = name_tag.split(":")[0]
-    return name
-
-
 @contextlib.contextmanager
 def capture_output(base_name: str, out_dir: Path):
     """

From 26c3f0226c6caa5de3ff64ec9b15046a912e8ac1 Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Fri, 7 Nov 2025 18:59:30 +0000
Subject: [PATCH 30/66] docs: clarify SHORT_SHA source in run_infer.py

Add comment explaining that SHORT_SHA is computed from the benchmarks
repo's git commit (via git rev-parse HEAD in cwd), not the SDK submodule.
This makes it clear that images are tagged with the benchmarks repo commit
for reproducibility and traceability.

Co-authored-by: openhands <openhands@all-hands.dev>
---
 benchmarks/swe_bench/run_infer.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/benchmarks/swe_bench/run_infer.py b/benchmarks/swe_bench/run_infer.py
index 782e2ca0..cb6751a3 100644
--- a/benchmarks/swe_bench/run_infer.py
+++ b/benchmarks/swe_bench/run_infer.py
@@ -16,6 +16,10 @@
     EvalMetadata,
     EvalOutput,
 )
+
+# SHORT_SHA is computed from git rev-parse HEAD in the current working directory
+# (benchmarks repo), not the SDK submodule. This ensures images are tagged with
+# the benchmarks repo commit, making them reproducible and traceable.
 from openhands.agent_server.docker.build import SHORT_SHA
 from openhands.sdk import LLM, Agent, Conversation, get_logger
 from openhands.sdk.workspace import RemoteWorkspace

From 89e4cda7d25bd7e5f9549116938f6827e921ab2c Mon Sep 17 00:00:00 2001
From: Xingyao Wang <xingyao@all-hands.dev>
Date: Fri, 7 Nov 2025 19:40:45 +0000
Subject: [PATCH 31/66] update sdk

---
 vendor/software-agent-sdk | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vendor/software-agent-sdk b/vendor/software-agent-sdk
index 204d3a4b..a0d35851 160000
--- a/vendor/software-agent-sdk
+++ b/vendor/software-agent-sdk
@@ -1 +1 @@
-Subproject commit 204d3a4b262d47c7f9f0690636b4766a413a5715
+Subproject commit a0d3585104558b70a915419b9a9a17b3fa0a8a54

From eacfe0b62dc50ea11b219c129c9401307d249966 Mon Sep 17 00:00:00 2001
From: Xingyao Wang <xingyao@all-hands.dev>
Date: Fri, 7 Nov 2025 19:53:27 +0000
Subject: [PATCH 32/66] refactor

---
 benchmarks/swe_bench/build_images.py |  3 ++-
 benchmarks/swe_bench/run_infer.py    | 14 ++++----------
 benchmarks/utils/constants.py        |  1 +
 benchmarks/utils/version.py          | 27 +++++++++++++++++++++++++++
 vendor/software-agent-sdk            |  2 +-
 5 files changed, 35 insertions(+), 12 deletions(-)
 create mode 100644 benchmarks/utils/version.py

diff --git a/benchmarks/swe_bench/build_images.py b/benchmarks/swe_bench/build_images.py
index 3951f559..999fc442 100644
--- a/benchmarks/swe_bench/build_images.py
+++ b/benchmarks/swe_bench/build_images.py
@@ -22,6 +22,7 @@
 
 from benchmarks.swe_bench.run_infer import extract_custom_tag, get_official_docker_image
 from benchmarks.utils.args_parser import get_parser
+from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE
 from benchmarks.utils.dataset import get_dataset
 from openhands.agent_server.docker.build import BuildOptions, build
 from openhands.sdk import get_logger
@@ -93,7 +94,7 @@ def extend_parser() -> argparse.ArgumentParser:
     )
     parser.add_argument(
         "--image",
-        default="ghcr.io/openhands/eval-agent-server",
+        default=EVAL_AGENT_SERVER_IMAGE,
         help="Target repo/name for built image",
     )
     parser.add_argument(
diff --git a/benchmarks/swe_bench/run_infer.py b/benchmarks/swe_bench/run_infer.py
index cb6751a3..91e00b2f 100644
--- a/benchmarks/swe_bench/run_infer.py
+++ b/benchmarks/swe_bench/run_infer.py
@@ -5,6 +5,7 @@
 from jinja2 import Environment, FileSystemLoader
 
 from benchmarks.utils.args_parser import get_parser
+from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE
 from benchmarks.utils.dataset import get_dataset
 from benchmarks.utils.evaluation import Evaluation
 from benchmarks.utils.evaluation_utils import (
@@ -16,11 +17,7 @@
     EvalMetadata,
     EvalOutput,
 )
-
-# SHORT_SHA is computed from git rev-parse HEAD in the current working directory
-# (benchmarks repo), not the SDK submodule. This ensures images are tagged with
-# the benchmarks repo commit, making them reproducible and traceable.
-from openhands.agent_server.docker.build import SHORT_SHA
+from benchmarks.utils.version import SDK_SHORT_SHA
 from openhands.sdk import LLM, Agent, Conversation, get_logger
 from openhands.sdk.workspace import RemoteWorkspace
 from openhands.tools.preset.default import get_default_tools
@@ -64,12 +61,9 @@ def get_agent_server_docker_image(
     official_image_name = get_official_docker_image(instance_id, docker_image_prefix)
     custom_tag = extract_custom_tag(official_image_name)
 
-    # New tag format: {SHORT_SHA}-{custom_tag}-{target}
     # For non-binary targets, append target suffix
-    if target == "binary":
-        return f"ghcr.io/openhands/eval-agent-server:{SHORT_SHA}-{custom_tag}"
-    else:
-        return f"ghcr.io/openhands/eval-agent-server:{SHORT_SHA}-{custom_tag}-{target}"
+    suffix = f"-{target}" if target != "binary" else ""
+    return f"{EVAL_AGENT_SERVER_IMAGE}:{SDK_SHORT_SHA}-{custom_tag}{suffix}"
 
 
 def get_instruction(
diff --git a/benchmarks/utils/constants.py b/benchmarks/utils/constants.py
index aa912a43..9337b847 100644
--- a/benchmarks/utils/constants.py
+++ b/benchmarks/utils/constants.py
@@ -1 +1,2 @@
 OUTPUT_FILENAME = "output.jsonl"
+EVAL_AGENT_SERVER_IMAGE = "ghcr.io/openhands/eval-agent-server"
diff --git a/benchmarks/utils/version.py b/benchmarks/utils/version.py
new file mode 100644
index 00000000..951c6592
--- /dev/null
+++ b/benchmarks/utils/version.py
@@ -0,0 +1,27 @@
+import subprocess
+from pathlib import Path
+
+
+PROJECT_ROOT = Path(__file__).parent.parent.parent
+
+
+def _get_submodule_sha(submodule_path: Path) -> str:
+    result = subprocess.run(
+        ["git", "submodule", "status", str(submodule_path)],
+        capture_output=True,
+        text=True,
+        check=True,
+    )
+    sha = result.stdout.strip().split()[0].lstrip("+-")
+    return sha
+
+
+def get_sdk_sha() -> str:
+    """
+    Get the current git sha from the SDK submodule.
+    """
+    return _get_submodule_sha(PROJECT_ROOT / "vendor" / "software-agent-sdk")
+
+
+SDK_SHA = get_sdk_sha()
+SDK_SHORT_SHA = SDK_SHA[:7]
diff --git a/vendor/software-agent-sdk b/vendor/software-agent-sdk
index a0d35851..006e8db4 160000
--- a/vendor/software-agent-sdk
+++ b/vendor/software-agent-sdk
@@ -1 +1 @@
-Subproject commit a0d3585104558b70a915419b9a9a17b3fa0a8a54
+Subproject commit 006e8db45c66c05f9d7b9ed00449bcf59d301f5c

From 3a2c0095c7c8a48c5a09168effe56a456afb89dd Mon Sep 17 00:00:00 2001
From: Xingyao Wang <xingyao@all-hands.dev>
Date: Fri, 7 Nov 2025 19:53:48 +0000
Subject: [PATCH 33/66] remove tagging changes

---
 TAGGING_CHANGES.md | 194 ---------------------------------------------
 1 file changed, 194 deletions(-)
 delete mode 100644 TAGGING_CHANGES.md

diff --git a/TAGGING_CHANGES.md b/TAGGING_CHANGES.md
deleted file mode 100644
index 1c362a36..00000000
--- a/TAGGING_CHANGES.md
+++ /dev/null
@@ -1,194 +0,0 @@
-# Docker Image Tagging Improvements
-
-## Summary
-
-This change replaces the long, auto-generated versioned tags with short, meaningful tags that include:
-- **SDK commit hash** (exact reproducibility)
-- **SWE-Bench instance ID** (clear identification)
-
-## Changes Made
-
-### 1. SDK Build System (`vendor/software-agent-sdk/.../docker/build.py`)
-
-**Added two features:**
-
-1. **SDK_VERSION now uses git commit hash**
-   - `_sdk_version()` now automatically detects the SDK repo root and gets its commit hash
-   - Falls back to package version only if git info unavailable
-   - Works correctly in submodule contexts (uses SDK repo, not calling repo)
-   - No environment variable override needed - automatic and robust
-
-2. **`include_versioned_tag` option in BuildOptions**
-   - When `False`, skips the long versioned tag
-   - Defaults to `True` for backward compatibility
-   - Gives consumers control over tag format
-
-3. **Target-based tag suffixes** (replaces `-dev` suffix)
-   - Non-binary builds include `-{target}` suffix: `-source`, `-binary-minimal`, `-source-minimal`
-   - Binary builds have no suffix (it's the default/common case)
-   - More descriptive than previous `-dev` suffix (which only applied to source builds)
-   - Makes tag meaning immediately clear without needing to check build config
-   - Removed deprecated `is_dev` property
-
-### 2. Benchmarks Build Script (`benchmarks/swe_bench/build_images.py`)
-
-**Added one function:**
-
-1. **`extract_instance_id(base_image)`**
-   - Parses SWE-Bench base image name to extract instance ID
-   - Examples:
-     - `...django_1776_django-12155:latest` → `django-12155`
-     - `...sympy_1776_sympy-18189:latest` → `sympy-18189`
-     - `...scikit-learn_3742_scikit-learn-25973:latest` → `scikit-learn-25973`
-
-**Modified build flow:**
-
-1. Per image: Extract instance ID and create custom tag `swebench-{instance_id}`
-2. Pass `include_versioned_tag=False` to disable long tag
-3. SDK automatically uses its own commit hash (no manual override needed)
-
-## Tag Format Comparison
-
-### Before (Old Format)
-```
-ghcr.io/openhands/eval-agent-server:v1.0.0_docker.io_s_swebench_s_sweb.eval.x86_64.django_1776_django-12155_tag_latest_source-minimal-dev
-```
-- **Length**: 137 characters
-- **Includes**: Package version (v1.0.0), full base image path, target
-- **Problem**: No git commit info, hard to parse
-
-### After (New Format)
-
-For source-minimal (most common for SWE-Bench):
-```
-ghcr.io/openhands/eval-agent-server:a612c0a-swebench-django-12155-source-minimal
-ghcr.io/openhands/eval-agent-server:main-swebench-django-12155-source-minimal
-```
-- **Length**: 84 characters (**39% shorter**)
-
-For binary (no suffix, it's the default):
-```
-ghcr.io/openhands/eval-agent-server:a612c0a-swebench-django-12155
-ghcr.io/openhands/eval-agent-server:main-swebench-django-12155
-```
-- **Length**: 69 characters (**50% shorter**)
-
-**Benefits**: 
-  - Exact reproducibility (commit hash)
-  - Easy to parse and filter
-  - Clear instance identification
-  - Clean tags for common case (binary has no suffix)
-
-## Tag Generation Logic
-
-The SDK's `all_tags` property generates:
-
-1. **Commit-based tag**: `{image}:{SDK_VERSION[:7]}-{custom_tag}[-{target}]{arch_suffix}`
-   - `SDK_VERSION[:7]` = First 7 chars of SDK commit hash (automatically detected)
-   - `custom_tag` = `swebench-{instance_id}`
-   - `target` = Build target (omitted for `binary`, included for others)
-   - Examples: 
-     - Binary: `a612c0a-swebench-django-12155`
-     - Source: `a612c0a-swebench-django-12155-source`
-     - Source-minimal: `a612c0a-swebench-django-12155-source-minimal`
-
-2. **Main branch tag** (if on main): `{image}:main-{custom_tag}[-{target}]{arch_suffix}`
-   - Examples:
-     - Binary: `main-swebench-django-12155`
-     - Source-minimal: `main-swebench-django-12155-source-minimal`
-
-3. **Versioned tag** (now disabled): `{image}:{versioned_tag}[-{target}]{arch_suffix}`
-   - Skipped when `include_versioned_tag=False`
-
-Non-binary targets include `-{target}` suffix for clarity. Binary has no suffix (default case).
-
-## Benefits
-
-### 1. Reproducibility
-- Git commit hash ensures exact SDK version tracking
-- Can reconstruct exact build environment from tag alone
-- No ambiguity (version 1.0.0 could be many commits)
-
-### 2. Usability
-- **39% shorter tags** (137 → 84 chars)
-- Easy to filter: `docker images | grep a612c0a`
-- Easy to identify: `swebench-django-12155-source-minimal` is self-documenting
-- Explicit target indication (no more guessing what `-dev` means)
-- Fits in terminal/log output better
-
-### 3. Maintainability
-- SDK changes are backward compatible (env var is optional)
-- Benchmarks repo has full control over tag format
-- Can easily extend with more metadata later
-
-## Example Build Command
-
-```bash
-uv run benchmarks/swe_bench/build_images.py \
-  --dataset princeton-nlp/SWE-bench_Verified \
-  --split test \
-  --image ghcr.io/openhands/eval-agent-server \
-  --target source-minimal \
-  --platforms linux/amd64 \
-  --push \
-  --max-workers 2
-```
-
-## Testing
-
-To test the tagging logic without building:
-
-```python
-from benchmarks.swe_bench.build_images import extract_instance_id
-
-# Test instance ID extraction
-base = "docker.io/swebench/sweb.eval.x86_64.django_1776_django-12155:latest"
-print(extract_instance_id(base))  # → django-12155
-```
-
-## Migration Notes
-
-### For existing workflows:
-- No changes needed - SDK defaults to old behavior
-- Opt-in by setting `include_versioned_tag=False`
-
-### For CI/CD:
-- New tags will be generated automatically
-- Old tags (if any exist) remain unchanged
-- Can coexist during transition period
-
-### For consumers:
-- Update image references to use new tag format
-- Can filter by SDK version: `grep a612c0a`
-- Can filter by instance: `grep django-12155`
-
-## Future Enhancements
-
-Possible additions:
-1. **Docker labels** for metadata (see `docker inspect`)
-2. **Benchmarks commit** in tag or label
-3. **Build timestamp** in labels
-4. **Platform/architecture** in tag (already supported via `arch` param)
-
-## Files Changed
-
-1. `vendor/software-agent-sdk/openhands-agent-server/openhands/agent_server/docker/build.py`
-   - Refactored `_sdk_version()` to automatically use SDK repo commit hash
-   - Added `_git_info_for_repo()` to get git info from specific directories
-   - Added `include_versioned_tag` field to `BuildOptions`
-   - Changed tag suffix logic: Non-binary targets get `-{target}` suffix, binary gets no suffix
-   - Removed deprecated `is_dev` property
-   - Modified `all_tags` property to respect new flag and suffix logic
-
-2. `benchmarks/swe_bench/build_images.py`
-   - Added `extract_instance_id()` function
-   - Modified `build_one()` to use custom tags and disable versioned tag
-   - Removed unnecessary SDK_VERSION_OVERRIDE logic (now automatic)
-
-## Related PRs
-
-- **SDK Changes**: https://github.com/OpenHands/software-agent-sdk/pull/1088
-  - SDK_VERSION now automatically uses commit hash from SDK repo
-  - Changes tag suffix: binary gets no suffix, non-binary gets `-{target}` (more descriptive)
-  - Adds `include_versioned_tag` option
-  - Works correctly in submodule/vendored contexts

From 84c88760d8fa73ad08cd78118ee29e8b166edcd9 Mon Sep 17 00:00:00 2001
From: Xingyao Wang <xingyao@all-hands.dev>
Date: Fri, 7 Nov 2025 19:55:32 +0000
Subject: [PATCH 34/66] bump commit

---
 vendor/software-agent-sdk | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vendor/software-agent-sdk b/vendor/software-agent-sdk
index 006e8db4..5481fc8f 160000
--- a/vendor/software-agent-sdk
+++ b/vendor/software-agent-sdk
@@ -1 +1 @@
-Subproject commit 006e8db45c66c05f9d7b9ed00449bcf59d301f5c
+Subproject commit 5481fc8fb527078b74937cb800f001f4aba6882b

From de46db7265f4692428a66d4e77f0317cf5301b03 Mon Sep 17 00:00:00 2001
From: Xingyao Wang <xingyao@all-hands.dev>
Date: Fri, 7 Nov 2025 19:56:00 +0000
Subject: [PATCH 35/66] simplify build script

---
 benchmarks/swe_bench/build_images.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/benchmarks/swe_bench/build_images.py b/benchmarks/swe_bench/build_images.py
index 999fc442..983062a2 100644
--- a/benchmarks/swe_bench/build_images.py
+++ b/benchmarks/swe_bench/build_images.py
@@ -138,9 +138,7 @@ class BuildOutput(BaseModel):
 
 
 def build_one(base_image: str, args: argparse.Namespace) -> BuildOutput:
-    # Extract instance ID and build custom tag
     custom_tag = extract_custom_tag(base_image)
-
     opts = BuildOptions(
         base_image=base_image,
         custom_tags=custom_tag,

From bcbd455b74d2f225e9b63f3fdc6e2797c53ba4ba Mon Sep 17 00:00:00 2001
From: Xingyao Wang <xingyao@all-hands.dev>
Date: Fri, 7 Nov 2025 19:58:41 +0000
Subject: [PATCH 36/66] bump version

---
 vendor/software-agent-sdk | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vendor/software-agent-sdk b/vendor/software-agent-sdk
index 5481fc8f..6eef51b3 160000
--- a/vendor/software-agent-sdk
+++ b/vendor/software-agent-sdk
@@ -1 +1 @@
-Subproject commit 5481fc8fb527078b74937cb800f001f4aba6882b
+Subproject commit 6eef51b3627a2f95709150b10cfb2094ad3a677e

From 96f2da678a00eaa893fb3b2c9396149cbca5dd33 Mon Sep 17 00:00:00 2001
From: Xingyao Wang <xingyao@all-hands.dev>
Date: Fri, 7 Nov 2025 20:01:41 +0000
Subject: [PATCH 37/66] bump

---
 vendor/software-agent-sdk | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vendor/software-agent-sdk b/vendor/software-agent-sdk
index 6eef51b3..681c9610 160000
--- a/vendor/software-agent-sdk
+++ b/vendor/software-agent-sdk
@@ -1 +1 @@
-Subproject commit 6eef51b3627a2f95709150b10cfb2094ad3a677e
+Subproject commit 681c9610f599134ad4d3bd9c3c7fd3750fe550c4

From aad870b0b021728d6bce3bb7e82338f010c6a669 Mon Sep 17 00:00:00 2001
From: Xingyao Wang <xingyao@all-hands.dev>
Date: Fri, 7 Nov 2025 20:10:37 +0000
Subject: [PATCH 38/66] bump

---
 vendor/software-agent-sdk | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vendor/software-agent-sdk b/vendor/software-agent-sdk
index 681c9610..a7a93a7a 160000
--- a/vendor/software-agent-sdk
+++ b/vendor/software-agent-sdk
@@ -1 +1 @@
-Subproject commit 681c9610f599134ad4d3bd9c3c7fd3750fe550c4
+Subproject commit a7a93a7a48f13ee6398f15b67b2bf339e647786a

From acee9cb175cdde6e16cdc11eb587b6f6451b261e Mon Sep 17 00:00:00 2001
From: Xingyao Wang <xingyao@all-hands.dev>
Date: Fri, 7 Nov 2025 20:32:19 +0000
Subject: [PATCH 39/66] refactor build util into shared file

---
 benchmarks/swe_bench/build_images.py | 308 ++++-----------------------
 benchmarks/utils/build_utils.py      | 301 ++++++++++++++++++++++++++
 2 files changed, 342 insertions(+), 267 deletions(-)
 create mode 100644 benchmarks/utils/build_utils.py

diff --git a/benchmarks/swe_bench/build_images.py b/benchmarks/swe_bench/build_images.py
index 983062a2..2236a647 100644
--- a/benchmarks/swe_bench/build_images.py
+++ b/benchmarks/swe_bench/build_images.py
@@ -8,299 +8,73 @@
     --image ghcr.io/openhands/eval-agent-server --target source-minimal
 """
 
-import argparse
-import contextlib
-import io
 import sys
-from concurrent.futures import ProcessPoolExecutor, as_completed
-from datetime import UTC, datetime
-from pathlib import Path
-from threading import Lock
 
-from pydantic import BaseModel, Field
-from tqdm.auto import tqdm
-
-from benchmarks.swe_bench.run_infer import extract_custom_tag, get_official_docker_image
-from benchmarks.utils.args_parser import get_parser
-from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE
+from benchmarks.utils.build_utils import (
+    build_all_images,
+    default_build_output_dir,
+    get_build_parser,
+)
 from benchmarks.utils.dataset import get_dataset
-from openhands.agent_server.docker.build import BuildOptions, build
 from openhands.sdk import get_logger
 
 
 logger = get_logger(__name__)
 
 
-@contextlib.contextmanager
-def capture_output(base_name: str, out_dir: Path):
-    """
-    Capture stdout/stderr during a block and stream them to:
-      <out_dir>/<base_name>/build-<timestamp>.log
-
-    Keeps redirect_* semantics; writes are realtime (line-buffered + flush).
-    Yields the log_path.
-    """
-    ts = datetime.now(UTC).strftime("%Y-%m-%dT%H-%M-%SZ")
-    log_path = Path(out_dir) / base_name / f"build-{ts}.log"
-    log_path.parent.mkdir(parents=True, exist_ok=True)
-
-    # tell the user where we’re logging, without being swallowed by the redirect
-    # (goes to the original stderr so it’s visible immediately)
-    logger.info(f"Logging build output to {log_path}")
-
-    # Open line-buffered so writes flush on newlines;
-    # also wrap to hard-flush every write.
-    f = log_path.open("w", encoding="utf-8", buffering=1)
-
-    class _FlushOnWrite(io.TextIOBase):
-        encoding = f.encoding
-
-        def __init__(self, sink):
-            self._sink = sink
-
-        def write(self, s):
-            n = self._sink.write(s)
-            self._sink.flush()
-            return n
-
-        def flush(self):
-            self._sink.flush()
-
-        def fileno(self):
-            # allow libs that try to detect fileno()
-            return self._sink.fileno()
-
-    sink = _FlushOnWrite(f)
-
-    # Redirect stdout/stderr to the same realtime sink.
-    with contextlib.redirect_stdout(sink), contextlib.redirect_stderr(sink):  # type: ignore[arg-type]
-        try:
-            yield log_path
-        finally:
-            # make sure everything is on disk
-            sink.flush()
-            f.close()
+def get_official_docker_image(
+    instance_id: str,
+    docker_image_prefix="docker.io/swebench/",
+) -> str:
+    # Official SWE-Bench image
+    # swebench/sweb.eval.x86_64.django_1776_django-11333:v1
+    repo, name = instance_id.split("__")
+    official_image_name = docker_image_prefix.rstrip("/")
+    official_image_name += f"/sweb.eval.x86_64.{repo}_1776_{name}:latest".lower()
+    logger.debug(f"Official SWE-Bench image: {official_image_name}")
+    return official_image_name
 
 
-def extend_parser() -> argparse.ArgumentParser:
-    """Reuse benchmark parser and extend with build-related options."""
-    parser = get_parser(add_llm_config=False)
-    parser.description = "Build all agent-server images for SWE-Bench base images."
+def extract_custom_tag(base_image: str) -> str:
+    """
+    Extract SWE-Bench instance ID from official SWE-Bench image name.
 
-    parser.add_argument(
-        "--docker-image-prefix",
-        default="docker.io/swebench/",
-        help="Prefix for SWE-Bench images",
-    )
-    parser.add_argument(
-        "--image",
-        default=EVAL_AGENT_SERVER_IMAGE,
-        help="Target repo/name for built image",
-    )
-    parser.add_argument(
-        "--target",
-        default="source-minimal",
-        help="Build target (source | source-minimal | binary | binary-minimal)",
-    )
-    parser.add_argument(
-        "--platforms", default="linux/amd64", help="Comma-separated platforms"
-    )
-    parser.add_argument(
-        "--push", action="store_true", help="Push via buildx instead of load locally"
-    )
-    parser.add_argument(
-        "--max-workers", type=int, default=1, help="Concurrent builds (be cautious)"
-    )
-    parser.add_argument(
-        "--dry-run", action="store_true", help="List base images only, don’t build"
-    )
-    return parser
+    Example:
+        docker.io/swebench/sweb.eval.x86_64.django_1776_django-12155:latest
+        -> sweb.eval.x86_64.django_1776_django-12155
+    """
+    name_tag = base_image.split("/")[-1]
+    name = name_tag.split(":")[0]
+    return name
 
 
-def collect_unique_base_images(dataset, split, prefix, n_limit):
+def collect_unique_base_images(dataset, split, n_limit):
     df = get_dataset(
         dataset_name=dataset, split=split, eval_limit=n_limit if n_limit else None
     )
     return sorted(
-        {
-            get_official_docker_image(str(row["instance_id"]), prefix)
-            for _, row in df.iterrows()
-        }
-    )
-
-
-class BuildOutput(BaseModel):
-    time: str = Field(default_factory=lambda: datetime.now(UTC).isoformat())
-    base_image: str
-    tags: list[str]
-    error: str | None = None
-    log_path: str | None = None
-
-
-def build_one(base_image: str, args: argparse.Namespace) -> BuildOutput:
-    custom_tag = extract_custom_tag(base_image)
-    opts = BuildOptions(
-        base_image=base_image,
-        custom_tags=custom_tag,
-        image=args.image,
-        target=args.target,
-        platforms=[p.strip() for p in args.platforms.split(",") if p.strip()],
-        push=args.push,
+        {get_official_docker_image(str(row["instance_id"])) for _, row in df.iterrows()}
     )
-    tags = build(opts)
-    return BuildOutput(base_image=base_image, tags=tags, error=None)
-
-
-def _default_build_output_dir(
-    dataset: str, split: str, base_dir: Path | None = None
-) -> Path:
-    """
-    Default: ./builds/<dataset>/<split>
-    Keeps build outputs in one predictable place, easy to .gitignore.
-    """
-    root = (base_dir or Path.cwd()) / "builds" / dataset / split
-    root.mkdir(parents=True, exist_ok=True)
-    return root
-
-
-def _build_with_logging(
-    base: str, log_dir: Path, args: argparse.Namespace
-) -> BuildOutput:
-    """
-    Module-level function for building a single image with output capture.
-    Must be at module level to be picklable for ProcessPoolExecutor.
-    """
-    with capture_output(base, log_dir) as log_path:
-        result = build_one(base, args)
-        result.log_path = str(log_path)
-        return result
-
-
-def _update_pbar(
-    pbar: tqdm,
-    successes: int,
-    failures: int,
-    running: int,
-    sample: str | None,
-    last_event: str | None,
-):
-    postfix = f"✅ {successes}  ❌ {failures}  🏃 {running}"
-    if sample:
-        postfix += f" ({sample})"
-    if last_event:
-        pbar.set_description(last_event)
-    pbar.set_postfix_str(postfix, refresh=True)
 
 
 def main(argv: list[str]) -> int:
-    parser = extend_parser()
+    parser = get_build_parser()
     args = parser.parse_args(argv)
 
-    bases: list[str] = collect_unique_base_images(
-        args.dataset, args.split, args.docker_image_prefix, args.n_limit
+    base_images: list[str] = collect_unique_base_images(
+        args.dataset, args.split, args.n_limit
     )
-    # Decide manifest path under ./builds/<dataset>/<split>/
-    BUILD_DIR = _default_build_output_dir(args.dataset, args.split)
-    BUILD_LOG_DIR = BUILD_DIR / "logs"
-    manifest_path = BUILD_DIR / "manifest.jsonl"
-    manifest_path.parent.mkdir(parents=True, exist_ok=True)
-
-    if args.dry_run:
-        print("\n".join(bases))
-        return 0
-
-    successes = 0
-    failures = 0
-    in_progress: set[str] = set()
-    mu = Lock()
-
-    with (
-        manifest_path.open("w") as writer,
-        tqdm(total=len(bases), desc="Building agent-server images", leave=True) as pbar,
-    ):
-        _update_pbar(pbar, successes, failures, 0, None, "Queueing")
-
-        # Single unified path: ProcessPoolExecutor( max_workers = args.max_workers ),
-        # even if it's 1. Using processes instead of threads ensures proper isolation
-        # of stdout/stderr and logging handlers, preventing output mixing between builds.
-        with ProcessPoolExecutor(max_workers=args.max_workers) as ex:
-            futures = {}
-            for base in bases:
-                in_progress.add(base)
-                fut = ex.submit(_build_with_logging, base, BUILD_LOG_DIR, args)
-                futures[fut] = base
-
-            _update_pbar(
-                pbar,
-                successes,
-                failures,
-                len(in_progress),
-                next(iter(in_progress), None),
-                "Running",
-            )
-
-            for fut in as_completed(futures):
-                base = futures[fut]
-                try:
-                    result: BuildOutput = fut.result()
-                    writer.write(result.model_dump_json() + "\n")
-                    writer.flush()
-                    with mu:
-                        successes += 1
-                    _update_pbar(
-                        pbar, successes, failures, len(in_progress), base, "✅ Done"
-                    )
-                except Exception as e:
-                    logger.error("Build failed for %s: %r", base, e)
-                    # Write a failure line to manifest; keep going.
-                    writer.write(
-                        BuildOutput(
-                            base_image=base, tags=[], error=repr(e)
-                        ).model_dump_json()
-                        + "\n"
-                    )
-                    writer.flush()
-                    with mu:
-                        failures += 1
-                    _update_pbar(
-                        pbar, successes, failures, len(in_progress), base, "❌ Failed"
-                    )
-                finally:
-                    with mu:
-                        in_progress.discard(base)
-                    pbar.update(1)
-                    _update_pbar(
-                        pbar,
-                        successes,
-                        failures,
-                        len(in_progress),
-                        next(iter(in_progress), None),
-                        None,
-                    )
-
-    # Optional: write a tiny summary JSON next to the manifest for quick reads
-    summary_path = manifest_path.with_name("summary.json")
-    summary_path.write_text(
-        (
-            "{"
-            f'"dataset":"{args.dataset}",'
-            f'"split":"{args.split}",'
-            f'"total_unique_base_images":{len(bases)},'
-            f'"built":{successes},'
-            f'"failed":{failures}'
-            "}"
-        ),
-        encoding="utf-8",
-    )
-
-    logger.info(
-        "Done. Built=%d  Failed=%d  Manifest=%s  Summary=%s",
-        successes,
-        failures,
-        str(manifest_path),
-        str(summary_path),
+    build_dir = default_build_output_dir(args.dataset, args.split)
+    return build_all_images(
+        base_images=base_images,
+        target=args.target,
+        build_dir=build_dir,
+        image=args.image,
+        push=args.push,
+        max_workers=args.max_workers,
+        dry_run=args.dry_run,
+        base_image_to_custom_tag_fn=extract_custom_tag,
     )
-    return 1 if failures else 0
 
 
 if __name__ == "__main__":
diff --git a/benchmarks/utils/build_utils.py b/benchmarks/utils/build_utils.py
new file mode 100644
index 00000000..cdbfd8ca
--- /dev/null
+++ b/benchmarks/utils/build_utils.py
@@ -0,0 +1,301 @@
+#!/usr/bin/env python3
+"""
+Shared utilities for batch building agent-server images.
+"""
+
+import argparse
+import contextlib
+import io
+from concurrent.futures import ProcessPoolExecutor, as_completed
+from datetime import UTC, datetime
+from pathlib import Path
+from threading import Lock
+from typing import Callable
+
+from pydantic import BaseModel, Field
+from tqdm.auto import tqdm
+
+from benchmarks.utils.args_parser import get_parser
+from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE
+from openhands.agent_server.docker.build import BuildOptions, TargetType, build
+from openhands.sdk import get_logger
+
+
+logger = get_logger(__name__)
+
+
+class BuildOutput(BaseModel):
+    time: str = Field(default_factory=lambda: datetime.now(UTC).isoformat())
+    base_image: str
+    tags: list[str]
+    error: str | None = None
+    log_path: str | None = None
+
+
+@contextlib.contextmanager
+def capture_output(base_name: str, out_dir: Path):
+    """
+    Capture stdout/stderr during a block and stream them to:
+      <out_dir>/<base_name>/build-<timestamp>.log
+
+    Keeps redirect_* semantics; writes are realtime (line-buffered + flush).
+    Yields the log_path.
+    """
+    ts = datetime.now(UTC).strftime("%Y-%m-%dT%H-%M-%SZ")
+    log_path = Path(out_dir) / base_name / f"build-{ts}.log"
+    log_path.parent.mkdir(parents=True, exist_ok=True)
+
+    # tell the user where we’re logging, without being swallowed by the redirect
+    # (goes to the original stderr so it’s visible immediately)
+    logger.info(f"Logging build output to {log_path}")
+
+    # Open line-buffered so writes flush on newlines;
+    # also wrap to hard-flush every write.
+    f = log_path.open("w", encoding="utf-8", buffering=1)
+
+    class _FlushOnWrite(io.TextIOBase):
+        encoding = f.encoding
+
+        def __init__(self, sink):
+            self._sink = sink
+
+        def write(self, s):
+            n = self._sink.write(s)
+            self._sink.flush()
+            return n
+
+        def flush(self):
+            self._sink.flush()
+
+        def fileno(self):
+            # allow libs that try to detect fileno()
+            return self._sink.fileno()
+
+    sink = _FlushOnWrite(f)
+
+    # Redirect stdout/stderr to the same realtime sink.
+    with contextlib.redirect_stdout(sink), contextlib.redirect_stderr(sink):  # type: ignore[arg-type]
+        try:
+            yield log_path
+        finally:
+            # make sure everything is on disk
+            sink.flush()
+            f.close()
+
+
+def get_build_parser() -> argparse.ArgumentParser:
+    """Reuse benchmark parser and extend with build-related options."""
+    parser = get_parser(add_llm_config=False)
+    parser.description = "Script for build agent-server images."
+    parser.add_argument(
+        "--image",
+        default=EVAL_AGENT_SERVER_IMAGE,
+        help="Target repo/name for built image",
+    )
+    parser.add_argument(
+        "--target",
+        default="source-minimal",
+        help="Build target (source | source-minimal | binary | binary-minimal)",
+    )
+    parser.add_argument(
+        "--push", action="store_true", help="Push via buildx instead of load locally"
+    )
+    parser.add_argument(
+        "--max-workers", type=int, default=1, help="Concurrent builds (be cautious)"
+    )
+    parser.add_argument(
+        "--dry-run", action="store_true", help="List base images only, don’t build"
+    )
+    return parser
+
+
+def build_image(
+    base_image: str,
+    target_image: str,
+    custom_tag: str,
+    target: TargetType = "source-minimal",
+    push: bool = False,
+) -> BuildOutput:
+    opts = BuildOptions(
+        base_image=base_image,
+        custom_tags=custom_tag,
+        image=target_image,
+        target=target,
+        # SWE-Bench only supports linux/amd64 images
+        platforms=["linux/amd64"],
+        push=push,
+    )
+    tags = build(opts)
+    return BuildOutput(base_image=base_image, tags=tags, error=None)
+
+
+def _build_with_logging(
+    log_dir: Path,
+    base_image: str,
+    target_image: str,
+    target: TargetType = "source-minimal",
+    push: bool = False,
+    base_image_to_custom_tag_fn: Callable[[str], str] | None = None,
+) -> BuildOutput:
+    """
+    Module-level function for building a single image with output capture.
+    Must be at module level to be picklable for ProcessPoolExecutor.
+    """
+    with capture_output(base_image, log_dir) as log_path:
+        custom_tag = ""
+        if base_image_to_custom_tag_fn:
+            custom_tag = base_image_to_custom_tag_fn(base_image)
+        result = build_image(base_image, target_image, custom_tag, target, push)
+        result.log_path = str(log_path)
+        return result
+
+
+def _update_pbar(
+    pbar: tqdm,
+    successes: int,
+    failures: int,
+    running: int,
+    sample: str | None,
+    last_event: str | None,
+):
+    postfix = f"✅ {successes}  ❌ {failures}  🏃 {running}"
+    if sample:
+        postfix += f" ({sample})"
+    if last_event:
+        pbar.set_description(last_event)
+    pbar.set_postfix_str(postfix, refresh=True)
+
+
+def default_build_output_dir(
+    dataset: str, split: str, base_dir: Path | None = None
+) -> Path:
+    """
+    Default: ./builds/<dataset>/<split>
+    Keeps build outputs in one predictable place, easy to .gitignore.
+    """
+    root = (base_dir or Path.cwd()) / "builds" / dataset / split
+    root.mkdir(parents=True, exist_ok=True)
+    return root
+
+
+def build_all_images(
+    base_images: list[str],
+    target: TargetType,
+    build_dir: Path,
+    image: str = EVAL_AGENT_SERVER_IMAGE,
+    push: bool = False,
+    base_image_to_custom_tag_fn: Callable[[str], str] | None = None,
+    max_workers: int = 1,
+    dry_run: bool = False,
+) -> int:
+    """
+    Build all specified base images concurrently, logging output and
+    writing a manifest file.
+
+    Args:
+        base_images: List of base images to build from.
+        target: Build target type.
+        build_dir: Directory to store build logs and manifest.
+        image: Target image name for built images.
+        push: Whether to push images via buildx.
+        base_image_to_custom_tag_fn: Function to extract custom tag from base image.
+        max_workers: Number of concurrent builds.
+        dry_run: If True, only list base images without building.
+
+    Returns:
+        Exit code: 0 if all builds succeeded, 1 if any failed.
+    """
+
+    build_log_dir = build_dir / "logs"
+    manifest_path = build_dir / "manifest.jsonl"
+    manifest_path.parent.mkdir(parents=True, exist_ok=True)
+
+    if dry_run:
+        print("\n".join(base_images))
+        return 0
+
+    successes = 0
+    failures = 0
+    in_progress: set[str] = set()
+    mu = Lock()
+
+    with (
+        manifest_path.open("w") as writer,
+        tqdm(
+            total=len(base_images), desc="Building agent-server images", leave=True
+        ) as pbar,
+    ):
+        _update_pbar(pbar, successes, failures, 0, None, "Queueing")
+
+        # Single unified path: ProcessPoolExecutor( max_workers = args.max_workers ),
+        # even if it's 1. Using processes instead of threads ensures proper isolation
+        # of stdout/stderr and logging handlers, preventing output mixing between builds.
+        with ProcessPoolExecutor(max_workers=max_workers) as ex:
+            futures = {}
+            for base in base_images:
+                in_progress.add(base)
+                fut = ex.submit(
+                    _build_with_logging,
+                    build_log_dir,
+                    base,
+                    image,
+                    target,
+                    push,
+                    base_image_to_custom_tag_fn,
+                )
+                futures[fut] = base
+
+            _update_pbar(
+                pbar,
+                successes,
+                failures,
+                len(in_progress),
+                next(iter(in_progress), None),
+                "Running",
+            )
+
+            for fut in as_completed(futures):
+                base = futures[fut]
+                try:
+                    result: BuildOutput = fut.result()
+                    writer.write(result.model_dump_json() + "\n")
+                    writer.flush()
+                    with mu:
+                        successes += 1
+                    _update_pbar(
+                        pbar, successes, failures, len(in_progress), base, "✅ Done"
+                    )
+                except Exception as e:
+                    logger.error("Build failed for %s: %r", base, e)
+                    # Write a failure line to manifest; keep going.
+                    writer.write(
+                        BuildOutput(
+                            base_image=base, tags=[], error=repr(e)
+                        ).model_dump_json()
+                        + "\n"
+                    )
+                    writer.flush()
+                    with mu:
+                        failures += 1
+                    _update_pbar(
+                        pbar, successes, failures, len(in_progress), base, "❌ Failed"
+                    )
+                finally:
+                    with mu:
+                        in_progress.discard(base)
+                    pbar.update(1)
+                    _update_pbar(
+                        pbar,
+                        successes,
+                        failures,
+                        len(in_progress),
+                        next(iter(in_progress), None),
+                        None,
+                    )
+    logger.info(
+        "Done. Built=%d  Failed=%d  Manifest=%s",
+        successes,
+        failures,
+        str(manifest_path),
+    )
+    return 1 if failures else 0

From a4bf9e44b790855518f416eef315d8be9c89af4e Mon Sep 17 00:00:00 2001
From: Xingyao Wang <xingyao@all-hands.dev>
Date: Fri, 7 Nov 2025 20:38:33 +0000
Subject: [PATCH 40/66] simplify build on the fly logic

---
 benchmarks/swe_bench/run_infer.py | 89 +++++++++++++------------------
 1 file changed, 36 insertions(+), 53 deletions(-)

diff --git a/benchmarks/swe_bench/run_infer.py b/benchmarks/swe_bench/run_infer.py
index 91e00b2f..32d5ed63 100644
--- a/benchmarks/swe_bench/run_infer.py
+++ b/benchmarks/swe_bench/run_infer.py
@@ -4,7 +4,12 @@
 
 from jinja2 import Environment, FileSystemLoader
 
+from benchmarks.swe_bench.build_images import (
+    extract_custom_tag,
+    get_official_docker_image,
+)
 from benchmarks.utils.args_parser import get_parser
+from benchmarks.utils.build_utils import build_image
 from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE
 from benchmarks.utils.dataset import get_dataset
 from benchmarks.utils.evaluation import Evaluation
@@ -27,45 +32,6 @@
 logger = get_logger(__name__)
 
 
-def extract_custom_tag(base_image: str) -> str:
-    """
-    Extract SWE-Bench instance ID from base image name.
-
-    Example:
-        docker.io/swebench/sweb.eval.x86_64.django_1776_django-12155:latest
-        -> sweb.eval.x86_64.django_1776_django-12155
-    """
-    name_tag = base_image.split("/")[-1]
-    name = name_tag.split(":")[0]
-    return name
-
-
-def get_official_docker_image(
-    instance_id: str,
-    docker_image_prefix="docker.io/swebench/",
-) -> str:
-    # Official SWE-Bench image
-    # swebench/sweb.eval.x86_64.django_1776_django-11333:v1
-    repo, name = instance_id.split("__")
-    official_image_name = docker_image_prefix.rstrip("/")
-    official_image_name += f"/sweb.eval.x86_64.{repo}_1776_{name}:latest".lower()
-    logger.debug(f"Official SWE-Bench image: {official_image_name}")
-    return official_image_name
-
-
-def get_agent_server_docker_image(
-    instance_id: str,
-    docker_image_prefix="docker.io/swebench/",
-    target: str = "source-minimal",
-) -> str:
-    official_image_name = get_official_docker_image(instance_id, docker_image_prefix)
-    custom_tag = extract_custom_tag(official_image_name)
-
-    # For non-binary targets, append target suffix
-    suffix = f"-{target}" if target != "binary" else ""
-    return f"{EVAL_AGENT_SERVER_IMAGE}:{SDK_SHORT_SHA}-{custom_tag}{suffix}"
-
-
 def get_instruction(
     instance: dict,
     metadata: EvalMetadata,
@@ -132,26 +98,43 @@ def prepare_workspace(self, instance: EvalInstance) -> RemoteWorkspace:
         """
         SKIP_BUILD = os.getenv("SKIP_BUILD", "1").lower() in ("1", "true", "yes")
         logger.info(f"SKIP_BUILD={SKIP_BUILD}")
-        if SKIP_BUILD:
-            agent_server_image = get_agent_server_docker_image(instance.id)
-            workspace = DockerWorkspace(
-                server_image=agent_server_image,
-                working_dir="/workspace",
-            )
-        else:
-            official_docker_image = get_official_docker_image(instance.id)
-            workspace = DockerWorkspace(
-                base_image=official_docker_image,
-                working_dir="/workspace",
-                target="source-minimal",
-            )
+        official_docker_image = get_official_docker_image(instance.id)
+        build_target = "source-minimal"
+        custom_tag = extract_custom_tag(official_docker_image)
+
+        # For non-binary targets, append target suffix
+        suffix = f"-{build_target}" if build_target != "binary" else ""
+        agent_server_image = (
+            f"{EVAL_AGENT_SERVER_IMAGE}:{SDK_SHORT_SHA}-{custom_tag}{suffix}"
+        )
+        if not SKIP_BUILD:
             logger.info(
-                f"Building workspace from {official_docker_image}. "
+                f"Building workspace from {official_docker_image} "
+                f"for instance {instance.id}. "
                 "This may take a while...\n"
                 "You can run benchmarks/swe_bench/build_images.py and set "
                 "SWE_BENCH_SKIP_BUILD=1 to skip building and use pre-built "
                 "agent-server image."
             )
+            output = build_image(
+                base_image=official_docker_image,
+                target_image=EVAL_AGENT_SERVER_IMAGE,
+                custom_tag=custom_tag,
+                target=build_target,
+                push=False,
+            )
+            logger.info(f"Image build output: {output}")
+            assert output.error is None, f"Image build failed: {output.error}"
+            if agent_server_image not in output.tags:
+                raise RuntimeError(
+                    f"Built image tags {output.tags} do not include expected tag "
+                    f"{agent_server_image}"
+                )
+
+        workspace = DockerWorkspace(
+            server_image=agent_server_image,
+            working_dir="/workspace",
+        )
         for cmd in self.metadata.env_setup_commands or []:
             res = workspace.execute_command(cmd)
             if res.exit_code != 0:

From 9ef0d4849df66507832f0a622010024dafcd0ba5 Mon Sep 17 00:00:00 2001
From: Xingyao Wang <xingyao@all-hands.dev>
Date: Fri, 7 Nov 2025 20:45:29 +0000
Subject: [PATCH 41/66] remove targets and platform

---
 .github/workflows/build-swe-bench-images.yml | 21 --------------------
 1 file changed, 21 deletions(-)

diff --git a/.github/workflows/build-swe-bench-images.yml b/.github/workflows/build-swe-bench-images.yml
index bd4d741a..a1e1d4b9 100644
--- a/.github/workflows/build-swe-bench-images.yml
+++ b/.github/workflows/build-swe-bench-images.yml
@@ -14,21 +14,6 @@ on:
         required: true
         default: 'test'
         type: string
-      target:
-        description: 'Build target (source | source-minimal | binary | binary-minimal)'
-        required: false
-        default: 'source-minimal'
-        type: choice
-        options:
-          - source
-          - source-minimal
-          - binary
-          - binary-minimal
-      platforms:
-        description: 'Comma-separated platforms (e.g., linux/amd64,linux/arm64)'
-        required: false
-        default: 'linux/amd64'
-        type: string
       max-workers:
         description: 'Number of concurrent builds'
         required: false
@@ -44,8 +29,6 @@ on:
 env:
   DATASET: princeton-nlp/SWE-bench_Verified
   SPLIT: test
-  TARGET: source-minimal
-  PLATFORMS: linux/amd64
   MAX_WORKERS: '2'      # modest concurrency for reliability
   N_LIMIT: '10'           # empty = no limit
 
@@ -75,8 +58,6 @@ jobs:
         run: |
           if [ -n "${{ inputs.dataset }}" ]; then echo "DATASET=${{ inputs.dataset }}" >> "$GITHUB_ENV"; fi
           if [ -n "${{ inputs.split }}" ]; then echo "SPLIT=${{ inputs.split }}" >> "$GITHUB_ENV"; fi
-          if [ -n "${{ inputs.target }}" ]; then echo "TARGET=${{ inputs.target }}" >> "$GITHUB_ENV"; fi
-          if [ -n "${{ inputs.platforms }}" ]; then echo "PLATFORMS=${{ inputs.platforms }}" >> "$GITHUB_ENV"; fi
           if [ -n "${{ inputs.max-workers }}" ]; then echo "MAX_WORKERS=${{ inputs.max-workers }}" >> "$GITHUB_ENV"; fi
           # Empty string means "no limit"
           if [ -n "${{ inputs.n-limit }}" ]; then echo "N_LIMIT=${{ inputs.n-limit }}" >> "$GITHUB_ENV"; else echo "N_LIMIT=" >> "$GITHUB_ENV"; fi
@@ -108,8 +89,6 @@ jobs:
             --dataset '${DATASET}' \
             --split '${SPLIT}' \
             --image ghcr.io/openhands/eval-agent-server \
-            --target '${TARGET}' \
-            --platforms '${PLATFORMS}' \
             --push \
             --max-workers '${MAX_WORKERS}'"
 

From 06e994a02c9db113b18466f1340438b0d0020342 Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Fri, 7 Nov 2025 20:57:56 +0000
Subject: [PATCH 42/66] Add automatic comment to issue #81 on successful build

This adds a new step to the build-and-push workflow that:
- Posts a comment to issue #81 when the build completes successfully
- Includes dataset name, split, SDK version, and workflow run link
- Lists all built image tags in a collapsible markdown section

Co-authored-by: openhands <openhands@all-hands.dev>
---
 .github/workflows/build-swe-bench-images.yml | 51 +++++++++++++++++++-
 1 file changed, 50 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/build-swe-bench-images.yml b/.github/workflows/build-swe-bench-images.yml
index a1e1d4b9..e6686281 100644
--- a/.github/workflows/build-swe-bench-images.yml
+++ b/.github/workflows/build-swe-bench-images.yml
@@ -41,10 +41,11 @@ jobs:
     runs-on:
       labels: blacksmith-32vcpu-ubuntu-2204
 
-    # Allow pushing to GHCR
+    # Allow pushing to GHCR and commenting on issues
     permissions:
       contents: read
       packages: write
+      issues: write
 
     steps:
       - name: Checkout repository
@@ -140,3 +141,51 @@ jobs:
             echo "## Build Summary" >> "$GITHUB_STEP_SUMMARY"
             cat builds/*/summary.json | python -m json.tool >> "$GITHUB_STEP_SUMMARY"
           fi
+
+      - name: Comment on tracker issue
+        if: success()
+        run: |
+          # Get SDK version from submodule
+          SDK_SHA=$(git submodule status vendor/software-agent-sdk | awk '{print $1}' | sed 's/^[+-]//')
+          
+          # Count total images built
+          TOTAL_IMAGES=$(cat builds/*/manifest.jsonl | wc -l)
+          
+          # Extract all tags and format them as a markdown list
+          TAGS=$(cat builds/*/manifest.jsonl | python -c "
+          import sys
+          import json
+          for line in sys.stdin:
+              data = json.loads(line.strip())
+              if data.get('tags'):
+                  for tag in data['tags']:
+                      print(f'- \`{tag}\`')
+          ")
+          
+          # Create the comment body
+          COMMENT_BODY=$(cat <<EOF
+          ## Build Complete ✅
+
+          **Dataset:** \`${DATASET}\`
+          **Split:** \`${SPLIT}\`
+          **SDK Version:** [\`${SDK_SHA:0:7}\`](https://github.com/All-Hands-AI/agent-sdk/commit/${SDK_SHA})
+          **Workflow Run:** [#${{ github.run_id }}](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }})
+
+          <details>
+          <summary>Built Tags (${TOTAL_IMAGES} images)</summary>
+
+          ${TAGS}
+
+          </details>
+          EOF
+          )
+          
+          # Post comment to issue #81
+          curl -L -X POST \
+            -H "Accept: application/vnd.github+json" \
+            -H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \
+            -H "X-GitHub-Api-Version: 2022-11-28" \
+            "${{ github.api_url }}/repos/${{ github.repository }}/issues/81/comments" \
+            -d "$(jq -n --arg body "$COMMENT_BODY" '{body: $body}')"
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

From fba2a557bff7d5386da1c1aa6e2f1e7a6442a8dd Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Fri, 7 Nov 2025 21:02:16 +0000
Subject: [PATCH 43/66] Fix SDK URL and add workflow trigger information

- Corrected SDK repository URL from All-Hands-AI/agent-sdk to OpenHands/software-agent-sdk
- Added 'Triggered by' field to comment to show workflow trigger source
- Updated .openhands/microagents/repo.md with correct SDK URL

Co-authored-by: openhands <openhands@all-hands.dev>
---
 .github/workflows/build-swe-bench-images.yml | 12 +++++++++++-
 .openhands/microagents/repo.md               |  2 +-
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/build-swe-bench-images.yml b/.github/workflows/build-swe-bench-images.yml
index e6686281..dc3cc849 100644
--- a/.github/workflows/build-swe-bench-images.yml
+++ b/.github/workflows/build-swe-bench-images.yml
@@ -162,14 +162,24 @@ jobs:
                       print(f'- \`{tag}\`')
           ")
           
+          # Determine how the workflow was triggered
+          if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
+            TRIGGER="Manual trigger (workflow_dispatch)"
+          elif [ "${{ github.event_name }}" = "pull_request" ]; then
+            TRIGGER="Pull request [#${{ github.event.pull_request.number }}](${{ github.event.pull_request.html_url }})"
+          else
+            TRIGGER="${{ github.event_name }}"
+          fi
+          
           # Create the comment body
           COMMENT_BODY=$(cat <<EOF
           ## Build Complete ✅
 
           **Dataset:** \`${DATASET}\`
           **Split:** \`${SPLIT}\`
-          **SDK Version:** [\`${SDK_SHA:0:7}\`](https://github.com/All-Hands-AI/agent-sdk/commit/${SDK_SHA})
+          **SDK Version:** [\`${SDK_SHA:0:7}\`](https://github.com/OpenHands/software-agent-sdk/commit/${SDK_SHA})
           **Workflow Run:** [#${{ github.run_id }}](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }})
+          **Triggered by:** ${TRIGGER}
 
           <details>
           <summary>Built Tags (${TOTAL_IMAGES} images)</summary>
diff --git a/.openhands/microagents/repo.md b/.openhands/microagents/repo.md
index 8aa0c1dc..0206a51d 100644
--- a/.openhands/microagents/repo.md
+++ b/.openhands/microagents/repo.md
@@ -84,7 +84,7 @@ make build  # Rebuild environment
 5. Update README.md with usage instructions
 
 # LLM Configuration
-LLM configs use JSON matching the [LLM class schema](https://github.com/All-Hands-AI/agent-sdk/blob/main/openhands/sdk/llm/llm.py#L93):
+LLM configs use JSON matching the [LLM class schema](https://github.com/OpenHands/software-agent-sdk/blob/main/openhands/sdk/llm/llm.py#L93):
 ```json
 {
   "model": "litellm_proxy/anthropic/claude-sonnet-4-20250514",

From 0ab219fa542dfddd31a6cd4321b6f208332af326 Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Fri, 7 Nov 2025 21:02:40 +0000
Subject: [PATCH 44/66] Update .gitignore to properly allow
 .openhands/microagents/

Changed .openhands/ to .openhands/* so that negation patterns work correctly

Co-authored-by: openhands <openhands@all-hands.dev>
---
 .gitignore | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index 77513d9d..459fad58 100644
--- a/.gitignore
+++ b/.gitignore
@@ -205,7 +205,7 @@ cython_debug/
 workspace/
 
 # IDE and editor directories
-.openhands/
+.openhands/*
 !.openhands/setup.sh
 !.openhands/microagents/
 .vscode/

From aa8b452fdf28906b01f3b914ed1844ed6d0ab1eb Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Fri, 7 Nov 2025 22:30:57 +0000
Subject: [PATCH 45/66] Add error handling to skip comment when no images are
 built

The comment step now checks if manifest.jsonl files exist and contain
data before attempting to post a comment. This prevents posting comments
with '0 images' when builds complete successfully but produce no output
(e.g., during PR testing or when the build step is skipped).

Co-authored-by: openhands <openhands@all-hands.dev>
---
 .github/workflows/build-swe-bench-images.yml | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/build-swe-bench-images.yml b/.github/workflows/build-swe-bench-images.yml
index dc3cc849..eb78f195 100644
--- a/.github/workflows/build-swe-bench-images.yml
+++ b/.github/workflows/build-swe-bench-images.yml
@@ -148,8 +148,21 @@ jobs:
           # Get SDK version from submodule
           SDK_SHA=$(git submodule status vendor/software-agent-sdk | awk '{print $1}' | sed 's/^[+-]//')
           
+          # Check if manifest files exist
+          if ! ls builds/*/manifest.jsonl >/dev/null 2>&1; then
+            echo "No manifest.jsonl files found in builds directory"
+            echo "Build may have completed but produced no images"
+            exit 0
+          fi
+          
           # Count total images built
-          TOTAL_IMAGES=$(cat builds/*/manifest.jsonl | wc -l)
+          TOTAL_IMAGES=$(cat builds/*/manifest.jsonl 2>/dev/null | wc -l)
+          
+          if [ "$TOTAL_IMAGES" -eq 0 ]; then
+            echo "No images found in manifest files"
+            echo "Skipping comment as there are no built images to report"
+            exit 0
+          fi
           
           # Extract all tags and format them as a markdown list
           TAGS=$(cat builds/*/manifest.jsonl | python -c "

From a95969eba03a80d714745c0a252580d3fb3a8a09 Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Fri, 7 Nov 2025 22:45:32 +0000
Subject: [PATCH 46/66] Fix manifest file path detection using find command

The previous check using 'builds/*/manifest.jsonl' only looked one level deep,
but the actual path is 'builds/princeton-nlp/SWE-bench_Verified/test/manifest.jsonl'
which is three levels deep. Using 'find' command now correctly locates manifest
files at any depth within the builds directory.

Tested with actual artifact from run #19182998503 containing 10 images.

Co-authored-by: openhands <openhands@all-hands.dev>
---
 .github/workflows/build-swe-bench-images.yml | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/build-swe-bench-images.yml b/.github/workflows/build-swe-bench-images.yml
index eb78f195..ded68a76 100644
--- a/.github/workflows/build-swe-bench-images.yml
+++ b/.github/workflows/build-swe-bench-images.yml
@@ -148,15 +148,17 @@ jobs:
           # Get SDK version from submodule
           SDK_SHA=$(git submodule status vendor/software-agent-sdk | awk '{print $1}' | sed 's/^[+-]//')
           
-          # Check if manifest files exist
-          if ! ls builds/*/manifest.jsonl >/dev/null 2>&1; then
+          # Find all manifest.jsonl files
+          MANIFEST_FILES=$(find builds -name "manifest.jsonl" -type f 2>/dev/null)
+          
+          if [ -z "$MANIFEST_FILES" ]; then
             echo "No manifest.jsonl files found in builds directory"
             echo "Build may have completed but produced no images"
             exit 0
           fi
           
           # Count total images built
-          TOTAL_IMAGES=$(cat builds/*/manifest.jsonl 2>/dev/null | wc -l)
+          TOTAL_IMAGES=$(cat $MANIFEST_FILES 2>/dev/null | wc -l)
           
           if [ "$TOTAL_IMAGES" -eq 0 ]; then
             echo "No images found in manifest files"
@@ -165,7 +167,7 @@ jobs:
           fi
           
           # Extract all tags and format them as a markdown list
-          TAGS=$(cat builds/*/manifest.jsonl | python -c "
+          TAGS=$(cat $MANIFEST_FILES | python -c "
           import sys
           import json
           for line in sys.stdin:

From 46b52667181118742a003204ae41b224b487d3e3 Mon Sep 17 00:00:00 2001
From: Xingyao Wang <xingyao@all-hands.dev>
Date: Fri, 7 Nov 2025 22:48:50 +0000
Subject: [PATCH 47/66] bump sdk

---
 vendor/software-agent-sdk | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vendor/software-agent-sdk b/vendor/software-agent-sdk
index a7a93a7a..a90d1345 160000
--- a/vendor/software-agent-sdk
+++ b/vendor/software-agent-sdk
@@ -1 +1 @@
-Subproject commit a7a93a7a48f13ee6398f15b67b2bf339e647786a
+Subproject commit a90d1345274403dd32a08f4415a0297d8b87e790

From 16526b3ece575f8c7d9fc2345703f175d888e8d4 Mon Sep 17 00:00:00 2001
From: Xingyao Wang <xingyao@all-hands.dev>
Date: Fri, 7 Nov 2025 22:49:08 +0000
Subject: [PATCH 48/66] increase max workers and n limit

---
 .github/workflows/build-swe-bench-images.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/build-swe-bench-images.yml b/.github/workflows/build-swe-bench-images.yml
index ded68a76..10970a92 100644
--- a/.github/workflows/build-swe-bench-images.yml
+++ b/.github/workflows/build-swe-bench-images.yml
@@ -29,8 +29,8 @@ on:
 env:
   DATASET: princeton-nlp/SWE-bench_Verified
   SPLIT: test
-  MAX_WORKERS: '2'      # modest concurrency for reliability
-  N_LIMIT: '10'           # empty = no limit
+  MAX_WORKERS: '16'      # modest concurrency for reliability
+  N_LIMIT: '50'           # empty = no limit
 
 concurrency:
   group: build-swe-bench-${{ github.ref }}

From 90ee94eb2358e3e38144d67699c3eac7c9fbebe9 Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Fri, 7 Nov 2025 22:50:25 +0000
Subject: [PATCH 49/66] Show only one tag per image in issue comment

Each image has multiple tags (base tag + detailed tag with hash).
Now showing only the first (cleaner) tag per image to reduce clutter
in the issue comment, making it easier to read.

Co-authored-by: openhands <openhands@all-hands.dev>
---
 .github/workflows/build-swe-bench-images.yml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/build-swe-bench-images.yml b/.github/workflows/build-swe-bench-images.yml
index 10970a92..792a078c 100644
--- a/.github/workflows/build-swe-bench-images.yml
+++ b/.github/workflows/build-swe-bench-images.yml
@@ -166,15 +166,15 @@ jobs:
             exit 0
           fi
           
-          # Extract all tags and format them as a markdown list
+          # Extract all tags and format them as a markdown list (one tag per image)
           TAGS=$(cat $MANIFEST_FILES | python -c "
           import sys
           import json
           for line in sys.stdin:
               data = json.loads(line.strip())
-              if data.get('tags'):
-                  for tag in data['tags']:
-                      print(f'- \`{tag}\`')
+              if data.get('tags') and len(data['tags']) > 0:
+                  # Only show the first tag per image to reduce clutter
+                  print(f'- \`{data[\"tags\"][0]}\`')
           ")
           
           # Determine how the workflow was triggered

From 2d10954776bd96edba994f3dcfd50320cc68eff1 Mon Sep 17 00:00:00 2001
From: Xingyao Wang <xingyao@all-hands.dev>
Date: Sat, 8 Nov 2025 18:53:50 +0000
Subject: [PATCH 50/66] bump sdk commit

---
 vendor/software-agent-sdk | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vendor/software-agent-sdk b/vendor/software-agent-sdk
index a90d1345..aa954ce8 160000
--- a/vendor/software-agent-sdk
+++ b/vendor/software-agent-sdk
@@ -1 +1 @@
-Subproject commit a90d1345274403dd32a08f4415a0297d8b87e790
+Subproject commit aa954ce876c55cf4b18e2296ca4458a7b1a44620

From 178123e8b5a216992ba31e85e18620f298b0b9cd Mon Sep 17 00:00:00 2001
From: Xingyao Wang <xingyao@all-hands.dev>
Date: Sat, 8 Nov 2025 18:54:20 +0000
Subject: [PATCH 51/66] increase to 500 limit and 32 concurrency

---
 .github/workflows/build-swe-bench-images.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/build-swe-bench-images.yml b/.github/workflows/build-swe-bench-images.yml
index 792a078c..de1aac2b 100644
--- a/.github/workflows/build-swe-bench-images.yml
+++ b/.github/workflows/build-swe-bench-images.yml
@@ -29,8 +29,8 @@ on:
 env:
   DATASET: princeton-nlp/SWE-bench_Verified
   SPLIT: test
-  MAX_WORKERS: '16'      # modest concurrency for reliability
-  N_LIMIT: '50'           # empty = no limit
+  MAX_WORKERS: '32'
+  N_LIMIT: '500'
 
 concurrency:
   group: build-swe-bench-${{ github.ref }}

From 061913409ec0222b8bde4141691dcfa6a0c4696d Mon Sep 17 00:00:00 2001
From: Xingyao Wang <xingyao@all-hands.dev>
Date: Mon, 10 Nov 2025 19:59:41 +0000
Subject: [PATCH 52/66] disable rebuild on every push

---
 .github/workflows/build-swe-bench-images.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build-swe-bench-images.yml b/.github/workflows/build-swe-bench-images.yml
index de1aac2b..007976e1 100644
--- a/.github/workflows/build-swe-bench-images.yml
+++ b/.github/workflows/build-swe-bench-images.yml
@@ -1,7 +1,7 @@
 name: Build SWE-Bench Images
 
 on:
-  pull_request:  # for debugging
+  # pull_request:  # for debugging
   workflow_dispatch:
     inputs:
       dataset:

From e67b9b081a50052693f67d632c5adc54ef9ab055 Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Mon, 10 Nov 2025 20:28:57 +0000
Subject: [PATCH 53/66] Fix workflow summary mismatch: use manifest.jsonl
 instead of summary.json

The new builder in build_utils.py only writes manifest.jsonl, not summary.json.
This commit updates the workflow to:
- Remove summary.json from artifact upload path
- Generate build summary from manifest.jsonl instead of summary.json
- Display total/successful/failed counts and list failed builds

Co-authored-by: openhands <openhands@all-hands.dev>
---
 .github/workflows/build-swe-bench-images.yml | 50 +++++++++++++++++---
 1 file changed, 44 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/build-swe-bench-images.yml b/.github/workflows/build-swe-bench-images.yml
index 007976e1..f5c8d63f 100644
--- a/.github/workflows/build-swe-bench-images.yml
+++ b/.github/workflows/build-swe-bench-images.yml
@@ -109,9 +109,7 @@ jobs:
         uses: actions/upload-artifact@v4
         with:
           name: build-manifest-${{ github.run_id }}
-          path: |
-            builds/**/manifest.jsonl
-            builds/**/summary.json
+          path: builds/**/manifest.jsonl
           retention-days: 30
 
       - name: Archive build logs
@@ -137,9 +135,49 @@ jobs:
       - name: Display build summary
         if: always()
         run: |
-          if ls builds/*/summary.json >/dev/null 2>&1; then
-            echo "## Build Summary" >> "$GITHUB_STEP_SUMMARY"
-            cat builds/*/summary.json | python -m json.tool >> "$GITHUB_STEP_SUMMARY"
+          # Find all manifest.jsonl files
+          MANIFEST_FILES=$(find builds -name "manifest.jsonl" -type f 2>/dev/null)
+          
+          if [ -z "$MANIFEST_FILES" ]; then
+            echo "No manifest.jsonl files found"
+            exit 0
+          fi
+          
+          # Generate summary from manifest files
+          echo "## Build Summary" >> "$GITHUB_STEP_SUMMARY"
+          echo "" >> "$GITHUB_STEP_SUMMARY"
+          
+          # Count successes and failures
+          TOTAL=$(cat $MANIFEST_FILES 2>/dev/null | wc -l)
+          SUCCESSES=$(cat $MANIFEST_FILES 2>/dev/null | python -c "
+          import sys
+          import json
+          count = 0
+          for line in sys.stdin:
+              data = json.loads(line.strip())
+              if data.get('error') is None and len(data.get('tags', [])) > 0:
+                  count += 1
+          print(count)
+          ")
+          FAILURES=$((TOTAL - SUCCESSES))
+          
+          echo "**Total Images:** $TOTAL" >> "$GITHUB_STEP_SUMMARY"
+          echo "**Successful Builds:** ✅ $SUCCESSES" >> "$GITHUB_STEP_SUMMARY"
+          echo "**Failed Builds:** ❌ $FAILURES" >> "$GITHUB_STEP_SUMMARY"
+          echo "" >> "$GITHUB_STEP_SUMMARY"
+          
+          # Show failed builds if any
+          if [ "$FAILURES" -gt 0 ]; then
+            echo "### Failed Builds" >> "$GITHUB_STEP_SUMMARY"
+            echo "" >> "$GITHUB_STEP_SUMMARY"
+            cat $MANIFEST_FILES | python -c "
+          import sys
+          import json
+          for line in sys.stdin:
+              data = json.loads(line.strip())
+              if data.get('error') is not None or len(data.get('tags', [])) == 0:
+                  print(f\"- \`{data.get('base_image', 'unknown')}\`: {data.get('error') or 'No tags generated'}\")  # error key may be present but null
+          " >> "$GITHUB_STEP_SUMMARY"
           fi
 
       - name: Comment on tracker issue

From 822e41747c5735a5252b18b91ca92514d58b9267 Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Mon, 10 Nov 2025 20:34:57 +0000
Subject: [PATCH 54/66] Remove redundant 'Upload build manifest' step

The 'Archive build logs' step already packages the entire builds/ directory
(including manifest.jsonl files) into build-logs.tar.gz, so a separate step
to upload manifest.jsonl is redundant.

Co-authored-by: openhands <openhands@all-hands.dev>
---
 .github/workflows/build-swe-bench-images.yml | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/.github/workflows/build-swe-bench-images.yml b/.github/workflows/build-swe-bench-images.yml
index f5c8d63f..7a783335 100644
--- a/.github/workflows/build-swe-bench-images.yml
+++ b/.github/workflows/build-swe-bench-images.yml
@@ -104,14 +104,6 @@ jobs:
           DOCKER_BUILDKIT: 1
           BUILDKIT_PROGRESS: plain
 
-      - name: Upload build manifest
-        if: always()
-        uses: actions/upload-artifact@v4
-        with:
-          name: build-manifest-${{ github.run_id }}
-          path: builds/**/manifest.jsonl
-          retention-days: 30
-
       - name: Archive build logs
         if: always()
         run: |

From 04f0cf4fbf8822ac6ba669dc5f99402ef6d1caa4 Mon Sep 17 00:00:00 2001
From: Xingyao Wang <xingyao@all-hands.dev>
Date: Tue, 11 Nov 2025 19:29:04 +0000
Subject: [PATCH 55/66] bump sdk to v1.1

---
 vendor/software-agent-sdk | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vendor/software-agent-sdk b/vendor/software-agent-sdk
index aa954ce8..f45c900e 160000
--- a/vendor/software-agent-sdk
+++ b/vendor/software-agent-sdk
@@ -1 +1 @@
-Subproject commit aa954ce876c55cf4b18e2296ca4458a7b1a44620
+Subproject commit f45c900e98db4d6623ff724ddfc769307494ab89

From a1c93c9c36bc8f8b359ed5aab6ae946942e17922 Mon Sep 17 00:00:00 2001
From: Xingyao Wang <xingyao@all-hands.dev>
Date: Tue, 11 Nov 2025 19:34:48 +0000
Subject: [PATCH 56/66] support remote runtime & bump ver again

---
 benchmarks/swe_bench/run_infer.py | 90 ++++++++++++++++++++-----------
 benchmarks/utils/args_parser.py   |  7 +++
 benchmarks/utils/models.py        |  6 ++-
 uv.lock                           |  8 +--
 vendor/software-agent-sdk         |  2 +-
 5 files changed, 76 insertions(+), 37 deletions(-)

diff --git a/benchmarks/swe_bench/run_infer.py b/benchmarks/swe_bench/run_infer.py
index 32d5ed63..79b66ed2 100644
--- a/benchmarks/swe_bench/run_infer.py
+++ b/benchmarks/swe_bench/run_infer.py
@@ -26,7 +26,7 @@
 from openhands.sdk import LLM, Agent, Conversation, get_logger
 from openhands.sdk.workspace import RemoteWorkspace
 from openhands.tools.preset.default import get_default_tools
-from openhands.workspace import DockerWorkspace
+from openhands.workspace import APIRemoteWorkspace, DockerWorkspace
 
 
 logger = get_logger(__name__)
@@ -96,45 +96,72 @@ def prepare_workspace(self, instance: EvalInstance) -> RemoteWorkspace:
         """
         Use DockerWorkspace by default.
         """
-        SKIP_BUILD = os.getenv("SKIP_BUILD", "1").lower() in ("1", "true", "yes")
-        logger.info(f"SKIP_BUILD={SKIP_BUILD}")
         official_docker_image = get_official_docker_image(instance.id)
         build_target = "source-minimal"
         custom_tag = extract_custom_tag(official_docker_image)
-
         # For non-binary targets, append target suffix
         suffix = f"-{build_target}" if build_target != "binary" else ""
-        agent_server_image = (
-            f"{EVAL_AGENT_SERVER_IMAGE}:{SDK_SHORT_SHA}-{custom_tag}{suffix}"
-        )
-        if not SKIP_BUILD:
-            logger.info(
-                f"Building workspace from {official_docker_image} "
-                f"for instance {instance.id}. "
-                "This may take a while...\n"
-                "You can run benchmarks/swe_bench/build_images.py and set "
-                "SWE_BENCH_SKIP_BUILD=1 to skip building and use pre-built "
-                "agent-server image."
+
+        if self.metadata.workspace_type == "docker":
+            agent_server_image = (
+                f"{EVAL_AGENT_SERVER_IMAGE}:{SDK_SHORT_SHA}-{custom_tag}{suffix}"
             )
-            output = build_image(
-                base_image=official_docker_image,
-                target_image=EVAL_AGENT_SERVER_IMAGE,
-                custom_tag=custom_tag,
-                target=build_target,
-                push=False,
+            SKIP_BUILD = os.getenv("SKIP_BUILD", "1").lower() in ("1", "true", "yes")
+            logger.info(f"SKIP_BUILD={SKIP_BUILD}")
+            if not SKIP_BUILD:
+                logger.info(
+                    f"Building workspace from {official_docker_image} "
+                    f"for instance {instance.id}. "
+                    "This may take a while...\n"
+                    "You can run benchmarks/swe_bench/build_images.py and set "
+                    "SKIP_BUILD=1 to skip building and use pre-built "  # must name the env var read above
+                    "agent-server image."
+                )
+                output = build_image(
+                    base_image=official_docker_image,
+                    target_image=EVAL_AGENT_SERVER_IMAGE,
+                    custom_tag=custom_tag,
+                    target=build_target,
+                    push=False,
+                )
+                logger.info(f"Image build output: {output}")
+                assert output.error is None, f"Image build failed: {output.error}"
+                if agent_server_image not in output.tags:
+                    raise RuntimeError(
+                        f"Built image tags {output.tags} do not include expected tag "
+                        f"{agent_server_image}"
+                    )
+
+            workspace = DockerWorkspace(
+                server_image=agent_server_image,
+                working_dir="/workspace",
             )
-            logger.info(f"Image build output: {output}")
-            assert output.error is None, f"Image build failed: {output.error}"
-            if agent_server_image not in output.tags:
-                raise RuntimeError(
-                    f"Built image tags {output.tags} do not include expected tag "
-                    f"{agent_server_image}"
+        elif self.metadata.workspace_type == "remote":
+            runtime_api_key = os.getenv("RUNTIME_API_KEY")
+            sdk_short_sha = os.getenv("SDK_SHORT_SHA", SDK_SHORT_SHA)
+            if not runtime_api_key:
+                raise ValueError(
+                    "RUNTIME_API_KEY environment variable is not set for remote workspace"
                 )
 
-        workspace = DockerWorkspace(
-            server_image=agent_server_image,
-            working_dir="/workspace",
-        )
+            agent_server_image = (
+                f"{EVAL_AGENT_SERVER_IMAGE}:{sdk_short_sha}-{custom_tag}{suffix}"
+            )
+            logger.info(
+                f"Using remote workspace with image {agent_server_image} (sdk sha: {sdk_short_sha})"
+            )
+            workspace = APIRemoteWorkspace(
+                runtime_api_url=os.getenv(
+                    "RUNTIME_API_URL", "https://runtime.eval.all-hands.dev"
+                ),
+                runtime_api_key=runtime_api_key,
+                server_image="ghcr.io/openhands/agent-server:main-python",
+            )
+        else:
+            raise ValueError(
+                f"Unsupported workspace_type: {self.metadata.workspace_type}"
+            )
+
         for cmd in self.metadata.env_setup_commands or []:
             res = workspace.execute_command(cmd)
             if res.exit_code != 0:
@@ -297,6 +324,7 @@ def main() -> None:
         critic_name=args.critic,
         selected_instances_file=args.select,
         max_retries=args.max_retries,
+        workspace_type=args.workspace,
     )
 
     # Run orchestrator with a simple JSONL writer
diff --git a/benchmarks/utils/args_parser.py b/benchmarks/utils/args_parser.py
index cb1584e5..56f950ad 100644
--- a/benchmarks/utils/args_parser.py
+++ b/benchmarks/utils/args_parser.py
@@ -25,6 +25,13 @@ def get_parser(add_llm_config: bool = True) -> argparse.ArgumentParser:
         help="Dataset name",
     )
     parser.add_argument("--split", type=str, default="test", help="Dataset split")
+    parser.add_argument(
+        "--workspace",
+        type=str,
+        default="docker",
+        choices=["docker", "remote"],
+        help="Type of workspace to use (default: docker)",
+    )
     parser.add_argument(
         "--max-iterations", type=int, default=100, help="Maximum iterations"
     )
diff --git a/benchmarks/utils/models.py b/benchmarks/utils/models.py
index b10df1f3..d3599772 100644
--- a/benchmarks/utils/models.py
+++ b/benchmarks/utils/models.py
@@ -1,4 +1,4 @@
-from typing import Any
+from typing import Any, Literal
 
 from pydantic import BaseModel, Field
 
@@ -45,6 +45,10 @@ class EvalMetadata(BaseModel):
         ge=0,
         description="Maximum number of retries for instances that throw exceptions",
     )
+    workspace_type: Literal["docker", "remote"] = Field(
+        default="docker",
+        description="Type of workspace to use, e.g., 'docker' or 'remote'",
+    )
 
 
 EvalInstanceID = str
diff --git a/uv.lock b/uv.lock
index 6ea29413..ab6872cf 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1942,7 +1942,7 @@ wheels = [
 
 [[package]]
 name = "openhands-agent-server"
-version = "1.0.0"
+version = "1.1.0"
 source = { editable = "vendor/software-agent-sdk/openhands-agent-server" }
 dependencies = [
     { name = "aiosqlite" },
@@ -2034,7 +2034,7 @@ dev = [
 
 [[package]]
 name = "openhands-sdk"
-version = "1.0.0"
+version = "1.1.0"
 source = { editable = "vendor/software-agent-sdk/openhands-sdk" }
 dependencies = [
     { name = "fastmcp" },
@@ -2070,7 +2070,7 @@ provides-extras = ["boto3"]
 
 [[package]]
 name = "openhands-tools"
-version = "1.0.0"
+version = "1.1.0"
 source = { editable = "vendor/software-agent-sdk/openhands-tools" }
 dependencies = [
     { name = "bashlex" },
@@ -2097,7 +2097,7 @@ requires-dist = [
 
 [[package]]
 name = "openhands-workspace"
-version = "1.0.0"
+version = "1.1.0"
 source = { editable = "vendor/software-agent-sdk/openhands-workspace" }
 dependencies = [
     { name = "openhands-sdk" },
diff --git a/vendor/software-agent-sdk b/vendor/software-agent-sdk
index f45c900e..85803a23 160000
--- a/vendor/software-agent-sdk
+++ b/vendor/software-agent-sdk
@@ -1 +1 @@
-Subproject commit f45c900e98db4d6623ff724ddfc769307494ab89
+Subproject commit 85803a23fc51edf2990a46be39b28df60f99ebea

From 07abd723af145fcc7f7d3024ebeb2ef6a614d563 Mon Sep 17 00:00:00 2001
From: Xingyao Wang <xingyao@all-hands.dev>
Date: Tue, 11 Nov 2025 19:36:54 +0000
Subject: [PATCH 57/66] fix target type

---
 benchmarks/swe_bench/run_infer.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/benchmarks/swe_bench/run_infer.py b/benchmarks/swe_bench/run_infer.py
index 79b66ed2..6e3d08f9 100644
--- a/benchmarks/swe_bench/run_infer.py
+++ b/benchmarks/swe_bench/run_infer.py
@@ -155,7 +155,8 @@ def prepare_workspace(self, instance: EvalInstance) -> RemoteWorkspace:
                     "RUNTIME_API_URL", "https://runtime.eval.all-hands.dev"
                 ),
                 runtime_api_key=runtime_api_key,
-                server_image="ghcr.io/openhands/agent-server:main-python",
+                server_image=agent_server_image,
+                target_type="source" if "source" in build_target else "binary",
             )
         else:
             raise ValueError(

From 49499571d7b65d1011e84c47580e2e808e8bb993 Mon Sep 17 00:00:00 2001
From: Xingyao Wang <xingyao@all-hands.dev>
Date: Tue, 11 Nov 2025 22:57:25 +0000
Subject: [PATCH 58/66] bump sdk

---
 vendor/software-agent-sdk | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vendor/software-agent-sdk b/vendor/software-agent-sdk
index 85803a23..d67bd848 160000
--- a/vendor/software-agent-sdk
+++ b/vendor/software-agent-sdk
@@ -1 +1 @@
-Subproject commit 85803a23fc51edf2990a46be39b28df60f99ebea
+Subproject commit d67bd8485bd1389e4e30a5b89d2c9d8f790cd521

From 94c4326dab74e06d9a79c15f63fa49457ed6dd89 Mon Sep 17 00:00:00 2001
From: Xingyao Wang <xingyao@all-hands.dev>
Date: Wed, 12 Nov 2025 16:58:55 +0000
Subject: [PATCH 59/66] check image exists before launching remote runtime job

---
 benchmarks/swe_bench/run_infer.py |   6 ++
 benchmarks/utils/image_utils.py   | 105 ++++++++++++++++++++++++++++++
 pyproject.toml                    |   1 +
 uv.lock                           |  31 +++++++++
 4 files changed, 143 insertions(+)
 create mode 100644 benchmarks/utils/image_utils.py

diff --git a/benchmarks/swe_bench/run_infer.py b/benchmarks/swe_bench/run_infer.py
index 6e3d08f9..f7e840f3 100644
--- a/benchmarks/swe_bench/run_infer.py
+++ b/benchmarks/swe_bench/run_infer.py
@@ -17,6 +17,7 @@
     construct_eval_output_dir,
     get_default_on_result_writer,
 )
+from benchmarks.utils.image_utils import image_exists
 from benchmarks.utils.models import (
     EvalInstance,
     EvalMetadata,
@@ -147,6 +148,11 @@ def prepare_workspace(self, instance: EvalInstance) -> RemoteWorkspace:
             agent_server_image = (
                 f"{EVAL_AGENT_SERVER_IMAGE}:{sdk_short_sha}-{custom_tag}{suffix}"
             )
+            if not image_exists(agent_server_image):
+                raise RuntimeError(
+                    f"Agent server image {agent_server_image} does not exist in container registry, "
+                    "make sure to build, push it, and make it public accessible before using remote workspace."
+                )
             logger.info(
                 f"Using remote workspace with image {agent_server_image} (sdk sha: {sdk_short_sha})"
             )
diff --git a/benchmarks/utils/image_utils.py b/benchmarks/utils/image_utils.py
new file mode 100644
index 00000000..a463f3b4
--- /dev/null
+++ b/benchmarks/utils/image_utils.py
@@ -0,0 +1,105 @@
+#!/usr/bin/env python3
+import base64
+import sys
+
+import requests
+
+
+ACCEPT = ",".join(  # comma-separated manifest media types, used as the Accept header value
+    [
+        "application/vnd.oci.image.index.v1+json",  # OCI image index
+        "application/vnd.oci.image.manifest.v1+json",  # OCI image manifest
+        "application/vnd.docker.distribution.manifest.v2+json",  # Docker v2 manifest
+        "application/vnd.docker.distribution.manifest.list.v2+json",  # Docker manifest list
+    ]
+)
+
+
+def _parse(image: str) -> tuple[str, str, str]:
+    """Split an image reference into ``(registry, repository, tag-or-digest)``."""
+    image, _, digest = image.partition("@")  # optional "@sha256:..." suffix
+    tag = ""
+    if ":" in image.rsplit("/", 1)[-1]:  # tag after last slash (not registry:port)
+        image, tag = image.rsplit(":", 1)
+    parts = image.split("/")
+    if "." in parts[0] or ":" in parts[0] or parts[0] == "localhost":
+        registry, repo = parts[0], "/".join(parts[1:])
+    else:
+        registry, repo = "registry-1.docker.io", "/".join(parts)
+    hub_hosts = ("docker.io", "index.docker.io", "registry-1.docker.io")
+    if registry in hub_hosts and repo and "/" not in repo:
+        repo = f"library/{repo}"  # Docker Hub official images live under library/
+    return registry, repo, digest or tag or "latest"
+
+
+def _dockerhub_token(repo: str) -> str | None:
+    """Request a pull-scoped token from Docker Hub auth; None on a non-OK response."""
+    url = f"https://auth.docker.io/token?service=registry.docker.io&scope=repository:{repo}:pull"
+    response = requests.get(url, timeout=10)
+    # An OK body without a "token" key also yields None via dict.get.
+    return response.json().get("token") if response.ok else None
+
+
+def _ghcr_token(repo: str, username: str | None, pat: str | None) -> str | None:
+    """Obtain a pull-scoped GHCR bearer token, or None on a non-OK response.
+
+    Anonymous unless both *username* and *pat* (read:packages) are given: Basic auth.
+    """
+    url = f"https://ghcr.io/token?service=ghcr.io&scope=repository:{repo}:pull"
+    auth = {}
+    if username and pat:
+        basic = base64.b64encode(f"{username}:{pat}".encode()).decode()
+        auth["Authorization"] = "Basic " + basic
+    response = requests.get(url, headers=auth, timeout=10)
+    return response.json().get("token") if response.ok else None
+
+
+def image_exists(  # True only when the registry answers 200 for the image's manifest
+    image_ref: str,
+    gh_username: str | None = None,
+    gh_pat: str | None = None,  # GitHub PAT with read:packages for private GHCR
+    docker_token: str | None = None,  # Docker Hub JWT if you already have one
+) -> bool:
+    registry, repo, ref = _parse(image_ref)
+    headers = {"Accept": ACCEPT}
+
+    if registry in ("docker.io", "index.docker.io", "registry-1.docker.io"):
+        base = "https://registry-1.docker.io"  # all Docker Hub aliases are queried at this host
+        token = docker_token or _dockerhub_token(repo)
+        if token:
+            headers["Authorization"] = f"Bearer {token}"
+    elif registry == "ghcr.io":
+        base = "https://ghcr.io"
+        token = _ghcr_token(repo, gh_username, gh_pat)
+        if token:
+            headers["Authorization"] = f"Bearer {token}"
+    else:
+        base = f"https://{registry}"  # other registries are queried without an Authorization header
+
+    url = f"{base}/v2/{repo}/manifests/{ref}"
+    try:
+        r = requests.head(url, headers=headers, timeout=10)
+        if r.status_code in (
+            405,
+            406,
+        ):  # some registries disallow HEAD or need GET for content-negotiation
+            r = requests.get(url, headers=headers, timeout=10)
+        # Only 200 counts as existing; 401/403 (unauthorized) and 404 (not found) return False
+        return r.status_code == 200
+    except requests.RequestException:
+        return False  # request failures are reported the same way as a missing image
+
+
+if __name__ == "__main__":  # CLI: check one image reference and print the verdict
+    if len(sys.argv) < 2:
+        print(
+            "Usage: python image_utils.py <image[:tag]|image@sha256:...> [gh_user] [gh_pat]"
+        )
+        sys.exit(1)
+
+    image = sys.argv[1]
+    gh_user = sys.argv[2] if len(sys.argv) > 2 else None  # optional GHCR credentials
+    gh_pat = sys.argv[3] if len(sys.argv) > 3 else None
+
+    ok = image_exists(image, gh_username=gh_user, gh_pat=gh_pat)
+    print(f"{image} -> {'✅ exists' if ok else '❌ not found or unauthorized'}")
diff --git a/pyproject.toml b/pyproject.toml
index 5e924d58..8561951d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -19,6 +19,7 @@ dependencies = [
     "openhands-workspace",
     "modal>=1.1.4",
     "swebench",
+    "docker-registry-client>=0.5.2",
 ]
 
 [project.scripts]
diff --git a/uv.lock b/uv.lock
index ab6872cf..7c233247 100644
--- a/uv.lock
+++ b/uv.lock
@@ -719,6 +719,20 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/e3/26/57c6fb270950d476074c087527a558ccb6f4436657314bfb6cdf484114c4/docker-7.1.0-py3-none-any.whl", hash = "sha256:c96b93b7f0a746f9e77d325bcfb87422a3d8bd4f03136ae8a85b37f1898d5fc0", size = 147774, upload-time = "2024-05-23T11:13:55.01Z" },
 ]
 
+[[package]]
+name = "docker-registry-client"
+version = "0.5.2"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "ecdsa" },
+    { name = "jws" },
+    { name = "requests" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/82/3c/287104dcdbd6fd3d367b8bc50f1387f8326fb8026312af61b2bcf5c09387/docker-registry-client-0.5.2.tar.gz", hash = "sha256:8482efc9ec9ec708dfb74193cdfa530eee23c93596c63d704c5a3702b049e58f", size = 8037, upload-time = "2017-06-16T16:05:24.387Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/c7/b4/f1b3b2da3024fc20fe1e359871dc3c4f8e0ade1b0bbd85294f244c6a29d7/docker_registry_client-0.5.2-py2.py3-none-any.whl", hash = "sha256:cb6c1c5e72e091ada9b32499c8529850e247bafb2202bc31bbe45e9710bf9038", size = 11731, upload-time = "2017-06-16T16:05:26.057Z" },
+]
+
 [[package]]
 name = "docstring-parser"
 version = "0.17.0"
@@ -737,6 +751,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/66/dd/f95350e853a4468ec37478414fc04ae2d61dad7a947b3015c3dcc51a09b9/docutils-0.22.2-py3-none-any.whl", hash = "sha256:b0e98d679283fc3bb0ead8a5da7f501baa632654e7056e9c5846842213d674d8", size = 632667, upload-time = "2025-09-20T17:55:43.052Z" },
 ]
 
+[[package]]
+name = "ecdsa"
+version = "0.13.3"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/8c/d8/9c3596fd0f18ae0a76333492a119c00183323d8e64de1a4f4bd642856963/ecdsa-0.13.3.tar.gz", hash = "sha256:163c80b064a763ea733870feb96f9dd9b92216cfcacd374837af18e4e8ec3d4d", size = 60477, upload-time = "2019-10-07T14:05:24.318Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/a6/81/2b170b460c84fdc8700cf08aa077ac6a9ff41f4ad3f05d0b3a64ba9f8f2e/ecdsa-0.13.3-py2.py3-none-any.whl", hash = "sha256:9814e700890991abeceeb2242586024d4758c8fc18445b194a49bd62d85861db", size = 52113, upload-time = "2019-10-07T14:05:22.583Z" },
+]
+
 [[package]]
 name = "email-validator"
 version = "2.3.0"
@@ -1470,6 +1493,12 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/41/45/1a4ed80516f02155c51f51e8cedb3c1902296743db0bbc66608a0db2814f/jsonschema_specifications-2025.9.1-py3-none-any.whl", hash = "sha256:98802fee3a11ee76ecaca44429fda8a41bff98b00a0f2838151b113f210cc6fe", size = 18437, upload-time = "2025-09-08T01:34:57.871Z" },
 ]
 
+[[package]]
+name = "jws"
+version = "0.1.3"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/01/9e/1536d578ed50f5fe8196310ddcc921a3cd8e973312d60ac74488b805d395/jws-0.1.3.tar.gz", hash = "sha256:0e3d4cb06ae7c5c1d16d357b4e7acb5c5ecab0cccb3a4b998035b85052488053", size = 8104, upload-time = "2015-03-10T15:53:37.844Z" }
+
 [[package]]
 name = "lazy-object-proxy"
 version = "1.12.0"
@@ -1975,6 +2004,7 @@ version = "0.1.0"
 source = { editable = "." }
 dependencies = [
     { name = "datasets" },
+    { name = "docker-registry-client" },
     { name = "huggingface-hub" },
     { name = "jinja2" },
     { name = "modal" },
@@ -2005,6 +2035,7 @@ dev = [
 [package.metadata]
 requires-dist = [
     { name = "datasets" },
+    { name = "docker-registry-client", specifier = ">=0.5.2" },
     { name = "huggingface-hub" },
     { name = "jinja2" },
     { name = "modal", specifier = ">=1.1.4" },

From 5d734aac4bf2c6c7db08f1693b52f23a67a94204 Mon Sep 17 00:00:00 2001
From: Xingyao Wang <xingyao@all-hands.dev>
Date: Thu, 13 Nov 2025 16:10:28 +0000
Subject: [PATCH 60/66] try fixing docker build trigger

---
 .github/workflows/build-swe-bench-images.yml | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/build-swe-bench-images.yml b/.github/workflows/build-swe-bench-images.yml
index 9a4aee6f..a87c5e09 100644
--- a/.github/workflows/build-swe-bench-images.yml
+++ b/.github/workflows/build-swe-bench-images.yml
@@ -1,7 +1,7 @@
 name: Build SWE-Bench Images
 
 on:
-  pull_request:
+  pull_request:pull_request_target:
     types: [labeled]
   workflow_dispatch:
     inputs:
@@ -40,9 +40,10 @@ concurrency:
 jobs:
   build-and-push:
     # Only run on workflow_dispatch or if the PR is labeled with 'build-swebench'
-    if: |
+    if: >
       github.event_name == 'workflow_dispatch' ||
-      (github.event_name == 'pull_request' && github.event.label.name == 'build-swebench')
+      (github.event_name == 'pull_request_target' && github.event.action == 'labeled' && github.event.label.name == 'build-swebench')
+
     runs-on:
       labels: blacksmith-32vcpu-ubuntu-2204
 

From 3e1f8f9c20e70304fc6391736b01334a7945717c Mon Sep 17 00:00:00 2001
From: Xingyao Wang <xingyao@all-hands.dev>
Date: Thu, 13 Nov 2025 16:24:12 +0000
Subject: [PATCH 61/66] fix typo

---
 .github/workflows/build-swe-bench-images.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build-swe-bench-images.yml b/.github/workflows/build-swe-bench-images.yml
index a87c5e09..f7821da9 100644
--- a/.github/workflows/build-swe-bench-images.yml
+++ b/.github/workflows/build-swe-bench-images.yml
@@ -1,7 +1,7 @@
 name: Build SWE-Bench Images
 
 on:
-  pull_request:pull_request_target:
+  pull_request_target:
     types: [labeled]
   workflow_dispatch:
     inputs:

From 860187557555506b4a9423ec9813c4ee9d9e4882 Mon Sep 17 00:00:00 2001
From: Xingyao Wang <xingyao@all-hands.dev>
Date: Thu, 13 Nov 2025 16:30:51 +0000
Subject: [PATCH 62/66] tweak

---
 .github/workflows/build-swe-bench-images.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/build-swe-bench-images.yml b/.github/workflows/build-swe-bench-images.yml
index f7821da9..db3612c9 100644
--- a/.github/workflows/build-swe-bench-images.yml
+++ b/.github/workflows/build-swe-bench-images.yml
@@ -2,7 +2,8 @@ name: Build SWE-Bench Images
 
 on:
   pull_request_target:
-    types: [labeled]
+    types:
+      - labeled
   workflow_dispatch:
     inputs:
       dataset:
@@ -42,8 +43,7 @@ jobs:
     # Only run on workflow_dispatch or if the PR is labeled with 'build-swebench'
     if: >
       github.event_name == 'workflow_dispatch' ||
-      (github.event_name == 'pull_request_target' && github.event.action == 'labeled' && github.event.label.name == 'build-swebench')
-
+      (github.event_name == 'pull_request_target' && github.event.label.name == 'build-swebench')
     runs-on:
       labels: blacksmith-32vcpu-ubuntu-2204
 

From af6966a8307c31847b9631c7df957c8d22655a7b Mon Sep 17 00:00:00 2001
From: Xingyao Wang <xingyao@all-hands.dev>
Date: Thu, 13 Nov 2025 16:34:02 +0000
Subject: [PATCH 63/66] tweak

---
 .github/workflows/build-swe-bench-images.yml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/build-swe-bench-images.yml b/.github/workflows/build-swe-bench-images.yml
index db3612c9..9cc4844c 100644
--- a/.github/workflows/build-swe-bench-images.yml
+++ b/.github/workflows/build-swe-bench-images.yml
@@ -2,8 +2,7 @@ name: Build SWE-Bench Images
 
 on:
   pull_request_target:
-    types:
-      - labeled
+    types: [labeled]
   workflow_dispatch:
     inputs:
       dataset:
@@ -40,10 +39,11 @@ concurrency:
 
 jobs:
   build-and-push:
-    # Only run on workflow_dispatch or if the PR is labeled with 'build-swebench'
     if: >
       github.event_name == 'workflow_dispatch' ||
-      (github.event_name == 'pull_request_target' && github.event.label.name == 'build-swebench')
+      (github.event_name == 'pull_request_target' &&
+       github.event.label.name == 'build-swebench')
+
     runs-on:
       labels: blacksmith-32vcpu-ubuntu-2204
 

From 2160810d21aae96d7f81f9b6c2d898b2a2cac439 Mon Sep 17 00:00:00 2001
From: Xingyao Wang <xingyao@all-hands.dev>
Date: Thu, 13 Nov 2025 16:36:02 +0000
Subject: [PATCH 64/66] drop default

---
 .github/workflows/build-swe-bench-images.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/build-swe-bench-images.yml b/.github/workflows/build-swe-bench-images.yml
index 9cc4844c..52b0d6e6 100644
--- a/.github/workflows/build-swe-bench-images.yml
+++ b/.github/workflows/build-swe-bench-images.yml
@@ -18,7 +18,7 @@ on:
       max-workers:
         description: 'Number of concurrent builds'
         required: false
-        default: '64'
+        default: '32'
         type: string
       n-limit:
         description: 'Limit number of images to build (for testing). Leave blank for no limit.'
@@ -30,7 +30,7 @@ on:
 env:
   DATASET: princeton-nlp/SWE-bench_Verified
   SPLIT: test
-  MAX_WORKERS: '64'
+  MAX_WORKERS: '32'
   N_LIMIT: '500'
 
 concurrency:

From fd5c0c68cdc8d8a13f5b692c6f8ffd10a810b2fb Mon Sep 17 00:00:00 2001
From: Xingyao Wang <xingyao@all-hands.dev>
Date: Thu, 13 Nov 2025 17:22:26 +0000
Subject: [PATCH 65/66] sleep after failure

---
 benchmarks/utils/build_utils.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/benchmarks/utils/build_utils.py b/benchmarks/utils/build_utils.py
index cdcb9e61..31b718d4 100644
--- a/benchmarks/utils/build_utils.py
+++ b/benchmarks/utils/build_utils.py
@@ -7,6 +7,7 @@
 import contextlib
 import io
 import subprocess
+import time
 import tomllib
 from concurrent.futures import ProcessPoolExecutor, as_completed
 from datetime import UTC, datetime
@@ -224,6 +225,7 @@ def _build_with_logging(
                 logger.info(
                     f"Retrying build for {base_image} (attempt {attempt + 1}/{max_retries})"
                 )
+                time.sleep(2 + attempt * 2)
             result = build_image(base_image, target_image, custom_tag, target, push)
             result.log_path = str(log_path)
             if not result.error:

From ea3f69fbe7de07b0f68cec5c12777f54b947d6a6 Mon Sep 17 00:00:00 2001
From: Xingyao Wang <xingyao@all-hands.dev>
Date: Thu, 13 Nov 2025 17:25:25 +0000
Subject: [PATCH 66/66] check target image existence before build

---
 benchmarks/utils/build_utils.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/benchmarks/utils/build_utils.py b/benchmarks/utils/build_utils.py
index 31b718d4..e55b3273 100644
--- a/benchmarks/utils/build_utils.py
+++ b/benchmarks/utils/build_utils.py
@@ -20,6 +20,7 @@
 
 from benchmarks.utils.args_parser import get_parser
 from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE
+from benchmarks.utils.image_utils import image_exists
 from openhands.agent_server.docker.build import BuildOptions, TargetType, build
 from openhands.sdk import get_logger
 
@@ -196,6 +197,11 @@ def build_image(
         git_sha=git_sha,
         sdk_version=sdk_version,
     )
+    for t in opts.all_tags[0]:
+        # Check if image exists or not
+        if image_exists(t):
+            logger.info(f"Image {t} already exists. Skipping build.")
+            return BuildOutput(base_image=base_image, tags=[t], error=None)
     tags = build(opts)
     return BuildOutput(base_image=base_image, tags=tags, error=None)