From c9b86c1f69a01fc0d9d0353150ebb94e6034470b Mon Sep 17 00:00:00 2001 From: openhands Date: Mon, 27 Oct 2025 21:39:45 +0000 Subject: [PATCH 01/66] Add GitHub workflow for building SWE-Bench images with Blacksmith caching - Create workflow that can be manually triggered via workflow_dispatch - Integrate Blacksmith caching for faster Docker builds - Configure workflow to push images to ghcr.io/openhands/eval-agent-server - Make --critic parameter optional in build_images.py for build-only usage - Fix .gitignore patterns for eval_outputs and builds directories This workflow follows Blacksmith documentation for Docker builds and allows building SWE-Bench evaluation images with configurable parameters like dataset, split, target, platforms, and concurrent workers. Closes #37 --- .github/workflows/build-swe-bench-images.yml | 141 +++++++++++++++++++ .gitignore | 4 +- benchmarks/swe_bench/build_images.py | 7 + 3 files changed, 150 insertions(+), 2 deletions(-) create mode 100644 .github/workflows/build-swe-bench-images.yml diff --git a/.github/workflows/build-swe-bench-images.yml b/.github/workflows/build-swe-bench-images.yml new file mode 100644 index 00000000..a412e653 --- /dev/null +++ b/.github/workflows/build-swe-bench-images.yml @@ -0,0 +1,141 @@ +name: Build SWE-Bench Images + +on: + workflow_dispatch: + inputs: + dataset: + description: 'Dataset name (e.g., princeton-nlp/SWE-bench_Verified)' + required: true + default: 'princeton-nlp/SWE-bench_Verified' + type: string + split: + description: 'Dataset split (e.g., test, dev)' + required: true + default: 'test' + type: string + target: + description: 'Build target (source | source-minimal | binary | binary-minimal)' + required: false + default: 'source-minimal' + type: choice + options: + - source + - source-minimal + - binary + - binary-minimal + platforms: + description: 'Comma-separated platforms (e.g., linux/amd64,linux/arm64)' + required: false + default: 'linux/amd64' + type: string + max-workers: + description: 'Number of concurrent builds' + required: false + default: '1' + type: string + n-limit: + description: 'Limit number of images to build (for testing)' + required: false + default: '' + type: string + +jobs: + build-and-push: + runs-on: + labels: blacksmith-32vcpu-ubuntu-2204 + + permissions: + contents: read + packages: write + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + submodules: recursive + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + with: + driver-opts: | + image=moby/buildkit:latest + network=host + + - name: Cache Docker layers + uses: useblacksmith/cache@v6 + with: + path: /tmp/.buildx-cache + key: ${{ runner.os }}-buildx-${{ github.sha }} + restore-keys: | + ${{ runner.os }}-buildx- + + - name: Log in to GitHub Container Registry + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Install uv + uses: astral-sh/setup-uv@v7 + with: + enable-cache: true + version: "0.8.13" + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.12' + + - name: Install dependencies + run: | + uv sync --dev + + - name: Build and push SWE-Bench images + run: | + # Construct the command with required arguments + CMD="uv run benchmarks/swe_bench/build_images.py \ + --dataset ${{ inputs.dataset }} \ + --split ${{ inputs.split }} \ + --image ghcr.io/openhands/eval-agent-server \ + --target ${{ inputs.target }} \ + --platforms ${{ inputs.platforms }} \ + --push \ + --max-workers ${{ inputs.max-workers }}" + + # Add optional n-limit if provided + if [ -n "${{ inputs.n-limit }}" ]; then + CMD="$CMD --n-limit ${{ inputs.n-limit }}" + fi + + # Execute the build command + eval $CMD + env: + DOCKER_BUILDKIT: 1 + BUILDKIT_PROGRESS: plain + + - name: Upload build manifest + if: always() + uses: actions/upload-artifact@v4 + with: + name: build-manifest-${{ inputs.dataset }}-${{ inputs.split }} + path: | + builds/**/manifest.jsonl + builds/**/summary.json + retention-days: 30 + + - name: Upload build logs + if: always() + uses: actions/upload-artifact@v4 + with: + name: build-logs-${{ inputs.dataset }}-${{ inputs.split }} + path: builds/**/logs/**/*.log + retention-days: 7 + + - name: Display build summary + if: always() + run: | + if [ -f builds/*/summary.json ]; then + echo "## Build Summary" >> $GITHUB_STEP_SUMMARY + cat builds/*/summary.json | python -m json.tool >> $GITHUB_STEP_SUMMARY + fi diff --git a/.gitignore b/.gitignore index 43135338..89a2b5d6 100644 --- a/.gitignore +++ b/.gitignore @@ -213,5 +213,5 @@ workspace/ !.llm_config/example.json # Evaluation outputs -./eval_outputs -./builds +eval_outputs/ +builds/ diff --git a/benchmarks/swe_bench/build_images.py b/benchmarks/swe_bench/build_images.py index 6bbc739a..823aa176 100644 --- a/benchmarks/swe_bench/build_images.py +++ b/benchmarks/swe_bench/build_images.py @@ -86,6 +86,13 @@ def extend_parser() -> argparse.ArgumentParser: parser = get_parser(add_llm_config=False) parser.description = "Build all agent-server images for SWE-Bench base images." + # Make --critic optional for build_images use case + for action in parser._actions: + if action.dest == "critic": + action.required = False + action.default = "none" + break + parser.add_argument( "--docker-image-prefix", default="docker.io/swebench/", From 57520433a5902922bbbbaf02100d1c8ad94e21d4 Mon Sep 17 00:00:00 2001 From: openhands Date: Mon, 3 Nov 2025 21:02:28 +0000 Subject: [PATCH 02/66] Use Blacksmith's setup-docker-builder action for faster Docker layer caching Following the pattern from https://github.com/OpenHands/software-agent-sdk/pull/990 and Blacksmith's official documentation (https://docs.blacksmith.sh/blacksmith-caching/docker-builds), this change replaces the standard docker/setup-buildx-action with useblacksmith/setup-docker-builder@v1. Key improvements: - Replaces docker/setup-buildx-action@v3 with useblacksmith/setup-docker-builder@v1 - Removes manual cache configuration (useblacksmith/cache@v6) - Blacksmith's Docker builder automatically manages Docker layer caching via NVMe-backed sticky disks - Provides 2x to 40x improvements in build times according to Blacksmith's customers - Since we only build amd64 images, we don't need the complex multi-platform matrix strategy This approach is recommended for workflows that use Docker commands directly (as opposed to using docker/build-push-action). Co-authored-by: openhands --- .github/workflows/build-swe-bench-images.yml | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/.github/workflows/build-swe-bench-images.yml b/.github/workflows/build-swe-bench-images.yml index a412e653..680869b7 100644 --- a/.github/workflows/build-swe-bench-images.yml +++ b/.github/workflows/build-swe-bench-images.yml @@ -54,20 +54,8 @@ jobs: with: submodules: recursive - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - with: - driver-opts: | - image=moby/buildkit:latest - network=host - - - name: Cache Docker layers - uses: useblacksmith/cache@v6 - with: - path: /tmp/.buildx-cache - key: ${{ runner.os }}-buildx-${{ github.sha }} - restore-keys: | - ${{ runner.os }}-buildx- + - name: Set up Docker Buildx with Blacksmith + uses: useblacksmith/setup-docker-builder@v1 - name: Log in to GitHub Container Registry uses: docker/login-action@v3 From 85080062806bf2ba90b1a0340140c6e634e80e0b Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Tue, 4 Nov 2025 20:41:17 +0000 Subject: [PATCH 03/66] revert unneed stuff --- benchmarks/swe_bench/build_images.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/benchmarks/swe_bench/build_images.py b/benchmarks/swe_bench/build_images.py index dcd6301e..fc7c22e1 100644 --- a/benchmarks/swe_bench/build_images.py +++ b/benchmarks/swe_bench/build_images.py @@ -86,13 +86,6 @@ def extend_parser() -> argparse.ArgumentParser: parser = get_parser(add_llm_config=False) parser.description = "Build all agent-server images for SWE-Bench base images." - # Make --critic optional for build_images use case - for action in parser._actions: - if action.dest == "critic": - action.required = False - action.default = "none" - break - parser.add_argument( "--docker-image-prefix", default="docker.io/swebench/", From a565e77a6bd53475ab97a5952ab246e3f41b69c1 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Tue, 4 Nov 2025 20:46:24 +0000 Subject: [PATCH 04/66] simplify setup dependency --- .github/workflows/build-swe-bench-images.yml | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/.github/workflows/build-swe-bench-images.yml b/.github/workflows/build-swe-bench-images.yml index 680869b7..e2f78007 100644 --- a/.github/workflows/build-swe-bench-images.yml +++ b/.github/workflows/build-swe-bench-images.yml @@ -68,16 +68,10 @@ jobs: uses: astral-sh/setup-uv@v7 with: enable-cache: true - version: "0.8.13" - - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: '3.12' - + - name: Install dependencies run: | - uv sync --dev + make build - name: Build and push SWE-Bench images run: | From 9bbd7fbbd0448fba75ab6cfbbe2d644d4103568d Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Tue, 4 Nov 2025 20:56:01 +0000 Subject: [PATCH 05/66] set eval-agent-server --- README.md | 4 ++-- benchmarks/swe_bench/build_images.py | 4 ++-- benchmarks/swe_bench/run_infer.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index fb4695e6..ef2f86c2 100644 --- a/README.md +++ b/README.md @@ -95,8 +95,8 @@ Build ALL docker images for SWE-Bench. ```bash uv run benchmarks/swe_bench/build_images.py \ --dataset princeton-nlp/SWE-bench_Verified --split test \ - --critic pass \ - --image ghcr.io/openhands/agent-server --target binary-minimal + --image ghcr.io/openhands/eval- + agent-server --target binary-minimal ``` diff --git a/benchmarks/swe_bench/build_images.py b/benchmarks/swe_bench/build_images.py index fc7c22e1..08054c35 100644 --- a/benchmarks/swe_bench/build_images.py +++ b/benchmarks/swe_bench/build_images.py @@ -5,7 +5,7 @@ Example: uv run benchmarks/swe_bench/build_images.py \ --dataset princeton-nlp/SWE-bench_Verified --split test \ - --image ghcr.io/openhands/agent-server --target source-minimal + --image ghcr.io/openhands/eval-agent-server --target source-minimal """ import argparse @@ -93,7 +93,7 @@ def extend_parser() -> argparse.ArgumentParser: ) parser.add_argument( "--image", - default="ghcr.io/openhands/agent-server", + default="ghcr.io/openhands/eval-agent-server", help="Target repo/name for built image", ) parser.add_argument( diff --git a/benchmarks/swe_bench/run_infer.py b/benchmarks/swe_bench/run_infer.py index 4bb67edc..98eb396f 100644 --- a/benchmarks/swe_bench/run_infer.py +++ b/benchmarks/swe_bench/run_infer.py @@ -46,7 +46,7 @@ def get_agent_server_docker_image( ) -> str: official_image_name = get_official_docker_image(instance_id, docker_image_prefix) return ( - "ghcr.io/openhands/agent-server" + "ghcr.io/openhands/eval-agent-server" + f":v{SDK_VERSION}_{_base_slug(official_image_name)}_{target}" ) From c661b2cd02c08d13ecf27c87345cc271365a5c98 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Tue, 4 Nov 2025 20:57:00 +0000 Subject: [PATCH 06/66] fix line break --- README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/README.md b/README.md index ef2f86c2..0fcc3725 100644 --- a/README.md +++ b/README.md @@ -95,8 +95,7 @@ Build ALL docker images for SWE-Bench. ```bash uv run benchmarks/swe_bench/build_images.py \ --dataset princeton-nlp/SWE-bench_Verified --split test \ - --image ghcr.io/openhands/eval- - agent-server --target binary-minimal + --image ghcr.io/openhands/eval-agent-server --target binary-minimal ``` From 632432e7f1330131ce8e80ec3dcbfe6014cb3bb2 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Tue, 4 Nov 2025 20:58:28 +0000 Subject: [PATCH 07/66] default to 10 for testing --- .github/workflows/build-swe-bench-images.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-swe-bench-images.yml b/.github/workflows/build-swe-bench-images.yml index e2f78007..8dcbc234 100644 --- a/.github/workflows/build-swe-bench-images.yml +++ b/.github/workflows/build-swe-bench-images.yml @@ -36,7 +36,7 @@ on: n-limit: description: 'Limit number of images to build (for testing)' required: false - default: '' + default: '10' type: string jobs: From c536903f65976f5cc648b43d6e3ded9a66468f9e Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Tue, 4 Nov 2025 21:08:25 +0000 Subject: [PATCH 08/66] run on all prs for debugging --- .github/workflows/build-swe-bench-images.yml | 86 +++++++++++++------- 1 file changed, 57 insertions(+), 29 deletions(-) diff --git a/.github/workflows/build-swe-bench-images.yml b/.github/workflows/build-swe-bench-images.yml index 8dcbc234..7cc3c68e 100644 --- a/.github/workflows/build-swe-bench-images.yml +++ b/.github/workflows/build-swe-bench-images.yml @@ -1,6 +1,7 @@ name: Build SWE-Bench Images on: + pull_request: # for debugging workflow_dispatch: inputs: dataset: @@ -31,39 +32,65 @@ on: max-workers: description: 'Number of concurrent builds' required: false - default: '1' + default: '2' type: string n-limit: - description: 'Limit number of images to build (for testing)' + description: 'Limit number of images to build (for testing). Leave blank for no limit.' required: false default: '10' type: string +# Reasonable defaults for automatic (push) runs; workflow_dispatch can override these. +env: + DATASET: princeton-nlp/SWE-bench_Verified + SPLIT: test + TARGET: source-minimal + PLATFORMS: linux/amd64 + MAX_WORKERS: '2' # modest concurrency for reliability + N_LIMIT: '10' # empty = no limit + +concurrency: + group: build-swe-bench-${{ github.ref }} + cancel-in-progress: false + jobs: build-and-push: runs-on: labels: blacksmith-32vcpu-ubuntu-2204 - + + # Allow pushing to GHCR permissions: contents: read packages: write - + steps: - name: Checkout repository uses: actions/checkout@v4 with: submodules: recursive - + + # If this was a manual dispatch, override defaults with provided inputs. + - name: Apply workflow_dispatch overrides (if any) + if: ${{ github.event_name == 'workflow_dispatch' }} + run: | + if [ -n "${{ inputs.dataset }}" ]; then echo "DATASET=${{ inputs.dataset }}" >> "$GITHUB_ENV"; fi + if [ -n "${{ inputs.split }}" ]; then echo "SPLIT=${{ inputs.split }}" >> "$GITHUB_ENV"; fi + if [ -n "${{ inputs.target }}" ]; then echo "TARGET=${{ inputs.target }}" >> "$GITHUB_ENV"; fi + if [ -n "${{ inputs.platforms }}" ]; then echo "PLATFORMS=${{ inputs.platforms }}" >> "$GITHUB_ENV"; fi + if [ -n "${{ inputs.max-workers }}" ]; then echo "MAX_WORKERS=${{ inputs.max-workers }}" >> "$GITHUB_ENV"; fi + # Empty string means "no limit" + if [ -n "${{ inputs.n-limit }}" ]; then echo "N_LIMIT=${{ inputs.n-limit }}" >> "$GITHUB_ENV"; else echo "N_LIMIT=" >> "$GITHUB_ENV"; fi + - name: Set up Docker Buildx with Blacksmith uses: useblacksmith/setup-docker-builder@v1 - + - name: Log in to GitHub Container Registry uses: docker/login-action@v3 with: registry: ghcr.io username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} - + - name: Install uv uses: astral-sh/setup-uv@v7 with: @@ -72,52 +99,53 @@ jobs: - name: Install dependencies run: | make build - + - name: Build and push SWE-Bench images run: | - # Construct the command with required arguments + set -euo pipefail + CMD="uv run benchmarks/swe_bench/build_images.py \ - --dataset ${{ inputs.dataset }} \ - --split ${{ inputs.split }} \ + --dataset '${DATASET}' \ + --split '${SPLIT}' \ --image ghcr.io/openhands/eval-agent-server \ - --target ${{ inputs.target }} \ - --platforms ${{ inputs.platforms }} \ + --target '${TARGET}' \ + --platforms '${PLATFORMS}' \ --push \ - --max-workers ${{ inputs.max-workers }}" - - # Add optional n-limit if provided - if [ -n "${{ inputs.n-limit }}" ]; then - CMD="$CMD --n-limit ${{ inputs.n-limit }}" + --max-workers '${MAX_WORKERS}'" + + # Only include --n-limit if provided (non-empty) + if [ -n "${N_LIMIT}" ]; then + CMD="$CMD --n-limit '${N_LIMIT}'" fi - - # Execute the build command - eval $CMD + + echo "Running: $CMD" + eval "$CMD" env: DOCKER_BUILDKIT: 1 BUILDKIT_PROGRESS: plain - + - name: Upload build manifest if: always() uses: actions/upload-artifact@v4 with: - name: build-manifest-${{ inputs.dataset }}-${{ inputs.split }} + name: build-manifest-${{ env.DATASET }}-${{ env.SPLIT }} path: | builds/**/manifest.jsonl builds/**/summary.json retention-days: 30 - + - name: Upload build logs if: always() uses: actions/upload-artifact@v4 with: - name: build-logs-${{ inputs.dataset }}-${{ inputs.split }} + name: build-logs-${{ env.DATASET }}-${{ env.SPLIT }} path: builds/**/logs/**/*.log retention-days: 7 - + - name: Display build summary if: always() run: | - if [ -f builds/*/summary.json ]; then - echo "## Build Summary" >> $GITHUB_STEP_SUMMARY - cat builds/*/summary.json | python -m json.tool >> $GITHUB_STEP_SUMMARY + if ls builds/*/summary.json >/dev/null 2>&1; then + echo "## Build Summary" >> "$GITHUB_STEP_SUMMARY" + cat builds/*/summary.json | python -m json.tool >> "$GITHUB_STEP_SUMMARY" fi From efb731f3a28e5e752cf740afc35a7d8e572ba3b2 Mon Sep 17 00:00:00 2001 From: openhands Date: Tue, 4 Nov 2025 21:24:02 +0000 Subject: [PATCH 09/66] Fix pyarrow build issue by forcing binary wheel installation The GitHub Actions workflow was failing because uv was trying to build pyarrow from source, which requires the Arrow C++ library and CMake. This change adds the --no-build-package pyarrow flag to force uv to use the pre-built binary wheel instead of attempting to build from source. Co-authored-by: openhands --- .github/workflows/build-swe-bench-images.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build-swe-bench-images.yml b/.github/workflows/build-swe-bench-images.yml index 7cc3c68e..b563b80b 100644 --- a/.github/workflows/build-swe-bench-images.yml +++ b/.github/workflows/build-swe-bench-images.yml @@ -98,7 +98,10 @@ jobs: - name: Install dependencies run: | - make build + # Install dependencies, preferring binary wheels for problematic packages + git submodule update --init --recursive + uv sync --dev --no-build-package pyarrow + uv run pre-commit install - name: Build and push SWE-Bench images run: | From 29084f237af47ca229f08e4a5a4982974a82d743 Mon Sep 17 00:00:00 2001 From: openhands Date: Tue, 4 Nov 2025 21:26:36 +0000 Subject: [PATCH 10/66] Pin Python version to 3.12 to fix pyarrow compatibility The root cause of the build failure was that uv was installing Python 3.14.0, which doesn't have binary wheels for pyarrow 21.0.0 yet. This caused uv to attempt building from source, which failed due to missing Arrow C++ libraries. Solution: Added .python-version file to pin Python to 3.12, which matches the project's target-version in pyproject.toml and has full binary wheel support for all dependencies. Co-authored-by: openhands --- .github/workflows/build-swe-bench-images.yml | 5 +---- .python-version | 1 + 2 files changed, 2 insertions(+), 4 deletions(-) create mode 100644 .python-version diff --git a/.github/workflows/build-swe-bench-images.yml b/.github/workflows/build-swe-bench-images.yml index b563b80b..7cc3c68e 100644 --- a/.github/workflows/build-swe-bench-images.yml +++ b/.github/workflows/build-swe-bench-images.yml @@ -98,10 +98,7 @@ jobs: - name: Install dependencies run: | - # Install dependencies, preferring binary wheels for problematic packages - git submodule update --init --recursive - uv sync --dev --no-build-package pyarrow - uv run pre-commit install + make build - name: Build and push SWE-Bench images run: | diff --git a/.python-version b/.python-version new file mode 100644 index 00000000..e4fba218 --- /dev/null +++ b/.python-version @@ -0,0 +1 @@ +3.12 From 551405b2a6a96a1ece0ca4443afd9dc42289064c Mon Sep 17 00:00:00 2001 From: openhands Date: Tue, 4 Nov 2025 21:36:02 +0000 Subject: [PATCH 11/66] Fix artifact upload naming to avoid invalid characters Use github.run_id instead of dataset/split names which contain slashes that are invalid in artifact names. Also added if-no-files-found: warn to provide better feedback if logs are missing. Co-authored-by: openhands --- .github/workflows/build-swe-bench-images.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build-swe-bench-images.yml b/.github/workflows/build-swe-bench-images.yml index 7cc3c68e..9d6f6f20 100644 --- a/.github/workflows/build-swe-bench-images.yml +++ b/.github/workflows/build-swe-bench-images.yml @@ -128,7 +128,7 @@ jobs: if: always() uses: actions/upload-artifact@v4 with: - name: build-manifest-${{ env.DATASET }}-${{ env.SPLIT }} + name: build-manifest-${{ github.run_id }} path: | builds/**/manifest.jsonl builds/**/summary.json @@ -138,9 +138,10 @@ jobs: if: always() uses: actions/upload-artifact@v4 with: - name: build-logs-${{ env.DATASET }}-${{ env.SPLIT }} + name: build-logs-${{ github.run_id }} path: builds/**/logs/**/*.log retention-days: 7 + if-no-files-found: warn - name: Display build summary if: always() From 90b6ed6bd9a7f5ea0300fa727ef4fba5910f8616 Mon Sep 17 00:00:00 2001 From: openhands Date: Tue, 4 Nov 2025 21:40:42 +0000 Subject: [PATCH 12/66] Fix artifact upload by archiving logs to avoid invalid filename characters GitHub Actions artifact upload doesn't allow colons in filenames, but our log paths contain colons from Docker image tags (e.g., 'django-11999:latest'). Archive the entire builds directory into a tar.gz before upload to work around this restriction. Co-authored-by: openhands --- .github/workflows/build-swe-bench-images.yml | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build-swe-bench-images.yml b/.github/workflows/build-swe-bench-images.yml index 9d6f6f20..bd4d741a 100644 --- a/.github/workflows/build-swe-bench-images.yml +++ b/.github/workflows/build-swe-bench-images.yml @@ -134,12 +134,23 @@ jobs: builds/**/summary.json retention-days: 30 + - name: Archive build logs + if: always() + run: | + if [ -d builds ]; then + # Create tar archive to avoid filename restrictions (colons, etc.) + tar -czf build-logs.tar.gz builds/ + echo "Build logs archived successfully" + else + echo "No builds directory found" + fi + - name: Upload build logs if: always() uses: actions/upload-artifact@v4 with: name: build-logs-${{ github.run_id }} - path: builds/**/logs/**/*.log + path: build-logs.tar.gz retention-days: 7 if-no-files-found: warn From 3ba1e46f29b01376d90c23087562373eb2f8e5d8 Mon Sep 17 00:00:00 2001 From: openhands Date: Tue, 4 Nov 2025 21:48:35 +0000 Subject: [PATCH 13/66] Fix Docker cache tag length exceeding 128 character limit Docker image tags have a maximum length of 128 characters. When building SWE-Bench images with long base image names (e.g., scikit-learn), the generated cache tags exceed this limit and cause build failures with: 'ERROR: failed to configure registry cache exporter: invalid reference format' Solution: Apply a patch to vendor/software-agent-sdk that hashes the base_image_slug when it would cause the final tag to exceed 128 characters. Uses SHA256 hash (first 12 chars) to create a shorter unique identifier while maintaining cache efficiency. The patch is applied during the workflow setup before installing dependencies. Co-authored-by: openhands --- .github/workflows/build-swe-bench-images.yml | 6 +++ .github/workflows/fix-cache-tag-length.patch | 44 ++++++++++++++++++++ 2 files changed, 50 insertions(+) create mode 100644 .github/workflows/fix-cache-tag-length.patch diff --git a/.github/workflows/build-swe-bench-images.yml b/.github/workflows/build-swe-bench-images.yml index bd4d741a..e56d3343 100644 --- a/.github/workflows/build-swe-bench-images.yml +++ b/.github/workflows/build-swe-bench-images.yml @@ -96,6 +96,12 @@ jobs: with: enable-cache: true + - name: Apply fix for Docker cache tag length limit + run: | + cd vendor/software-agent-sdk + git apply ../../.github/workflows/fix-cache-tag-length.patch + echo "Applied patch to fix cache tag length limit" + - name: Install dependencies run: | make build diff --git a/.github/workflows/fix-cache-tag-length.patch b/.github/workflows/fix-cache-tag-length.patch new file mode 100644 index 00000000..f6679aaa --- /dev/null +++ b/.github/workflows/fix-cache-tag-length.patch @@ -0,0 +1,44 @@ +diff --git a/openhands-agent-server/openhands/agent_server/docker/build.py b/openhands-agent-server/openhands/agent_server/docker/build.py +index 1cc9cd4d..a1add62e 100755 +--- a/openhands-agent-server/openhands/agent_server/docker/build.py ++++ b/openhands-agent-server/openhands/agent_server/docker/build.py +@@ -14,6 +14,7 @@ Single-entry build helper for agent-server images. + """ + + import argparse ++import hashlib + import os + import re + import shutil +@@ -284,7 +285,30 @@ class BuildOptions(BaseModel): + + @property + def cache_tags(self) -> tuple[str, str]: +- base = f"buildcache-{self.target}-{self.base_image_slug}" ++ # Docker image tags have a 128-character limit. ++ # If the base slug is too long, hash it to create a shorter unique identifier. ++ MAX_TAG_LENGTH = 128 ++ base_slug = self.base_image_slug ++ ++ # Reserve space for prefix, branch, and separators ++ prefix = f"buildcache-{self.target}-" ++ branch_suffix = f"-{_sanitize_branch(GIT_REF)}" if GIT_REF not in ("main", "refs/heads/main", "unknown") else "" ++ main_suffix = "-main" if GIT_REF in ("main", "refs/heads/main") else "" ++ ++ # Calculate available space for base_slug ++ reserved = len(prefix) + max(len(branch_suffix), len(main_suffix)) ++ available = MAX_TAG_LENGTH - reserved ++ ++ # If base_slug is too long, use a hash ++ if len(base_slug) > available: ++ # Use first 8 chars of SHA256 hash for uniqueness while keeping it short ++ hash_digest = hashlib.sha256(base_slug.encode()).hexdigest()[:12] ++ base_slug_short = hash_digest ++ logger.debug(f"[build] Base image slug too long ({len(base_slug)} chars), using hash: {base_slug_short}") ++ else: ++ base_slug_short = base_slug ++ ++ base = f"{prefix}{base_slug_short}" + if GIT_REF in ("main", "refs/heads/main"): + return f"{base}-main", base + elif GIT_REF != "unknown": From 21bb22616be7a54f8c8060866365f98184ae4b55 Mon Sep 17 00:00:00 2001 From: openhands Date: Tue, 4 Nov 2025 21:57:18 +0000 Subject: [PATCH 14/66] Update patch with pre-commit formatting fixes Updated the patch to match the formatting requirements from ruff and other pre-commit checks. This ensures the patch applies cleanly and passes all linting/formatting checks. Co-authored-by: openhands --- .github/workflows/fix-cache-tag-length.patch | 23 +++++++++++++------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/.github/workflows/fix-cache-tag-length.patch b/.github/workflows/fix-cache-tag-length.patch index f6679aaa..72306970 100644 --- a/.github/workflows/fix-cache-tag-length.patch +++ b/.github/workflows/fix-cache-tag-length.patch @@ -1,5 +1,5 @@ diff --git a/openhands-agent-server/openhands/agent_server/docker/build.py b/openhands-agent-server/openhands/agent_server/docker/build.py -index 1cc9cd4d..a1add62e 100755 +index 1cc9cd4d..c5023d79 100755 --- a/openhands-agent-server/openhands/agent_server/docker/build.py +++ b/openhands-agent-server/openhands/agent_server/docker/build.py @@ -14,6 +14,7 @@ Single-entry build helper for agent-server images. @@ -10,7 +10,7 @@ index 1cc9cd4d..a1add62e 100755 import os import re import shutil -@@ -284,7 +285,30 @@ class BuildOptions(BaseModel): +@@ -284,7 +285,37 @@ class BuildOptions(BaseModel): @property def cache_tags(self) -> tuple[str, str]: @@ -19,25 +19,32 @@ index 1cc9cd4d..a1add62e 100755 + # If the base slug is too long, hash it to create a shorter unique identifier. + MAX_TAG_LENGTH = 128 + base_slug = self.base_image_slug -+ ++ + # Reserve space for prefix, branch, and separators + prefix = f"buildcache-{self.target}-" -+ branch_suffix = f"-{_sanitize_branch(GIT_REF)}" if GIT_REF not in ("main", "refs/heads/main", "unknown") else "" ++ branch_suffix = ( ++ f"-{_sanitize_branch(GIT_REF)}" ++ if GIT_REF not in ("main", "refs/heads/main", "unknown") ++ else "" ++ ) + main_suffix = "-main" if GIT_REF in ("main", "refs/heads/main") else "" -+ ++ + # Calculate available space for base_slug + reserved = len(prefix) + max(len(branch_suffix), len(main_suffix)) + available = MAX_TAG_LENGTH - reserved -+ ++ + # If base_slug is too long, use a hash + if len(base_slug) > available: + # Use first 8 chars of SHA256 hash for uniqueness while keeping it short + hash_digest = hashlib.sha256(base_slug.encode()).hexdigest()[:12] + base_slug_short = hash_digest -+ logger.debug(f"[build] Base image slug too long ({len(base_slug)} chars), using hash: {base_slug_short}") ++ logger.debug( ++ f"[build] Base image slug too long ({len(base_slug)} chars), " ++ f"using hash: {base_slug_short}" ++ ) + else: + base_slug_short = base_slug -+ ++ + base = f"{prefix}{base_slug_short}" if GIT_REF in ("main", "refs/heads/main"): return f"{base}-main", base From 2f897757ebb00ab7ce3db9d0ff90c14bb6eb47f0 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Thu, 6 Nov 2025 17:52:38 +0000 Subject: [PATCH 15/66] checkout to v1.0.0 of sdk --- vendor/software-agent-sdk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/software-agent-sdk b/vendor/software-agent-sdk index 448af7af..a612c0a6 160000 --- a/vendor/software-agent-sdk +++ b/vendor/software-agent-sdk @@ -1 +1 @@ -Subproject commit 448af7af9c64d5b0d373dcf9c84e2dc7d7e57b19 +Subproject commit a612c0a685fa96bc725085ac81c59492d4a88974 From dfb966bd2d3e4d2086223cf4ff85d998d15354d4 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Thu, 6 Nov 2025 17:52:59 +0000 Subject: [PATCH 16/66] update uv.lock --- uv.lock | 320 +++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 315 insertions(+), 5 deletions(-) diff --git a/uv.lock b/uv.lock index 38431b39..6ea29413 100644 --- a/uv.lock +++ b/uv.lock @@ -1125,6 +1125,47 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/9d/08/24d62fccb01c4e86c59ba79073af7e5c8ab643846823c2fa3e957bde4b58/groq-0.32.0-py3-none-any.whl", hash = "sha256:0ed0be290042f8826f851f3a1defaac4f979dcfce86ec4a0681a23af00ec800b", size = 135387, upload-time = "2025-09-27T23:01:33.223Z" }, ] +[[package]] +name = "grpcio" +version = "1.76.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b6/e0/318c1ce3ae5a17894d5791e87aea147587c9e702f24122cc7a5c8bbaeeb1/grpcio-1.76.0.tar.gz", hash = "sha256:7be78388d6da1a25c0d5ec506523db58b18be22d9c37d8d3a32c08be4987bd73", size = 12785182, upload-time = "2025-10-21T16:23:12.106Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/bf/05/8e29121994b8d959ffa0afd28996d452f291b48cfc0875619de0bde2c50c/grpcio-1.76.0-cp312-cp312-linux_armv7l.whl", hash = "sha256:81fd9652b37b36f16138611c7e884eb82e0cec137c40d3ef7c3f9b3ed00f6ed8", size = 5799718, upload-time = "2025-10-21T16:21:17.939Z" }, + { url = "https://files.pythonhosted.org/packages/d9/75/11d0e66b3cdf998c996489581bdad8900db79ebd83513e45c19548f1cba4/grpcio-1.76.0-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:04bbe1bfe3a68bbfd4e52402ab7d4eb59d72d02647ae2042204326cf4bbad280", size = 11825627, upload-time = "2025-10-21T16:21:20.466Z" }, + { url = "https://files.pythonhosted.org/packages/28/50/2f0aa0498bc188048f5d9504dcc5c2c24f2eb1a9337cd0fa09a61a2e75f0/grpcio-1.76.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d388087771c837cdb6515539f43b9d4bf0b0f23593a24054ac16f7a960be16f4", size = 6359167, upload-time = "2025-10-21T16:21:23.122Z" }, + { url = "https://files.pythonhosted.org/packages/66/e5/bbf0bb97d29ede1d59d6588af40018cfc345b17ce979b7b45424628dc8bb/grpcio-1.76.0-cp312-cp312-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:9f8f757bebaaea112c00dba718fc0d3260052ce714e25804a03f93f5d1c6cc11", size = 7044267, upload-time = "2025-10-21T16:21:25.995Z" }, + { url = "https://files.pythonhosted.org/packages/f5/86/f6ec2164f743d9609691115ae8ece098c76b894ebe4f7c94a655c6b03e98/grpcio-1.76.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:980a846182ce88c4f2f7e2c22c56aefd515daeb36149d1c897f83cf57999e0b6", size = 6573963, upload-time = "2025-10-21T16:21:28.631Z" }, + { url = "https://files.pythonhosted.org/packages/60/bc/8d9d0d8505feccfdf38a766d262c71e73639c165b311c9457208b56d92ae/grpcio-1.76.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:f92f88e6c033db65a5ae3d97905c8fea9c725b63e28d5a75cb73b49bda5024d8", size = 7164484, upload-time = "2025-10-21T16:21:30.837Z" }, + { url = "https://files.pythonhosted.org/packages/67/e6/5d6c2fc10b95edf6df9b8f19cf10a34263b7fd48493936fffd5085521292/grpcio-1.76.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:4baf3cbe2f0be3289eb68ac8ae771156971848bb8aaff60bad42005539431980", size = 8127777, upload-time = "2025-10-21T16:21:33.577Z" }, + { url = "https://files.pythonhosted.org/packages/3f/c8/dce8ff21c86abe025efe304d9e31fdb0deaaa3b502b6a78141080f206da0/grpcio-1.76.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:615ba64c208aaceb5ec83bfdce7728b80bfeb8be97562944836a7a0a9647d882", size = 7594014, upload-time = "2025-10-21T16:21:41.882Z" }, + { url = "https://files.pythonhosted.org/packages/e0/42/ad28191ebf983a5d0ecef90bab66baa5a6b18f2bfdef9d0a63b1973d9f75/grpcio-1.76.0-cp312-cp312-win32.whl", hash = "sha256:45d59a649a82df5718fd9527ce775fd66d1af35e6d31abdcdc906a49c6822958", size = 3984750, upload-time = "2025-10-21T16:21:44.006Z" }, + { url = "https://files.pythonhosted.org/packages/9e/00/7bd478cbb851c04a48baccaa49b75abaa8e4122f7d86da797500cccdd771/grpcio-1.76.0-cp312-cp312-win_amd64.whl", hash = "sha256:c088e7a90b6017307f423efbb9d1ba97a22aa2170876223f9709e9d1de0b5347", size = 4704003, upload-time = "2025-10-21T16:21:46.244Z" }, + { url = "https://files.pythonhosted.org/packages/fc/ed/71467ab770effc9e8cef5f2e7388beb2be26ed642d567697bb103a790c72/grpcio-1.76.0-cp313-cp313-linux_armv7l.whl", hash = "sha256:26ef06c73eb53267c2b319f43e6634c7556ea37672029241a056629af27c10e2", size = 5807716, upload-time = "2025-10-21T16:21:48.475Z" }, + { url = "https://files.pythonhosted.org/packages/2c/85/c6ed56f9817fab03fa8a111ca91469941fb514e3e3ce6d793cb8f1e1347b/grpcio-1.76.0-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:45e0111e73f43f735d70786557dc38141185072d7ff8dc1829d6a77ac1471468", size = 11821522, upload-time = "2025-10-21T16:21:51.142Z" }, + { url = "https://files.pythonhosted.org/packages/ac/31/2b8a235ab40c39cbc141ef647f8a6eb7b0028f023015a4842933bc0d6831/grpcio-1.76.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:83d57312a58dcfe2a3a0f9d1389b299438909a02db60e2f2ea2ae2d8034909d3", size = 6362558, upload-time = "2025-10-21T16:21:54.213Z" }, + { url = "https://files.pythonhosted.org/packages/bd/64/9784eab483358e08847498ee56faf8ff6ea8e0a4592568d9f68edc97e9e9/grpcio-1.76.0-cp313-cp313-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:3e2a27c89eb9ac3d81ec8835e12414d73536c6e620355d65102503064a4ed6eb", size = 7049990, upload-time = "2025-10-21T16:21:56.476Z" }, + { url = "https://files.pythonhosted.org/packages/2b/94/8c12319a6369434e7a184b987e8e9f3b49a114c489b8315f029e24de4837/grpcio-1.76.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:61f69297cba3950a524f61c7c8ee12e55c486cb5f7db47ff9dcee33da6f0d3ae", size = 6575387, upload-time = "2025-10-21T16:21:59.051Z" }, + { url = "https://files.pythonhosted.org/packages/15/0f/f12c32b03f731f4a6242f771f63039df182c8b8e2cf8075b245b409259d4/grpcio-1.76.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:6a15c17af8839b6801d554263c546c69c4d7718ad4321e3166175b37eaacca77", size = 7166668, upload-time = "2025-10-21T16:22:02.049Z" }, + { url = "https://files.pythonhosted.org/packages/ff/2d/3ec9ce0c2b1d92dd59d1c3264aaec9f0f7c817d6e8ac683b97198a36ed5a/grpcio-1.76.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:25a18e9810fbc7e7f03ec2516addc116a957f8cbb8cbc95ccc80faa072743d03", size = 8124928, upload-time = "2025-10-21T16:22:04.984Z" }, + { url = "https://files.pythonhosted.org/packages/1a/74/fd3317be5672f4856bcdd1a9e7b5e17554692d3db9a3b273879dc02d657d/grpcio-1.76.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:931091142fd8cc14edccc0845a79248bc155425eee9a98b2db2ea4f00a235a42", size = 7589983, upload-time = "2025-10-21T16:22:07.881Z" }, + { url = "https://files.pythonhosted.org/packages/45/bb/ca038cf420f405971f19821c8c15bcbc875505f6ffadafe9ffd77871dc4c/grpcio-1.76.0-cp313-cp313-win32.whl", hash = "sha256:5e8571632780e08526f118f74170ad8d50fb0a48c23a746bef2a6ebade3abd6f", size = 3984727, upload-time = "2025-10-21T16:22:10.032Z" }, + { url = "https://files.pythonhosted.org/packages/41/80/84087dc56437ced7cdd4b13d7875e7439a52a261e3ab4e06488ba6173b0a/grpcio-1.76.0-cp313-cp313-win_amd64.whl", hash = "sha256:f9f7bd5faab55f47231ad8dba7787866b69f5e93bc306e3915606779bbfb4ba8", size = 4702799, upload-time = "2025-10-21T16:22:12.709Z" }, + { url = "https://files.pythonhosted.org/packages/b4/46/39adac80de49d678e6e073b70204091e76631e03e94928b9ea4ecf0f6e0e/grpcio-1.76.0-cp314-cp314-linux_armv7l.whl", hash = "sha256:ff8a59ea85a1f2191a0ffcc61298c571bc566332f82e5f5be1b83c9d8e668a62", size = 5808417, upload-time = "2025-10-21T16:22:15.02Z" }, + { url = "https://files.pythonhosted.org/packages/9c/f5/a4531f7fb8b4e2a60b94e39d5d924469b7a6988176b3422487be61fe2998/grpcio-1.76.0-cp314-cp314-macosx_11_0_universal2.whl", hash = "sha256:06c3d6b076e7b593905d04fdba6a0525711b3466f43b3400266f04ff735de0cd", size = 11828219, upload-time = "2025-10-21T16:22:17.954Z" }, + { url = "https://files.pythonhosted.org/packages/4b/1c/de55d868ed7a8bd6acc6b1d6ddc4aa36d07a9f31d33c912c804adb1b971b/grpcio-1.76.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:fd5ef5932f6475c436c4a55e4336ebbe47bd3272be04964a03d316bbf4afbcbc", size = 6367826, upload-time = "2025-10-21T16:22:20.721Z" }, + { url = "https://files.pythonhosted.org/packages/59/64/99e44c02b5adb0ad13ab3adc89cb33cb54bfa90c74770f2607eea629b86f/grpcio-1.76.0-cp314-cp314-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:b331680e46239e090f5b3cead313cc772f6caa7d0fc8de349337563125361a4a", size = 7049550, upload-time = "2025-10-21T16:22:23.637Z" }, + { url = "https://files.pythonhosted.org/packages/43/28/40a5be3f9a86949b83e7d6a2ad6011d993cbe9b6bd27bea881f61c7788b6/grpcio-1.76.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:2229ae655ec4e8999599469559e97630185fdd53ae1e8997d147b7c9b2b72cba", size = 6575564, upload-time = "2025-10-21T16:22:26.016Z" }, + { url = "https://files.pythonhosted.org/packages/4b/a9/1be18e6055b64467440208a8559afac243c66a8b904213af6f392dc2212f/grpcio-1.76.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:490fa6d203992c47c7b9e4a9d39003a0c2bcc1c9aa3c058730884bbbb0ee9f09", size = 7176236, upload-time = "2025-10-21T16:22:28.362Z" }, + { url = "https://files.pythonhosted.org/packages/0f/55/dba05d3fcc151ce6e81327541d2cc8394f442f6b350fead67401661bf041/grpcio-1.76.0-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:479496325ce554792dba6548fae3df31a72cef7bad71ca2e12b0e58f9b336bfc", size = 8125795, upload-time = "2025-10-21T16:22:31.075Z" }, + { url = "https://files.pythonhosted.org/packages/4a/45/122df922d05655f63930cf42c9e3f72ba20aadb26c100ee105cad4ce4257/grpcio-1.76.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:1c9b93f79f48b03ada57ea24725d83a30284a012ec27eab2cf7e50a550cbbbcc", size = 7592214, upload-time = "2025-10-21T16:22:33.831Z" }, + { url = "https://files.pythonhosted.org/packages/4a/6e/0b899b7f6b66e5af39e377055fb4a6675c9ee28431df5708139df2e93233/grpcio-1.76.0-cp314-cp314-win32.whl", hash = "sha256:747fa73efa9b8b1488a95d0ba1039c8e2dca0f741612d80415b1e1c560febf4e", size = 4062961, upload-time = "2025-10-21T16:22:36.468Z" }, + { url = "https://files.pythonhosted.org/packages/19/41/0b430b01a2eb38ee887f88c1f07644a1df8e289353b78e82b37ef988fb64/grpcio-1.76.0-cp314-cp314-win_amd64.whl", hash = "sha256:922fa70ba549fce362d2e2871ab542082d66e2aaf0c19480ea453905b01f384e", size = 4834462, upload-time = "2025-10-21T16:22:39.772Z" }, +] + [[package]] name = "grpclib" version = "0.4.8" @@ -1493,6 +1534,33 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/86/50/53df2244d4aca2af73d2f2c6ad21c731cf24bd0dbe89d896184a1eaa874f/litellm-1.77.7-py3-none-any.whl", hash = "sha256:1b3a1b17bd521a0ad25226fb62a912602c803922aabb4a16adf83834673be574", size = 9223061, upload-time = "2025-10-05T00:22:34.112Z" }, ] +[[package]] +name = "lmnr" +version = "0.7.20" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "grpcio" }, + { name = "httpx" }, + { name = "opentelemetry-api" }, + { name = "opentelemetry-exporter-otlp-proto-grpc" }, + { name = "opentelemetry-exporter-otlp-proto-http" }, + { name = "opentelemetry-instrumentation" }, + { name = "opentelemetry-instrumentation-threading" }, + { name = "opentelemetry-sdk" }, + { name = "opentelemetry-semantic-conventions" }, + { name = "opentelemetry-semantic-conventions-ai" }, + { name = "orjson" }, + { name = "packaging" }, + { name = "pydantic" }, + { name = "python-dotenv" }, + { name = "tenacity" }, + { name = "tqdm" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/d4/c0/996403cc2f6967881a42af4b27ff8931956d57ab3ed2d8bf11e5b37aed40/lmnr-0.7.20.tar.gz", hash = "sha256:1f484cd618db2d71af65f90a0b8b36d20d80dc91a5138b811575c8677bf7c4fd", size = 194075, upload-time = "2025-11-04T16:53:34.49Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b5/df/4665a3931b2fbc5f5b66e4906ffab106f3f65ab7e78732ecdaf3ba4a3076/lmnr-0.7.20-py3-none-any.whl", hash = "sha256:5f9fa7444e6f96c25e097f66484ff29e632bdd1de0e9346948bf5595f4a8af38", size = 247465, upload-time = "2025-11-04T16:53:32.713Z" }, +] + [[package]] name = "mako" version = "1.3.10" @@ -1874,7 +1942,7 @@ wheels = [ [[package]] name = "openhands-agent-server" -version = "1.0.0a3" +version = "1.0.0" source = { editable = "vendor/software-agent-sdk/openhands-agent-server" } dependencies = [ { name = "aiosqlite" }, @@ -1966,12 +2034,13 @@ dev = [ [[package]] name = "openhands-sdk" -version = "1.0.0a3" +version = "1.0.0" source = { editable = "vendor/software-agent-sdk/openhands-sdk" } dependencies = [ { name = "fastmcp" }, { name = "httpx" }, { name = "litellm" }, + { name = "lmnr" }, { name = "pydantic" }, { name = "python-frontmatter" }, { name = "python-json-logger" }, @@ -1990,6 +2059,7 @@ requires-dist = [ { name = "fastmcp", specifier = ">=2.11.3" }, { name = "httpx", specifier = ">=0.27.0" }, { name = "litellm", specifier = ">=1.77.7.dev9" }, + { name = "lmnr", specifier = ">=0.7.20" }, { name = "pydantic", specifier = ">=2.11.7" }, { name = "python-frontmatter", specifier = ">=1.1.0" }, { name = "python-json-logger", specifier = ">=3.3.0" }, @@ -2000,7 +2070,7 @@ provides-extras = ["boto3"] [[package]] name = "openhands-tools" -version = "1.0.0a3" +version = "1.0.0" source = { editable = "vendor/software-agent-sdk/openhands-tools" } dependencies = [ { name = "bashlex" }, @@ -2017,7 +2087,7 @@ dependencies = [ requires-dist = [ { name = "bashlex", specifier = ">=0.18" }, { name = "binaryornot", specifier = ">=0.4.4" }, - { name = "browser-use", specifier = ">=0.7.7" }, + { name = "browser-use", specifier = ">=0.8.0" }, { name = "cachetools" }, { name = "func-timeout", specifier = ">=4.3.5" }, { name = "libtmux", specifier = ">=0.46.2" }, @@ -2027,7 +2097,7 @@ requires-dist = [ [[package]] name = "openhands-workspace" -version = "1.0.0a3" +version = "1.0.0" source = { editable = "vendor/software-agent-sdk/openhands-workspace" } dependencies = [ { name = "openhands-sdk" }, @@ -2040,6 +2110,197 @@ requires-dist = [ { name = "pydantic", specifier = ">=2.11.7" }, ] +[[package]] +name = "opentelemetry-api" +version = "1.38.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "importlib-metadata" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/08/d8/0f354c375628e048bd0570645b310797299754730079853095bf000fba69/opentelemetry_api-1.38.0.tar.gz", hash = "sha256:f4c193b5e8acb0912b06ac5b16321908dd0843d75049c091487322284a3eea12", size = 65242, upload-time = "2025-10-16T08:35:50.25Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ae/a2/d86e01c28300bd41bab8f18afd613676e2bd63515417b77636fc1add426f/opentelemetry_api-1.38.0-py3-none-any.whl", hash = "sha256:2891b0197f47124454ab9f0cf58f3be33faca394457ac3e09daba13ff50aa582", size = 65947, upload-time = "2025-10-16T08:35:30.23Z" }, +] + +[[package]] +name = "opentelemetry-exporter-otlp-proto-common" +version = "1.38.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "opentelemetry-proto" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/19/83/dd4660f2956ff88ed071e9e0e36e830df14b8c5dc06722dbde1841accbe8/opentelemetry_exporter_otlp_proto_common-1.38.0.tar.gz", hash = "sha256:e333278afab4695aa8114eeb7bf4e44e65c6607d54968271a249c180b2cb605c", size = 20431, upload-time = "2025-10-16T08:35:53.285Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a7/9e/55a41c9601191e8cd8eb626b54ee6827b9c9d4a46d736f32abc80d8039fc/opentelemetry_exporter_otlp_proto_common-1.38.0-py3-none-any.whl", hash = "sha256:03cb76ab213300fe4f4c62b7d8f17d97fcfd21b89f0b5ce38ea156327ddda74a", size = 18359, upload-time = "2025-10-16T08:35:34.099Z" }, +] + +[[package]] +name = "opentelemetry-exporter-otlp-proto-grpc" +version = "1.38.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "googleapis-common-protos" }, + { name = "grpcio" }, + { name = "opentelemetry-api" }, + { name = "opentelemetry-exporter-otlp-proto-common" }, + { name = "opentelemetry-proto" }, + { name = "opentelemetry-sdk" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/a2/c0/43222f5b97dc10812bc4f0abc5dc7cd0a2525a91b5151d26c9e2e958f52e/opentelemetry_exporter_otlp_proto_grpc-1.38.0.tar.gz", hash = "sha256:2473935e9eac71f401de6101d37d6f3f0f1831db92b953c7dcc912536158ebd6", size = 24676, upload-time = "2025-10-16T08:35:53.83Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/28/f0/bd831afbdba74ca2ce3982142a2fad707f8c487e8a3b6fef01f1d5945d1b/opentelemetry_exporter_otlp_proto_grpc-1.38.0-py3-none-any.whl", hash = "sha256:7c49fd9b4bd0dbe9ba13d91f764c2d20b0025649a6e4ac35792fb8d84d764bc7", size = 19695, upload-time = "2025-10-16T08:35:35.053Z" }, +] + +[[package]] +name = "opentelemetry-exporter-otlp-proto-http" +version = "1.38.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "googleapis-common-protos" }, + { name = "opentelemetry-api" }, + { name = "opentelemetry-exporter-otlp-proto-common" }, + { name = "opentelemetry-proto" }, + { name = "opentelemetry-sdk" }, + { name = "requests" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/81/0a/debcdfb029fbd1ccd1563f7c287b89a6f7bef3b2902ade56797bfd020854/opentelemetry_exporter_otlp_proto_http-1.38.0.tar.gz", hash = "sha256:f16bd44baf15cbe07633c5112ffc68229d0edbeac7b37610be0b2def4e21e90b", size = 17282, upload-time = "2025-10-16T08:35:54.422Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e5/77/154004c99fb9f291f74aa0822a2f5bbf565a72d8126b3a1b63ed8e5f83c7/opentelemetry_exporter_otlp_proto_http-1.38.0-py3-none-any.whl", hash = "sha256:84b937305edfc563f08ec69b9cb2298be8188371217e867c1854d77198d0825b", size = 19579, upload-time = "2025-10-16T08:35:36.269Z" }, +] + +[[package]] +name = "opentelemetry-instrumentation" +version = "0.59b0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "opentelemetry-api" }, + { name = "opentelemetry-semantic-conventions" }, + { name = "packaging" }, + { name = "wrapt" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/04/ed/9c65cd209407fd807fa05be03ee30f159bdac8d59e7ea16a8fe5a1601222/opentelemetry_instrumentation-0.59b0.tar.gz", hash = "sha256:6010f0faaacdaf7c4dff8aac84e226d23437b331dcda7e70367f6d73a7db1adc", size = 31544, upload-time = "2025-10-16T08:39:31.959Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/10/f5/7a40ff3f62bfe715dad2f633d7f1174ba1a7dd74254c15b2558b3401262a/opentelemetry_instrumentation-0.59b0-py3-none-any.whl", hash = "sha256:44082cc8fe56b0186e87ee8f7c17c327c4c2ce93bdbe86496e600985d74368ee", size = 33020, upload-time = "2025-10-16T08:38:31.463Z" }, +] + +[[package]] +name = "opentelemetry-instrumentation-threading" +version = "0.59b0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "opentelemetry-api" }, + { name = "opentelemetry-instrumentation" }, + { name = "wrapt" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/82/7a/84e97d8992808197006e607ae410c2219bdbbc23d1289ba0c244d3220741/opentelemetry_instrumentation_threading-0.59b0.tar.gz", hash = "sha256:ce5658730b697dcbc0e0d6d13643a69fd8aeb1b32fa8db3bade8ce114c7975f3", size = 8770, upload-time = "2025-10-16T08:40:03.587Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b8/50/32d29076aaa1c91983cdd3ca8c6bb4d344830cd7d87a7c0fdc2d98c58509/opentelemetry_instrumentation_threading-0.59b0-py3-none-any.whl", hash = "sha256:76da2fc01fe1dccebff6581080cff9e42ac7b27cc61eb563f3c4435c727e8eca", size = 9313, upload-time = "2025-10-16T08:39:15.876Z" }, +] + +[[package]] +name = "opentelemetry-proto" +version = "1.38.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "protobuf" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/51/14/f0c4f0f6371b9cb7f9fa9ee8918bfd59ac7040c7791f1e6da32a1839780d/opentelemetry_proto-1.38.0.tar.gz", hash = "sha256:88b161e89d9d372ce723da289b7da74c3a8354a8e5359992be813942969ed468", size = 46152, upload-time = "2025-10-16T08:36:01.612Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b6/6a/82b68b14efca5150b2632f3692d627afa76b77378c4999f2648979409528/opentelemetry_proto-1.38.0-py3-none-any.whl", hash = "sha256:b6ebe54d3217c42e45462e2a1ae28c3e2bf2ec5a5645236a490f55f45f1a0a18", size = 72535, upload-time = "2025-10-16T08:35:45.749Z" }, +] + +[[package]] +name = "opentelemetry-sdk" +version = "1.38.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "opentelemetry-api" }, + { name = "opentelemetry-semantic-conventions" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/85/cb/f0eee1445161faf4c9af3ba7b848cc22a50a3d3e2515051ad8628c35ff80/opentelemetry_sdk-1.38.0.tar.gz", hash = "sha256:93df5d4d871ed09cb4272305be4d996236eedb232253e3ab864c8620f051cebe", size = 171942, upload-time = "2025-10-16T08:36:02.257Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2f/2e/e93777a95d7d9c40d270a371392b6d6f1ff170c2a3cb32d6176741b5b723/opentelemetry_sdk-1.38.0-py3-none-any.whl", hash = "sha256:1c66af6564ecc1553d72d811a01df063ff097cdc82ce188da9951f93b8d10f6b", size = 132349, upload-time = "2025-10-16T08:35:46.995Z" }, +] + +[[package]] +name = "opentelemetry-semantic-conventions" +version = "0.59b0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "opentelemetry-api" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/40/bc/8b9ad3802cd8ac6583a4eb7de7e5d7db004e89cb7efe7008f9c8a537ee75/opentelemetry_semantic_conventions-0.59b0.tar.gz", hash = "sha256:7a6db3f30d70202d5bf9fa4b69bc866ca6a30437287de6c510fb594878aed6b0", size = 129861, upload-time = "2025-10-16T08:36:03.346Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/24/7d/c88d7b15ba8fe5c6b8f93be50fc11795e9fc05386c44afaf6b76fe191f9b/opentelemetry_semantic_conventions-0.59b0-py3-none-any.whl", hash = "sha256:35d3b8833ef97d614136e253c1da9342b4c3c083bbaf29ce31d572a1c3825eed", size = 207954, upload-time = "2025-10-16T08:35:48.054Z" }, +] + +[[package]] +name = "opentelemetry-semantic-conventions-ai" +version = "0.4.13" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ba/e6/40b59eda51ac47009fb47afcdf37c6938594a0bd7f3b9fadcbc6058248e3/opentelemetry_semantic_conventions_ai-0.4.13.tar.gz", hash = "sha256:94efa9fb4ffac18c45f54a3a338ffeb7eedb7e1bb4d147786e77202e159f0036", size = 5368, upload-time = "2025-08-22T10:14:17.387Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/35/b5/cf25da2218910f0d6cdf7f876a06bed118c4969eacaf60a887cbaef44f44/opentelemetry_semantic_conventions_ai-0.4.13-py3-none-any.whl", hash = "sha256:883a30a6bb5deaec0d646912b5f9f6dcbb9f6f72557b73d0f2560bf25d13e2d5", size = 6080, upload-time = "2025-08-22T10:14:16.477Z" }, +] + +[[package]] +name = "orjson" +version = "3.11.4" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/c6/fe/ed708782d6709cc60eb4c2d8a361a440661f74134675c72990f2c48c785f/orjson-3.11.4.tar.gz", hash = "sha256:39485f4ab4c9b30a3943cfe99e1a213c4776fb69e8abd68f66b83d5a0b0fdc6d", size = 5945188, upload-time = "2025-10-24T15:50:38.027Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/63/51/6b556192a04595b93e277a9ff71cd0cc06c21a7df98bcce5963fa0f5e36f/orjson-3.11.4-cp312-cp312-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:d4371de39319d05d3f482f372720b841c841b52f5385bd99c61ed69d55d9ab50", size = 243571, upload-time = "2025-10-24T15:49:10.008Z" }, + { url = "https://files.pythonhosted.org/packages/1c/2c/2602392ddf2601d538ff11848b98621cd465d1a1ceb9db9e8043181f2f7b/orjson-3.11.4-cp312-cp312-macosx_15_0_arm64.whl", hash = "sha256:e41fd3b3cac850eaae78232f37325ed7d7436e11c471246b87b2cd294ec94853", size = 128891, upload-time = "2025-10-24T15:49:11.297Z" }, + { url = "https://files.pythonhosted.org/packages/4e/47/bf85dcf95f7a3a12bf223394a4f849430acd82633848d52def09fa3f46ad/orjson-3.11.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:600e0e9ca042878c7fdf189cf1b028fe2c1418cc9195f6cb9824eb6ed99cb938", size = 130137, upload-time = "2025-10-24T15:49:12.544Z" }, + { url = "https://files.pythonhosted.org/packages/b4/4d/a0cb31007f3ab6f1fd2a1b17057c7c349bc2baf8921a85c0180cc7be8011/orjson-3.11.4-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:7bbf9b333f1568ef5da42bc96e18bf30fd7f8d54e9ae066d711056add508e415", size = 129152, upload-time = "2025-10-24T15:49:13.754Z" }, + { url = "https://files.pythonhosted.org/packages/f7/ef/2811def7ce3d8576b19e3929fff8f8f0d44bc5eb2e0fdecb2e6e6cc6c720/orjson-3.11.4-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4806363144bb6e7297b8e95870e78d30a649fdc4e23fc84daa80c8ebd366ce44", size = 136834, upload-time = "2025-10-24T15:49:15.307Z" }, + { url = "https://files.pythonhosted.org/packages/00/d4/9aee9e54f1809cec8ed5abd9bc31e8a9631d19460e3b8470145d25140106/orjson-3.11.4-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ad355e8308493f527d41154e9053b86a5be892b3b359a5c6d5d95cda23601cb2", size = 137519, upload-time = "2025-10-24T15:49:16.557Z" }, + { url = "https://files.pythonhosted.org/packages/db/ea/67bfdb5465d5679e8ae8d68c11753aaf4f47e3e7264bad66dc2f2249e643/orjson-3.11.4-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c8a7517482667fb9f0ff1b2f16fe5829296ed7a655d04d68cd9711a4d8a4e708", size = 136749, upload-time = "2025-10-24T15:49:17.796Z" }, + { url = "https://files.pythonhosted.org/packages/01/7e/62517dddcfce6d53a39543cd74d0dccfcbdf53967017c58af68822100272/orjson-3.11.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:97eb5942c7395a171cbfecc4ef6701fc3c403e762194683772df4c54cfbb2210", size = 136325, upload-time = "2025-10-24T15:49:19.347Z" }, + { url = "https://files.pythonhosted.org/packages/18/ae/40516739f99ab4c7ec3aaa5cc242d341fcb03a45d89edeeaabc5f69cb2cf/orjson-3.11.4-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:149d95d5e018bdd822e3f38c103b1a7c91f88d38a88aada5c4e9b3a73a244241", size = 140204, upload-time = "2025-10-24T15:49:20.545Z" }, + { url = "https://files.pythonhosted.org/packages/82/18/ff5734365623a8916e3a4037fcef1cd1782bfc14cf0992afe7940c5320bf/orjson-3.11.4-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:624f3951181eb46fc47dea3d221554e98784c823e7069edb5dbd0dc826ac909b", size = 406242, upload-time = "2025-10-24T15:49:21.884Z" }, + { url = "https://files.pythonhosted.org/packages/e1/43/96436041f0a0c8c8deca6a05ebeaf529bf1de04839f93ac5e7c479807aec/orjson-3.11.4-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:03bfa548cf35e3f8b3a96c4e8e41f753c686ff3d8e182ce275b1751deddab58c", size = 150013, upload-time = "2025-10-24T15:49:23.185Z" }, + { url = "https://files.pythonhosted.org/packages/1b/48/78302d98423ed8780479a1e682b9aecb869e8404545d999d34fa486e573e/orjson-3.11.4-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:525021896afef44a68148f6ed8a8bf8375553d6066c7f48537657f64823565b9", size = 139951, upload-time = "2025-10-24T15:49:24.428Z" }, + { url = "https://files.pythonhosted.org/packages/4a/7b/ad613fdcdaa812f075ec0875143c3d37f8654457d2af17703905425981bf/orjson-3.11.4-cp312-cp312-win32.whl", hash = "sha256:b58430396687ce0f7d9eeb3dd47761ca7d8fda8e9eb92b3077a7a353a75efefa", size = 136049, upload-time = "2025-10-24T15:49:25.973Z" }, + { url = "https://files.pythonhosted.org/packages/b9/3c/9cf47c3ff5f39b8350fb21ba65d789b6a1129d4cbb3033ba36c8a9023520/orjson-3.11.4-cp312-cp312-win_amd64.whl", hash = "sha256:c6dbf422894e1e3c80a177133c0dda260f81428f9de16d61041949f6a2e5c140", size = 131461, upload-time = "2025-10-24T15:49:27.259Z" }, + { url = "https://files.pythonhosted.org/packages/c6/3b/e2425f61e5825dc5b08c2a5a2b3af387eaaca22a12b9c8c01504f8614c36/orjson-3.11.4-cp312-cp312-win_arm64.whl", hash = "sha256:d38d2bc06d6415852224fcc9c0bfa834c25431e466dc319f0edd56cca81aa96e", size = 126167, upload-time = "2025-10-24T15:49:28.511Z" }, + { url = "https://files.pythonhosted.org/packages/23/15/c52aa7112006b0f3d6180386c3a46ae057f932ab3425bc6f6ac50431cca1/orjson-3.11.4-cp313-cp313-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:2d6737d0e616a6e053c8b4acc9eccea6b6cce078533666f32d140e4f85002534", size = 243525, upload-time = "2025-10-24T15:49:29.737Z" }, + { url = "https://files.pythonhosted.org/packages/ec/38/05340734c33b933fd114f161f25a04e651b0c7c33ab95e9416ade5cb44b8/orjson-3.11.4-cp313-cp313-macosx_15_0_arm64.whl", hash = "sha256:afb14052690aa328cc118a8e09f07c651d301a72e44920b887c519b313d892ff", size = 128871, upload-time = "2025-10-24T15:49:31.109Z" }, + { url = "https://files.pythonhosted.org/packages/55/b9/ae8d34899ff0c012039b5a7cb96a389b2476e917733294e498586b45472d/orjson-3.11.4-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:38aa9e65c591febb1b0aed8da4d469eba239d434c218562df179885c94e1a3ad", size = 130055, upload-time = "2025-10-24T15:49:33.382Z" }, + { url = "https://files.pythonhosted.org/packages/33/aa/6346dd5073730451bee3681d901e3c337e7ec17342fb79659ec9794fc023/orjson-3.11.4-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:f2cf4dfaf9163b0728d061bebc1e08631875c51cd30bf47cb9e3293bfbd7dcd5", size = 129061, upload-time = "2025-10-24T15:49:34.935Z" }, + { url = "https://files.pythonhosted.org/packages/39/e4/8eea51598f66a6c853c380979912d17ec510e8e66b280d968602e680b942/orjson-3.11.4-cp313-cp313-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:89216ff3dfdde0e4070932e126320a1752c9d9a758d6a32ec54b3b9334991a6a", size = 136541, upload-time = "2025-10-24T15:49:36.923Z" }, + { url = "https://files.pythonhosted.org/packages/9a/47/cb8c654fa9adcc60e99580e17c32b9e633290e6239a99efa6b885aba9dbc/orjson-3.11.4-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9daa26ca8e97fae0ce8aa5d80606ef8f7914e9b129b6b5df9104266f764ce436", size = 137535, upload-time = "2025-10-24T15:49:38.307Z" }, + { url = "https://files.pythonhosted.org/packages/43/92/04b8cc5c2b729f3437ee013ce14a60ab3d3001465d95c184758f19362f23/orjson-3.11.4-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5c8b2769dc31883c44a9cd126560327767f848eb95f99c36c9932f51090bfce9", size = 136703, upload-time = "2025-10-24T15:49:40.795Z" }, + { url = "https://files.pythonhosted.org/packages/aa/fd/d0733fcb9086b8be4ebcfcda2d0312865d17d0d9884378b7cffb29d0763f/orjson-3.11.4-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1469d254b9884f984026bd9b0fa5bbab477a4bfe558bba6848086f6d43eb5e73", size = 136293, upload-time = "2025-10-24T15:49:42.347Z" }, + { url = "https://files.pythonhosted.org/packages/c2/d7/3c5514e806837c210492d72ae30ccf050ce3f940f45bf085bab272699ef4/orjson-3.11.4-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:68e44722541983614e37117209a194e8c3ad07838ccb3127d96863c95ec7f1e0", size = 140131, upload-time = "2025-10-24T15:49:43.638Z" }, + { url = "https://files.pythonhosted.org/packages/9c/dd/ba9d32a53207babf65bd510ac4d0faaa818bd0df9a9c6f472fe7c254f2e3/orjson-3.11.4-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:8e7805fda9672c12be2f22ae124dcd7b03928d6c197544fe12174b86553f3196", size = 406164, upload-time = "2025-10-24T15:49:45.498Z" }, + { url = "https://files.pythonhosted.org/packages/8e/f9/f68ad68f4af7c7bde57cd514eaa2c785e500477a8bc8f834838eb696a685/orjson-3.11.4-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:04b69c14615fb4434ab867bf6f38b2d649f6f300af30a6705397e895f7aec67a", size = 149859, upload-time = "2025-10-24T15:49:46.981Z" }, + { url = "https://files.pythonhosted.org/packages/b6/d2/7f847761d0c26818395b3d6b21fb6bc2305d94612a35b0a30eae65a22728/orjson-3.11.4-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:639c3735b8ae7f970066930e58cf0ed39a852d417c24acd4a25fc0b3da3c39a6", size = 139926, upload-time = "2025-10-24T15:49:48.321Z" }, + { url = "https://files.pythonhosted.org/packages/9f/37/acd14b12dc62db9a0e1d12386271b8661faae270b22492580d5258808975/orjson-3.11.4-cp313-cp313-win32.whl", hash = "sha256:6c13879c0d2964335491463302a6ca5ad98105fc5db3565499dcb80b1b4bd839", size = 136007, upload-time = "2025-10-24T15:49:49.938Z" }, + { url = "https://files.pythonhosted.org/packages/c0/a9/967be009ddf0a1fffd7a67de9c36656b28c763659ef91352acc02cbe364c/orjson-3.11.4-cp313-cp313-win_amd64.whl", hash = "sha256:09bf242a4af98732db9f9a1ec57ca2604848e16f132e3f72edfd3c5c96de009a", size = 131314, upload-time = "2025-10-24T15:49:51.248Z" }, + { url = "https://files.pythonhosted.org/packages/cb/db/399abd6950fbd94ce125cb8cd1a968def95174792e127b0642781e040ed4/orjson-3.11.4-cp313-cp313-win_arm64.whl", hash = "sha256:a85f0adf63319d6c1ba06fb0dbf997fced64a01179cf17939a6caca662bf92de", size = 126152, upload-time = "2025-10-24T15:49:52.922Z" }, + { url = "https://files.pythonhosted.org/packages/25/e3/54ff63c093cc1697e758e4fceb53164dd2661a7d1bcd522260ba09f54533/orjson-3.11.4-cp314-cp314-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:42d43a1f552be1a112af0b21c10a5f553983c2a0938d2bbb8ecd8bc9fb572803", size = 243501, upload-time = "2025-10-24T15:49:54.288Z" }, + { url = "https://files.pythonhosted.org/packages/ac/7d/e2d1076ed2e8e0ae9badca65bf7ef22710f93887b29eaa37f09850604e09/orjson-3.11.4-cp314-cp314-macosx_15_0_arm64.whl", hash = "sha256:26a20f3fbc6c7ff2cb8e89c4c5897762c9d88cf37330c6a117312365d6781d54", size = 128862, upload-time = "2025-10-24T15:49:55.961Z" }, + { url = "https://files.pythonhosted.org/packages/9f/37/ca2eb40b90621faddfa9517dfe96e25f5ae4d8057a7c0cdd613c17e07b2c/orjson-3.11.4-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6e3f20be9048941c7ffa8fc523ccbd17f82e24df1549d1d1fe9317712d19938e", size = 130047, upload-time = "2025-10-24T15:49:57.406Z" }, + { url = "https://files.pythonhosted.org/packages/c7/62/1021ed35a1f2bad9040f05fa4cc4f9893410df0ba3eaa323ccf899b1c90a/orjson-3.11.4-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:aac364c758dc87a52e68e349924d7e4ded348dedff553889e4d9f22f74785316", size = 129073, upload-time = "2025-10-24T15:49:58.782Z" }, + { url = "https://files.pythonhosted.org/packages/e8/3f/f84d966ec2a6fd5f73b1a707e7cd876813422ae4bf9f0145c55c9c6a0f57/orjson-3.11.4-cp314-cp314-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d5c54a6d76e3d741dcc3f2707f8eeb9ba2a791d3adbf18f900219b62942803b1", size = 136597, upload-time = "2025-10-24T15:50:00.12Z" }, + { url = "https://files.pythonhosted.org/packages/32/78/4fa0aeca65ee82bbabb49e055bd03fa4edea33f7c080c5c7b9601661ef72/orjson-3.11.4-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f28485bdca8617b79d44627f5fb04336897041dfd9fa66d383a49d09d86798bc", size = 137515, upload-time = "2025-10-24T15:50:01.57Z" }, + { url = "https://files.pythonhosted.org/packages/c1/9d/0c102e26e7fde40c4c98470796d050a2ec1953897e2c8ab0cb95b0759fa2/orjson-3.11.4-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:bfc2a484cad3585e4ba61985a6062a4c2ed5c7925db6d39f1fa267c9d166487f", size = 136703, upload-time = "2025-10-24T15:50:02.944Z" }, + { url = "https://files.pythonhosted.org/packages/df/ac/2de7188705b4cdfaf0b6c97d2f7849c17d2003232f6e70df98602173f788/orjson-3.11.4-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e34dbd508cb91c54f9c9788923daca129fe5b55c5b4eebe713bf5ed3791280cf", size = 136311, upload-time = "2025-10-24T15:50:04.441Z" }, + { url = "https://files.pythonhosted.org/packages/e0/52/847fcd1a98407154e944feeb12e3b4d487a0e264c40191fb44d1269cbaa1/orjson-3.11.4-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:b13c478fa413d4b4ee606ec8e11c3b2e52683a640b006bb586b3041c2ca5f606", size = 140127, upload-time = "2025-10-24T15:50:07.398Z" }, + { url = "https://files.pythonhosted.org/packages/c1/ae/21d208f58bdb847dd4d0d9407e2929862561841baa22bdab7aea10ca088e/orjson-3.11.4-cp314-cp314-musllinux_1_2_armv7l.whl", hash = "sha256:724ca721ecc8a831b319dcd72cfa370cc380db0bf94537f08f7edd0a7d4e1780", size = 406201, upload-time = "2025-10-24T15:50:08.796Z" }, + { url = "https://files.pythonhosted.org/packages/8d/55/0789d6de386c8366059db098a628e2ad8798069e94409b0d8935934cbcb9/orjson-3.11.4-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:977c393f2e44845ce1b540e19a786e9643221b3323dae190668a98672d43fb23", size = 149872, upload-time = "2025-10-24T15:50:10.234Z" }, + { url = "https://files.pythonhosted.org/packages/cc/1d/7ff81ea23310e086c17b41d78a72270d9de04481e6113dbe2ac19118f7fb/orjson-3.11.4-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:1e539e382cf46edec157ad66b0b0872a90d829a6b71f17cb633d6c160a223155", size = 139931, upload-time = "2025-10-24T15:50:11.623Z" }, + { url = "https://files.pythonhosted.org/packages/77/92/25b886252c50ed64be68c937b562b2f2333b45afe72d53d719e46a565a50/orjson-3.11.4-cp314-cp314-win32.whl", hash = "sha256:d63076d625babab9db5e7836118bdfa086e60f37d8a174194ae720161eb12394", size = 136065, upload-time = "2025-10-24T15:50:13.025Z" }, + { url = "https://files.pythonhosted.org/packages/63/b8/718eecf0bb7e9d64e4956afaafd23db9f04c776d445f59fe94f54bdae8f0/orjson-3.11.4-cp314-cp314-win_amd64.whl", hash = "sha256:0a54d6635fa3aaa438ae32e8570b9f0de36f3f6562c308d2a2a452e8b0592db1", size = 131310, upload-time = "2025-10-24T15:50:14.46Z" }, + { url = "https://files.pythonhosted.org/packages/1a/bf/def5e25d4d8bfce296a9a7c8248109bf58622c21618b590678f945a2c59c/orjson-3.11.4-cp314-cp314-win_arm64.whl", hash = "sha256:78b999999039db3cf58f6d230f524f04f75f129ba3d1ca2ed121f8657e575d3d", size = 126151, upload-time = "2025-10-24T15:50:15.878Z" }, +] + [[package]] name = "packaging" version = "25.0" @@ -6135,6 +6396,55 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ee/ea/c67e1dee1ba208ed22c06d1d547ae5e293374bfc43e0eb0ef5e262b68561/werkzeug-3.1.1-py3-none-any.whl", hash = "sha256:a71124d1ef06008baafa3d266c02f56e1836a5984afd6dd6c9230669d60d9fb5", size = 224371, upload-time = "2024-11-01T16:40:43.994Z" }, ] +[[package]] +name = "wrapt" +version = "1.17.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/95/8f/aeb76c5b46e273670962298c23e7ddde79916cb74db802131d49a85e4b7d/wrapt-1.17.3.tar.gz", hash = "sha256:f66eb08feaa410fe4eebd17f2a2c8e2e46d3476e9f8c783daa8e09e0faa666d0", size = 55547, upload-time = "2025-08-12T05:53:21.714Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9f/41/cad1aba93e752f1f9268c77270da3c469883d56e2798e7df6240dcb2287b/wrapt-1.17.3-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:ab232e7fdb44cdfbf55fc3afa31bcdb0d8980b9b95c38b6405df2acb672af0e0", size = 53998, upload-time = "2025-08-12T05:51:47.138Z" }, + { url = "https://files.pythonhosted.org/packages/60/f8/096a7cc13097a1869fe44efe68dace40d2a16ecb853141394047f0780b96/wrapt-1.17.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:9baa544e6acc91130e926e8c802a17f3b16fbea0fd441b5a60f5cf2cc5c3deba", size = 39020, upload-time = "2025-08-12T05:51:35.906Z" }, + { url = "https://files.pythonhosted.org/packages/33/df/bdf864b8997aab4febb96a9ae5c124f700a5abd9b5e13d2a3214ec4be705/wrapt-1.17.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6b538e31eca1a7ea4605e44f81a48aa24c4632a277431a6ed3f328835901f4fd", size = 39098, upload-time = "2025-08-12T05:51:57.474Z" }, + { url = "https://files.pythonhosted.org/packages/9f/81/5d931d78d0eb732b95dc3ddaeeb71c8bb572fb01356e9133916cd729ecdd/wrapt-1.17.3-cp312-cp312-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:042ec3bb8f319c147b1301f2393bc19dba6e176b7da446853406d041c36c7828", size = 88036, upload-time = "2025-08-12T05:52:34.784Z" }, + { url = "https://files.pythonhosted.org/packages/ca/38/2e1785df03b3d72d34fc6252d91d9d12dc27a5c89caef3335a1bbb8908ca/wrapt-1.17.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3af60380ba0b7b5aeb329bc4e402acd25bd877e98b3727b0135cb5c2efdaefe9", size = 88156, upload-time = "2025-08-12T05:52:13.599Z" }, + { url = "https://files.pythonhosted.org/packages/b3/8b/48cdb60fe0603e34e05cffda0b2a4adab81fd43718e11111a4b0100fd7c1/wrapt-1.17.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:0b02e424deef65c9f7326d8c19220a2c9040c51dc165cddb732f16198c168396", size = 87102, upload-time = "2025-08-12T05:52:14.56Z" }, + { url = "https://files.pythonhosted.org/packages/3c/51/d81abca783b58f40a154f1b2c56db1d2d9e0d04fa2d4224e357529f57a57/wrapt-1.17.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:74afa28374a3c3a11b3b5e5fca0ae03bef8450d6aa3ab3a1e2c30e3a75d023dc", size = 87732, upload-time = "2025-08-12T05:52:36.165Z" }, + { url = "https://files.pythonhosted.org/packages/9e/b1/43b286ca1392a006d5336412d41663eeef1ad57485f3e52c767376ba7e5a/wrapt-1.17.3-cp312-cp312-win32.whl", hash = "sha256:4da9f45279fff3543c371d5ababc57a0384f70be244de7759c85a7f989cb4ebe", size = 36705, upload-time = "2025-08-12T05:53:07.123Z" }, + { url = "https://files.pythonhosted.org/packages/28/de/49493f962bd3c586ab4b88066e967aa2e0703d6ef2c43aa28cb83bf7b507/wrapt-1.17.3-cp312-cp312-win_amd64.whl", hash = "sha256:e71d5c6ebac14875668a1e90baf2ea0ef5b7ac7918355850c0908ae82bcb297c", size = 38877, upload-time = "2025-08-12T05:53:05.436Z" }, + { url = "https://files.pythonhosted.org/packages/f1/48/0f7102fe9cb1e8a5a77f80d4f0956d62d97034bbe88d33e94699f99d181d/wrapt-1.17.3-cp312-cp312-win_arm64.whl", hash = "sha256:604d076c55e2fdd4c1c03d06dc1a31b95130010517b5019db15365ec4a405fc6", size = 36885, upload-time = "2025-08-12T05:52:54.367Z" }, + { url = "https://files.pythonhosted.org/packages/fc/f6/759ece88472157acb55fc195e5b116e06730f1b651b5b314c66291729193/wrapt-1.17.3-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:a47681378a0439215912ef542c45a783484d4dd82bac412b71e59cf9c0e1cea0", size = 54003, upload-time = "2025-08-12T05:51:48.627Z" }, + { url = "https://files.pythonhosted.org/packages/4f/a9/49940b9dc6d47027dc850c116d79b4155f15c08547d04db0f07121499347/wrapt-1.17.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:54a30837587c6ee3cd1a4d1c2ec5d24e77984d44e2f34547e2323ddb4e22eb77", size = 39025, upload-time = "2025-08-12T05:51:37.156Z" }, + { url = "https://files.pythonhosted.org/packages/45/35/6a08de0f2c96dcdd7fe464d7420ddb9a7655a6561150e5fc4da9356aeaab/wrapt-1.17.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:16ecf15d6af39246fe33e507105d67e4b81d8f8d2c6598ff7e3ca1b8a37213f7", size = 39108, upload-time = "2025-08-12T05:51:58.425Z" }, + { url = "https://files.pythonhosted.org/packages/0c/37/6faf15cfa41bf1f3dba80cd3f5ccc6622dfccb660ab26ed79f0178c7497f/wrapt-1.17.3-cp313-cp313-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:6fd1ad24dc235e4ab88cda009e19bf347aabb975e44fd5c2fb22a3f6e4141277", size = 88072, upload-time = "2025-08-12T05:52:37.53Z" }, + { url = "https://files.pythonhosted.org/packages/78/f2/efe19ada4a38e4e15b6dff39c3e3f3f73f5decf901f66e6f72fe79623a06/wrapt-1.17.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0ed61b7c2d49cee3c027372df5809a59d60cf1b6c2f81ee980a091f3afed6a2d", size = 88214, upload-time = "2025-08-12T05:52:15.886Z" }, + { url = "https://files.pythonhosted.org/packages/40/90/ca86701e9de1622b16e09689fc24b76f69b06bb0150990f6f4e8b0eeb576/wrapt-1.17.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:423ed5420ad5f5529db9ce89eac09c8a2f97da18eb1c870237e84c5a5c2d60aa", size = 87105, upload-time = "2025-08-12T05:52:17.914Z" }, + { url = "https://files.pythonhosted.org/packages/fd/e0/d10bd257c9a3e15cbf5523025252cc14d77468e8ed644aafb2d6f54cb95d/wrapt-1.17.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:e01375f275f010fcbf7f643b4279896d04e571889b8a5b3f848423d91bf07050", size = 87766, upload-time = "2025-08-12T05:52:39.243Z" }, + { url = "https://files.pythonhosted.org/packages/e8/cf/7d848740203c7b4b27eb55dbfede11aca974a51c3d894f6cc4b865f42f58/wrapt-1.17.3-cp313-cp313-win32.whl", hash = "sha256:53e5e39ff71b3fc484df8a522c933ea2b7cdd0d5d15ae82e5b23fde87d44cbd8", size = 36711, upload-time = "2025-08-12T05:53:10.074Z" }, + { url = "https://files.pythonhosted.org/packages/57/54/35a84d0a4d23ea675994104e667ceff49227ce473ba6a59ba2c84f250b74/wrapt-1.17.3-cp313-cp313-win_amd64.whl", hash = "sha256:1f0b2f40cf341ee8cc1a97d51ff50dddb9fcc73241b9143ec74b30fc4f44f6cb", size = 38885, upload-time = "2025-08-12T05:53:08.695Z" }, + { url = "https://files.pythonhosted.org/packages/01/77/66e54407c59d7b02a3c4e0af3783168fff8e5d61def52cda8728439d86bc/wrapt-1.17.3-cp313-cp313-win_arm64.whl", hash = "sha256:7425ac3c54430f5fc5e7b6f41d41e704db073309acfc09305816bc6a0b26bb16", size = 36896, upload-time = "2025-08-12T05:52:55.34Z" }, + { url = "https://files.pythonhosted.org/packages/02/a2/cd864b2a14f20d14f4c496fab97802001560f9f41554eef6df201cd7f76c/wrapt-1.17.3-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:cf30f6e3c077c8e6a9a7809c94551203c8843e74ba0c960f4a98cd80d4665d39", size = 54132, upload-time = "2025-08-12T05:51:49.864Z" }, + { url = "https://files.pythonhosted.org/packages/d5/46/d011725b0c89e853dc44cceb738a307cde5d240d023d6d40a82d1b4e1182/wrapt-1.17.3-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:e228514a06843cae89621384cfe3a80418f3c04aadf8a3b14e46a7be704e4235", size = 39091, upload-time = "2025-08-12T05:51:38.935Z" }, + { url = "https://files.pythonhosted.org/packages/2e/9e/3ad852d77c35aae7ddebdbc3b6d35ec8013af7d7dddad0ad911f3d891dae/wrapt-1.17.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:5ea5eb3c0c071862997d6f3e02af1d055f381b1d25b286b9d6644b79db77657c", size = 39172, upload-time = "2025-08-12T05:51:59.365Z" }, + { url = "https://files.pythonhosted.org/packages/c3/f7/c983d2762bcce2326c317c26a6a1e7016f7eb039c27cdf5c4e30f4160f31/wrapt-1.17.3-cp314-cp314-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:281262213373b6d5e4bb4353bc36d1ba4084e6d6b5d242863721ef2bf2c2930b", size = 87163, upload-time = "2025-08-12T05:52:40.965Z" }, + { url = "https://files.pythonhosted.org/packages/e4/0f/f673f75d489c7f22d17fe0193e84b41540d962f75fce579cf6873167c29b/wrapt-1.17.3-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:dc4a8d2b25efb6681ecacad42fca8859f88092d8732b170de6a5dddd80a1c8fa", size = 87963, upload-time = "2025-08-12T05:52:20.326Z" }, + { url = "https://files.pythonhosted.org/packages/df/61/515ad6caca68995da2fac7a6af97faab8f78ebe3bf4f761e1b77efbc47b5/wrapt-1.17.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:373342dd05b1d07d752cecbec0c41817231f29f3a89aa8b8843f7b95992ed0c7", size = 86945, upload-time = "2025-08-12T05:52:21.581Z" }, + { url = "https://files.pythonhosted.org/packages/d3/bd/4e70162ce398462a467bc09e768bee112f1412e563620adc353de9055d33/wrapt-1.17.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:d40770d7c0fd5cbed9d84b2c3f2e156431a12c9a37dc6284060fb4bec0b7ffd4", size = 86857, upload-time = "2025-08-12T05:52:43.043Z" }, + { url = "https://files.pythonhosted.org/packages/2b/b8/da8560695e9284810b8d3df8a19396a6e40e7518059584a1a394a2b35e0a/wrapt-1.17.3-cp314-cp314-win32.whl", hash = "sha256:fbd3c8319de8e1dc79d346929cd71d523622da527cca14e0c1d257e31c2b8b10", size = 37178, upload-time = "2025-08-12T05:53:12.605Z" }, + { url = "https://files.pythonhosted.org/packages/db/c8/b71eeb192c440d67a5a0449aaee2310a1a1e8eca41676046f99ed2487e9f/wrapt-1.17.3-cp314-cp314-win_amd64.whl", hash = "sha256:e1a4120ae5705f673727d3253de3ed0e016f7cd78dc463db1b31e2463e1f3cf6", size = 39310, upload-time = "2025-08-12T05:53:11.106Z" }, + { url = "https://files.pythonhosted.org/packages/45/20/2cda20fd4865fa40f86f6c46ed37a2a8356a7a2fde0773269311f2af56c7/wrapt-1.17.3-cp314-cp314-win_arm64.whl", hash = "sha256:507553480670cab08a800b9463bdb881b2edeed77dc677b0a5915e6106e91a58", size = 37266, upload-time = "2025-08-12T05:52:56.531Z" }, + { url = "https://files.pythonhosted.org/packages/77/ed/dd5cf21aec36c80443c6f900449260b80e2a65cf963668eaef3b9accce36/wrapt-1.17.3-cp314-cp314t-macosx_10_13_universal2.whl", hash = "sha256:ed7c635ae45cfbc1a7371f708727bf74690daedc49b4dba310590ca0bd28aa8a", size = 56544, upload-time = "2025-08-12T05:51:51.109Z" }, + { url = "https://files.pythonhosted.org/packages/8d/96/450c651cc753877ad100c7949ab4d2e2ecc4d97157e00fa8f45df682456a/wrapt-1.17.3-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:249f88ed15503f6492a71f01442abddd73856a0032ae860de6d75ca62eed8067", size = 40283, upload-time = "2025-08-12T05:51:39.912Z" }, + { url = "https://files.pythonhosted.org/packages/d1/86/2fcad95994d9b572db57632acb6f900695a648c3e063f2cd344b3f5c5a37/wrapt-1.17.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:5a03a38adec8066d5a37bea22f2ba6bbf39fcdefbe2d91419ab864c3fb515454", size = 40366, upload-time = "2025-08-12T05:52:00.693Z" }, + { url = "https://files.pythonhosted.org/packages/64/0e/f4472f2fdde2d4617975144311f8800ef73677a159be7fe61fa50997d6c0/wrapt-1.17.3-cp314-cp314t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:5d4478d72eb61c36e5b446e375bbc49ed002430d17cdec3cecb36993398e1a9e", size = 108571, upload-time = "2025-08-12T05:52:44.521Z" }, + { url = "https://files.pythonhosted.org/packages/cc/01/9b85a99996b0a97c8a17484684f206cbb6ba73c1ce6890ac668bcf3838fb/wrapt-1.17.3-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:223db574bb38637e8230eb14b185565023ab624474df94d2af18f1cdb625216f", size = 113094, upload-time = "2025-08-12T05:52:22.618Z" }, + { url = "https://files.pythonhosted.org/packages/25/02/78926c1efddcc7b3aa0bc3d6b33a822f7d898059f7cd9ace8c8318e559ef/wrapt-1.17.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:e405adefb53a435f01efa7ccdec012c016b5a1d3f35459990afc39b6be4d5056", size = 110659, upload-time = "2025-08-12T05:52:24.057Z" }, + { url = "https://files.pythonhosted.org/packages/dc/ee/c414501ad518ac3e6fe184753632fe5e5ecacdcf0effc23f31c1e4f7bfcf/wrapt-1.17.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:88547535b787a6c9ce4086917b6e1d291aa8ed914fdd3a838b3539dc95c12804", size = 106946, upload-time = "2025-08-12T05:52:45.976Z" }, + { url = "https://files.pythonhosted.org/packages/be/44/a1bd64b723d13bb151d6cc91b986146a1952385e0392a78567e12149c7b4/wrapt-1.17.3-cp314-cp314t-win32.whl", hash = "sha256:41b1d2bc74c2cac6f9074df52b2efbef2b30bdfe5f40cb78f8ca22963bc62977", size = 38717, upload-time = "2025-08-12T05:53:15.214Z" }, + { url = "https://files.pythonhosted.org/packages/79/d9/7cfd5a312760ac4dd8bf0184a6ee9e43c33e47f3dadc303032ce012b8fa3/wrapt-1.17.3-cp314-cp314t-win_amd64.whl", hash = "sha256:73d496de46cd2cdbdbcce4ae4bcdb4afb6a11234a1df9c085249d55166b95116", size = 41334, upload-time = "2025-08-12T05:53:14.178Z" }, + { url = "https://files.pythonhosted.org/packages/46/78/10ad9781128ed2f99dbc474f43283b13fea8ba58723e98844367531c18e9/wrapt-1.17.3-cp314-cp314t-win_arm64.whl", hash = "sha256:f38e60678850c42461d4202739f9bf1e3a737c7ad283638251e79cc49effb6b6", size = 38471, upload-time = "2025-08-12T05:52:57.784Z" }, + { url = "https://files.pythonhosted.org/packages/1f/f6/a933bd70f98e9cf3e08167fc5cd7aaaca49147e48411c0bd5ae701bb2194/wrapt-1.17.3-py3-none-any.whl", hash = "sha256:7171ae35d2c33d326ac19dd8facb1e82e5fd04ef8c6c0e394d7af55a55051c22", size = 23591, upload-time = "2025-08-12T05:53:20.674Z" }, +] + [[package]] name = "wsproto" version = "1.2.0" From cdd72003423b0c1dbb8951f6aafb919675a54212 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Thu, 6 Nov 2025 19:23:57 +0000 Subject: [PATCH 17/66] Revert "Fix Docker cache tag length exceeding 128 character limit" This reverts commit 3ba1e46f29b01376d90c23087562373eb2f8e5d8. --- .github/workflows/build-swe-bench-images.yml | 6 --- .github/workflows/fix-cache-tag-length.patch | 51 -------------------- 2 files changed, 57 deletions(-) delete mode 100644 .github/workflows/fix-cache-tag-length.patch diff --git a/.github/workflows/build-swe-bench-images.yml b/.github/workflows/build-swe-bench-images.yml index e56d3343..bd4d741a 100644 --- a/.github/workflows/build-swe-bench-images.yml +++ b/.github/workflows/build-swe-bench-images.yml @@ -96,12 +96,6 @@ jobs: with: enable-cache: true - - name: Apply fix for Docker cache tag length limit - run: | - cd vendor/software-agent-sdk - git apply ../../.github/workflows/fix-cache-tag-length.patch - echo "Applied patch to fix cache tag length limit" - - name: Install dependencies run: | make build diff --git a/.github/workflows/fix-cache-tag-length.patch b/.github/workflows/fix-cache-tag-length.patch deleted file mode 100644 index 72306970..00000000 --- a/.github/workflows/fix-cache-tag-length.patch +++ /dev/null @@ -1,51 +0,0 @@ -diff --git a/openhands-agent-server/openhands/agent_server/docker/build.py b/openhands-agent-server/openhands/agent_server/docker/build.py -index 1cc9cd4d..c5023d79 100755 ---- a/openhands-agent-server/openhands/agent_server/docker/build.py -+++ b/openhands-agent-server/openhands/agent_server/docker/build.py -@@ -14,6 +14,7 @@ Single-entry build helper for agent-server images. - """ - - import argparse -+import hashlib - import os - import re - import shutil -@@ -284,7 +285,37 @@ class BuildOptions(BaseModel): - - @property - def cache_tags(self) -> tuple[str, str]: -- base = f"buildcache-{self.target}-{self.base_image_slug}" -+ # Docker image tags have a 128-character limit. -+ # If the base slug is too long, hash it to create a shorter unique identifier. -+ MAX_TAG_LENGTH = 128 -+ base_slug = self.base_image_slug -+ -+ # Reserve space for prefix, branch, and separators -+ prefix = f"buildcache-{self.target}-" -+ branch_suffix = ( -+ f"-{_sanitize_branch(GIT_REF)}" -+ if GIT_REF not in ("main", "refs/heads/main", "unknown") -+ else "" -+ ) -+ main_suffix = "-main" if GIT_REF in ("main", "refs/heads/main") else "" -+ -+ # Calculate available space for base_slug -+ reserved = len(prefix) + max(len(branch_suffix), len(main_suffix)) -+ available = MAX_TAG_LENGTH - reserved -+ -+ # If base_slug is too long, use a hash -+ if len(base_slug) > available: -+ # Use first 8 chars of SHA256 hash for uniqueness while keeping it short -+ hash_digest = hashlib.sha256(base_slug.encode()).hexdigest()[:12] -+ base_slug_short = hash_digest -+ logger.debug( -+ f"[build] Base image slug too long ({len(base_slug)} chars), " -+ f"using hash: {base_slug_short}" -+ ) -+ else: -+ base_slug_short = base_slug -+ -+ base = f"{prefix}{base_slug_short}" - if GIT_REF in ("main", "refs/heads/main"): - return f"{base}-main", base - elif GIT_REF != "unknown": From 001bcee9f4a93c1a0695045f364aad6ebcc99a0a Mon Sep 17 00:00:00 2001 From: openhands Date: Thu, 6 Nov 2025 19:39:52 +0000 Subject: [PATCH 18/66] Fix log file mixing issue by using ProcessPoolExecutor The build workflow was experiencing log file corruption and I/O errors due to concurrent builds writing to the wrong log files. This was caused by using ThreadPoolExecutor with contextlib.redirect_stdout/stderr, which only provides thread-local redirection of Python-level writes. The SDK's build() function spawns subprocesses and uses logger.info()/warning() to output build logs. Logger handlers write to process-wide file descriptors, not thread-local redirected streams, causing output from concurrent threads to: - Write to the wrong log files - Attempt writing to closed file handles - Result in ValueError('I/O operation on closed file.') Solution: Replace ThreadPoolExecutor with ProcessPoolExecutor to provide complete process-level isolation with separate stdout/stderr/logging per build. The additional overhead is negligible compared to Docker build time. Changes: - Import ProcessPoolExecutor instead of ThreadPoolExecutor - Move build_one_fn to module level (_build_with_logging) for pickle support - Update executor initialization to use ProcessPoolExecutor - Add explanatory comments about isolation requirements Co-authored-by: openhands --- benchmarks/swe_bench/build_images.py | 30 ++++++++++++++++++---------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/benchmarks/swe_bench/build_images.py b/benchmarks/swe_bench/build_images.py index 08054c35..48a408de 100644 --- a/benchmarks/swe_bench/build_images.py +++ b/benchmarks/swe_bench/build_images.py @@ -12,7 +12,7 @@ import contextlib import io import sys -from concurrent.futures import ThreadPoolExecutor, as_completed +from concurrent.futures import ProcessPoolExecutor, as_completed from datetime import UTC, datetime from pathlib import Path from threading import Lock @@ -162,6 +162,19 @@ def _default_build_output_dir( return root +def _build_with_logging( + base: str, log_dir: Path, args: argparse.Namespace +) -> BuildOutput: + """ + Module-level function for building a single image with output capture. + Must be at module level to be picklable for ProcessPoolExecutor. + """ + with capture_output(base, log_dir) as log_path: + result = build_one(base, args) + result.log_path = str(log_path) + return result + + def _update_pbar( pbar: tqdm, successes: int, @@ -191,12 +204,6 @@ def main(argv: list[str]) -> int: manifest_path = BUILD_DIR / "manifest.jsonl" manifest_path.parent.mkdir(parents=True, exist_ok=True) - def build_one_fn(base: str, args) -> BuildOutput: - with capture_output(base, BUILD_LOG_DIR) as log_path: - result = build_one(base, args) - result.log_path = str(log_path) - return result - if args.dry_run: print("\n".join(bases)) return 0 @@ -212,13 +219,14 @@ def build_one_fn(base: str, args) -> BuildOutput: ): _update_pbar(pbar, successes, failures, 0, None, "Queueing") - # Single unified path: ThreadPoolExecutor( max_workers = args.max_workers ), - # even if it's 1 - with ThreadPoolExecutor(max_workers=args.max_workers) as ex: + # Single unified path: ProcessPoolExecutor( max_workers = args.max_workers ), + # even if it's 1. Using processes instead of threads ensures proper isolation + # of stdout/stderr and logging handlers, preventing output mixing between builds. + with ProcessPoolExecutor(max_workers=args.max_workers) as ex: futures = {} for base in bases: in_progress.add(base) - fut = ex.submit(build_one_fn, base, args) + fut = ex.submit(_build_with_logging, base, BUILD_LOG_DIR, args) futures[fut] = base _update_pbar( From 271b5271a1c22375a85f3b771c78189764439a8f Mon Sep 17 00:00:00 2001 From: openhands Date: Thu, 6 Nov 2025 20:33:36 +0000 Subject: [PATCH 19/66] Improve Docker image tagging for reproducibility This commit improves the tagging system for SWE-Bench Docker images to enable better reproducibility and clarity. ## Changes ### 1. Benchmarks Build System **benchmarks/swe_bench/build_images.py:** - Added `get_sdk_commit_hash()`: Extracts 7-char SDK submodule commit hash - Added `extract_instance_id()`: Parses SWE-Bench base images to extract instance IDs - Modified `main()`: Sets SDK_VERSION_OVERRIDE env var with SDK commit hash - Modified `build_one()`: - Generates custom tags: `swebench-{instance_id}` - Disables versioned tags via `include_versioned_tag=False` ### 2. SDK Submodule Update **vendor/software-agent-sdk:** Updated to commit 77d50e61 which includes: - `SDK_VERSION_OVERRIDE` environment variable support - `include_versioned_tag` option in BuildOptions - Target-based tag suffixes (replaces `-dev` suffix) - See: https://github.com/OpenHands/software-agent-sdk/pull/1088 ### 3. Documentation **TAGGING_CHANGES.md:** Comprehensive documentation explaining: - Why these changes are needed (submodule git context issues) - Tag format comparison (before/after) - Benefits (reproducibility, usability, maintainability) - Implementation details and examples ## Tag Format ### Before ``` v1.0.0_docker.io_s_swebench_s_sweb.eval.x86_64.django_1776_django-12155_tag_latest_source-minimal-dev ``` - 137 characters - Package version (non-reproducible) - Unclear `-dev` suffix ### After ``` a612c0a-swebench-django-12155-source-minimal main-swebench-django-12155-source-minimal ``` - 84 characters (39% shorter) - Exact commit hash (reproducible) - Clear target indication ## Benefits 1. **Reproducibility**: Git commit hash ensures exact SDK version tracking 2. **Clarity**: Instance ID and target clearly visible in tag 3. **Consistency**: All builds use same suffix pattern 4. **Backward Compatible**: SDK changes only apply when explicitly enabled ## Related - SDK PR: https://github.com/OpenHands/software-agent-sdk/pull/1088 - Issue: Improve SWE-Bench image build workflow Co-authored-by: openhands --- TAGGING_CHANGES.md | 185 +++++++++++++++++++++++++++ benchmarks/swe_bench/build_images.py | 64 ++++++++- vendor/software-agent-sdk | 2 +- 3 files changed, 249 insertions(+), 2 deletions(-) create mode 100644 TAGGING_CHANGES.md diff --git a/TAGGING_CHANGES.md b/TAGGING_CHANGES.md new file mode 100644 index 00000000..79ee855a --- /dev/null +++ b/TAGGING_CHANGES.md @@ -0,0 +1,185 @@ +# Docker Image Tagging Improvements + +## Summary + +This change replaces the long, auto-generated versioned tags with short, meaningful tags that include: +- **SDK commit hash** (exact reproducibility) +- **SWE-Bench instance ID** (clear identification) + +## Changes Made + +### 1. SDK Build System (`vendor/software-agent-sdk/.../docker/build.py`) + +**Added three features:** + +1. **`SDK_VERSION_OVERRIDE` environment variable** + - Allows overriding the package version with a commit hash + - Falls back to `importlib.metadata.version("openhands-sdk")` if not set + - Critical for git submodule contexts where package version != actual commit + - Follows existing pattern (SDK already uses `GITHUB_REF` env var) + +2. **`include_versioned_tag` option in BuildOptions** + - When `False`, skips the long versioned tag + - Defaults to `True` for backward compatibility + - Gives consumers control over tag format + +3. **Target-based tag suffixes** (replaces `-dev` suffix) + - All tags now include `-{target}` suffix: `-binary`, `-source`, `-binary-minimal`, `-source-minimal` + - More descriptive than previous `-dev` suffix (which only applied to source builds) + - Makes tag meaning immediately clear without needing to check build config + - Removed deprecated `is_dev` property + +### 2. Benchmarks Build Script (`benchmarks/swe_bench/build_images.py`) + +**Added two functions:** + +1. **`get_sdk_commit_hash()`** + - Extracts the 7-character commit hash from SDK submodule + - Returns "unknown" if git fails (with warning) + +2. **`extract_instance_id(base_image)`** + - Parses SWE-Bench base image name to extract instance ID + - Examples: + - `...django_1776_django-12155:latest` → `django-12155` + - `...sympy_1776_sympy-18189:latest` → `sympy-18189` + - `...scikit-learn_3742_scikit-learn-25973:latest` → `scikit-learn-25973` + +**Modified build flow:** + +1. At startup: Set `SDK_VERSION_OVERRIDE` env var to SDK commit hash +2. Per image: Extract instance ID and create custom tag `swebench-{instance_id}` +3. Pass `include_versioned_tag=False` to disable long tag + +## Tag Format Comparison + +### Before (Old Format) +``` +ghcr.io/openhands/eval-agent-server:v1.0.0_docker.io_s_swebench_s_sweb.eval.x86_64.django_1776_django-12155_tag_latest_source-minimal-dev +``` +- **Length**: 137 characters +- **Includes**: Package version (v1.0.0), full base image path, target +- **Problem**: No git commit info, hard to parse + +### After (New Format) +``` +ghcr.io/openhands/eval-agent-server:a612c0a-swebench-django-12155-source-minimal +ghcr.io/openhands/eval-agent-server:main-swebench-django-12155-source-minimal +``` +- **Length**: 84 characters (**39% shorter**) +- **Includes**: SDK commit hash, instance ID, build target +- **Benefits**: + - Exact reproducibility (commit hash) + - Easy to parse and filter + - Clear instance identification + - Explicit target indication (no more ambiguous `-dev` suffix) + +## Tag Generation Logic + +The SDK's `all_tags` property generates: + +1. **Commit-based tag**: `{image}:{SHORT_SHA}-{custom_tag}-{target}{arch_suffix}` + - `SHORT_SHA` = First 7 chars of SDK commit (from `SDK_VERSION_OVERRIDE`) + - `custom_tag` = `swebench-{instance_id}` + - `target` = Build target (`binary`, `source`, `binary-minimal`, `source-minimal`) + - Example: `a612c0a-swebench-django-12155-source-minimal` + +2. **Main branch tag** (if on main): `{image}:main-{custom_tag}-{target}{arch_suffix}` + - Example: `main-swebench-django-12155-source-minimal` + +3. **Versioned tag** (now disabled): `{image}:{versioned_tag}-{target}{arch_suffix}` + - Skipped when `include_versioned_tag=False` + +All tags now include `-{target}` suffix for clarity (replaces old `-dev` suffix pattern). + +## Benefits + +### 1. Reproducibility +- Git commit hash ensures exact SDK version tracking +- Can reconstruct exact build environment from tag alone +- No ambiguity (version 1.0.0 could be many commits) + +### 2. Usability +- **39% shorter tags** (137 → 84 chars) +- Easy to filter: `docker images | grep a612c0a` +- Easy to identify: `swebench-django-12155-source-minimal` is self-documenting +- Explicit target indication (no more guessing what `-dev` means) +- Fits in terminal/log output better + +### 3. Maintainability +- SDK changes are backward compatible (env var is optional) +- Benchmarks repo has full control over tag format +- Can easily extend with more metadata later + +## Example Build Command + +```bash +uv run benchmarks/swe_bench/build_images.py \ + --dataset princeton-nlp/SWE-bench_Verified \ + --split test \ + --image ghcr.io/openhands/eval-agent-server \ + --target source-minimal \ + --platforms linux/amd64 \ + --push \ + --max-workers 2 +``` + +## Testing + +To test the tagging logic without building: + +```python +from benchmarks.swe_bench.build_images import extract_instance_id, get_sdk_commit_hash + +# Test instance ID extraction +base = "docker.io/swebench/sweb.eval.x86_64.django_1776_django-12155:latest" +print(extract_instance_id(base)) # → django-12155 + +# Get SDK commit +print(get_sdk_commit_hash()) # → a612c0a +``` + +## Migration Notes + +### For existing workflows: +- No changes needed - SDK defaults to old behavior +- Opt-in by setting `include_versioned_tag=False` + +### For CI/CD: +- New tags will be generated automatically +- Old tags (if any exist) remain unchanged +- Can coexist during transition period + +### For consumers: +- Update image references to use new tag format +- Can filter by SDK version: `grep a612c0a` +- Can filter by instance: `grep django-12155` + +## Future Enhancements + +Possible additions: +1. **Docker labels** for metadata (see `docker inspect`) +2. **Benchmarks commit** in tag or label +3. **Build timestamp** in labels +4. **Platform/architecture** in tag (already supported via `arch` param) + +## Files Changed + +1. `vendor/software-agent-sdk/openhands-agent-server/openhands/agent_server/docker/build.py` + - Added `SDK_VERSION_OVERRIDE` env var support to `_sdk_version()` + - Added `include_versioned_tag` field to `BuildOptions` + - Changed tag suffix logic: All tags get `-{target}` suffix (replaces `-dev`) + - Removed deprecated `is_dev` property + - Modified `all_tags` property to respect new flag and suffix logic + +2. `benchmarks/swe_bench/build_images.py` + - Added `get_sdk_commit_hash()` function + - Added `extract_instance_id()` function + - Modified `main()` to set `SDK_VERSION_OVERRIDE` + - Modified `build_one()` to use custom tags and disable versioned tag + +## Related PRs + +- **SDK Changes**: https://github.com/OpenHands/software-agent-sdk/pull/1088 + - Adds `SDK_VERSION_OVERRIDE` support + - Changes `-dev` suffix to `-{target}` for all builds (more descriptive) + - Adds `include_versioned_tag` option diff --git a/benchmarks/swe_bench/build_images.py b/benchmarks/swe_bench/build_images.py index 48a408de..24c743cc 100644 --- a/benchmarks/swe_bench/build_images.py +++ b/benchmarks/swe_bench/build_images.py @@ -11,6 +11,8 @@ import argparse import contextlib import io +import os +import subprocess import sys from concurrent.futures import ProcessPoolExecutor, as_completed from datetime import UTC, datetime @@ -30,6 +32,52 @@ logger = get_logger(__name__) +def get_sdk_commit_hash() -> str: + """Get the short commit hash of the SDK submodule.""" + sdk_path = Path(__file__).parent.parent.parent / "vendor" / "software-agent-sdk" + try: + result = subprocess.run( + ["git", "rev-parse", "--short=7", "HEAD"], + cwd=sdk_path, + capture_output=True, + text=True, + check=True, + ) + return result.stdout.strip() + except subprocess.CalledProcessError: + logger.warning("Failed to get SDK commit hash, using 'unknown'") + return "unknown" + + +def extract_instance_id(base_image: str) -> str: + """ + Extract SWE-Bench instance ID from base image name. + + Example: + docker.io/swebench/sweb.eval.x86_64.django_1776_django-12155:latest + -> django-12155 + + docker.io/swebench/sweb.eval.x86_64.sympy_1776_sympy-18189:latest + -> sympy-18189 + + docker.io/swebench/sweb.eval.x86_64.scikit-learn_3742_scikit-learn-25973:latest + -> scikit-learn-25973 + """ + # SWE-Bench images pattern: ..._{repo}_{version}_{instance_id}:tag + # We want to extract just the instance_id (last part before colon) + # Instance ID format: {repo}-{number} or {repo}_{number} + + parts = base_image.split("_") + if len(parts) >= 2: + # Last part contains the instance ID and tag + last_part = parts[-1] # e.g., "django-12155:latest" + instance_id = last_part.split(":")[0] # Remove tag + return instance_id + + logger.warning(f"Could not extract instance ID from: {base_image}") + return "unknown" + + @contextlib.contextmanager def capture_output(base_name: str, out_dir: Path): """ @@ -138,13 +186,22 @@ class BuildOutput(BaseModel): def build_one(base_image: str, args: argparse.Namespace) -> BuildOutput: + # Extract instance ID and build custom tag + instance_id = extract_instance_id(base_image) + custom_tag = f"swebench-{instance_id}" + + # Combine with user-provided custom tags if any + if args.custom_tags: + custom_tag = f"{custom_tag},{args.custom_tags}" + opts = BuildOptions( base_image=base_image, - custom_tags=args.custom_tags, + custom_tags=custom_tag, image=args.image, target=args.target, platforms=[p.strip() for p in args.platforms.split(",") if p.strip()], push=args.push, + include_versioned_tag=False, # Disable long versioned tag ) tags = build(opts) return BuildOutput(base_image=base_image, tags=tags, error=None) @@ -195,6 +252,11 @@ def main(argv: list[str]) -> int: parser = extend_parser() args = parser.parse_args(argv) + # Set SDK commit hash as version override for image tags + sdk_commit = get_sdk_commit_hash() + os.environ["SDK_VERSION_OVERRIDE"] = sdk_commit + logger.info(f"Using SDK commit: {sdk_commit}") + bases: list[str] = collect_unique_base_images( args.dataset, args.split, args.docker_image_prefix, args.n_limit ) diff --git a/vendor/software-agent-sdk b/vendor/software-agent-sdk index a612c0a6..77d50e61 160000 --- a/vendor/software-agent-sdk +++ b/vendor/software-agent-sdk @@ -1 +1 @@ -Subproject commit a612c0a685fa96bc725085ac81c59492d4a88974 +Subproject commit 77d50e61093d7725893996fd1d6e528b9a6220a3 From 92f04c1fb3ecbd6f0b6183770c11be77d3c8b6e9 Mon Sep 17 00:00:00 2001 From: openhands Date: Thu, 6 Nov 2025 20:41:12 +0000 Subject: [PATCH 20/66] refactor: omit target suffix for binary builds (default case) Updated SDK submodule to bc25aa0d which omits the target suffix for binary builds since it's the default/common case. This keeps tags cleaner. Tag examples: - Binary: a612c0a-swebench-django-12155 (no suffix) - Source: a612c0a-swebench-django-12155-source - Source-minimal: a612c0a-swebench-django-12155-source-minimal Updated TAGGING_CHANGES.md to reflect this behavior with updated examples showing both binary and source-minimal formats. Co-authored-by: openhands --- TAGGING_CHANGES.md | 41 ++++++++++++++++++++++++++------------- vendor/software-agent-sdk | 2 +- 2 files changed, 29 insertions(+), 14 deletions(-) diff --git a/TAGGING_CHANGES.md b/TAGGING_CHANGES.md index 79ee855a..534189c1 100644 --- a/TAGGING_CHANGES.md +++ b/TAGGING_CHANGES.md @@ -24,7 +24,8 @@ This change replaces the long, auto-generated versioned tags with short, meaning - Gives consumers control over tag format 3. **Target-based tag suffixes** (replaces `-dev` suffix) - - All tags now include `-{target}` suffix: `-binary`, `-source`, `-binary-minimal`, `-source-minimal` + - Non-binary builds include `-{target}` suffix: `-source`, `-binary-minimal`, `-source-minimal` + - Binary builds have no suffix (it's the default/common case) - More descriptive than previous `-dev` suffix (which only applied to source builds) - Makes tag meaning immediately clear without needing to check build config - Removed deprecated `is_dev` property @@ -61,35 +62,49 @@ ghcr.io/openhands/eval-agent-server:v1.0.0_docker.io_s_swebench_s_sweb.eval.x86_ - **Problem**: No git commit info, hard to parse ### After (New Format) + +For source-minimal (most common for SWE-Bench): ``` ghcr.io/openhands/eval-agent-server:a612c0a-swebench-django-12155-source-minimal ghcr.io/openhands/eval-agent-server:main-swebench-django-12155-source-minimal ``` - **Length**: 84 characters (**39% shorter**) -- **Includes**: SDK commit hash, instance ID, build target -- **Benefits**: + +For binary (no suffix, it's the default): +``` +ghcr.io/openhands/eval-agent-server:a612c0a-swebench-django-12155 +ghcr.io/openhands/eval-agent-server:main-swebench-django-12155 +``` +- **Length**: 69 characters (**50% shorter**) + +**Benefits**: - Exact reproducibility (commit hash) - Easy to parse and filter - Clear instance identification - - Explicit target indication (no more ambiguous `-dev` suffix) + - Clean tags for common case (binary has no suffix) ## Tag Generation Logic The SDK's `all_tags` property generates: -1. **Commit-based tag**: `{image}:{SHORT_SHA}-{custom_tag}-{target}{arch_suffix}` +1. **Commit-based tag**: `{image}:{SHORT_SHA}-{custom_tag}[-{target}]{arch_suffix}` - `SHORT_SHA` = First 7 chars of SDK commit (from `SDK_VERSION_OVERRIDE`) - `custom_tag` = `swebench-{instance_id}` - - `target` = Build target (`binary`, `source`, `binary-minimal`, `source-minimal`) - - Example: `a612c0a-swebench-django-12155-source-minimal` + - `target` = Build target (omitted for `binary`, included for others) + - Examples: + - Binary: `a612c0a-swebench-django-12155` + - Source: `a612c0a-swebench-django-12155-source` + - Source-minimal: `a612c0a-swebench-django-12155-source-minimal` -2. **Main branch tag** (if on main): `{image}:main-{custom_tag}-{target}{arch_suffix}` - - Example: `main-swebench-django-12155-source-minimal` +2. **Main branch tag** (if on main): `{image}:main-{custom_tag}[-{target}]{arch_suffix}` + - Examples: + - Binary: `main-swebench-django-12155` + - Source-minimal: `main-swebench-django-12155-source-minimal` -3. **Versioned tag** (now disabled): `{image}:{versioned_tag}-{target}{arch_suffix}` +3. **Versioned tag** (now disabled): `{image}:{versioned_tag}[-{target}]{arch_suffix}` - Skipped when `include_versioned_tag=False` -All tags now include `-{target}` suffix for clarity (replaces old `-dev` suffix pattern). +Non-binary targets include `-{target}` suffix for clarity. Binary has no suffix (default case). ## Benefits @@ -167,7 +182,7 @@ Possible additions: 1. `vendor/software-agent-sdk/openhands-agent-server/openhands/agent_server/docker/build.py` - Added `SDK_VERSION_OVERRIDE` env var support to `_sdk_version()` - Added `include_versioned_tag` field to `BuildOptions` - - Changed tag suffix logic: All tags get `-{target}` suffix (replaces `-dev`) + - Changed tag suffix logic: Non-binary targets get `-{target}` suffix, binary gets no suffix - Removed deprecated `is_dev` property - Modified `all_tags` property to respect new flag and suffix logic @@ -181,5 +196,5 @@ Possible additions: - **SDK Changes**: https://github.com/OpenHands/software-agent-sdk/pull/1088 - Adds `SDK_VERSION_OVERRIDE` support - - Changes `-dev` suffix to `-{target}` for all builds (more descriptive) + - Changes tag suffix: binary gets no suffix, non-binary gets `-{target}` (more descriptive) - Adds `include_versioned_tag` option diff --git a/vendor/software-agent-sdk b/vendor/software-agent-sdk index 77d50e61..bc25aa0d 160000 --- a/vendor/software-agent-sdk +++ b/vendor/software-agent-sdk @@ -1 +1 @@ -Subproject commit 77d50e61093d7725893996fd1d6e528b9a6220a3 +Subproject commit bc25aa0de519591b44047061f8f402a84d322c70 From 49d96678b8efc2f025da7038b61482cbcdee1e56 Mon Sep 17 00:00:00 2001 From: openhands Date: Thu, 6 Nov 2025 21:02:25 +0000 Subject: [PATCH 21/66] fix: update SDK to use SDK_VERSION for commit tags Updates SDK submodule to 27f37dc0 which fixes an issue where SHORT_SHA was using git info from the benchmarks repo instead of the SDK repo. Now tags correctly use the SDK commit hash when SDK_VERSION_OVERRIDE is set, ensuring proper versioning in vendored/submodule contexts. Co-authored-by: openhands --- vendor/software-agent-sdk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/software-agent-sdk b/vendor/software-agent-sdk index bc25aa0d..27f37dc0 160000 --- a/vendor/software-agent-sdk +++ b/vendor/software-agent-sdk @@ -1 +1 @@ -Subproject commit bc25aa0de519591b44047061f8f402a84d322c70 +Subproject commit 27f37dc03543e9f41a07762e5bd120a0fb8d6f55 From c2711a331334744553c2ecf1e570cce5f69fe728 Mon Sep 17 00:00:00 2001 From: openhands Date: Thu, 6 Nov 2025 21:08:30 +0000 Subject: [PATCH 22/66] refactor: remove SDK_VERSION_OVERRIDE logic SDK now automatically detects its own commit hash, so we don't need to manually extract and override it. This simplifies the build script significantly: - Removed get_sdk_commit_hash() function - Removed SDK_VERSION_OVERRIDE env var setting - Removed unused imports (subprocess, os) - Updated documentation to reflect simpler approach The SDK's _sdk_version() now automatically finds the SDK repo root and gets the commit hash directly, regardless of whether it's used as a submodule or vendored dependency. Co-authored-by: openhands --- TAGGING_CHANGES.md | 44 ++++++++++++---------------- benchmarks/swe_bench/build_images.py | 24 --------------- 2 files changed, 19 insertions(+), 49 deletions(-) diff --git a/TAGGING_CHANGES.md b/TAGGING_CHANGES.md index 534189c1..1c362a36 100644 --- a/TAGGING_CHANGES.md +++ b/TAGGING_CHANGES.md @@ -10,13 +10,13 @@ This change replaces the long, auto-generated versioned tags with short, meaning ### 1. SDK Build System (`vendor/software-agent-sdk/.../docker/build.py`) -**Added three features:** +**Added two features:** -1. **`SDK_VERSION_OVERRIDE` environment variable** - - Allows overriding the package version with a commit hash - - Falls back to `importlib.metadata.version("openhands-sdk")` if not set - - Critical for git submodule contexts where package version != actual commit - - Follows existing pattern (SDK already uses `GITHUB_REF` env var) +1. **SDK_VERSION now uses git commit hash** + - `_sdk_version()` now automatically detects the SDK repo root and gets its commit hash + - Falls back to package version only if git info unavailable + - Works correctly in submodule contexts (uses SDK repo, not calling repo) + - No environment variable override needed - automatic and robust 2. **`include_versioned_tag` option in BuildOptions** - When `False`, skips the long versioned tag @@ -32,13 +32,9 @@ This change replaces the long, auto-generated versioned tags with short, meaning ### 2. Benchmarks Build Script (`benchmarks/swe_bench/build_images.py`) -**Added two functions:** +**Added one function:** -1. **`get_sdk_commit_hash()`** - - Extracts the 7-character commit hash from SDK submodule - - Returns "unknown" if git fails (with warning) - -2. **`extract_instance_id(base_image)`** +1. **`extract_instance_id(base_image)`** - Parses SWE-Bench base image name to extract instance ID - Examples: - `...django_1776_django-12155:latest` → `django-12155` @@ -47,9 +43,9 @@ This change replaces the long, auto-generated versioned tags with short, meaning **Modified build flow:** -1. At startup: Set `SDK_VERSION_OVERRIDE` env var to SDK commit hash -2. Per image: Extract instance ID and create custom tag `swebench-{instance_id}` -3. Pass `include_versioned_tag=False` to disable long tag +1. Per image: Extract instance ID and create custom tag `swebench-{instance_id}` +2. Pass `include_versioned_tag=False` to disable long tag +3. SDK automatically uses its own commit hash (no manual override needed) ## Tag Format Comparison @@ -87,8 +83,8 @@ ghcr.io/openhands/eval-agent-server:main-swebench-django-12155 The SDK's `all_tags` property generates: -1. **Commit-based tag**: `{image}:{SHORT_SHA}-{custom_tag}[-{target}]{arch_suffix}` - - `SHORT_SHA` = First 7 chars of SDK commit (from `SDK_VERSION_OVERRIDE`) +1. **Commit-based tag**: `{image}:{SDK_VERSION[:7]}-{custom_tag}[-{target}]{arch_suffix}` + - `SDK_VERSION[:7]` = First 7 chars of SDK commit hash (automatically detected) - `custom_tag` = `swebench-{instance_id}` - `target` = Build target (omitted for `binary`, included for others) - Examples: @@ -143,14 +139,11 @@ uv run benchmarks/swe_bench/build_images.py \ To test the tagging logic without building: ```python -from benchmarks.swe_bench.build_images import extract_instance_id, get_sdk_commit_hash +from benchmarks.swe_bench.build_images import extract_instance_id # Test instance ID extraction base = "docker.io/swebench/sweb.eval.x86_64.django_1776_django-12155:latest" print(extract_instance_id(base)) # → django-12155 - -# Get SDK commit -print(get_sdk_commit_hash()) # → a612c0a ``` ## Migration Notes @@ -180,21 +173,22 @@ Possible additions: ## Files Changed 1. `vendor/software-agent-sdk/openhands-agent-server/openhands/agent_server/docker/build.py` - - Added `SDK_VERSION_OVERRIDE` env var support to `_sdk_version()` + - Refactored `_sdk_version()` to automatically use SDK repo commit hash + - Added `_git_info_for_repo()` to get git info from specific directories - Added `include_versioned_tag` field to `BuildOptions` - Changed tag suffix logic: Non-binary targets get `-{target}` suffix, binary gets no suffix - Removed deprecated `is_dev` property - Modified `all_tags` property to respect new flag and suffix logic 2. `benchmarks/swe_bench/build_images.py` - - Added `get_sdk_commit_hash()` function - Added `extract_instance_id()` function - - Modified `main()` to set `SDK_VERSION_OVERRIDE` - Modified `build_one()` to use custom tags and disable versioned tag + - Removed unnecessary SDK_VERSION_OVERRIDE logic (now automatic) ## Related PRs - **SDK Changes**: https://github.com/OpenHands/software-agent-sdk/pull/1088 - - Adds `SDK_VERSION_OVERRIDE` support + - SDK_VERSION now automatically uses commit hash from SDK repo - Changes tag suffix: binary gets no suffix, non-binary gets `-{target}` (more descriptive) - Adds `include_versioned_tag` option + - Works correctly in submodule/vendored contexts diff --git a/benchmarks/swe_bench/build_images.py b/benchmarks/swe_bench/build_images.py index 24c743cc..e80fc69b 100644 --- a/benchmarks/swe_bench/build_images.py +++ b/benchmarks/swe_bench/build_images.py @@ -11,8 +11,6 @@ import argparse import contextlib import io -import os -import subprocess import sys from concurrent.futures import ProcessPoolExecutor, as_completed from datetime import UTC, datetime @@ -32,23 +30,6 @@ logger = get_logger(__name__) -def get_sdk_commit_hash() -> str: - """Get the short commit hash of the SDK submodule.""" - sdk_path = Path(__file__).parent.parent.parent / "vendor" / "software-agent-sdk" - try: - result = subprocess.run( - ["git", "rev-parse", "--short=7", "HEAD"], - cwd=sdk_path, - capture_output=True, - text=True, - check=True, - ) - return result.stdout.strip() - except subprocess.CalledProcessError: - logger.warning("Failed to get SDK commit hash, using 'unknown'") - return "unknown" - - def extract_instance_id(base_image: str) -> str: """ Extract SWE-Bench instance ID from base image name. @@ -252,11 +233,6 @@ def main(argv: list[str]) -> int: parser = extend_parser() args = parser.parse_args(argv) - # Set SDK commit hash as version override for image tags - sdk_commit = get_sdk_commit_hash() - os.environ["SDK_VERSION_OVERRIDE"] = sdk_commit - logger.info(f"Using SDK commit: {sdk_commit}") - bases: list[str] = collect_unique_base_images( args.dataset, args.split, args.docker_image_prefix, args.n_limit ) From 6d6845ee074980b90a62270f003b977aa8c3b8f3 Mon Sep 17 00:00:00 2001 From: openhands Date: Thu, 6 Nov 2025 21:08:39 +0000 Subject: [PATCH 23/66] chore: update SDK to commit 85e436df Update SDK submodule to include automatic SDK_VERSION detection. SDK now auto-detects its own commit hash without requiring external override, making the tagging system fully automatic. Co-authored-by: openhands --- vendor/software-agent-sdk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/software-agent-sdk b/vendor/software-agent-sdk index 27f37dc0..85e436df 160000 --- a/vendor/software-agent-sdk +++ b/vendor/software-agent-sdk @@ -1 +1 @@ -Subproject commit 27f37dc03543e9f41a07762e5bd120a0fb8d6f55 +Subproject commit 85e436df11d5636f673d79a45ab63f1684e85a1f From 8d8ed8cbc62df048a5958cc3d84b9d52d27ca81e Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Fri, 7 Nov 2025 16:54:12 +0000 Subject: [PATCH 24/66] update agent-sdk version --- vendor/software-agent-sdk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/software-agent-sdk b/vendor/software-agent-sdk index 85e436df..204d3a4b 160000 --- a/vendor/software-agent-sdk +++ b/vendor/software-agent-sdk @@ -1 +1 @@ -Subproject commit 85e436df11d5636f673d79a45ab63f1684e85a1f +Subproject commit 204d3a4b262d47c7f9f0690636b4766a413a5715 From 8763fade1291693d0f6d0b72043371ecea90cc91 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Fri, 7 Nov 2025 16:58:15 +0000 Subject: [PATCH 25/66] improve custom tags for swebench image --- benchmarks/swe_bench/build_images.py | 33 +++++++--------------------- 1 file changed, 8 insertions(+), 25 deletions(-) diff --git a/benchmarks/swe_bench/build_images.py b/benchmarks/swe_bench/build_images.py index e80fc69b..30d489a0 100644 --- a/benchmarks/swe_bench/build_images.py +++ b/benchmarks/swe_bench/build_images.py @@ -30,33 +30,23 @@ logger = get_logger(__name__) -def extract_instance_id(base_image: str) -> str: +def extract_custom_tag(base_image: str) -> str: """ Extract SWE-Bench instance ID from base image name. Example: docker.io/swebench/sweb.eval.x86_64.django_1776_django-12155:latest - -> django-12155 + -> sweb.eval.x86_64.django_1776_django-12155 docker.io/swebench/sweb.eval.x86_64.sympy_1776_sympy-18189:latest - -> sympy-18189 + -> sweb.eval.x86_64.sympy_1776_sympy-18189 docker.io/swebench/sweb.eval.x86_64.scikit-learn_3742_scikit-learn-25973:latest - -> scikit-learn-25973 + -> sweb.eval.x86_64.scikit-learn_3742_scikit-learn-25973 """ - # SWE-Bench images pattern: ..._{repo}_{version}_{instance_id}:tag - # We want to extract just the instance_id (last part before colon) - # Instance ID format: {repo}-{number} or {repo}_{number} - - parts = base_image.split("_") - if len(parts) >= 2: - # Last part contains the instance ID and tag - last_part = parts[-1] # e.g., "django-12155:latest" - instance_id = last_part.split(":")[0] # Remove tag - return instance_id - - logger.warning(f"Could not extract instance ID from: {base_image}") - return "unknown" + name_tag = base_image.split("/")[-1] + name = name_tag.split(":")[0] + return name @contextlib.contextmanager @@ -133,7 +123,6 @@ def extend_parser() -> argparse.ArgumentParser: parser.add_argument( "--platforms", default="linux/amd64", help="Comma-separated platforms" ) - parser.add_argument("--custom-tags", default="", help="Comma-separated custom tags") parser.add_argument( "--push", action="store_true", help="Push via buildx instead of load locally" ) @@ -168,12 +157,7 @@ class BuildOutput(BaseModel): def build_one(base_image: str, args: argparse.Namespace) -> BuildOutput: # Extract instance ID and build custom tag - instance_id = extract_instance_id(base_image) - custom_tag = f"swebench-{instance_id}" - - # Combine with user-provided custom tags if any - if args.custom_tags: - custom_tag = f"{custom_tag},{args.custom_tags}" + custom_tag = extract_custom_tag(base_image) opts = BuildOptions( base_image=base_image, @@ -182,7 +166,6 @@ def build_one(base_image: str, args: argparse.Namespace) -> BuildOutput: target=args.target, platforms=[p.strip() for p in args.platforms.split(",") if p.strip()], push=args.push, - include_versioned_tag=False, # Disable long versioned tag ) tags = build(opts) return BuildOutput(base_image=base_image, tags=tags, error=None) From 99927f8f6fd8857f72ba9061448c391a94c9bad8 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Fri, 7 Nov 2025 17:04:10 +0000 Subject: [PATCH 26/66] Revert "update agent-sdk version" This reverts commit 8d8ed8cbc62df048a5958cc3d84b9d52d27ca81e. --- vendor/software-agent-sdk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/software-agent-sdk b/vendor/software-agent-sdk index 204d3a4b..85e436df 160000 --- a/vendor/software-agent-sdk +++ b/vendor/software-agent-sdk @@ -1 +1 @@ -Subproject commit 204d3a4b262d47c7f9f0690636b4766a413a5715 +Subproject commit 85e436df11d5636f673d79a45ab63f1684e85a1f From 7e3c50ef56c8a78f481017b0dee945fd3ad195cf Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Fri, 7 Nov 2025 17:05:46 +0000 Subject: [PATCH 27/66] update sha --- vendor/software-agent-sdk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/software-agent-sdk b/vendor/software-agent-sdk index 85e436df..204d3a4b 160000 --- a/vendor/software-agent-sdk +++ b/vendor/software-agent-sdk @@ -1 +1 @@ -Subproject commit 85e436df11d5636f673d79a45ab63f1684e85a1f +Subproject commit 204d3a4b262d47c7f9f0690636b4766a413a5715 From c1182973aef5ff28d29f7d6ec04f1bb144b7c2ad Mon Sep 17 00:00:00 2001 From: openhands Date: Fri, 7 Nov 2025 18:32:08 +0000 Subject: [PATCH 28/66] fix: update run_infer.py to use new SDK tag format - Replace SDK_VERSION with SHORT_SHA (renamed in SDK PR #1088) - Add extract_custom_tag() function to avoid circular import - Update get_agent_server_docker_image() to use new tag format: - Binary target: {SHORT_SHA}-{custom_tag} - Other targets: {SHORT_SHA}-{custom_tag}-{target} - Aligns with SDK's git commit-based tagging strategy Co-authored-by: openhands --- benchmarks/swe_bench/run_infer.py | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/benchmarks/swe_bench/run_infer.py b/benchmarks/swe_bench/run_infer.py index f9562ffb..782e2ca0 100644 --- a/benchmarks/swe_bench/run_infer.py +++ b/benchmarks/swe_bench/run_infer.py @@ -16,7 +16,7 @@ EvalMetadata, EvalOutput, ) -from openhands.agent_server.docker.build import SDK_VERSION, _base_slug +from openhands.agent_server.docker.build import SHORT_SHA from openhands.sdk import LLM, Agent, Conversation, get_logger from openhands.sdk.workspace import RemoteWorkspace from openhands.tools.preset.default import get_default_tools @@ -26,6 +26,19 @@ logger = get_logger(__name__) +def extract_custom_tag(base_image: str) -> str: + """ + Extract SWE-Bench instance ID from base image name. + + Example: + docker.io/swebench/sweb.eval.x86_64.django_1776_django-12155:latest + -> sweb.eval.x86_64.django_1776_django-12155 + """ + name_tag = base_image.split("/")[-1] + name = name_tag.split(":")[0] + return name + + def get_official_docker_image( instance_id: str, docker_image_prefix="docker.io/swebench/", @@ -45,10 +58,14 @@ def get_agent_server_docker_image( target: str = "source-minimal", ) -> str: official_image_name = get_official_docker_image(instance_id, docker_image_prefix) - return ( - "ghcr.io/openhands/eval-agent-server" - + f":v{SDK_VERSION}_{_base_slug(official_image_name)}_{target}" - ) + custom_tag = extract_custom_tag(official_image_name) + + # New tag format: {SHORT_SHA}-{custom_tag}-{target} + # For non-binary targets, append target suffix + if target == "binary": + return f"ghcr.io/openhands/eval-agent-server:{SHORT_SHA}-{custom_tag}" + else: + return f"ghcr.io/openhands/eval-agent-server:{SHORT_SHA}-{custom_tag}-{target}" def get_instruction( From 4f3f9b1ed3ee12ad591e20f9cf4258bd91d8a0eb Mon Sep 17 00:00:00 2001 From: openhands Date: Fri, 7 Nov 2025 18:58:54 +0000 Subject: [PATCH 29/66] refactor: deduplicate extract_custom_tag by importing from run_infer Remove duplicate implementation of extract_custom_tag in build_images.py and import it from run_infer.py instead. This avoids code duplication and ensures both modules use the same implementation. Co-authored-by: openhands --- benchmarks/swe_bench/build_images.py | 21 +-------------------- 1 file changed, 1 insertion(+), 20 deletions(-) diff --git a/benchmarks/swe_bench/build_images.py b/benchmarks/swe_bench/build_images.py index 30d489a0..3951f559 100644 --- a/benchmarks/swe_bench/build_images.py +++ b/benchmarks/swe_bench/build_images.py @@ -20,7 +20,7 @@ from pydantic import BaseModel, Field from tqdm.auto import tqdm -from benchmarks.swe_bench.run_infer import get_official_docker_image +from benchmarks.swe_bench.run_infer import extract_custom_tag, get_official_docker_image from benchmarks.utils.args_parser import get_parser from benchmarks.utils.dataset import get_dataset from openhands.agent_server.docker.build import BuildOptions, build @@ -30,25 +30,6 @@ logger = get_logger(__name__) -def extract_custom_tag(base_image: str) -> str: - """ - Extract SWE-Bench instance ID from base image name. - - Example: - docker.io/swebench/sweb.eval.x86_64.django_1776_django-12155:latest - -> sweb.eval.x86_64.django_1776_django-12155 - - docker.io/swebench/sweb.eval.x86_64.sympy_1776_sympy-18189:latest - -> sweb.eval.x86_64.sympy_1776_sympy-18189 - - docker.io/swebench/sweb.eval.x86_64.scikit-learn_3742_scikit-learn-25973:latest - -> sweb.eval.x86_64.scikit-learn_3742_scikit-learn-25973 - """ - name_tag = base_image.split("/")[-1] - name = name_tag.split(":")[0] - return name - - @contextlib.contextmanager def capture_output(base_name: str, out_dir: Path): """ From 26c3f0226c6caa5de3ff64ec9b15046a912e8ac1 Mon Sep 17 00:00:00 2001 From: openhands Date: Fri, 7 Nov 2025 18:59:30 +0000 Subject: [PATCH 30/66] docs: clarify SHORT_SHA source in run_infer.py Add comment explaining that SHORT_SHA is computed from the benchmarks repo's git commit (via git rev-parse HEAD in cwd), not the SDK submodule. This makes it clear that images are tagged with the benchmarks repo commit for reproducibility and traceability. Co-authored-by: openhands --- benchmarks/swe_bench/run_infer.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/benchmarks/swe_bench/run_infer.py b/benchmarks/swe_bench/run_infer.py index 782e2ca0..cb6751a3 100644 --- a/benchmarks/swe_bench/run_infer.py +++ b/benchmarks/swe_bench/run_infer.py @@ -16,6 +16,10 @@ EvalMetadata, EvalOutput, ) + +# SHORT_SHA is computed from git rev-parse HEAD in the current working directory +# (benchmarks repo), not the SDK submodule. This ensures images are tagged with +# the benchmarks repo commit, making them reproducible and traceable. from openhands.agent_server.docker.build import SHORT_SHA from openhands.sdk import LLM, Agent, Conversation, get_logger from openhands.sdk.workspace import RemoteWorkspace From 89e4cda7d25bd7e5f9549116938f6827e921ab2c Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Fri, 7 Nov 2025 19:40:45 +0000 Subject: [PATCH 31/66] update sdk --- vendor/software-agent-sdk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/software-agent-sdk b/vendor/software-agent-sdk index 204d3a4b..a0d35851 160000 --- a/vendor/software-agent-sdk +++ b/vendor/software-agent-sdk @@ -1 +1 @@ -Subproject commit 204d3a4b262d47c7f9f0690636b4766a413a5715 +Subproject commit a0d3585104558b70a915419b9a9a17b3fa0a8a54 From eacfe0b62dc50ea11b219c129c9401307d249966 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Fri, 7 Nov 2025 19:53:27 +0000 Subject: [PATCH 32/66] refactor --- benchmarks/swe_bench/build_images.py | 3 ++- benchmarks/swe_bench/run_infer.py | 14 ++++---------- benchmarks/utils/constants.py | 1 + benchmarks/utils/version.py | 27 +++++++++++++++++++++++++++ vendor/software-agent-sdk | 2 +- 5 files changed, 35 insertions(+), 12 deletions(-) create mode 100644 benchmarks/utils/version.py diff --git a/benchmarks/swe_bench/build_images.py b/benchmarks/swe_bench/build_images.py index 3951f559..999fc442 100644 --- a/benchmarks/swe_bench/build_images.py +++ b/benchmarks/swe_bench/build_images.py @@ -22,6 +22,7 @@ from benchmarks.swe_bench.run_infer import extract_custom_tag, get_official_docker_image from benchmarks.utils.args_parser import get_parser +from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE from benchmarks.utils.dataset import get_dataset from openhands.agent_server.docker.build import BuildOptions, build from openhands.sdk import get_logger @@ -93,7 +94,7 @@ def extend_parser() -> argparse.ArgumentParser: ) parser.add_argument( "--image", - default="ghcr.io/openhands/eval-agent-server", + default=EVAL_AGENT_SERVER_IMAGE, help="Target repo/name for built image", ) parser.add_argument( diff --git a/benchmarks/swe_bench/run_infer.py b/benchmarks/swe_bench/run_infer.py index cb6751a3..91e00b2f 100644 --- a/benchmarks/swe_bench/run_infer.py +++ b/benchmarks/swe_bench/run_infer.py @@ -5,6 +5,7 @@ from jinja2 import Environment, FileSystemLoader from benchmarks.utils.args_parser import get_parser +from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE from benchmarks.utils.dataset import get_dataset from benchmarks.utils.evaluation import Evaluation from benchmarks.utils.evaluation_utils import ( @@ -16,11 +17,7 @@ EvalMetadata, EvalOutput, ) - -# SHORT_SHA is computed from git rev-parse HEAD in the current working directory -# (benchmarks repo), not the SDK submodule. This ensures images are tagged with -# the benchmarks repo commit, making them reproducible and traceable. -from openhands.agent_server.docker.build import SHORT_SHA +from benchmarks.utils.version import SDK_SHORT_SHA from openhands.sdk import LLM, Agent, Conversation, get_logger from openhands.sdk.workspace import RemoteWorkspace from openhands.tools.preset.default import get_default_tools @@ -64,12 +61,9 @@ def get_agent_server_docker_image( official_image_name = get_official_docker_image(instance_id, docker_image_prefix) custom_tag = extract_custom_tag(official_image_name) - # New tag format: {SHORT_SHA}-{custom_tag}-{target} # For non-binary targets, append target suffix - if target == "binary": - return f"ghcr.io/openhands/eval-agent-server:{SHORT_SHA}-{custom_tag}" - else: - return f"ghcr.io/openhands/eval-agent-server:{SHORT_SHA}-{custom_tag}-{target}" + suffix = f"-{target}" if target != "binary" else "" + return f"{EVAL_AGENT_SERVER_IMAGE}:{SDK_SHORT_SHA}-{custom_tag}{suffix}" def get_instruction( diff --git a/benchmarks/utils/constants.py b/benchmarks/utils/constants.py index aa912a43..9337b847 100644 --- a/benchmarks/utils/constants.py +++ b/benchmarks/utils/constants.py @@ -1 +1,2 @@ OUTPUT_FILENAME = "output.jsonl" +EVAL_AGENT_SERVER_IMAGE = "ghcr.io/openhands/eval-agent-server" diff --git a/benchmarks/utils/version.py b/benchmarks/utils/version.py new file mode 100644 index 00000000..951c6592 --- /dev/null +++ b/benchmarks/utils/version.py @@ -0,0 +1,27 @@ +import subprocess +from pathlib import Path + + +PROJECT_ROOT = Path(__file__).parent.parent.parent + + +def _get_submodule_sha(submodule_path: Path) -> str: + result = subprocess.run( + ["git", "submodule", "status", str(submodule_path)], + capture_output=True, + text=True, + check=True, + ) + sha = result.stdout.strip().split()[0].lstrip("+-") + return sha + + +def get_sdk_sha() -> str: + """ + Get the current git sha from the SDK submodule. + """ + return _get_submodule_sha(PROJECT_ROOT / "vendor" / "software-agent-sdk") + + +SDK_SHA = get_sdk_sha() +SDK_SHORT_SHA = SDK_SHA[:7] diff --git a/vendor/software-agent-sdk b/vendor/software-agent-sdk index a0d35851..006e8db4 160000 --- a/vendor/software-agent-sdk +++ b/vendor/software-agent-sdk @@ -1 +1 @@ -Subproject commit a0d3585104558b70a915419b9a9a17b3fa0a8a54 +Subproject commit 006e8db45c66c05f9d7b9ed00449bcf59d301f5c From 3a2c0095c7c8a48c5a09168effe56a456afb89dd Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Fri, 7 Nov 2025 19:53:48 +0000 Subject: [PATCH 33/66] remove tagging changes --- TAGGING_CHANGES.md | 194 --------------------------------------------- 1 file changed, 194 deletions(-) delete mode 100644 TAGGING_CHANGES.md diff --git a/TAGGING_CHANGES.md b/TAGGING_CHANGES.md deleted file mode 100644 index 1c362a36..00000000 --- a/TAGGING_CHANGES.md +++ /dev/null @@ -1,194 +0,0 @@ -# Docker Image Tagging Improvements - -## Summary - -This change replaces the long, auto-generated versioned tags with short, meaningful tags that include: -- **SDK commit hash** (exact reproducibility) -- **SWE-Bench instance ID** (clear identification) - -## Changes Made - -### 1. SDK Build System (`vendor/software-agent-sdk/.../docker/build.py`) - -**Added two features:** - -1. **SDK_VERSION now uses git commit hash** - - `_sdk_version()` now automatically detects the SDK repo root and gets its commit hash - - Falls back to package version only if git info unavailable - - Works correctly in submodule contexts (uses SDK repo, not calling repo) - - No environment variable override needed - automatic and robust - -2. **`include_versioned_tag` option in BuildOptions** - - When `False`, skips the long versioned tag - - Defaults to `True` for backward compatibility - - Gives consumers control over tag format - -3. **Target-based tag suffixes** (replaces `-dev` suffix) - - Non-binary builds include `-{target}` suffix: `-source`, `-binary-minimal`, `-source-minimal` - - Binary builds have no suffix (it's the default/common case) - - More descriptive than previous `-dev` suffix (which only applied to source builds) - - Makes tag meaning immediately clear without needing to check build config - - Removed deprecated `is_dev` property - -### 2. Benchmarks Build Script (`benchmarks/swe_bench/build_images.py`) - -**Added one function:** - -1. **`extract_instance_id(base_image)`** - - Parses SWE-Bench base image name to extract instance ID - - Examples: - - `...django_1776_django-12155:latest` → `django-12155` - - `...sympy_1776_sympy-18189:latest` → `sympy-18189` - - `...scikit-learn_3742_scikit-learn-25973:latest` → `scikit-learn-25973` - -**Modified build flow:** - -1. Per image: Extract instance ID and create custom tag `swebench-{instance_id}` -2. Pass `include_versioned_tag=False` to disable long tag -3. SDK automatically uses its own commit hash (no manual override needed) - -## Tag Format Comparison - -### Before (Old Format) -``` -ghcr.io/openhands/eval-agent-server:v1.0.0_docker.io_s_swebench_s_sweb.eval.x86_64.django_1776_django-12155_tag_latest_source-minimal-dev -``` -- **Length**: 137 characters -- **Includes**: Package version (v1.0.0), full base image path, target -- **Problem**: No git commit info, hard to parse - -### After (New Format) - -For source-minimal (most common for SWE-Bench): -``` -ghcr.io/openhands/eval-agent-server:a612c0a-swebench-django-12155-source-minimal -ghcr.io/openhands/eval-agent-server:main-swebench-django-12155-source-minimal -``` -- **Length**: 84 characters (**39% shorter**) - -For binary (no suffix, it's the default): -``` -ghcr.io/openhands/eval-agent-server:a612c0a-swebench-django-12155 -ghcr.io/openhands/eval-agent-server:main-swebench-django-12155 -``` -- **Length**: 69 characters (**50% shorter**) - -**Benefits**: - - Exact reproducibility (commit hash) - - Easy to parse and filter - - Clear instance identification - - Clean tags for common case (binary has no suffix) - -## Tag Generation Logic - -The SDK's `all_tags` property generates: - -1. **Commit-based tag**: `{image}:{SDK_VERSION[:7]}-{custom_tag}[-{target}]{arch_suffix}` - - `SDK_VERSION[:7]` = First 7 chars of SDK commit hash (automatically detected) - - `custom_tag` = `swebench-{instance_id}` - - `target` = Build target (omitted for `binary`, included for others) - - Examples: - - Binary: `a612c0a-swebench-django-12155` - - Source: `a612c0a-swebench-django-12155-source` - - Source-minimal: `a612c0a-swebench-django-12155-source-minimal` - -2. **Main branch tag** (if on main): `{image}:main-{custom_tag}[-{target}]{arch_suffix}` - - Examples: - - Binary: `main-swebench-django-12155` - - Source-minimal: `main-swebench-django-12155-source-minimal` - -3. **Versioned tag** (now disabled): `{image}:{versioned_tag}[-{target}]{arch_suffix}` - - Skipped when `include_versioned_tag=False` - -Non-binary targets include `-{target}` suffix for clarity. Binary has no suffix (default case). - -## Benefits - -### 1. Reproducibility -- Git commit hash ensures exact SDK version tracking -- Can reconstruct exact build environment from tag alone -- No ambiguity (version 1.0.0 could be many commits) - -### 2. Usability -- **39% shorter tags** (137 → 84 chars) -- Easy to filter: `docker images | grep a612c0a` -- Easy to identify: `swebench-django-12155-source-minimal` is self-documenting -- Explicit target indication (no more guessing what `-dev` means) -- Fits in terminal/log output better - -### 3. Maintainability -- SDK changes are backward compatible (env var is optional) -- Benchmarks repo has full control over tag format -- Can easily extend with more metadata later - -## Example Build Command - -```bash -uv run benchmarks/swe_bench/build_images.py \ - --dataset princeton-nlp/SWE-bench_Verified \ - --split test \ - --image ghcr.io/openhands/eval-agent-server \ - --target source-minimal \ - --platforms linux/amd64 \ - --push \ - --max-workers 2 -``` - -## Testing - -To test the tagging logic without building: - -```python -from benchmarks.swe_bench.build_images import extract_instance_id - -# Test instance ID extraction -base = "docker.io/swebench/sweb.eval.x86_64.django_1776_django-12155:latest" -print(extract_instance_id(base)) # → django-12155 -``` - -## Migration Notes - -### For existing workflows: -- No changes needed - SDK defaults to old behavior -- Opt-in by setting `include_versioned_tag=False` - -### For CI/CD: -- New tags will be generated automatically -- Old tags (if any exist) remain unchanged -- Can coexist during transition period - -### For consumers: -- Update image references to use new tag format -- Can filter by SDK version: `grep a612c0a` -- Can filter by instance: `grep django-12155` - -## Future Enhancements - -Possible additions: -1. **Docker labels** for metadata (see `docker inspect`) -2. **Benchmarks commit** in tag or label -3. **Build timestamp** in labels -4. **Platform/architecture** in tag (already supported via `arch` param) - -## Files Changed - -1. `vendor/software-agent-sdk/openhands-agent-server/openhands/agent_server/docker/build.py` - - Refactored `_sdk_version()` to automatically use SDK repo commit hash - - Added `_git_info_for_repo()` to get git info from specific directories - - Added `include_versioned_tag` field to `BuildOptions` - - Changed tag suffix logic: Non-binary targets get `-{target}` suffix, binary gets no suffix - - Removed deprecated `is_dev` property - - Modified `all_tags` property to respect new flag and suffix logic - -2. `benchmarks/swe_bench/build_images.py` - - Added `extract_instance_id()` function - - Modified `build_one()` to use custom tags and disable versioned tag - - Removed unnecessary SDK_VERSION_OVERRIDE logic (now automatic) - -## Related PRs - -- **SDK Changes**: https://github.com/OpenHands/software-agent-sdk/pull/1088 - - SDK_VERSION now automatically uses commit hash from SDK repo - - Changes tag suffix: binary gets no suffix, non-binary gets `-{target}` (more descriptive) - - Adds `include_versioned_tag` option - - Works correctly in submodule/vendored contexts From 84c88760d8fa73ad08cd78118ee29e8b166edcd9 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Fri, 7 Nov 2025 19:55:32 +0000 Subject: [PATCH 34/66] bump commit --- vendor/software-agent-sdk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/software-agent-sdk b/vendor/software-agent-sdk index 006e8db4..5481fc8f 160000 --- a/vendor/software-agent-sdk +++ b/vendor/software-agent-sdk @@ -1 +1 @@ -Subproject commit 006e8db45c66c05f9d7b9ed00449bcf59d301f5c +Subproject commit 5481fc8fb527078b74937cb800f001f4aba6882b From de46db7265f4692428a66d4e77f0317cf5301b03 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Fri, 7 Nov 2025 19:56:00 +0000 Subject: [PATCH 35/66] simplify build script --- benchmarks/swe_bench/build_images.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/benchmarks/swe_bench/build_images.py b/benchmarks/swe_bench/build_images.py index 999fc442..983062a2 100644 --- a/benchmarks/swe_bench/build_images.py +++ b/benchmarks/swe_bench/build_images.py @@ -138,9 +138,7 @@ class BuildOutput(BaseModel): def build_one(base_image: str, args: argparse.Namespace) -> BuildOutput: - # Extract instance ID and build custom tag custom_tag = extract_custom_tag(base_image) - opts = BuildOptions( base_image=base_image, custom_tags=custom_tag, From bcbd455b74d2f225e9b63f3fdc6e2797c53ba4ba Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Fri, 7 Nov 2025 19:58:41 +0000 Subject: [PATCH 36/66] bump version --- vendor/software-agent-sdk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/software-agent-sdk b/vendor/software-agent-sdk index 5481fc8f..6eef51b3 160000 --- a/vendor/software-agent-sdk +++ b/vendor/software-agent-sdk @@ -1 +1 @@ -Subproject commit 5481fc8fb527078b74937cb800f001f4aba6882b +Subproject commit 6eef51b3627a2f95709150b10cfb2094ad3a677e From 96f2da678a00eaa893fb3b2c9396149cbca5dd33 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Fri, 7 Nov 2025 20:01:41 +0000 Subject: [PATCH 37/66] bump --- vendor/software-agent-sdk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/software-agent-sdk b/vendor/software-agent-sdk index 6eef51b3..681c9610 160000 --- a/vendor/software-agent-sdk +++ b/vendor/software-agent-sdk @@ -1 +1 @@ -Subproject commit 6eef51b3627a2f95709150b10cfb2094ad3a677e +Subproject commit 681c9610f599134ad4d3bd9c3c7fd3750fe550c4 From aad870b0b021728d6bce3bb7e82338f010c6a669 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Fri, 7 Nov 2025 20:10:37 +0000 Subject: [PATCH 38/66] bump --- vendor/software-agent-sdk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/software-agent-sdk b/vendor/software-agent-sdk index 681c9610..a7a93a7a 160000 --- a/vendor/software-agent-sdk +++ b/vendor/software-agent-sdk @@ -1 +1 @@ -Subproject commit 681c9610f599134ad4d3bd9c3c7fd3750fe550c4 +Subproject commit a7a93a7a48f13ee6398f15b67b2bf339e647786a From acee9cb175cdde6e16cdc11eb587b6f6451b261e Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Fri, 7 Nov 2025 20:32:19 +0000 Subject: [PATCH 39/66] refactor build util into shared file --- benchmarks/swe_bench/build_images.py | 308 ++++----------------------- benchmarks/utils/build_utils.py | 301 ++++++++++++++++++++++++++ 2 files changed, 342 insertions(+), 267 deletions(-) create mode 100644 benchmarks/utils/build_utils.py diff --git a/benchmarks/swe_bench/build_images.py b/benchmarks/swe_bench/build_images.py index 983062a2..2236a647 100644 --- a/benchmarks/swe_bench/build_images.py +++ b/benchmarks/swe_bench/build_images.py @@ -8,299 +8,73 @@ --image ghcr.io/openhands/eval-agent-server --target source-minimal """ -import argparse -import contextlib -import io import sys -from concurrent.futures import ProcessPoolExecutor, as_completed -from datetime import UTC, datetime -from pathlib import Path -from threading import Lock -from pydantic import BaseModel, Field -from tqdm.auto import tqdm - -from benchmarks.swe_bench.run_infer import extract_custom_tag, get_official_docker_image -from benchmarks.utils.args_parser import get_parser -from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE +from benchmarks.utils.build_utils import ( + build_all_images, + default_build_output_dir, + get_build_parser, +) from benchmarks.utils.dataset import get_dataset -from openhands.agent_server.docker.build import BuildOptions, build from openhands.sdk import get_logger logger = get_logger(__name__) -@contextlib.contextmanager -def capture_output(base_name: str, out_dir: Path): - """ - Capture stdout/stderr during a block and stream them to: - //build-.log - - Keeps redirect_* semantics; writes are realtime (line-buffered + flush). - Yields the log_path. - """ - ts = datetime.now(UTC).strftime("%Y-%m-%dT%H-%M-%SZ") - log_path = Path(out_dir) / base_name / f"build-{ts}.log" - log_path.parent.mkdir(parents=True, exist_ok=True) - - # tell the user where we’re logging, without being swallowed by the redirect - # (goes to the original stderr so it’s visible immediately) - logger.info(f"Logging build output to {log_path}") - - # Open line-buffered so writes flush on newlines; - # also wrap to hard-flush every write. - f = log_path.open("w", encoding="utf-8", buffering=1) - - class _FlushOnWrite(io.TextIOBase): - encoding = f.encoding - - def __init__(self, sink): - self._sink = sink - - def write(self, s): - n = self._sink.write(s) - self._sink.flush() - return n - - def flush(self): - self._sink.flush() - - def fileno(self): - # allow libs that try to detect fileno() - return self._sink.fileno() - - sink = _FlushOnWrite(f) - - # Redirect stdout/stderr to the same realtime sink. - with contextlib.redirect_stdout(sink), contextlib.redirect_stderr(sink): # type: ignore[arg-type] - try: - yield log_path - finally: - # make sure everything is on disk - sink.flush() - f.close() +def get_official_docker_image( + instance_id: str, + docker_image_prefix="docker.io/swebench/", +) -> str: + # Official SWE-Bench image + # swebench/sweb.eval.x86_64.django_1776_django-11333:v1 + repo, name = instance_id.split("__") + official_image_name = docker_image_prefix.rstrip("/") + official_image_name += f"/sweb.eval.x86_64.{repo}_1776_{name}:latest".lower() + logger.debug(f"Official SWE-Bench image: {official_image_name}") + return official_image_name -def extend_parser() -> argparse.ArgumentParser: - """Reuse benchmark parser and extend with build-related options.""" - parser = get_parser(add_llm_config=False) - parser.description = "Build all agent-server images for SWE-Bench base images." +def extract_custom_tag(base_image: str) -> str: + """ + Extract SWE-Bench instance ID from official SWE-Bench image name. - parser.add_argument( - "--docker-image-prefix", - default="docker.io/swebench/", - help="Prefix for SWE-Bench images", - ) - parser.add_argument( - "--image", - default=EVAL_AGENT_SERVER_IMAGE, - help="Target repo/name for built image", - ) - parser.add_argument( - "--target", - default="source-minimal", - help="Build target (source | source-minimal | binary | binary-minimal)", - ) - parser.add_argument( - "--platforms", default="linux/amd64", help="Comma-separated platforms" - ) - parser.add_argument( - "--push", action="store_true", help="Push via buildx instead of load locally" - ) - parser.add_argument( - "--max-workers", type=int, default=1, help="Concurrent builds (be cautious)" - ) - parser.add_argument( - "--dry-run", action="store_true", help="List base images only, don’t build" - ) - return parser + Example: + docker.io/swebench/sweb.eval.x86_64.django_1776_django-12155:latest + -> sweb.eval.x86_64.django_1776_django-12155 + """ + name_tag = base_image.split("/")[-1] + name = name_tag.split(":")[0] + return name -def collect_unique_base_images(dataset, split, prefix, n_limit): +def collect_unique_base_images(dataset, split, n_limit): df = get_dataset( dataset_name=dataset, split=split, eval_limit=n_limit if n_limit else None ) return sorted( - { - get_official_docker_image(str(row["instance_id"]), prefix) - for _, row in df.iterrows() - } - ) - - -class BuildOutput(BaseModel): - time: str = Field(default_factory=lambda: datetime.now(UTC).isoformat()) - base_image: str - tags: list[str] - error: str | None = None - log_path: str | None = None - - -def build_one(base_image: str, args: argparse.Namespace) -> BuildOutput: - custom_tag = extract_custom_tag(base_image) - opts = BuildOptions( - base_image=base_image, - custom_tags=custom_tag, - image=args.image, - target=args.target, - platforms=[p.strip() for p in args.platforms.split(",") if p.strip()], - push=args.push, + {get_official_docker_image(str(row["instance_id"])) for _, row in df.iterrows()} ) - tags = build(opts) - return BuildOutput(base_image=base_image, tags=tags, error=None) - - -def _default_build_output_dir( - dataset: str, split: str, base_dir: Path | None = None -) -> Path: - """ - Default: ./builds// - Keeps build outputs in one predictable place, easy to .gitignore. - """ - root = (base_dir or Path.cwd()) / "builds" / dataset / split - root.mkdir(parents=True, exist_ok=True) - return root - - -def _build_with_logging( - base: str, log_dir: Path, args: argparse.Namespace -) -> BuildOutput: - """ - Module-level function for building a single image with output capture. - Must be at module level to be picklable for ProcessPoolExecutor. - """ - with capture_output(base, log_dir) as log_path: - result = build_one(base, args) - result.log_path = str(log_path) - return result - - -def _update_pbar( - pbar: tqdm, - successes: int, - failures: int, - running: int, - sample: str | None, - last_event: str | None, -): - postfix = f"✅ {successes} ❌ {failures} 🏃 {running}" - if sample: - postfix += f" ({sample})" - if last_event: - pbar.set_description(last_event) - pbar.set_postfix_str(postfix, refresh=True) def main(argv: list[str]) -> int: - parser = extend_parser() + parser = get_build_parser() args = parser.parse_args(argv) - bases: list[str] = collect_unique_base_images( - args.dataset, args.split, args.docker_image_prefix, args.n_limit + base_images: list[str] = collect_unique_base_images( + args.dataset, args.split, args.n_limit ) - # Decide manifest path under ./builds/// - BUILD_DIR = _default_build_output_dir(args.dataset, args.split) - BUILD_LOG_DIR = BUILD_DIR / "logs" - manifest_path = BUILD_DIR / "manifest.jsonl" - manifest_path.parent.mkdir(parents=True, exist_ok=True) - - if args.dry_run: - print("\n".join(bases)) - return 0 - - successes = 0 - failures = 0 - in_progress: set[str] = set() - mu = Lock() - - with ( - manifest_path.open("w") as writer, - tqdm(total=len(bases), desc="Building agent-server images", leave=True) as pbar, - ): - _update_pbar(pbar, successes, failures, 0, None, "Queueing") - - # Single unified path: ProcessPoolExecutor( max_workers = args.max_workers ), - # even if it's 1. Using processes instead of threads ensures proper isolation - # of stdout/stderr and logging handlers, preventing output mixing between builds. - with ProcessPoolExecutor(max_workers=args.max_workers) as ex: - futures = {} - for base in bases: - in_progress.add(base) - fut = ex.submit(_build_with_logging, base, BUILD_LOG_DIR, args) - futures[fut] = base - - _update_pbar( - pbar, - successes, - failures, - len(in_progress), - next(iter(in_progress), None), - "Running", - ) - - for fut in as_completed(futures): - base = futures[fut] - try: - result: BuildOutput = fut.result() - writer.write(result.model_dump_json() + "\n") - writer.flush() - with mu: - successes += 1 - _update_pbar( - pbar, successes, failures, len(in_progress), base, "✅ Done" - ) - except Exception as e: - logger.error("Build failed for %s: %r", base, e) - # Write a failure line to manifest; keep going. - writer.write( - BuildOutput( - base_image=base, tags=[], error=repr(e) - ).model_dump_json() - + "\n" - ) - writer.flush() - with mu: - failures += 1 - _update_pbar( - pbar, successes, failures, len(in_progress), base, "❌ Failed" - ) - finally: - with mu: - in_progress.discard(base) - pbar.update(1) - _update_pbar( - pbar, - successes, - failures, - len(in_progress), - next(iter(in_progress), None), - None, - ) - - # Optional: write a tiny summary JSON next to the manifest for quick reads - summary_path = manifest_path.with_name("summary.json") - summary_path.write_text( - ( - "{" - f'"dataset":"{args.dataset}",' - f'"split":"{args.split}",' - f'"total_unique_base_images":{len(bases)},' - f'"built":{successes},' - f'"failed":{failures}' - "}" - ), - encoding="utf-8", - ) - - logger.info( - "Done. Built=%d Failed=%d Manifest=%s Summary=%s", - successes, - failures, - str(manifest_path), - str(summary_path), + build_dir = default_build_output_dir(args.dataset, args.split) + return build_all_images( + base_images=base_images, + target=args.target, + build_dir=build_dir, + image=args.image, + push=args.push, + max_workers=args.max_workers, + dry_run=args.dry_run, + base_image_to_custom_tag_fn=extract_custom_tag, ) - return 1 if failures else 0 if __name__ == "__main__": diff --git a/benchmarks/utils/build_utils.py b/benchmarks/utils/build_utils.py new file mode 100644 index 00000000..cdbfd8ca --- /dev/null +++ b/benchmarks/utils/build_utils.py @@ -0,0 +1,301 @@ +#!/usr/bin/env python3 +""" +Shared utilities for batch building agent-server images. +""" + +import argparse +import contextlib +import io +from concurrent.futures import ProcessPoolExecutor, as_completed +from datetime import UTC, datetime +from pathlib import Path +from threading import Lock +from typing import Callable + +from pydantic import BaseModel, Field +from tqdm.auto import tqdm + +from benchmarks.utils.args_parser import get_parser +from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE +from openhands.agent_server.docker.build import BuildOptions, TargetType, build +from openhands.sdk import get_logger + + +logger = get_logger(__name__) + + +class BuildOutput(BaseModel): + time: str = Field(default_factory=lambda: datetime.now(UTC).isoformat()) + base_image: str + tags: list[str] + error: str | None = None + log_path: str | None = None + + +@contextlib.contextmanager +def capture_output(base_name: str, out_dir: Path): + """ + Capture stdout/stderr during a block and stream them to: + //build-.log + + Keeps redirect_* semantics; writes are realtime (line-buffered + flush). + Yields the log_path. + """ + ts = datetime.now(UTC).strftime("%Y-%m-%dT%H-%M-%SZ") + log_path = Path(out_dir) / base_name / f"build-{ts}.log" + log_path.parent.mkdir(parents=True, exist_ok=True) + + # tell the user where we’re logging, without being swallowed by the redirect + # (goes to the original stderr so it’s visible immediately) + logger.info(f"Logging build output to {log_path}") + + # Open line-buffered so writes flush on newlines; + # also wrap to hard-flush every write. + f = log_path.open("w", encoding="utf-8", buffering=1) + + class _FlushOnWrite(io.TextIOBase): + encoding = f.encoding + + def __init__(self, sink): + self._sink = sink + + def write(self, s): + n = self._sink.write(s) + self._sink.flush() + return n + + def flush(self): + self._sink.flush() + + def fileno(self): + # allow libs that try to detect fileno() + return self._sink.fileno() + + sink = _FlushOnWrite(f) + + # Redirect stdout/stderr to the same realtime sink. + with contextlib.redirect_stdout(sink), contextlib.redirect_stderr(sink): # type: ignore[arg-type] + try: + yield log_path + finally: + # make sure everything is on disk + sink.flush() + f.close() + + +def get_build_parser() -> argparse.ArgumentParser: + """Reuse benchmark parser and extend with build-related options.""" + parser = get_parser(add_llm_config=False) + parser.description = "Script for build agent-server images." + parser.add_argument( + "--image", + default=EVAL_AGENT_SERVER_IMAGE, + help="Target repo/name for built image", + ) + parser.add_argument( + "--target", + default="source-minimal", + help="Build target (source | source-minimal | binary | binary-minimal)", + ) + parser.add_argument( + "--push", action="store_true", help="Push via buildx instead of load locally" + ) + parser.add_argument( + "--max-workers", type=int, default=1, help="Concurrent builds (be cautious)" + ) + parser.add_argument( + "--dry-run", action="store_true", help="List base images only, don’t build" + ) + return parser + + +def build_image( + base_image: str, + target_image: str, + custom_tag: str, + target: TargetType = "source-minimal", + push: bool = False, +) -> BuildOutput: + opts = BuildOptions( + base_image=base_image, + custom_tags=custom_tag, + image=target_image, + target=target, + # SWE-Bench only supports linux/amd64 images + platforms=["linux/amd64"], + push=push, + ) + tags = build(opts) + return BuildOutput(base_image=base_image, tags=tags, error=None) + + +def _build_with_logging( + log_dir: Path, + base_image: str, + target_image: str, + target: TargetType = "source-minimal", + push: bool = False, + base_image_to_custom_tag_fn: Callable[[str], str] | None = None, +) -> BuildOutput: + """ + Module-level function for building a single image with output capture. + Must be at module level to be picklable for ProcessPoolExecutor. + """ + with capture_output(base_image, log_dir) as log_path: + custom_tag = "" + if base_image_to_custom_tag_fn: + custom_tag = base_image_to_custom_tag_fn(base_image) + result = build_image(base_image, target_image, custom_tag, target, push) + result.log_path = str(log_path) + return result + + +def _update_pbar( + pbar: tqdm, + successes: int, + failures: int, + running: int, + sample: str | None, + last_event: str | None, +): + postfix = f"✅ {successes} ❌ {failures} 🏃 {running}" + if sample: + postfix += f" ({sample})" + if last_event: + pbar.set_description(last_event) + pbar.set_postfix_str(postfix, refresh=True) + + +def default_build_output_dir( + dataset: str, split: str, base_dir: Path | None = None +) -> Path: + """ + Default: ./builds// + Keeps build outputs in one predictable place, easy to .gitignore. + """ + root = (base_dir or Path.cwd()) / "builds" / dataset / split + root.mkdir(parents=True, exist_ok=True) + return root + + +def build_all_images( + base_images: list[str], + target: TargetType, + build_dir: Path, + image: str = EVAL_AGENT_SERVER_IMAGE, + push: bool = False, + base_image_to_custom_tag_fn: Callable[[str], str] | None = None, + max_workers: int = 1, + dry_run: bool = False, +) -> int: + """ + Build all specified base images concurrently, logging output and + writing a manifest file. + + Args: + base_images: List of base images to build from. + target: Build target type. + build_dir: Directory to store build logs and manifest. + image: Target image name for built images. + push: Whether to push images via buildx. + base_image_to_custom_tag_fn: Function to extract custom tag from base image. + max_workers: Number of concurrent builds. + dry_run: If True, only list base images without building. + + Returns: + Exit code: 0 if all builds succeeded, 1 if any failed. + """ + + build_log_dir = build_dir / "logs" + manifest_path = build_dir / "manifest.jsonl" + manifest_path.parent.mkdir(parents=True, exist_ok=True) + + if dry_run: + print("\n".join(base_images)) + return 0 + + successes = 0 + failures = 0 + in_progress: set[str] = set() + mu = Lock() + + with ( + manifest_path.open("w") as writer, + tqdm( + total=len(base_images), desc="Building agent-server images", leave=True + ) as pbar, + ): + _update_pbar(pbar, successes, failures, 0, None, "Queueing") + + # Single unified path: ProcessPoolExecutor( max_workers = args.max_workers ), + # even if it's 1. Using processes instead of threads ensures proper isolation + # of stdout/stderr and logging handlers, preventing output mixing between builds. + with ProcessPoolExecutor(max_workers=max_workers) as ex: + futures = {} + for base in base_images: + in_progress.add(base) + fut = ex.submit( + _build_with_logging, + build_log_dir, + base, + image, + target, + push, + base_image_to_custom_tag_fn, + ) + futures[fut] = base + + _update_pbar( + pbar, + successes, + failures, + len(in_progress), + next(iter(in_progress), None), + "Running", + ) + + for fut in as_completed(futures): + base = futures[fut] + try: + result: BuildOutput = fut.result() + writer.write(result.model_dump_json() + "\n") + writer.flush() + with mu: + successes += 1 + _update_pbar( + pbar, successes, failures, len(in_progress), base, "✅ Done" + ) + except Exception as e: + logger.error("Build failed for %s: %r", base, e) + # Write a failure line to manifest; keep going. + writer.write( + BuildOutput( + base_image=base, tags=[], error=repr(e) + ).model_dump_json() + + "\n" + ) + writer.flush() + with mu: + failures += 1 + _update_pbar( + pbar, successes, failures, len(in_progress), base, "❌ Failed" + ) + finally: + with mu: + in_progress.discard(base) + pbar.update(1) + _update_pbar( + pbar, + successes, + failures, + len(in_progress), + next(iter(in_progress), None), + None, + ) + logger.info( + "Done. Built=%d Failed=%d Manifest=%s", + successes, + failures, + str(manifest_path), + ) + return 1 if failures else 0 From a4bf9e44b790855518f416eef315d8be9c89af4e Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Fri, 7 Nov 2025 20:38:33 +0000 Subject: [PATCH 40/66] simplify build on the fly logic --- benchmarks/swe_bench/run_infer.py | 89 +++++++++++++------------------ 1 file changed, 36 insertions(+), 53 deletions(-) diff --git a/benchmarks/swe_bench/run_infer.py b/benchmarks/swe_bench/run_infer.py index 91e00b2f..32d5ed63 100644 --- a/benchmarks/swe_bench/run_infer.py +++ b/benchmarks/swe_bench/run_infer.py @@ -4,7 +4,12 @@ from jinja2 import Environment, FileSystemLoader +from benchmarks.swe_bench.build_images import ( + extract_custom_tag, + get_official_docker_image, +) from benchmarks.utils.args_parser import get_parser +from benchmarks.utils.build_utils import build_image from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE from benchmarks.utils.dataset import get_dataset from benchmarks.utils.evaluation import Evaluation @@ -27,45 +32,6 @@ logger = get_logger(__name__) -def extract_custom_tag(base_image: str) -> str: - """ - Extract SWE-Bench instance ID from base image name. - - Example: - docker.io/swebench/sweb.eval.x86_64.django_1776_django-12155:latest - -> sweb.eval.x86_64.django_1776_django-12155 - """ - name_tag = base_image.split("/")[-1] - name = name_tag.split(":")[0] - return name - - -def get_official_docker_image( - instance_id: str, - docker_image_prefix="docker.io/swebench/", -) -> str: - # Official SWE-Bench image - # swebench/sweb.eval.x86_64.django_1776_django-11333:v1 - repo, name = instance_id.split("__") - official_image_name = docker_image_prefix.rstrip("/") - official_image_name += f"/sweb.eval.x86_64.{repo}_1776_{name}:latest".lower() - logger.debug(f"Official SWE-Bench image: {official_image_name}") - return official_image_name - - -def get_agent_server_docker_image( - instance_id: str, - docker_image_prefix="docker.io/swebench/", - target: str = "source-minimal", -) -> str: - official_image_name = get_official_docker_image(instance_id, docker_image_prefix) - custom_tag = extract_custom_tag(official_image_name) - - # For non-binary targets, append target suffix - suffix = f"-{target}" if target != "binary" else "" - return f"{EVAL_AGENT_SERVER_IMAGE}:{SDK_SHORT_SHA}-{custom_tag}{suffix}" - - def get_instruction( instance: dict, metadata: EvalMetadata, @@ -132,26 +98,43 @@ def prepare_workspace(self, instance: EvalInstance) -> RemoteWorkspace: """ SKIP_BUILD = os.getenv("SKIP_BUILD", "1").lower() in ("1", "true", "yes") logger.info(f"SKIP_BUILD={SKIP_BUILD}") - if SKIP_BUILD: - agent_server_image = get_agent_server_docker_image(instance.id) - workspace = DockerWorkspace( - server_image=agent_server_image, - working_dir="/workspace", - ) - else: - official_docker_image = get_official_docker_image(instance.id) - workspace = DockerWorkspace( - base_image=official_docker_image, - working_dir="/workspace", - target="source-minimal", - ) + official_docker_image = get_official_docker_image(instance.id) + build_target = "source-minimal" + custom_tag = extract_custom_tag(official_docker_image) + + # For non-binary targets, append target suffix + suffix = f"-{build_target}" if build_target != "binary" else "" + agent_server_image = ( + f"{EVAL_AGENT_SERVER_IMAGE}:{SDK_SHORT_SHA}-{custom_tag}{suffix}" + ) + if not SKIP_BUILD: logger.info( - f"Building workspace from {official_docker_image}. " + f"Building workspace from {official_docker_image} " + f"for instance {instance.id}. " "This may take a while...\n" "You can run benchmarks/swe_bench/build_images.py and set " "SWE_BENCH_SKIP_BUILD=1 to skip building and use pre-built " "agent-server image." ) + output = build_image( + base_image=official_docker_image, + target_image=EVAL_AGENT_SERVER_IMAGE, + custom_tag=custom_tag, + target=build_target, + push=False, + ) + logger.info(f"Image build output: {output}") + assert output.error is None, f"Image build failed: {output.error}" + if agent_server_image not in output.tags: + raise RuntimeError( + f"Built image tags {output.tags} do not include expected tag " + f"{agent_server_image}" + ) + + workspace = DockerWorkspace( + server_image=agent_server_image, + working_dir="/workspace", + ) for cmd in self.metadata.env_setup_commands or []: res = workspace.execute_command(cmd) if res.exit_code != 0: From 9ef0d4849df66507832f0a622010024dafcd0ba5 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Fri, 7 Nov 2025 20:45:29 +0000 Subject: [PATCH 41/66] remove targets and platform --- .github/workflows/build-swe-bench-images.yml | 21 -------------------- 1 file changed, 21 deletions(-) diff --git a/.github/workflows/build-swe-bench-images.yml b/.github/workflows/build-swe-bench-images.yml index bd4d741a..a1e1d4b9 100644 --- a/.github/workflows/build-swe-bench-images.yml +++ b/.github/workflows/build-swe-bench-images.yml @@ -14,21 +14,6 @@ on: required: true default: 'test' type: string - target: - description: 'Build target (source | source-minimal | binary | binary-minimal)' - required: false - default: 'source-minimal' - type: choice - options: - - source - - source-minimal - - binary - - binary-minimal - platforms: - description: 'Comma-separated platforms (e.g., linux/amd64,linux/arm64)' - required: false - default: 'linux/amd64' - type: string max-workers: description: 'Number of concurrent builds' required: false @@ -44,8 +29,6 @@ on: env: DATASET: princeton-nlp/SWE-bench_Verified SPLIT: test - TARGET: source-minimal - PLATFORMS: linux/amd64 MAX_WORKERS: '2' # modest concurrency for reliability N_LIMIT: '10' # empty = no limit @@ -75,8 +58,6 @@ jobs: run: | if [ -n "${{ inputs.dataset }}" ]; then echo "DATASET=${{ inputs.dataset }}" >> "$GITHUB_ENV"; fi if [ -n "${{ inputs.split }}" ]; then echo "SPLIT=${{ inputs.split }}" >> "$GITHUB_ENV"; fi - if [ -n "${{ inputs.target }}" ]; then echo "TARGET=${{ inputs.target }}" >> "$GITHUB_ENV"; fi - if [ -n "${{ inputs.platforms }}" ]; then echo "PLATFORMS=${{ inputs.platforms }}" >> "$GITHUB_ENV"; fi if [ -n "${{ inputs.max-workers }}" ]; then echo "MAX_WORKERS=${{ inputs.max-workers }}" >> "$GITHUB_ENV"; fi # Empty string means "no limit" if [ -n "${{ inputs.n-limit }}" ]; then echo "N_LIMIT=${{ inputs.n-limit }}" >> "$GITHUB_ENV"; else echo "N_LIMIT=" >> "$GITHUB_ENV"; fi @@ -108,8 +89,6 @@ jobs: --dataset '${DATASET}' \ --split '${SPLIT}' \ --image ghcr.io/openhands/eval-agent-server \ - --target '${TARGET}' \ - --platforms '${PLATFORMS}' \ --push \ --max-workers '${MAX_WORKERS}'" From 06e994a02c9db113b18466f1340438b0d0020342 Mon Sep 17 00:00:00 2001 From: openhands Date: Fri, 7 Nov 2025 20:57:56 +0000 Subject: [PATCH 42/66] Add automatic comment to issue #81 on successful build This adds a new step to the build-and-push workflow that: - Posts a comment to issue #81 when the build completes successfully - Includes dataset name, split, SDK version, and workflow run link - Lists all built image tags in a collapsible markdown section Co-authored-by: openhands --- .github/workflows/build-swe-bench-images.yml | 51 +++++++++++++++++++- 1 file changed, 50 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build-swe-bench-images.yml b/.github/workflows/build-swe-bench-images.yml index a1e1d4b9..e6686281 100644 --- a/.github/workflows/build-swe-bench-images.yml +++ b/.github/workflows/build-swe-bench-images.yml @@ -41,10 +41,11 @@ jobs: runs-on: labels: blacksmith-32vcpu-ubuntu-2204 - # Allow pushing to GHCR + # Allow pushing to GHCR and commenting on issues permissions: contents: read packages: write + issues: write steps: - name: Checkout repository @@ -140,3 +141,51 @@ jobs: echo "## Build Summary" >> "$GITHUB_STEP_SUMMARY" cat builds/*/summary.json | python -m json.tool >> "$GITHUB_STEP_SUMMARY" fi + + - name: Comment on tracker issue + if: success() + run: | + # Get SDK version from submodule + SDK_SHA=$(git submodule status vendor/software-agent-sdk | awk '{print $1}' | sed 's/^[+-]//') + + # Count total images built + TOTAL_IMAGES=$(cat builds/*/manifest.jsonl | wc -l) + + # Extract all tags and format them as a markdown list + TAGS=$(cat builds/*/manifest.jsonl | python -c " + import sys + import json + for line in sys.stdin: + data = json.loads(line.strip()) + if data.get('tags'): + for tag in data['tags']: + print(f'- \`{tag}\`') + ") + + # Create the comment body + COMMENT_BODY=$(cat < + Built Tags (${TOTAL_IMAGES} images) + + ${TAGS} + + + EOF + ) + + # Post comment to issue #81 + curl -L -X POST \ + -H "Accept: application/vnd.github+json" \ + -H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + "${{ github.api_url }}/repos/${{ github.repository }}/issues/81/comments" \ + -d "$(jq -n --arg body "$COMMENT_BODY" '{body: $body}')" + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} From fba2a557bff7d5386da1c1aa6e2f1e7a6442a8dd Mon Sep 17 00:00:00 2001 From: openhands Date: Fri, 7 Nov 2025 21:02:16 +0000 Subject: [PATCH 43/66] Fix SDK URL and add workflow trigger information - Corrected SDK repository URL from All-Hands-AI/agent-sdk to OpenHands/software-agent-sdk - Added 'Triggered by' field to comment to show workflow trigger source - Updated .openhands/microagents/repo.md with correct SDK URL Co-authored-by: openhands --- .github/workflows/build-swe-bench-images.yml | 12 +++++++++++- .openhands/microagents/repo.md | 2 +- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build-swe-bench-images.yml b/.github/workflows/build-swe-bench-images.yml index e6686281..dc3cc849 100644 --- a/.github/workflows/build-swe-bench-images.yml +++ b/.github/workflows/build-swe-bench-images.yml @@ -162,14 +162,24 @@ jobs: print(f'- \`{tag}\`') ") + # Determine how the workflow was triggered + if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then + TRIGGER="Manual trigger (workflow_dispatch)" + elif [ "${{ github.event_name }}" = "pull_request" ]; then + TRIGGER="Pull request [#${{ github.event.pull_request.number }}](${{ github.event.pull_request.html_url }})" + else + TRIGGER="${{ github.event_name }}" + fi + # Create the comment body COMMENT_BODY=$(cat < Built Tags (${TOTAL_IMAGES} images) diff --git a/.openhands/microagents/repo.md b/.openhands/microagents/repo.md index 8aa0c1dc..0206a51d 100644 --- a/.openhands/microagents/repo.md +++ b/.openhands/microagents/repo.md @@ -84,7 +84,7 @@ make build # Rebuild environment 5. Update README.md with usage instructions # LLM Configuration -LLM configs use JSON matching the [LLM class schema](https://github.com/All-Hands-AI/agent-sdk/blob/main/openhands/sdk/llm/llm.py#L93): +LLM configs use JSON matching the [LLM class schema](https://github.com/OpenHands/software-agent-sdk/blob/main/openhands/sdk/llm/llm.py#L93): ```json { "model": "litellm_proxy/anthropic/claude-sonnet-4-20250514", From 0ab219fa542dfddd31a6cd4321b6f208332af326 Mon Sep 17 00:00:00 2001 From: openhands Date: Fri, 7 Nov 2025 21:02:40 +0000 Subject: [PATCH 44/66] Update .gitignore to properly allow .openhands/microagents/ Changed .openhands/ to .openhands/* so that negation patterns work correctly Co-authored-by: openhands --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 77513d9d..459fad58 100644 --- a/.gitignore +++ b/.gitignore @@ -205,7 +205,7 @@ cython_debug/ workspace/ # IDE and editor directories -.openhands/ +.openhands/* !.openhands/setup.sh !.openhands/microagents/ .vscode/ From aa8b452fdf28906b01f3b914ed1844ed6d0ab1eb Mon Sep 17 00:00:00 2001 From: openhands Date: Fri, 7 Nov 2025 22:30:57 +0000 Subject: [PATCH 45/66] Add error handling to skip comment when no images are built The comment step now checks if manifest.jsonl files exist and contain data before attempting to post a comment. This prevents posting comments with '0 images' when builds complete successfully but produce no output (e.g., during PR testing or when the build step is skipped). Co-authored-by: openhands --- .github/workflows/build-swe-bench-images.yml | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build-swe-bench-images.yml b/.github/workflows/build-swe-bench-images.yml index dc3cc849..eb78f195 100644 --- a/.github/workflows/build-swe-bench-images.yml +++ b/.github/workflows/build-swe-bench-images.yml @@ -148,8 +148,21 @@ jobs: # Get SDK version from submodule SDK_SHA=$(git submodule status vendor/software-agent-sdk | awk '{print $1}' | sed 's/^[+-]//') + # Check if manifest files exist + if ! ls builds/*/manifest.jsonl >/dev/null 2>&1; then + echo "No manifest.jsonl files found in builds directory" + echo "Build may have completed but produced no images" + exit 0 + fi + # Count total images built - TOTAL_IMAGES=$(cat builds/*/manifest.jsonl | wc -l) + TOTAL_IMAGES=$(cat builds/*/manifest.jsonl 2>/dev/null | wc -l) + + if [ "$TOTAL_IMAGES" -eq 0 ]; then + echo "No images found in manifest files" + echo "Skipping comment as there are no built images to report" + exit 0 + fi # Extract all tags and format them as a markdown list TAGS=$(cat builds/*/manifest.jsonl | python -c " From a95969eba03a80d714745c0a252580d3fb3a8a09 Mon Sep 17 00:00:00 2001 From: openhands Date: Fri, 7 Nov 2025 22:45:32 +0000 Subject: [PATCH 46/66] Fix manifest file path detection using find command The previous check using 'builds/*/manifest.jsonl' only looked one level deep, but the actual path is 'builds/princeton-nlp/SWE-bench_Verified/test/manifest.jsonl' which is three levels deep. Using 'find' command now correctly locates manifest files at any depth within the builds directory. Tested with actual artifact from run #19182998503 containing 10 images. Co-authored-by: openhands --- .github/workflows/build-swe-bench-images.yml | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/.github/workflows/build-swe-bench-images.yml b/.github/workflows/build-swe-bench-images.yml index eb78f195..ded68a76 100644 --- a/.github/workflows/build-swe-bench-images.yml +++ b/.github/workflows/build-swe-bench-images.yml @@ -148,15 +148,17 @@ jobs: # Get SDK version from submodule SDK_SHA=$(git submodule status vendor/software-agent-sdk | awk '{print $1}' | sed 's/^[+-]//') - # Check if manifest files exist - if ! ls builds/*/manifest.jsonl >/dev/null 2>&1; then + # Find all manifest.jsonl files + MANIFEST_FILES=$(find builds -name "manifest.jsonl" -type f 2>/dev/null) + + if [ -z "$MANIFEST_FILES" ]; then echo "No manifest.jsonl files found in builds directory" echo "Build may have completed but produced no images" exit 0 fi # Count total images built - TOTAL_IMAGES=$(cat builds/*/manifest.jsonl 2>/dev/null | wc -l) + TOTAL_IMAGES=$(cat $MANIFEST_FILES 2>/dev/null | wc -l) if [ "$TOTAL_IMAGES" -eq 0 ]; then echo "No images found in manifest files" @@ -165,7 +167,7 @@ jobs: fi # Extract all tags and format them as a markdown list - TAGS=$(cat builds/*/manifest.jsonl | python -c " + TAGS=$(cat $MANIFEST_FILES | python -c " import sys import json for line in sys.stdin: From 46b52667181118742a003204ae41b224b487d3e3 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Fri, 7 Nov 2025 22:48:50 +0000 Subject: [PATCH 47/66] bump sdk --- vendor/software-agent-sdk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/software-agent-sdk b/vendor/software-agent-sdk index a7a93a7a..a90d1345 160000 --- a/vendor/software-agent-sdk +++ b/vendor/software-agent-sdk @@ -1 +1 @@ -Subproject commit a7a93a7a48f13ee6398f15b67b2bf339e647786a +Subproject commit a90d1345274403dd32a08f4415a0297d8b87e790 From 16526b3ece575f8c7d9fc2345703f175d888e8d4 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Fri, 7 Nov 2025 22:49:08 +0000 Subject: [PATCH 48/66] increase n work and n limit --- .github/workflows/build-swe-bench-images.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build-swe-bench-images.yml b/.github/workflows/build-swe-bench-images.yml index ded68a76..10970a92 100644 --- a/.github/workflows/build-swe-bench-images.yml +++ b/.github/workflows/build-swe-bench-images.yml @@ -29,8 +29,8 @@ on: env: DATASET: princeton-nlp/SWE-bench_Verified SPLIT: test - MAX_WORKERS: '2' # modest concurrency for reliability - N_LIMIT: '10' # empty = no limit + MAX_WORKERS: '16' # modest concurrency for reliability + N_LIMIT: '50' # empty = no limit concurrency: group: build-swe-bench-${{ github.ref }} From 90ee94eb2358e3e38144d67699c3eac7c9fbebe9 Mon Sep 17 00:00:00 2001 From: openhands Date: Fri, 7 Nov 2025 22:50:25 +0000 Subject: [PATCH 49/66] Show only one tag per image in issue comment Each image has multiple tags (base tag + detailed tag with hash). Now showing only the first (cleaner) tag per image to reduce clutter in the issue comment, making it easier to read. Co-authored-by: openhands --- .github/workflows/build-swe-bench-images.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/build-swe-bench-images.yml b/.github/workflows/build-swe-bench-images.yml index 10970a92..792a078c 100644 --- a/.github/workflows/build-swe-bench-images.yml +++ b/.github/workflows/build-swe-bench-images.yml @@ -166,15 +166,15 @@ jobs: exit 0 fi - # Extract all tags and format them as a markdown list + # Extract all tags and format them as a markdown list (one tag per image) TAGS=$(cat $MANIFEST_FILES | python -c " import sys import json for line in sys.stdin: data = json.loads(line.strip()) - if data.get('tags'): - for tag in data['tags']: - print(f'- \`{tag}\`') + if data.get('tags') and len(data['tags']) > 0: + # Only show the first tag per image to reduce clutter + print(f'- \`{data[\"tags\"][0]}\`') ") # Determine how the workflow was triggered From 2d10954776bd96edba994f3dcfd50320cc68eff1 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Sat, 8 Nov 2025 18:53:50 +0000 Subject: [PATCH 50/66] bump sdk commit --- vendor/software-agent-sdk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/software-agent-sdk b/vendor/software-agent-sdk index a90d1345..aa954ce8 160000 --- a/vendor/software-agent-sdk +++ b/vendor/software-agent-sdk @@ -1 +1 @@ -Subproject commit a90d1345274403dd32a08f4415a0297d8b87e790 +Subproject commit aa954ce876c55cf4b18e2296ca4458a7b1a44620 From 178123e8b5a216992ba31e85e18620f298b0b9cd Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Sat, 8 Nov 2025 18:54:20 +0000 Subject: [PATCH 51/66] increase to 500 limit and 32 concurrency --- .github/workflows/build-swe-bench-images.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build-swe-bench-images.yml b/.github/workflows/build-swe-bench-images.yml index 792a078c..de1aac2b 100644 --- a/.github/workflows/build-swe-bench-images.yml +++ b/.github/workflows/build-swe-bench-images.yml @@ -29,8 +29,8 @@ on: env: DATASET: princeton-nlp/SWE-bench_Verified SPLIT: test - MAX_WORKERS: '16' # modest concurrency for reliability - N_LIMIT: '50' # empty = no limit + MAX_WORKERS: '32' + N_LIMIT: '500' concurrency: group: build-swe-bench-${{ github.ref }} From 061913409ec0222b8bde4141691dcfa6a0c4696d Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Mon, 10 Nov 2025 19:59:41 +0000 Subject: [PATCH 52/66] disable rebuild on every push --- .github/workflows/build-swe-bench-images.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-swe-bench-images.yml b/.github/workflows/build-swe-bench-images.yml index de1aac2b..007976e1 100644 --- a/.github/workflows/build-swe-bench-images.yml +++ b/.github/workflows/build-swe-bench-images.yml @@ -1,7 +1,7 @@ name: Build SWE-Bench Images on: - pull_request: # for debugging + # pull_request: # for debugging workflow_dispatch: inputs: dataset: From e67b9b081a50052693f67d632c5adc54ef9ab055 Mon Sep 17 00:00:00 2001 From: openhands Date: Mon, 10 Nov 2025 20:28:57 +0000 Subject: [PATCH 53/66] Fix workflow summary mismatch: use manifest.jsonl instead of summary.json The new builder in build_utils.py only writes manifest.jsonl, not summary.json. This commit updates the workflow to: - Remove summary.json from artifact upload path - Generate build summary from manifest.jsonl instead of summary.json - Display total/successful/failed counts and list failed builds Co-authored-by: openhands --- .github/workflows/build-swe-bench-images.yml | 50 +++++++++++++++++--- 1 file changed, 44 insertions(+), 6 deletions(-) diff --git a/.github/workflows/build-swe-bench-images.yml b/.github/workflows/build-swe-bench-images.yml index 007976e1..f5c8d63f 100644 --- a/.github/workflows/build-swe-bench-images.yml +++ b/.github/workflows/build-swe-bench-images.yml @@ -109,9 +109,7 @@ jobs: uses: actions/upload-artifact@v4 with: name: build-manifest-${{ github.run_id }} - path: | - builds/**/manifest.jsonl - builds/**/summary.json + path: builds/**/manifest.jsonl retention-days: 30 - name: Archive build logs @@ -137,9 +135,49 @@ jobs: - name: Display build summary if: always() run: | - if ls builds/*/summary.json >/dev/null 2>&1; then - echo "## Build Summary" >> "$GITHUB_STEP_SUMMARY" - cat builds/*/summary.json | python -m json.tool >> "$GITHUB_STEP_SUMMARY" + # Find all manifest.jsonl files + MANIFEST_FILES=$(find builds -name "manifest.jsonl" -type f 2>/dev/null) + + if [ -z "$MANIFEST_FILES" ]; then + echo "No manifest.jsonl files found" + exit 0 + fi + + # Generate summary from manifest files + echo "## Build Summary" >> "$GITHUB_STEP_SUMMARY" + echo "" >> "$GITHUB_STEP_SUMMARY" + + # Count successes and failures + TOTAL=$(cat $MANIFEST_FILES 2>/dev/null | wc -l) + SUCCESSES=$(cat $MANIFEST_FILES 2>/dev/null | python -c " + import sys + import json + count = 0 + for line in sys.stdin: + data = json.loads(line.strip()) + if data.get('error') is None and len(data.get('tags', [])) > 0: + count += 1 + print(count) + ") + FAILURES=$((TOTAL - SUCCESSES)) + + echo "**Total Images:** $TOTAL" >> "$GITHUB_STEP_SUMMARY" + echo "**Successful Builds:** ✅ $SUCCESSES" >> "$GITHUB_STEP_SUMMARY" + echo "**Failed Builds:** ❌ $FAILURES" >> "$GITHUB_STEP_SUMMARY" + echo "" >> "$GITHUB_STEP_SUMMARY" + + # Show failed builds if any + if [ "$FAILURES" -gt 0 ]; then + echo "### Failed Builds" >> "$GITHUB_STEP_SUMMARY" + echo "" >> "$GITHUB_STEP_SUMMARY" + cat $MANIFEST_FILES | python -c " + import sys + import json + for line in sys.stdin: + data = json.loads(line.strip()) + if data.get('error') is not None or len(data.get('tags', [])) == 0: + print(f\"- \\\`{data.get('base_image', 'unknown')}\\\`: {data.get('error', 'No tags generated')}\") + " >> "$GITHUB_STEP_SUMMARY" fi - name: Comment on tracker issue From 822e41747c5735a5252b18b91ca92514d58b9267 Mon Sep 17 00:00:00 2001 From: openhands Date: Mon, 10 Nov 2025 20:34:57 +0000 Subject: [PATCH 54/66] Remove redundant 'Upload build manifest' step The 'Archive build logs' step already packages the entire builds/ directory (including manifest.jsonl files) into build-logs.tar.gz, so a separate step to upload manifest.jsonl is redundant. Co-authored-by: openhands --- .github/workflows/build-swe-bench-images.yml | 8 -------- 1 file changed, 8 deletions(-) diff --git a/.github/workflows/build-swe-bench-images.yml b/.github/workflows/build-swe-bench-images.yml index f5c8d63f..7a783335 100644 --- a/.github/workflows/build-swe-bench-images.yml +++ b/.github/workflows/build-swe-bench-images.yml @@ -104,14 +104,6 @@ jobs: DOCKER_BUILDKIT: 1 BUILDKIT_PROGRESS: plain - - name: Upload build manifest - if: always() - uses: actions/upload-artifact@v4 - with: - name: build-manifest-${{ github.run_id }} - path: builds/**/manifest.jsonl - retention-days: 30 - - name: Archive build logs if: always() run: | From 04f0cf4fbf8822ac6ba669dc5f99402ef6d1caa4 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Tue, 11 Nov 2025 19:29:04 +0000 Subject: [PATCH 55/66] bump sdk to v1.1 --- vendor/software-agent-sdk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/software-agent-sdk b/vendor/software-agent-sdk index aa954ce8..f45c900e 160000 --- a/vendor/software-agent-sdk +++ b/vendor/software-agent-sdk @@ -1 +1 @@ -Subproject commit aa954ce876c55cf4b18e2296ca4458a7b1a44620 +Subproject commit f45c900e98db4d6623ff724ddfc769307494ab89 From a1c93c9c36bc8f8b359ed5aab6ae946942e17922 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Tue, 11 Nov 2025 19:34:48 +0000 Subject: [PATCH 56/66] support remote runtime & bump ver again --- benchmarks/swe_bench/run_infer.py | 90 ++++++++++++++++++++----------- benchmarks/utils/args_parser.py | 7 +++ benchmarks/utils/models.py | 6 ++- uv.lock | 8 +-- vendor/software-agent-sdk | 2 +- 5 files changed, 76 insertions(+), 37 deletions(-) diff --git a/benchmarks/swe_bench/run_infer.py b/benchmarks/swe_bench/run_infer.py index 32d5ed63..79b66ed2 100644 --- a/benchmarks/swe_bench/run_infer.py +++ b/benchmarks/swe_bench/run_infer.py @@ -26,7 +26,7 @@ from openhands.sdk import LLM, Agent, Conversation, get_logger from openhands.sdk.workspace import RemoteWorkspace from openhands.tools.preset.default import get_default_tools -from openhands.workspace import DockerWorkspace +from openhands.workspace import APIRemoteWorkspace, DockerWorkspace logger = get_logger(__name__) @@ -96,45 +96,72 @@ def prepare_workspace(self, instance: EvalInstance) -> RemoteWorkspace: """ Use DockerWorkspace by default. """ - SKIP_BUILD = os.getenv("SKIP_BUILD", "1").lower() in ("1", "true", "yes") - logger.info(f"SKIP_BUILD={SKIP_BUILD}") official_docker_image = get_official_docker_image(instance.id) build_target = "source-minimal" custom_tag = extract_custom_tag(official_docker_image) - # For non-binary targets, append target suffix suffix = f"-{build_target}" if build_target != "binary" else "" - agent_server_image = ( - f"{EVAL_AGENT_SERVER_IMAGE}:{SDK_SHORT_SHA}-{custom_tag}{suffix}" - ) - if not SKIP_BUILD: - logger.info( - f"Building workspace from {official_docker_image} " - f"for instance {instance.id}. " - "This may take a while...\n" - "You can run benchmarks/swe_bench/build_images.py and set " - "SWE_BENCH_SKIP_BUILD=1 to skip building and use pre-built " - "agent-server image." + + if self.metadata.workspace_type == "docker": + agent_server_image = ( + f"{EVAL_AGENT_SERVER_IMAGE}:{SDK_SHORT_SHA}-{custom_tag}{suffix}" ) - output = build_image( - base_image=official_docker_image, - target_image=EVAL_AGENT_SERVER_IMAGE, - custom_tag=custom_tag, - target=build_target, - push=False, + SKIP_BUILD = os.getenv("SKIP_BUILD", "1").lower() in ("1", "true", "yes") + logger.info(f"SKIP_BUILD={SKIP_BUILD}") + if not SKIP_BUILD: + logger.info( + f"Building workspace from {official_docker_image} " + f"for instance {instance.id}. " + "This may take a while...\n" + "You can run benchmarks/swe_bench/build_images.py and set " + "SWE_BENCH_SKIP_BUILD=1 to skip building and use pre-built " + "agent-server image." + ) + output = build_image( + base_image=official_docker_image, + target_image=EVAL_AGENT_SERVER_IMAGE, + custom_tag=custom_tag, + target=build_target, + push=False, + ) + logger.info(f"Image build output: {output}") + assert output.error is None, f"Image build failed: {output.error}" + if agent_server_image not in output.tags: + raise RuntimeError( + f"Built image tags {output.tags} do not include expected tag " + f"{agent_server_image}" + ) + + workspace = DockerWorkspace( + server_image=agent_server_image, + working_dir="/workspace", ) - logger.info(f"Image build output: {output}") - assert output.error is None, f"Image build failed: {output.error}" - if agent_server_image not in output.tags: - raise RuntimeError( - f"Built image tags {output.tags} do not include expected tag " - f"{agent_server_image}" + elif self.metadata.workspace_type == "remote": + runtime_api_key = os.getenv("RUNTIME_API_KEY") + sdk_short_sha = os.getenv("SDK_SHORT_SHA", SDK_SHORT_SHA) + if not runtime_api_key: + raise ValueError( + "RUNTIME_API_KEY environment variable is not set for remote workspace" ) - workspace = DockerWorkspace( - server_image=agent_server_image, - working_dir="/workspace", - ) + agent_server_image = ( + f"{EVAL_AGENT_SERVER_IMAGE}:{sdk_short_sha}-{custom_tag}{suffix}" + ) + logger.info( + f"Using remote workspace with image {agent_server_image} (sdk sha: {sdk_short_sha})" + ) + workspace = APIRemoteWorkspace( + runtime_api_url=os.getenv( + "RUNTIME_API_URL", "https://runtime.eval.all-hands.dev" + ), + runtime_api_key=runtime_api_key, + server_image="ghcr.io/openhands/agent-server:main-python", + ) + else: + raise ValueError( + f"Unsupported workspace_type: {self.metadata.workspace_type}" + ) + for cmd in self.metadata.env_setup_commands or []: res = workspace.execute_command(cmd) if res.exit_code != 0: @@ -297,6 +324,7 @@ def main() -> None: critic_name=args.critic, selected_instances_file=args.select, max_retries=args.max_retries, + workspace_type=args.workspace, ) # Run orchestrator with a simple JSONL writer diff --git a/benchmarks/utils/args_parser.py b/benchmarks/utils/args_parser.py index cb1584e5..56f950ad 100644 --- a/benchmarks/utils/args_parser.py +++ b/benchmarks/utils/args_parser.py @@ -25,6 +25,13 @@ def get_parser(add_llm_config: bool = True) -> argparse.ArgumentParser: help="Dataset name", ) parser.add_argument("--split", type=str, default="test", help="Dataset split") + parser.add_argument( + "--workspace", + type=str, + default="docker", + choices=["docker", "remote"], + help="Type of workspace to use (default: docker)", + ) parser.add_argument( "--max-iterations", type=int, default=100, help="Maximum iterations" ) diff --git a/benchmarks/utils/models.py b/benchmarks/utils/models.py index b10df1f3..d3599772 100644 --- a/benchmarks/utils/models.py +++ b/benchmarks/utils/models.py @@ -1,4 +1,4 @@ -from typing import Any +from typing import Any, Literal from pydantic import BaseModel, Field @@ -45,6 +45,10 @@ class EvalMetadata(BaseModel): ge=0, description="Maximum number of retries for instances that throw exceptions", ) + workspace_type: Literal["docker", "remote"] = Field( + default="docker", + description="Type of workspace to use, e.g., 'docker' or 'remote'", + ) EvalInstanceID = str diff --git a/uv.lock b/uv.lock index 6ea29413..ab6872cf 100644 --- a/uv.lock +++ b/uv.lock @@ -1942,7 +1942,7 @@ wheels = [ [[package]] name = "openhands-agent-server" -version = "1.0.0" +version = "1.1.0" source = { editable = "vendor/software-agent-sdk/openhands-agent-server" } dependencies = [ { name = "aiosqlite" }, @@ -2034,7 +2034,7 @@ dev = [ [[package]] name = "openhands-sdk" -version = "1.0.0" +version = "1.1.0" source = { editable = "vendor/software-agent-sdk/openhands-sdk" } dependencies = [ { name = "fastmcp" }, @@ -2070,7 +2070,7 @@ provides-extras = ["boto3"] [[package]] name = "openhands-tools" -version = "1.0.0" +version = "1.1.0" source = { editable = "vendor/software-agent-sdk/openhands-tools" } dependencies = [ { name = "bashlex" }, @@ -2097,7 +2097,7 @@ requires-dist = [ [[package]] name = "openhands-workspace" -version = "1.0.0" +version = "1.1.0" source = { editable = "vendor/software-agent-sdk/openhands-workspace" } dependencies = [ { name = "openhands-sdk" }, diff --git a/vendor/software-agent-sdk b/vendor/software-agent-sdk index f45c900e..85803a23 160000 --- a/vendor/software-agent-sdk +++ b/vendor/software-agent-sdk @@ -1 +1 @@ -Subproject commit f45c900e98db4d6623ff724ddfc769307494ab89 +Subproject commit 85803a23fc51edf2990a46be39b28df60f99ebea From 07abd723af145fcc7f7d3024ebeb2ef6a614d563 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Tue, 11 Nov 2025 19:36:54 +0000 Subject: [PATCH 57/66] fix target type --- benchmarks/swe_bench/run_infer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/benchmarks/swe_bench/run_infer.py b/benchmarks/swe_bench/run_infer.py index 79b66ed2..6e3d08f9 100644 --- a/benchmarks/swe_bench/run_infer.py +++ b/benchmarks/swe_bench/run_infer.py @@ -155,7 +155,8 @@ def prepare_workspace(self, instance: EvalInstance) -> RemoteWorkspace: "RUNTIME_API_URL", "https://runtime.eval.all-hands.dev" ), runtime_api_key=runtime_api_key, - server_image="ghcr.io/openhands/agent-server:main-python", + server_image=agent_server_image, + target_type="source" if "source" in build_target else "binary", ) else: raise ValueError( From 49499571d7b65d1011e84c47580e2e808e8bb993 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Tue, 11 Nov 2025 22:57:25 +0000 Subject: [PATCH 58/66] bump sdk --- vendor/software-agent-sdk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/software-agent-sdk b/vendor/software-agent-sdk index 85803a23..d67bd848 160000 --- a/vendor/software-agent-sdk +++ b/vendor/software-agent-sdk @@ -1 +1 @@ -Subproject commit 85803a23fc51edf2990a46be39b28df60f99ebea +Subproject commit d67bd8485bd1389e4e30a5b89d2c9d8f790cd521 From 94c4326dab74e06d9a79c15f63fa49457ed6dd89 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Wed, 12 Nov 2025 16:58:55 +0000 Subject: [PATCH 59/66] check image exists before launching remote runtime job --- benchmarks/swe_bench/run_infer.py | 6 ++ benchmarks/utils/image_utils.py | 105 ++++++++++++++++++++++++++++++ pyproject.toml | 1 + uv.lock | 31 +++++++++ 4 files changed, 143 insertions(+) create mode 100644 benchmarks/utils/image_utils.py diff --git a/benchmarks/swe_bench/run_infer.py b/benchmarks/swe_bench/run_infer.py index 6e3d08f9..f7e840f3 100644 --- a/benchmarks/swe_bench/run_infer.py +++ b/benchmarks/swe_bench/run_infer.py @@ -17,6 +17,7 @@ construct_eval_output_dir, get_default_on_result_writer, ) +from benchmarks.utils.image_utils import image_exists from benchmarks.utils.models import ( EvalInstance, EvalMetadata, @@ -147,6 +148,11 @@ def prepare_workspace(self, instance: EvalInstance) -> RemoteWorkspace: agent_server_image = ( f"{EVAL_AGENT_SERVER_IMAGE}:{sdk_short_sha}-{custom_tag}{suffix}" ) + if not image_exists(agent_server_image): + raise RuntimeError( + f"Agent server image {agent_server_image} does not exist in container registry, " + "make sure to build, push it, and make it public accessible before using remote workspace." + ) logger.info( f"Using remote workspace with image {agent_server_image} (sdk sha: {sdk_short_sha})" ) diff --git a/benchmarks/utils/image_utils.py b/benchmarks/utils/image_utils.py new file mode 100644 index 00000000..a463f3b4 --- /dev/null +++ b/benchmarks/utils/image_utils.py @@ -0,0 +1,105 @@ +#!/usr/bin/env python3 +import base64 +import sys + +import requests + + +ACCEPT = ",".join( + [ + "application/vnd.oci.image.index.v1+json", + "application/vnd.oci.image.manifest.v1+json", + "application/vnd.docker.distribution.manifest.v2+json", + "application/vnd.docker.distribution.manifest.list.v2+json", + ] +) + + +def _parse(image: str): + digest = None + if "@" in image: + image, digest = image.split("@", 1) + tag = None + last = image.rsplit("/", 1)[-1] + if ":" in last: # tag after last slash (not registry:port) + image, tag = image.rsplit(":", 1) + parts = image.split("/") + if "." in parts[0] or ":" in parts[0] or parts[0] == "localhost": + registry, repo = parts[0], "/".join(parts[1:]) + else: + registry, repo = "registry-1.docker.io", "/".join(parts) + ref = digest or tag or "latest" + return registry, repo, ref + + +def _dockerhub_token(repo: str) -> str | None: + url = f"https://auth.docker.io/token?service=registry.docker.io&scope=repository:{repo}:pull" + r = requests.get(url, timeout=10) + if r.ok: + return r.json().get("token") + return None + + +def _ghcr_token(repo: str, username: str | None, pat: str | None) -> str | None: + # Public: anonymous works; Private: Basic auth with PAT (read:packages) to get bearer + url = f"https://ghcr.io/token?service=ghcr.io&scope=repository:{repo}:pull" + headers = {} + if username and pat: + headers["Authorization"] = ( + "Basic " + base64.b64encode(f"{username}:{pat}".encode()).decode() + ) + r = requests.get(url, headers=headers, timeout=10) + if r.ok: + return r.json().get("token") + return None + + +def image_exists( + image_ref: str, + gh_username: str | None = None, + gh_pat: str | None = None, # GitHub PAT with read:packages for private GHCR + docker_token: str | None = None, # Docker Hub JWT if you already have one +) -> bool: + registry, repo, ref = _parse(image_ref) + headers = {"Accept": ACCEPT} + + if registry in ("docker.io", "index.docker.io", "registry-1.docker.io"): + base = "https://registry-1.docker.io" + token = docker_token or _dockerhub_token(repo) + if token: + headers["Authorization"] = f"Bearer {token}" + elif registry == "ghcr.io": + base = "https://ghcr.io" + token = _ghcr_token(repo, gh_username, gh_pat) + if token: + headers["Authorization"] = f"Bearer {token}" + else: + base = f"https://{registry}" + + url = f"{base}/v2/{repo}/manifests/{ref}" + try: + r = requests.head(url, headers=headers, timeout=10) + if r.status_code in ( + 405, + 406, + ): # some registries disallow HEAD or need GET for content-negotiation + r = requests.get(url, headers=headers, timeout=10) + # 200 -> exists; 401/403 -> exists but unauthorized; 404 -> not found + return r.status_code == 200 + except requests.RequestException: + return False + + +if __name__ == "__main__": + if len(sys.argv) < 2: + print( + "Usage: python image_check.py [gh_user] [gh_pat]" + ) + sys.exit(1) + + image = sys.argv[1] + gh_user = sys.argv[2] if len(sys.argv) > 2 else None + gh_pat = sys.argv[3] if len(sys.argv) > 3 else None + + ok = image_exists(image, gh_username=gh_user, gh_pat=gh_pat) + print(f"{image} -> {'✅ exists' if ok else '❌ not found or unauthorized'}") diff --git a/pyproject.toml b/pyproject.toml index 5e924d58..8561951d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,6 +19,7 @@ dependencies = [ "openhands-workspace", "modal>=1.1.4", "swebench", + "docker-registry-client>=0.5.2", ] [project.scripts] diff --git a/uv.lock b/uv.lock index ab6872cf..7c233247 100644 --- a/uv.lock +++ b/uv.lock @@ -719,6 +719,20 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e3/26/57c6fb270950d476074c087527a558ccb6f4436657314bfb6cdf484114c4/docker-7.1.0-py3-none-any.whl", hash = "sha256:c96b93b7f0a746f9e77d325bcfb87422a3d8bd4f03136ae8a85b37f1898d5fc0", size = 147774, upload-time = "2024-05-23T11:13:55.01Z" }, ] +[[package]] +name = "docker-registry-client" +version = "0.5.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "ecdsa" }, + { name = "jws" }, + { name = "requests" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/82/3c/287104dcdbd6fd3d367b8bc50f1387f8326fb8026312af61b2bcf5c09387/docker-registry-client-0.5.2.tar.gz", hash = "sha256:8482efc9ec9ec708dfb74193cdfa530eee23c93596c63d704c5a3702b049e58f", size = 8037, upload-time = "2017-06-16T16:05:24.387Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c7/b4/f1b3b2da3024fc20fe1e359871dc3c4f8e0ade1b0bbd85294f244c6a29d7/docker_registry_client-0.5.2-py2.py3-none-any.whl", hash = "sha256:cb6c1c5e72e091ada9b32499c8529850e247bafb2202bc31bbe45e9710bf9038", size = 11731, upload-time = "2017-06-16T16:05:26.057Z" }, +] + [[package]] name = "docstring-parser" version = "0.17.0" @@ -737,6 +751,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/66/dd/f95350e853a4468ec37478414fc04ae2d61dad7a947b3015c3dcc51a09b9/docutils-0.22.2-py3-none-any.whl", hash = "sha256:b0e98d679283fc3bb0ead8a5da7f501baa632654e7056e9c5846842213d674d8", size = 632667, upload-time = "2025-09-20T17:55:43.052Z" }, ] +[[package]] +name = "ecdsa" +version = "0.13.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/8c/d8/9c3596fd0f18ae0a76333492a119c00183323d8e64de1a4f4bd642856963/ecdsa-0.13.3.tar.gz", hash = "sha256:163c80b064a763ea733870feb96f9dd9b92216cfcacd374837af18e4e8ec3d4d", size = 60477, upload-time = "2019-10-07T14:05:24.318Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a6/81/2b170b460c84fdc8700cf08aa077ac6a9ff41f4ad3f05d0b3a64ba9f8f2e/ecdsa-0.13.3-py2.py3-none-any.whl", hash = "sha256:9814e700890991abeceeb2242586024d4758c8fc18445b194a49bd62d85861db", size = 52113, upload-time = "2019-10-07T14:05:22.583Z" }, +] + [[package]] name = "email-validator" version = "2.3.0" @@ -1470,6 +1493,12 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/41/45/1a4ed80516f02155c51f51e8cedb3c1902296743db0bbc66608a0db2814f/jsonschema_specifications-2025.9.1-py3-none-any.whl", hash = "sha256:98802fee3a11ee76ecaca44429fda8a41bff98b00a0f2838151b113f210cc6fe", size = 18437, upload-time = "2025-09-08T01:34:57.871Z" }, ] +[[package]] +name = "jws" +version = "0.1.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/01/9e/1536d578ed50f5fe8196310ddcc921a3cd8e973312d60ac74488b805d395/jws-0.1.3.tar.gz", hash = "sha256:0e3d4cb06ae7c5c1d16d357b4e7acb5c5ecab0cccb3a4b998035b85052488053", size = 8104, upload-time = "2015-03-10T15:53:37.844Z" } + [[package]] name = "lazy-object-proxy" version = "1.12.0" @@ -1975,6 +2004,7 @@ version = "0.1.0" source = { editable = "." } dependencies = [ { name = "datasets" }, + { name = "docker-registry-client" }, { name = "huggingface-hub" }, { name = "jinja2" }, { name = "modal" }, @@ -2005,6 +2035,7 @@ dev = [ [package.metadata] requires-dist = [ { name = "datasets" }, + { name = "docker-registry-client", specifier = ">=0.5.2" }, { name = "huggingface-hub" }, { name = "jinja2" }, { name = "modal", specifier = ">=1.1.4" }, From 5d734aac4bf2c6c7db08f1693b52f23a67a94204 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Thu, 13 Nov 2025 16:10:28 +0000 Subject: [PATCH 60/66] trying fixing docker build trigger --- .github/workflows/build-swe-bench-images.yml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build-swe-bench-images.yml b/.github/workflows/build-swe-bench-images.yml index 9a4aee6f..a87c5e09 100644 --- a/.github/workflows/build-swe-bench-images.yml +++ b/.github/workflows/build-swe-bench-images.yml @@ -1,7 +1,7 @@ name: Build SWE-Bench Images on: - pull_request: + pull_request:pull_request_target: types: [labeled] workflow_dispatch: inputs: @@ -40,9 +40,10 @@ concurrency: jobs: build-and-push: # Only run on workflow_dispatch or if the PR is labeled with 'build-swebench' - if: | + if: > github.event_name == 'workflow_dispatch' || - (github.event_name == 'pull_request' && github.event.label.name == 'build-swebench') + (github.event_name == 'pull_request_target' && github.event.action == 'labeled' && github.event.label.name == 'build-swebench') + runs-on: labels: blacksmith-32vcpu-ubuntu-2204 From 3e1f8f9c20e70304fc6391736b01334a7945717c Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Thu, 13 Nov 2025 16:24:12 +0000 Subject: [PATCH 61/66] fix typo --- .github/workflows/build-swe-bench-images.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-swe-bench-images.yml b/.github/workflows/build-swe-bench-images.yml index a87c5e09..f7821da9 100644 --- a/.github/workflows/build-swe-bench-images.yml +++ b/.github/workflows/build-swe-bench-images.yml @@ -1,7 +1,7 @@ name: Build SWE-Bench Images on: - pull_request:pull_request_target: + pull_request_target: types: [labeled] workflow_dispatch: inputs: From 860187557555506b4a9423ec9813c4ee9d9e4882 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Thu, 13 Nov 2025 16:30:51 +0000 Subject: [PATCH 62/66] tweak --- .github/workflows/build-swe-bench-images.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build-swe-bench-images.yml b/.github/workflows/build-swe-bench-images.yml index f7821da9..db3612c9 100644 --- a/.github/workflows/build-swe-bench-images.yml +++ b/.github/workflows/build-swe-bench-images.yml @@ -2,7 +2,8 @@ name: Build SWE-Bench Images on: pull_request_target: - types: [labeled] + types: + - labeled workflow_dispatch: inputs: dataset: @@ -42,8 +43,7 @@ jobs: # Only run on workflow_dispatch or if the PR is labeled with 'build-swebench' if: > github.event_name == 'workflow_dispatch' || - (github.event_name == 'pull_request_target' && github.event.action == 'labeled' && github.event.label.name == 'build-swebench') - + (github.event_name == 'pull_request_target' && github.event.label.name == 'build-swebench') runs-on: labels: blacksmith-32vcpu-ubuntu-2204 From af6966a8307c31847b9631c7df957c8d22655a7b Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Thu, 13 Nov 2025 16:34:02 +0000 Subject: [PATCH 63/66] tweak --- .github/workflows/build-swe-bench-images.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/build-swe-bench-images.yml b/.github/workflows/build-swe-bench-images.yml index db3612c9..9cc4844c 100644 --- a/.github/workflows/build-swe-bench-images.yml +++ b/.github/workflows/build-swe-bench-images.yml @@ -2,8 +2,7 @@ name: Build SWE-Bench Images on: pull_request_target: - types: - - labeled + types: [labeled] workflow_dispatch: inputs: dataset: @@ -40,10 +39,11 @@ concurrency: jobs: build-and-push: - # Only run on workflow_dispatch or if the PR is labeled with 'build-swebench' if: > github.event_name == 'workflow_dispatch' || - (github.event_name == 'pull_request_target' && github.event.label.name == 'build-swebench') + (github.event_name == 'pull_request_target' && + github.event.label.name == 'build-swebench') + runs-on: labels: blacksmith-32vcpu-ubuntu-2204 From 2160810d21aae96d7f81f9b6c2d898b2a2cac439 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Thu, 13 Nov 2025 16:36:02 +0000 Subject: [PATCH 64/66] drop default --- .github/workflows/build-swe-bench-images.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build-swe-bench-images.yml b/.github/workflows/build-swe-bench-images.yml index 9cc4844c..52b0d6e6 100644 --- a/.github/workflows/build-swe-bench-images.yml +++ b/.github/workflows/build-swe-bench-images.yml @@ -18,7 +18,7 @@ on: max-workers: description: 'Number of concurrent builds' required: false - default: '64' + default: '32' type: string n-limit: description: 'Limit number of images to build (for testing). Leave blank for no limit.' @@ -30,7 +30,7 @@ on: env: DATASET: princeton-nlp/SWE-bench_Verified SPLIT: test - MAX_WORKERS: '64' + MAX_WORKERS: '32' N_LIMIT: '500' concurrency: From fd5c0c68cdc8d8a13f5b692c6f8ffd10a810b2fb Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Thu, 13 Nov 2025 17:22:26 +0000 Subject: [PATCH 65/66] sleep after failure --- benchmarks/utils/build_utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/benchmarks/utils/build_utils.py b/benchmarks/utils/build_utils.py index cdcb9e61..31b718d4 100644 --- a/benchmarks/utils/build_utils.py +++ b/benchmarks/utils/build_utils.py @@ -7,6 +7,7 @@ import contextlib import io import subprocess +import time import tomllib from concurrent.futures import ProcessPoolExecutor, as_completed from datetime import UTC, datetime @@ -224,6 +225,7 @@ def _build_with_logging( logger.info( f"Retrying build for {base_image} (attempt {attempt + 1}/{max_retries})" ) + time.sleep(2 + attempt * 2) result = build_image(base_image, target_image, custom_tag, target, push) result.log_path = str(log_path) if not result.error: From ea3f69fbe7de07b0f68cec5c12777f54b947d6a6 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Thu, 13 Nov 2025 17:25:25 +0000 Subject: [PATCH 66/66] check target image existence before build --- benchmarks/utils/build_utils.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/benchmarks/utils/build_utils.py b/benchmarks/utils/build_utils.py index 31b718d4..e55b3273 100644 --- a/benchmarks/utils/build_utils.py +++ b/benchmarks/utils/build_utils.py @@ -20,6 +20,7 @@ from benchmarks.utils.args_parser import get_parser from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE +from benchmarks.utils.image_utils import image_exists from openhands.agent_server.docker.build import BuildOptions, TargetType, build from openhands.sdk import get_logger @@ -196,6 +197,11 @@ def build_image( git_sha=git_sha, sdk_version=sdk_version, ) + for t in opts.all_tags[0]: + # Check if image exists or not + if image_exists(t): + logger.info(f"Image {t} already exists. Skipping build.") + return BuildOutput(base_image=base_image, tags=[t], error=None) tags = build(opts) return BuildOutput(base_image=base_image, tags=tags, error=None)