diff --git a/.gitignore b/.gitignore
index 43135338..d89f1a36 100644
--- a/.gitignore
+++ b/.gitignore
@@ -206,6 +206,8 @@ workspace/
# IDE and editor directories
.openhands/
+!.openhands/setup.sh
+!.openhands/microagents/
.vscode/
# LLM configuration directory (contains API keys and sensitive configs)
diff --git a/.openhands/microagents/repo.md b/.openhands/microagents/repo.md
new file mode 100644
index 00000000..8aa0c1dc
--- /dev/null
+++ b/.openhands/microagents/repo.md
@@ -0,0 +1,103 @@
+
+You are a collaborative software engineering partner focused on maintaining high-quality benchmark evaluation infrastructure. Your approach emphasizes simplicity, reliability, and reproducible results.
+
+# Core Engineering Principles
+
+1. **Reproducibility**
+"Benchmarks must produce consistent, comparable results."
+ • Pin dependencies and submodule versions
+ • Maintain isolation between test environments
+ • Document evaluation methodology clearly
+
+2. **Simplicity**
+"Clear evaluation logic is easier to validate and debug."
+ • Prefer straightforward data transformations
+ • Avoid complex abstractions in evaluation code
+ • Keep benchmark scripts focused and readable
+
+3. **Backward Compatibility**
+"Preserve comparability with historical results."
+ • Changes should not invalidate previous evaluations
+ • Document any changes that affect metrics
+ • Maintain compatibility with upstream benchmark datasets
+
+4. **Pragmatic Testing**
+"Test what matters for accurate evaluation."
+ • Validate data format conversions
+ • Verify evaluation harness integration
+ • Focus on correctness of benchmark logic
+
+
+
+- Run `make build` to initialize the agent-sdk submodule and install dependencies
+- We use pre-commit hooks (`.pre-commit-config.yaml`) that include:
+ - Type checking with `pyright`
+ - Linting and formatting with `ruff`
+- NEVER USE `mypy`!
+- Do NOT commit ALL files, only commit relevant changes!
+- Add "Co-authored-by: openhands <openhands@all-hands.dev>" to every commit message
+- Run tests with `uv run pytest`
+
+# Project Structure
+- `benchmarks/swe_bench/` - SWE-Bench evaluation (code generation on GitHub issues)
+- `benchmarks/gaia/` - GAIA evaluation (general AI assistant tasks)
+- `benchmarks/utils/` - Shared utilities (patch handling, etc.)
+- `vendor/agent-sdk/` - Git submodule for OpenHands Agent SDK
+- `.llm_config/` - LLM configuration files (JSON format)
+
+# Submodule Management
+The Agent SDK is vendored as a git submodule. To update:
+```bash
+cd vendor/agent-sdk
+git fetch && git checkout <desired-commit>
+cd ../..
+git add vendor/agent-sdk
+git commit -m "Update agent-sdk to <desired-commit>"
+make build # Rebuild environment
+```
+
+
+
+- Avoid `sys.path.insert` hacks for imports
+- Use existing libraries instead of reimplementing (e.g., use `swebench` package for evaluation)
+- Avoid `# type: ignore` unless absolutely necessary
+- Avoid inline imports unless required for circular dependencies
+- Prefer explicit type hints over runtime checks with `getattr`/`hasattr`
+- Use real newlines in commit messages, not literal `\n`
+
+
+
+- After editing a file, run `uv run pre-commit run --files [filepath]`
+- Write focused tests that cover edge cases, not exhaustive tests
+- Put tests in corresponding test folders: `benchmarks/*/tests/`
+- Avoid test classes unless necessary
+- Extract common test setup into fixtures in `conftest.py`
+- Test only logic in this codebase, not third-party functionality
+
+
+
+# Adding New Benchmarks
+1. Create new directory under `benchmarks/`
+2. Implement `run_infer.py` for inference and output generation
+3. Add evaluation script if needed (or integrate with existing harness)
+4. Register CLI entrypoint in `pyproject.toml` under `[project.scripts]`
+5. Update README.md with usage instructions
+
+# LLM Configuration
+LLM configs use JSON matching the [LLM class schema](https://github.com/All-Hands-AI/agent-sdk/blob/main/openhands/sdk/llm/llm.py#L93):
+```json
+{
+ "model": "litellm_proxy/anthropic/claude-sonnet-4-20250514",
+ "base_url": "https://llm-proxy.eval.all-hands.dev",
+ "api_key": "YOUR_API_KEY"
+}
+```
+Validate with: `uv run validate-cfg .llm_config/your-config.json`
+
+# Data Format Conversions
+When converting between OpenHands format and benchmark-specific formats:
+- Preserve all required fields for evaluation
+- Handle missing/optional fields gracefully
+- Log conversion warnings for debugging
+- Validate output format before evaluation
+
diff --git a/.openhands/setup.sh b/.openhands/setup.sh
new file mode 100755
index 00000000..9082145a
--- /dev/null
+++ b/.openhands/setup.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+
+if ! command -v uv &> /dev/null; then
+ echo "uv is not installed. Installing..."
+ curl -LsSf https://astral.sh/uv/install.sh | sh
+else
+ echo "uv is already installed."
+ uv self update # always update to the latest version
+fi
+
+make build
diff --git a/README.md b/README.md
index 3e5f1785..c8d9f3b5 100644
--- a/README.md
+++ b/README.md
@@ -100,7 +100,7 @@ uv run benchmarks/swe_bench/build_images.py \
```
-### 3. Run SWE-Bench Evaluation
+### 3. Run SWE-Bench Inference
```bash
# Run evaluation with your configured LLM
uv run swebench-infer .llm_config/example.json \
@@ -134,6 +134,24 @@ python -m benchmarks.swe_bench.run_infer \
This will only evaluate the instances listed in the file.
+### 5. Evaluate SWE-Bench Results
+After running inference, evaluate the results using the official SWE-Bench evaluation:
+
+```bash
+# Convert output format and run SWE-Bench evaluation
+uv run swebench-eval output.jsonl
+
+# Or specify custom dataset and output file
+uv run swebench-eval output.jsonl --dataset princeton-nlp/SWE-bench_Lite --output-file results.swebench.jsonl
+
+# Only convert format without running evaluation
+uv run swebench-eval output.jsonl --skip-evaluation
+```
+
+The script will:
+1. Convert OpenHands output format to SWE-Bench prediction format
+2. Run the official SWE-Bench evaluation harness
+
## Links
- **Original OpenHands**: https://github.com/All-Hands-AI/OpenHands/
diff --git a/benchmarks/swe_bench/eval_infer.py b/benchmarks/swe_bench/eval_infer.py
new file mode 100644
index 00000000..c2dca753
--- /dev/null
+++ b/benchmarks/swe_bench/eval_infer.py
@@ -0,0 +1,261 @@
+#!/usr/bin/env python3
+"""
+SWE-Bench Evaluation Script
+
+This script converts OpenHands output.jsonl format to SWE-Bench prediction format
+and runs the SWE-Bench evaluation.
+
+Usage:
+    uv run swebench-eval <path/to/output.jsonl> [--dataset DATASET] [--skip-evaluation]
+"""
+
+import argparse
+import json
+import subprocess
+import sys
+from pathlib import Path
+
+from benchmarks.utils.patch_utils import remove_files_from_patch
+from openhands.sdk import get_logger
+
+
+logger = get_logger(__name__)
+
+
+def convert_to_swebench_format(
+ input_file: str, output_file: str, model_name: str = "OpenHands"
+) -> None:
+ """
+ Convert OpenHands output.jsonl to SWE-Bench prediction format.
+
+ OpenHands format:
+ {
+ "instance_id": "django__django-11333",
+ "test_result": {
+ "git_patch": "diff --git a/file.py b/file.py\n..."
+ },
+ "instruction": "...",
+ "error": null,
+ "history": [...]
+ }
+
+ SWE-Bench format:
+ {
+ "instance_id": "django__django-11333",
+ "model_patch": "diff --git a/file.py b/file.py\n...",
+ "model_name_or_path": "OpenHands"
+ }
+ """
+ logger.info(f"Converting {input_file} to SWE-Bench format: {output_file}")
+
+ converted_count = 0
+ error_count = 0
+
+ with open(input_file, "r") as infile, open(output_file, "w") as outfile:
+ for line_num, line in enumerate(infile, 1):
+ try:
+ line = line.strip()
+ if not line:
+ continue
+
+ data = json.loads(line)
+
+ # Extract required fields
+ instance_id = data.get("instance_id")
+ if not instance_id:
+ logger.warning(f"Line {line_num}: Missing instance_id")
+ error_count += 1
+ continue
+
+ # Extract git_patch from test_result
+ test_result = data.get("test_result", {})
+ git_patch = test_result.get("git_patch", "")
+
+ if not git_patch:
+ logger.warning(
+ f"Line {line_num}: Missing or empty git_patch for {instance_id}"
+ )
+ # Still create entry with empty patch
+ git_patch = ""
+
+ # postprocess git_patch
+ setup_files = ["pyproject.toml", "tox.ini", "setup.py"]
+ git_patch = remove_files_from_patch(git_patch, setup_files)
+
+ # Create SWE-Bench format entry
+ swebench_entry = {
+ "instance_id": instance_id,
+ "model_patch": git_patch,
+ "model_name_or_path": model_name,
+ }
+
+ # Write to output file
+ outfile.write(json.dumps(swebench_entry) + "\n")
+ converted_count += 1
+
+ except json.JSONDecodeError as e:
+ logger.error(f"Line {line_num}: Invalid JSON - {e}")
+ error_count += 1
+ except Exception as e:
+ logger.error(f"Line {line_num}: Unexpected error - {e}")
+ error_count += 1
+
+ logger.info(
+ f"Conversion complete: {converted_count} entries converted, "
+ f"{error_count} errors"
+ )
+
+ if converted_count == 0:
+ raise ValueError("No valid entries were converted")
+
+
+def run_swebench_evaluation(
+ predictions_file: str,
+ dataset: str = "princeton-nlp/SWE-bench_Verified",
+ workers: str = "12",
+) -> None:
+ """
+ Run SWE-Bench evaluation on the predictions file.
+
+ Args:
+ predictions_file: Path to the SWE-Bench format predictions file
+ dataset: SWE-Bench dataset to evaluate against
+ workers: Number of workers to use for evaluation
+ """
+ logger.info(f"Running SWE-Bench evaluation on {predictions_file}")
+
+ try:
+ # Get the directory of the predictions file
+ predictions_path = Path(predictions_file)
+ predictions_dir = predictions_path.parent
+ predictions_filename = predictions_path.name
+
+ # Run SWE-Bench evaluation using global python (not UV environment)
+ # since swebench is installed globally
+ cmd = [
+ "/usr/bin/python3",
+ "-m",
+ "swebench.harness.run_evaluation",
+ "--dataset_name",
+ dataset,
+ "--predictions_path",
+ predictions_filename,
+ "--max_workers",
+ str(workers),
+ "--run_id",
+ f"eval_{predictions_path.stem}",
+ ]
+
+ logger.info(f"Running command: {' '.join(cmd)}")
+ logger.info(f"Working directory: {predictions_dir}")
+ logger.info("SWE-Bench evaluation output:")
+ print("-" * 80)
+
+ # Stream output directly to console, running from predictions file directory
+ result = subprocess.run(cmd, text=True, cwd=predictions_dir)
+
+ print("-" * 80)
+ if result.returncode == 0:
+ logger.info("SWE-Bench evaluation completed successfully")
+ else:
+ logger.error(
+ f"SWE-Bench evaluation failed with return code {result.returncode}"
+ )
+ raise subprocess.CalledProcessError(result.returncode, cmd)
+
+ except FileNotFoundError:
+ logger.error(
+ "SWE-Bench evaluation command not found. "
+ "Make sure SWE-Bench is properly installed."
+ )
+ raise
+ except Exception as e:
+ logger.error(f"Error running SWE-Bench evaluation: {e}")
+ raise
+
+
+def main() -> None:
+ """Main entry point for the script."""
+ parser = argparse.ArgumentParser(
+ description="Convert OpenHands output to SWE-Bench format and run evaluation",
+ formatter_class=argparse.RawDescriptionHelpFormatter,
+ epilog="""
+Examples:
+ uv run swebench-eval output.jsonl
+ uv run swebench-eval /path/to/output.jsonl --dataset princeton-nlp/SWE-bench_Lite
+ uv run swebench-eval output.jsonl --model-name "MyModel-v1.0"
+ """,
+ )
+
+ parser.add_argument("input_file", help="Path to the OpenHands output.jsonl file")
+
+ parser.add_argument(
+ "--dataset",
+ default="princeton-nlp/SWE-bench_Verified",
+ help="SWE-Bench dataset to evaluate against "
+ "(default: princeton-nlp/SWE-bench_Verified)",
+ )
+
+ parser.add_argument(
+ "--output-file",
+ help="Output file for SWE-Bench format "
+ "(default: input_file with .swebench.jsonl extension)",
+ )
+
+ parser.add_argument(
+ "--skip-evaluation",
+ action="store_true",
+ help="Only convert format, skip running evaluation",
+ )
+
+ parser.add_argument(
+ "--model-name",
+ default="OpenHands",
+ help="Model name to use in the model_name_or_path field (default: OpenHands)",
+ )
+
+ parser.add_argument(
+ "--workers",
+ default="12",
+ help="Number of workers to use when evaluating",
+ )
+
+ args = parser.parse_args()
+
+ # Validate input file
+ input_file = Path(args.input_file)
+ if not input_file.exists():
+ logger.error(f"Input file does not exist: {input_file}")
+ sys.exit(1)
+
+ if not input_file.suffix == ".jsonl":
+ logger.warning(f"Input file does not have .jsonl extension: {input_file}")
+
+ # Determine output file
+ if args.output_file:
+ output_file = Path(args.output_file)
+ else:
+ output_file = input_file.with_suffix(".swebench.jsonl")
+
+ logger.info(f"Input file: {input_file}")
+ logger.info(f"Output file: {output_file}")
+ logger.info(f"Dataset: {args.dataset}")
+ logger.info(f"Model name: {args.model_name}")
+
+ try:
+ # Convert format
+ convert_to_swebench_format(str(input_file), str(output_file), args.model_name)
+
+ if not args.skip_evaluation:
+ # Run evaluation
+ run_swebench_evaluation(str(output_file), args.dataset, args.workers)
+
+ logger.info("Script completed successfully!")
+
+ except Exception as e:
+ logger.error(f"Script failed: {e}")
+ sys.exit(1)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/benchmarks/utils/binary_patch_utils.py b/benchmarks/utils/binary_patch_utils.py
deleted file mode 100644
index f17737e7..00000000
--- a/benchmarks/utils/binary_patch_utils.py
+++ /dev/null
@@ -1,53 +0,0 @@
-"""
-Utilities for handling binary files and patch generation in SWE-bench evaluation.
-"""
-
-
-def remove_binary_diffs(patch_text):
- """
- Remove binary file diffs from a git patch.
-
- Args:
- patch_text (str): The git patch text
-
- Returns:
- str: The cleaned patch text with binary diffs removed
- """
- lines = patch_text.splitlines()
- cleaned_lines = []
- block = []
- is_binary_block = False
-
- for line in lines:
- if line.startswith("diff --git "):
- if block and not is_binary_block:
- cleaned_lines.extend(block)
- block = [line]
- is_binary_block = False
- elif "Binary files" in line:
- is_binary_block = True
- block.append(line)
- else:
- block.append(line)
-
- if block and not is_binary_block:
- cleaned_lines.extend(block)
- return "\n".join(cleaned_lines)
-
-
-def remove_binary_files_from_git():
- """
- Generate a bash command to remove binary files from git staging.
-
- Returns:
- str: A bash command that removes binary files from git staging
- """
- return """
- for file in $(git status --porcelain | grep -E "^(M| M|\\?\\?|A| A)" | cut -c4-); do
- if [ -f "$file" ] && (file "$file" | grep -q "executable" || \\
- git check-attr binary "$file" | grep -q "binary: set"); then
- git rm -f "$file" 2>/dev/null || rm -f "$file"
- echo "Removed: $file"
- fi
- done
- """.strip()
diff --git a/benchmarks/utils/patch_utils.py b/benchmarks/utils/patch_utils.py
new file mode 100644
index 00000000..aa8afc01
--- /dev/null
+++ b/benchmarks/utils/patch_utils.py
@@ -0,0 +1,116 @@
+"""
+Utilities for handling patch generation in SWE-bench evaluation.
+"""
+
+import re
+
+
+def remove_files_from_patch(git_patch, files):
+ """
+    Remove modifications to the given files from a git patch string.
+ Args:
+ git_patch (str): The original git patch string
+        files (List[str]): The files to remove from the patch
+ Returns:
+        str: The git patch with modifications to those files removed
+ """
+ if not git_patch:
+ return git_patch
+
+ # Split patch into individual file diffs
+ # Look for diff --git patterns to identify file boundaries
+ diff_pattern = r"diff --git [^\n]*\n"
+
+ # Find all diff headers and their positions
+ diff_matches = list(re.finditer(diff_pattern, git_patch))
+
+ if not diff_matches:
+ return git_patch
+
+ # Extract individual file diffs
+ file_diffs = []
+ for i, match in enumerate(diff_matches):
+ start = match.start()
+ end = (
+ diff_matches[i + 1].start() if i + 1 < len(diff_matches) else len(git_patch)
+ )
+ file_diff = git_patch[start:end]
+ file_diffs.append(file_diff)
+
+ # Filter out files in list
+ filtered_diffs = []
+ for diff in file_diffs:
+ # Extract filenames from diff header to do exact matching
+ should_skip = False
+ if "diff --git" in diff:
+ # Extract the diff header line
+ first_line = diff.split("\n")[0]
+ # Parse diff --git a/file b/file format
+ match = re.match(r"diff --git a/(.+) b/(.+)", first_line)
+ if match:
+ file_a, file_b = match.groups()
+ # Check if either filename (before or after) matches any file to remove
+ if file_a in files or file_b in files:
+ should_skip = True
+
+ if should_skip:
+ # Skip this diff
+ continue
+ filtered_diffs.append(diff)
+
+ # Rejoin the filtered diffs with proper newlines
+ if not filtered_diffs:
+ return ""
+
+ # Join the diffs while preserving their original structure
+ # Each diff already contains its proper ending from the original split
+ result = "".join(filtered_diffs)
+
+ return result
+
+
+def remove_binary_diffs(patch_text):
+ """
+ Remove binary file diffs from a git patch.
+ Args:
+ patch_text (str): The git patch text
+ Returns:
+ str: The cleaned patch text with binary diffs removed
+ """
+ lines = patch_text.splitlines()
+ cleaned_lines = []
+ block = []
+ is_binary_block = False
+
+ for line in lines:
+ if line.startswith("diff --git "):
+ if block and not is_binary_block:
+ cleaned_lines.extend(block)
+ block = [line]
+ is_binary_block = False
+ elif "Binary files" in line:
+ is_binary_block = True
+ block.append(line)
+ else:
+ block.append(line)
+
+ if block and not is_binary_block:
+ cleaned_lines.extend(block)
+ return "\n".join(cleaned_lines)
+
+
+def remove_binary_files_from_git():
+ """
+ Generate a bash command to remove binary files from git staging.
+ Returns:
+ str: A bash command that removes binary files from git staging
+ """
+ return """
+ for file in $(git status --porcelain | grep -E "^(M| M|\\?\\?|A| A)" | cut -c4-); do
+ if [ -f "$file" ] && (file "$file" | grep -q "executable" || \\
+ git check-attr binary "$file" | grep -q "binary: set"); then
+ git rm -f "$file" 2>/dev/null || rm -f "$file"
+ echo "Removed: $file"
+ fi
+ done
+ """.strip()
diff --git a/pyproject.toml b/pyproject.toml
index 73792ba1..04f8e506 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -18,13 +18,16 @@ dependencies = [
"openhands-agent-server",
"openhands-workspace",
"modal>=1.1.4",
+ "swebench",
]
[project.scripts]
validate-cfg = "benchmarks.scripts.validate_cfg:main"
swebench-infer = "benchmarks.swe_bench.run_infer:main"
+swebench-eval = "benchmarks.swe_bench.eval_infer:main"
gaia-infer = "benchmarks.gaia.run_infer:main"
+
[build-system]
requires = ["setuptools>=61.0", "wheel"]
build-backend = "setuptools.build_meta"
diff --git a/uv.lock b/uv.lock
index 2a3814fd..95619ec6 100644
--- a/uv.lock
+++ b/uv.lock
@@ -204,6 +204,19 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/f4/be/6985abb1011fda8a523cfe21ed9629e397d6e06fb5bae99750402b25c95b/bashlex-0.18-py2.py3-none-any.whl", hash = "sha256:91d73a23a3e51711919c1c899083890cdecffc91d8c088942725ac13e9dcfffa", size = 69539, upload-time = "2023-01-18T15:21:24.167Z" },
]
+[[package]]
+name = "beautifulsoup4"
+version = "4.14.2"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "soupsieve" },
+ { name = "typing-extensions" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/77/e9/df2358efd7659577435e2177bfa69cba6c33216681af51a707193dec162a/beautifulsoup4-4.14.2.tar.gz", hash = "sha256:2a98ab9f944a11acee9cc848508ec28d9228abfd522ef0fad6a02a72e0ded69e", size = 625822, upload-time = "2025-09-29T10:05:42.613Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/94/fe/3aed5d0be4d404d12d36ab97e2f1791424d9ca39c2f754a6285d59a3b01d/beautifulsoup4-4.14.2-py3-none-any.whl", hash = "sha256:5ef6fa3a8cbece8488d66985560f97ed091e22bbc4e9c2338508a9d5de6d4515", size = 106392, upload-time = "2025-09-29T10:05:43.771Z" },
+]
+
[[package]]
name = "binaryornot"
version = "0.4.4"
@@ -689,6 +702,18 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/54/20/54e2bdaad22ca91a59455251998d43094d5c3d3567c52c7c04774b3f43f2/fastapi-0.118.0-py3-none-any.whl", hash = "sha256:705137a61e2ef71019d2445b123aa8845bd97273c395b744d5a7dfe559056855", size = 97694, upload-time = "2025-09-29T03:37:21.338Z" },
]
+[[package]]
+name = "fastcore"
+version = "1.8.14"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "packaging" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/41/fc/4782041a7e96ae3de2b6bc7a287693d619688d938f43e6d9e70a23874d51/fastcore-1.8.14.tar.gz", hash = "sha256:869735ef493dbc7e5e8cbfc35fa3310772ce4c768d5b3a82d6a0d571148401be", size = 83648, upload-time = "2025-10-29T05:38:46.285Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/ed/c6/236247deaa155fad1b38b6deb95b8b76efd20f5107b4577eee42002cbf11/fastcore-1.8.14-py3-none-any.whl", hash = "sha256:a02a749c26243ffd54d6dd11165cf4a556c7cb08f4c7e47ff67b32c7b0183ce9", size = 86791, upload-time = "2025-10-29T05:38:44.343Z" },
+]
+
[[package]]
name = "fastmcp"
version = "2.12.4"
@@ -830,6 +855,43 @@ version = "4.3.5"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/b3/0d/bf0567477f7281d9a3926c582bfef21bff7498fc0ffd3e9de21811896a0b/func_timeout-4.3.5.tar.gz", hash = "sha256:74cd3c428ec94f4edfba81f9b2f14904846d5ffccc27c92433b8b5939b5575dd", size = 44264, upload-time = "2019-08-19T21:32:07.43Z" }
+[[package]]
+name = "ghapi"
+version = "1.0.8"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "fastcore" },
+ { name = "packaging" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/13/45/714e944ca610758c5fd7cece1aea366fb88e318983d6a8e52857598b5b09/ghapi-1.0.8.tar.gz", hash = "sha256:9ab02dcd06b3c622ea2d9b21a2efee316076a744ce7847251a2fe9f542f381df", size = 72049, upload-time = "2025-09-16T23:57:48.847Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/df/cd/63129b50c44da8461f663f09f8e4801acaf0ff5bc8b201d75cee82d5b35c/ghapi-1.0.8-py3-none-any.whl", hash = "sha256:3e4023f475ec966995dd3feeacd3f42f9e296dd23148e6f28d15e80487300e66", size = 68569, upload-time = "2025-09-16T23:57:47.017Z" },
+]
+
+[[package]]
+name = "gitdb"
+version = "4.0.12"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "smmap" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/72/94/63b0fc47eb32792c7ba1fe1b694daec9a63620db1e313033d18140c2320a/gitdb-4.0.12.tar.gz", hash = "sha256:5ef71f855d191a3326fcfbc0d5da835f26b13fbcba60c32c21091c349ffdb571", size = 394684, upload-time = "2025-01-02T07:20:46.413Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/a0/61/5c78b91c3143ed5c14207f463aecfc8f9dbb5092fb2869baf37c273b2705/gitdb-4.0.12-py3-none-any.whl", hash = "sha256:67073e15955400952c6565cc3e707c554a4eea2e428946f7a4c162fab9bd9bcf", size = 62794, upload-time = "2025-01-02T07:20:43.624Z" },
+]
+
+[[package]]
+name = "gitpython"
+version = "3.1.45"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "gitdb" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/9a/c8/dd58967d119baab745caec2f9d853297cec1989ec1d63f677d3880632b88/gitpython-3.1.45.tar.gz", hash = "sha256:85b0ee964ceddf211c41b9f27a49086010a190fd8132a24e21f362a4b36a791c", size = 215076, upload-time = "2025-07-24T03:45:54.871Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/01/61/d4b89fec821f72385526e1b9d9a3a0385dda4a72b206d28049e2c7cd39b8/gitpython-3.1.45-py3-none-any.whl", hash = "sha256:8908cb2e02fb3b93b7eb0f2827125cb699869470432cc885f019b8fd0fccff77", size = 208168, upload-time = "2025-07-24T03:45:52.517Z" },
+]
+
[[package]]
name = "google-api-core"
version = "2.26.0"
@@ -947,6 +1009,8 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/19/0d/6660d55f7373b2ff8152401a83e02084956da23ae58cddbfb0b330978fe9/greenlet-3.2.4-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3b3812d8d0c9579967815af437d96623f45c0f2ae5f04e366de62a12d83a8fb0", size = 607586, upload-time = "2025-08-07T13:18:28.544Z" },
{ url = "https://files.pythonhosted.org/packages/8e/1a/c953fdedd22d81ee4629afbb38d2f9d71e37d23caace44775a3a969147d4/greenlet-3.2.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:abbf57b5a870d30c4675928c37278493044d7c14378350b3aa5d484fa65575f0", size = 1123281, upload-time = "2025-08-07T13:42:39.858Z" },
{ url = "https://files.pythonhosted.org/packages/3f/c7/12381b18e21aef2c6bd3a636da1088b888b97b7a0362fac2e4de92405f97/greenlet-3.2.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:20fb936b4652b6e307b8f347665e2c615540d4b42b3b4c8a321d8286da7e520f", size = 1151142, upload-time = "2025-08-07T13:18:22.981Z" },
+ { url = "https://files.pythonhosted.org/packages/27/45/80935968b53cfd3f33cf99ea5f08227f2646e044568c9b1555b58ffd61c2/greenlet-3.2.4-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:ee7a6ec486883397d70eec05059353b8e83eca9168b9f3f9a361971e77e0bcd0", size = 1564846, upload-time = "2025-11-04T12:42:15.191Z" },
+ { url = "https://files.pythonhosted.org/packages/69/02/b7c30e5e04752cb4db6202a3858b149c0710e5453b71a3b2aec5d78a1aab/greenlet-3.2.4-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:326d234cbf337c9c3def0676412eb7040a35a768efc92504b947b3e9cfc7543d", size = 1633814, upload-time = "2025-11-04T12:42:17.175Z" },
{ url = "https://files.pythonhosted.org/packages/e9/08/b0814846b79399e585f974bbeebf5580fbe59e258ea7be64d9dfb253c84f/greenlet-3.2.4-cp312-cp312-win_amd64.whl", hash = "sha256:a7d4e128405eea3814a12cc2605e0e6aedb4035bf32697f72deca74de4105e02", size = 299899, upload-time = "2025-08-07T13:38:53.448Z" },
{ url = "https://files.pythonhosted.org/packages/49/e8/58c7f85958bda41dafea50497cbd59738c5c43dbbea5ee83d651234398f4/greenlet-3.2.4-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:1a921e542453fe531144e91e1feedf12e07351b1cf6c9e8a3325ea600a715a31", size = 272814, upload-time = "2025-08-07T13:15:50.011Z" },
{ url = "https://files.pythonhosted.org/packages/62/dd/b9f59862e9e257a16e4e610480cfffd29e3fae018a68c2332090b53aac3d/greenlet-3.2.4-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cd3c8e693bff0fff6ba55f140bf390fa92c994083f838fece0f63be121334945", size = 641073, upload-time = "2025-08-07T13:42:57.23Z" },
@@ -956,6 +1020,8 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/ee/43/3cecdc0349359e1a527cbf2e3e28e5f8f06d3343aaf82ca13437a9aa290f/greenlet-3.2.4-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:23768528f2911bcd7e475210822ffb5254ed10d71f4028387e5a99b4c6699671", size = 610497, upload-time = "2025-08-07T13:18:31.636Z" },
{ url = "https://files.pythonhosted.org/packages/b8/19/06b6cf5d604e2c382a6f31cafafd6f33d5dea706f4db7bdab184bad2b21d/greenlet-3.2.4-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:00fadb3fedccc447f517ee0d3fd8fe49eae949e1cd0f6a611818f4f6fb7dc83b", size = 1121662, upload-time = "2025-08-07T13:42:41.117Z" },
{ url = "https://files.pythonhosted.org/packages/a2/15/0d5e4e1a66fab130d98168fe984c509249c833c1a3c16806b90f253ce7b9/greenlet-3.2.4-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:d25c5091190f2dc0eaa3f950252122edbbadbb682aa7b1ef2f8af0f8c0afefae", size = 1149210, upload-time = "2025-08-07T13:18:24.072Z" },
+ { url = "https://files.pythonhosted.org/packages/1c/53/f9c440463b3057485b8594d7a638bed53ba531165ef0ca0e6c364b5cc807/greenlet-3.2.4-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:6e343822feb58ac4d0a1211bd9399de2b3a04963ddeec21530fc426cc121f19b", size = 1564759, upload-time = "2025-11-04T12:42:19.395Z" },
+ { url = "https://files.pythonhosted.org/packages/47/e4/3bb4240abdd0a8d23f4f88adec746a3099f0d86bfedb623f063b2e3b4df0/greenlet-3.2.4-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ca7f6f1f2649b89ce02f6f229d7c19f680a6238af656f61e0115b24857917929", size = 1634288, upload-time = "2025-11-04T12:42:21.174Z" },
{ url = "https://files.pythonhosted.org/packages/0b/55/2321e43595e6801e105fcfdee02b34c0f996eb71e6ddffca6b10b7e1d771/greenlet-3.2.4-cp313-cp313-win_amd64.whl", hash = "sha256:554b03b6e73aaabec3745364d6239e9e012d64c68ccd0b8430c64ccc14939a8b", size = 299685, upload-time = "2025-08-07T13:24:38.824Z" },
{ url = "https://files.pythonhosted.org/packages/22/5c/85273fd7cc388285632b0498dbbab97596e04b154933dfe0f3e68156c68c/greenlet-3.2.4-cp314-cp314-macosx_11_0_universal2.whl", hash = "sha256:49a30d5fda2507ae77be16479bdb62a660fa51b1eb4928b524975b3bde77b3c0", size = 273586, upload-time = "2025-08-07T13:16:08.004Z" },
{ url = "https://files.pythonhosted.org/packages/d1/75/10aeeaa3da9332c2e761e4c50d4c3556c21113ee3f0afa2cf5769946f7a3/greenlet-3.2.4-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:299fd615cd8fc86267b47597123e3f43ad79c9d8a22bebdce535e53550763e2f", size = 686346, upload-time = "2025-08-07T13:42:59.944Z" },
@@ -963,6 +1029,8 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/dc/8b/29aae55436521f1d6f8ff4e12fb676f3400de7fcf27fccd1d4d17fd8fecd/greenlet-3.2.4-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:b4a1870c51720687af7fa3e7cda6d08d801dae660f75a76f3845b642b4da6ee1", size = 694659, upload-time = "2025-08-07T13:53:17.759Z" },
{ url = "https://files.pythonhosted.org/packages/92/2e/ea25914b1ebfde93b6fc4ff46d6864564fba59024e928bdc7de475affc25/greenlet-3.2.4-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:061dc4cf2c34852b052a8620d40f36324554bc192be474b9e9770e8c042fd735", size = 695355, upload-time = "2025-08-07T13:18:34.517Z" },
{ url = "https://files.pythonhosted.org/packages/72/60/fc56c62046ec17f6b0d3060564562c64c862948c9d4bc8aa807cf5bd74f4/greenlet-3.2.4-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:44358b9bf66c8576a9f57a590d5f5d6e72fa4228b763d0e43fee6d3b06d3a337", size = 657512, upload-time = "2025-08-07T13:18:33.969Z" },
+ { url = "https://files.pythonhosted.org/packages/23/6e/74407aed965a4ab6ddd93a7ded3180b730d281c77b765788419484cdfeef/greenlet-3.2.4-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:2917bdf657f5859fbf3386b12d68ede4cf1f04c90c3a6bc1f013dd68a22e2269", size = 1612508, upload-time = "2025-11-04T12:42:23.427Z" },
+ { url = "https://files.pythonhosted.org/packages/0d/da/343cd760ab2f92bac1845ca07ee3faea9fe52bee65f7bcb19f16ad7de08b/greenlet-3.2.4-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:015d48959d4add5d6c9f6c5210ee3803a830dce46356e3bc326d6776bde54681", size = 1680760, upload-time = "2025-11-04T12:42:25.341Z" },
{ url = "https://files.pythonhosted.org/packages/e3/a5/6ddab2b4c112be95601c13428db1d8b6608a8b6039816f2ba09c346c08fc/greenlet-3.2.4-cp314-cp314-win_amd64.whl", hash = "sha256:e37ab26028f12dbb0ff65f29a8d3d44a765c61e729647bf2ddfbbed621726f01", size = 303425, upload-time = "2025-08-07T13:32:27.59Z" },
]
@@ -1765,6 +1833,7 @@ dependencies = [
{ name = "openhands-workspace" },
{ name = "pandas" },
{ name = "pillow" },
+ { name = "swebench" },
{ name = "toml" },
{ name = "tqdm" },
{ name = "unidiff" },
@@ -1791,6 +1860,7 @@ requires-dist = [
{ name = "openhands-workspace", editable = "vendor/agent-sdk/openhands-workspace" },
{ name = "pandas" },
{ name = "pillow" },
+ { name = "swebench" },
{ name = "toml" },
{ name = "tqdm" },
{ name = "unidiff", specifier = ">=0.7.5,<0.8.0" },
@@ -5458,6 +5528,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/b7/ce/149a00dd41f10bc29e5921b496af8b574d8413afcd5e30dfa0ed46c2cc5e/six-1.17.0-py2.py3-none-any.whl", hash = "sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274", size = 11050, upload-time = "2024-12-04T17:35:26.475Z" },
]
+[[package]]
+name = "smmap"
+version = "5.0.2"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/44/cd/a040c4b3119bbe532e5b0732286f805445375489fceaec1f48306068ee3b/smmap-5.0.2.tar.gz", hash = "sha256:26ea65a03958fa0c8a1c7e8c7a58fdc77221b8910f6be2131affade476898ad5", size = 22329, upload-time = "2025-01-02T07:14:40.909Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/04/be/d09147ad1ec7934636ad912901c5fd7667e1c858e19d355237db0d0cd5e4/smmap-5.0.2-py3-none-any.whl", hash = "sha256:b30115f0def7d7531d22a0fb6502488d879e75b260a9db4d0819cfb25403af5e", size = 24303, upload-time = "2025-01-02T07:14:38.724Z" },
+]
+
[[package]]
name = "sniffio"
version = "1.3.1"
@@ -5467,6 +5546,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/e9/44/75a9c9421471a6c4805dbf2356f7c181a29c1879239abab1ea2cc8f38b40/sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2", size = 10235, upload-time = "2024-02-25T23:20:01.196Z" },
]
+[[package]]
+name = "soupsieve"
+version = "2.8"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/6d/e6/21ccce3262dd4889aa3332e5a119a3491a95e8f60939870a3a035aabac0d/soupsieve-2.8.tar.gz", hash = "sha256:e2dd4a40a628cb5f28f6d4b0db8800b8f581b65bb380b97de22ba5ca8d72572f", size = 103472, upload-time = "2025-08-27T15:39:51.78Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/14/a0/bb38d3b76b8cae341dad93a2dd83ab7462e6dbcdd84d43f54ee60a8dc167/soupsieve-2.8-py3-none-any.whl", hash = "sha256:0cc76456a30e20f5d7f2e14a98a4ae2ee4e5abdc7c5ea0aafe795f344bc7984c", size = 36679, upload-time = "2025-08-27T15:39:50.179Z" },
+]
+
[[package]]
name = "sqlalchemy"
version = "2.0.43"
@@ -5521,6 +5609,31 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/ce/fd/901cfa59aaa5b30a99e16876f11abe38b59a1a2c51ffb3d7142bb6089069/starlette-0.47.3-py3-none-any.whl", hash = "sha256:89c0778ca62a76b826101e7c709e70680a1699ca7da6b44d38eb0a7e61fe4b51", size = 72991, upload-time = "2025-08-24T13:36:40.887Z" },
]
+[[package]]
+name = "swebench"
+version = "4.1.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "beautifulsoup4" },
+ { name = "chardet" },
+ { name = "datasets" },
+ { name = "docker" },
+ { name = "ghapi" },
+ { name = "gitpython" },
+ { name = "modal" },
+ { name = "pre-commit" },
+ { name = "python-dotenv" },
+ { name = "requests" },
+ { name = "rich" },
+ { name = "tenacity" },
+ { name = "tqdm" },
+ { name = "unidiff" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/24/e1/c997299ad7bf088876d30398203aa1eed7dec897670dc1aa35b1d748ffcc/swebench-4.1.0.tar.gz", hash = "sha256:5aaa6a92c2db1aa64892d28a47483ca46a45a15cf1d2df673d7744f71811dc9a", size = 134341, upload-time = "2025-09-11T02:58:00.447Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/36/67/981d8b642ac3eac7c8a7b7832ff8b2fb74f96b28b5fcd9a8979879e5c46d/swebench-4.1.0-py3-none-any.whl", hash = "sha256:1243776f720047cc9e20a427f7a52b75c13a07abda6154fb60fe77f82ec8af57", size = 157231, upload-time = "2025-09-11T02:57:58.953Z" },
+]
+
[[package]]
name = "synchronicity"
version = "0.10.2"