diff --git a/.gitignore b/.gitignore index 43135338..d89f1a36 100644 --- a/.gitignore +++ b/.gitignore @@ -206,6 +206,9 @@ workspace/ # IDE and editor directories -.openhands/ +# Ignore the contents, not the directory: git cannot re-include files under an excluded parent directory +.openhands/* +!.openhands/setup.sh +!.openhands/microagents/ .vscode/ # LLM configuration directory (contains API keys and sensitive configs) diff --git a/.openhands/microagents/repo.md b/.openhands/microagents/repo.md new file mode 100644 index 00000000..8aa0c1dc --- /dev/null +++ b/.openhands/microagents/repo.md @@ -0,0 +1,103 @@ + +You are a collaborative software engineering partner focused on maintaining high-quality benchmark evaluation infrastructure. Your approach emphasizes simplicity, reliability, and reproducible results. + +# Core Engineering Principles + +1. **Reproducibility** +"Benchmarks must produce consistent, comparable results." + • Pin dependencies and submodule versions + • Maintain isolation between test environments + • Document evaluation methodology clearly + +2. **Simplicity** +"Clear evaluation logic is easier to validate and debug." + • Prefer straightforward data transformations + • Avoid complex abstractions in evaluation code + • Keep benchmark scripts focused and readable + +3. **Backward Compatibility** +"Preserve comparability with historical results." + • Changes should not invalidate previous evaluations + • Document any changes that affect metrics + • Maintain compatibility with upstream benchmark datasets + +4. **Pragmatic Testing** +"Test what matters for accurate evaluation." + • Validate data format conversions + • Verify evaluation harness integration + • Focus on correctness of benchmark logic + + + +- Run `make build` to initialize the agent-sdk submodule and install dependencies +- We use pre-commit hooks (`.pre-commit-config.yaml`) that include: + - Type checking with `pyright` + - Linting and formatting with `ruff` +- NEVER USE `mypy`! +- Do NOT commit ALL files, only commit relevant changes! 
+- Add "Co-authored-by: openhands <openhands@all-hands.dev>" to every commit message +- Run tests with `uv run pytest` + +# Project Structure +- `benchmarks/swe_bench/` - SWE-Bench evaluation (code generation on GitHub issues) +- `benchmarks/gaia/` - GAIA evaluation (general AI assistant tasks) +- `benchmarks/utils/` - Shared utilities (patch handling, etc.) +- `vendor/agent-sdk/` - Git submodule for OpenHands Agent SDK +- `.llm_config/` - LLM configuration files (JSON format) + +# Submodule Management +The Agent SDK is vendored as a git submodule. To update: +```bash +cd vendor/agent-sdk +git fetch && git checkout <commit-or-tag> +cd ../.. +git add vendor/agent-sdk +git commit -m "Update agent-sdk to <commit-or-tag>" +make build # Rebuild environment +``` + + + +- Avoid `sys.path.insert` hacks for imports +- Use existing libraries instead of reimplementing (e.g., use `swebench` package for evaluation) +- Avoid `# type: ignore` unless absolutely necessary +- Avoid inline imports unless required for circular dependencies +- Prefer explicit type hints over runtime checks with `getattr`/`hasattr` +- Use real newlines in commit messages, not literal `\n` + + + +- After editing a file, run `uv run pre-commit run --files [filepath]` +- Write focused tests that cover edge cases, not exhaustive tests +- Put tests in corresponding test folders: `benchmarks/*/tests/` +- Avoid test classes unless necessary +- Extract common test setup into fixtures in `conftest.py` +- Test only logic in this codebase, not third-party functionality + + + +# Adding New Benchmarks +1. Create new directory under `benchmarks/` +2. Implement `run_infer.py` for inference and output generation +3. Add evaluation script if needed (or integrate with existing harness) +4. Register CLI entrypoint in `pyproject.toml` under `[project.scripts]` +5. 
Update README.md with usage instructions + +# LLM Configuration +LLM configs use JSON matching the [LLM class schema](https://github.com/All-Hands-AI/agent-sdk/blob/main/openhands/sdk/llm/llm.py#L93): +```json +{ + "model": "litellm_proxy/anthropic/claude-sonnet-4-20250514", + "base_url": "https://llm-proxy.eval.all-hands.dev", + "api_key": "YOUR_API_KEY" +} +``` +Validate with: `uv run validate-cfg .llm_config/your-config.json` + +# Data Format Conversions +When converting between OpenHands format and benchmark-specific formats: +- Preserve all required fields for evaluation +- Handle missing/optional fields gracefully +- Log conversion warnings for debugging +- Validate output format before evaluation + diff --git a/.openhands/setup.sh b/.openhands/setup.sh new file mode 100755 index 00000000..9082145a --- /dev/null +++ b/.openhands/setup.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +if ! command -v uv &> /dev/null; then + echo "uv is not installed. Installing..." + curl -LsSf https://astral.sh/uv/install.sh | sh +else + echo "uv is already installed." + uv self update # always update to the latest version +fi + +make build diff --git a/README.md b/README.md index 3e5f1785..c8d9f3b5 100644 --- a/README.md +++ b/README.md @@ -100,7 +100,7 @@ uv run benchmarks/swe_bench/build_images.py \ ``` -### 3. Run SWE-Bench Evaluation +### 3. Run SWE-Bench Inference ```bash # Run evaluation with your configured LLM uv run swebench-infer .llm_config/example.json \ @@ -134,6 +134,24 @@ python -m benchmarks.swe_bench.run_infer \ This will only evaluate the instances listed in the file. +### 5. 
Evaluate SWE-Bench Results +After running inference, evaluate the results using the official SWE-Bench evaluation: + +```bash +# Convert output format and run SWE-Bench evaluation +uv run swebench-eval output.jsonl + +# Or specify custom dataset and output file +uv run swebench-eval output.jsonl --dataset princeton-nlp/SWE-bench_Lite --output-file results.swebench.jsonl + +# Only convert format without running evaluation +uv run swebench-eval output.jsonl --skip-evaluation +``` + +The script will: +1. Convert OpenHands output format to SWE-Bench prediction format +2. Run the official SWE-Bench evaluation harness + ## Links - **Original OpenHands**: https://github.com/All-Hands-AI/OpenHands/ diff --git a/benchmarks/swe_bench/eval_infer.py b/benchmarks/swe_bench/eval_infer.py new file mode 100644 index 00000000..c2dca753 --- /dev/null +++ b/benchmarks/swe_bench/eval_infer.py @@ -0,0 +1,261 @@ +#!/usr/bin/env python3 +""" +SWE-Bench Evaluation Script + +This script converts OpenHands output.jsonl format to SWE-Bench prediction format +and runs the SWE-Bench evaluation. + +Usage: + uv run swebench-eval +""" + +import argparse +import json +import subprocess +import sys +from pathlib import Path + +from benchmarks.utils.patch_utils import remove_files_from_patch +from openhands.sdk import get_logger + + +logger = get_logger(__name__) + + +def convert_to_swebench_format( + input_file: str, output_file: str, model_name: str = "OpenHands" +) -> None: + """ + Convert OpenHands output.jsonl to SWE-Bench prediction format. + + OpenHands format: + { + "instance_id": "django__django-11333", + "test_result": { + "git_patch": "diff --git a/file.py b/file.py\n..." + }, + "instruction": "...", + "error": null, + "history": [...] 
+ } + + SWE-Bench format: + { + "instance_id": "django__django-11333", + "model_patch": "diff --git a/file.py b/file.py\n...", + "model_name_or_path": "OpenHands" + } + """ + logger.info(f"Converting {input_file} to SWE-Bench format: {output_file}") + + converted_count = 0 + error_count = 0 + + with open(input_file, "r") as infile, open(output_file, "w") as outfile: + for line_num, line in enumerate(infile, 1): + try: + line = line.strip() + if not line: + continue + + data = json.loads(line) + + # Extract required fields + instance_id = data.get("instance_id") + if not instance_id: + logger.warning(f"Line {line_num}: Missing instance_id") + error_count += 1 + continue + + # Extract git_patch; test_result may be missing or explicitly null + test_result = data.get("test_result") or {} + git_patch = test_result.get("git_patch", "") + + if not git_patch: + logger.warning( + f"Line {line_num}: Missing or empty git_patch for {instance_id}" + ) + # Still create entry with empty patch + git_patch = "" + + # postprocess git_patch + setup_files = ["pyproject.toml", "tox.ini", "setup.py"] + git_patch = remove_files_from_patch(git_patch, setup_files) + + # Create SWE-Bench format entry + swebench_entry = { + "instance_id": instance_id, + "model_patch": git_patch, + "model_name_or_path": model_name, + } + + # Write to output file + outfile.write(json.dumps(swebench_entry) + "\n") + converted_count += 1 + + except json.JSONDecodeError as e: + logger.error(f"Line {line_num}: Invalid JSON - {e}") + error_count += 1 + except Exception as e: + logger.error(f"Line {line_num}: Unexpected error - {e}") + error_count += 1 + + logger.info( + f"Conversion complete: {converted_count} entries converted, " + f"{error_count} errors" + ) + + if converted_count == 0: + raise ValueError("No valid entries were converted") + + +def run_swebench_evaluation( + predictions_file: str, + dataset: str = "princeton-nlp/SWE-bench_Verified", + workers: str = "12", +) -> None: + """ + Run SWE-Bench evaluation on the predictions file. 
+ + Args: + predictions_file: Path to the SWE-Bench format predictions file + dataset: SWE-Bench dataset to evaluate against + workers: Number of workers to use for evaluation + """ + logger.info(f"Running SWE-Bench evaluation on {predictions_file}") + + try: + # Get the directory of the predictions file + predictions_path = Path(predictions_file) + predictions_dir = predictions_path.parent + predictions_filename = predictions_path.name + + # Run the harness with the interpreter executing this script so the swebench + # version declared in pyproject.toml (and locked in uv.lock) is the one used + cmd = [ + sys.executable, + "-m", + "swebench.harness.run_evaluation", + "--dataset_name", + dataset, + "--predictions_path", + predictions_filename, + "--max_workers", + str(workers), + "--run_id", + f"eval_{predictions_path.stem}", + ] + + logger.info(f"Running command: {' '.join(cmd)}") + logger.info(f"Working directory: {predictions_dir}") + logger.info("SWE-Bench evaluation output:") + print("-" * 80) + + # Stream output directly to console, running from predictions file directory + result = subprocess.run(cmd, text=True, cwd=predictions_dir) + + print("-" * 80) + if result.returncode == 0: + logger.info("SWE-Bench evaluation completed successfully") + else: + logger.error( + f"SWE-Bench evaluation failed with return code {result.returncode}" + ) + raise subprocess.CalledProcessError(result.returncode, cmd) + + except FileNotFoundError: + logger.error( + "SWE-Bench evaluation command not found. " + "Make sure SWE-Bench is properly installed." 
+ ) + raise + except Exception as e: + logger.error(f"Error running SWE-Bench evaluation: {e}") + raise + + +def main() -> None: + """Main entry point for the script.""" + parser = argparse.ArgumentParser( + description="Convert OpenHands output to SWE-Bench format and run evaluation", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + uv run swebench-eval output.jsonl + uv run swebench-eval /path/to/output.jsonl --dataset princeton-nlp/SWE-bench_Lite + uv run swebench-eval output.jsonl --model-name "MyModel-v1.0" + """, + ) + + parser.add_argument("input_file", help="Path to the OpenHands output.jsonl file") + + parser.add_argument( + "--dataset", + default="princeton-nlp/SWE-bench_Verified", + help="SWE-Bench dataset to evaluate against " + "(default: princeton-nlp/SWE-bench_Verified)", + ) + + parser.add_argument( + "--output-file", + help="Output file for SWE-Bench format " + "(default: input_file with .swebench.jsonl extension)", + ) + + parser.add_argument( + "--skip-evaluation", + action="store_true", + help="Only convert format, skip running evaluation", + ) + + parser.add_argument( + "--model-name", + default="OpenHands", + help="Model name to use in the model_name_or_path field (default: OpenHands)", + ) + + parser.add_argument( + "--workers", + default="12", + help="Number of workers to use when evaluating", + ) + + args = parser.parse_args() + + # Validate input file + input_file = Path(args.input_file) + if not input_file.exists(): + logger.error(f"Input file does not exist: {input_file}") + sys.exit(1) + + if not input_file.suffix == ".jsonl": + logger.warning(f"Input file does not have .jsonl extension: {input_file}") + + # Determine output file + if args.output_file: + output_file = Path(args.output_file) + else: + output_file = input_file.with_suffix(".swebench.jsonl") + + logger.info(f"Input file: {input_file}") + logger.info(f"Output file: {output_file}") + logger.info(f"Dataset: {args.dataset}") + 
logger.info(f"Model name: {args.model_name}") + + try: + # Convert format + convert_to_swebench_format(str(input_file), str(output_file), args.model_name) + + if not args.skip_evaluation: + # Run evaluation + run_swebench_evaluation(str(output_file), args.dataset, args.workers) + + logger.info("Script completed successfully!") + + except Exception as e: + logger.error(f"Script failed: {e}") + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/benchmarks/utils/binary_patch_utils.py b/benchmarks/utils/binary_patch_utils.py deleted file mode 100644 index f17737e7..00000000 --- a/benchmarks/utils/binary_patch_utils.py +++ /dev/null @@ -1,53 +0,0 @@ -""" -Utilities for handling binary files and patch generation in SWE-bench evaluation. -""" - - -def remove_binary_diffs(patch_text): - """ - Remove binary file diffs from a git patch. - - Args: - patch_text (str): The git patch text - - Returns: - str: The cleaned patch text with binary diffs removed - """ - lines = patch_text.splitlines() - cleaned_lines = [] - block = [] - is_binary_block = False - - for line in lines: - if line.startswith("diff --git "): - if block and not is_binary_block: - cleaned_lines.extend(block) - block = [line] - is_binary_block = False - elif "Binary files" in line: - is_binary_block = True - block.append(line) - else: - block.append(line) - - if block and not is_binary_block: - cleaned_lines.extend(block) - return "\n".join(cleaned_lines) - - -def remove_binary_files_from_git(): - """ - Generate a bash command to remove binary files from git staging. 
- - Returns: - str: A bash command that removes binary files from git staging - """ - return """ - for file in $(git status --porcelain | grep -E "^(M| M|\\?\\?|A| A)" | cut -c4-); do - if [ -f "$file" ] && (file "$file" | grep -q "executable" || \\ - git check-attr binary "$file" | grep -q "binary: set"); then - git rm -f "$file" 2>/dev/null || rm -f "$file" - echo "Removed: $file" - fi - done - """.strip() diff --git a/benchmarks/utils/patch_utils.py b/benchmarks/utils/patch_utils.py new file mode 100644 index 00000000..aa8afc01 --- /dev/null +++ b/benchmarks/utils/patch_utils.py @@ -0,0 +1,116 @@ +""" +Utilities for handling patch generation in SWE-bench evaluation. +""" + +import re + + +def remove_files_from_patch(git_patch, files): + """ + Remove the per-file diffs of the given files from a git patch string. + Args: + git_patch (str): The original git patch string + files (List[str]): The file paths to remove from the patch (exact match) + Returns: + str: The git patch with the diffs for those files removed + """ + if not git_patch: + return git_patch + + # Split patch into individual file diffs + # Look for diff --git patterns to identify file boundaries + diff_pattern = r"diff --git [^\n]*\n" + + # Find all diff headers and their positions + diff_matches = list(re.finditer(diff_pattern, git_patch)) + + if not diff_matches: + return git_patch + + # Extract individual file diffs + file_diffs = [] + for i, match in enumerate(diff_matches): + start = match.start() + end = ( + diff_matches[i + 1].start() if i + 1 < len(diff_matches) else len(git_patch) + ) + file_diff = git_patch[start:end] + file_diffs.append(file_diff) + + # Filter out files in list + filtered_diffs = [] + for diff in file_diffs: + # Extract filenames from diff header to do exact matching + should_skip = False + if "diff --git" in diff: + # Extract the diff header line + first_line = diff.split("\n")[0] + # Parse diff --git a/file b/file format + match = re.match(r"diff --git a/(.+) b/(.+)", first_line) + if match: + file_a, file_b = 
match.groups() + # Check if either filename (before or after) matches any file to remove + if file_a in files or file_b in files: + should_skip = True + + if should_skip: + # Skip this diff + continue + filtered_diffs.append(diff) + + # Rejoin the filtered diffs with proper newlines + if not filtered_diffs: + return "" + + # Join the diffs while preserving their original structure + # Each diff already contains its proper ending from the original split + result = "".join(filtered_diffs) + + return result + + +def remove_binary_diffs(patch_text): + """ + Remove binary file diffs from a git patch. + Args: + patch_text (str): The git patch text + Returns: + str: The cleaned patch text with binary diffs removed + """ + lines = patch_text.splitlines() + cleaned_lines = [] + block = [] + is_binary_block = False + + for line in lines: + if line.startswith("diff --git "): + if block and not is_binary_block: + cleaned_lines.extend(block) + block = [line] + is_binary_block = False + elif "Binary files" in line: + is_binary_block = True + block.append(line) + else: + block.append(line) + + if block and not is_binary_block: + cleaned_lines.extend(block) + return "\n".join(cleaned_lines) + + +def remove_binary_files_from_git(): + """ + Generate a bash command to remove binary files from git staging. 
+ Returns: + str: A bash command that removes binary files from git staging + """ + return """ + for file in $(git status --porcelain | grep -E "^(M| M|\\?\\?|A| A)" | cut -c4-); do + if [ -f "$file" ] && (file "$file" | grep -q "executable" || \\ + git check-attr binary "$file" | grep -q "binary: set"); then + git rm -f "$file" 2>/dev/null || rm -f "$file" + echo "Removed: $file" + fi + done + """.strip() diff --git a/pyproject.toml b/pyproject.toml index 73792ba1..04f8e506 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,13 +18,16 @@ dependencies = [ "openhands-agent-server", "openhands-workspace", "modal>=1.1.4", + "swebench", ] [project.scripts] validate-cfg = "benchmarks.scripts.validate_cfg:main" swebench-infer = "benchmarks.swe_bench.run_infer:main" +swebench-eval = "benchmarks.swe_bench.eval_infer:main" gaia-infer = "benchmarks.gaia.run_infer:main" + [build-system] requires = ["setuptools>=61.0", "wheel"] build-backend = "setuptools.build_meta" diff --git a/uv.lock b/uv.lock index 2a3814fd..95619ec6 100644 --- a/uv.lock +++ b/uv.lock @@ -204,6 +204,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/f4/be/6985abb1011fda8a523cfe21ed9629e397d6e06fb5bae99750402b25c95b/bashlex-0.18-py2.py3-none-any.whl", hash = "sha256:91d73a23a3e51711919c1c899083890cdecffc91d8c088942725ac13e9dcfffa", size = 69539, upload-time = "2023-01-18T15:21:24.167Z" }, ] +[[package]] +name = "beautifulsoup4" +version = "4.14.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "soupsieve" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/77/e9/df2358efd7659577435e2177bfa69cba6c33216681af51a707193dec162a/beautifulsoup4-4.14.2.tar.gz", hash = "sha256:2a98ab9f944a11acee9cc848508ec28d9228abfd522ef0fad6a02a72e0ded69e", size = 625822, upload-time = "2025-09-29T10:05:42.613Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/94/fe/3aed5d0be4d404d12d36ab97e2f1791424d9ca39c2f754a6285d59a3b01d/beautifulsoup4-4.14.2-py3-none-any.whl", hash = "sha256:5ef6fa3a8cbece8488d66985560f97ed091e22bbc4e9c2338508a9d5de6d4515", size = 106392, upload-time = "2025-09-29T10:05:43.771Z" }, +] + [[package]] name = "binaryornot" version = "0.4.4" @@ -689,6 +702,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/54/20/54e2bdaad22ca91a59455251998d43094d5c3d3567c52c7c04774b3f43f2/fastapi-0.118.0-py3-none-any.whl", hash = "sha256:705137a61e2ef71019d2445b123aa8845bd97273c395b744d5a7dfe559056855", size = 97694, upload-time = "2025-09-29T03:37:21.338Z" }, ] +[[package]] +name = "fastcore" +version = "1.8.14" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "packaging" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/41/fc/4782041a7e96ae3de2b6bc7a287693d619688d938f43e6d9e70a23874d51/fastcore-1.8.14.tar.gz", hash = "sha256:869735ef493dbc7e5e8cbfc35fa3310772ce4c768d5b3a82d6a0d571148401be", size = 83648, upload-time = "2025-10-29T05:38:46.285Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ed/c6/236247deaa155fad1b38b6deb95b8b76efd20f5107b4577eee42002cbf11/fastcore-1.8.14-py3-none-any.whl", hash = "sha256:a02a749c26243ffd54d6dd11165cf4a556c7cb08f4c7e47ff67b32c7b0183ce9", size = 86791, upload-time = "2025-10-29T05:38:44.343Z" }, +] + [[package]] name = "fastmcp" version = "2.12.4" @@ -830,6 +855,43 @@ version = "4.3.5" source = { registry = "https://pypi.org/simple" } sdist = { url = "https://files.pythonhosted.org/packages/b3/0d/bf0567477f7281d9a3926c582bfef21bff7498fc0ffd3e9de21811896a0b/func_timeout-4.3.5.tar.gz", hash = "sha256:74cd3c428ec94f4edfba81f9b2f14904846d5ffccc27c92433b8b5939b5575dd", size = 44264, upload-time = "2019-08-19T21:32:07.43Z" } +[[package]] +name = "ghapi" +version = "1.0.8" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "fastcore" }, + { name = 
"packaging" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/13/45/714e944ca610758c5fd7cece1aea366fb88e318983d6a8e52857598b5b09/ghapi-1.0.8.tar.gz", hash = "sha256:9ab02dcd06b3c622ea2d9b21a2efee316076a744ce7847251a2fe9f542f381df", size = 72049, upload-time = "2025-09-16T23:57:48.847Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/df/cd/63129b50c44da8461f663f09f8e4801acaf0ff5bc8b201d75cee82d5b35c/ghapi-1.0.8-py3-none-any.whl", hash = "sha256:3e4023f475ec966995dd3feeacd3f42f9e296dd23148e6f28d15e80487300e66", size = 68569, upload-time = "2025-09-16T23:57:47.017Z" }, +] + +[[package]] +name = "gitdb" +version = "4.0.12" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "smmap" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/72/94/63b0fc47eb32792c7ba1fe1b694daec9a63620db1e313033d18140c2320a/gitdb-4.0.12.tar.gz", hash = "sha256:5ef71f855d191a3326fcfbc0d5da835f26b13fbcba60c32c21091c349ffdb571", size = 394684, upload-time = "2025-01-02T07:20:46.413Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a0/61/5c78b91c3143ed5c14207f463aecfc8f9dbb5092fb2869baf37c273b2705/gitdb-4.0.12-py3-none-any.whl", hash = "sha256:67073e15955400952c6565cc3e707c554a4eea2e428946f7a4c162fab9bd9bcf", size = 62794, upload-time = "2025-01-02T07:20:43.624Z" }, +] + +[[package]] +name = "gitpython" +version = "3.1.45" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "gitdb" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/9a/c8/dd58967d119baab745caec2f9d853297cec1989ec1d63f677d3880632b88/gitpython-3.1.45.tar.gz", hash = "sha256:85b0ee964ceddf211c41b9f27a49086010a190fd8132a24e21f362a4b36a791c", size = 215076, upload-time = "2025-07-24T03:45:54.871Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/01/61/d4b89fec821f72385526e1b9d9a3a0385dda4a72b206d28049e2c7cd39b8/gitpython-3.1.45-py3-none-any.whl", hash = 
"sha256:8908cb2e02fb3b93b7eb0f2827125cb699869470432cc885f019b8fd0fccff77", size = 208168, upload-time = "2025-07-24T03:45:52.517Z" }, +] + [[package]] name = "google-api-core" version = "2.26.0" @@ -947,6 +1009,8 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/19/0d/6660d55f7373b2ff8152401a83e02084956da23ae58cddbfb0b330978fe9/greenlet-3.2.4-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3b3812d8d0c9579967815af437d96623f45c0f2ae5f04e366de62a12d83a8fb0", size = 607586, upload-time = "2025-08-07T13:18:28.544Z" }, { url = "https://files.pythonhosted.org/packages/8e/1a/c953fdedd22d81ee4629afbb38d2f9d71e37d23caace44775a3a969147d4/greenlet-3.2.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:abbf57b5a870d30c4675928c37278493044d7c14378350b3aa5d484fa65575f0", size = 1123281, upload-time = "2025-08-07T13:42:39.858Z" }, { url = "https://files.pythonhosted.org/packages/3f/c7/12381b18e21aef2c6bd3a636da1088b888b97b7a0362fac2e4de92405f97/greenlet-3.2.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:20fb936b4652b6e307b8f347665e2c615540d4b42b3b4c8a321d8286da7e520f", size = 1151142, upload-time = "2025-08-07T13:18:22.981Z" }, + { url = "https://files.pythonhosted.org/packages/27/45/80935968b53cfd3f33cf99ea5f08227f2646e044568c9b1555b58ffd61c2/greenlet-3.2.4-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:ee7a6ec486883397d70eec05059353b8e83eca9168b9f3f9a361971e77e0bcd0", size = 1564846, upload-time = "2025-11-04T12:42:15.191Z" }, + { url = "https://files.pythonhosted.org/packages/69/02/b7c30e5e04752cb4db6202a3858b149c0710e5453b71a3b2aec5d78a1aab/greenlet-3.2.4-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:326d234cbf337c9c3def0676412eb7040a35a768efc92504b947b3e9cfc7543d", size = 1633814, upload-time = "2025-11-04T12:42:17.175Z" }, { url = "https://files.pythonhosted.org/packages/e9/08/b0814846b79399e585f974bbeebf5580fbe59e258ea7be64d9dfb253c84f/greenlet-3.2.4-cp312-cp312-win_amd64.whl", hash = 
"sha256:a7d4e128405eea3814a12cc2605e0e6aedb4035bf32697f72deca74de4105e02", size = 299899, upload-time = "2025-08-07T13:38:53.448Z" }, { url = "https://files.pythonhosted.org/packages/49/e8/58c7f85958bda41dafea50497cbd59738c5c43dbbea5ee83d651234398f4/greenlet-3.2.4-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:1a921e542453fe531144e91e1feedf12e07351b1cf6c9e8a3325ea600a715a31", size = 272814, upload-time = "2025-08-07T13:15:50.011Z" }, { url = "https://files.pythonhosted.org/packages/62/dd/b9f59862e9e257a16e4e610480cfffd29e3fae018a68c2332090b53aac3d/greenlet-3.2.4-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cd3c8e693bff0fff6ba55f140bf390fa92c994083f838fece0f63be121334945", size = 641073, upload-time = "2025-08-07T13:42:57.23Z" }, @@ -956,6 +1020,8 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ee/43/3cecdc0349359e1a527cbf2e3e28e5f8f06d3343aaf82ca13437a9aa290f/greenlet-3.2.4-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:23768528f2911bcd7e475210822ffb5254ed10d71f4028387e5a99b4c6699671", size = 610497, upload-time = "2025-08-07T13:18:31.636Z" }, { url = "https://files.pythonhosted.org/packages/b8/19/06b6cf5d604e2c382a6f31cafafd6f33d5dea706f4db7bdab184bad2b21d/greenlet-3.2.4-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:00fadb3fedccc447f517ee0d3fd8fe49eae949e1cd0f6a611818f4f6fb7dc83b", size = 1121662, upload-time = "2025-08-07T13:42:41.117Z" }, { url = "https://files.pythonhosted.org/packages/a2/15/0d5e4e1a66fab130d98168fe984c509249c833c1a3c16806b90f253ce7b9/greenlet-3.2.4-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:d25c5091190f2dc0eaa3f950252122edbbadbb682aa7b1ef2f8af0f8c0afefae", size = 1149210, upload-time = "2025-08-07T13:18:24.072Z" }, + { url = "https://files.pythonhosted.org/packages/1c/53/f9c440463b3057485b8594d7a638bed53ba531165ef0ca0e6c364b5cc807/greenlet-3.2.4-cp313-cp313-musllinux_1_2_aarch64.whl", hash = 
"sha256:6e343822feb58ac4d0a1211bd9399de2b3a04963ddeec21530fc426cc121f19b", size = 1564759, upload-time = "2025-11-04T12:42:19.395Z" }, + { url = "https://files.pythonhosted.org/packages/47/e4/3bb4240abdd0a8d23f4f88adec746a3099f0d86bfedb623f063b2e3b4df0/greenlet-3.2.4-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ca7f6f1f2649b89ce02f6f229d7c19f680a6238af656f61e0115b24857917929", size = 1634288, upload-time = "2025-11-04T12:42:21.174Z" }, { url = "https://files.pythonhosted.org/packages/0b/55/2321e43595e6801e105fcfdee02b34c0f996eb71e6ddffca6b10b7e1d771/greenlet-3.2.4-cp313-cp313-win_amd64.whl", hash = "sha256:554b03b6e73aaabec3745364d6239e9e012d64c68ccd0b8430c64ccc14939a8b", size = 299685, upload-time = "2025-08-07T13:24:38.824Z" }, { url = "https://files.pythonhosted.org/packages/22/5c/85273fd7cc388285632b0498dbbab97596e04b154933dfe0f3e68156c68c/greenlet-3.2.4-cp314-cp314-macosx_11_0_universal2.whl", hash = "sha256:49a30d5fda2507ae77be16479bdb62a660fa51b1eb4928b524975b3bde77b3c0", size = 273586, upload-time = "2025-08-07T13:16:08.004Z" }, { url = "https://files.pythonhosted.org/packages/d1/75/10aeeaa3da9332c2e761e4c50d4c3556c21113ee3f0afa2cf5769946f7a3/greenlet-3.2.4-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:299fd615cd8fc86267b47597123e3f43ad79c9d8a22bebdce535e53550763e2f", size = 686346, upload-time = "2025-08-07T13:42:59.944Z" }, @@ -963,6 +1029,8 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/dc/8b/29aae55436521f1d6f8ff4e12fb676f3400de7fcf27fccd1d4d17fd8fecd/greenlet-3.2.4-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:b4a1870c51720687af7fa3e7cda6d08d801dae660f75a76f3845b642b4da6ee1", size = 694659, upload-time = "2025-08-07T13:53:17.759Z" }, { url = "https://files.pythonhosted.org/packages/92/2e/ea25914b1ebfde93b6fc4ff46d6864564fba59024e928bdc7de475affc25/greenlet-3.2.4-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = 
"sha256:061dc4cf2c34852b052a8620d40f36324554bc192be474b9e9770e8c042fd735", size = 695355, upload-time = "2025-08-07T13:18:34.517Z" }, { url = "https://files.pythonhosted.org/packages/72/60/fc56c62046ec17f6b0d3060564562c64c862948c9d4bc8aa807cf5bd74f4/greenlet-3.2.4-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:44358b9bf66c8576a9f57a590d5f5d6e72fa4228b763d0e43fee6d3b06d3a337", size = 657512, upload-time = "2025-08-07T13:18:33.969Z" }, + { url = "https://files.pythonhosted.org/packages/23/6e/74407aed965a4ab6ddd93a7ded3180b730d281c77b765788419484cdfeef/greenlet-3.2.4-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:2917bdf657f5859fbf3386b12d68ede4cf1f04c90c3a6bc1f013dd68a22e2269", size = 1612508, upload-time = "2025-11-04T12:42:23.427Z" }, + { url = "https://files.pythonhosted.org/packages/0d/da/343cd760ab2f92bac1845ca07ee3faea9fe52bee65f7bcb19f16ad7de08b/greenlet-3.2.4-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:015d48959d4add5d6c9f6c5210ee3803a830dce46356e3bc326d6776bde54681", size = 1680760, upload-time = "2025-11-04T12:42:25.341Z" }, { url = "https://files.pythonhosted.org/packages/e3/a5/6ddab2b4c112be95601c13428db1d8b6608a8b6039816f2ba09c346c08fc/greenlet-3.2.4-cp314-cp314-win_amd64.whl", hash = "sha256:e37ab26028f12dbb0ff65f29a8d3d44a765c61e729647bf2ddfbbed621726f01", size = 303425, upload-time = "2025-08-07T13:32:27.59Z" }, ] @@ -1765,6 +1833,7 @@ dependencies = [ { name = "openhands-workspace" }, { name = "pandas" }, { name = "pillow" }, + { name = "swebench" }, { name = "toml" }, { name = "tqdm" }, { name = "unidiff" }, @@ -1791,6 +1860,7 @@ requires-dist = [ { name = "openhands-workspace", editable = "vendor/agent-sdk/openhands-workspace" }, { name = "pandas" }, { name = "pillow" }, + { name = "swebench" }, { name = "toml" }, { name = "tqdm" }, { name = "unidiff", specifier = ">=0.7.5,<0.8.0" }, @@ -5458,6 +5528,15 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/b7/ce/149a00dd41f10bc29e5921b496af8b574d8413afcd5e30dfa0ed46c2cc5e/six-1.17.0-py2.py3-none-any.whl", hash = "sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274", size = 11050, upload-time = "2024-12-04T17:35:26.475Z" }, ] +[[package]] +name = "smmap" +version = "5.0.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/44/cd/a040c4b3119bbe532e5b0732286f805445375489fceaec1f48306068ee3b/smmap-5.0.2.tar.gz", hash = "sha256:26ea65a03958fa0c8a1c7e8c7a58fdc77221b8910f6be2131affade476898ad5", size = 22329, upload-time = "2025-01-02T07:14:40.909Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/04/be/d09147ad1ec7934636ad912901c5fd7667e1c858e19d355237db0d0cd5e4/smmap-5.0.2-py3-none-any.whl", hash = "sha256:b30115f0def7d7531d22a0fb6502488d879e75b260a9db4d0819cfb25403af5e", size = 24303, upload-time = "2025-01-02T07:14:38.724Z" }, +] + [[package]] name = "sniffio" version = "1.3.1" @@ -5467,6 +5546,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e9/44/75a9c9421471a6c4805dbf2356f7c181a29c1879239abab1ea2cc8f38b40/sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2", size = 10235, upload-time = "2024-02-25T23:20:01.196Z" }, ] +[[package]] +name = "soupsieve" +version = "2.8" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/6d/e6/21ccce3262dd4889aa3332e5a119a3491a95e8f60939870a3a035aabac0d/soupsieve-2.8.tar.gz", hash = "sha256:e2dd4a40a628cb5f28f6d4b0db8800b8f581b65bb380b97de22ba5ca8d72572f", size = 103472, upload-time = "2025-08-27T15:39:51.78Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/14/a0/bb38d3b76b8cae341dad93a2dd83ab7462e6dbcdd84d43f54ee60a8dc167/soupsieve-2.8-py3-none-any.whl", hash = "sha256:0cc76456a30e20f5d7f2e14a98a4ae2ee4e5abdc7c5ea0aafe795f344bc7984c", size = 36679, 
upload-time = "2025-08-27T15:39:50.179Z" }, +] + [[package]] name = "sqlalchemy" version = "2.0.43" @@ -5521,6 +5609,31 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ce/fd/901cfa59aaa5b30a99e16876f11abe38b59a1a2c51ffb3d7142bb6089069/starlette-0.47.3-py3-none-any.whl", hash = "sha256:89c0778ca62a76b826101e7c709e70680a1699ca7da6b44d38eb0a7e61fe4b51", size = 72991, upload-time = "2025-08-24T13:36:40.887Z" }, ] +[[package]] +name = "swebench" +version = "4.1.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "beautifulsoup4" }, + { name = "chardet" }, + { name = "datasets" }, + { name = "docker" }, + { name = "ghapi" }, + { name = "gitpython" }, + { name = "modal" }, + { name = "pre-commit" }, + { name = "python-dotenv" }, + { name = "requests" }, + { name = "rich" }, + { name = "tenacity" }, + { name = "tqdm" }, + { name = "unidiff" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/24/e1/c997299ad7bf088876d30398203aa1eed7dec897670dc1aa35b1d748ffcc/swebench-4.1.0.tar.gz", hash = "sha256:5aaa6a92c2db1aa64892d28a47483ca46a45a15cf1d2df673d7744f71811dc9a", size = 134341, upload-time = "2025-09-11T02:58:00.447Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/36/67/981d8b642ac3eac7c8a7b7832ff8b2fb74f96b28b5fcd9a8979879e5c46d/swebench-4.1.0-py3-none-any.whl", hash = "sha256:1243776f720047cc9e20a427f7a52b75c13a07abda6154fb60fe77f82ec8af57", size = 157231, upload-time = "2025-09-11T02:57:58.953Z" }, +] + [[package]] name = "synchronicity" version = "0.10.2"