diff --git a/.gitignore b/.gitignore
index 43135338..d89f1a36 100644
--- a/.gitignore
+++ b/.gitignore
@@ -206,6 +206,8 @@ workspace/
# IDE and editor directories
.openhands/
+!.openhands/setup.sh
+!.openhands/microagents/
.vscode/
# LLM configuration directory (contains API keys and sensitive configs)
diff --git a/.openhands/microagents/repo.md b/.openhands/microagents/repo.md
new file mode 100644
index 00000000..8aa0c1dc
--- /dev/null
+++ b/.openhands/microagents/repo.md
@@ -0,0 +1,103 @@
+
+You are a collaborative software engineering partner focused on maintaining high-quality benchmark evaluation infrastructure. Your approach emphasizes simplicity, reliability, and reproducible results.
+
+# Core Engineering Principles
+
+1. **Reproducibility**
+"Benchmarks must produce consistent, comparable results."
+ • Pin dependencies and submodule versions
+ • Maintain isolation between test environments
+ • Document evaluation methodology clearly
+
+2. **Simplicity**
+"Clear evaluation logic is easier to validate and debug."
+ • Prefer straightforward data transformations
+ • Avoid complex abstractions in evaluation code
+ • Keep benchmark scripts focused and readable
+
+3. **Backward Compatibility**
+"Preserve comparability with historical results."
+ • Changes should not invalidate previous evaluations
+ • Document any changes that affect metrics
+ • Maintain compatibility with upstream benchmark datasets
+
+4. **Pragmatic Testing**
+"Test what matters for accurate evaluation."
+ • Validate data format conversions
+ • Verify evaluation harness integration
+ • Focus on correctness of benchmark logic
+
+
+
+- Run `make build` to initialize the agent-sdk submodule and install dependencies
+- We use pre-commit hooks (`.pre-commit-config.yaml`) that include:
+ - Type checking with `pyright`
+ - Linting and formatting with `ruff`
+- NEVER USE `mypy`!
+- Do NOT commit ALL files, only commit relevant changes!
+- Add "Co-authored-by: openhands <openhands@all-hands.dev>" to every commit message
+- Run tests with `uv run pytest`
+
+# Project Structure
+- `benchmarks/swe_bench/` - SWE-Bench evaluation (code generation on GitHub issues)
+- `benchmarks/gaia/` - GAIA evaluation (general AI assistant tasks)
+- `benchmarks/utils/` - Shared utilities (patch handling, etc.)
+- `vendor/agent-sdk/` - Git submodule for OpenHands Agent SDK
+- `.llm_config/` - LLM configuration files (JSON format)
+
+# Submodule Management
+The Agent SDK is vendored as a git submodule. To update:
+```bash
+cd vendor/agent-sdk
+git fetch && git checkout <desired-commit>
+cd ../..
+git add vendor/agent-sdk
+git commit -m "Update agent-sdk to <desired-commit>"
+make build # Rebuild environment
+```
+
+
+
+- Avoid `sys.path.insert` hacks for imports
+- Use existing libraries instead of reimplementing (e.g., use `swebench` package for evaluation)
+- Avoid `# type: ignore` unless absolutely necessary
+- Avoid inline imports unless required for circular dependencies
+- Prefer explicit type hints over runtime checks with `getattr`/`hasattr`
+- Use real newlines in commit messages, not literal `\n`
+
+
+
+- After editing a file, run `uv run pre-commit run --files [filepath]`
+- Write focused tests that cover edge cases, not exhaustive tests
+- Put tests in corresponding test folders: `benchmarks/*/tests/`
+- Avoid test classes unless necessary
+- Extract common test setup into fixtures in `conftest.py`
+- Test only logic in this codebase, not third-party functionality
+
+
+
+# Adding New Benchmarks
+1. Create new directory under `benchmarks/`
+2. Implement `run_infer.py` for inference and output generation
+3. Add evaluation script if needed (or integrate with existing harness)
+4. Register CLI entrypoint in `pyproject.toml` under `[project.scripts]`
+5. Update README.md with usage instructions
+
+# LLM Configuration
+LLM configs use JSON matching the [LLM class schema](https://github.com/All-Hands-AI/agent-sdk/blob/main/openhands/sdk/llm/llm.py#L93):
+```json
+{
+ "model": "litellm_proxy/anthropic/claude-sonnet-4-20250514",
+ "base_url": "https://llm-proxy.eval.all-hands.dev",
+ "api_key": "YOUR_API_KEY"
+}
+```
+Validate with: `uv run validate-cfg .llm_config/your-config.json`
+
+# Data Format Conversions
+When converting between OpenHands format and benchmark-specific formats:
+- Preserve all required fields for evaluation
+- Handle missing/optional fields gracefully
+- Log conversion warnings for debugging
+- Validate output format before evaluation
+
diff --git a/.openhands/setup.sh b/.openhands/setup.sh
new file mode 100755
index 00000000..9082145a
--- /dev/null
+++ b/.openhands/setup.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+
+if ! command -v uv &> /dev/null; then
+ echo "uv is not installed. Installing..."
+ curl -LsSf https://astral.sh/uv/install.sh | sh
+else
+ echo "uv is already installed."
+ uv self update # always update to the latest version
+fi
+
+make build
diff --git a/README.md b/README.md
index 3e5f1785..c8d9f3b5 100644
--- a/README.md
+++ b/README.md
@@ -100,7 +100,7 @@ uv run benchmarks/swe_bench/build_images.py \
```
-### 3. Run SWE-Bench Evaluation
+### 3. Run SWE-Bench Inference
```bash
# Run evaluation with your configured LLM
uv run swebench-infer .llm_config/example.json \
@@ -134,6 +134,24 @@ python -m benchmarks.swe_bench.run_infer \
This will only evaluate the instances listed in the file.
+### 5. Evaluate SWE-Bench Results
+After running inference, evaluate the results using the official SWE-Bench evaluation:
+
+```bash
+# Convert output format and run SWE-Bench evaluation
+uv run swebench-eval output.jsonl
+
+# Or specify custom dataset and output file
+uv run swebench-eval output.jsonl --dataset princeton-nlp/SWE-bench_Lite --output-file results.swebench.jsonl
+
+# Only convert format without running evaluation
+uv run swebench-eval output.jsonl --skip-evaluation
+```
+
+The script will:
+1. Convert OpenHands output format to SWE-Bench prediction format
+2. Run the official SWE-Bench evaluation harness
+
## Links
- **Original OpenHands**: https://github.com/All-Hands-AI/OpenHands/
diff --git a/benchmarks/swe_bench/eval_infer.py b/benchmarks/swe_bench/eval_infer.py
new file mode 100644
index 00000000..c2dca753
--- /dev/null
+++ b/benchmarks/swe_bench/eval_infer.py
@@ -0,0 +1,261 @@
+#!/usr/bin/env python3
+"""
+SWE-Bench Evaluation Script
+
+This script converts OpenHands output.jsonl format to SWE-Bench prediction format
+and runs the SWE-Bench evaluation.
+
+Usage:
+    uv run swebench-eval <path/to/output.jsonl> [--dataset DATASET] [--skip-evaluation]
+"""
+
+import argparse
+import json
+import subprocess
+import sys
+from pathlib import Path
+
+from benchmarks.utils.patch_utils import remove_files_from_patch
+from openhands.sdk import get_logger
+
+
+logger = get_logger(__name__)
+
+
+def convert_to_swebench_format(
+ input_file: str, output_file: str, model_name: str = "OpenHands"
+) -> None:
+ """
+ Convert OpenHands output.jsonl to SWE-Bench prediction format.
+
+ OpenHands format:
+ {
+ "instance_id": "django__django-11333",
+ "test_result": {
+ "git_patch": "diff --git a/file.py b/file.py\n..."
+ },
+ "instruction": "...",
+ "error": null,
+ "history": [...]
+ }
+
+ SWE-Bench format:
+ {
+ "instance_id": "django__django-11333",
+ "model_patch": "diff --git a/file.py b/file.py\n...",
+ "model_name_or_path": "OpenHands"
+ }
+ """
+ logger.info(f"Converting {input_file} to SWE-Bench format: {output_file}")
+
+ converted_count = 0
+ error_count = 0
+
+ with open(input_file, "r") as infile, open(output_file, "w") as outfile:
+ for line_num, line in enumerate(infile, 1):
+ try:
+ line = line.strip()
+ if not line:
+ continue
+
+ data = json.loads(line)
+
+ # Extract required fields
+ instance_id = data.get("instance_id")
+ if not instance_id:
+ logger.warning(f"Line {line_num}: Missing instance_id")
+ error_count += 1
+ continue
+
+ # Extract git_patch from test_result
+ test_result = data.get("test_result", {})
+ git_patch = test_result.get("git_patch", "")
+
+ if not git_patch:
+ logger.warning(
+ f"Line {line_num}: Missing or empty git_patch for {instance_id}"
+ )
+ # Still create entry with empty patch
+ git_patch = ""
+
+ # postprocess git_patch
+ setup_files = ["pyproject.toml", "tox.ini", "setup.py"]
+ git_patch = remove_files_from_patch(git_patch, setup_files)
+
+ # Create SWE-Bench format entry
+ swebench_entry = {
+ "instance_id": instance_id,
+ "model_patch": git_patch,
+ "model_name_or_path": model_name,
+ }
+
+ # Write to output file
+ outfile.write(json.dumps(swebench_entry) + "\n")
+ converted_count += 1
+
+ except json.JSONDecodeError as e:
+ logger.error(f"Line {line_num}: Invalid JSON - {e}")
+ error_count += 1
+ except Exception as e:
+ logger.error(f"Line {line_num}: Unexpected error - {e}")
+ error_count += 1
+
+ logger.info(
+ f"Conversion complete: {converted_count} entries converted, "
+ f"{error_count} errors"
+ )
+
+ if converted_count == 0:
+ raise ValueError("No valid entries were converted")
+
+
+def run_swebench_evaluation(
+ predictions_file: str,
+ dataset: str = "princeton-nlp/SWE-bench_Verified",
+ workers: str = "12",
+) -> None:
+ """
+ Run SWE-Bench evaluation on the predictions file.
+
+ Args:
+ predictions_file: Path to the SWE-Bench format predictions file
+ dataset: SWE-Bench dataset to evaluate against
+ workers: Number of workers to use for evaluation
+ """
+ logger.info(f"Running SWE-Bench evaluation on {predictions_file}")
+
+ try:
+ # Get the directory of the predictions file
+ predictions_path = Path(predictions_file)
+ predictions_dir = predictions_path.parent
+ predictions_filename = predictions_path.name
+
+ # Run SWE-Bench evaluation using global python (not UV environment)
+ # since swebench is installed globally
+ cmd = [
+ "/usr/bin/python3",
+ "-m",
+ "swebench.harness.run_evaluation",
+ "--dataset_name",
+ dataset,
+ "--predictions_path",
+ predictions_filename,
+ "--max_workers",
+ str(workers),
+ "--run_id",
+ f"eval_{predictions_path.stem}",
+ ]
+
+ logger.info(f"Running command: {' '.join(cmd)}")
+ logger.info(f"Working directory: {predictions_dir}")
+ logger.info("SWE-Bench evaluation output:")
+ print("-" * 80)
+
+ # Stream output directly to console, running from predictions file directory
+ result = subprocess.run(cmd, text=True, cwd=predictions_dir)
+
+ print("-" * 80)
+ if result.returncode == 0:
+ logger.info("SWE-Bench evaluation completed successfully")
+ else:
+ logger.error(
+ f"SWE-Bench evaluation failed with return code {result.returncode}"
+ )
+ raise subprocess.CalledProcessError(result.returncode, cmd)
+
+ except FileNotFoundError:
+ logger.error(
+ "SWE-Bench evaluation command not found. "
+ "Make sure SWE-Bench is properly installed."
+ )
+ raise
+ except Exception as e:
+ logger.error(f"Error running SWE-Bench evaluation: {e}")
+ raise
+
+
+def main() -> None:
+ """Main entry point for the script."""
+ parser = argparse.ArgumentParser(
+ description="Convert OpenHands output to SWE-Bench format and run evaluation",
+ formatter_class=argparse.RawDescriptionHelpFormatter,
+ epilog="""
+Examples:
+ uv run swebench-eval output.jsonl
+ uv run swebench-eval /path/to/output.jsonl --dataset princeton-nlp/SWE-bench_Lite
+ uv run swebench-eval output.jsonl --model-name "MyModel-v1.0"
+ """,
+ )
+
+ parser.add_argument("input_file", help="Path to the OpenHands output.jsonl file")
+
+ parser.add_argument(
+ "--dataset",
+ default="princeton-nlp/SWE-bench_Verified",
+ help="SWE-Bench dataset to evaluate against "
+ "(default: princeton-nlp/SWE-bench_Verified)",
+ )
+
+ parser.add_argument(
+ "--output-file",
+ help="Output file for SWE-Bench format "
+ "(default: input_file with .swebench.jsonl extension)",
+ )
+
+ parser.add_argument(
+ "--skip-evaluation",
+ action="store_true",
+ help="Only convert format, skip running evaluation",
+ )
+
+ parser.add_argument(
+ "--model-name",
+ default="OpenHands",
+ help="Model name to use in the model_name_or_path field (default: OpenHands)",
+ )
+
+ parser.add_argument(
+ "--workers",
+ default="12",
+ help="Number of workers to use when evaluating",
+ )
+
+ args = parser.parse_args()
+
+ # Validate input file
+ input_file = Path(args.input_file)
+ if not input_file.exists():
+ logger.error(f"Input file does not exist: {input_file}")
+ sys.exit(1)
+
+ if not input_file.suffix == ".jsonl":
+ logger.warning(f"Input file does not have .jsonl extension: {input_file}")
+
+ # Determine output file
+ if args.output_file:
+ output_file = Path(args.output_file)
+ else:
+ output_file = input_file.with_suffix(".swebench.jsonl")
+
+ logger.info(f"Input file: {input_file}")
+ logger.info(f"Output file: {output_file}")
+ logger.info(f"Dataset: {args.dataset}")
+ logger.info(f"Model name: {args.model_name}")
+
+ try:
+ # Convert format
+ convert_to_swebench_format(str(input_file), str(output_file), args.model_name)
+
+ if not args.skip_evaluation:
+ # Run evaluation
+ run_swebench_evaluation(str(output_file), args.dataset, args.workers)
+
+ logger.info("Script completed successfully!")
+
+ except Exception as e:
+ logger.error(f"Script failed: {e}")
+ sys.exit(1)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/benchmarks/utils/binary_patch_utils.py b/benchmarks/utils/binary_patch_utils.py
deleted file mode 100644
index f17737e7..00000000
--- a/benchmarks/utils/binary_patch_utils.py
+++ /dev/null
@@ -1,53 +0,0 @@
-"""
-Utilities for handling binary files and patch generation in SWE-bench evaluation.
-"""
-
-
-def remove_binary_diffs(patch_text):
- """
- Remove binary file diffs from a git patch.
-
- Args:
- patch_text (str): The git patch text
-
- Returns:
- str: The cleaned patch text with binary diffs removed
- """
- lines = patch_text.splitlines()
- cleaned_lines = []
- block = []
- is_binary_block = False
-
- for line in lines:
- if line.startswith("diff --git "):
- if block and not is_binary_block:
- cleaned_lines.extend(block)
- block = [line]
- is_binary_block = False
- elif "Binary files" in line:
- is_binary_block = True
- block.append(line)
- else:
- block.append(line)
-
- if block and not is_binary_block:
- cleaned_lines.extend(block)
- return "\n".join(cleaned_lines)
-
-
-def remove_binary_files_from_git():
- """
- Generate a bash command to remove binary files from git staging.
-
- Returns:
- str: A bash command that removes binary files from git staging
- """
- return """
- for file in $(git status --porcelain | grep -E "^(M| M|\\?\\?|A| A)" | cut -c4-); do
- if [ -f "$file" ] && (file "$file" | grep -q "executable" || \\
- git check-attr binary "$file" | grep -q "binary: set"); then
- git rm -f "$file" 2>/dev/null || rm -f "$file"
- echo "Removed: $file"
- fi
- done
- """.strip()
diff --git a/benchmarks/utils/patch_utils.py b/benchmarks/utils/patch_utils.py
new file mode 100644
index 00000000..aa8afc01
--- /dev/null
+++ b/benchmarks/utils/patch_utils.py
@@ -0,0 +1,116 @@
+"""
+Utilities for handling patch generation in SWE-bench evaluation.
+"""
+
+import re
+
+
+def remove_files_from_patch(git_patch, files):
+ """
+    Remove modifications to the given files from a git patch string.
+ Args:
+ git_patch (str): The original git patch string
+        files (List[str]): The files to remove from the patch
+ Returns:
+        str: The git patch with modifications to those files removed
+ """
+ if not git_patch:
+ return git_patch
+
+ # Split patch into individual file diffs
+ # Look for diff --git patterns to identify file boundaries
+ diff_pattern = r"diff --git [^\n]*\n"
+
+ # Find all diff headers and their positions
+ diff_matches = list(re.finditer(diff_pattern, git_patch))
+
+ if not diff_matches:
+ return git_patch
+
+ # Extract individual file diffs
+ file_diffs = []
+ for i, match in enumerate(diff_matches):
+ start = match.start()
+ end = (
+ diff_matches[i + 1].start() if i + 1 < len(diff_matches) else len(git_patch)
+ )
+ file_diff = git_patch[start:end]
+ file_diffs.append(file_diff)
+
+ # Filter out files in list
+ filtered_diffs = []
+ for diff in file_diffs:
+ # Extract filenames from diff header to do exact matching
+ should_skip = False
+ if "diff --git" in diff:
+ # Extract the diff header line
+ first_line = diff.split("\n")[0]
+ # Parse diff --git a/file b/file format
+ match = re.match(r"diff --git a/(.+) b/(.+)", first_line)
+ if match:
+ file_a, file_b = match.groups()
+ # Check if either filename (before or after) matches any file to remove
+ if file_a in files or file_b in files:
+ should_skip = True
+
+ if should_skip:
+ # Skip this diff
+ continue
+ filtered_diffs.append(diff)
+
+ # Rejoin the filtered diffs with proper newlines
+ if not filtered_diffs:
+ return ""
+
+ # Join the diffs while preserving their original structure
+ # Each diff already contains its proper ending from the original split
+ result = "".join(filtered_diffs)
+
+ return result
+
+
+def remove_binary_diffs(patch_text):
+ """
+ Remove binary file diffs from a git patch.
+ Args:
+ patch_text (str): The git patch text
+ Returns:
+ str: The cleaned patch text with binary diffs removed
+ """
+ lines = patch_text.splitlines()
+ cleaned_lines = []
+ block = []
+ is_binary_block = False
+
+ for line in lines:
+ if line.startswith("diff --git "):
+ if block and not is_binary_block:
+ cleaned_lines.extend(block)
+ block = [line]
+ is_binary_block = False
+ elif "Binary files" in line:
+ is_binary_block = True
+ block.append(line)
+ else:
+ block.append(line)
+
+ if block and not is_binary_block:
+ cleaned_lines.extend(block)
+ return "\n".join(cleaned_lines)
+
+
+def remove_binary_files_from_git():
+ """
+ Generate a bash command to remove binary files from git staging.
+ Returns:
+ str: A bash command that removes binary files from git staging
+ """
+ return """
+ for file in $(git status --porcelain | grep -E "^(M| M|\\?\\?|A| A)" | cut -c4-); do
+ if [ -f "$file" ] && (file "$file" | grep -q "executable" || \\
+ git check-attr binary "$file" | grep -q "binary: set"); then
+ git rm -f "$file" 2>/dev/null || rm -f "$file"
+ echo "Removed: $file"
+ fi
+ done
+ """.strip()
diff --git a/pyproject.toml b/pyproject.toml
index 73792ba1..04f8e506 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -18,13 +18,16 @@ dependencies = [
"openhands-agent-server",
"openhands-workspace",
"modal>=1.1.4",
+ "swebench",
]
[project.scripts]
validate-cfg = "benchmarks.scripts.validate_cfg:main"
swebench-infer = "benchmarks.swe_bench.run_infer:main"
+swebench-eval = "benchmarks.swe_bench.eval_infer:main"
gaia-infer = "benchmarks.gaia.run_infer:main"
+
[build-system]
requires = ["setuptools>=61.0", "wheel"]
build-backend = "setuptools.build_meta"
diff --git a/uv.lock b/uv.lock
index 2a3814fd..95619ec6 100644
--- a/uv.lock
+++ b/uv.lock
@@ -204,6 +204,19 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/f4/be/6985abb1011fda8a523cfe21ed9629e397d6e06fb5bae99750402b25c95b/bashlex-0.18-py2.py3-none-any.whl", hash = "sha256:91d73a23a3e51711919c1c899083890cdecffc91d8c088942725ac13e9dcfffa", size = 69539, upload-time = "2023-01-18T15:21:24.167Z" },
]
+[[package]]
+name = "beautifulsoup4"
+version = "4.14.2"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "soupsieve" },
+ { name = "typing-extensions" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/77/e9/df2358efd7659577435e2177bfa69cba6c33216681af51a707193dec162a/beautifulsoup4-4.14.2.tar.gz", hash = "sha256:2a98ab9f944a11acee9cc848508ec28d9228abfd522ef0fad6a02a72e0ded69e", size = 625822, upload-time = "2025-09-29T10:05:42.613Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/94/fe/3aed5d0be4d404d12d36ab97e2f1791424d9ca39c2f754a6285d59a3b01d/beautifulsoup4-4.14.2-py3-none-any.whl", hash = "sha256:5ef6fa3a8cbece8488d66985560f97ed091e22bbc4e9c2338508a9d5de6d4515", size = 106392, upload-time = "2025-09-29T10:05:43.771Z" },
+]
+
[[package]]
name = "binaryornot"
version = "0.4.4"
@@ -689,6 +702,18 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/54/20/54e2bdaad22ca91a59455251998d43094d5c3d3567c52c7c04774b3f43f2/fastapi-0.118.0-py3-none-any.whl", hash = "sha256:705137a61e2ef71019d2445b123aa8845bd97273c395b744d5a7dfe559056855", size = 97694, upload-time = "2025-09-29T03:37:21.338Z" },
]
+[[package]]
+name = "fastcore"
+version = "1.8.14"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "packaging" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/41/fc/4782041a7e96ae3de2b6bc7a287693d619688d938f43e6d9e70a23874d51/fastcore-1.8.14.tar.gz", hash = "sha256:869735ef493dbc7e5e8cbfc35fa3310772ce4c768d5b3a82d6a0d571148401be", size = 83648, upload-time = "2025-10-29T05:38:46.285Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/ed/c6/236247deaa155fad1b38b6deb95b8b76efd20f5107b4577eee42002cbf11/fastcore-1.8.14-py3-none-any.whl", hash = "sha256:a02a749c26243ffd54d6dd11165cf4a556c7cb08f4c7e47ff67b32c7b0183ce9", size = 86791, upload-time = "2025-10-29T05:38:44.343Z" },
+]
+
[[package]]
name = "fastmcp"
version = "2.12.4"
@@ -830,6 +855,43 @@ version = "4.3.5"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/b3/0d/bf0567477f7281d9a3926c582bfef21bff7498fc0ffd3e9de21811896a0b/func_timeout-4.3.5.tar.gz", hash = "sha256:74cd3c428ec94f4edfba81f9b2f14904846d5ffccc27c92433b8b5939b5575dd", size = 44264, upload-time = "2019-08-19T21:32:07.43Z" }
+[[package]]
+name = "ghapi"
+version = "1.0.8"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "fastcore" },
+ { name = "packaging" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/13/45/714e944ca610758c5fd7cece1aea366fb88e318983d6a8e52857598b5b09/ghapi-1.0.8.tar.gz", hash = "sha256:9ab02dcd06b3c622ea2d9b21a2efee316076a744ce7847251a2fe9f542f381df", size = 72049, upload-time = "2025-09-16T23:57:48.847Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/df/cd/63129b50c44da8461f663f09f8e4801acaf0ff5bc8b201d75cee82d5b35c/ghapi-1.0.8-py3-none-any.whl", hash = "sha256:3e4023f475ec966995dd3feeacd3f42f9e296dd23148e6f28d15e80487300e66", size = 68569, upload-time = "2025-09-16T23:57:47.017Z" },
+]
+
+[[package]]
+name = "gitdb"
+version = "4.0.12"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "smmap" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/72/94/63b0fc47eb32792c7ba1fe1b694daec9a63620db1e313033d18140c2320a/gitdb-4.0.12.tar.gz", hash = "sha256:5ef71f855d191a3326fcfbc0d5da835f26b13fbcba60c32c21091c349ffdb571", size = 394684, upload-time = "2025-01-02T07:20:46.413Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/a0/61/5c78b91c3143ed5c14207f463aecfc8f9dbb5092fb2869baf37c273b2705/gitdb-4.0.12-py3-none-any.whl", hash = "sha256:67073e15955400952c6565cc3e707c554a4eea2e428946f7a4c162fab9bd9bcf", size = 62794, upload-time = "2025-01-02T07:20:43.624Z" },
+]
+
+[[package]]
+name = "gitpython"
+version = "3.1.45"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "gitdb" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/9a/c8/dd58967d119baab745caec2f9d853297cec1989ec1d63f677d3880632b88/gitpython-3.1.45.tar.gz", hash = "sha256:85b0ee964ceddf211c41b9f27a49086010a190fd8132a24e21f362a4b36a791c", size = 215076, upload-time = "2025-07-24T03:45:54.871Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/01/61/d4b89fec821f72385526e1b9d9a3a0385dda4a72b206d28049e2c7cd39b8/gitpython-3.1.45-py3-none-any.whl", hash = "sha256:8908cb2e02fb3b93b7eb0f2827125cb699869470432cc885f019b8fd0fccff77", size = 208168, upload-time = "2025-07-24T03:45:52.517Z" },
+]
+
[[package]]
name = "google-api-core"
version = "2.26.0"
@@ -947,6 +1009,8 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/19/0d/6660d55f7373b2ff8152401a83e02084956da23ae58cddbfb0b330978fe9/greenlet-3.2.4-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3b3812d8d0c9579967815af437d96623f45c0f2ae5f04e366de62a12d83a8fb0", size = 607586, upload-time = "2025-08-07T13:18:28.544Z" },
{ url = "https://files.pythonhosted.org/packages/8e/1a/c953fdedd22d81ee4629afbb38d2f9d71e37d23caace44775a3a969147d4/greenlet-3.2.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:abbf57b5a870d30c4675928c37278493044d7c14378350b3aa5d484fa65575f0", size = 1123281, upload-time = "2025-08-07T13:42:39.858Z" },
{ url = "https://files.pythonhosted.org/packages/3f/c7/12381b18e21aef2c6bd3a636da1088b888b97b7a0362fac2e4de92405f97/greenlet-3.2.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:20fb936b4652b6e307b8f347665e2c615540d4b42b3b4c8a321d8286da7e520f", size = 1151142, upload-time = "2025-08-07T13:18:22.981Z" },
+ { url = "https://files.pythonhosted.org/packages/27/45/80935968b53cfd3f33cf99ea5f08227f2646e044568c9b1555b58ffd61c2/greenlet-3.2.4-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:ee7a6ec486883397d70eec05059353b8e83eca9168b9f3f9a361971e77e0bcd0", size = 1564846, upload-time = "2025-11-04T12:42:15.191Z" },
+ { url = "https://files.pythonhosted.org/packages/69/02/b7c30e5e04752cb4db6202a3858b149c0710e5453b71a3b2aec5d78a1aab/greenlet-3.2.4-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:326d234cbf337c9c3def0676412eb7040a35a768efc92504b947b3e9cfc7543d", size = 1633814, upload-time = "2025-11-04T12:42:17.175Z" },
{ url = "https://files.pythonhosted.org/packages/e9/08/b0814846b79399e585f974bbeebf5580fbe59e258ea7be64d9dfb253c84f/greenlet-3.2.4-cp312-cp312-win_amd64.whl", hash = "sha256:a7d4e128405eea3814a12cc2605e0e6aedb4035bf32697f72deca74de4105e02", size = 299899, upload-time = "2025-08-07T13:38:53.448Z" },
{ url = "https://files.pythonhosted.org/packages/49/e8/58c7f85958bda41dafea50497cbd59738c5c43dbbea5ee83d651234398f4/greenlet-3.2.4-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:1a921e542453fe531144e91e1feedf12e07351b1cf6c9e8a3325ea600a715a31", size = 272814, upload-time = "2025-08-07T13:15:50.011Z" },
{ url = "https://files.pythonhosted.org/packages/62/dd/b9f59862e9e257a16e4e610480cfffd29e3fae018a68c2332090b53aac3d/greenlet-3.2.4-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cd3c8e693bff0fff6ba55f140bf390fa92c994083f838fece0f63be121334945", size = 641073, upload-time = "2025-08-07T13:42:57.23Z" },
@@ -956,6 +1020,8 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/ee/43/3cecdc0349359e1a527cbf2e3e28e5f8f06d3343aaf82ca13437a9aa290f/greenlet-3.2.4-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:23768528f2911bcd7e475210822ffb5254ed10d71f4028387e5a99b4c6699671", size = 610497, upload-time = "2025-08-07T13:18:31.636Z" },
{ url = "https://files.pythonhosted.org/packages/b8/19/06b6cf5d604e2c382a6f31cafafd6f33d5dea706f4db7bdab184bad2b21d/greenlet-3.2.4-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:00fadb3fedccc447f517ee0d3fd8fe49eae949e1cd0f6a611818f4f6fb7dc83b", size = 1121662, upload-time = "2025-08-07T13:42:41.117Z" },
{ url = "https://files.pythonhosted.org/packages/a2/15/0d5e4e1a66fab130d98168fe984c509249c833c1a3c16806b90f253ce7b9/greenlet-3.2.4-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:d25c5091190f2dc0eaa3f950252122edbbadbb682aa7b1ef2f8af0f8c0afefae", size = 1149210, upload-time = "2025-08-07T13:18:24.072Z" },
+ { url = "https://files.pythonhosted.org/packages/1c/53/f9c440463b3057485b8594d7a638bed53ba531165ef0ca0e6c364b5cc807/greenlet-3.2.4-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:6e343822feb58ac4d0a1211bd9399de2b3a04963ddeec21530fc426cc121f19b", size = 1564759, upload-time = "2025-11-04T12:42:19.395Z" },
+ { url = "https://files.pythonhosted.org/packages/47/e4/3bb4240abdd0a8d23f4f88adec746a3099f0d86bfedb623f063b2e3b4df0/greenlet-3.2.4-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ca7f6f1f2649b89ce02f6f229d7c19f680a6238af656f61e0115b24857917929", size = 1634288, upload-time = "2025-11-04T12:42:21.174Z" },
{ url = "https://files.pythonhosted.org/packages/0b/55/2321e43595e6801e105fcfdee02b34c0f996eb71e6ddffca6b10b7e1d771/greenlet-3.2.4-cp313-cp313-win_amd64.whl", hash = "sha256:554b03b6e73aaabec3745364d6239e9e012d64c68ccd0b8430c64ccc14939a8b", size = 299685, upload-time = "2025-08-07T13:24:38.824Z" },
{ url = "https://files.pythonhosted.org/packages/22/5c/85273fd7cc388285632b0498dbbab97596e04b154933dfe0f3e68156c68c/greenlet-3.2.4-cp314-cp314-macosx_11_0_universal2.whl", hash = "sha256:49a30d5fda2507ae77be16479bdb62a660fa51b1eb4928b524975b3bde77b3c0", size = 273586, upload-time = "2025-08-07T13:16:08.004Z" },
{ url = "https://files.pythonhosted.org/packages/d1/75/10aeeaa3da9332c2e761e4c50d4c3556c21113ee3f0afa2cf5769946f7a3/greenlet-3.2.4-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:299fd615cd8fc86267b47597123e3f43ad79c9d8a22bebdce535e53550763e2f", size = 686346, upload-time = "2025-08-07T13:42:59.944Z" },
@@ -963,6 +1029,8 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/dc/8b/29aae55436521f1d6f8ff4e12fb676f3400de7fcf27fccd1d4d17fd8fecd/greenlet-3.2.4-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:b4a1870c51720687af7fa3e7cda6d08d801dae660f75a76f3845b642b4da6ee1", size = 694659, upload-time = "2025-08-07T13:53:17.759Z" },
{ url = "https://files.pythonhosted.org/packages/92/2e/ea25914b1ebfde93b6fc4ff46d6864564fba59024e928bdc7de475affc25/greenlet-3.2.4-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:061dc4cf2c34852b052a8620d40f36324554bc192be474b9e9770e8c042fd735", size = 695355, upload-time = "2025-08-07T13:18:34.517Z" },
{ url = "https://files.pythonhosted.org/packages/72/60/fc56c62046ec17f6b0d3060564562c64c862948c9d4bc8aa807cf5bd74f4/greenlet-3.2.4-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:44358b9bf66c8576a9f57a590d5f5d6e72fa4228b763d0e43fee6d3b06d3a337", size = 657512, upload-time = "2025-08-07T13:18:33.969Z" },
+ { url = "https://files.pythonhosted.org/packages/23/6e/74407aed965a4ab6ddd93a7ded3180b730d281c77b765788419484cdfeef/greenlet-3.2.4-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:2917bdf657f5859fbf3386b12d68ede4cf1f04c90c3a6bc1f013dd68a22e2269", size = 1612508, upload-time = "2025-11-04T12:42:23.427Z" },
+ { url = "https://files.pythonhosted.org/packages/0d/da/343cd760ab2f92bac1845ca07ee3faea9fe52bee65f7bcb19f16ad7de08b/greenlet-3.2.4-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:015d48959d4add5d6c9f6c5210ee3803a830dce46356e3bc326d6776bde54681", size = 1680760, upload-time = "2025-11-04T12:42:25.341Z" },
{ url = "https://files.pythonhosted.org/packages/e3/a5/6ddab2b4c112be95601c13428db1d8b6608a8b6039816f2ba09c346c08fc/greenlet-3.2.4-cp314-cp314-win_amd64.whl", hash = "sha256:e37ab26028f12dbb0ff65f29a8d3d44a765c61e729647bf2ddfbbed621726f01", size = 303425, upload-time = "2025-08-07T13:32:27.59Z" },
]
@@ -1765,6 +1833,7 @@ dependencies = [
{ name = "openhands-workspace" },
{ name = "pandas" },
{ name = "pillow" },
+ { name = "swebench" },
{ name = "toml" },
{ name = "tqdm" },
{ name = "unidiff" },
@@ -1791,6 +1860,7 @@ requires-dist = [
{ name = "openhands-workspace", editable = "vendor/agent-sdk/openhands-workspace" },
{ name = "pandas" },
{ name = "pillow" },
+ { name = "swebench" },
{ name = "toml" },
{ name = "tqdm" },
{ name = "unidiff", specifier = ">=0.7.5,<0.8.0" },
@@ -5458,6 +5528,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/b7/ce/149a00dd41f10bc29e5921b496af8b574d8413afcd5e30dfa0ed46c2cc5e/six-1.17.0-py2.py3-none-any.whl", hash = "sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274", size = 11050, upload-time = "2024-12-04T17:35:26.475Z" },
]
+[[package]]
+name = "smmap"
+version = "5.0.2"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/44/cd/a040c4b3119bbe532e5b0732286f805445375489fceaec1f48306068ee3b/smmap-5.0.2.tar.gz", hash = "sha256:26ea65a03958fa0c8a1c7e8c7a58fdc77221b8910f6be2131affade476898ad5", size = 22329, upload-time = "2025-01-02T07:14:40.909Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/04/be/d09147ad1ec7934636ad912901c5fd7667e1c858e19d355237db0d0cd5e4/smmap-5.0.2-py3-none-any.whl", hash = "sha256:b30115f0def7d7531d22a0fb6502488d879e75b260a9db4d0819cfb25403af5e", size = 24303, upload-time = "2025-01-02T07:14:38.724Z" },
+]
+
[[package]]
name = "sniffio"
version = "1.3.1"
@@ -5467,6 +5546,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/e9/44/75a9c9421471a6c4805dbf2356f7c181a29c1879239abab1ea2cc8f38b40/sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2", size = 10235, upload-time = "2024-02-25T23:20:01.196Z" },
]
+[[package]]
+name = "soupsieve"
+version = "2.8"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/6d/e6/21ccce3262dd4889aa3332e5a119a3491a95e8f60939870a3a035aabac0d/soupsieve-2.8.tar.gz", hash = "sha256:e2dd4a40a628cb5f28f6d4b0db8800b8f581b65bb380b97de22ba5ca8d72572f", size = 103472, upload-time = "2025-08-27T15:39:51.78Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/14/a0/bb38d3b76b8cae341dad93a2dd83ab7462e6dbcdd84d43f54ee60a8dc167/soupsieve-2.8-py3-none-any.whl", hash = "sha256:0cc76456a30e20f5d7f2e14a98a4ae2ee4e5abdc7c5ea0aafe795f344bc7984c", size = 36679, upload-time = "2025-08-27T15:39:50.179Z" },
+]
+
[[package]]
name = "sqlalchemy"
version = "2.0.43"
@@ -5521,6 +5609,31 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/ce/fd/901cfa59aaa5b30a99e16876f11abe38b59a1a2c51ffb3d7142bb6089069/starlette-0.47.3-py3-none-any.whl", hash = "sha256:89c0778ca62a76b826101e7c709e70680a1699ca7da6b44d38eb0a7e61fe4b51", size = 72991, upload-time = "2025-08-24T13:36:40.887Z" },
]
+[[package]]
+name = "swebench"
+version = "4.1.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "beautifulsoup4" },
+ { name = "chardet" },
+ { name = "datasets" },
+ { name = "docker" },
+ { name = "ghapi" },
+ { name = "gitpython" },
+ { name = "modal" },
+ { name = "pre-commit" },
+ { name = "python-dotenv" },
+ { name = "requests" },
+ { name = "rich" },
+ { name = "tenacity" },
+ { name = "tqdm" },
+ { name = "unidiff" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/24/e1/c997299ad7bf088876d30398203aa1eed7dec897670dc1aa35b1d748ffcc/swebench-4.1.0.tar.gz", hash = "sha256:5aaa6a92c2db1aa64892d28a47483ca46a45a15cf1d2df673d7744f71811dc9a", size = 134341, upload-time = "2025-09-11T02:58:00.447Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/36/67/981d8b642ac3eac7c8a7b7832ff8b2fb74f96b28b5fcd9a8979879e5c46d/swebench-4.1.0-py3-none-any.whl", hash = "sha256:1243776f720047cc9e20a427f7a52b75c13a07abda6154fb60fe77f82ec8af57", size = 157231, upload-time = "2025-09-11T02:57:58.953Z" },
+]
+
[[package]]
name = "synchronicity"
version = "0.10.2"