Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
82 commits
Select commit Hold shift + click to select a range
c9b86c1
Add GitHub workflow for building SWE-Bench images with Blacksmith cac…
openhands-agent Oct 27, 2025
5752043
Use Blacksmith's setup-docker-builder action for faster Docker layer …
openhands-agent Nov 3, 2025
282f863
Merge commit 'bb150852c64a555806cfa939f31e8f9abd7b3791' into openhand…
xingyaoww Nov 4, 2025
8508006
revert unneeded stuff
xingyaoww Nov 4, 2025
a565e77
simplify setup dependency
xingyaoww Nov 4, 2025
9bbd7fb
set eval-agent-server
xingyaoww Nov 4, 2025
c661b2c
fix line break
xingyaoww Nov 4, 2025
632432e
default to 10 for testing
xingyaoww Nov 4, 2025
c536903
run on all prs for debugging
xingyaoww Nov 4, 2025
efb731f
Fix pyarrow build issue by forcing binary wheel installation
openhands-agent Nov 4, 2025
29084f2
Pin Python version to 3.12 to fix pyarrow compatibility
openhands-agent Nov 4, 2025
551405b
Fix artifact upload naming to avoid invalid characters
openhands-agent Nov 4, 2025
90b6ed6
Fix artifact upload by archiving logs to avoid invalid filename chara…
openhands-agent Nov 4, 2025
3ba1e46
Fix Docker cache tag length exceeding 128 character limit
openhands-agent Nov 4, 2025
21bb226
Update patch with pre-commit formatting fixes
openhands-agent Nov 4, 2025
2f89775
checkout to v1.0.0 of sdk
xingyaoww Nov 6, 2025
dfb966b
update uv.lock
xingyaoww Nov 6, 2025
d04de8a
Merge commit 'dfb966bd2d3e4d2086223cf4ff85d998d15354d4' into openhand…
xingyaoww Nov 6, 2025
cdd7200
Revert "Fix Docker cache tag length exceeding 128 character limit"
xingyaoww Nov 6, 2025
001bcee
Fix log file mixing issue by using ProcessPoolExecutor
openhands-agent Nov 6, 2025
271b527
Improve Docker image tagging for reproducibility
openhands-agent Nov 6, 2025
92f04c1
refactor: omit target suffix for binary builds (default case)
openhands-agent Nov 6, 2025
49d9667
fix: update SDK to use SDK_VERSION for commit tags
openhands-agent Nov 6, 2025
c2711a3
refactor: remove SDK_VERSION_OVERRIDE logic
openhands-agent Nov 6, 2025
6d6845e
chore: update SDK to commit 85e436df
openhands-agent Nov 6, 2025
8d8ed8c
update agent-sdk version
xingyaoww Nov 7, 2025
8763fad
improve custom tags for swebench image
xingyaoww Nov 7, 2025
99927f8
Revert "update agent-sdk version"
xingyaoww Nov 7, 2025
8ed14f3
Merge commit '2ca8a917036ddb6ac069b3ecbb0f14ec616a4883' into openhand…
xingyaoww Nov 7, 2025
7e3c50e
update sha
xingyaoww Nov 7, 2025
c118297
fix: update run_infer.py to use new SDK tag format
openhands-agent Nov 7, 2025
4f3f9b1
refactor: deduplicate extract_custom_tag by importing from run_infer
openhands-agent Nov 7, 2025
26c3f02
docs: clarify SHORT_SHA source in run_infer.py
openhands-agent Nov 7, 2025
89e4cda
update sdk
xingyaoww Nov 7, 2025
eacfe0b
refactor
xingyaoww Nov 7, 2025
3a2c009
remove tagging changes
xingyaoww Nov 7, 2025
84c8876
bump commit
xingyaoww Nov 7, 2025
de46db7
simplify build script
xingyaoww Nov 7, 2025
bcbd455
bump version
xingyaoww Nov 7, 2025
96f2da6
bump
xingyaoww Nov 7, 2025
aad870b
bump
xingyaoww Nov 7, 2025
acee9cb
refactor build util into shared file
xingyaoww Nov 7, 2025
a4bf9e4
simplify build on the fly logic
xingyaoww Nov 7, 2025
9ef0d48
remove targets and platform
xingyaoww Nov 7, 2025
06e994a
Add automatic comment to issue #81 on successful build
openhands-agent Nov 7, 2025
fba2a55
Fix SDK URL and add workflow trigger information
openhands-agent Nov 7, 2025
0ab219f
Update .gitignore to properly allow .openhands/microagents/
openhands-agent Nov 7, 2025
aa8b452
Add error handling to skip comment when no images are built
openhands-agent Nov 7, 2025
a95969e
Fix manifest file path detection using find command
openhands-agent Nov 7, 2025
46b5266
bump sdk
xingyaoww Nov 7, 2025
16526b3
increase n work and n limit
xingyaoww Nov 7, 2025
90ee94e
Show only one tag per image in issue comment
openhands-agent Nov 7, 2025
2d10954
bump sdk commit
xingyaoww Nov 8, 2025
178123e
increase to 500 limit and 32 concurrency
xingyaoww Nov 8, 2025
0619134
disable rebuild on every push
xingyaoww Nov 10, 2025
e67b9b0
Fix workflow summary mismatch: use manifest.jsonl instead of summary.…
openhands-agent Nov 10, 2025
822e417
Remove redundant 'Upload build manifest' step
openhands-agent Nov 10, 2025
04f0cf4
bump sdk to v1.1
xingyaoww Nov 11, 2025
a1c93c9
support remote runtime & bump ver again
xingyaoww Nov 11, 2025
07abd72
fix target type
xingyaoww Nov 11, 2025
59b6631
Merge commit '89162cbbba455b5b6aa69c9facbd8c11eb6ed9f2' into xw/remot…
xingyaoww Nov 11, 2025
4949957
bump sdk
xingyaoww Nov 11, 2025
cc121b5
Merge commit '4dab8b1e02bd89e2ffa258847c917746967e67dd' into xw/remot…
xingyaoww Nov 11, 2025
94c4326
check image exists before launching remote runtime job
xingyaoww Nov 12, 2025
0f621e4
Merge commit '34bcaea6fbf0477b6f6691ec9d2bbcda7dcafbcc' into xw/remot…
xingyaoww Nov 12, 2025
d7d6faf
Merge commit '15fd19d91fa933d20790abb3f87098f3d0874399' into xw/remot…
xingyaoww Nov 12, 2025
422282e
Merge commit '03cd6395e407d1463ed99e2eb80466fe9b10d590' into xw/remot…
xingyaoww Nov 13, 2025
5d734aa
trying fixing docker build trigger
xingyaoww Nov 13, 2025
3e1f8f9
fix typo
xingyaoww Nov 13, 2025
8601875
tweak
xingyaoww Nov 13, 2025
af6966a
tweak
xingyaoww Nov 13, 2025
2160810
drop default
xingyaoww Nov 13, 2025
19d58fa
Merge commit 'b3f5ab74e589803943cd65414ef2510e6b1d2966' into xw/remot…
xingyaoww Nov 13, 2025
fd5c0c6
sleep after failure
xingyaoww Nov 13, 2025
ea3f69f
check target image existence before build
xingyaoww Nov 13, 2025
fbe7657
misc improvements
xingyaoww Nov 13, 2025
fe66a87
Improve multiprocessing logging and add metadata persistence
xingyaoww Nov 13, 2025
f6abf40
Merge commit '3dfeb4b443a693e0327aa6cf83eb21c926c8246c' into xw/misc-…
xingyaoww Nov 13, 2025
297491d
Add stdout/stderr redirection for instance-specific logging
xingyaoww Nov 13, 2025
c56b0f9
Add proper KeyboardInterrupt handling with process cleanup
xingyaoww Nov 13, 2025
3d5f197
rename output file
xingyaoww Nov 13, 2025
8bf1038
Refactor: simplify pool cleanup with helper method
xingyaoww Nov 13, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions benchmarks/swe_bench/eval_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,8 +212,8 @@ def main() -> None:

parser.add_argument(
"--model-name",
default="OpenHands",
help="Model name to use in the model_name_or_path field (default: OpenHands)",
default="openhands",
help="Model name to use in the model_name_or_path field (default: openhands)",
)

parser.add_argument(
Expand Down
256 changes: 199 additions & 57 deletions benchmarks/utils/evaluation.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,10 @@

import json
import os
import sys
from abc import ABC, abstractmethod
from concurrent.futures import ProcessPoolExecutor, as_completed
from contextlib import contextmanager
from typing import Callable, List, Optional, Tuple

from pydantic import BaseModel, Field
Expand Down Expand Up @@ -35,6 +37,17 @@ class Evaluation(ABC, BaseModel):
metadata: EvalMetadata
num_workers: int = Field(default=1, ge=1)

def model_post_init(self, __context) -> None:
    """Persist evaluation metadata as JSON once the model is constructed.

    Ensures the configured output directory exists, then writes the
    metadata to ``metadata.json`` inside it so each run can be traced
    back from its output folder alone.
    """
    output_dir = self.metadata.eval_output_dir
    os.makedirs(output_dir, exist_ok=True)

    metadata_path = os.path.join(output_dir, "metadata.json")
    with open(metadata_path, "w", encoding="utf-8") as fp:
        fp.write(self.metadata.model_dump_json(indent=2))
    logger.info(f"Saved metadata to {metadata_path}")

@property
def output_path(self) -> str:
return os.path.join(self.metadata.eval_output_dir, OUTPUT_FILENAME)
Expand Down Expand Up @@ -247,9 +260,9 @@ def attempt_on_result(instance: EvalInstance, out: EvalOutput) -> None:
logger.warning("on_result callback failed: %s", cb_err)

# Run evaluation for this attempt
with ProcessPoolExecutor(
max_workers=self.num_workers, initializer=_child_init
) as pool:
pool = ProcessPoolExecutor(max_workers=self.num_workers)
futures = []
try:
futures = [
pool.submit(self._process_one_mp, inst)
for inst in instances_to_process
Expand All @@ -271,6 +284,17 @@ def attempt_on_result(instance: EvalInstance, out: EvalOutput) -> None:
stack_info=True,
)

# Normal completion - shutdown gracefully
pool.shutdown(wait=True)
except KeyboardInterrupt:
logger.warning("KeyboardInterrupt received, shutting down workers...")
self._cleanup_pool(pool, futures, wait=False)
logger.info("All workers terminated")
raise
except Exception:
self._cleanup_pool(pool, futures, wait=False)
raise

# Restore original temperature
if attempt > 1 and original_temperature == 0.0:
self.metadata.llm.temperature = original_temperature
Expand All @@ -296,6 +320,34 @@ def attempt_on_result(instance: EvalInstance, out: EvalOutput) -> None:
)
return all_outputs

def _cleanup_pool(
self,
pool: ProcessPoolExecutor,
futures: list,
wait: bool = False,
) -> None:
"""Clean up pool by canceling futures, terminating workers, and shutting down.

Args:
pool: The ProcessPoolExecutor to clean up
futures: List of futures to cancel
wait: Whether to wait for workers to finish (True) or terminate immediately (False)
"""
# Cancel all pending futures
for fut in futures:
fut.cancel()

# Forcefully terminate all worker processes if not waiting
if not wait and hasattr(pool, "_processes") and pool._processes:
for process in pool._processes.values():
try:
process.terminate()
except Exception:
pass

# Shutdown the pool
pool.shutdown(wait=wait, cancel_futures=True)

# --- Worker-side method (executed in child processes) ---------------------------
def _process_one_mp(
self, instance: EvalInstance
Expand All @@ -307,67 +359,157 @@ def _process_one_mp(
- Ensures proper context-managed cleanup
- Returns (instance, output) so the parent can stream results
"""
logger.info("[child] start id=%s", instance.id)
# Set up instance-specific logging
log_dir = os.path.join(self.metadata.eval_output_dir, "logs")
reset_logger_for_multiprocessing(log_dir, instance.id)

retry_count = 0
last_error = None
max_retries = self.metadata.max_retries
# Get log file path for stdout/stderr redirection
log_file = os.path.join(log_dir, f"instance_{instance.id}.output.log")

while retry_count <= max_retries:
workspace = None
try:
workspace = self.prepare_workspace(instance)
out = self.evaluate_instance(instance, workspace)
logger.info("[child] done id=%s", instance.id)
return instance, out
except Exception as e:
last_error = e
retry_count += 1

if retry_count <= max_retries:
logger.warning(
f"[child] Instance {instance.id} failed "
f"(attempt {retry_count}/{max_retries}): "
f"{str(e)[:50]}"
)
else:
logger.error(
f"[child] Instance {instance.id} failed after "
f"{max_retries} retries. Last error: {str(e)[:50]}",
exc_info=True,
)
# Create error output for final failure
error_output = self._create_error_output(
instance, last_error, max_retries
)
return instance, error_output
finally:
# Ensure workspace cleanup happens regardless of success or failure
if workspace is not None:
try:
# Use the context manager protocol for cleanup
workspace.__exit__(None, None, None)
logger.debug(
"[child] cleaned up workspace for id=%s", instance.id
)
except Exception as cleanup_error:
# Redirect stdout/stderr to capture all output (SDK visualizations, etc.)
with redirect_stdout_stderr(log_file):
logger.info("[child] start id=%s", instance.id)

retry_count = 0
last_error = None
max_retries = self.metadata.max_retries

while retry_count <= max_retries:
workspace = None
try:
workspace = self.prepare_workspace(instance)
out = self.evaluate_instance(instance, workspace)
logger.info("[child] done id=%s", instance.id)
return instance, out
except Exception as e:
last_error = e
retry_count += 1

if retry_count <= max_retries:
logger.warning(
f"[child] Failed to cleanup workspace for {instance.id}: "
f"{str(cleanup_error)[:50]}"
f"[child] Instance {instance.id} failed "
f"(attempt {retry_count}/{max_retries}): "
f"{str(e)[:50]}"
)
else:
logger.error(
f"[child] Instance {instance.id} failed after "
f"{max_retries} retries. Last error: {str(e)[:50]}",
exc_info=True,
)
# Create error output for final failure
error_output = self._create_error_output(
instance, last_error, max_retries
)
return instance, error_output
finally:
# Ensure workspace cleanup happens regardless of success or failure
if workspace is not None:
try:
# Use the context manager protocol for cleanup
workspace.__exit__(None, None, None)
logger.debug(
"[child] cleaned up workspace for id=%s", instance.id
)
except Exception as cleanup_error:
logger.warning(
f"[child] Failed to cleanup workspace for {instance.id}: "
f"{str(cleanup_error)[:50]}"
)

# This should never be reached, but added for type safety
error_output = self._create_error_output(
instance, Exception("Unexpected error: no attempts made"), max_retries
)
return instance, error_output

# This should never be reached, but added for type safety
error_output = self._create_error_output(
instance, Exception("Unexpected error: no attempts made"), max_retries
)
return instance, error_output

# ---------- Multiprocessing logging helper ---------------------------------------


# ---------- Optional per-process initializer ---------------------------------------
def reset_logger_for_multiprocessing(log_dir: str, instance_id: str) -> None:
    """Reset the logger for multiprocessing with instance-specific logging.

    Save logs to a separate file for each instance, instead of trying to write to the
    same file/console from multiple processes. This provides:
    - One INFO line to console at start with tail hint
    - All subsequent logs go to instance-specific file
    - Only WARNING+ messages go to console after initial message

    Args:
        log_dir: Directory to store log files
        instance_id: Unique identifier for the instance being processed
    """
    import logging

    # Create the log directory up front so both the tail hint below and the
    # FileHandler refer to a path that actually exists.
    os.makedirs(log_dir, exist_ok=True)
    log_file = os.path.join(log_dir, f"instance_{instance_id}.log")

    # Get root logger and remove all existing handlers inherited from the parent.
    root_logger = logging.getLogger()
    for handler in root_logger.handlers[:]:
        root_logger.removeHandler(handler)

    # Console handler starts at INFO so the one startup line is visible.
    console_handler = logging.StreamHandler()
    console_handler.setLevel(logging.INFO)
    console_handler.setFormatter(
        logging.Formatter(
            f"Instance {instance_id} - " + "%(asctime)s - %(levelname)s - %(message)s"
        )
    )
    root_logger.addHandler(console_handler)
    root_logger.setLevel(logging.DEBUG)

    # Print one INFO line with helpful hint
    root_logger.info(
        f"Starting evaluation for instance {instance_id}.\n"
        f'Hint: run "tail -f {log_file}" to see live logs in a separate shell'
    )

    # Now set console to WARNING+ only
    console_handler.setLevel(logging.WARNING)

    # Detailed logs go to the per-instance file; encoding pinned to UTF-8 for
    # consistency with the module's other open(..., encoding="utf-8") calls.
    file_handler = logging.FileHandler(log_file, encoding="utf-8")
    file_handler.setFormatter(
        logging.Formatter("%(asctime)s - %(levelname)s - %(name)s - %(message)s")
    )
    file_handler.setLevel(logging.INFO)
    root_logger.addHandler(file_handler)


@contextmanager
def redirect_stdout_stderr(log_file_path: str):
    """Temporarily route stdout/stderr into a log file.

    Captures print() statements, SDK visualizations, and anything else the
    wrapped code writes to the standard streams; the original streams are
    always restored on exit, even if the body raises.

    Args:
        log_file_path: Destination file, opened in append mode with line
            buffering so output appears promptly.
    """
    saved_out, saved_err = sys.stdout, sys.stderr
    sink = None
    try:
        # Line-buffered append so concurrent tails see output as it happens.
        sink = open(log_file_path, "a", buffering=1, encoding="utf-8")
        sys.stdout = sink
        sys.stderr = sink
        yield
    finally:
        # Put the real streams back before closing the sink.
        sys.stdout = saved_out
        sys.stderr = saved_err
        if sink is not None and not sink.closed:
            sink.close()
Loading