diff --git a/benchmarks/commit0/eval_infer.py b/benchmarks/commit0/eval_infer.py index 71d5048eb..f03e73f66 100644 --- a/benchmarks/commit0/eval_infer.py +++ b/benchmarks/commit0/eval_infer.py @@ -15,6 +15,7 @@ import sys from pathlib import Path +from benchmarks.utils.laminar import LaminarService from benchmarks.utils.report_costs import generate_cost_report @@ -199,6 +200,9 @@ def main() -> None: # Process results and generate report process_commit0_results(str(input_file), str(output_file), args.model_name) + # Update Laminar datapoints with evaluation scores + LaminarService.get().update_evaluation_scores(str(input_file), str(output_file)) + # Generate cost report as final step generate_cost_report(str(input_file)) diff --git a/benchmarks/commit0/run_infer.py b/benchmarks/commit0/run_infer.py index 156564fe0..1d7673135 100644 --- a/benchmarks/commit0/run_infer.py +++ b/benchmarks/commit0/run_infer.py @@ -152,7 +152,9 @@ def prepare_instances(self) -> List[EvalInstance]: logger.info("Total instances to process: %d", len(instances)) return instances - def prepare_workspace(self, instance: EvalInstance) -> RemoteWorkspace: + def prepare_workspace( + self, instance: EvalInstance, forward_env: list[str] | None = None + ) -> RemoteWorkspace: """ Create workspace and set up the commit0 repository. """ @@ -167,6 +169,7 @@ def prepare_workspace(self, instance: EvalInstance) -> RemoteWorkspace: base_image=base_docker_image, working_dir="/workspace", target=build_target, + forward_env=forward_env or [], ) logger.info( f"Building workspace from {base_docker_image}. This may take a while..." @@ -201,6 +204,7 @@ def prepare_workspace(self, instance: EvalInstance) -> RemoteWorkspace: runtime_api_key=runtime_api_key, server_image=agent_server_image, target_type="source" if "source" in build_target else "binary", + forward_env=forward_env or [], ) else: raise ValueError( diff --git a/benchmarks/gaia/eval_infer.py b/benchmarks/gaia/eval_infer.py index 17d99adce..889d132d1 100644 --- a/benchmarks/gaia/eval_infer.py +++ b/benchmarks/gaia/eval_infer.py @@ -18,6 +18,7 @@ import sys from pathlib import Path +from benchmarks.utils.laminar import LaminarService from benchmarks.utils.report_costs import generate_cost_report from openhands.sdk import get_logger @@ -226,6 +227,9 @@ def main() -> None: args.model_name, ) + # Update Laminar datapoints with evaluation scores + LaminarService.get().update_evaluation_scores(str(input_file), str(output_file)) + # Generate cost report as final step generate_cost_report(str(input_file)) diff --git a/benchmarks/gaia/run_infer.py b/benchmarks/gaia/run_infer.py index 0a0569ab9..8210a6f53 100644 --- a/benchmarks/gaia/run_infer.py +++ b/benchmarks/gaia/run_infer.py @@ -116,7 +116,9 @@ def prepare_instances(self) -> List[EvalInstance]: logger.info(f"Total instances to process: {len(instances)}") return instances - def prepare_workspace(self, instance: EvalInstance) -> RemoteWorkspace: + def prepare_workspace( + self, instance: EvalInstance, forward_env: list[str] | None = None + ) -> RemoteWorkspace: """Create workspace and copy necessary files.""" logger.info(f"Preparing workspace for instance {instance.id}") @@ -125,6 +127,7 @@ def prepare_workspace(self, instance: EvalInstance) -> RemoteWorkspace: workspace = DockerDevWorkspace( base_image="nikolaik/python-nodejs:python3.12-nodejs22", working_dir="/workspace", + forward_env=forward_env or [], ) elif self.metadata.workspace_type == "remote": # For workflow, use APIRemoteWorkspace with pre-built GAIA image @@ -159,6 +162,7 @@ def prepare_workspace(self, instance: EvalInstance) -> RemoteWorkspace: runtime_api_key=runtime_api_key, server_image=agent_server_image, target_type="binary", # GAIA images use binary target + forward_env=forward_env or [], ) else: raise ValueError( diff --git a/benchmarks/multiswebench/eval_infer.py b/benchmarks/multiswebench/eval_infer.py index a39d92d45..3bb88cf1a 100644 --- a/benchmarks/multiswebench/eval_infer.py +++ b/benchmarks/multiswebench/eval_infer.py @@ -18,6 +18,7 @@ from benchmarks.multiswebench.scripts.eval.update_multi_swe_bench_config import ( update_multi_swe_config, ) +from benchmarks.utils.laminar import LaminarService from openhands.sdk import get_logger @@ -143,6 +144,11 @@ def main(): shutil.move(str(results_file), str(output_report_path)) logger.info(f"Report moved to {output_report_path}") + # Update Laminar datapoints with evaluation scores + LaminarService.get().update_evaluation_scores( + str(args.input_file), str(output_report_path) + ) + if __name__ == "__main__": main() diff --git a/benchmarks/multiswebench/run_infer.py b/benchmarks/multiswebench/run_infer.py index 9a64c73df..ff9d99b84 100644 --- a/benchmarks/multiswebench/run_infer.py +++ b/benchmarks/multiswebench/run_infer.py @@ -173,7 +173,9 @@ def prepare_instances(self) -> List[EvalInstance]: return instances # ---- Hook: prepare a workspace per instance ---------------------------------- - def prepare_workspace(self, instance: EvalInstance) -> RemoteWorkspace: + def prepare_workspace( + self, instance: EvalInstance, forward_env: list[str] | None = None + ) -> RemoteWorkspace: """ Use DockerWorkspace by default. """ @@ -228,6 +230,7 @@ def prepare_workspace(self, instance: EvalInstance) -> RemoteWorkspace: workspace = DockerWorkspace( server_image=agent_server_image, working_dir="/workspace", + forward_env=forward_env or [], ) elif self.metadata.workspace_type == "remote": runtime_api_key = os.getenv("RUNTIME_API_KEY") @@ -255,6 +258,7 @@ def prepare_workspace(self, instance: EvalInstance) -> RemoteWorkspace: runtime_api_key=runtime_api_key, server_image=agent_server_image, target_type="source" if "source" in build_target else "binary", + forward_env=forward_env or [], ) else: raise ValueError( diff --git a/benchmarks/openagentsafety/run_infer.py b/benchmarks/openagentsafety/run_infer.py index d92064568..3b77e68d8 100644 --- a/benchmarks/openagentsafety/run_infer.py +++ b/benchmarks/openagentsafety/run_infer.py @@ -359,7 +359,9 @@ def prepare_instances(self) -> List[EvalInstance]: logger.info("Total instances to process: %d", len(instances)) return instances - def prepare_workspace(self, instance: EvalInstance) -> RemoteWorkspace: + def prepare_workspace( + self, instance: EvalInstance, forward_env: list[str] | None = None + ) -> RemoteWorkspace: """Create a fresh Docker workspace for this instance.""" server_image = build_workspace_image() @@ -367,6 +369,7 @@ def prepare_workspace(self, instance: EvalInstance) -> RemoteWorkspace: server_image=server_image, platform="linux/amd64", extra_ports=True, + forward_env=forward_env or [], ) # Setup host mapping for The Agent Company services diff --git a/benchmarks/swebench/eval_infer.py b/benchmarks/swebench/eval_infer.py index 39ae7c0c2..f252a56a3 100644 --- a/benchmarks/swebench/eval_infer.py +++ b/benchmarks/swebench/eval_infer.py @@ -16,6 +16,7 @@ import sys from pathlib import Path +from benchmarks.utils.laminar import LaminarService from benchmarks.utils.patch_utils import remove_files_from_patch from benchmarks.utils.report_costs import generate_cost_report from openhands.sdk import get_logger @@ -265,6 +266,11 @@ def main() -> None: shutil.move(str(report_path), str(dest_report_path)) logger.info(f"Moved report file to: {dest_report_path}") + # Update Laminar datapoints with evaluation scores + LaminarService.get().update_evaluation_scores( + str(input_file), str(dest_report_path) + ) + # Generate cost report as final step generate_cost_report(str(input_file)) diff --git a/benchmarks/swebench/run_infer.py b/benchmarks/swebench/run_infer.py index 94846c4dc..036751b14 100644 --- a/benchmarks/swebench/run_infer.py +++ b/benchmarks/swebench/run_infer.py @@ -96,7 +96,9 @@ def prepare_instances(self) -> List[EvalInstance]: return instances # ---- Hook: prepare a workspace per instance ---------------------------------- - def prepare_workspace(self, instance: EvalInstance) -> RemoteWorkspace: + def prepare_workspace( + self, instance: EvalInstance, forward_env: list[str] | None = None + ) -> RemoteWorkspace: """ Use DockerWorkspace by default. """ @@ -148,6 +150,7 @@ def prepare_workspace(self, instance: EvalInstance) -> RemoteWorkspace: workspace = DockerWorkspace( server_image=agent_server_image, working_dir="/workspace", + forward_env=forward_env or [], ) elif self.metadata.workspace_type == "remote": runtime_api_key = os.getenv("RUNTIME_API_KEY") @@ -175,6 +178,7 @@ def prepare_workspace(self, instance: EvalInstance) -> RemoteWorkspace: runtime_api_key=runtime_api_key, server_image=agent_server_image, target_type="source" if "source" in build_target else "binary", + forward_env=forward_env or [], ) else: raise ValueError( diff --git a/benchmarks/swebenchmultimodal/run_infer.py b/benchmarks/swebenchmultimodal/run_infer.py index 2311c7783..1dcd4ae96 100644 --- a/benchmarks/swebenchmultimodal/run_infer.py +++ b/benchmarks/swebenchmultimodal/run_infer.py @@ -103,7 +103,9 @@ def prepare_instances(self) -> List[EvalInstance]: return instances # ---- Hook: prepare a workspace per instance ---------------------------------- - def prepare_workspace(self, instance: EvalInstance) -> RemoteWorkspace: + def prepare_workspace( + self, instance: EvalInstance, forward_env: list[str] | None = None + ) -> RemoteWorkspace: """ Use DockerWorkspace by default. """ @@ -148,6 +150,7 @@ def prepare_workspace(self, instance: EvalInstance) -> RemoteWorkspace: workspace = DockerWorkspace( server_image=agent_server_image, working_dir="/workspace", + forward_env=forward_env or [], ) elif self.metadata.workspace_type == "remote": runtime_api_key = os.getenv("RUNTIME_API_KEY") @@ -175,6 +178,7 @@ def prepare_workspace(self, instance: EvalInstance) -> RemoteWorkspace: runtime_api_key=runtime_api_key, server_image=agent_server_image, target_type="source" if "source" in build_target else "binary", + forward_env=forward_env or [], ) else: raise ValueError( diff --git a/benchmarks/swtbench/eval_infer.py b/benchmarks/swtbench/eval_infer.py index 27c82a8e0..94cb120ad 100644 --- a/benchmarks/swtbench/eval_infer.py +++ b/benchmarks/swtbench/eval_infer.py @@ -17,6 +17,7 @@ import sys from pathlib import Path +from benchmarks.utils.laminar import LaminarService from benchmarks.utils.patch_utils import remove_files_from_patch from benchmarks.utils.report_costs import generate_cost_report from openhands.sdk import get_logger @@ -380,6 +381,11 @@ def main() -> None: logger.info(f"Moved evaluation report to: {target_file}") update_report_with_submitted_instances(target_file, output_file) + # Update Laminar datapoints with evaluation scores + LaminarService.get().update_evaluation_scores( + str(input_file), str(target_file) + ) + # Generate cost report as final step generate_cost_report(str(input_file)) diff --git a/benchmarks/swtbench/run_infer.py b/benchmarks/swtbench/run_infer.py index 983bf6d87..56426b53b 100644 --- a/benchmarks/swtbench/run_infer.py +++ b/benchmarks/swtbench/run_infer.py @@ -135,7 +135,9 @@ def prepare_instances(self) -> List[EvalInstance]: return instances # ---- Hook: prepare a workspace per instance ---------------------------------- - def prepare_workspace(self, instance: EvalInstance) -> RemoteWorkspace: + def prepare_workspace( + self, instance: EvalInstance, forward_env: list[str] | None = None + ) -> RemoteWorkspace: """ Create workspace based on workspace_type (docker or remote). """ @@ -168,11 +170,13 @@ def prepare_workspace(self, instance: EvalInstance) -> RemoteWorkspace: base_image=official_docker_image, working_dir="/workspace", target=build_target, + forward_env=forward_env or [], ) else: workspace = DockerWorkspace( server_image=agent_server_image, working_dir="/workspace", + forward_env=forward_env or [], ) elif self.metadata.workspace_type == "remote": runtime_api_key = os.getenv("RUNTIME_API_KEY") @@ -200,6 +204,7 @@ def prepare_workspace(self, instance: EvalInstance) -> RemoteWorkspace: runtime_api_key=runtime_api_key, server_image=agent_server_image, target_type="source" if "source" in build_target else "binary", + forward_env=forward_env or [], ) else: raise ValueError( diff --git a/benchmarks/utils/evaluation.py b/benchmarks/utils/evaluation.py index 0391bedcb..70ea6c711 100644 --- a/benchmarks/utils/evaluation.py +++ b/benchmarks/utils/evaluation.py @@ -9,15 +9,19 @@ from abc import ABC, abstractmethod from concurrent.futures import ProcessPoolExecutor, as_completed from contextlib import contextmanager +from datetime import datetime from pathlib import Path from typing import Callable, List, Optional, Tuple +from uuid import UUID +from lmnr import Laminar from pydantic import BaseModel, Field from tqdm import tqdm from benchmarks.utils.constants import OUTPUT_FILENAME from benchmarks.utils.critics import get_completed_instances from benchmarks.utils.iterative import aggregate_results, get_failed_instances +from benchmarks.utils.laminar import LMNR_ENV_VARS, LaminarEvalMetadata, LaminarService from benchmarks.utils.models import ( EvalInstance, EvalInstanceID, @@ -75,7 +79,9 @@ def prepare_instances(self) -> List[EvalInstance]: raise NotImplementedError @abstractmethod - def prepare_workspace(self, instance: EvalInstance) -> RemoteWorkspace: + def prepare_workspace( + self, instance: EvalInstance, forward_env: list[str] | None = None + ) -> RemoteWorkspace: """Create and return a context-managed Workspace for the given instance.""" raise NotImplementedError @@ -232,6 +238,19 @@ def _run_iterative_mode( """Run evaluation with support for single or multiple attempts.""" all_instances = self.prepare_instances() + # Initialize Laminar + LaminarService.get().initialize() + + # Create Laminar evaluation + now = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + self.metadata.lmnr = LaminarEvalMetadata( + eval_id=LaminarService.get().create_evaluation( + name=f"{self.metadata.dataset} {self.metadata.dataset_split} {now}", + group_name=f"{self.metadata.dataset} {self.metadata.dataset_split}", + metadata=self.metadata.model_dump(mode="json"), + ) + ) + total_instances = len(all_instances) logger.info("prepared %d instances for evaluation", total_instances) @@ -290,10 +309,23 @@ def attempt_on_result(instance: EvalInstance, out: EvalOutput) -> None: pool = ProcessPoolExecutor(max_workers=self.num_workers) futures = [] try: - futures = [ - pool.submit(self._process_one_mp, inst) - for inst in instances_to_process - ] + futures = [] + lmnr_datapoints: dict[str, UUID] = dict() + for index, inst in enumerate(instances_to_process): + datapoint_id, lmnr_span_ctx = ( + LaminarService.get().create_evaluation_datapoint( + self.metadata.lmnr.eval_id, + inst.id, + self.metadata.model_dump(mode="json"), + index, + ) + ) + if datapoint_id is not None: + lmnr_datapoints[inst.id] = datapoint_id + + futures.append( + pool.submit(self._process_one_mp, inst, lmnr_span_ctx) + ) for fut in tqdm( as_completed(futures), @@ -303,6 +335,15 @@ def attempt_on_result(instance: EvalInstance, out: EvalOutput) -> None: ): try: instance, out = fut.result() + + # Add Laminar metadata to EvalOutput so we can use it in the evaluation process + if out.metadata is None: + out.metadata = self.metadata.model_copy(deep=True) + out.metadata.lmnr = LaminarEvalMetadata( + eval_id=self.metadata.lmnr.eval_id, + datapoint_id=lmnr_datapoints.get(instance.id, None), + ) + attempt_on_result(instance, out) except Exception as e: logger.error( @@ -377,7 +418,7 @@ def _cleanup_pool( # --- Worker-side method (executed in child processes) --------------------------- def _process_one_mp( - self, instance: EvalInstance + self, instance: EvalInstance, eval_span_ctx: str | None ) -> Tuple[EvalInstance, EvalOutput]: """Execute one instance in a child process with retry logic. @@ -403,8 +444,23 @@ def _process_one_mp( while retry_count <= max_retries: workspace = None + + # Start Laminar execution span and inject context into os.environ so workspace can pick it up + # Escape the serialized context to safely pass as a cli argument + lmnr_span = Laminar.start_active_span( + "Execution", + span_type="EXECUTOR", # type: ignore + parent_span_context=Laminar.deserialize_span_context(eval_span_ctx) + if eval_span_ctx + else None, + ) + exec_span_ctx = json.dumps(Laminar.serialize_span_context(lmnr_span)) + os.environ["LMNR_SPAN_CONTEXT"] = exec_span_ctx or "" + try: - workspace = self.prepare_workspace(instance) + workspace = self.prepare_workspace( + instance, forward_env=LMNR_ENV_VARS + ) out = self.evaluate_instance(instance, workspace) # Capture conversation archive after successful evaluation @@ -415,6 +471,7 @@ def _process_one_mp( except Exception as e: last_error = e retry_count += 1 + lmnr_span.record_exception(e) if retry_count <= max_retries: logger.warning( @@ -447,6 +504,7 @@ def _process_one_mp( f"[child] Failed to cleanup workspace for {instance.id}: " f"{str(cleanup_error)[:50]}" ) + lmnr_span.end() # This should never be reached, but added for type safety error_output = self._create_error_output( diff --git a/benchmarks/utils/laminar.py b/benchmarks/utils/laminar.py new file mode 100644 index 000000000..24b91ed23 --- /dev/null +++ b/benchmarks/utils/laminar.py @@ -0,0 +1,298 @@ +from __future__ import annotations + +import json +import os +from pathlib import Path +from typing import Any +from uuid import UUID + +from lmnr import Laminar, LaminarClient +from pydantic import BaseModel + +from openhands.sdk import get_logger + + +# Environment variables to forward to the workspace +LMNR_ENV_VARS = [ + "LMNR_PROJECT_API_KEY", + "LMNR_SPAN_CONTEXT", +] + +logger = get_logger(__name__) + + +class LaminarEvalMetadata(BaseModel): + eval_id: UUID | None = None + datapoint_id: UUID | None = None + + +class LaminarService: + """Singleton helper around Laminar client usage.""" + + _object: LaminarService | None = None + + def __init__(self) -> None: + self._client: LaminarClient | None = None + self._laminar_initialized = False + + @classmethod + def get(cls) -> "LaminarService": + if cls._object is None: + cls._object = cls() + return cls._object + + def _is_enabled(self) -> bool: + return bool(os.environ.get("LMNR_PROJECT_API_KEY")) + + def initialize(self) -> bool: + """ + Initialize the Laminar SDK once per process. + Returns True if initialization succeeded (or was already done), False otherwise. + """ + + if self._laminar_initialized: + return True + + if not self._is_enabled(): + return False + + try: + Laminar.initialize() + except Exception as exc: # pragma: no cover - defensive logging + logger.debug("Failed to initialize Laminar SDK: %s", exc) + return False + + self._laminar_initialized = True + return True + + def _get_client(self) -> LaminarClient | None: + if not self._laminar_initialized or not self._is_enabled(): + return None + + if self._client is None: + try: + self._client = LaminarClient() + except Exception as exc: + logger.warning("Failed to create LaminarClient: %s", exc) + return None + + return self._client + + def create_evaluation( + self, name: str, group_name: str, metadata: dict[str, Any] | None = None + ): + client = self._get_client() + if client is None: + return None + + try: + eval_id = client.evals.create_evaluation( + name=name, + group_name=group_name, + metadata=metadata, + ) + return eval_id + except Exception as exc: # pragma: no cover - defensive logging + logger.debug( + "Laminar evaluation %s (%s): %s", + name, + group_name, + exc, + ) + + def create_evaluation_datapoint( + self, + eval_id: UUID | None, + data: Any, + metadata: dict[str, Any], + index: int, + ) -> tuple[UUID | None, str | None]: + """ + Create a Laminar datapoint. + Creates a new span for the evaluation and returns the span context. + """ + + if eval_id is None: + return None, None + + client = self._get_client() + if client is None: + return None, None + + try: + eval_span = Laminar.start_active_span( + "Evaluation", + span_type="EVALUATION", # type: ignore + ) + lmnr_span_ctx = Laminar.serialize_span_context(eval_span) + eval_span.end() + + return client.evals.create_datapoint( + eval_id=eval_id, + data=data, + target=1, + metadata=metadata, + index=index, + trace_id=UUID(int=eval_span.get_span_context().trace_id), + ), lmnr_span_ctx + except Exception as exc: + logger.debug( + "Failed to create Laminar datapoint for eval %s: %s", + eval_id, + exc, + ) + return None, None + + def _update_evaluation_datapoint( + self, + datapoint_id: UUID | None, + eval_id: UUID | None, + executor_output: Any, + scores: dict[str, Any], + ) -> None: + """ + Update a Laminar datapoint. + """ + + client = self._get_client() + if client is None or not eval_id or not datapoint_id: + return + + try: + client.evals.update_datapoint( + eval_id=eval_id, + datapoint_id=datapoint_id, + executor_output=executor_output, + scores=scores, + ) + except Exception as exc: # pragma: no cover - defensive logging + logger.debug( + "Failed to update Laminar datapoint %s for eval %s: %s", + datapoint_id, + eval_id, + exc, + ) + + def _update_evaluation_scores_from_output_file( + self, + output_file: str, + resolved_ids: set[str], + ) -> None: + """ + Update Laminar datapoints with scores based on an output.jsonl file. + + Reads the output file, extracts Laminar metadata (datapoint_id, eval_id) + from each entry, and updates each datapoint with {"Score": 1} if the + instance is in given resolved_ids, or {"Score": 0} otherwise. + + Args: + output_file: Path to the output.jsonl file containing evaluation results + resolved_ids: Set of instance IDs that are considered resolved/passed + """ + if not self.initialize(): + logger.debug("Laminar not enabled, skipping score updates") + return + + try: + with open(output_file, "r") as f: + for line_num, line in enumerate(f, 1): + line = line.strip() + if not line: + continue + + try: + eval_output = json.loads(line) + instance_id = eval_output.get("instance_id") + metadata_dict = eval_output.get("metadata", {}) + + if not metadata_dict: + logger.debug( + f"Line {line_num}: No metadata for {instance_id}, skipping" + ) + continue + + # Extract Laminar metadata + lmnr_dict = metadata_dict.get("lmnr", {}) + if not lmnr_dict: + logger.debug( + f"Line {line_num}: No lmnr metadata for {instance_id}, skipping" + ) + continue + + # Convert to LaminarEvalMetadata instance + try: + lmnr_metadata = LaminarEvalMetadata.model_validate( + lmnr_dict + ) + except Exception as e: + logger.debug( + f"Line {line_num}: Failed to parse Laminar metadata for {instance_id}: {e}" + ) + continue + + if not lmnr_metadata.datapoint_id or not lmnr_metadata.eval_id: + logger.debug( + f"Line {line_num}: Missing Laminar IDs for {instance_id}, skipping" + ) + continue + + # Determine score: 1 if resolved, 0 otherwise + score = 1 if instance_id in resolved_ids else 0 + + # Update the Laminar datapoint with the score + self._update_evaluation_datapoint( + datapoint_id=lmnr_metadata.datapoint_id, + eval_id=lmnr_metadata.eval_id, + executor_output=eval_output, + scores={"Score": score}, + ) + + logger.debug(f"Updated {instance_id}: Score={score}") + except json.JSONDecodeError as e: + logger.debug(f"Line {line_num}: Invalid JSON - {e}") + except Exception as e: + logger.debug(f"Line {line_num}: Error processing - {e}") + + except Exception as e: + logger.debug(f"Failed to read output file: {e}") + return + + logger.debug("Laminar score updates complete") + + def update_evaluation_scores( + self, output_file: str, report_file: str | None = None + ) -> None: + """ + Update Laminar evaluation datapoints with evaluation scores. + + Reads the report file to determine which instances resolved, + then updates Laminar datapoints with scores. + + Args: + output_file: Path to the output.jsonl file containing evaluation results + report_file: Path to the report file with resolved_ids. If None, defaults + to output.report.json in the same directory as output_file. + """ + if report_file is None: + report_path = Path(output_file).parent / "output.report.json" + else: + report_path = Path(report_file) + + if not report_path.exists(): + logger.debug( + f"Report file not found: {report_path}. Skipping Laminar score updates." + ) + return + + # Read resolved instance IDs from report + try: + with open(report_path, "r") as f: + report_data = json.load(f) + resolved_ids = set(report_data.get("resolved_ids", [])) + logger.debug(f"Found {len(resolved_ids)} resolved instances in report") + except Exception as e: + logger.warning(f"[Laminar] Failed to read report file: {e}") + return + + # Update Laminar datapoints with scores + self._update_evaluation_scores_from_output_file(output_file, resolved_ids) diff --git a/benchmarks/utils/models.py b/benchmarks/utils/models.py index 2cabbc5cb..51a5266ca 100644 --- a/benchmarks/utils/models.py +++ b/benchmarks/utils/models.py @@ -2,6 +2,7 @@ from pydantic import BaseModel, Field +from benchmarks.utils.laminar import LaminarEvalMetadata from openhands.sdk import LLM, Event, get_logger from openhands.sdk.critic import CriticBase from openhands.sdk.llm import Metrics @@ -50,6 +51,10 @@ class EvalMetadata(BaseModel): default="docker", description="Type of workspace to use, e.g., 'docker' or 'remote'", ) + lmnr: LaminarEvalMetadata | None = Field( + default=None, + description="Laminar evaluation metadata", + ) EvalInstanceID = str diff --git a/pyproject.toml b/pyproject.toml index e333b1355..c5e706332 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,7 +39,7 @@ dependencies = [ "python-json-logger>=3.3.0", "tenacity>=9.1.2", "websockets>=12", - "lmnr>=0.7.20", + "lmnr>=0.7.24", "multi-swe-bench>=1.1.1", ] diff --git a/tests/test_iterative_resume.py b/tests/test_iterative_resume.py index 0bad21667..9333e7403 100644 --- a/tests/test_iterative_resume.py +++ b/tests/test_iterative_resume.py @@ -25,11 +25,14 @@ def prepare_instances(self) -> List[EvalInstance]: """Return pre-configured instances.""" return object.__getattribute__(self, "_test_instances") - def prepare_workspace(self, instance: EvalInstance) -> RemoteWorkspace: + def prepare_workspace( + self, instance: EvalInstance, forward_env: list[str] | None = None + ) -> RemoteWorkspace: """Return a mock workspace.""" mock_workspace = Mock(spec=RemoteWorkspace) mock_workspace.__enter__ = Mock(return_value=mock_workspace) mock_workspace.__exit__ = Mock(return_value=None) + mock_workspace.forward_env = forward_env or [] return mock_workspace def evaluate_instance( diff --git a/tests/test_keyboard_interrupt.py b/tests/test_keyboard_interrupt.py index 3324a3a7f..740ac0791 100644 --- a/tests/test_keyboard_interrupt.py +++ b/tests/test_keyboard_interrupt.py @@ -36,10 +36,11 @@ def prepare_instances(self) -> List[EvalInstance]: for i in range(10) ] - def prepare_workspace(self, instance: EvalInstance) -> RemoteWorkspace: + def prepare_workspace(self, instance: EvalInstance, forward_env: list[str] | None = None) -> RemoteWorkspace: mock_workspace = Mock(spec=RemoteWorkspace) mock_workspace.__enter__ = Mock(return_value=mock_workspace) mock_workspace.__exit__ = Mock(return_value=None) + mock_workspace.forward_env = forward_env or [] return mock_workspace def evaluate_instance( diff --git a/tests/test_workspace_cleanup.py b/tests/test_workspace_cleanup.py index f87d0929c..6721ade73 100644 --- a/tests/test_workspace_cleanup.py +++ b/tests/test_workspace_cleanup.py @@ -52,7 +52,10 @@ class TestEvaluation(Evaluation): def prepare_instances(self) -> List[EvalInstance]: return [test_instance] - def prepare_workspace(self, instance: EvalInstance): + def prepare_workspace( + self, instance: EvalInstance, forward_env: list[str] | None = None + ): + mock_workspace.forward_env = forward_env or [] return mock_workspace def evaluate_instance(self, instance, workspace): @@ -61,7 +64,7 @@ def evaluate_instance(self, instance, workspace): evaluator = TestEvaluation(metadata=metadata, num_workers=1) # Call the method directly - result_instance, result_output = evaluator._process_one_mp(test_instance) + result_instance, result_output = evaluator._process_one_mp(test_instance, None) # Verify the workspace cleanup was called mock_workspace.__exit__.assert_called_once_with(None, None, None) @@ -102,7 +105,9 @@ class TestEvaluation(Evaluation): def prepare_instances(self) -> List[EvalInstance]: return [test_instance] - def prepare_workspace(self, instance: EvalInstance): + def prepare_workspace( + self, instance: EvalInstance, forward_env: list[str] | None = None + ): return mock_workspace def evaluate_instance(self, instance, workspace): @@ -111,7 +116,7 @@ def evaluate_instance(self, instance, workspace): evaluator = TestEvaluation(metadata=metadata, num_workers=1) # Call the method directly - result_instance, result_output = evaluator._process_one_mp(test_instance) + result_instance, result_output = evaluator._process_one_mp(test_instance, None) # Verify the workspace cleanup was called even on failure mock_workspace.__exit__.assert_called_once_with(None, None, None) @@ -163,7 +168,9 @@ class TestEvaluation(Evaluation): def prepare_instances(self) -> List[EvalInstance]: return [test_instance] - def prepare_workspace(self, instance: EvalInstance): + def prepare_workspace( + self, instance: EvalInstance, forward_env: list[str] | None = None + ): return mock_workspace def evaluate_instance(self, instance, workspace): @@ -172,7 +179,7 @@ def evaluate_instance(self, instance, workspace): evaluator = TestEvaluation(metadata=metadata, num_workers=1) # Call the method directly - should not raise an exception - result_instance, result_output = evaluator._process_one_mp(test_instance) + result_instance, result_output = evaluator._process_one_mp(test_instance, None) # Verify the workspace cleanup was attempted mock_workspace.__exit__.assert_called_once_with(None, None, None) @@ -221,7 +228,9 @@ class TestEvaluation(Evaluation): def prepare_instances(self) -> List[EvalInstance]: return [test_instance] - def prepare_workspace(self, instance: EvalInstance): + def prepare_workspace( + self, instance: EvalInstance, forward_env: list[str] | None = None + ): return create_mock_workspace() def evaluate_instance(self, instance, workspace): @@ -241,7 +250,7 @@ def evaluate_instance(self, instance, workspace): evaluator = TestEvaluation(metadata=metadata, num_workers=1) # Call the method directly - result_instance, result_output = evaluator._process_one_mp(test_instance) + result_instance, result_output = evaluator._process_one_mp(test_instance, None) # Verify cleanup was called for all attempts (3 total: initial + 2 retries) assert len(workspaces_created) == 3, "Should create workspace for each attempt" diff --git a/uv.lock b/uv.lock index d61a720f4..ecb85920b 100644 --- a/uv.lock +++ b/uv.lock @@ -2269,7 +2269,7 @@ wheels = [ [[package]] name = "openhands-agent-server" -version = "1.6.0" +version = "1.7.2" source = { editable = "vendor/software-agent-sdk/openhands-agent-server" } dependencies = [ { name = "aiosqlite" }, @@ -2367,7 +2367,7 @@ requires-dist = [ { name = "huggingface-hub" }, { name = "jinja2" }, { name = "litellm", specifier = ">=1.77.7.dev9" }, - { name = "lmnr", specifier = ">=0.7.20" }, + { name = "lmnr", specifier = ">=0.7.24" }, { name = "modal", specifier = ">=1.1.4" }, { name = "multi-swe-bench", specifier = ">=1.1.1" }, { name = "openhands-agent-server", editable = "vendor/software-agent-sdk/openhands-agent-server" }, @@ -2383,8 +2383,8 @@ requires-dist = [ { name = "python-frontmatter", specifier = ">=1.1.0" }, { name = "python-json-logger", specifier = ">=3.3.0" }, { name = "requests" }, - { name = "tenacity", specifier = ">=9.1.2" }, { name = "swebench", specifier = "==4.1.0" }, + { name = "tenacity", specifier = ">=9.1.2" }, { name = "toml" }, { name = "tqdm" }, { name = "unidiff", specifier = ">=0.7.5,<0.8.0" }, @@ -2405,7 +2405,7 @@ dev = [ [[package]] name = "openhands-sdk" -version = "1.6.0" +version = "1.7.2" source = { editable = "vendor/software-agent-sdk/openhands-sdk" } dependencies = [ { name = "deprecation" }, @@ -2443,7 +2443,7 @@ provides-extras = ["boto3"] [[package]] name = "openhands-tools" -version = "1.6.0" +version = "1.7.2" source = { editable = "vendor/software-agent-sdk/openhands-tools" } dependencies = [ { name = "bashlex" }, @@ -2472,15 +2472,17 @@ requires-dist = [ [[package]] name = "openhands-workspace" -version = "1.6.0" +version = "1.7.2" source = { editable = "vendor/software-agent-sdk/openhands-workspace" } dependencies = [ + { name = "openhands-agent-server" }, { name = "openhands-sdk" }, { name = "pydantic" }, ] [package.metadata] requires-dist = [ + { name = "openhands-agent-server", editable = "vendor/software-agent-sdk/openhands-agent-server" }, { name = "openhands-sdk", editable = "vendor/software-agent-sdk/openhands-sdk" }, { name = "pydantic", specifier = ">=2.11.7" }, ] @@ -6548,6 +6550,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ce/fd/901cfa59aaa5b30a99e16876f11abe38b59a1a2c51ffb3d7142bb6089069/starlette-0.47.3-py3-none-any.whl", hash = "sha256:89c0778ca62a76b826101e7c709e70680a1699ca7da6b44d38eb0a7e61fe4b51", size = 72991, upload-time = "2025-08-24T13:36:40.887Z" }, ] +[[package]] +name = "strenum" +version = "0.4.15" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/85/ad/430fb60d90e1d112a62ff57bdd1f286ec73a2a0331272febfddd21f330e1/StrEnum-0.4.15.tar.gz", hash = "sha256:878fb5ab705442070e4dd1929bb5e2249511c0bcf2b0eeacf3bcd80875c82eff", size = 23384, upload-time = "2023-06-29T22:02:58.399Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/81/69/297302c5f5f59c862faa31e6cb9a4cd74721cd1e052b38e464c5b402df8b/StrEnum-0.4.15-py3-none-any.whl", hash = "sha256:a30cda4af7cc6b5bf52c8055bc4bf4b2b6b14a93b574626da33df53cf7740659", size = 8851, upload-time = "2023-06-29T22:02:56.947Z" }, +] + [[package]] name = "swe-rex" version = "1.4.0" @@ -6567,15 +6578,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/98/0d/d06ab2aa78138055c297490762cd7b4d8ac58a544783f874c869cdb7b534/swe_rex-1.4.0-py3-none-any.whl", hash = "sha256:61261ad03eb23b717b5901cd5d229f24f6e1be2e120aad5c2e5ea3384a1d15ad", size = 47756, upload-time = "2025-08-14T01:19:18.93Z" }, ] -[[package]] -name = "strenum" -version = "0.4.15" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/85/ad/430fb60d90e1d112a62ff57bdd1f286ec73a2a0331272febfddd21f330e1/StrEnum-0.4.15.tar.gz", hash = "sha256:878fb5ab705442070e4dd1929bb5e2249511c0bcf2b0eeacf3bcd80875c82eff", size = 23384, upload-time = "2023-06-29T22:02:58.399Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/81/69/297302c5f5f59c862faa31e6cb9a4cd74721cd1e052b38e464c5b402df8b/StrEnum-0.4.15-py3-none-any.whl", hash = "sha256:a30cda4af7cc6b5bf52c8055bc4bf4b2b6b14a93b574626da33df53cf7740659", size = 8851, upload-time = "2023-06-29T22:02:56.947Z" }, -] - [[package]] name = "swebench" version = "4.1.0" diff --git a/vendor/software-agent-sdk b/vendor/software-agent-sdk index e8f73e71b..73769d5e9 160000 --- a/vendor/software-agent-sdk +++ b/vendor/software-agent-sdk @@ -1 +1 @@ -Subproject commit e8f73e71b979984242245efeb701ee4b18480f9e +Subproject commit 73769d5e9d5f75333054d098a2e86c02555fa8d6