OpenHands · simonrosenberg · Jan 7, 2026 · Dec 18, 2025 · Dec 18, 2025 · Dec 18, 2025
diff --git a/benchmarks/commit0/eval_infer.py b/benchmarks/commit0/eval_infer.py
@@ -15,6 +15,7 @@
 import sys
 from pathlib import Path
 
+from benchmarks.utils.laminar import LaminarService
 from benchmarks.utils.report_costs import generate_cost_report
 
 
@@ -199,6 +200,9 @@ def main() -> None:
         # Process results and generate report
         process_commit0_results(str(input_file), str(output_file), args.model_name)
 
+        # Update Laminar datapoints with evaluation scores
+        LaminarService.get().update_evaluation_scores(str(input_file), str(output_file))
+
         # Generate cost report as final step
         generate_cost_report(str(input_file))
 

diff --git a/benchmarks/commit0/run_infer.py b/benchmarks/commit0/run_infer.py
@@ -152,7 +152,9 @@ def prepare_instances(self) -> List[EvalInstance]:
         logger.info("Total instances to process: %d", len(instances))
         return instances
 
-    def prepare_workspace(self, instance: EvalInstance) -> RemoteWorkspace:
+    def prepare_workspace(
+        self, instance: EvalInstance, forward_env: list[str] | None = None
+    ) -> RemoteWorkspace:
         """
         Create workspace and set up the commit0 repository.
         """
@@ -167,6 +169,7 @@ def prepare_workspace(self, instance: EvalInstance) -> RemoteWorkspace:
                 base_image=base_docker_image,
                 working_dir="/workspace",
                 target=build_target,
+                forward_env=forward_env or [],
             )
             logger.info(
                 f"Building workspace from {base_docker_image}. This may take a while..."
@@ -201,6 +204,7 @@ def prepare_workspace(self, instance: EvalInstance) -> RemoteWorkspace:
                 runtime_api_key=runtime_api_key,
                 server_image=agent_server_image,
                 target_type="source" if "source" in build_target else "binary",
+                forward_env=forward_env or [],
             )
         else:
             raise ValueError(

diff --git a/benchmarks/gaia/eval_infer.py b/benchmarks/gaia/eval_infer.py
@@ -18,6 +18,7 @@
 import sys
 from pathlib import Path
 
+from benchmarks.utils.laminar import LaminarService
 from benchmarks.utils.report_costs import generate_cost_report
 from openhands.sdk import get_logger
 
@@ -226,6 +227,9 @@ def main() -> None:
             args.model_name,
         )
 
+        # Update Laminar datapoints with evaluation scores
+        LaminarService.get().update_evaluation_scores(str(input_file), str(output_file))
+
         # Generate cost report as final step
         generate_cost_report(str(input_file))
 

diff --git a/benchmarks/gaia/run_infer.py b/benchmarks/gaia/run_infer.py
@@ -116,7 +116,9 @@ def prepare_instances(self) -> List[EvalInstance]:
         logger.info(f"Total instances to process: {len(instances)}")
         return instances
 
-    def prepare_workspace(self, instance: EvalInstance) -> RemoteWorkspace:
+    def prepare_workspace(
+        self, instance: EvalInstance, forward_env: list[str] | None = None
+    ) -> RemoteWorkspace:
         """Create workspace and copy necessary files."""
         logger.info(f"Preparing workspace for instance {instance.id}")
 
@@ -125,6 +127,7 @@ def prepare_workspace(self, instance: EvalInstance) -> RemoteWorkspace:
             workspace = DockerDevWorkspace(
                 base_image="nikolaik/python-nodejs:python3.12-nodejs22",
                 working_dir="/workspace",
+                forward_env=forward_env or [],
             )
         elif self.metadata.workspace_type == "remote":
             # For workflow, use APIRemoteWorkspace with pre-built GAIA image
@@ -159,6 +162,7 @@ def prepare_workspace(self, instance: EvalInstance) -> RemoteWorkspace:
                 runtime_api_key=runtime_api_key,
                 server_image=agent_server_image,
                 target_type="binary",  # GAIA images use binary target
+                forward_env=forward_env or [],
             )
         else:
             raise ValueError(

diff --git a/benchmarks/multiswebench/eval_infer.py b/benchmarks/multiswebench/eval_infer.py
@@ -18,6 +18,7 @@
 from benchmarks.multiswebench.scripts.eval.update_multi_swe_bench_config import (
     update_multi_swe_config,
 )
+from benchmarks.utils.laminar import LaminarService
 from openhands.sdk import get_logger
 
 
@@ -143,6 +144,11 @@ def main():
         shutil.move(str(results_file), str(output_report_path))
         logger.info(f"Report moved to {output_report_path}")
 
+        # Update Laminar datapoints with evaluation scores
+        LaminarService.get().update_evaluation_scores(
+            str(args.input_file), str(output_report_path)
+        )
+
 
 if __name__ == "__main__":
     main()
diff --git a/benchmarks/multiswebench/run_infer.py b/benchmarks/multiswebench/run_infer.py
@@ -173,7 +173,9 @@ def prepare_instances(self) -> List[EvalInstance]:
         return instances
 
     # ---- Hook: prepare a workspace per instance ----------------------------------
-    def prepare_workspace(self, instance: EvalInstance) -> RemoteWorkspace:
+    def prepare_workspace(
+        self, instance: EvalInstance, forward_env: list[str] | None = None
+    ) -> RemoteWorkspace:
         """
         Use DockerWorkspace by default.
         """
@@ -228,6 +230,7 @@ def prepare_workspace(self, instance: EvalInstance) -> RemoteWorkspace:
             workspace = DockerWorkspace(
                 server_image=agent_server_image,
                 working_dir="/workspace",
+                forward_env=forward_env or [],
             )
         elif self.metadata.workspace_type == "remote":
             runtime_api_key = os.getenv("RUNTIME_API_KEY")
@@ -255,6 +258,7 @@ def prepare_workspace(self, instance: EvalInstance) -> RemoteWorkspace:
                 runtime_api_key=runtime_api_key,
                 server_image=agent_server_image,
                 target_type="source" if "source" in build_target else "binary",
+                forward_env=forward_env or [],
             )
         else:
             raise ValueError(

diff --git a/benchmarks/openagentsafety/run_infer.py b/benchmarks/openagentsafety/run_infer.py
@@ -359,14 +359,17 @@ def prepare_instances(self) -> List[EvalInstance]:
         logger.info("Total instances to process: %d", len(instances))
         return instances
 
-    def prepare_workspace(self, instance: EvalInstance) -> RemoteWorkspace:
+    def prepare_workspace(
+        self, instance: EvalInstance, forward_env: list[str] | None = None
+    ) -> RemoteWorkspace:
         """Create a fresh Docker workspace for this instance."""
         server_image = build_workspace_image()
 
         workspace = DockerWorkspace(
             server_image=server_image,
             platform="linux/amd64",
             extra_ports=True,
+            forward_env=forward_env or [],
         )
 
         # Setup host mapping for The Agent Company services

diff --git a/benchmarks/swebench/eval_infer.py b/benchmarks/swebench/eval_infer.py
@@ -16,6 +16,7 @@
 import sys
 from pathlib import Path
 
+from benchmarks.utils.laminar import LaminarService
 from benchmarks.utils.patch_utils import remove_files_from_patch
 from benchmarks.utils.report_costs import generate_cost_report
 from openhands.sdk import get_logger
@@ -265,6 +266,11 @@ def main() -> None:
             shutil.move(str(report_path), str(dest_report_path))
             logger.info(f"Moved report file to: {dest_report_path}")
 
+            # Update Laminar datapoints with evaluation scores
+            LaminarService.get().update_evaluation_scores(
+                str(input_file), str(dest_report_path)
+            )
+
         # Generate cost report as final step
         generate_cost_report(str(input_file))
 

diff --git a/benchmarks/swebench/run_infer.py b/benchmarks/swebench/run_infer.py
@@ -96,7 +96,9 @@ def prepare_instances(self) -> List[EvalInstance]:
         return instances
 
     # ---- Hook: prepare a workspace per instance ----------------------------------
-    def prepare_workspace(self, instance: EvalInstance) -> RemoteWorkspace:
+    def prepare_workspace(
+        self, instance: EvalInstance, forward_env: list[str] | None = None
+    ) -> RemoteWorkspace:
         """
         Use DockerWorkspace by default.
         """
@@ -148,6 +150,7 @@ def prepare_workspace(self, instance: EvalInstance) -> RemoteWorkspace:
             workspace = DockerWorkspace(
                 server_image=agent_server_image,
                 working_dir="/workspace",
+                forward_env=forward_env or [],
             )
         elif self.metadata.workspace_type == "remote":
             runtime_api_key = os.getenv("RUNTIME_API_KEY")
@@ -175,6 +178,7 @@ def prepare_workspace(self, instance: EvalInstance) -> RemoteWorkspace:
                 runtime_api_key=runtime_api_key,
                 server_image=agent_server_image,
                 target_type="source" if "source" in build_target else "binary",
+                forward_env=forward_env or [],
             )
         else:
             raise ValueError(

diff --git a/benchmarks/swebenchmultimodal/run_infer.py b/benchmarks/swebenchmultimodal/run_infer.py
@@ -103,7 +103,9 @@ def prepare_instances(self) -> List[EvalInstance]:
         return instances
 
     # ---- Hook: prepare a workspace per instance ----------------------------------
-    def prepare_workspace(self, instance: EvalInstance) -> RemoteWorkspace:
+    def prepare_workspace(
+        self, instance: EvalInstance, forward_env: list[str] | None = None
+    ) -> RemoteWorkspace:
         """
         Use DockerWorkspace by default.
         """
@@ -148,6 +150,7 @@ def prepare_workspace(self, instance: EvalInstance) -> RemoteWorkspace:
             workspace = DockerWorkspace(
                 server_image=agent_server_image,
                 working_dir="/workspace",
+                forward_env=forward_env or [],
             )
         elif self.metadata.workspace_type == "remote":
             runtime_api_key = os.getenv("RUNTIME_API_KEY")
@@ -175,6 +178,7 @@ def prepare_workspace(self, instance: EvalInstance) -> RemoteWorkspace:
                 runtime_api_key=runtime_api_key,
                 server_image=agent_server_image,
                 target_type="source" if "source" in build_target else "binary",
+                forward_env=forward_env or [],
             )
         else:
             raise ValueError(

diff --git a/benchmarks/swtbench/eval_infer.py b/benchmarks/swtbench/eval_infer.py
@@ -17,6 +17,7 @@
 import sys
 from pathlib import Path
 
+from benchmarks.utils.laminar import LaminarService
 from benchmarks.utils.patch_utils import remove_files_from_patch
 from benchmarks.utils.report_costs import generate_cost_report
 from openhands.sdk import get_logger
@@ -380,6 +381,11 @@ def main() -> None:
             logger.info(f"Moved evaluation report to: {target_file}")
             update_report_with_submitted_instances(target_file, output_file)
 
+            # Update Laminar datapoints with evaluation scores
+            LaminarService.get().update_evaluation_scores(
+                str(input_file), str(target_file)
+            )
+
         # Generate cost report as final step
         generate_cost_report(str(input_file))
 

diff --git a/benchmarks/swtbench/run_infer.py b/benchmarks/swtbench/run_infer.py
@@ -135,7 +135,9 @@ def prepare_instances(self) -> List[EvalInstance]:
         return instances
 
     # ---- Hook: prepare a workspace per instance ----------------------------------
-    def prepare_workspace(self, instance: EvalInstance) -> RemoteWorkspace:
+    def prepare_workspace(
+        self, instance: EvalInstance, forward_env: list[str] | None = None
+    ) -> RemoteWorkspace:
         """
         Create workspace based on workspace_type (docker or remote).
         """
@@ -168,11 +170,13 @@ def prepare_workspace(self, instance: EvalInstance) -> RemoteWorkspace:
                     base_image=official_docker_image,
                     working_dir="/workspace",
                     target=build_target,
+                    forward_env=forward_env or [],
                 )
             else:
                 workspace = DockerWorkspace(
                     server_image=agent_server_image,
                     working_dir="/workspace",
+                    forward_env=forward_env or [],
                 )
         elif self.metadata.workspace_type == "remote":
             runtime_api_key = os.getenv("RUNTIME_API_KEY")
@@ -200,6 +204,7 @@ def prepare_workspace(self, instance: EvalInstance) -> RemoteWorkspace:
                 runtime_api_key=runtime_api_key,
                 server_image=agent_server_image,
                 target_type="source" if "source" in build_target else "binary",
+                forward_env=forward_env or [],
             )
         else:
             raise ValueError(