diff --git a/.github/workflows/run-examples.yml b/.github/workflows/run-examples.yml
index 284c9ed2f0..783bc3853c 100644
--- a/.github/workflows/run-examples.yml
+++ b/.github/workflows/run-examples.yml
@@ -25,6 +25,7 @@ jobs:
     timeout-minutes: 60
     steps:
       - name: Wait for agent server to finish build
+        if: github.event_name == 'pull_request'
         uses: lewagon/wait-on-check-action@v1.4.1
         with:
           ref: ${{ github.event.pull_request.head.ref }}
@@ -64,261 +65,105 @@ jobs:
           REPO_NAME: ${{ github.event.repository.name }}
           GITHUB_SHA: ${{ github.event.pull_request.head.sha }}
         run: |
-          # List of examples to test
-          # Excluded examples:
-          # - 01_hello_world.py: requires LiteLLM proxy URL (OPENAI_BASE_URL) not set in CI
-          # - 04_confirmation_mode_example.py: requires user input
-          # - 06_interactive_terminal_w_reasoning.py: interactive terminal
-          # - 08_mcp_with_oauth.py: requires OAuth setup
-          # - 15_browser_use.py: requires browser setup
-          # - 16_llm_security_analyzer.py: requires user input
-          # - 04_convo_with_api_sandboxed_server.py: requires sandbox API keys
-          # - 04_vscode_with_docker_sandboxed_server.py: requires VSCode setup
-          EXAMPLES=(
-            "examples/01_standalone_sdk/02_custom_tools.py"
-            "examples/01_standalone_sdk/03_activate_skill.py"
-            "examples/01_standalone_sdk/05_use_llm_registry.py"
-            "examples/01_standalone_sdk/07_mcp_integration.py"
-            "examples/01_standalone_sdk/09_pause_example.py"
-            "examples/01_standalone_sdk/10_persistence.py"
-            "examples/01_standalone_sdk/11_async.py"
-            "examples/01_standalone_sdk/12_custom_secrets.py"
-            "examples/01_standalone_sdk/13_get_llm_metrics.py"
-            "examples/01_standalone_sdk/14_context_condenser.py"
-            "examples/01_standalone_sdk/17_image_input.py"
-            "examples/01_standalone_sdk/18_send_message_while_processing.py"
-            "examples/01_standalone_sdk/19_llm_routing.py"
-            "examples/01_standalone_sdk/20_stuck_detector.py"
-            "examples/01_standalone_sdk/21_generate_extraneous_conversation_costs.py"
-            "examples/01_standalone_sdk/22_anthropic_thinking.py"
-            "examples/01_standalone_sdk/23_responses_reasoning.py"
-            "examples/01_standalone_sdk/24_planning_agent_workflow.py"
-            "examples/01_standalone_sdk/25_agent_delegation.py"
-            "examples/01_standalone_sdk/26_custom_visualizer.py"
-            "examples/02_remote_agent_server/01_convo_with_local_agent_server.py"
-            "examples/02_remote_agent_server/02_convo_with_docker_sandboxed_server.py"
-            "examples/02_remote_agent_server/03_browser_use_with_docker_sandboxed_server.py"
-            "examples/02_remote_agent_server/04_convo_with_api_sandboxed_server.py"
-          )
+          RESULTS_DIR=".example-test-results"
+          REPORT_PATH="examples_report.md"
+          rm -rf "$RESULTS_DIR"
+          mkdir -p "$RESULTS_DIR"
-          # GitHub API setup (only for PR events)
-          if [ "${{ github.event_name }}" = "pull_request" ]; then
-            API_URL="https://api.github.com/repos/${REPO_OWNER}/${REPO_NAME}/issues/${PR_NUMBER}/comments"
-          fi
-
-          # Function to sanitize @OpenHands mentions using the SDK utility
-          sanitize_comment() {
-            local text="$1"
-            printf "%s" "$text" | uv run python -c "from openhands.sdk.utils.github import sanitize_openhands_mentions; import sys; print(sanitize_openhands_mentions(sys.stdin.read()), end='')"
-          }
-
-          # Function to update PR comment
           update_comment() {
-            # Skip if not a PR event
-            if [ "${{ github.event_name }}" != "pull_request" ]; then
+            if [ -z "$API_URL" ]; then
+              echo "Skipping PR comment update because API_URL is unset."
              return
            fi
-
+            local comment_body="$1"
+            local payload
             local response
-
-            # Sanitize @OpenHands mentions before posting
-            comment_body=$(sanitize_comment "$comment_body")
-
+
+            payload=$(jq -n --arg body "$comment_body" '{body: $body}')
+
             if [ -z "$COMMENT_ID" ]; then
-              # Create new comment
-              response=$(curl -s -X POST \
+              echo "Creating PR comment..."
+              if ! response=$(curl -sSf -X POST \
                 -H "Authorization: token ${GITHUB_TOKEN}" \
                 -H "Accept: application/vnd.github.v3+json" \
+                -H "Content-Type: application/json" \
                 "${API_URL}" \
-                -d "{\"body\":$(echo "$comment_body" | jq -Rs .)}")
-              COMMENT_ID=$(echo "$response" | jq -r '.id')
+                -d "$payload"); then
+                echo "::error::Failed to create PR comment."
+                exit 1
+              fi
+              COMMENT_ID=$(echo "$response" | jq -r '.id // ""')
+              if [ -z "$COMMENT_ID" ]; then
+                echo "::error::GitHub API response did not include a comment id: $response"
+                exit 1
+              fi
              echo "Created comment with ID: $COMMENT_ID"
            else
-              # Update existing comment
-              curl -s -X PATCH \
+              echo "Updating PR comment (ID: $COMMENT_ID)..."
+              if ! curl -sSf -X PATCH \
                 -H "Authorization: token ${GITHUB_TOKEN}" \
                 -H "Accept: application/vnd.github.v3+json" \
+                -H "Content-Type: application/json" \
                 "https://api.github.com/repos/${REPO_OWNER}/${REPO_NAME}/issues/comments/${COMMENT_ID}" \
-                -d "{\"body\":$(echo "$comment_body" | jq -Rs .)}" > /dev/null
-            fi
-          }
-
-          # Function to format cost with 2 decimal places
-          format_cost() {
-            local cost="$1"
-            if [ -z "$cost" ] || [ "$cost" = "N/A" ]; then
-              echo "N/A"
-            else
-              printf "\$%.2f" "$cost" 2>/dev/null || echo "N/A"
-            fi
-          }
-
-          # Function to generate markdown table
-          generate_table() {
-            local header="## 🔄 Running Examples with \`${LLM_MODEL}\`\n\n"
-            header+="_Last updated: $(date -u '+%Y-%m-%d %H:%M:%S UTC')_\n\n"
-            header+="| Example | Status | Duration | Cost |\n"
-            header+="|---------|--------|----------|------|\n"
-
-            local rows=""
-            for example in "${EXAMPLES[@]}"; do
-              # Strip examples/ prefix and show relative path from there
-              local short_name="${example#examples/}"
-              local status="${TEST_STATUS[$example]:-⏳ Pending}"
-              local duration="${TEST_DURATION[$example]:--}"
-              local cost="${TEST_COST[$example]:--}"
-              rows+="| ${short_name} | ${status} | ${duration} | ${cost} |\n"
-            done
-
-            local summary="\n---\n\n"
-            if [ $COMPLETED -eq ${#EXAMPLES[@]} ]; then
-              if [ $FAILED -eq 0 ]; then
-                summary+="### ✅ All tests passed!\n\n"
-              else
-                summary+="### ❌ Some tests failed\n\n"
-              fi
-              summary+="**Total:** ${#EXAMPLES[@]} | **Passed:** ${PASSED} | **Failed:** ${FAILED}"
-
-              # Calculate and display total cost if available
-              if [ -n "$TOTAL_COST" ]; then
-                summary+=" | **Total Cost:** $(format_cost $TOTAL_COST)"
+                -d "$payload" > /dev/null; then
+                echo "::error::Failed to update PR comment (ID: $COMMENT_ID)."
+                exit 1
              fi
-
-              summary+="\n\n[View full workflow run](${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID})"
-            else
-              summary+="**Progress:** ${COMPLETED}/${#EXAMPLES[@]} completed | **Passed:** ${PASSED} | **Failed:** ${FAILED}"
            fi
-
-            echo -e "${header}${rows}${summary}"
          }

-          # Initialize tracking variables
-          declare -A TEST_STATUS
-          declare -A TEST_DURATION
-          declare -A TEST_COST
-          FAILED=0
-          PASSED=0
-          COMPLETED=0
-          TOTAL_COST=0
-          FAILED_EXAMPLES=()
-          RESULTS_FILE="test-results.txt"
+          API_URL=""
          COMMENT_ID=""

-          # Clear results file
-          > "$RESULTS_FILE"
-
-          # Create initial comment with all tests pending (only for PR events)
          if [ "${{ github.event_name }}" = "pull_request" ]; then
-            echo "Creating initial PR comment..."
-            update_comment "$(generate_table)"
+            API_URL="https://api.github.com/repos/${REPO_OWNER}/${REPO_NAME}/issues/${PR_NUMBER}/comments"
+            initial_comment="## 🔄 Running Examples with \`${LLM_MODEL}\`"
+            initial_comment+=$'\n\n'
+            initial_comment+="_Run in progress..._"
+            initial_comment+=$'\n'
+            update_comment "$initial_comment"
          fi

-          echo "=========================================="
-          echo "Running ${#EXAMPLES[@]} examples with $LLM_MODEL"
-          echo "=========================================="
+          EXIT_CODE=0
+          uv run pytest tests/examples/test_examples.py \
+            --run-examples \
+            --examples-results-dir "$RESULTS_DIR" \
+            -n 4 || EXIT_CODE=$?

-          for example in "${EXAMPLES[@]}"; do
-            echo ""
-            echo "Running: $example"
-            echo "------------------------------------------"
-
-            START_TIME=$(date +%s)
-
-            # Create temp file to capture output
-            OUTPUT_FILE=$(mktemp)
-
-            # Run example with timeout (20 minutes per example)
-            # Capture output while still displaying it
-            # Use || true to prevent script exit on failure
-            (timeout 1200 uv run python "$example" 2>&1 || true) | tee "$OUTPUT_FILE"
-
-            # Check if command succeeded by looking at Python exit
-            if ! grep -q "EXAMPLE_COST:" "$OUTPUT_FILE"; then
-              EXIT_CODE=1
-            else
-              EXIT_CODE=0
-            fi
-
-            END_TIME=$(date +%s)
-            DURATION=$((END_TIME - START_TIME))
-            DURATION_STR="${DURATION}s"
-
-            # Extract cost from output
-            COST=$(grep "EXAMPLE_COST:" "$OUTPUT_FILE" | awk '{print $2}' | tail -1 || echo "0.00")
-            if [ -z "$COST" ]; then
-              COST="0.00"
-            fi
-
-            # Accumulate total cost
-            TOTAL_COST=$(echo "$TOTAL_COST + $COST" | bc -l 2>/dev/null || echo "$TOTAL_COST")
-
-            if [ "$EXIT_CODE" -eq 0 ]; then
-              echo "✓ PASSED: $example (${DURATION_STR}, cost: \$${COST})"
-              PASSED=$((PASSED + 1))
-              COMPLETED=$((COMPLETED + 1))
-              TEST_STATUS[$example]="✅ PASS"
-              TEST_DURATION[$example]="${DURATION_STR}"
-              TEST_COST[$example]="$(format_cost $COST)"
-              echo "PASS|$example|${DURATION}|${COST}" >> "$RESULTS_FILE"
-            else
-              echo "✗ FAILED: $example (exit code: $EXIT_CODE, ${DURATION_STR}, cost: \$${COST})"
-              FAILED=$((FAILED + 1))
-              COMPLETED=$((COMPLETED + 1))
-              FAILED_EXAMPLES+=("$example")
-              TEST_STATUS[$example]="❌ FAIL (exit: ${EXIT_CODE})"
-              TEST_DURATION[$example]="${DURATION_STR}"
-              TEST_COST[$example]="$(format_cost $COST)"
-              echo "FAIL|$example|$EXIT_CODE|${DURATION}|${COST}" >> "$RESULTS_FILE"
-            fi
-
-            # Clean up temp file
-            rm -f "$OUTPUT_FILE"
-
-            # Update PR comment after each test (with error handling)
-            echo "Updating PR comment..."
-            update_comment "$(generate_table)" || echo "Warning: Failed to update PR comment"
-          done
+          TIMESTAMP="$(date -u '+%Y-%m-%d %H:%M:%S UTC')"
+          WORKFLOW_URL="${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}"

-          echo ""
-          echo "=========================================="
-          echo "Test Results Summary"
-          echo "=========================================="
-          echo "Total: ${#EXAMPLES[@]}"
-          echo "Passed: $PASSED"
-          echo "Failed: $FAILED"
-          echo "Total Cost: $(format_cost $TOTAL_COST)"
+          uv run python scripts/render_examples_report.py \
+            --results-dir "$RESULTS_DIR" \
+            --model "$LLM_MODEL" \
+            --workflow-url "$WORKFLOW_URL" \
+            --timestamp "$TIMESTAMP" \
+            --output "$REPORT_PATH"

-          # Generate final report and save to file
-          FINAL_REPORT=$(generate_table)
-          echo "$FINAL_REPORT" > examples_report.md
-          echo "Final report saved to examples_report.md"
+          COMMENT_BODY="$(cat "$REPORT_PATH")"
+          echo "$COMMENT_BODY"

-          if [ $FAILED -gt 0 ]; then
-            echo ""
-            echo "Failed examples:"
-            for failed_example in "${FAILED_EXAMPLES[@]}"; do
-              echo " - $failed_example"
-            done
-            exit 1
+          if [ "${{ github.event_name }}" = "pull_request" ]; then
+            echo "Publishing PR comment..."
+            update_comment "$COMMENT_BODY"
          fi

-          echo ""
-          echo "All examples passed! ✓"
-
+          if [ $EXIT_CODE -ne 0 ]; then
+            exit $EXIT_CODE
+          fi

      - name: Read examples report for issue comment
        if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch'
        id: read_report
        shell: bash
        run: |
          if [ -f examples_report.md ]; then
-            # Sanitize @OpenHands mentions before posting
-            REPORT_CONTENT=$(uv run python -c "from openhands.sdk.utils.github import sanitize_openhands_mentions; import sys; print(sanitize_openhands_mentions(sys.stdin.read()), end='')" < examples_report.md)
-            echo "report<<EOF" >> $GITHUB_OUTPUT
-            echo "$REPORT_CONTENT" >> $GITHUB_OUTPUT
-            echo "EOF" >> $GITHUB_OUTPUT
+            REPORT_CONTENT=$(cat examples_report.md)
+            echo "report<<EOF" >> "$GITHUB_OUTPUT"
+            echo "$REPORT_CONTENT" >> "$GITHUB_OUTPUT"
+            echo "EOF" >> "$GITHUB_OUTPUT"
          else
-            echo "report=Report file not found" >> $GITHUB_OUTPUT
+            echo "report=Report file not found" >> "$GITHUB_OUTPUT"
          fi

      - name: Comment with results on tracker issue
diff --git a/examples/01_standalone_sdk/25_agent_delegation.py b/examples/01_standalone_sdk/25_agent_delegation.py
index 8d3221434e..3ab0abb49b 100644
--- a/examples/01_standalone_sdk/25_agent_delegation.py
+++ b/examples/01_standalone_sdk/25_agent_delegation.py
@@ -71,3 +71,7 @@
     "Ask the lodging sub-agent what it thinks about Covent Garden."
 )
 conversation.run()
+
+# Report cost
+cost = conversation.conversation_stats.get_combined_metrics().accumulated_cost
+print(f"EXAMPLE_COST: {cost}")
diff --git a/examples/01_standalone_sdk/26_custom_visualizer.py b/examples/01_standalone_sdk/26_custom_visualizer.py
index 81feca85b1..9908fb8936 100644
--- a/examples/01_standalone_sdk/26_custom_visualizer.py
+++ b/examples/01_standalone_sdk/26_custom_visualizer.py
@@ -65,4 +65,4 @@ def on_event(self, event: Event) -> None:
 
 # Report cost
 cost = llm.metrics.accumulated_cost
-print(f"EXAMPLE_COST: ${cost:.4f}")
+print(f"EXAMPLE_COST: {cost:.4f}")
diff --git a/pyproject.toml b/pyproject.toml
index db1508c773..2f1c150489 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -20,6 +20,7 @@ dev = [
     "pycodestyle>=2.12.0",
     "pytest-asyncio>=1.1.0",
     "pytest-forked>=1.6.0",
+    "pytest-xdist>=3.6.0",
     "tabulate>=0.9.0",
     "pyinstaller>=6.16.0",
     "streamlit>=1.49.1",
diff --git a/scripts/render_examples_report.py b/scripts/render_examples_report.py
new file mode 100644
index 0000000000..346df600ee
--- /dev/null
+++ b/scripts/render_examples_report.py
@@ -0,0 +1,247 @@
+from __future__ import annotations
+
+import argparse
+import json
+from collections.abc import Iterable
+from dataclasses import dataclass
+from datetime import UTC, datetime
+from decimal import ROUND_HALF_UP, Decimal, InvalidOperation
+from pathlib import Path
+
+from openhands.sdk.utils.github import sanitize_openhands_mentions
+
+
+@dataclass(slots=True)
+class ExampleResult:
+    name: str
+    status: str
+    duration_seconds: float | None
+    cost: str | None
+    failure_reason: str | None
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(
+        description="Render markdown summary for example runs."
+    )
+    parser.add_argument(
+        "--results-dir",
+        type=Path,
+        required=True,
+        help="Directory containing per-example JSON results.",
+    )
+    parser.add_argument(
+        "--model",
+        type=str,
+        default="Unknown model",
+        help="LLM model name used for the run.",
+    )
+    parser.add_argument(
+        "--workflow-url",
+        type=str,
+        default="",
+        help="URL to the workflow run details page.",
+    )
+    parser.add_argument(
+        "--timestamp",
+        type=str,
+        default="",
+        help="UTC timestamp string to include in the report header.",
+    )
+    parser.add_argument(
+        "--output",
+        type=Path,
+        default=None,
+        help="Optional path to write the markdown report to.",
+    )
+    return parser.parse_args()
+
+
+def iter_result_files(results_dir: Path) -> Iterable[Path]:
+    yield from sorted(results_dir.glob("*.json"))
+
+
+def load_results(results_dir: Path) -> list[ExampleResult]:
+    results: list[ExampleResult] = []
+    for path in iter_result_files(results_dir):
+        try:
+            payload = json.loads(path.read_text())
+        except json.JSONDecodeError:
+            continue
+        results.append(
+            ExampleResult(
+                name=str(payload.get("example", path.stem)),
+                status=str(payload.get("status", "unknown")),
+                duration_seconds=_coerce_float(payload.get("duration_seconds")),
+                cost=_coerce_cost(payload.get("cost")),
+                failure_reason=_sanitize_reason(payload.get("failure_reason")),
+            )
+        )
+    return sorted(results, key=lambda item: item.name)
+
+
+def _coerce_float(value: object) -> float | None:
+    if value is None:
+        return None
+    if isinstance(value, (int, float)):
+        return float(value)
+    if isinstance(value, str):
+        stripped = value.strip()
+        if not stripped:
+            return None
+        try:
+            return float(stripped)
+        except ValueError:
+            return None
+    return None
+
+
+def _coerce_cost(value: object) -> str | None:
+    if value is None:
+        return None
+    if isinstance(value, str) and not value.strip():
+        return None
+    return str(value)
+
+
+def _sanitize_reason(value: object) -> str | None:
+    if value is None:
+        return None
+    reason = str(value).strip()
+    return reason or None
+
+
+def format_duration(seconds: float | None) -> str:
+    if seconds is None:
+        return "--"
+    seconds = max(0.0, seconds)
+    if seconds < 60:
+        return f"{seconds:.1f}s"
+    minutes, sec = divmod(int(seconds + 0.5), 60)
+    if minutes < 60:
+        return f"{minutes}m {sec}s"
+    hours, minutes = divmod(minutes, 60)
+    return f"{hours}h {minutes}m"
+
+
+def format_cost(value: str | None) -> str:
+    if not value:
+        return "--"
+    try:
+        amount = Decimal(value)
+    except InvalidOperation:
+        return "--"
+    quantized = amount.quantize(Decimal("0.01"), rounding=ROUND_HALF_UP)
+    return f"${quantized}"
+
+
+def format_total_cost(values: Iterable[str | None]) -> str | None:
+    total = Decimal("0")
+    seen = False
+    for value in values:
+        if not value:
+            continue
+        try:
+            amount = Decimal(value)
+        except InvalidOperation:
+            continue
+        total += amount
+        seen = True
+    if not seen:
+        return None
+    quantized = total.quantize(Decimal("0.01"), rounding=ROUND_HALF_UP)
+    return f"${quantized}"
+
+
+def markdown_header(model: str, timestamp: str) -> list[str]:
+    ts = timestamp or datetime.now(UTC).strftime("%Y-%m-%d %H:%M:%S UTC")
+    return [f"## 🔄 Running Examples with `{model}`", "", f"_Generated: {ts}_", ""]
+
+
+def markdown_table(results: list[ExampleResult]) -> list[str]:
+    lines = [
+        "| Example | Status | Duration | Cost |",
+        "|---------|--------|----------|------|",
+    ]
+    for result in results:
+        example = result.name
+        if example.startswith("examples/"):
+            example = example[len("examples/") :]
+        status = "✅ PASS" if result.status == "passed" else "❌ FAIL"
+        if result.status != "passed" and result.failure_reason:
+            status = f"{status}<br/>{_escape_cell(result.failure_reason)}"
+        duration_display = format_duration(result.duration_seconds)
+        cost_display = format_cost(result.cost)
+        cells = [
+            _escape_cell(example),
+            status,
+            duration_display,
+            cost_display,
+        ]
+        row = "| " + " | ".join(cells) + " |"
+        lines.append(row)
+    if len(results) == 0:
+        lines.append("| _No results_ | -- | -- | -- |")
+    return lines
+
+
+def markdown_summary(results: list[ExampleResult], workflow_url: str) -> list[str]:
+    total = len(results)
+    passed = sum(1 for item in results if item.status == "passed")
+    failed = total - passed
+    cost_summary = format_total_cost(item.cost for item in results)
+
+    lines = ["", "---", ""]
+    if failed == 0 and total > 0:
+        lines.append("### ✅ All tests passed!")
+    elif failed == 0:
+        lines.append("### ℹ️ No examples were executed")
+    else:
+        lines.append("### ❌ Some tests failed")
+
+    summary = f"**Total:** {total} | **Passed:** {passed} | **Failed:** {failed}"
+    if cost_summary:
+        summary += f" | **Total Cost:** {cost_summary}"
+    lines.append(summary)
+
+    if failed:
+        lines.append("")
+        lines.append("**Failed examples:**")
+        for item in results:
+            if item.status != "passed":
+                reason = item.failure_reason or "See logs"
+                lines.append(f"- {item.name}: {reason}")
+
+    if workflow_url:
+        lines.append("")
+        lines.append(f"[View full workflow run]({workflow_url})")
+
+    return lines
+
+
+def _escape_cell(text: str) -> str:
+    return text.replace("|", "\\|").replace("\n", "<br/>")
") + + +def build_report(args: argparse.Namespace, results: list[ExampleResult]) -> str: + lines = markdown_header(args.model, args.timestamp) + lines.extend(markdown_table(results)) + lines.extend(markdown_summary(results, args.workflow_url)) + return "\n".join(lines).rstrip() + "\n" + + +def main() -> int: + args = parse_args() + results = load_results(args.results_dir) + report = build_report(args, results) + sanitized = sanitize_openhands_mentions(report) + + if args.output is not None: + args.output.write_text(sanitized) + + print(sanitized) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/tests/conftest.py b/tests/conftest.py index 9d19019b61..c1789211a1 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,6 +1,7 @@ """Common test fixtures and utilities.""" import uuid +from pathlib import Path from unittest.mock import MagicMock import pytest @@ -14,6 +15,48 @@ from openhands.sdk.workspace import LocalWorkspace +REPO_ROOT = Path(__file__).resolve().parent.parent + + +def pytest_addoption(parser: pytest.Parser) -> None: + group = parser.getgroup("examples") + group.addoption( + "--run-examples", + action="store_true", + default=False, + help="Execute example scripts. Disabled by default for faster test runs.", + ) + group.addoption( + "--examples-results-dir", + action="store", + default=None, + help=( + "Directory to store per-example JSON results " + "(defaults to .example-test-results)." + ), + ) + + +@pytest.fixture(scope="session") +def examples_enabled(pytestconfig: pytest.Config) -> bool: + return bool(pytestconfig.getoption("--run-examples")) + + +@pytest.fixture(scope="session") +def examples_results_dir(pytestconfig: pytest.Config) -> Path: + configured = pytestconfig.getoption("--examples-results-dir") + result_dir = ( + Path(configured) + if configured is not None + else REPO_ROOT / ".example-test-results" + ) + result_dir.mkdir(parents=True, exist_ok=True) + if not hasattr(pytestconfig, "workerinput"): + for existing in result_dir.glob("*.json"): + existing.unlink() + return result_dir + + @pytest.fixture def mock_llm(): """Create a standard mock LLM instance for testing.""" diff --git a/tests/examples/test_examples.py b/tests/examples/test_examples.py new file mode 100644 index 0000000000..6623b22af3 --- /dev/null +++ b/tests/examples/test_examples.py @@ -0,0 +1,130 @@ +"""Integration tests that execute example scripts via pytest. + +These tests are disabled by default. Pass ``--run-examples`` to enable them. +""" + +from __future__ import annotations + +import json +import os +import subprocess +import sys +import time +from collections.abc import Iterable +from pathlib import Path + +import pytest + + +REPO_ROOT = Path(__file__).resolve().parent.parent.parent +EXAMPLES_ROOT = REPO_ROOT / "examples" + +_TARGET_DIRECTORIES = ( + EXAMPLES_ROOT / "01_standalone_sdk", + EXAMPLES_ROOT / "02_remote_agent_server", +) + +# Examples that require interactive input or additional infrastructure. 
+_EXCLUDED_EXAMPLES = {
+    "examples/01_standalone_sdk/01_hello_world.py",
+    "examples/01_standalone_sdk/04_confirmation_mode_example.py",
+    "examples/01_standalone_sdk/06_interactive_terminal_w_reasoning.py",
+    "examples/01_standalone_sdk/08_mcp_with_oauth.py",
+    "examples/01_standalone_sdk/15_browser_use.py",
+    "examples/01_standalone_sdk/16_llm_security_analyzer.py",
+    "examples/01_standalone_sdk/27_observability_laminar.py",
+    "examples/02_remote_agent_server/04_vscode_with_docker_sandboxed_server.py",
+}
+
+
+def _discover_examples() -> list[Path]:
+    candidates: list[Path] = []
+    for directory in _TARGET_DIRECTORIES:
+        if not directory.exists():
+            continue
+        candidates.extend(sorted(directory.glob("*.py")))
+    return candidates
+
+
+def _iter_examples() -> Iterable[Path]:
+    excluded = {_normalize_path(REPO_ROOT / p) for p in _EXCLUDED_EXAMPLES}
+    for example_path in _discover_examples():
+        normalized = _normalize_path(example_path)
+        if normalized in excluded:
+            continue
+        yield example_path
+
+
+def _normalize_path(path: Path) -> str:
+    return str(path.relative_to(REPO_ROOT)).replace(os.sep, "/")
+
+
+EXAMPLES = tuple(_iter_examples())
+
+
+@pytest.mark.parametrize("example_path", EXAMPLES, ids=_normalize_path)
+def test_example_scripts(
+    example_path: Path,
+    examples_enabled: bool,
+    examples_results_dir: Path,
+) -> None:
+    if not examples_enabled:
+        pytest.skip("Use --run-examples to execute example scripts.")
+
+    rel_path = example_path.relative_to(REPO_ROOT)
+    result_file = (
+        examples_results_dir
+        / f"{_normalize_path(example_path).replace('/', '__')}.json"
+    )
+
+    start = time.perf_counter()
+    env = os.environ.copy()
+    env.setdefault("PYTHONUNBUFFERED", "1")
+    process = subprocess.run(  # noqa: S603
+        [sys.executable, str(example_path)],
+        cwd=str(REPO_ROOT),
+        env=env,
+        text=True,
+        capture_output=True,
+        check=False,
+    )
+    duration = time.perf_counter() - start
+
+    stdout = process.stdout
+    stderr = process.stderr
+
+    cost = None
+    for line in stdout.splitlines():
+        if line.startswith("EXAMPLE_COST:"):
+            cost = line.split("EXAMPLE_COST:", 1)[1].strip()
+            break
+
+    status = "passed"
+    failure_reason = None
+
+    if process.returncode != 0:
+        status = "failed"
+        failure_reason = f"Exit code {process.returncode}"
+    elif cost is None:
+        status = "failed"
+        failure_reason = "Missing EXAMPLE_COST marker in stdout"
+
+    result_payload = {
+        "example": _normalize_path(example_path),
+        "status": status,
+        "duration_seconds": duration,
+        "cost": cost,
+        "returncode": process.returncode,
+        "failure_reason": failure_reason,
+    }
+
+    result_file.write_text(json.dumps(result_payload, indent=2))
+
+    if status != "passed":
+        pytest.fail(
+            "Example script failed:\n"
+            f"Example: {rel_path}\n"
+            f"Reason: {failure_reason}\n"
+            f"Stdout:\n{stdout}\n"
+            f"Stderr:\n{stderr}"
+        )
diff --git a/uv.lock b/uv.lock
index a8e6fc1bd5..c377ffb79d 100644
--- a/uv.lock
+++ b/uv.lock
@@ -26,6 +26,7 @@ dev = [
     { name = "pytest-cov", specifier = ">=5.0.0" },
     { name = "pytest-forked", specifier = ">=1.6.0" },
     { name = "pytest-timeout", specifier = ">=2.4.0" },
+    { name = "pytest-xdist", specifier = ">=3.6.0" },
     { name = "ruff", specifier = ">=0.12.10" },
     { name = "streamlit", specifier = ">=1.49.1" },
     { name = "tabulate", specifier = ">=0.9.0" },
@@ -781,6 +782,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/36/f4/c6e662dade71f56cd2f3735141b265c3c79293c109549c1e6933b0651ffc/exceptiongroup-1.3.0-py3-none-any.whl", hash = "sha256:4d111e6e0c13d0644cad6ddaa7ed0261a0b36971f6d23e7ec9b4b9097da78a10", size = 16674, upload-time = "2025-05-10T17:42:49.33Z" },
 ]

+[[package]]
+name = "execnet"
+version = "2.1.2"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/bf/89/780e11f9588d9e7128a3f87788354c7946a9cbb1401ad38a48c4db9a4f07/execnet-2.1.2.tar.gz", hash = "sha256:63d83bfdd9a23e35b9c6a3261412324f964c2ec8dcd8d3c6916ee9373e0befcd", size = 166622, upload-time = "2025-11-12T09:56:37.75Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/ab/84/02fc1827e8cdded4aa65baef11296a9bbe595c474f0d6d758af082d849fd/execnet-2.1.2-py3-none-any.whl", hash = "sha256:67fba928dd5a544b783f6056f449e5e3931a5c378b128bc18501f7ea79e296ec", size = 40708, upload-time = "2025-11-12T09:56:36.333Z" },
+]
+
 [[package]]
 name = "fastapi"
 version = "0.119.0"
@@ -5584,6 +5594,19 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/fa/b6/3127540ecdf1464a00e5a01ee60a1b09175f6913f0644ac748494d9c4b21/pytest_timeout-2.4.0-py3-none-any.whl", hash = "sha256:c42667e5cdadb151aeb5b26d114aff6bdf5a907f176a007a30b940d3d865b5c2", size = 14382, upload-time = "2025-05-05T19:44:33.502Z" },
 ]

+[[package]]
+name = "pytest-xdist"
+version = "3.8.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "execnet" },
+    { name = "pytest" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/78/b4/439b179d1ff526791eb921115fca8e44e596a13efeda518b9d845a619450/pytest_xdist-3.8.0.tar.gz", hash = "sha256:7e578125ec9bc6050861aa93f2d59f1d8d085595d6551c2c90b6f4fad8d3a9f1", size = 88069, upload-time = "2025-07-01T13:30:59.346Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/ca/31/d4e37e9e550c2b92a9cbc2e4d0b7420a27224968580b5a447f420847c975/pytest_xdist-3.8.0-py3-none-any.whl", hash = "sha256:202ca578cfeb7370784a8c33d6d05bc6e13b4f25b5053c30a152269fd10f0b88", size = 46396, upload-time = "2025-07-01T13:30:56.632Z" },
+]
+
 [[package]]
 name = "python-dateutil"
 version = "2.9.0.post0"