diff --git a/.github/workflows/run-examples.yml b/.github/workflows/run-examples.yml
index 284c9ed2f0..783bc3853c 100644
--- a/.github/workflows/run-examples.yml
+++ b/.github/workflows/run-examples.yml
@@ -25,6 +25,7 @@ jobs:
     timeout-minutes: 60
     steps:
       - name: Wait for agent server to finish build
+        if: github.event_name == 'pull_request'
         uses: lewagon/wait-on-check-action@v1.4.1
         with:
           ref: ${{ github.event.pull_request.head.ref }}
@@ -64,261 +65,105 @@ jobs:
           REPO_NAME: ${{ github.event.repository.name }}
           GITHUB_SHA: ${{ github.event.pull_request.head.sha }}
         run: |
-          # List of examples to test
-          # Excluded examples:
-          # - 01_hello_world.py: requires LiteLLM proxy URL (OPENAI_BASE_URL) not set in CI
-          # - 04_confirmation_mode_example.py: requires user input
-          # - 06_interactive_terminal_w_reasoning.py: interactive terminal
-          # - 08_mcp_with_oauth.py: requires OAuth setup
-          # - 15_browser_use.py: requires browser setup
-          # - 16_llm_security_analyzer.py: requires user input
-          # - 04_convo_with_api_sandboxed_server.py: requires sandbox API keys
-          # - 04_vscode_with_docker_sandboxed_server.py: requires VSCode setup
-          EXAMPLES=(
-            "examples/01_standalone_sdk/02_custom_tools.py"
-            "examples/01_standalone_sdk/03_activate_skill.py"
-            "examples/01_standalone_sdk/05_use_llm_registry.py"
-            "examples/01_standalone_sdk/07_mcp_integration.py"
-            "examples/01_standalone_sdk/09_pause_example.py"
-            "examples/01_standalone_sdk/10_persistence.py"
-            "examples/01_standalone_sdk/11_async.py"
-            "examples/01_standalone_sdk/12_custom_secrets.py"
-            "examples/01_standalone_sdk/13_get_llm_metrics.py"
-            "examples/01_standalone_sdk/14_context_condenser.py"
-            "examples/01_standalone_sdk/17_image_input.py"
-            "examples/01_standalone_sdk/18_send_message_while_processing.py"
-            "examples/01_standalone_sdk/19_llm_routing.py"
-            "examples/01_standalone_sdk/20_stuck_detector.py"
-            "examples/01_standalone_sdk/21_generate_extraneous_conversation_costs.py"
-            "examples/01_standalone_sdk/22_anthropic_thinking.py"
-            "examples/01_standalone_sdk/23_responses_reasoning.py"
-            "examples/01_standalone_sdk/24_planning_agent_workflow.py"
-            "examples/01_standalone_sdk/25_agent_delegation.py"
-            "examples/01_standalone_sdk/26_custom_visualizer.py"
-            "examples/02_remote_agent_server/01_convo_with_local_agent_server.py"
-            "examples/02_remote_agent_server/02_convo_with_docker_sandboxed_server.py"
-            "examples/02_remote_agent_server/03_browser_use_with_docker_sandboxed_server.py"
-            "examples/02_remote_agent_server/04_convo_with_api_sandboxed_server.py"
-          )
+          RESULTS_DIR=".example-test-results"
+          REPORT_PATH="examples_report.md"
+          rm -rf "$RESULTS_DIR"
+          mkdir -p "$RESULTS_DIR"
-          # GitHub API setup (only for PR events)
-          if [ "${{ github.event_name }}" = "pull_request" ]; then
-            API_URL="https://api.github.com/repos/${REPO_OWNER}/${REPO_NAME}/issues/${PR_NUMBER}/comments"
-          fi
-
-          # Function to sanitize @OpenHands mentions using the SDK utility
-          sanitize_comment() {
-            local text="$1"
-            printf "%s" "$text" | uv run python -c "from openhands.sdk.utils.github import sanitize_openhands_mentions; import sys; print(sanitize_openhands_mentions(sys.stdin.read()), end='')"
-          }
-
-          # Function to update PR comment
           update_comment() {
-            # Skip if not a PR event
-            if [ "${{ github.event_name }}" != "pull_request" ]; then
+            if [ -z "$API_URL" ]; then
+              echo "Skipping PR comment update because API_URL is unset."
              return
            fi
-
+            local comment_body="$1"
+            local payload
             local response
-
-            # Sanitize @OpenHands mentions before posting
-            comment_body=$(sanitize_comment "$comment_body")
-
+
+            payload=$(jq -n --arg body "$comment_body" '{body: $body}')
+
             if [ -z "$COMMENT_ID" ]; then
-              # Create new comment
-              response=$(curl -s -X POST \
+              echo "Creating PR comment..."
+              if ! response=$(curl -sSf -X POST \
                 -H "Authorization: token ${GITHUB_TOKEN}" \
                 -H "Accept: application/vnd.github.v3+json" \
+                -H "Content-Type: application/json" \
                 "${API_URL}" \
-                -d "{\"body\":$(echo "$comment_body" | jq -Rs .)}")
-              COMMENT_ID=$(echo "$response" | jq -r '.id')
+                -d "$payload"); then
+                echo "::error::Failed to create PR comment."
+                exit 1
+              fi
+              COMMENT_ID=$(echo "$response" | jq -r '.id // ""')
+              if [ -z "$COMMENT_ID" ]; then
+                echo "::error::GitHub API response did not include a comment id: $response"
+                exit 1
+              fi
              echo "Created comment with ID: $COMMENT_ID"
            else
-              # Update existing comment
-              curl -s -X PATCH \
+              echo "Updating PR comment (ID: $COMMENT_ID)..."
+              if ! curl -sSf -X PATCH \
                 -H "Authorization: token ${GITHUB_TOKEN}" \
                 -H "Accept: application/vnd.github.v3+json" \
+                -H "Content-Type: application/json" \
                 "https://api.github.com/repos/${REPO_OWNER}/${REPO_NAME}/issues/comments/${COMMENT_ID}" \
-                -d "{\"body\":$(echo "$comment_body" | jq -Rs .)}" > /dev/null
-            fi
-          }
-
-          # Function to format cost with 2 decimal places
-          format_cost() {
-            local cost="$1"
-            if [ -z "$cost" ] || [ "$cost" = "N/A" ]; then
-              echo "N/A"
-            else
-              printf "\$%.2f" "$cost" 2>/dev/null || echo "N/A"
-            fi
-          }
-
-          # Function to generate markdown table
-          generate_table() {
-            local header="## 🔄 Running Examples with \`${LLM_MODEL}\`\n\n"
-            header+="_Last updated: $(date -u '+%Y-%m-%d %H:%M:%S UTC')_\n\n"
-            header+="| Example | Status | Duration | Cost |\n"
-            header+="|---------|--------|----------|------|\n"
-
-            local rows=""
-            for example in "${EXAMPLES[@]}"; do
-              # Strip examples/ prefix and show relative path from there
-              local short_name="${example#examples/}"
-              local status="${TEST_STATUS[$example]:-⏳ Pending}"
-              local duration="${TEST_DURATION[$example]:--}"
-              local cost="${TEST_COST[$example]:--}"
-              rows+="| ${short_name} | ${status} | ${duration} | ${cost} |\n"
-            done
-
-            local summary="\n---\n\n"
-            if [ $COMPLETED -eq ${#EXAMPLES[@]} ]; then
-              if [ $FAILED -eq 0 ]; then
-                summary+="### ✅ All tests passed!\n\n"
-              else
-                summary+="### ❌ Some tests failed\n\n"
-              fi
-              summary+="**Total:** ${#EXAMPLES[@]} | **Passed:** ${PASSED} | **Failed:** ${FAILED}"
-
-              # Calculate and display total cost if available
-              if [ -n "$TOTAL_COST" ]; then
-                summary+=" | **Total Cost:** $(format_cost $TOTAL_COST)"
+                -d "$payload" > /dev/null; then
+                echo "::error::Failed to update PR comment (ID: $COMMENT_ID)."
+                exit 1
              fi
-
-              summary+="\n\n[View full workflow run](${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID})"
-            else
-              summary+="**Progress:** ${COMPLETED}/${#EXAMPLES[@]} completed | **Passed:** ${PASSED} | **Failed:** ${FAILED}"
            fi
-
-            echo -e "${header}${rows}${summary}"
          }

-          # Initialize tracking variables
-          declare -A TEST_STATUS
-          declare -A TEST_DURATION
-          declare -A TEST_COST
-          FAILED=0
-          PASSED=0
-          COMPLETED=0
-          TOTAL_COST=0
-          FAILED_EXAMPLES=()
-          RESULTS_FILE="test-results.txt"
+          API_URL=""
          COMMENT_ID=""

-          # Clear results file
-          > "$RESULTS_FILE"
-
-          # Create initial comment with all tests pending (only for PR events)
          if [ "${{ github.event_name }}" = "pull_request" ]; then
-            echo "Creating initial PR comment..."
-            update_comment "$(generate_table)"
+            API_URL="https://api.github.com/repos/${REPO_OWNER}/${REPO_NAME}/issues/${PR_NUMBER}/comments"
+            initial_comment="## 🔄 Running Examples with \`${LLM_MODEL}\`"
+            initial_comment+=$'\n\n'
+            initial_comment+="_Run in progress..._"
+            initial_comment+=$'\n'
+            update_comment "$initial_comment"
          fi

-          echo "=========================================="
-          echo "Running ${#EXAMPLES[@]} examples with $LLM_MODEL"
-          echo "=========================================="
+          EXIT_CODE=0
+          uv run pytest tests/examples/test_examples.py \
+            --run-examples \
+            --examples-results-dir "$RESULTS_DIR" \
+            -n 4 || EXIT_CODE=$?

-          for example in "${EXAMPLES[@]}"; do
-            echo ""
-            echo "Running: $example"
-            echo "------------------------------------------"
-
-            START_TIME=$(date +%s)
-
-            # Create temp file to capture output
-            OUTPUT_FILE=$(mktemp)
-
-            # Run example with timeout (20 minutes per example)
-            # Capture output while still displaying it
-            # Use || true to prevent script exit on failure
-            (timeout 1200 uv run python "$example" 2>&1 || true) | tee "$OUTPUT_FILE"
-
-            # Check if command succeeded by looking at Python exit
-            if ! grep -q "EXAMPLE_COST:" "$OUTPUT_FILE"; then
-              EXIT_CODE=1
-            else
-              EXIT_CODE=0
-            fi
-
-            END_TIME=$(date +%s)
-            DURATION=$((END_TIME - START_TIME))
-            DURATION_STR="${DURATION}s"
-
-            # Extract cost from output
-            COST=$(grep "EXAMPLE_COST:" "$OUTPUT_FILE" | awk '{print $2}' | tail -1 || echo "0.00")
-            if [ -z "$COST" ]; then
-              COST="0.00"
-            fi
-
-            # Accumulate total cost
-            TOTAL_COST=$(echo "$TOTAL_COST + $COST" | bc -l 2>/dev/null || echo "$TOTAL_COST")
-
-            if [ "$EXIT_CODE" -eq 0 ]; then
-              echo "✓ PASSED: $example (${DURATION_STR}, cost: \$${COST})"
-              PASSED=$((PASSED + 1))
-              COMPLETED=$((COMPLETED + 1))
-              TEST_STATUS[$example]="✅ PASS"
-              TEST_DURATION[$example]="${DURATION_STR}"
-              TEST_COST[$example]="$(format_cost $COST)"
-              echo "PASS|$example|${DURATION}|${COST}" >> "$RESULTS_FILE"
-            else
-              echo "✗ FAILED: $example (exit code: $EXIT_CODE, ${DURATION_STR}, cost: \$${COST})"
-              FAILED=$((FAILED + 1))
-              COMPLETED=$((COMPLETED + 1))
-              FAILED_EXAMPLES+=("$example")
-              TEST_STATUS[$example]="❌ FAIL (exit: ${EXIT_CODE})"
-              TEST_DURATION[$example]="${DURATION_STR}"
-              TEST_COST[$example]="$(format_cost $COST)"
-              echo "FAIL|$example|$EXIT_CODE|${DURATION}|${COST}" >> "$RESULTS_FILE"
-            fi
-
-            # Clean up temp file
-            rm -f "$OUTPUT_FILE"
-
-            # Update PR comment after each test (with error handling)
-            echo "Updating PR comment..."
-            update_comment "$(generate_table)" || echo "Warning: Failed to update PR comment"
-          done
+          TIMESTAMP="$(date -u '+%Y-%m-%d %H:%M:%S UTC')"
+          WORKFLOW_URL="${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}"

-          echo ""
-          echo "=========================================="
-          echo "Test Results Summary"
-          echo "=========================================="
-          echo "Total: ${#EXAMPLES[@]}"
-          echo "Passed: $PASSED"
-          echo "Failed: $FAILED"
-          echo "Total Cost: $(format_cost $TOTAL_COST)"
+          uv run python scripts/render_examples_report.py \
+            --results-dir "$RESULTS_DIR" \
+            --model "$LLM_MODEL" \
+            --workflow-url "$WORKFLOW_URL" \
+            --timestamp "$TIMESTAMP" \
+            --output "$REPORT_PATH"

-          # Generate final report and save to file
-          FINAL_REPORT=$(generate_table)
-          echo "$FINAL_REPORT" > examples_report.md
-          echo "Final report saved to examples_report.md"
+          COMMENT_BODY="$(cat "$REPORT_PATH")"
+          echo "$COMMENT_BODY"

-          if [ $FAILED -gt 0 ]; then
-            echo ""
-            echo "Failed examples:"
-            for failed_example in "${FAILED_EXAMPLES[@]}"; do
-              echo " - $failed_example"
-            done
-            exit 1
+          if [ "${{ github.event_name }}" = "pull_request" ]; then
+            echo "Publishing PR comment..."
+            update_comment "$COMMENT_BODY"
          fi

-          echo ""
-          echo "All examples passed! ✓"
-
+          if [ $EXIT_CODE -ne 0 ]; then
+            exit $EXIT_CODE
+          fi

      - name: Read examples report for issue comment
        if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch'
        id: read_report
        shell: bash
        run: |
          if [ -f examples_report.md ]; then
-            # Sanitize @OpenHands mentions before posting
-            REPORT_CONTENT=$(uv run python -c "from openhands.sdk.utils.github import sanitize_openhands_mentions; import sys; print(sanitize_openhands_mentions(sys.stdin.read()), end='')" < examples_report.md)
-            echo "report<<EOF" >> $GITHUB_OUTPUT
-            echo "$REPORT_CONTENT" >> $GITHUB_OUTPUT
-            echo "EOF" >> $GITHUB_OUTPUT
+            REPORT_CONTENT=$(cat examples_report.md)
+            echo "report<<EOF" >> "$GITHUB_OUTPUT"
+            echo "$REPORT_CONTENT" >> "$GITHUB_OUTPUT"
+            echo "EOF" >> "$GITHUB_OUTPUT"
          else
-            echo "report=Report file not found" >> $GITHUB_OUTPUT
+            echo "report=Report file not found" >> "$GITHUB_OUTPUT"
          fi

      - name: Comment with results on tracker issue
diff --git a/examples/01_standalone_sdk/25_agent_delegation.py b/examples/01_standalone_sdk/25_agent_delegation.py
index 8d3221434e..3ab0abb49b 100644
--- a/examples/01_standalone_sdk/25_agent_delegation.py
+++ b/examples/01_standalone_sdk/25_agent_delegation.py
@@ -71,3 +71,7 @@
     "Ask the lodging sub-agent what it thinks about Covent Garden."
 )
 conversation.run()
+
+# Report cost
+cost = conversation.conversation_stats.get_combined_metrics().accumulated_cost
+print(f"EXAMPLE_COST: {cost}")
diff --git a/examples/01_standalone_sdk/26_custom_visualizer.py b/examples/01_standalone_sdk/26_custom_visualizer.py
index 81feca85b1..9908fb8936 100644
--- a/examples/01_standalone_sdk/26_custom_visualizer.py
+++ b/examples/01_standalone_sdk/26_custom_visualizer.py
@@ -65,4 +65,4 @@ def on_event(self, event: Event) -> None:
 
 # Report cost
 cost = llm.metrics.accumulated_cost
-print(f"EXAMPLE_COST: ${cost:.4f}")
+print(f"EXAMPLE_COST: {cost:.4f}")
diff --git a/pyproject.toml b/pyproject.toml
index db1508c773..2f1c150489 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -20,6 +20,7 @@ dev = [
     "pycodestyle>=2.12.0",
     "pytest-asyncio>=1.1.0",
     "pytest-forked>=1.6.0",
+    "pytest-xdist>=3.6.0",
     "tabulate>=0.9.0",
     "pyinstaller>=6.16.0",
     "streamlit>=1.49.1",
diff --git a/scripts/render_examples_report.py b/scripts/render_examples_report.py
new file mode 100644
index 0000000000..346df600ee
--- /dev/null
+++ b/scripts/render_examples_report.py
@@ -0,0 +1,247 @@
+from __future__ import annotations
+
+import argparse
+import json
+from collections.abc import Iterable
+from dataclasses import dataclass
+from datetime import UTC, datetime
+from decimal import ROUND_HALF_UP, Decimal, InvalidOperation
+from pathlib import Path
+
+from openhands.sdk.utils.github import sanitize_openhands_mentions
+
+
+@dataclass(slots=True)
+class ExampleResult:
+    name: str
+    status: str
+    duration_seconds: float | None
+    cost: str | None
+    failure_reason: str | None
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(
+        description="Render markdown summary for example runs."
+    )
+    parser.add_argument(
+        "--results-dir",
+        type=Path,
+        required=True,
+        help="Directory containing per-example JSON results.",
+    )
+    parser.add_argument(
+        "--model",
+        type=str,
+        default="Unknown model",
+        help="LLM model name used for the run.",
+    )
+    parser.add_argument(
+        "--workflow-url",
+        type=str,
+        default="",
+        help="URL to the workflow run details page.",
+    )
+    parser.add_argument(
+        "--timestamp",
+        type=str,
+        default="",
+        help="UTC timestamp string to include in the report header.",
+    )
+    parser.add_argument(
+        "--output",
+        type=Path,
+        default=None,
+        help="Optional path to write the markdown report to.",
+    )
+    return parser.parse_args()
+
+
+def iter_result_files(results_dir: Path) -> Iterable[Path]:
+    yield from sorted(results_dir.glob("*.json"))
+
+
+def load_results(results_dir: Path) -> list[ExampleResult]:
+    results: list[ExampleResult] = []
+    for path in iter_result_files(results_dir):
+        try:
+            payload = json.loads(path.read_text())
+        except json.JSONDecodeError:
+            continue
+        results.append(
+            ExampleResult(
+                name=str(payload.get("example", path.stem)),
+                status=str(payload.get("status", "unknown")),
+                duration_seconds=_coerce_float(payload.get("duration_seconds")),
+                cost=_coerce_cost(payload.get("cost")),
+                failure_reason=_sanitize_reason(payload.get("failure_reason")),
+            )
+        )
+    return sorted(results, key=lambda item: item.name)
+
+
+def _coerce_float(value: object) -> float | None:
+    if value is None:
+        return None
+    if isinstance(value, (int, float)):
+        return float(value)
+    if isinstance(value, str):
+        stripped = value.strip()
+        if not stripped:
+            return None
+        try:
+            return float(stripped)
+        except ValueError:
+            return None
+    return None
+
+
+def _coerce_cost(value: object) -> str | None:
+    if value is None:
+        return None
+    if isinstance(value, str) and not value.strip():
+        return None
+    return str(value)
+
+
+def _sanitize_reason(value: object) -> str | None:
+    if value is None:
+        return None
+    reason = str(value).strip()
+    return reason or None
+
+
+def format_duration(seconds: float | None) -> str:
+    if seconds is None:
+        return "--"
+    seconds = max(0.0, seconds)
+    if seconds < 60:
+        return f"{seconds:.1f}s"
+    minutes, sec = divmod(int(seconds + 0.5), 60)
+    if minutes < 60:
+        return f"{minutes}m {sec}s"
+    hours, minutes = divmod(minutes, 60)
+    return f"{hours}h {minutes}m"
+
+
+def format_cost(value: str | None) -> str:
+    if not value:
+        return "--"
+    try:
+        amount = Decimal(value)
+    except InvalidOperation:
+        return "--"
+    quantized = amount.quantize(Decimal("0.01"), rounding=ROUND_HALF_UP)
+    return f"${quantized}"
+
+
+def format_total_cost(values: Iterable[str | None]) -> str | None:
+    total = Decimal("0")
+    seen = False
+    for value in values:
+        if not value:
+            continue
+        try:
+            amount = Decimal(value)
+        except InvalidOperation:
+            continue
+        total += amount
+        seen = True
+    if not seen:
+        return None
+    quantized = total.quantize(Decimal("0.01"), rounding=ROUND_HALF_UP)
+    return f"${quantized}"
+
+
+def markdown_header(model: str, timestamp: str) -> list[str]:
+    ts = timestamp or datetime.now(UTC).strftime("%Y-%m-%d %H:%M:%S UTC")
+    return [f"## 🔄 Running Examples with `{model}`", "", f"_Generated: {ts}_", ""]
+
+
+def markdown_table(results: list[ExampleResult]) -> list[str]:
+    lines = [
+        "| Example | Status | Duration | Cost |",
+        "|---------|--------|----------|------|",
+    ]
+    for result in results:
+        example = result.name
+        if example.startswith("examples/"):
+            example = example[len("examples/") :]
+        status = "✅ PASS" if result.status == "passed" else "❌ FAIL"
+        if result.status != "passed" and result.failure_reason:
+            status = f"{status}<br/>{_escape_cell(result.failure_reason)}"
+        duration_display = format_duration(result.duration_seconds)
+        cost_display = format_cost(result.cost)
+        cells = [
+            _escape_cell(example),
+            status,
+            duration_display,
+            cost_display,
+        ]
+        row = "| " + " | ".join(cells) + " |"
+        lines.append(row)
+    if len(results) == 0:
+        lines.append("| _No results_ | -- | -- | -- |")
+    return lines
+
+
+def markdown_summary(results: list[ExampleResult], workflow_url: str) -> list[str]:
+    total = len(results)
+    passed = sum(1 for item in results if item.status == "passed")
+    failed = total - passed
+    cost_summary = format_total_cost(item.cost for item in results)
+
+    lines = ["", "---", ""]
+    if failed == 0 and total > 0:
+        lines.append("### ✅ All tests passed!")
+    elif failed == 0:
+        lines.append("### ℹ️ No examples were executed")
+    else:
+        lines.append("### ❌ Some tests failed")
+
+    summary = f"**Total:** {total} | **Passed:** {passed} | **Failed:** {failed}"
+    if cost_summary:
+        summary += f" | **Total Cost:** {cost_summary}"
+    lines.append(summary)
+
+    if failed:
+        lines.append("")
+        lines.append("**Failed examples:**")
+        for item in results:
+            if item.status != "passed":
+                reason = item.failure_reason or "See logs"
+                lines.append(f"- {item.name}: {reason}")
+
+    if workflow_url:
+        lines.append("")
+        lines.append(f"[View full workflow run]({workflow_url})")
+
+    return lines
+
+
+def _escape_cell(text: str) -> str:
+    return text.replace("|", "\\|").replace("\n", "<br/>")
") + + +def build_report(args: argparse.Namespace, results: list[ExampleResult]) -> str: + lines = markdown_header(args.model, args.timestamp) + lines.extend(markdown_table(results)) + lines.extend(markdown_summary(results, args.workflow_url)) + return "\n".join(lines).rstrip() + "\n" + + +def main() -> int: + args = parse_args() + results = load_results(args.results_dir) + report = build_report(args, results) + sanitized = sanitize_openhands_mentions(report) + + if args.output is not None: + args.output.write_text(sanitized) + + print(sanitized) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/tests/conftest.py b/tests/conftest.py index 9d19019b61..c1789211a1 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,6 +1,7 @@ """Common test fixtures and utilities.""" import uuid +from pathlib import Path from unittest.mock import MagicMock import pytest @@ -14,6 +15,48 @@ from openhands.sdk.workspace import LocalWorkspace +REPO_ROOT = Path(__file__).resolve().parent.parent + + +def pytest_addoption(parser: pytest.Parser) -> None: + group = parser.getgroup("examples") + group.addoption( + "--run-examples", + action="store_true", + default=False, + help="Execute example scripts. Disabled by default for faster test runs.", + ) + group.addoption( + "--examples-results-dir", + action="store", + default=None, + help=( + "Directory to store per-example JSON results " + "(defaults to .example-test-results)." + ), + ) + + +@pytest.fixture(scope="session") +def examples_enabled(pytestconfig: pytest.Config) -> bool: + return bool(pytestconfig.getoption("--run-examples")) + + +@pytest.fixture(scope="session") +def examples_results_dir(pytestconfig: pytest.Config) -> Path: + configured = pytestconfig.getoption("--examples-results-dir") + result_dir = ( + Path(configured) + if configured is not None + else REPO_ROOT / ".example-test-results" + ) + result_dir.mkdir(parents=True, exist_ok=True) + if not hasattr(pytestconfig, "workerinput"): + for existing in result_dir.glob("*.json"): + existing.unlink() + return result_dir + + @pytest.fixture def mock_llm(): """Create a standard mock LLM instance for testing.""" diff --git a/tests/examples/test_examples.py b/tests/examples/test_examples.py new file mode 100644 index 0000000000..6623b22af3 --- /dev/null +++ b/tests/examples/test_examples.py @@ -0,0 +1,130 @@ +"""Integration tests that execute example scripts via pytest. + +These tests are disabled by default. Pass ``--run-examples`` to enable them. +""" + +from __future__ import annotations + +import json +import os +import subprocess +import sys +import time +from collections.abc import Iterable +from pathlib import Path + +import pytest + + +REPO_ROOT = Path(__file__).resolve().parent.parent.parent +EXAMPLES_ROOT = REPO_ROOT / "examples" + +_TARGET_DIRECTORIES = ( + EXAMPLES_ROOT / "01_standalone_sdk", + EXAMPLES_ROOT / "02_remote_agent_server", +) + +# Examples that require interactive input or additional infrastructure. 
+_EXCLUDED_EXAMPLES = {
+    "examples/01_standalone_sdk/01_hello_world.py",
+    "examples/01_standalone_sdk/04_confirmation_mode_example.py",
+    "examples/01_standalone_sdk/06_interactive_terminal_w_reasoning.py",
+    "examples/01_standalone_sdk/08_mcp_with_oauth.py",
+    "examples/01_standalone_sdk/15_browser_use.py",
+    "examples/01_standalone_sdk/16_llm_security_analyzer.py",
+    "examples/01_standalone_sdk/27_observability_laminar.py",
+    "examples/02_remote_agent_server/04_vscode_with_docker_sandboxed_server.py",
+}
+
+
+def _discover_examples() -> list[Path]:
+    candidates: list[Path] = []
+    for directory in _TARGET_DIRECTORIES:
+        if not directory.exists():
+            continue
+        candidates.extend(sorted(directory.glob("*.py")))
+    return candidates
+
+
+def _iter_examples() -> Iterable[Path]:
+    excluded = {_normalize_path(REPO_ROOT / p) for p in _EXCLUDED_EXAMPLES}
+    for example_path in _discover_examples():
+        normalized = _normalize_path(example_path)
+        if normalized in excluded:
+            continue
+        yield example_path
+
+
+def _normalize_path(path: Path) -> str:
+    return str(path.relative_to(REPO_ROOT)).replace(os.sep, "/")
+
+
+EXAMPLES = tuple(_iter_examples())
+
+
+@pytest.mark.parametrize("example_path", EXAMPLES, ids=_normalize_path)
+def test_example_scripts(
+    example_path: Path,
+    examples_enabled: bool,
+    examples_results_dir: Path,
+) -> None:
+    if not examples_enabled:
+        pytest.skip("Use --run-examples to execute example scripts.")
+
+    rel_path = example_path.relative_to(REPO_ROOT)
+    result_file = (
+        examples_results_dir
+        / f"{_normalize_path(example_path).replace('/', '__')}.json"
+    )
+
+    start = time.perf_counter()
+    env = os.environ.copy()
+    env.setdefault("PYTHONUNBUFFERED", "1")
+    process = subprocess.run(  # noqa: S603
+        [sys.executable, str(example_path)],
+        cwd=str(REPO_ROOT),
+        env=env,
+        text=True,
+        capture_output=True,
+        check=False,
+    )
+    duration = time.perf_counter() - start
+
+    stdout = process.stdout
+    stderr = process.stderr
+
+    cost = None
+    for line in stdout.splitlines():
+        if line.startswith("EXAMPLE_COST:"):
+            cost = line.split("EXAMPLE_COST:", 1)[1].strip()
+            break
+
+    status = "passed"
+    failure_reason = None
+
+    if process.returncode != 0:
+        status = "failed"
+        failure_reason = f"Exit code {process.returncode}"
+    elif cost is None:
+        status = "failed"
+        failure_reason = "Missing EXAMPLE_COST marker in stdout"
+
+    result_payload = {
+        "example": _normalize_path(example_path),
+        "status": status,
+        "duration_seconds": duration,
+        "cost": cost,
+        "returncode": process.returncode,
+        "failure_reason": failure_reason,
+    }
+
+    result_file.write_text(json.dumps(result_payload, indent=2))
+
+    if status != "passed":
+        pytest.fail(
+            "Example script failed:\n"
+            f"Example: {rel_path}\n"
+            f"Reason: {failure_reason}\n"
+            f"Stdout:\n{stdout}\n"
+            f"Stderr:\n{stderr}"
+        )
diff --git a/uv.lock b/uv.lock
index a8e6fc1bd5..c377ffb79d 100644
--- a/uv.lock
+++ b/uv.lock
@@ -26,6 +26,7 @@ dev = [
     { name = "pytest-cov", specifier = ">=5.0.0" },
     { name = "pytest-forked", specifier = ">=1.6.0" },
     { name = "pytest-timeout", specifier = ">=2.4.0" },
+    { name = "pytest-xdist", specifier = ">=3.6.0" },
     { name = "ruff", specifier = ">=0.12.10" },
     { name = "streamlit", specifier = ">=1.49.1" },
     { name = "tabulate", specifier = ">=0.9.0" },
@@ -781,6 +782,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/36/f4/c6e662dade71f56cd2f3735141b265c3c79293c109549c1e6933b0651ffc/exceptiongroup-1.3.0-py3-none-any.whl", hash = "sha256:4d111e6e0c13d0644cad6ddaa7ed0261a0b36971f6d23e7ec9b4b9097da78a10", size = 16674, upload-time = "2025-05-10T17:42:49.33Z" },
 ]

+[[package]]
+name = "execnet"
+version = "2.1.2"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/bf/89/780e11f9588d9e7128a3f87788354c7946a9cbb1401ad38a48c4db9a4f07/execnet-2.1.2.tar.gz", hash = "sha256:63d83bfdd9a23e35b9c6a3261412324f964c2ec8dcd8d3c6916ee9373e0befcd", size = 166622, upload-time = "2025-11-12T09:56:37.75Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/ab/84/02fc1827e8cdded4aa65baef11296a9bbe595c474f0d6d758af082d849fd/execnet-2.1.2-py3-none-any.whl", hash = "sha256:67fba928dd5a544b783f6056f449e5e3931a5c378b128bc18501f7ea79e296ec", size = 40708, upload-time = "2025-11-12T09:56:36.333Z" },
+]
+
 [[package]]
 name = "fastapi"
 version = "0.119.0"
@@ -5584,6 +5594,19 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/fa/b6/3127540ecdf1464a00e5a01ee60a1b09175f6913f0644ac748494d9c4b21/pytest_timeout-2.4.0-py3-none-any.whl", hash = "sha256:c42667e5cdadb151aeb5b26d114aff6bdf5a907f176a007a30b940d3d865b5c2", size = 14382, upload-time = "2025-05-05T19:44:33.502Z" },
 ]

+[[package]]
+name = "pytest-xdist"
+version = "3.8.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "execnet" },
+    { name = "pytest" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/78/b4/439b179d1ff526791eb921115fca8e44e596a13efeda518b9d845a619450/pytest_xdist-3.8.0.tar.gz", hash = "sha256:7e578125ec9bc6050861aa93f2d59f1d8d085595d6551c2c90b6f4fad8d3a9f1", size = 88069, upload-time = "2025-07-01T13:30:59.346Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/ca/31/d4e37e9e550c2b92a9cbc2e4d0b7420a27224968580b5a447f420847c975/pytest_xdist-3.8.0-py3-none-any.whl", hash = "sha256:202ca578cfeb7370784a8c33d6d05bc6e13b4f25b5053c30a152269fd10f0b88", size = 46396, upload-time = "2025-07-01T13:30:56.632Z" },
+]
+
 [[package]]
 name = "python-dateutil"
 version = "2.9.0.post0"