From 9e04be88c32dc88aecadb13c4467cf611dee5e68 Mon Sep 17 00:00:00 2001 From: Delicious233 <101502465+DeliciousBuding@users.noreply.github.com> Date: Mon, 29 Jun 2026 03:09:13 +0800 Subject: [PATCH] test(e2e): harden approved-real manifest boundaries --- .agents/skills/real-e2e-acceptance/SKILL.md | 1 + .../src/testing/e2eDataModeContract.test.ts | 95 ++++++++++++++ app/shared/src/testing/e2eDataModeContract.ts | 18 +++ .../src/testing/e2eEvidenceManifest.test.ts | 116 +++++++++++++++++- app/shared/src/testing/e2eEvidenceManifest.ts | 15 ++- docs/progress/MASTER.md | 1 + scripts/verify/verify-real-e2e-contract.ps1 | 21 ++++ .../scripts/verify-e2e-smoke-matrix.ps1 | 9 +- 8 files changed, 269 insertions(+), 7 deletions(-) diff --git a/.agents/skills/real-e2e-acceptance/SKILL.md b/.agents/skills/real-e2e-acceptance/SKILL.md index 65920b77..dcaa7a37 100644 --- a/.agents/skills/real-e2e-acceptance/SKILL.md +++ b/.agents/skills/real-e2e-acceptance/SKILL.md @@ -41,6 +41,7 @@ Smoke matrix manifests must be machine-honest: - Skipped gates may appear only under `skipped_evidence_levels` and row-level `status: "skipped"`. - Stubbed Hub rows use `evidence_level: "stubbed-hub"` and `real_tested: false`; they must not be named or reported as real login/model/API execution. - Desktop Vite rows use Playwright/UI wording; packaged Desktop claims require a separate packaged-release row. +- Rows that set `evidence_level: "approved-real"` and `real_tested: true` must include an explicit approval reference plus real-login and real CLI/model/API evidence claims. Readiness-only approved-real preflight rows keep `real_tested: false`. 
## Gate Matrix diff --git a/app/shared/src/testing/e2eDataModeContract.test.ts b/app/shared/src/testing/e2eDataModeContract.test.ts index bf892282..eb072809 100644 --- a/app/shared/src/testing/e2eDataModeContract.test.ts +++ b/app/shared/src/testing/e2eDataModeContract.test.ts @@ -250,6 +250,101 @@ describe('e2e data-mode contract', () => { }); }); + it('keeps observed replay read-only and non-real', () => { + const scenario = createE2EDataModeScenario({ + name: 'desktop-observed-replay', + surface: 'desktop', + dataMode: 'observed', + dataSource: 'observed-hub-replay', + appOrigin: 'http://127.0.0.1:5199', + hubOrigin: 'http://localhost:8080', + }); + + expect(validateE2EDataModeScenario(scenario, [ + { method: 'GET', url: 'http://localhost:8080/client/sessions/session-1/messages' }, + ])).toEqual({ ok: true, errors: [] }); + expect(buildE2EDataModeManifest(scenario, [ + { method: 'GET', url: 'http://localhost:8080/client/sessions/session-1/messages' }, + ])).toMatchObject({ + evidence_level: 'observed-local', + dataSource: 'observed-hub-replay', + realLoginTested: false, + realCliOrModelExecuted: false, + tokenDanceIdSecretUsed: false, + mockAdapterUsed: false, + real_tested: false, + requestedBoundaries: ['hub'], + }); + }); + + it('rejects observed replay that claims login, model execution, secrets, or the wrong mode', () => { + const scenario = createE2EDataModeScenario({ + name: 'desktop-observed-replay', + surface: 'desktop', + dataMode: 'observed', + dataSource: 'observed-hub-replay', + appOrigin: 'http://127.0.0.1:5199', + hubOrigin: 'http://localhost:8080', + }); + + expect(validateE2EDataModeScenario({ + ...scenario, + dataMode: 'approved-real', + realLoginTested: true, + realCliOrModelExecuted: true, + tokenDanceIdSecretUsed: true, + }, [])).toEqual({ + ok: false, + errors: [ + 'desktop-observed-replay uses observed-hub-replay but dataMode is approved-real', + 'desktop-observed-replay uses observed-hub-replay but claims real login was tested', + 
'desktop-observed-replay uses observed-hub-replay but claims real CLI/model execution', + 'desktop-observed-replay uses observed-hub-replay but marks TokenDance ID secret usage', + ], + }); + }); + + it('keeps approved-real preflight separate from mock and readiness-only evidence', () => { + const scenario = createE2EDataModeScenario({ + name: 'approved-real-preflight', + surface: 'desktop', + dataMode: 'approved-real', + dataSource: 'approved-real-preflight', + appOrigin: 'http://127.0.0.1:5199', + hubOrigin: 'http://localhost:8080', + realLoginTested: true, + realCliOrModelExecuted: true, + directLocalEdge: true, + }); + + expect(buildE2EDataModeManifest(scenario, [ + { method: 'GET', url: 'http://localhost:8080/client/auth/me' }, + { method: 'GET', url: 'http://127.0.0.1:3210/v1/health' }, + { method: 'GET', url: 'https://id.vectorcontrol.tech/oidc/userinfo' }, + { method: 'POST', url: 'https://api.vectorcontrol.tech/v1/responses' }, + ])).toMatchObject({ + evidence_level: 'approved-real', + dataSource: 'approved-real-preflight', + realLoginTested: true, + realCliOrModelExecuted: true, + mockAdapterUsed: false, + real_tested: true, + requestedBoundaries: ['gateway', 'hub', 'local-edge', 'tokendance-id'], + }); + + expect(validateE2EDataModeScenario({ + ...scenario, + dataMode: 'observed', + mockAdapterUsed: true, + }, [])).toEqual({ + ok: false, + errors: [ + 'approved-real-preflight uses approved-real-preflight but dataMode is observed', + 'approved-real-preflight uses approved-real-preflight but marks mock adapter usage', + ], + }); + }); + it('rejects TokenDance ID and Gateway traffic in Web stubbed Hub replay', () => { const scenario = createE2EDataModeScenario({ name: 'web-stubbed-hub-replay-smoke', diff --git a/app/shared/src/testing/e2eDataModeContract.ts b/app/shared/src/testing/e2eDataModeContract.ts index a0735f44..d9939208 100644 --- a/app/shared/src/testing/e2eDataModeContract.ts +++ b/app/shared/src/testing/e2eDataModeContract.ts @@ -165,6 +165,24 
@@ export function validateE2EDataModeScenario( if (scenario.dataSource === 'stubbed-hub-session' && scenario.tokenDanceIdSecretUsed) { errors.push(`${scenario.name} uses stubbed-hub-session but marks TokenDance ID secret usage`); } + if (scenario.dataSource === 'observed-hub-replay' && scenario.dataMode !== 'observed') { + errors.push(`${scenario.name} uses observed-hub-replay but dataMode is ${scenario.dataMode}`); + } + if (scenario.dataSource === 'observed-hub-replay' && scenario.realLoginTested) { + errors.push(`${scenario.name} uses observed-hub-replay but claims real login was tested`); + } + if (scenario.dataSource === 'observed-hub-replay' && scenario.realCliOrModelExecuted) { + errors.push(`${scenario.name} uses observed-hub-replay but claims real CLI/model execution`); + } + if (scenario.dataSource === 'observed-hub-replay' && scenario.tokenDanceIdSecretUsed) { + errors.push(`${scenario.name} uses observed-hub-replay but marks TokenDance ID secret usage`); + } + if (scenario.dataSource === 'approved-real-preflight' && scenario.dataMode !== 'approved-real') { + errors.push(`${scenario.name} uses approved-real-preflight but dataMode is ${scenario.dataMode}`); + } + if (scenario.dataSource === 'approved-real-preflight' && scenario.mockAdapterUsed) { + errors.push(`${scenario.name} uses approved-real-preflight but marks mock adapter usage`); + } for (const request of requests) { const boundary = classifyE2ERequest(request.url, scenario); diff --git a/app/shared/src/testing/e2eEvidenceManifest.test.ts b/app/shared/src/testing/e2eEvidenceManifest.test.ts index fe5c5fa8..3aa4b04f 100644 --- a/app/shared/src/testing/e2eEvidenceManifest.test.ts +++ b/app/shared/src/testing/e2eEvidenceManifest.test.ts @@ -105,6 +105,93 @@ describe('chat-flow evidence manifest contract', () => { }); }); + it('records observed-local rows as read-only and non-real', () => { + const manifest = buildChatFlowEvidenceManifest({ + scenario: 'desktop-observed-local', + surface: 'desktop', 
+ dataSource: 'observed-hub-replay', + authExecution: 'local-only', + rows: [ + { + id: 'observed-edge-health', + claim: 'Local Edge was observed without model/API spend', + evidenceLevel: 'observed-local', + realTested: false, + status: 'passed', + command: 'pwsh ./scripts/smoke/verify-localhost-real-services.ps1', + }, + ], + }); + + expect(manifest).toMatchObject({ + evidence_levels: ['observed-local'], + real_tested: false, + rows: [ + { + id: 'observed-edge-health', + evidence_level: 'observed-local', + real_tested: false, + }, + ], + }); + expect(validateChatFlowEvidenceManifest(manifest)).toEqual({ ok: true, errors: [] }); + }); + + it('requires approved-real rows to carry approval and real login plus CLI/model evidence claims', () => { + const manifest = buildChatFlowEvidenceManifest({ + scenario: 'approved-real-missing-claims', + surface: 'desktop', + dataSource: 'approved-real-preflight', + authExecution: 'approved-real', + rows: [ + { + id: 'approved-real-row', + claim: 'Approved real path ran', + evidenceLevel: 'approved-real', + realTested: true, + status: 'passed', + command: 'pwsh ./scripts/verify/verify-approved-real-preflight.ps1 -ManifestPath approved.json', + }, + ], + }); + + expect(validateChatFlowEvidenceManifest(manifest)).toEqual({ + ok: false, + errors: [ + 'approved-real-missing-claims row approved-real-row sets real_tested=true without approval_ref', + 'approved-real-missing-claims row approved-real-row sets real_tested=true without real_login claim', + 'approved-real-missing-claims row approved-real-row sets real_tested=true without real_cli_or_model claim', + ], + }); + }); + + it('accepts approved-real rows only when the real evidence claims are explicit', () => { + const manifest = buildChatFlowEvidenceManifest({ + scenario: 'approved-real-gold-path', + surface: 'desktop', + dataSource: 'approved-real-preflight', + authExecution: 'approved-real', + rows: [ + { + id: 'approved-real-row', + claim: 'Approved real login and CLI/model path 
ran', + evidenceLevel: 'approved-real', + realTested: true, + status: 'passed', + command: 'pwsh ./scripts/smoke/verify-p0-approved-real-gold-path.ps1', + approvalRef: 'approval-2026-06-29-001', + claims: { + realLogin: true, + realCliOrModel: true, + }, + }, + ], + }); + + expect(validateChatFlowEvidenceManifest(manifest)).toEqual({ ok: true, errors: [] }); + expect(manifest.real_tested).toBe(true); + }); + it('rejects packaged Desktop and release claims without matching evidence levels', () => { const manifest = buildChatFlowEvidenceManifest({ scenario: 'desktop-vite-chat-flow', @@ -131,8 +218,35 @@ describe('chat-flow evidence manifest contract', () => { ok: false, errors: [ 'desktop-vite-chat-flow row desktop-vite claims packaged Desktop without packaged-release evidence', - 'desktop-vite-chat-flow row desktop-vite claims release upload without release evidence', + 'desktop-vite-chat-flow row desktop-vite claims release upload without packaged-release evidence', ], }); }); + + it('keeps packaged Desktop and release upload claims on packaged-release evidence only', () => { + const manifest = buildChatFlowEvidenceManifest({ + scenario: 'desktop-packaged-release', + surface: 'desktop', + dataSource: 'approved-real-preflight', + authExecution: 'approved-real', + rows: [ + { + id: 'tauri-package', + claim: 'Tauri package policy and release dry gate passed', + evidenceLevel: 'packaged-release', + realTested: false, + status: 'passed', + command: 'pwsh ./scripts/release/verify-tauri-package-dry.ps1', + claims: { + packagedDesktop: true, + releaseUpload: true, + }, + }, + ], + }); + + expect(validateChatFlowEvidenceManifest(manifest)).toEqual({ ok: true, errors: [] }); + expect(manifest.real_tested).toBe(false); + expect(manifest.evidence_levels).toEqual(['packaged-release']); + }); }); diff --git a/app/shared/src/testing/e2eEvidenceManifest.ts b/app/shared/src/testing/e2eEvidenceManifest.ts index 3c1b0c8d..a448fe88 100644 --- 
a/app/shared/src/testing/e2eEvidenceManifest.ts +++ b/app/shared/src/testing/e2eEvidenceManifest.ts @@ -194,6 +194,17 @@ function validateEvidenceRow( if (row.real_tested && row.evidence_level !== REAL_EXECUTION_LEVEL) { errors.push(`${scenario} row ${row.id} uses ${row.evidence_level} evidence but sets real_tested=true`); } + if (row.real_tested && row.evidence_level === REAL_EXECUTION_LEVEL) { + if (!row.approval_ref) { + errors.push(`${scenario} row ${row.id} sets real_tested=true without approval_ref`); + } + if (row.claims?.real_login !== true) { + errors.push(`${scenario} row ${row.id} sets real_tested=true without real_login claim`); + } + if (row.claims?.real_cli_or_model !== true) { + errors.push(`${scenario} row ${row.id} sets real_tested=true without real_cli_or_model claim`); + } + } if (row.claims?.real_login && row.evidence_level !== REAL_EXECUTION_LEVEL) { errors.push(`${scenario} row ${row.id} claims real login without approved-real evidence`); } @@ -203,8 +214,8 @@ function validateEvidenceRow( if (row.claims?.packaged_desktop && row.evidence_level !== 'packaged-release') { errors.push(`${scenario} row ${row.id} claims packaged Desktop without packaged-release evidence`); } - if (row.claims?.release_upload && row.evidence_level !== REAL_EXECUTION_LEVEL) { - errors.push(`${scenario} row ${row.id} claims release upload without release evidence`); + if (row.claims?.release_upload && row.evidence_level !== 'packaged-release') { + errors.push(`${scenario} row ${row.id} claims release upload without packaged-release evidence`); } validateScreenshots(scenario, row, errors); validateMetrics(scenario, row, errors); diff --git a/docs/progress/MASTER.md b/docs/progress/MASTER.md index 9a077d2e..fd2a0646 100644 --- a/docs/progress/MASTER.md +++ b/docs/progress/MASTER.md @@ -112,3 +112,4 @@ Per-task telemetry is stored in GitHub issue comments before task closure. 
Adapt | 2026-06-29 | Phase 3 sync | Updated Phase 3 live state after #406: milestone #19 is 1/3 complete, T3.2 (#387) is the active next task, and adaptive milestone `completed_tasks` is 1. | | 2026-06-29 | T3.2 implementation | Added Desktop entry/workbench data-boundary Playwright coverage, moved chat-flow phase marking to the Demo transition boundary, hardened disabled health polling against in-flight updates, and verified Desktop Vite gates with `real_tested=false`; #408 merged and #387 closed manually because non-default base did not auto-close it. | | 2026-06-29 | Phase 3 sync | Updated Phase 3 live state after #408: milestone #19 is 2/3 complete, adaptive drift_score is 1, #388 has a drift warning, and T3.3 (#388) is the active next task. | +| 2026-06-29 | T3.3 implementation | Hardened observed/approved-real manifest boundaries, kept packaged-release claims separate, aligned smoke-matrix contract checks with current stubbed-Hub replay names, and verified shared/contract gates with `real_tested=false`; PR pending. 
| diff --git a/scripts/verify/verify-real-e2e-contract.ps1 b/scripts/verify/verify-real-e2e-contract.ps1 index 046ff8b9..b6a93d05 100644 --- a/scripts/verify/verify-real-e2e-contract.ps1 +++ b/scripts/verify/verify-real-e2e-contract.ps1 @@ -98,6 +98,27 @@ foreach ($required in @( } } +foreach ($level in @("fixture-unit", "playwright-ui", "visual-qa", "stubbed-hub", "observed-local", "packaged-release")) { + $realTestedTruePattern = '(?s)-EvidenceLevel\s+"' + [regex]::Escape($level) + '".{0,500}-RealTested\s+\$true' + if ($smokeMatrix -match $realTestedTruePattern) { + Fail "smoke matrix must not set real_tested=true for '$level' evidence" + } +} +$loginReadinessRealTestedPattern = '(?s)-Name\s+"login-real-readiness-gate".{0,1200}-EvidenceLevel\s+"approved-real".{0,500}-RealTested\s+\$true' +if ($smokeMatrix -match $loginReadinessRealTestedPattern) { + Fail "login-real-readiness-gate is readiness-only and must keep real_tested=false" +} +foreach ($skillBoundary in @( + 'real_tested: true', + 'approval reference', + 'real-login', + 'real CLI/model/API' +)) { + if ($skill -notmatch [regex]::Escape($skillBoundary)) { + Fail "real-e2e skill must document approved-real row boundary: $skillBoundary" + } +} + $declaredEvidenceLevels = [regex]::Matches($smokeMatrix, '-EvidenceLevel\s+"(?<level>[^"]+)"') | ForEach-Object { $_.Groups["level"].Value } | Sort-Object -Unique diff --git a/tests/contract/scripts/verify-e2e-smoke-matrix.ps1 b/tests/contract/scripts/verify-e2e-smoke-matrix.ps1 index 8352161a..d5a5b2a7 100644 --- a/tests/contract/scripts/verify-e2e-smoke-matrix.ps1 +++ b/tests/contract/scripts/verify-e2e-smoke-matrix.ps1 @@ -94,7 +94,7 @@ try { if (Test-Path -LiteralPath $scriptImplementationPath) { $scriptText = Get-Content -Raw -LiteralPath $scriptImplementationPath Assert-True ($scriptText -match "agenthub-e2e-smoke-matrix-v1") "matrix writes stable schema" - Assert-True ($scriptText -match "web-real-mode-playwright") "matrix includes Web real-mode Playwright row" + 
Assert-True ($scriptText -match "web-stubbed-hub-playwright") "matrix includes Web stubbed-Hub Playwright row" Assert-True ($scriptText -match "desktop-renderer-playwright") "matrix includes Desktop renderer Playwright row" Assert-True ($scriptText -match "localhost-services-smoke") "matrix includes local services smoke row" Assert-True ($scriptText -match "edge-client-smoke") "matrix includes Edge client smoke row" @@ -141,15 +141,16 @@ try { Assert-True ([string]$appPackage.scripts."test:smoke:matrix" -match "verify-e2e-smoke-matrix\.ps1") "app package exposes matrix script" Assert-True ([string]$appPackage.scripts."test:e2e:web" -match "agenthub-web") "app package exposes Web E2E script" Assert-True ([string]$appPackage.scripts."test:e2e:desktop" -match "agenthub-desktop") "app package exposes Desktop E2E script" - Assert-True ([string]$webPackage.scripts."test:e2e:real-mode" -match "web-hub-real-mode-smoke\.spec\.ts" -and [string]$webPackage.scripts."test:e2e:real-mode" -match "task-contract\.spec\.ts") "web package exposes real-mode and replay E2E" + Assert-True ([string]$webPackage.scripts."test:e2e:stubbed-hub" -match "web-stubbed-hub-replay-smoke\.spec\.ts" -and [string]$webPackage.scripts."test:e2e:stubbed-hub" -match "task-contract\.spec\.ts") "web package exposes stubbed-Hub replay E2E" Assert-True ([string]$desktopPackage.scripts."test:e2e:smoke" -match "smoke\.spec\.ts") "desktop package exposes renderer smoke E2E" if (Test-Path -LiteralPath $taskContractSpecPath) { $taskSpecText = Get-Content -Raw -LiteralPath $taskContractSpecPath - Assert-True ($taskSpecText -match "agenthub\.web_task_contract_replay\.v1") "task contract writes replay manifest schema" + Assert-True ($taskSpecText -match "buildE2EDataModeManifest") "task contract writes data-mode replay manifest" + Assert-True ($taskSpecText -match "dataSource:\s*'stubbed-hub-session'") "task contract records stubbed Hub source" Assert-True ($taskSpecText -match "approvalReplayObserved") "task 
contract records approval replay" Assert-True ($taskSpecText -match "artifactReplayObserved") "task contract records artifact replay" - Assert-True ($taskSpecText -match "directLocalEdge:\s*false") "task contract records no direct Local Edge path" + Assert-True ($taskSpecText -notmatch "127\.0\.0\.1:3210|localhost:3210") "task contract does not use direct Local Edge path" } } finally {