Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .agents/skills/real-e2e-acceptance/SKILL.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ Smoke matrix manifests must be machine-honest:
- Skipped gates may appear only under `skipped_evidence_levels` and row-level `status: "skipped"`.
- Stubbed Hub rows use `evidence_level: "stubbed-hub"` and `real_tested: false`; they must not be named or reported as real login/model/API execution.
- Desktop Vite rows use Playwright/UI wording; packaged Desktop claims require a separate packaged-release row.
- Rows that set `evidence_level: "approved-real"` and `real_tested: true` must include an explicit approval reference plus real-login and real CLI/model/API evidence claims. Readiness-only approved-real preflight rows keep `real_tested: false`.

## Gate Matrix

Expand Down
95 changes: 95 additions & 0 deletions app/shared/src/testing/e2eDataModeContract.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -250,6 +250,101 @@ describe('e2e data-mode contract', () => {
});
});

// An observed Hub replay that only issues a GET against the Hub must validate
// cleanly and produce a manifest that reports no real login, model, or secret use.
it('keeps observed replay read-only and non-real', () => {
  const observedReplay = createE2EDataModeScenario({
    name: 'desktop-observed-replay',
    surface: 'desktop',
    dataMode: 'observed',
    dataSource: 'observed-hub-replay',
    appOrigin: 'http://127.0.0.1:5199',
    hubOrigin: 'http://localhost:8080',
  });

  // Validation first: a single Hub read is an allowed request for this scenario.
  const validation = validateE2EDataModeScenario(observedReplay, [
    { method: 'GET', url: 'http://localhost:8080/client/sessions/session-1/messages' },
  ]);
  expect(validation).toEqual({ ok: true, errors: [] });

  // The manifest built from the same traffic must keep every "real" flag false.
  const replayManifest = buildE2EDataModeManifest(observedReplay, [
    { method: 'GET', url: 'http://localhost:8080/client/sessions/session-1/messages' },
  ]);
  expect(replayManifest).toMatchObject({
    evidence_level: 'observed-local',
    dataSource: 'observed-hub-replay',
    realLoginTested: false,
    realCliOrModelExecuted: false,
    tokenDanceIdSecretUsed: false,
    mockAdapterUsed: false,
    real_tested: false,
    requestedBoundaries: ['hub'],
  });
});

it('rejects observed replay that claims login, model execution, secrets, or the wrong mode', () => {
const scenario = createE2EDataModeScenario({
name: 'desktop-observed-replay',
surface: 'desktop',
dataMode: 'observed',
dataSource: 'observed-hub-replay',
appOrigin: 'http://127.0.0.1:5199',
hubOrigin: 'http://localhost:8080',
});

expect(validateE2EDataModeScenario({
...scenario,
dataMode: 'approved-real',
realLoginTested: true,
realCliOrModelExecuted: true,
tokenDanceIdSecretUsed: true,
}, [])).toEqual({
ok: false,
errors: [
'desktop-observed-replay uses observed-hub-replay but dataMode is approved-real',
'desktop-observed-replay uses observed-hub-replay but claims real login was tested',
'desktop-observed-replay uses observed-hub-replay but claims real CLI/model execution',
'desktop-observed-replay uses observed-hub-replay but marks TokenDance ID secret usage',
],
});
Comment on lines +296 to +304

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

📐 Maintainability & Code Quality | 🟡 Minor | ⚡ Quick win

Keep the failure assertions semantic, not prose-coupled.

These cases lock the suite to exact validator wording instead of the contract violation being reported. That makes harmless message edits look like regressions. As per coding guidelines, "**/*.{ts,tsx,js,jsx}: Do not write tests that merely replicate implementation branches, assert constant strings, hard-code error text as behavior, or mock the function under test itself."

Also applies to: 339-344

🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@app/shared/src/testing/e2eDataModeContract.test.ts` around lines 296 - 304,
The e2e data mode contract tests are asserting exact validator error prose,
which makes them brittle to wording-only changes. Update the affected
expectations in the e2eDataModeContract.test suite to verify the semantic
violation shape returned by the validator (for example, the specific failure
categories or identifiers in the result from the contract check) rather than
matching hard-coded full error strings. Keep the existing test cases for
desktop-observed-replay, but assert the meaningful contract fields from the
validator output instead of exact message text.

Source: Coding guidelines

});

// An approved-real preflight scenario reports real evidence across the gateway,
// Hub, Local Edge, and TokenDance ID boundaries, and is rejected once the same
// scenario is relabelled as observed or flagged as mock-backed.
it('keeps approved-real preflight separate from mock and readiness-only evidence', () => {
  const preflight = createE2EDataModeScenario({
    name: 'approved-real-preflight',
    surface: 'desktop',
    dataMode: 'approved-real',
    dataSource: 'approved-real-preflight',
    appOrigin: 'http://127.0.0.1:5199',
    hubOrigin: 'http://localhost:8080',
    realLoginTested: true,
    realCliOrModelExecuted: true,
    directLocalEdge: true,
  });

  // One request per boundary the manifest is expected to list.
  const preflightManifest = buildE2EDataModeManifest(preflight, [
    { method: 'GET', url: 'http://localhost:8080/client/auth/me' },
    { method: 'GET', url: 'http://127.0.0.1:3210/v1/health' },
    { method: 'GET', url: 'https://id.vectorcontrol.tech/oidc/userinfo' },
    { method: 'POST', url: 'https://api.vectorcontrol.tech/v1/responses' },
  ]);
  expect(preflightManifest).toMatchObject({
    evidence_level: 'approved-real',
    dataSource: 'approved-real-preflight',
    realLoginTested: true,
    realCliOrModelExecuted: true,
    mockAdapterUsed: false,
    real_tested: true,
    requestedBoundaries: ['gateway', 'hub', 'local-edge', 'tokendance-id'],
  });

  // Same data source, but with a contradicting mode and a mock adapter flag.
  const relabelled = validateE2EDataModeScenario(
    {
      ...preflight,
      dataMode: 'observed',
      mockAdapterUsed: true,
    },
    [],
  );
  expect(relabelled).toEqual({
    ok: false,
    errors: [
      'approved-real-preflight uses approved-real-preflight but dataMode is observed',
      'approved-real-preflight uses approved-real-preflight but marks mock adapter usage',
    ],
  });
});

it('rejects TokenDance ID and Gateway traffic in Web stubbed Hub replay', () => {
const scenario = createE2EDataModeScenario({
name: 'web-stubbed-hub-replay-smoke',
Expand Down
18 changes: 18 additions & 0 deletions app/shared/src/testing/e2eDataModeContract.ts
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,24 @@ export function validateE2EDataModeScenario(
if (scenario.dataSource === 'stubbed-hub-session' && scenario.tokenDanceIdSecretUsed) {
errors.push(`${scenario.name} uses stubbed-hub-session but marks TokenDance ID secret usage`);
}
if (scenario.dataSource === 'observed-hub-replay' && scenario.dataMode !== 'observed') {
errors.push(`${scenario.name} uses observed-hub-replay but dataMode is ${scenario.dataMode}`);
}
if (scenario.dataSource === 'observed-hub-replay' && scenario.realLoginTested) {
errors.push(`${scenario.name} uses observed-hub-replay but claims real login was tested`);
}
if (scenario.dataSource === 'observed-hub-replay' && scenario.realCliOrModelExecuted) {
errors.push(`${scenario.name} uses observed-hub-replay but claims real CLI/model execution`);
}
if (scenario.dataSource === 'observed-hub-replay' && scenario.tokenDanceIdSecretUsed) {
errors.push(`${scenario.name} uses observed-hub-replay but marks TokenDance ID secret usage`);
}
if (scenario.dataSource === 'approved-real-preflight' && scenario.dataMode !== 'approved-real') {
errors.push(`${scenario.name} uses approved-real-preflight but dataMode is ${scenario.dataMode}`);
}
if (scenario.dataSource === 'approved-real-preflight' && scenario.mockAdapterUsed) {
errors.push(`${scenario.name} uses approved-real-preflight but marks mock adapter usage`);
}

for (const request of requests) {
const boundary = classifyE2ERequest(request.url, scenario);
Expand Down
116 changes: 115 additions & 1 deletion app/shared/src/testing/e2eEvidenceManifest.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,93 @@ describe('chat-flow evidence manifest contract', () => {
});
});

// A manifest whose only row is observed-local evidence must report
// real_tested=false at both manifest and row level, and must validate.
it('records observed-local rows as read-only and non-real', () => {
  const observedManifest = buildChatFlowEvidenceManifest({
    scenario: 'desktop-observed-local',
    surface: 'desktop',
    dataSource: 'observed-hub-replay',
    authExecution: 'local-only',
    rows: [
      {
        id: 'observed-edge-health',
        claim: 'Local Edge was observed without model/API spend',
        evidenceLevel: 'observed-local',
        realTested: false,
        status: 'passed',
        command: 'pwsh ./scripts/smoke/verify-localhost-real-services.ps1',
      },
    ],
  });

  // Shape check: the builder emits snake_case evidence fields.
  expect(observedManifest).toMatchObject({
    evidence_levels: ['observed-local'],
    real_tested: false,
    rows: [
      {
        id: 'observed-edge-health',
        evidence_level: 'observed-local',
        real_tested: false,
      },
    ],
  });

  const validation = validateChatFlowEvidenceManifest(observedManifest);
  expect(validation).toEqual({ ok: true, errors: [] });
});

it('requires approved-real rows to carry approval and real login plus CLI/model evidence claims', () => {
const manifest = buildChatFlowEvidenceManifest({
scenario: 'approved-real-missing-claims',
surface: 'desktop',
dataSource: 'approved-real-preflight',
authExecution: 'approved-real',
rows: [
{
id: 'approved-real-row',
claim: 'Approved real path ran',
evidenceLevel: 'approved-real',
realTested: true,
status: 'passed',
command: 'pwsh ./scripts/verify/verify-approved-real-preflight.ps1 -ManifestPath approved.json',
},
],
});

expect(validateChatFlowEvidenceManifest(manifest)).toEqual({
ok: false,
errors: [
'approved-real-missing-claims row approved-real-row sets real_tested=true without approval_ref',
'approved-real-missing-claims row approved-real-row sets real_tested=true without real_login claim',
'approved-real-missing-claims row approved-real-row sets real_tested=true without real_cli_or_model claim',
],
});
Comment on lines +158 to +165

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

📐 Maintainability & Code Quality | 🟡 Minor | ⚡ Quick win

Avoid pinning validator prose in these tests.

These assertions hard-code the full validation messages, so wording-only changes will fail the suite even when the contract is unchanged. Prefer stable machine-readable reasons/codes, or at least predicate-based checks for the relevant invariant instead of exact prose. As per coding guidelines, "**/*.{ts,tsx,js,jsx}: Do not write tests that merely replicate implementation branches, assert constant strings, hard-code error text as behavior, or mock the function under test itself."

Also applies to: 221-223

🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@app/shared/src/testing/e2eEvidenceManifest.test.ts` around lines 158 - 165,
The test for validateChatFlowEvidenceManifest is pinning exact validator prose,
making wording-only changes fail unnecessarily. Update the assertions to check
stable machine-readable error codes/reasons, or use predicate-based matching for
the expected invariant instead of comparing full error strings. Keep the
coverage around the approved-real-missing-claims cases, but make the checks
resilient to message wording changes in e2eEvidenceManifest.test.

Source: Coding guidelines

});

// An approved-real row that carries an approval reference plus explicit
// real-login and real CLI/model claims validates and marks the manifest real.
it('accepts approved-real rows only when the real evidence claims are explicit', () => {
  const goldPathManifest = buildChatFlowEvidenceManifest({
    scenario: 'approved-real-gold-path',
    surface: 'desktop',
    dataSource: 'approved-real-preflight',
    authExecution: 'approved-real',
    rows: [
      {
        id: 'approved-real-row',
        claim: 'Approved real login and CLI/model path ran',
        evidenceLevel: 'approved-real',
        realTested: true,
        status: 'passed',
        command: 'pwsh ./scripts/smoke/verify-p0-approved-real-gold-path.ps1',
        approvalRef: 'approval-2026-06-29-001',
        claims: {
          realLogin: true,
          realCliOrModel: true,
        },
      },
    ],
  });

  const validation = validateChatFlowEvidenceManifest(goldPathManifest);
  expect(validation).toEqual({ ok: true, errors: [] });

  const { real_tested: manifestRealTested } = goldPathManifest;
  expect(manifestRealTested).toBe(true);
});

it('rejects packaged Desktop and release claims without matching evidence levels', () => {
const manifest = buildChatFlowEvidenceManifest({
scenario: 'desktop-vite-chat-flow',
Expand All @@ -131,8 +218,35 @@ describe('chat-flow evidence manifest contract', () => {
ok: false,
errors: [
'desktop-vite-chat-flow row desktop-vite claims packaged Desktop without packaged-release evidence',
'desktop-vite-chat-flow row desktop-vite claims release upload without release evidence',
'desktop-vite-chat-flow row desktop-vite claims release upload without packaged-release evidence',
],
});
});

// A packaged-release row may claim both packaged Desktop and release upload
// while staying real_tested=false; the manifest lists only that evidence level.
it('keeps packaged Desktop and release upload claims on packaged-release evidence only', () => {
  const packagedManifest = buildChatFlowEvidenceManifest({
    scenario: 'desktop-packaged-release',
    surface: 'desktop',
    dataSource: 'approved-real-preflight',
    authExecution: 'approved-real',
    rows: [
      {
        id: 'tauri-package',
        claim: 'Tauri package policy and release dry gate passed',
        evidenceLevel: 'packaged-release',
        realTested: false,
        status: 'passed',
        command: 'pwsh ./scripts/release/verify-tauri-package-dry.ps1',
        claims: {
          packagedDesktop: true,
          releaseUpload: true,
        },
      },
    ],
  });

  const validation = validateChatFlowEvidenceManifest(packagedManifest);
  expect(validation).toEqual({ ok: true, errors: [] });

  // Packaged-release evidence is not real execution evidence.
  expect(packagedManifest.real_tested).toBe(false);
  expect(packagedManifest.evidence_levels).toEqual(['packaged-release']);
});
});
15 changes: 13 additions & 2 deletions app/shared/src/testing/e2eEvidenceManifest.ts
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,17 @@ function validateEvidenceRow(
if (row.real_tested && row.evidence_level !== REAL_EXECUTION_LEVEL) {
errors.push(`${scenario} row ${row.id} uses ${row.evidence_level} evidence but sets real_tested=true`);
}
if (row.real_tested && row.evidence_level === REAL_EXECUTION_LEVEL) {
if (!row.approval_ref) {
errors.push(`${scenario} row ${row.id} sets real_tested=true without approval_ref`);
}
if (row.claims?.real_login !== true) {
errors.push(`${scenario} row ${row.id} sets real_tested=true without real_login claim`);
}
if (row.claims?.real_cli_or_model !== true) {
errors.push(`${scenario} row ${row.id} sets real_tested=true without real_cli_or_model claim`);
}
}
if (row.claims?.real_login && row.evidence_level !== REAL_EXECUTION_LEVEL) {
errors.push(`${scenario} row ${row.id} claims real login without approved-real evidence`);
}
Expand All @@ -203,8 +214,8 @@ function validateEvidenceRow(
if (row.claims?.packaged_desktop && row.evidence_level !== 'packaged-release') {
errors.push(`${scenario} row ${row.id} claims packaged Desktop without packaged-release evidence`);
}
if (row.claims?.release_upload && row.evidence_level !== REAL_EXECUTION_LEVEL) {
errors.push(`${scenario} row ${row.id} claims release upload without release evidence`);
if (row.claims?.release_upload && row.evidence_level !== 'packaged-release') {
errors.push(`${scenario} row ${row.id} claims release upload without packaged-release evidence`);
}
validateScreenshots(scenario, row, errors);
validateMetrics(scenario, row, errors);
Expand Down
1 change: 1 addition & 0 deletions docs/progress/MASTER.md
Original file line number Diff line number Diff line change
Expand Up @@ -112,3 +112,4 @@ Per-task telemetry is stored in GitHub issue comments before task closure. Adapt
| 2026-06-29 | Phase 3 sync | Updated Phase 3 live state after #406: milestone #19 is 1/3 complete, T3.2 (#387) is the active next task, and adaptive milestone `completed_tasks` is 1. |
| 2026-06-29 | T3.2 implementation | Added Desktop entry/workbench data-boundary Playwright coverage, moved chat-flow phase marking to the Demo transition boundary, hardened disabled health polling against in-flight updates, and verified Desktop Vite gates with `real_tested=false`; #408 merged and #387 closed manually because non-default base did not auto-close it. |
| 2026-06-29 | Phase 3 sync | Updated Phase 3 live state after #408: milestone #19 is 2/3 complete, adaptive drift_score is 1, #388 has a drift warning, and T3.3 (#388) is the active next task. |
| 2026-06-29 | T3.3 implementation | Hardened observed/approved-real manifest boundaries, kept packaged-release claims separate, aligned smoke-matrix contract checks with current stubbed-Hub replay names, and verified shared/contract gates with `real_tested=false`; PR pending. |
21 changes: 21 additions & 0 deletions scripts/verify/verify-real-e2e-contract.ps1
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,27 @@ foreach ($required in @(
}
}

# Every evidence level listed here (i.e. everything except "approved-real") must
# never be paired with -RealTested $true in the smoke matrix text.
foreach ($level in @("fixture-unit", "playwright-ui", "visual-qa", "stubbed-hub", "observed-local", "packaged-release")) {
# (?s) lets '.' span newlines; the pattern matches -RealTested $true appearing
# within 500 characters after the -EvidenceLevel "<level>" argument.
# NOTE(review): the 500-character window is not bounded to a single row, so a
# -RealTested $true belonging to a nearby later row could also match — confirm
# against the smoke matrix row layout.
$realTestedTruePattern = '(?s)-EvidenceLevel\s+"' + [regex]::Escape($level) + '".{0,500}-RealTested\s+\$true'
if ($smokeMatrix -match $realTestedTruePattern) {
Fail "smoke matrix must not set real_tested=true for '$level' evidence"
}
}
# The login readiness gate is allowed to be approved-real evidence, but only as a
# readiness check: it fails the contract if it is followed by -RealTested $true.
$loginReadinessRealTestedPattern = '(?s)-Name\s+"login-real-readiness-gate".{0,1200}-EvidenceLevel\s+"approved-real".{0,500}-RealTested\s+\$true'
if ($smokeMatrix -match $loginReadinessRealTestedPattern) {
Fail "login-real-readiness-gate is readiness-only and must keep real_tested=false"
}
# Each phrase below must appear literally (regex-escaped before matching) in $skill.
# Presumably $skill holds the real-e2e-acceptance SKILL.md text — it is loaded
# outside this excerpt; verify against the script's earlier lines.
foreach ($skillBoundary in @(
'real_tested: true',
'approval reference',
'real-login',
'real CLI/model/API'
)) {
if ($skill -notmatch [regex]::Escape($skillBoundary)) {
Fail "real-e2e skill must document approved-real row boundary: $skillBoundary"
}
}

$declaredEvidenceLevels = [regex]::Matches($smokeMatrix, '-EvidenceLevel\s+"(?<level>[^"]+)"') |
ForEach-Object { $_.Groups["level"].Value } |
Sort-Object -Unique
Expand Down
9 changes: 5 additions & 4 deletions tests/contract/scripts/verify-e2e-smoke-matrix.ps1
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ try {
if (Test-Path -LiteralPath $scriptImplementationPath) {
$scriptText = Get-Content -Raw -LiteralPath $scriptImplementationPath
Assert-True ($scriptText -match "agenthub-e2e-smoke-matrix-v1") "matrix writes stable schema"
Assert-True ($scriptText -match "web-real-mode-playwright") "matrix includes Web real-mode Playwright row"
Assert-True ($scriptText -match "web-stubbed-hub-playwright") "matrix includes Web stubbed-Hub Playwright row"
Assert-True ($scriptText -match "desktop-renderer-playwright") "matrix includes Desktop renderer Playwright row"
Assert-True ($scriptText -match "localhost-services-smoke") "matrix includes local services smoke row"
Assert-True ($scriptText -match "edge-client-smoke") "matrix includes Edge client smoke row"
Expand Down Expand Up @@ -141,15 +141,16 @@ try {
Assert-True ([string]$appPackage.scripts."test:smoke:matrix" -match "verify-e2e-smoke-matrix\.ps1") "app package exposes matrix script"
Assert-True ([string]$appPackage.scripts."test:e2e:web" -match "agenthub-web") "app package exposes Web E2E script"
Assert-True ([string]$appPackage.scripts."test:e2e:desktop" -match "agenthub-desktop") "app package exposes Desktop E2E script"
Assert-True ([string]$webPackage.scripts."test:e2e:real-mode" -match "web-hub-real-mode-smoke\.spec\.ts" -and [string]$webPackage.scripts."test:e2e:real-mode" -match "task-contract\.spec\.ts") "web package exposes real-mode and replay E2E"
Assert-True ([string]$webPackage.scripts."test:e2e:stubbed-hub" -match "web-stubbed-hub-replay-smoke\.spec\.ts" -and [string]$webPackage.scripts."test:e2e:stubbed-hub" -match "task-contract\.spec\.ts") "web package exposes stubbed-Hub replay E2E"
Assert-True ([string]$desktopPackage.scripts."test:e2e:smoke" -match "smoke\.spec\.ts") "desktop package exposes renderer smoke E2E"

if (Test-Path -LiteralPath $taskContractSpecPath) {
$taskSpecText = Get-Content -Raw -LiteralPath $taskContractSpecPath
Assert-True ($taskSpecText -match "agenthub\.web_task_contract_replay\.v1") "task contract writes replay manifest schema"
Assert-True ($taskSpecText -match "buildE2EDataModeManifest") "task contract writes data-mode replay manifest"
Assert-True ($taskSpecText -match "dataSource:\s*'stubbed-hub-session'") "task contract records stubbed Hub source"
Assert-True ($taskSpecText -match "approvalReplayObserved") "task contract records approval replay"
Assert-True ($taskSpecText -match "artifactReplayObserved") "task contract records artifact replay"
Assert-True ($taskSpecText -match "directLocalEdge:\s*false") "task contract records no direct Local Edge path"
Assert-True ($taskSpecText -notmatch "127\.0\.0\.1:3210|localhost:3210") "task contract does not use direct Local Edge path"
}
}
finally {
Expand Down
Loading