From 4651eaa001165022965ccea3df43dca316f1ad05 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jose=20Villase=C3=B1or=20Montfort?= <195970+montfort@users.noreply.github.com> Date: Sun, 3 May 2026 00:05:16 -0600 Subject: [PATCH] =?UTF-8?q?feat(framework):=20Phase=203=20PR=201=20?= =?UTF-8?q?=E2=80=94=20audit=20prompt=20templates=20+=20output=20schema?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit First of 6 PRs implementing Phase 3 (multi-model external audit) + the open frictions F2/F5/F7. Framework-only — no CLI code yet. Artifacts (all under dist/.devtrail/, auto-distributed via the existing recursive manifest pattern): - audit-prompts/auditor-primary.md - audit-prompts/auditor-secondary.md - audit-prompts/calibrator-reconciler.md - schemas/audit-output.schema.v0.json Architectural decision A1 (per the Phase 3 plan): Phase 3 v0 is ORCHESTRATION-ONLY, not an HTTP-API client. The CLI prepares and persists prompts, awaits the operator's responses, validates outputs against the schema, integrates findings into the Charter telemetry — but does NOT invoke any LLM API directly. Adopters paste the resolved prompts into their auditor of choice (Copilot, Gemini, Claude, etc.), save responses to the canonical paths, and the CLI consolidates. Rationale for orchestration-only: - Implementing 3 HTTP clients (OpenAI / Google / Anthropic) is 1-2 weeks of work + perpetual maintenance when APIs change. For an EXPERIMENTAL v0 schema, that investment is premature. - Sentinel's empirical pattern (the 6-cycle dual-audit experiment that motivated Phase 3) ALREADY uses this human-in-the-loop shape via /plan-audit skills. The CLI's value-add is the canon (prompt shape + output schema + telemetry integration), not the API call. - Closes RFC #82 (audit visibility) by design — the prompt-resolution and the auditor's response are both files on disk, version-controlled, inspectable, and reproducible by hand if the API call fails. 
- Aligns with principle #10 (honesty about what the tool does NOT do): "no LLM gateway, no model evaluation". Schema design: - audit-output.schema.v0.json uses oneOf to distinguish auditor outputs (primary/secondary, fresh findings) from calibrator outputs (reconciliation across the two). The `audit_role` field is the discriminator — three fixed roles, not arbitrary N. - findings_by_category enum (hallucination | implementation_gap | real_debt | false_positive) is the same vocabulary used by the external_audit array in charter-telemetry.schema.v0.json. The audit cycle output integrates directly into Charter telemetry at close. - Every output declares prompt_used: <path>, satisfying RFC #82's requirement that the prompt path be discoverable from the output. Prompt design: - Primary and secondary prompts are STRUCTURALLY IDENTICAL. The heterogeneity signal lives in the auditor MODEL (different family per §5.2), not in different prompts. A/B-testing prompt phrasings is forward-looking; v0 keeps them symmetric for clean comparability. - Calibrator prompt assumes both auditor outputs as context and asks for status assignment (agreed | disputed | unique_primary | unique_secondary | rejected) per finding. Status counts cross-check against body section count — the calibrator prompt requires them to add up to findings_consolidated; JSON Schema cannot express that sum, so the schema validates only the shape of the counts, not their consistency. - All three prompts include explicit categorization rules + discipline rules ("don't fabricate findings", "no external sources beyond the prompt"). The rules are duplicated across the three so the auditor doesn't need to consult external documentation. What's NOT in this PR: - No CLI code yet — the `devtrail charter audit` command lands in PR 2. - No heterogeneity validation (`--implementer-family` enforcement) — v1. - No invocation of LLM APIs — orchestration-only by design. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../audit-prompts/auditor-primary.md | 154 ++++++++++++++++ .../audit-prompts/auditor-secondary.md | 131 +++++++++++++ .../audit-prompts/calibrator-reconciler.md | 173 ++++++++++++++++++ .../schemas/audit-output.schema.v0.json | 164 +++++++++++++++++ 4 files changed, 622 insertions(+) create mode 100644 dist/.devtrail/audit-prompts/auditor-primary.md create mode 100644 dist/.devtrail/audit-prompts/auditor-secondary.md create mode 100644 dist/.devtrail/audit-prompts/calibrator-reconciler.md create mode 100644 dist/.devtrail/schemas/audit-output.schema.v0.json diff --git a/dist/.devtrail/audit-prompts/auditor-primary.md b/dist/.devtrail/audit-prompts/auditor-primary.md new file mode 100644 index 0000000..e873c00 --- /dev/null +++ b/dist/.devtrail/audit-prompts/auditor-primary.md @@ -0,0 +1,154 @@ + + +You are an external auditor reviewing the execution of a DevTrail Charter. +Your job is to compare what the Charter declared (ex-ante) against what the +commits actually changed (ex-post) and produce a categorized list of findings. + +You are the **{{audit_role}}** auditor in a dual-audit cycle. Another +auditor of a different model family is being given the same Charter and diff +in parallel. A calibrator-reconciler will later compare your findings against +theirs. Cross-model heterogeneity is the point — your distribution of +training and your blind spots differ from the other auditor's, and that is +what makes the convergence (or disagreement) signal valuable. 
+ +# What you are auditing + +**Charter:** `{{charter_path}}` (`{{charter_id}}` — {{charter_title}}) + +**Git range:** `{{git_range}}` + +**Originating AILOGs** (rationale + emergent risks documented during execution): + +``` +{{ailog_paths}} +``` + +# Charter content + +```markdown +{{charter_content}} +``` + +# AILOG content + +```markdown +{{ailog_contents}} +``` + +# Diff + +```diff +{{git_diff}} +``` + +# What I need from you + +Produce a markdown file with this exact frontmatter shape (validates against +`{{schema_path}}`): + +```yaml +--- +audit_role: auditor-primary +auditor: <model-id-and-version> # e.g., copilot-v1.0.37 +charter_id: {{charter_id}} +git_range: "{{git_range}}" +prompt_used: prompts/auditor-primary.prompt.md +audited_at: <YYYY-MM-DD> +findings_total: <integer> +findings_by_category: + hallucination: <integer> + implementation_gap: <integer> + real_debt: <integer> + false_positive: <integer> +--- + +# Audit: {{charter_id}} by <auditor> + +## Summary + +[1-2 paragraphs: did the execution match the Charter's declared scope? What +is the overall verdict — clean, partial, deviated?] + +## Findings + +### F1 — <category>: <short title> + +**Where:** `<file>:<line>` or `<git-range>` if span-wide. + +**What I observed:** [Concrete description of the gap, hallucination, or +real debt. Cite specific lines from the diff or the AILOGs.] + +**Why I'm flagging it:** [Reasoning. What about the Charter's declaration vs +the diff makes this a finding?] + +### F2 — ... + +[Continue numbering F1...FN. One section per finding.] +``` + +# Categorization rules + +Apply the following categories. The calibrator will use the same definitions: + +- **`hallucination`** — the Charter or implementation references something + that does not exist (an API, a function, a field name, a behavior). The + agent invented it. Verify by reading the diff or the cited file. +- **`implementation_gap`** — the Charter declared work that the diff did + not deliver, OR the diff delivered work the Charter did not declare, + WITHOUT it being documented as drift in the AILOG. 
(If documented in + AILOG under `## Risk` as `R`, that is *not* a gap; the AILOG-aware + drift check already accepts it.) +- **`real_debt`** — code-level concern that is correct as far as the + Charter goes but introduces technical debt or a subtle defect (a missing + error path, a leaky resource, a non-idempotent operation). Adopter is + expected to capture as `TDE` doc post-audit. +- **`false_positive`** — what initially looked like a finding but, on + closer inspection of the AILOGs or the diff context, isn't one. + Document anyway; the calibrator uses these to recognize patterns where + one auditor over-reports. + +# Discipline + +- Cite specific file paths and line numbers from the diff. Do not summarize + abstractly. +- If you cannot find anything substantive, return `findings_total: 0` with + a single `## Summary` paragraph explaining what you reviewed. Empty audits + are valid signal — the calibrator will note convergence with the other + auditor's empty audit, if applicable. +- Do not fabricate findings to seem thorough. The categorization rules + above include `false_positive` precisely because over-reporting is a + real audit failure mode. +- Do not consult external sources beyond what is provided in this prompt. + The audit must be reproducible from the prompt + the diff + the AILOGs + alone. diff --git a/dist/.devtrail/audit-prompts/auditor-secondary.md b/dist/.devtrail/audit-prompts/auditor-secondary.md new file mode 100644 index 0000000..a2651fd --- /dev/null +++ b/dist/.devtrail/audit-prompts/auditor-secondary.md @@ -0,0 +1,131 @@ + + +You are an independent external auditor reviewing the execution of a +DevTrail Charter. You are the **{{audit_role}}** auditor. A primary auditor +of a different model family is reviewing the same Charter and diff in +parallel. The two of you may agree or disagree; both are valuable signal. +A calibrator-reconciler will integrate your findings with the primary's. 
+ +You may have been trained on different data than the primary. Your blind +spots and your priors are different. Audit independently — the value of the +dual-audit comes from convergence on real findings and divergence on +boundary cases, not from echoing the primary auditor. + +# What you are auditing + +**Charter:** `{{charter_path}}` (`{{charter_id}}` — {{charter_title}}) + +**Git range:** `{{git_range}}` + +**Originating AILOGs** (rationale + emergent risks documented during execution): + +``` +{{ailog_paths}} +``` + +# Charter content + +```markdown +{{charter_content}} +``` + +# AILOG content + +```markdown +{{ailog_contents}} +``` + +# Diff + +```diff +{{git_diff}} +``` + +# What I need from you + +Produce a markdown file with this exact frontmatter shape (validates against +`{{schema_path}}`): + +```yaml +--- +audit_role: auditor-secondary +auditor: <model-id-and-version> # e.g., gemini-cli-v1.5 +charter_id: {{charter_id}} +git_range: "{{git_range}}" +prompt_used: prompts/auditor-secondary.prompt.md +audited_at: <YYYY-MM-DD> +findings_total: <integer> +findings_by_category: + hallucination: <integer> + implementation_gap: <integer> + real_debt: <integer> + false_positive: <integer> +--- + +# Audit: {{charter_id}} by <auditor> + +## Summary + +[1-2 paragraphs: did the execution match the Charter's declared scope? +What is the overall verdict?] + +## Findings + +### F1 — <category>: <short title> + +**Where:** `<file>:<line>` or `<git-range>` if span-wide. + +**What I observed:** [Concrete description. Cite specific lines from the +diff or the AILOGs.] + +**Why I'm flagging it:** [Reasoning. What about the Charter's declaration +vs the diff makes this a finding?] + +### F2 — ... + +[One section per finding.] +``` + +# Categorization rules + +Same categories as the primary auditor — the calibrator uses the same +definitions to compare your findings: + +- **`hallucination`** — Charter or implementation references something + that does not exist (invented API, function, field, behavior). Verify + by reading the diff or cited file. 
+- **`implementation_gap`** — Charter declared work the diff did not + deliver (or vice versa) WITHOUT it being documented as drift in the + AILOG. (Documented in AILOG `## Risk` as `R` is *not* a gap.) +- **`real_debt`** — code-level concern not strictly within Charter + scope but introducing debt or a subtle defect (missing error path, + leaky resource, non-idempotent operation). Adopter captures as `TDE`. +- **`false_positive`** — looked like a finding but, on closer reading + of the AILOGs or diff context, isn't. Document anyway; calibrator + uses these to detect over-reporting patterns. + +# Discipline + +- Cite specific file paths and line numbers from the diff. No abstract + summaries. +- If you find nothing substantive, return `findings_total: 0` with a + `## Summary` paragraph explaining your review. Empty is valid signal. +- Do not fabricate findings to seem thorough. Over-reporting is a real + audit failure mode — `false_positive` exists precisely for this case. +- Do not consult external sources beyond this prompt. The audit must be + reproducible from the prompt + diff + AILOGs alone. diff --git a/dist/.devtrail/audit-prompts/calibrator-reconciler.md b/dist/.devtrail/audit-prompts/calibrator-reconciler.md new file mode 100644 index 0000000..1dd4763 --- /dev/null +++ b/dist/.devtrail/audit-prompts/calibrator-reconciler.md @@ -0,0 +1,173 @@ + + +You are the **calibrator-reconciler** of a DevTrail dual-audit cycle. Two +external auditors of different model families have already reviewed the +Charter; their outputs are below. Your job is to apply the categorization +schema definitionally, recognize agreement and disagreement, and produce a +consolidated list of findings that the Charter's telemetry can record. + +You are not auditing fresh. You are reading two audits and reconciling them. 
+ +# What you are reconciling + +**Charter:** `{{charter_path}}` (`{{charter_id}}` — {{charter_title}}) + +**Git range:** `{{git_range}}` + +# Charter content + +```markdown +{{charter_content}} +``` + +# Originating AILOGs + +``` +{{ailog_paths}} +``` + +```markdown +{{ailog_contents}} +``` + +# Auditor PRIMARY output + +```markdown +{{auditor_primary_findings}} +``` + +# Auditor SECONDARY output + +```markdown +{{auditor_secondary_findings}} +``` + +# What I need from you + +Produce a markdown file with this exact frontmatter shape (validates against +`{{schema_path}}`): + +```yaml +--- +audit_role: calibrator-reconciler +calibrator: <model-id> # e.g., claude-opus-4 +charter_id: {{charter_id}} +git_range: "{{git_range}}" +prompt_used: prompts/calibrator-reconciler.prompt.md +calibrated_at: <YYYY-MM-DD> +auditors_reconciled: + - auditor-primary.md + - auditor-secondary.md +findings_consolidated: <integer> +findings_by_status: + agreed: <integer> # both auditors flagged the same finding with the same category + disputed: <integer> # both flagged but disagreed on category — you picked + unique_primary: <integer> # only primary; you validated as legitimate + unique_secondary: <integer> # only secondary; you validated as legitimate + rejected: <integer> # one or both flagged, but you determined it is not a finding +--- + +# Calibration: {{charter_id}} + +## Reconciliation summary + +[1-2 paragraphs: how convergent were the auditors? Where did they +disagree, and on what kind of finding? Did one auditor have a higher +false-positive rate?] + +## Reconciled findings + +### C1 — <category>: <short title> + +**Status:** agreed | disputed | unique_primary | unique_secondary | rejected. + +**Where:** `<file>:<line>`. + +**What was observed:** [Combine the auditors' descriptions. If they +disagreed, note both views and your resolution.] + +**Calibration rationale:** [Why this status. If `agreed`, name what each +auditor said. If `disputed`, name the disagreement and your call. If +`unique_*`, explain why you validated. If `rejected`, explain why the +flagging auditor(s) were wrong.] + +### C2 — ... + +[One section per consolidated finding. 
Numbering C1...CN is independent +of the F1...FN numbering each auditor used; cross-reference auditor +numbering inside each section as needed.] +``` + +# Categorization rules (same as the auditors) + +- **`hallucination`** — invented API, function, field, behavior. +- **`implementation_gap`** — declared but not delivered (or vice versa) + WITHOUT being documented in AILOG as drift. +- **`real_debt`** — code-level debt or subtle defect outside Charter scope. +- **`false_positive`** — appeared to be a finding but isn't. + +# Status assignment rules + +For each distinct finding (deduplicate when both auditors describe the +same gap with different wording): + +- `agreed` — both auditors flagged it AND assigned the same category. + Strongest signal — the convergence between heterogeneous auditors is + what makes a dual-audit valuable. +- `disputed` — both auditors flagged it BUT assigned different categories + (e.g., primary calls it `implementation_gap`, secondary calls it + `hallucination`). You pick the category that fits the schema definitions + best, given the diff excerpts the auditors cite and the AILOGs. +- `unique_primary` / `unique_secondary` — only one auditor flagged it, + AND on your reading, they were correct to flag it. +- `rejected` — one or both auditors flagged it, but on closer reading + of the AILOGs (especially `## Risk` `R` documented mitigations) + or of the diff excerpts the auditors cite, it isn't a finding. Both `unique` flags can become + `rejected` if the unique auditor was wrong. + +# Discipline + +- Use the `findings_by_status` counts as a cross-check against your + body sections. They must add up to `findings_consolidated`. +- Do not introduce findings the auditors did not see. If you spot + something they missed, document it in `## Reconciliation summary` as + an observation, not as a `C<N>` finding. Fresh findings are out of + scope for the calibrator role — that's what the next audit cycle is for. 
+- The `rejected` count is signal worth tracking — it tells the Charter + author which audit categories tend to over-report on this kind of + Charter, which improves future audit prompt design. +- Do not consult external sources beyond what is provided. The + reconciliation must be reproducible from the prompt + the two auditor + outputs + the Charter + the AILOGs. diff --git a/dist/.devtrail/schemas/audit-output.schema.v0.json b/dist/.devtrail/schemas/audit-output.schema.v0.json new file mode 100644 index 0000000..b629138 --- /dev/null +++ b/dist/.devtrail/schemas/audit-output.schema.v0.json @@ -0,0 +1,164 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://devtrail.dev/schemas/audit-output.schema.v0.json", + "title": "DevTrail Charter Audit Output (experimental v0)", + "description": "Frontmatter schema for the markdown files that auditors and the calibrator-reconciler produce during a `devtrail charter audit` cycle. The shape distinguishes auditor outputs (primary/secondary, fresh findings) from calibrator outputs (reconciliation across auditor outputs) via the `audit_role` field — one of three fixed roles, not arbitrary N.", + "$comment": "EXPERIMENTAL v0. Phase 3 of the CLI roadmap. Crystallized from the dual-audit pattern validated empirically in Sentinel (Copilot + Gemini + claude-analisis across 6 cycles). Will not stabilize to v1.0 until validated in a second domain. The schema is for the YAML inside the file's frontmatter; the body is free-form markdown by convention. 
The findings_by_category enum (hallucination | implementation_gap | real_debt | false_positive) is the same one used by `external_audit` in charter-telemetry.schema.v0.json — the audit cycle output integrates directly into the Charter telemetry at close time.", + "type": "object", + "oneOf": [ + { "$ref": "#/$defs/auditorOutput" }, + { "$ref": "#/$defs/calibratorOutput" } + ], + "$defs": { + "auditorOutput": { + "type": "object", + "required": [ + "audit_role", + "auditor", + "charter_id", + "git_range", + "prompt_used", + "audited_at", + "findings_total", + "findings_by_category" + ], + "additionalProperties": true, + "properties": { + "audit_role": { + "type": "string", + "enum": ["auditor-primary", "auditor-secondary"], + "description": "Slot this output occupies in the dual-audit. Primary and secondary are heterogeneous by design (see CLI-REFERENCE.md `devtrail charter audit` for the inter-family recommendation)." + }, + "auditor": { + "type": "string", + "minLength": 1, + "description": "Model identifier and version of the auditor that produced this output (e.g., `copilot-v1.0.37`, `gemini-cli-v1.5`, `claude-opus-4`). Free-form so adopters can use any model identifier." + }, + "charter_id": { + "type": "string", + "pattern": "^CHARTER-[0-9]{2,}(-[a-z0-9-]+)?$" + }, + "git_range": { + "type": "string", + "minLength": 1, + "description": "Git revision range the auditor reviewed (e.g., `origin/main..HEAD`)." + }, + "prompt_used": { + "type": "string", + "minLength": 1, + "description": "Path (relative to the audit directory) of the resolved prompt file the auditor was given. Required so the operator can verify the prompt wasn't tampered with mid-cycle. Closes RFC #82." 
+ }, + "audited_at": { + "type": "string", + "format": "date" + }, + "findings_total": { + "type": "integer", + "minimum": 0 + }, + "findings_by_category": { + "type": "object", + "additionalProperties": false, + "properties": { + "hallucination": { "type": "integer", "minimum": 0 }, + "implementation_gap": { "type": "integer", "minimum": 0 }, + "real_debt": { "type": "integer", "minimum": 0 }, + "false_positive": { "type": "integer", "minimum": 0 } + } + }, + "audit_quality": { + "type": "string", + "enum": ["high", "medium", "low"], + "description": "Operator's calibration of this auditor's output quality. Optional — set when consolidating into telemetry." + } + } + }, + "calibratorOutput": { + "type": "object", + "required": [ + "audit_role", + "calibrator", + "charter_id", + "git_range", + "prompt_used", + "calibrated_at", + "auditors_reconciled", + "findings_consolidated" + ], + "additionalProperties": true, + "properties": { + "audit_role": { + "type": "string", + "const": "calibrator-reconciler" + }, + "calibrator": { + "type": "string", + "minLength": 1, + "description": "Model identifier of the calibrator. May be of any family — even the same as the implementer — because the calibrator's job is to apply the schema definitionally over already-produced auditor verdicts, not to discover gaps. Heterogeneity matters for the auditor pair, not the calibrator (see roadmap §5.2)." + }, + "charter_id": { + "type": "string", + "pattern": "^CHARTER-[0-9]{2,}(-[a-z0-9-]+)?$" + }, + "git_range": { + "type": "string", + "minLength": 1 + }, + "prompt_used": { + "type": "string", + "minLength": 1 + }, + "calibrated_at": { + "type": "string", + "format": "date" + }, + "auditors_reconciled": { + "type": "array", + "minItems": 2, + "maxItems": 2, + "items": { + "type": "string", + "minLength": 1, + "description": "Path (relative to audit dir) of an auditor output file this calibration considered." 
+ }, + "description": "Exactly two auditor outputs are reconciled in v0 (dual-audit pattern). N≥3 audit ensembles are forward-looking." + }, + "findings_consolidated": { + "type": "integer", + "minimum": 0, + "description": "Total distinct findings considered during reconciliation, after deduplication across both auditors. Equals the sum of the `findings_by_status` counts, including `rejected`." + }, + "findings_by_status": { + "type": "object", + "additionalProperties": false, + "properties": { + "agreed": { + "type": "integer", + "minimum": 0, + "description": "Both auditors flagged the same finding and assigned the same category (highest signal — convergence)." + }, + "disputed": { + "type": "integer", + "minimum": 0, + "description": "Auditors flagged but disagreed on category; calibrator picked one." + }, + "unique_primary": { + "type": "integer", + "minimum": 0, + "description": "Only the primary auditor flagged this; calibrator validated as legitimate." + }, + "unique_secondary": { + "type": "integer", + "minimum": 0 + }, + "rejected": { + "type": "integer", + "minimum": 0, + "description": "One or both auditors flagged it, but the calibrator determined it is not a finding." + } + } + } + } + } + } +}