diff --git a/dist/.devtrail/audit-prompts/auditor-primary.md b/dist/.devtrail/audit-prompts/auditor-primary.md new file mode 100644 index 0000000..e873c00 --- /dev/null +++ b/dist/.devtrail/audit-prompts/auditor-primary.md @@ -0,0 +1,154 @@ + + +You are an external auditor reviewing the execution of a DevTrail Charter. +Your job is to compare what the Charter declared (ex-ante) against what the +commits actually changed (ex-post) and produce a categorized list of findings. + +You are the **{{audit_role}}** auditor in a dual-audit cycle. Another +auditor of a different model family is being given the same Charter and diff +in parallel. A calibrator-reconciler will later compare your findings against +theirs. Cross-model heterogeneity is the point — your training distribution +and your blind spots differ from the other auditor's, and that is +what makes the convergence (or disagreement) signal valuable. + +# What you are auditing + +**Charter:** `{{charter_path}}` (`{{charter_id}}` — {{charter_title}}) + +**Git range:** `{{git_range}}` + +**Originating AILOGs** (rationale + emergent risks documented during execution): + +``` +{{ailog_paths}} +``` + +# Charter content + +```markdown +{{charter_content}} +``` + +# AILOG content + +```markdown +{{ailog_contents}} +``` + +# Diff + +```diff +{{git_diff}} +``` + +# What I need from you + +Produce a markdown file with this exact frontmatter shape (validates against +`{{schema_path}}`): + +```yaml +--- +audit_role: auditor-primary +auditor: # e.g., copilot-v1.0.37 +charter_id: {{charter_id}} +git_range: "{{git_range}}" +prompt_used: prompts/auditor-primary.prompt.md +audited_at: # YYYY-MM-DD +findings_total: +findings_by_category: + hallucination: + implementation_gap: + real_debt: + false_positive: +--- + +# Audit: {{charter_id}} by <auditor> + +## Summary + +[1-2 paragraphs: did the execution match the Charter's declared scope? What +is the overall verdict — clean, partial, deviated?] 
+ +## Findings + +### F1 — <category>: <short title> + +**Where:** `<file>:<line>` or `<file>` if span-wide. + +**What I observed:** [Concrete description of the gap, hallucination, or +real debt. Cite specific lines from the diff or the AILOGs.] + +**Why I'm flagging it:** [Reasoning. What about the Charter's declaration vs +the diff makes this a finding?] + +### F2 — ... + +[Continue numbering F1...FN. One section per finding.] +``` + +# Categorization rules + +Apply the following categories. The calibrator will use the same definitions: + +- **`hallucination`** — the Charter or implementation references something + that does not exist (an API, a function, a field name, a behavior). The + agent invented it. Verify by reading the diff or the cited file. +- **`implementation_gap`** — the Charter declared work that the diff did + not deliver, OR the diff delivered work the Charter did not declare, + WITHOUT it being documented as drift in the AILOG. (If documented in + AILOG under `## Risk` as `R`, that is *not* a gap; the AILOG-aware + drift check already accepts it.) +- **`real_debt`** — code-level concern that is correct as far as the + Charter goes but introduces technical debt or a subtle defect (a missing + error path, a leaky resource, a non-idempotent operation). The adopter is + expected to capture it as a `TDE` doc post-audit. +- **`false_positive`** — what initially looked like a finding but, on + closer inspection of the AILOGs or the diff context, isn't one. + Document anyway; the calibrator uses these to recognize patterns where + one auditor over-reports. + +# Discipline + +- Cite specific file paths and line numbers from the diff. Do not summarize + abstractly. +- If you cannot find anything substantive, return `findings_total: 0` with + a single `## Summary` paragraph explaining what you reviewed. Empty audits + are valid signal — the calibrator will note convergence with the other + auditor's empty audit, if applicable. +- Do not fabricate findings to seem thorough. 
The categorization rules + above include `false_positive` precisely because over-reporting is a + real audit failure mode. +- Do not consult external sources beyond what is provided in this prompt. + The audit must be reproducible from the prompt + the diff + the AILOGs + alone. diff --git a/dist/.devtrail/audit-prompts/auditor-secondary.md b/dist/.devtrail/audit-prompts/auditor-secondary.md new file mode 100644 index 0000000..a2651fd --- /dev/null +++ b/dist/.devtrail/audit-prompts/auditor-secondary.md @@ -0,0 +1,131 @@ + + +You are an independent external auditor reviewing the execution of a +DevTrail Charter. You are the **{{audit_role}}** auditor. A primary auditor +of a different model family is reviewing the same Charter and diff in +parallel. The two of you may agree or disagree; both are valuable signal. +A calibrator-reconciler will integrate your findings with the primary's. + +You may have been trained on different data than the primary. Your blind +spots and your priors are different. Audit independently — the value of the +dual-audit comes from convergence on real findings and divergence on +boundary cases, not from echoing the primary auditor. 
+ +# What you are auditing + +**Charter:** `{{charter_path}}` (`{{charter_id}}` — {{charter_title}}) + +**Git range:** `{{git_range}}` + +**Originating AILOGs** (rationale + emergent risks documented during execution): + +``` +{{ailog_paths}} +``` + +# Charter content + +```markdown +{{charter_content}} +``` + +# AILOG content + +```markdown +{{ailog_contents}} +``` + +# Diff + +```diff +{{git_diff}} +``` + +# What I need from you + +Produce a markdown file with this exact frontmatter shape (validates against +`{{schema_path}}`): + +```yaml +--- +audit_role: auditor-secondary +auditor: # e.g., gemini-cli-v1.5 +charter_id: {{charter_id}} +git_range: "{{git_range}}" +prompt_used: prompts/auditor-secondary.prompt.md +audited_at: # YYYY-MM-DD +findings_total: +findings_by_category: + hallucination: + implementation_gap: + real_debt: + false_positive: +--- + +# Audit: {{charter_id}} by <auditor> + +## Summary + +[1-2 paragraphs: did the execution match the Charter's declared scope? +What is the overall verdict?] + +## Findings + +### F1 — <category>: <short title> + +**Where:** `<file>:<line>` or `<file>` if span-wide. + +**What I observed:** [Concrete description. Cite specific lines from the +diff or the AILOGs.] + +**Why I'm flagging it:** [Reasoning. What about the Charter's declaration +vs the diff makes this a finding?] + +### F2 — ... + +[One section per finding.] +``` + +# Categorization rules + +Same categories as the primary auditor — the calibrator uses the same +definitions to compare your findings: + +- **`hallucination`** — Charter or implementation references something + that does not exist (invented API, function, field, behavior). Verify + by reading the diff or cited file. +- **`implementation_gap`** — Charter declared work the diff did not + deliver (or vice versa) WITHOUT it being documented as drift in the + AILOG. (Documented in AILOG `## Risk` as `R` is *not* a gap.) 
+- **`real_debt`** — code-level concern not strictly within Charter + scope but introducing debt or a subtle defect (missing error path, + leaky resource, non-idempotent operation). Adopter captures as `TDE`. +- **`false_positive`** — looked like a finding but, on closer reading + of the AILOGs or diff context, isn't. Document anyway; calibrator + uses these to detect over-reporting patterns. + +# Discipline + +- Cite specific file paths and line numbers from the diff. No abstract + summaries. +- If you find nothing substantive, return `findings_total: 0` with a + `## Summary` paragraph explaining your review. Empty is valid signal. +- Do not fabricate findings to seem thorough. Over-reporting is a real + audit failure mode — `false_positive` exists precisely for this case. +- Do not consult external sources beyond this prompt. The audit must be + reproducible from the prompt + diff + AILOGs alone. diff --git a/dist/.devtrail/audit-prompts/calibrator-reconciler.md b/dist/.devtrail/audit-prompts/calibrator-reconciler.md new file mode 100644 index 0000000..1dd4763 --- /dev/null +++ b/dist/.devtrail/audit-prompts/calibrator-reconciler.md @@ -0,0 +1,173 @@ + + +You are the **calibrator-reconciler** of a DevTrail dual-audit cycle. Two +external auditors of different model families have already reviewed the +Charter; their outputs are below. Your job is to apply the categorization +schema definitionally, recognize agreement and disagreement, and produce a +consolidated list of findings that the Charter's telemetry can record. + +You are not auditing fresh. You are reading two audits and reconciling them. 
+ +# What you are reconciling + +**Charter:** `{{charter_path}}` (`{{charter_id}}` — {{charter_title}}) + +**Git range:** `{{git_range}}` + +# Charter content + +```markdown +{{charter_content}} +``` + +# Originating AILOGs + +``` +{{ailog_paths}} +``` + +```markdown +{{ailog_contents}} +``` + +# Auditor PRIMARY output + +```markdown +{{auditor_primary_findings}} +``` + +# Auditor SECONDARY output + +```markdown +{{auditor_secondary_findings}} +``` + +# What I need from you + +Produce a markdown file with this exact frontmatter shape (validates against +`{{schema_path}}`): + +```yaml +--- +audit_role: calibrator-reconciler +calibrator: # e.g., claude-opus-4 +charter_id: {{charter_id}} +git_range: "{{git_range}}" +prompt_used: prompts/calibrator-reconciler.prompt.md +calibrated_at: # YYYY-MM-DD +auditors_reconciled: + - auditor-primary.md + - auditor-secondary.md +findings_consolidated: +findings_by_status: + agreed: # both auditors flagged the same finding with the same category + disputed: # both flagged but disagreed on category — you picked + unique_primary: # only primary; you validated as legitimate + unique_secondary: # only secondary; you validated as legitimate + rejected: # one or both flagged, but you determined it is not a finding +--- + +# Calibration: {{charter_id}} + +## Reconciliation summary + +[1-2 paragraphs: how convergent were the auditors? Where did they +disagree, and on what kind of finding? Did one auditor have a higher +false-positive rate?] + +## Reconciled findings + +### C1 — <category>: <short title> + +**Status:** agreed | disputed | unique_primary | unique_secondary | rejected. + +**Where:** `<file>:<line>`. + +**What was observed:** [Combine the auditors' descriptions. If they +disagreed, note both views and your resolution.] + +**Calibration rationale:** [Why this status. If `agreed`, name what each +auditor said. If `disputed`, name the disagreement and your call. If +`unique_*`, explain why you validated. If `rejected`, explain why the +flagging auditor(s) were wrong.] + +### C2 — ... + +[One section per consolidated finding. 
Numbering C1...CN is independent +of the F1...FN numbering each auditor used; cross-reference auditor +numbering inside each section as needed.] +``` + +# Categorization rules (same as the auditors) + +- **`hallucination`** — invented API, function, field, behavior. +- **`implementation_gap`** — declared but not delivered (or vice versa) + WITHOUT being documented in AILOG as drift. +- **`real_debt`** — code-level debt or subtle defect outside Charter scope. +- **`false_positive`** — appeared to be a finding but isn't. + +# Status assignment rules + +For each distinct finding (deduplicate when both auditors describe the +same gap with different wording): + +- `agreed` — both auditors flagged it AND assigned the same category. + Strongest signal — the convergence between heterogeneous auditors is + what makes a dual-audit valuable. +- `disputed` — both auditors flagged it BUT assigned different categories + (e.g., primary calls it `implementation_gap`, secondary calls it + `hallucination`). You pick the category that fits the schema definitions + best, given the AILOGs and the diff lines the auditors cite. +- `unique_primary` / `unique_secondary` — only one auditor flagged it, + AND on your reading, they were correct to flag it. +- `rejected` — one or both auditors flagged it, but on closer reading + of the AILOGs (especially mitigations documented under `## Risk` as `R`) + or of the diff lines the auditors cite, it isn't a finding. A finding + flagged by only one auditor is also `rejected` if that auditor was wrong. + +# Discipline + +- Use the `findings_by_status` counts as a cross-check against your + body sections. They (including `rejected`) must add up to `findings_consolidated`. +- Do not introduce findings the auditors did not see. If you spot + something they missed, document it in `## Reconciliation summary` as + an observation, not as a `C` finding. Fresh findings are out of + scope for the calibrator role — that's what the next audit cycle is for. 
+- The `rejected` count is signal worth tracking — it tells the Charter + author which audit categories tend to over-report on this kind of + Charter, which improves future audit prompt design. +- Do not consult external sources beyond what is provided. The + reconciliation must be reproducible from the prompt + the two auditor + outputs + the Charter + the AILOGs. diff --git a/dist/.devtrail/schemas/audit-output.schema.v0.json b/dist/.devtrail/schemas/audit-output.schema.v0.json new file mode 100644 index 0000000..b629138 --- /dev/null +++ b/dist/.devtrail/schemas/audit-output.schema.v0.json @@ -0,0 +1,164 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://devtrail.dev/schemas/audit-output.schema.v0.json", + "title": "DevTrail Charter Audit Output (experimental v0)", + "description": "Frontmatter schema for the markdown files that auditors and the calibrator-reconciler produce during a `devtrail charter audit` cycle. The shape distinguishes auditor outputs (primary/secondary, fresh findings) from calibrator outputs (reconciliation across auditor outputs) via the `audit_role` field — one of three fixed roles, not arbitrary N.", + "$comment": "EXPERIMENTAL v0. Phase 3 of the CLI roadmap. Crystallized from the dual-audit pattern validated empirically in Sentinel (Copilot + Gemini + claude-analisis across 6 cycles). Will not stabilize to v1.0 until validated in a second domain. The schema is for the YAML inside the file's frontmatter; the body is free-form markdown by convention. 
The findings_by_category enum (hallucination | implementation_gap | real_debt | false_positive) is the same one used by `external_audit` in charter-telemetry.schema.v0.json — the audit cycle output integrates directly into the Charter telemetry at close time.", + "type": "object", + "oneOf": [ + { "$ref": "#/$defs/auditorOutput" }, + { "$ref": "#/$defs/calibratorOutput" } + ], + "$defs": { + "auditorOutput": { + "type": "object", + "required": [ + "audit_role", + "auditor", + "charter_id", + "git_range", + "prompt_used", + "audited_at", + "findings_total", + "findings_by_category" + ], + "additionalProperties": true, + "properties": { + "audit_role": { + "type": "string", + "enum": ["auditor-primary", "auditor-secondary"], + "description": "Slot this output occupies in the dual-audit. Primary and secondary are heterogeneous by design (see CLI-REFERENCE.md `devtrail charter audit` for the inter-family recommendation)." + }, + "auditor": { + "type": "string", + "minLength": 1, + "description": "Model identifier and version of the auditor that produced this output (e.g., `copilot-v1.0.37`, `gemini-cli-v1.5`, `claude-opus-4`). Free-form so adopters can use any model identifier." + }, + "charter_id": { + "type": "string", + "pattern": "^CHARTER-[0-9]{2,}(-[a-z0-9-]+)?$" + }, + "git_range": { + "type": "string", + "minLength": 1, + "description": "Git revision range the auditor reviewed (e.g., `origin/main..HEAD`)." + }, + "prompt_used": { + "type": "string", + "minLength": 1, + "description": "Path (relative to the audit directory) of the resolved prompt file the auditor was given. Required so the operator can verify the prompt wasn't tampered with mid-cycle. Closes RFC #82." 
+ }, + "audited_at": { + "type": "string", + "format": "date" + }, + "findings_total": { + "type": "integer", + "minimum": 0 + }, + "findings_by_category": { + "type": "object", + "additionalProperties": false, + "properties": { + "hallucination": { "type": "integer", "minimum": 0 }, + "implementation_gap": { "type": "integer", "minimum": 0 }, + "real_debt": { "type": "integer", "minimum": 0 }, + "false_positive": { "type": "integer", "minimum": 0 } + } + }, + "audit_quality": { + "type": "string", + "enum": ["high", "medium", "low"], + "description": "Operator's calibration of this auditor's output quality. Optional — set when consolidating into telemetry." + } + } + }, + "calibratorOutput": { + "type": "object", + "required": [ + "audit_role", + "calibrator", + "charter_id", + "git_range", + "prompt_used", + "calibrated_at", + "auditors_reconciled", + "findings_consolidated" + ], + "additionalProperties": true, + "properties": { + "audit_role": { + "type": "string", + "const": "calibrator-reconciler" + }, + "calibrator": { + "type": "string", + "minLength": 1, + "description": "Model identifier of the calibrator. May be of any family — even the same as the implementer — because the calibrator's job is to apply the schema definitionally over already-produced auditor verdicts, not to discover gaps. Heterogeneity matters for the auditor pair, not the calibrator (see roadmap §5.2)." + }, + "charter_id": { + "type": "string", + "pattern": "^CHARTER-[0-9]{2,}(-[a-z0-9-]+)?$" + }, + "git_range": { + "type": "string", + "minLength": 1 + }, + "prompt_used": { + "type": "string", + "minLength": 1 + }, + "calibrated_at": { + "type": "string", + "format": "date" + }, + "auditors_reconciled": { + "type": "array", + "minItems": 2, + "maxItems": 2, + "items": { + "type": "string", + "minLength": 1, + "description": "Path (relative to audit dir) of an auditor output file this calibration considered." 
+ }, + "description": "Exactly two auditor outputs are reconciled in v0 (dual-audit pattern). N≥3 audit ensembles are forward-looking." + }, + "findings_consolidated": { + "type": "integer", + "minimum": 0, + "description": "Total distinct findings after deduplication across both auditors. Equals the sum of the `findings_by_status` counts, including `rejected` (rejected findings are still recorded, with that status)." + }, + "findings_by_status": { + "type": "object", + "additionalProperties": false, + "properties": { + "agreed": { + "type": "integer", + "minimum": 0, + "description": "Both auditors flagged the same finding with the same category (highest signal — convergence)." + }, + "disputed": { + "type": "integer", + "minimum": 0, + "description": "Both auditors flagged the finding but disagreed on category; calibrator picked one." + }, + "unique_primary": { + "type": "integer", + "minimum": 0, + "description": "Only the primary auditor flagged this; calibrator validated as legitimate." + }, + "unique_secondary": { + "type": "integer", + "minimum": 0 + }, + "rejected": { + "type": "integer", + "minimum": 0, + "description": "One or both auditors flagged this, but the calibrator determined it is not a finding (false positive)." + } + } + } + } + } + } +}