diff --git a/benchmark-leakage-audit-assistant/README.md b/benchmark-leakage-audit-assistant/README.md new file mode 100644 index 0000000..d4f759a --- /dev/null +++ b/benchmark-leakage-audit-assistant/README.md @@ -0,0 +1,37 @@ +# Benchmark Leakage Audit Assistant + +This self-contained module adds an AI research assistant slice for pre-release benchmark hygiene. It helps reviewers catch evaluation leakage before a paper, model, or scientific benchmark result is published. + +## What It Checks + +- Train/test overlap by record ID or normalized content fingerprint +- Benchmark contamination in the training corpus +- Final holdout or test set use during model selection +- Missing split provenance such as deterministic method, seed, or manifest hash +- Missing reproducibility packet evidence such as lockfiles, manifests, code archive, and preregistration + +## Run It + +```bash +node benchmark-leakage-audit-assistant/test.js +node benchmark-leakage-audit-assistant/demo.js +``` + +The module uses only Node.js standard library APIs. + +## Public API + +```js +const { auditBenchmarkLeakage } = require("./index.js"); + +const audit = auditBenchmarkLeakage(project); +console.log(audit.summary.releaseDecision); +console.log(audit.findings); +console.log(audit.reviewerPacket.tasks); +``` + +The audit returns a release decision of `pass`, `needs-remediation`, or `block`, plus reviewer-ready findings with evidence and remediation tasks. + +## Demo + +The included `demo.gif` shows the module blocking a release candidate with train/test overlap, benchmark contamination, held-out tuning, weak split provenance, and missing reproducibility artifacts. 
const { auditBenchmarkLeakage } = require("./index.js");

// Two texts appear on both sides of a boundary on purpose: the first is the
// benchmark item mirrored into training, the second is the subject that sits
// in both the training and the test split.
const RESERVED_COHORT_TEXT =
  "Reserved NeuroBench-26 fMRI cohort with blinded diagnostic labels.";
const SUBJECT_0042_TEXT =
  "Subject 0042 resting-state network features with quality-control notes.";

/**
 * Synthetic release candidate that trips every check the audit performs:
 * train/test overlap, benchmark contamination, held-out tuning, weak split
 * provenance, and missing reproducibility artifacts.
 */
const demoProject = {
  title: "NeuroImaging Benchmark Release Candidate",
  benchmark: {
    name: "NeuroBench-26",
    items: [
      {
        id: "nb26-hidden-17",
        title: "Hidden fMRI cohort sample",
        text: RESERVED_COHORT_TEXT
      }
    ]
  },
  datasets: {
    train: [
      {
        id: "train-cohort-13",
        source: "NeuroBench-26 pre-release mirror",
        text: RESERVED_COHORT_TEXT
      },
      { id: "subject-0042", source: "lab import", text: SUBJECT_0042_TEXT }
    ],
    validation: [
      {
        id: "subject-1182",
        source: "validation import",
        text: "Subject 1182 task-state network features with adjudicated QC status."
      }
    ],
    test: [
      { id: "subject-0042", source: "holdout import", text: SUBJECT_0042_TEXT }
    ]
  },
  split: { method: "manual export", seed: "", manifestHash: null },
  experiments: [
    {
      id: "exp-neuro-7",
      usedForSelection: "test",
      notes: "Selected final model using best test AUROC after evaluating four checkpoints."
    }
  ],
  artifacts: {
    rawDataManifest: true,
    splitManifest: false,
    environmentLock: false,
    codeArchive: true,
    preregistration: false
  }
};

/**
 * Lays the audit out as the sequence of lines the demo prints.
 *
 * @param {object} audit - Result returned by auditBenchmarkLeakage.
 * @returns {string[]} One entry per console line, in print order.
 */
function renderAudit(audit) {
  const { summary, findings, reviewerPacket } = audit;

  const header = [
    `Benchmark leakage audit: ${summary.projectTitle}`,
    `Decision: ${summary.releaseDecision}`,
    `Reproducibility confidence: ${summary.reproducibilityConfidence}`,
    `Findings: ${summary.findingCount}`,
    ""
  ];

  const body = findings.flatMap((item) => [
    `[${item.severity.toUpperCase()}] ${item.title}`,
    ...item.evidence.map((entry) => ` - ${entry}`),
    ` Remediation: ${item.remediation}`,
    ""
  ]);

  const footer = [
    "Reviewer tasks:",
    ...reviewerPacket.tasks.map((entry) => `- ${entry}`)
  ];

  return [...header, ...body, ...footer];
}

const audit = auditBenchmarkLeakage(demoProject);

renderAudit(audit).forEach((line) => {
  console.log(line);
});
/** Severity levels a finding may carry, ordered from most to least severe. */
const SEVERITIES = ["critical", "high", "medium", "low"];

/** Artifact flags that must be truthy for a complete reproducibility packet. */
const REQUIRED_ARTIFACTS = [
  "rawDataManifest",
  "splitManifest",
  "environmentLock",
  "codeArchive",
  "preregistration"
];

// Both vocabularies are matched against normalizeText() output, where every
// punctuation run has already become one space, so "held-out" arrives as
// "held out" and has to be listed in that form.
const HOLDOUT_WORDS = "test|holdout|held out|heldout";
const SELECTION_WORDS = "select|selected|selection|tune|tuned|checkpoint|best";
const HOLDOUT_SOURCE = new RegExp(`\\b(?:${HOLDOUT_WORDS})\\b`);
const HOLDOUT_THEN_SELECTION = new RegExp(
  `\\b(?:${HOLDOUT_WORDS})\\b.{0,40}\\b(?:${SELECTION_WORDS})\\b`
);
const SELECTION_THEN_HOLDOUT = new RegExp(
  `\\b(?:${SELECTION_WORDS})\\b.{0,40}\\b(?:${HOLDOUT_WORDS})\\b`
);

/**
 * Audits a research project description for benchmark-leakage signals.
 *
 * @param {object} project - Project description with optional `title`,
 *   `benchmark`, `datasets` (`train` / `validation` / `test` arrays), `split`,
 *   `experiments`, and `artifacts`.
 * @returns {{summary: object, findings: object[], reviewerPacket: object}}
 *   `summary.releaseDecision` is `"pass"`, `"needs-remediation"`, or `"block"`.
 * @throws {TypeError} If `project` is not an object.
 */
function auditBenchmarkLeakage(project) {
  const normalized = normalizeProject(project);
  const findings = [
    ...findTrainTestOverlap(normalized),
    ...findBenchmarkContamination(normalized),
    ...findHeldoutTuning(normalized),
    ...findSplitProvenanceGaps(normalized),
    ...findReproducibilityGaps(normalized)
  ];

  const severityCounts = countSeverities(findings);
  const summary = {
    projectTitle: normalized.title,
    findingCount: findings.length,
    severityCounts,
    releaseDecision: decideRelease(severityCounts),
    reproducibilityConfidence: scoreReproducibility(findings)
  };

  return {
    summary,
    findings,
    reviewerPacket: buildReviewerPacket(summary, findings)
  };
}

/** True for anything property access is safe on (objects and arrays, not null). */
function isRecord(value) {
  return value !== null && typeof value === "object";
}

/** Coerces to an array and drops entries that are not objects. */
function recordList(value) {
  return safeArray(value).filter(isRecord);
}

/** Returns `id` unless it is missing or blank, in which case `fallback`. */
function displayId(id, fallback) {
  return normalizeValue(id) === "" ? fallback : id;
}

/**
 * Fills in defaults so every check can read the project without guarding.
 * Malformed sections fall back to empty values, and non-object list entries
 * are dropped rather than left to crash a later check.
 *
 * @param {object} project - Raw caller input.
 * @returns {object} Project with `title`, `benchmark`, `datasets`, `split`,
 *   `experiments`, and `artifacts` always present.
 * @throws {TypeError} If `project` is not an object.
 */
function normalizeProject(project) {
  if (!project || typeof project !== "object") {
    throw new TypeError("auditBenchmarkLeakage expects a project object");
  }

  const datasets = isRecord(project.datasets) ? project.datasets : {};

  return {
    title: project.title || "Untitled research project",
    benchmark: isRecord(project.benchmark) ? project.benchmark : { name: "", items: [] },
    datasets: {
      train: recordList(datasets.train),
      validation: recordList(datasets.validation),
      test: recordList(datasets.test)
    },
    split: isRecord(project.split) ? project.split : {},
    experiments: recordList(project.experiments),
    artifacts: isRecord(project.artifacts) ? project.artifacts : {}
  };
}

/**
 * Reports test records that also occur in the training split, matched by
 * record ID first and by content fingerprint second.
 *
 * @param {object} project - Normalized project.
 * @returns {object[]} Zero or one `train_test_overlap` finding (critical).
 */
function findTrainTestOverlap(project) {
  const trainById = new Map();
  const trainByFingerprint = new Map();

  for (const record of project.datasets.train) {
    const id = normalizeValue(record.id);
    if (id) trainById.set(id, record);

    // Records without any title/text have no fingerprint; indexing them would
    // make every content-less test record look like a duplicate.
    const fingerprint = fingerprintRecord(record);
    if (fingerprint) trainByFingerprint.set(fingerprint, record);
  }

  const overlaps = [];
  for (const testRecord of project.datasets.test) {
    const id = normalizeValue(testRecord.id);
    const fingerprint = fingerprintRecord(testRecord);
    const trainRecord =
      (id ? trainById.get(id) : undefined) ||
      (fingerprint ? trainByFingerprint.get(fingerprint) : undefined);

    if (trainRecord) {
      overlaps.push({
        trainId: displayId(trainRecord.id, "unknown-train-id"),
        testId: displayId(testRecord.id, "unknown-test-id"),
        snippet: compactText(testRecord.text || testRecord.title || testRecord.source || "")
      });
    }
  }

  if (overlaps.length === 0) return [];

  return [
    finding({
      type: "train_test_overlap",
      severity: "critical",
      title: "Training records overlap with the final evaluation split",
      evidence: overlaps.map(
        (overlap) =>
          `Train ${overlap.trainId} matches test ${overlap.testId}: ${overlap.snippet}`
      ),
      remediation:
        "Rebuild the split from source data, remove duplicated records from the training corpus, and rerun the final holdout once."
    })
  ];
}

/**
 * Reports training records whose source, title, or text contains the
 * benchmark name or any benchmark item's id, title, or text.
 *
 * @param {object} project - Normalized project.
 * @returns {object[]} Zero or one `benchmark_contamination` finding (high).
 */
function findBenchmarkContamination(project) {
  const benchmarkTerms = [
    project.benchmark.name,
    ...recordList(project.benchmark.items).flatMap((item) => [
      item.id,
      item.title,
      item.text
    ])
  ]
    .map(normalizeText)
    // Very short terms would match incidental substrings.
    .filter((term) => term.length >= 8);

  const contaminated = [];
  for (const record of project.datasets.train) {
    const recordText = normalizeText(
      [record.source, record.title, record.text].filter(Boolean).join(" ")
    );
    const matchedTerm = benchmarkTerms.find((term) => recordText.includes(term));

    if (matchedTerm) {
      contaminated.push(
        `Training record ${displayId(record.id, "unknown")} contains benchmark signal "${truncate(
          matchedTerm,
          80
        )}".`
      );
    }
  }

  if (contaminated.length === 0) return [];

  return [
    finding({
      type: "benchmark_contamination",
      severity: "high",
      title: "Training corpus appears to contain benchmark material",
      evidence: contaminated,
      remediation:
        "Remove benchmark mirrors and leaderboard-derived content from training, then document an exclusion rule for future ingestion."
    })
  ];
}

/**
 * Reports experiments that selected or tuned a model on the final holdout.
 *
 * @param {object} project - Normalized project.
 * @returns {object[]} Zero or one `heldout_tuning` finding (high).
 */
function findHeldoutTuning(project) {
  const riskyExperiments = project.experiments.filter(hasHeldoutTuningRisk);

  if (riskyExperiments.length === 0) return [];

  return [
    finding({
      type: "heldout_tuning",
      severity: "high",
      title: "Final holdout data was used during model selection",
      evidence: riskyExperiments.map(
        (experiment) =>
          `${experiment.id || "unnamed experiment"}: ${compactText(
            experiment.notes || experiment.usedForSelection || experiment.tunedOn || ""
          )}`
      ),
      remediation:
        "Move model and checkpoint selection to validation data, freeze the chosen configuration, and rerun the untouched holdout once."
    })
  ];
}

/**
 * Heuristic: an experiment is risky when its declared selection source names
 * the holdout, or when its notes mention a holdout word within 40 characters
 * of a selection word, in either order.
 *
 * @param {object} experiment - Entry from `project.experiments`.
 * @returns {boolean} True when the experiment looks tuned on held-out data.
 */
function hasHeldoutTuningRisk(experiment) {
  const selectionSource = normalizeText(
    [experiment.usedForSelection, experiment.tunedOn].filter(Boolean).join(" ")
  );
  if (HOLDOUT_SOURCE.test(selectionSource)) {
    return true;
  }

  const notes = normalizeText(experiment.notes);
  return HOLDOUT_THEN_SELECTION.test(notes) || SELECTION_THEN_HOLDOUT.test(notes);
}

/**
 * Reports missing split method, seed, or manifest hash.
 *
 * @param {object} project - Normalized project.
 * @returns {object[]} Zero or one `split_provenance_gap` finding (medium).
 */
function findSplitProvenanceGaps(project) {
  const { method, seed, manifestHash } = project.split;
  const gaps = [];

  if (!method) gaps.push("split method is missing");
  // A seed of 0 is a legitimate seed, so only null/undefined/"" count as missing.
  if (seed === null || seed === undefined || seed === "") {
    gaps.push("split seed is missing");
  }
  if (!manifestHash) gaps.push("split manifest hash is missing");

  if (gaps.length === 0) return [];

  return [
    finding({
      type: "split_provenance_gap",
      severity: "medium",
      title: "Dataset split provenance is not reproducible",
      evidence: gaps,
      remediation:
        "Record the deterministic split method, seed, and manifest hash so reviewers can recreate train/validation/test membership."
    })
  ];
}

/**
 * Reports required artifacts whose flag is absent or falsy.
 *
 * @param {object} project - Normalized project.
 * @returns {object[]} Zero or one `reproducibility_gap` finding; high when
 *   three or more artifacts are missing, otherwise medium.
 */
function findReproducibilityGaps(project) {
  const missing = REQUIRED_ARTIFACTS.filter((artifact) => !project.artifacts[artifact]);
  if (missing.length === 0) return [];

  return [
    finding({
      type: "reproducibility_gap",
      severity: missing.length >= 3 ? "high" : "medium",
      title: "Reproducibility packet is missing required evidence",
      evidence: missing.map((artifact) => `${artifact} is absent`),
      remediation:
        "Attach the missing manifests, lockfiles, code archive, and preregistration evidence before release review."
    })
  ];
}

/**
 * Builds a finding record. The id is the type plus the first eight hex
 * characters of the SHA-256 of the joined evidence, so identical evidence
 * always produces the same id.
 *
 * @param {{type: string, severity: string, title: string, evidence: string[], remediation: string}} input
 * @returns {object} The finding with a derived `id`.
 */
function finding(input) {
  return {
    id: `${input.type}-${hash(input.evidence.join("|")).slice(0, 8)}`,
    type: input.type,
    severity: input.severity,
    title: input.title,
    evidence: input.evidence,
    remediation: input.remediation
  };
}

/**
 * Turns findings into the reviewer-facing packet of risks, tasks, and questions.
 *
 * @param {object} summary - Audit summary; only `projectTitle` is read.
 * @param {object[]} findings - Findings produced by the checks.
 * @returns {{keyRisks: string[], tasks: string[], questions: string[]}}
 */
function buildReviewerPacket(summary, findings) {
  if (findings.length === 0) {
    return {
      keyRisks: ["No benchmark leakage signals detected."],
      tasks: ["Proceed with normal scientific peer review."],
      questions: ["Have reviewers independently confirmed benchmark access controls?"]
    };
  }

  return {
    keyRisks: findings.map((item) => `${item.severity.toUpperCase()}: ${item.title}`),
    tasks: findings.map((item) => item.remediation),
    questions: [
      `Can the team reproduce the ${summary.projectTitle} split from immutable inputs?`,
      "Was the final holdout evaluated exactly once after model selection froze?",
      "Are benchmark and leaderboard mirrors excluded from every training corpus?"
    ]
  };
}

/**
 * Counts findings per severity level.
 *
 * @param {object[]} findings
 * @returns {Object<string, number>} One integer per entry of SEVERITIES.
 */
function countSeverities(findings) {
  const counts = Object.fromEntries(SEVERITIES.map((severity) => [severity, 0]));
  for (const item of findings) {
    // An unrecognised severity must not add a NaN-valued key to the summary.
    if (item.severity in counts) counts[item.severity] += 1;
  }
  return counts;
}

/**
 * Maps severity counts to a release decision: any critical finding or more
 * than one high finding blocks; one high or any medium needs remediation.
 *
 * @param {Object<string, number>} counts
 * @returns {"block"|"needs-remediation"|"pass"}
 */
function decideRelease(counts) {
  if (counts.critical > 0 || counts.high > 1) return "block";
  if (counts.high === 1 || counts.medium > 0) return "needs-remediation";
  return "pass";
}

/**
 * Grades reproducibility from the gap findings alone.
 *
 * @param {object[]} findings
 * @returns {"low"|"medium"|"high"}
 */
function scoreReproducibility(findings) {
  if (findings.some((item) => item.type === "reproducibility_gap" && item.severity === "high")) {
    return "low";
  }
  if (findings.some((item) => item.type === "reproducibility_gap" || item.type === "split_provenance_gap")) {
    return "medium";
  }
  return "high";
}

/**
 * Content fingerprint of a record: SHA-256 of its normalized title and text.
 *
 * @param {object} record
 * @returns {string|null} Hex digest, or null when the record has no
 *   normalizable content and therefore nothing to compare.
 */
function fingerprintRecord(record) {
  const content = normalizeText([record.title, record.text].filter(Boolean).join(" "));
  return content === "" ? null : hash(content);
}

/** SHA-256 hex digest of `String(value)`. */
function hash(value) {
  return crypto.createHash("sha256").update(String(value)).digest("hex");
}

/**
 * Lower-cases text and collapses every run of characters that are not
 * letters, combining marks, or digits into a single space. Unicode-aware, so
 * non-Latin text keeps its content instead of normalizing to an empty string.
 *
 * @param {*} value - Anything; null and undefined yield "".
 * @returns {string}
 */
function normalizeText(value) {
  if (value === null || value === undefined) return "";
  return String(value)
    .normalize("NFKC")
    .toLowerCase()
    .replace(/[^\p{L}\p{M}\p{N}]+/gu, " ")
    .trim();
}

/**
 * Normalizes an identifier for comparison. Numeric ids, including 0, are
 * kept; only null and undefined count as missing.
 *
 * @param {*} value
 * @returns {string} Trimmed, lower-cased string form, or "" when missing.
 */
function normalizeValue(value) {
  if (value === null || value === undefined) return "";
  return String(value).trim().toLowerCase();
}

/** Returns `value` when it is an array, otherwise an empty array. */
function safeArray(value) {
  return Array.isArray(value) ? value : [];
}

/** Collapses whitespace runs and caps the result at 120 characters. */
function compactText(value) {
  return truncate(String(value).replace(/\s+/g, " ").trim(), 120);
}

/**
 * Shortens `value` to at most `length` characters, ending in "..." when
 * there is room for the ellipsis.
 *
 * @param {string} value
 * @param {number} length - Maximum length of the result.
 * @returns {string}
 */
function truncate(value, length) {
  if (value.length <= length) return value;
  // Below four characters there is no room for text plus "...".
  if (length <= 3) return value.slice(0, Math.max(0, length));
  return `${value.slice(0, length - 3)}...`;
}
+ +## Verification + +- `node benchmark-leakage-audit-assistant/test.js` +- `node benchmark-leakage-audit-assistant/demo.js` diff --git a/benchmark-leakage-audit-assistant/test.js b/benchmark-leakage-audit-assistant/test.js new file mode 100644 index 0000000..110aaf1 --- /dev/null +++ b/benchmark-leakage-audit-assistant/test.js @@ -0,0 +1,169 @@ +const assert = require("node:assert/strict"); +const { auditBenchmarkLeakage } = require("./index.js"); + +function test(name, fn) { + try { + fn(); + console.log(`ok - ${name}`); + } catch (error) { + console.error(`not ok - ${name}`); + throw error; + } +} + +const contaminatedProject = { + title: "Protein Stability Leaderboard Study", + benchmark: { + name: "ProteinBench", + items: [ + { + id: "pb-42", + title: "Thermal stability mutation panel", + text: "A held-out thermal stability mutation panel for ProteinBench." + } + ] + }, + datasets: { + train: [ + { + id: "train-001", + source: "ProteinBench public leaderboard mirror", + text: "A held-out thermal stability mutation panel for ProteinBench." + }, + { + id: "dup-777", + source: "lab notebook", + text: "Kinase mutant A had a melting temperature shift of 2.4 C." + } + ], + validation: [ + { + id: "val-009", + source: "internal split", + text: "Independent validation measurement for kinase mutant B." + } + ], + test: [ + { + id: "dup-777", + source: "final holdout", + text: "Kinase mutant A had a melting temperature shift of 2.4 C." + } + ] + }, + split: { + method: "manual spreadsheet split", + seed: null, + manifestHash: "" + }, + experiments: [ + { + id: "exp-main", + usedForSelection: "test", + notes: "Selected the final checkpoint using best test accuracy after three tuning rounds." 
+ } + ], + artifacts: { + rawDataManifest: true, + splitManifest: false, + environmentLock: false, + codeArchive: true, + preregistration: false + } +}; + +const cleanProject = { + title: "Blind Materials Benchmark", + benchmark: { + name: "MatBench Blind", + items: [ + { + id: "mat-test-1", + title: "Hidden elastic modulus sample", + text: "Elastic modulus holdout measurement kept in a locked benchmark set." + } + ] + }, + datasets: { + train: [ + { + id: "mat-train-1", + source: "2024 lab training corpus", + text: "Training-only polymer synthesis run with public features." + } + ], + validation: [ + { + id: "mat-val-1", + source: "validation split", + text: "Validation-only polymer synthesis run for model selection." + } + ], + test: [ + { + id: "mat-test-shadow", + source: "sealed holdout", + text: "Different blinded polymer synthesis run for final reporting." + } + ] + }, + split: { + method: "stratified hash split", + seed: 20260518, + manifestHash: "sha256:3bf44d1e2c" + }, + experiments: [ + { + id: "exp-clean", + usedForSelection: "validation", + notes: "Hyperparameters were chosen on validation data before one final holdout evaluation." 
+ } + ], + artifacts: { + rawDataManifest: true, + splitManifest: true, + environmentLock: true, + codeArchive: true, + preregistration: true + } +}; + +test("flags train/test overlap, benchmark contamination, held-out tuning, weak split provenance, and missing artifacts", () => { + const audit = auditBenchmarkLeakage(contaminatedProject); + const types = audit.findings.map((finding) => finding.type); + + assert.equal(audit.summary.releaseDecision, "block"); + assert.equal(audit.summary.severityCounts.critical, 1); + assert.ok(types.includes("train_test_overlap")); + assert.ok(types.includes("benchmark_contamination")); + assert.ok(types.includes("heldout_tuning")); + assert.ok(types.includes("split_provenance_gap")); + assert.ok(types.includes("reproducibility_gap")); + assert.ok( + audit.reviewerPacket.tasks.some((task) => + task.toLowerCase().includes("rebuild the split") + ) + ); +}); + +test("returns a pass decision for a clean project with complete reproducibility evidence", () => { + const audit = auditBenchmarkLeakage(cleanProject); + + assert.equal(audit.summary.releaseDecision, "pass"); + assert.equal(audit.summary.findingCount, 0); + assert.equal(audit.summary.reproducibilityConfidence, "high"); + assert.deepEqual(audit.findings, []); + assert.ok(audit.reviewerPacket.keyRisks.includes("No benchmark leakage signals detected.")); +}); + +test("emits reviewer-ready evidence and remediation text for each finding", () => { + const audit = auditBenchmarkLeakage(contaminatedProject); + + for (const finding of audit.findings) { + assert.ok(finding.id); + assert.ok(["critical", "high", "medium", "low"].includes(finding.severity)); + assert.ok(finding.title.length > 10); + assert.ok(finding.evidence.length > 0); + assert.ok(finding.remediation.length > 10); + } +});