diff --git a/scientific-knowledge-graph/graph-ingestion-auditor/README.md b/scientific-knowledge-graph/graph-ingestion-auditor/README.md new file mode 100644 index 00000000..645b97dd --- /dev/null +++ b/scientific-knowledge-graph/graph-ingestion-auditor/README.md @@ -0,0 +1,24 @@ +# Graph Ingestion Auditor + +This module adds a deterministic, dependency-free knowledge graph ingestion auditor for SCIBASE issue #17. It checks whether extracted scientific entities, relationships, recommendations, and export settings are safe to publish into a project discovery graph. + +## Scope + +- Required entity coverage for papers, authors, datasets, methods, software, and concepts +- DOI, ORCID, DataCite/Zenodo, MeSH/OBO/Wikidata, PubChem, UniProt, SWH, and GitHub identifier checks +- Extraction confidence thresholds +- Relationship endpoint, provenance, evidence, and private-content checks +- AI recommendation evidence and private-signal visibility checks +- JSON-LD/RDF/GraphML export readiness, license, and schema version checks + +All fixtures are synthetic. The module does not use private research content, user activity, live ontology services, credentials, or external APIs. + +## Run + +```bash +node scientific-knowledge-graph/graph-ingestion-auditor/test.js +node scientific-knowledge-graph/graph-ingestion-auditor/demo.js +node scientific-knowledge-graph/graph-ingestion-auditor/make-demo-video.js +``` + +Generated knowledge graph audit artifacts are written to `reports/`. The `make-demo-video.js` step additionally requires an `ffmpeg` executable on `PATH`.
diff --git a/scientific-knowledge-graph/graph-ingestion-auditor/demo.js b/scientific-knowledge-graph/graph-ingestion-auditor/demo.js new file mode 100644 index 00000000..65632568 --- /dev/null +++ b/scientific-knowledge-graph/graph-ingestion-auditor/demo.js @@ -0,0 +1,15 @@ +const fs = require("fs"); +const path = require("path"); +const { auditKnowledgeGraph, renderMarkdownReport, renderSvgSummary } = require("./index"); +const { graphPolicy, riskyGraph } = require("./sample-data"); + +const outputDir = path.join(__dirname, "reports"); /* report files are written next to this script */ +fs.mkdirSync(outputDir, { recursive: true }); + +const result = auditKnowledgeGraph(riskyGraph, graphPolicy); /* audits the bundled risky sample graph */ +fs.writeFileSync(path.join(outputDir, "graph-ingestion-audit.json"), `${JSON.stringify(result, null, 2)}\n`); +fs.writeFileSync(path.join(outputDir, "graph-ingestion-audit.md"), renderMarkdownReport(result)); +fs.writeFileSync(path.join(outputDir, "graph-ingestion-summary.svg"), renderSvgSummary(result)); + +console.log(`decision=${result.decision} riskScore=${result.riskScore} findings=${result.findings.length}`); +console.log(`reports=${outputDir}`); diff --git a/scientific-knowledge-graph/graph-ingestion-auditor/index.js b/scientific-knowledge-graph/graph-ingestion-auditor/index.js new file mode 100644 index 00000000..2edc48c1 --- /dev/null +++ b/scientific-knowledge-graph/graph-ingestion-auditor/index.js @@ -0,0 +1,356 @@ +const crypto = require("crypto"); + +const SEVERITY_WEIGHT = { /* points each finding adds to riskScore, keyed by finding severity */ + blocker: 33, + high: 17, + medium: 8, + low: 3, +}; + +const REQUIRED_ENTITY_TYPES = ["paper", "author", "dataset", "method", "software", "concept"]; /* each type must occur at least once in graph.entities */ + +const IDENTIFIER_RULES = { /* per entity type: the pattern entity.identifier is tested against */ + paper: /^10\.\d{4,9}\/[-._;()/:A-Z0-9]+$/i, /* bare DOI, no scheme prefix */ + author: /^\d{4}-\d{4}-\d{4}-\d{3}[\dX]$/, /* four hyphen-separated groups of four; the last character may be X */ + dataset: /^(doi:10\.\d{4,9}\/.+|datacite:.+|zenodo:\d+)$/i, /* doi:, datacite: or zenodo: prefixed */ + method: /^(mesh:.+|obo:.+|wikidata:Q\d+)$/i, /* mesh:, obo: or wikidata: prefixed */ + software: /^(swh:1:.+|github:[A-Za-z0-9_.-]+\/[A-Za-z0-9_.-]+|doi:10\.\d{4,9}\/.+)$/i, /* swh:1:, github:owner/repo or doi: prefixed */ + concept: /^(mesh:.+|pubchem:\d+|uniprot:[A-Z0-9]+|wikidata:Q\d+)$/i, /* mesh:, pubchem:, uniprot: or wikidata: prefixed */ +}; 
+ +function stableStringify(value) { + if (Array.isArray(value)) { + return `[${value.map(stableStringify).join(",")}]`; + } + if (value && typeof value === "object") { + return `{${Object.keys(value) + .sort() + .map((key) => `${JSON.stringify(key)}:${stableStringify(value[key])}`) + .join(",")}}`; + } + return JSON.stringify(value); +} + +function digest(value) { + return crypto.createHash("sha256").update(stableStringify(value)).digest("hex").slice(0, 16); +} + +function finding(severity, code, title, evidence, action) { + return { severity, code, title, evidence, action }; +} + +function entityKey(entity) { + return `${entity.type}:${entity.id}`; +} + +function verifyEntityCoverage(graph) { + const findings = []; + const types = new Set((graph.entities || []).map((entity) => entity.type)); + REQUIRED_ENTITY_TYPES.forEach((type) => { + if (!types.has(type)) { + findings.push(finding( + type === "paper" || type === "author" ? "blocker" : "high", + "REQUIRED_ENTITY_TYPE_MISSING", + "Required knowledge graph entity type is missing", + `${graph.projectId} has no extracted ${type} entities.`, + `Add at least one reviewed ${type} entity before publishing this graph slice.`, + )); + } + }); + return findings; +} + +function verifyEntityIdentifiers(graph, policy) { + const findings = []; + const minConfidence = policy.minimumEntityConfidence || 0.78; + + (graph.entities || []).forEach((entity) => { + if (!entity.label || !entity.type) { + findings.push(finding( + "blocker", + "ENTITY_LABEL_OR_TYPE_MISSING", + "Entity lacks a label or type", + `${entity.id || "unknown"} has label=${entity.label || "missing"} type=${entity.type || "missing"}.`, + "Require a label and typed extraction result before graph ingestion.", + )); + } + + if ((entity.confidence || 0) < minConfidence) { + findings.push(finding( + entity.type === "paper" || entity.type === "dataset" ? 
"high" : "medium", + "ENTITY_CONFIDENCE_BELOW_THRESHOLD", + "Extracted entity confidence is below threshold", + `${entityKey(entity)} confidence=${entity.confidence || 0}, threshold=${minConfidence}.`, + "Route the entity to human review or hide it from recommendations until verified.", + )); + } + + const rule = IDENTIFIER_RULES[entity.type]; + if (rule && !rule.test(entity.identifier || "")) { + findings.push(finding( + entity.type === "paper" || entity.type === "author" ? "high" : "medium", + "ENTITY_IDENTIFIER_INVALID", + "Entity identifier does not match the expected namespace", + `${entityKey(entity)} identifier=${entity.identifier || "missing"}.`, + "Normalize the entity to DOI, ORCID, DataCite, MeSH, PubChem, UniProt, Wikidata, SWH, or GitHub form.", + )); + } + + if (entity.visibility === "private" && !entity.privateProjectId) { + findings.push(finding( + "blocker", + "PRIVATE_ENTITY_SCOPE_MISSING", + "Private entity lacks a project scope", + `${entityKey(entity)} is private without a privateProjectId.`, + "Attach private graph nodes to a project scope or exclude them from shared graph exports.", + )); + } + }); + + return findings; +} + +function verifyRelationships(graph, policy) { + const findings = []; + const minEvidence = policy.minimumRelationshipEvidence || 2; + const knownEntities = new Set((graph.entities || []).map(entityKey)); + + (graph.relationships || []).forEach((relationship) => { + const source = `${relationship.sourceType}:${relationship.sourceId}`; + const target = `${relationship.targetType}:${relationship.targetId}`; + + if (!knownEntities.has(source) || !knownEntities.has(target)) { + findings.push(finding( + "blocker", + "RELATIONSHIP_ENDPOINT_MISSING", + "Relationship references a missing entity", + `${relationship.id} links ${source} -> ${target}, but at least one endpoint is absent.`, + "Create the missing entity or remove the relationship before graph export.", + )); + } + + if (!relationship.provenance?.sourceDocumentId || 
!relationship.provenance?.extractorVersion) { + findings.push(finding( + "high", + "RELATIONSHIP_PROVENANCE_MISSING", + "Relationship lacks extraction provenance", + `${relationship.id} has incomplete source document or extractor metadata.`, + "Store source document, span, extractor version, and review state for every edge.", + )); + } + + if ((relationship.evidenceCount || 0) < minEvidence) { + findings.push(finding( + "medium", + "RELATIONSHIP_EVIDENCE_WEAK", + "Relationship has weak evidence support", + `${relationship.id} has ${relationship.evidenceCount || 0} evidence items, below ${minEvidence}.`, + "Require more evidence or mark the edge as tentative in graph navigation.", + )); + } + + if (relationship.visibility === "public" && relationship.sourcePrivateContent) { + findings.push(finding( + "blocker", + "PRIVATE_CONTENT_EDGE_PUBLIC", + "Public relationship is derived from private content", + `${relationship.id} is public but cites private source content.`, + "Downgrade the edge to private scope or re-extract it from public evidence.", + )); + } + }); + + return findings; +} + +function verifyRecommendations(graph, policy) { + const findings = []; + const minRecommendationEvidence = policy.minimumRecommendationEvidence || 3; + + (graph.recommendations || []).forEach((recommendation) => { + if ((recommendation.evidenceEdges || []).length < minRecommendationEvidence) { + findings.push(finding( + "high", + "RECOMMENDATION_EVIDENCE_INSUFFICIENT", + "AI recommendation lacks enough graph evidence", + `${recommendation.id} has ${(recommendation.evidenceEdges || []).length} evidence edges.`, + "Hide or label the recommendation until enough verified graph evidence supports it.", + )); + } + + if (recommendation.usesPrivateSignals && recommendation.visibility === "public") { + findings.push(finding( + "blocker", + "PRIVATE_SIGNAL_IN_PUBLIC_RECOMMENDATION", + "Public recommendation uses private user or project signals", + `${recommendation.id} combines private signals 
with public visibility.`, + "Keep private-signal recommendations in the user workspace only.", + )); + } + }); + + return findings; +} + +function verifyExportReadiness(graph) { + const findings = []; + const exportConfig = graph.exportConfig || {}; + + if (!exportConfig.format || !["jsonld", "rdf", "graphml"].includes(exportConfig.format)) { + findings.push(finding( + "medium", + "GRAPH_EXPORT_FORMAT_UNSUPPORTED", + "Graph export format is missing or unsupported", + `${graph.projectId} export format=${exportConfig.format || "missing"}.`, + "Choose jsonld, rdf, or graphml for downstream graph consumers.", + )); + } + + if (exportConfig.public && !exportConfig.license) { + findings.push(finding( + "high", + "PUBLIC_GRAPH_LICENSE_MISSING", + "Public graph export lacks a license", + `${graph.projectId} is configured for public graph export without a license.`, + "Set a graph metadata license before publishing reusable linked data.", + )); + } + + if (!exportConfig.schemaVersion) { + findings.push(finding( + "medium", + "GRAPH_SCHEMA_VERSION_MISSING", + "Graph export lacks a schema version", + `${graph.projectId} has no schema version in exportConfig.`, + "Pin a schema version so downstream consumers can validate graph payloads.", + )); + } + + return findings; +} + +function auditKnowledgeGraph(graph, policy = {}) { + const findings = [ + ...verifyEntityCoverage(graph), + ...verifyEntityIdentifiers(graph, policy), + ...verifyRelationships(graph, policy), + ...verifyRecommendations(graph, policy), + ...verifyExportReadiness(graph), + ]; + + const riskScore = Math.min(100, findings.reduce((sum, item) => sum + SEVERITY_WEIGHT[item.severity], 0)); + const blockers = findings.filter((item) => item.severity === "blocker").length; + const high = findings.filter((item) => item.severity === "high").length; + const decision = blockers + ? "block-graph-publication" + : high + ? "manual-graph-review" + : findings.length + ? 
"publish-with-graph-caveats" + : "ready-for-graph-publication"; + + const entityCounts = (graph.entities || []).reduce((counts, entity) => { + counts[entity.type] = (counts[entity.type] || 0) + 1; + return counts; + }, {}); + + const packet = { + projectId: graph.projectId, + reviewedAt: policy.reviewDate, + decision, + riskScore, + entityCounts, + relationshipCount: (graph.relationships || []).length, + recommendationCount: (graph.recommendations || []).length, + exportFormat: graph.exportConfig?.format || null, + findings, + remediationActions: findings.map((item) => ({ code: item.code, action: item.action })), + generatedFrom: "synthetic-knowledge-graph-only", + }; + + return { + ...packet, + auditDigest: digest(packet), + }; +} + +function renderMarkdownReport(result) { + const lines = [ + "# Knowledge Graph Ingestion Audit", + "", + `Project: ${result.projectId}`, + `Decision: ${result.decision}`, + `Risk score: ${result.riskScore}`, + `Relationships: ${result.relationshipCount}`, + `Recommendations: ${result.recommendationCount}`, + `Audit digest: ${result.auditDigest}`, + "", + "## Entity Counts", + ]; + + Object.keys(result.entityCounts) + .sort() + .forEach((type) => { + lines.push(`- ${type}: ${result.entityCounts[type]}`); + }); + + lines.push("", "## Findings"); + if (!result.findings.length) { + lines.push("- No knowledge graph ingestion issues detected."); + } else { + result.findings.forEach((item) => { + lines.push(`- [${item.severity}] ${item.title}: ${item.evidence}`); + lines.push(` Action: ${item.action}`); + }); + } + + lines.push("", "## Safety", "- Synthetic graph packet only; no private research content, user activity, live ontology services, credentials, or external APIs."); + return `${lines.join("\n")}\n`; +} + +function escapeXml(value) { + return String(value) + .replace(/&/g, "&") + .replace(//g, ">") + .replace(/"/g, """); +} + +function renderSvgSummary(result) { + const color = result.decision === "block-graph-publication" + ? 
"#b42318" + : result.decision === "ready-for-graph-publication" + ? "#067647" + : "#b54708"; + const types = Object.keys(result.entityCounts).sort(); + const maxCount = Math.max(...types.map((type) => result.entityCounts[type]), 1); + const bars = types.slice(0, 6).map((type, index) => { + const y = 195 + index * 42; + const width = Math.round(430 * result.entityCounts[type] / maxCount); + return `${escapeXml(type)} + + + ${result.entityCounts[type]}`; + }).join("\n"); + + return ` + + + Knowledge Graph Ingestion Audit + + ${escapeXml(result.decision)} + Risk score: ${result.riskScore} + Audit digest: ${escapeXml(result.auditDigest)} + ${bars} + Synthetic graph packet only. No ontology or recommendation service calls. + +`; +} + +module.exports = { + auditKnowledgeGraph, + digest, + entityKey, + renderMarkdownReport, + renderSvgSummary, +}; diff --git a/scientific-knowledge-graph/graph-ingestion-auditor/make-demo-video.js b/scientific-knowledge-graph/graph-ingestion-auditor/make-demo-video.js new file mode 100644 index 00000000..3a777896 --- /dev/null +++ b/scientific-knowledge-graph/graph-ingestion-auditor/make-demo-video.js @@ -0,0 +1,101 @@ +const fs = require("fs"); +const path = require("path"); +const { execFileSync } = require("child_process"); +const { auditKnowledgeGraph } = require("./index"); +const { graphPolicy, riskyGraph } = require("./sample-data"); + +const WIDTH = 1280; +const HEIGHT = 720; +const outputDir = path.join(__dirname, "reports"); +const frameDir = path.join(outputDir, "ppm-frames"); +fs.mkdirSync(frameDir, { recursive: true }); + +function rgb(hex) { + const clean = hex.replace("#", ""); + return [ + Number.parseInt(clean.slice(0, 2), 16), + Number.parseInt(clean.slice(2, 4), 16), + Number.parseInt(clean.slice(4, 6), 16), + ]; +} + +function canvas(color) { + const data = Buffer.alloc(WIDTH * HEIGHT * 3); + const [r, g, b] = rgb(color); + for (let offset = 0; offset < data.length; offset += 3) { + data[offset] = r; + data[offset + 1] = 
g; + data[offset + 2] = b; + } + return data; +} + +function rect(data, x, y, width, height, color) { /* fills the rectangle in place, clipped to the canvas bounds; data holds 3 bytes per pixel in row-major order */ + const [r, g, b] = rgb(color); + for (let row = Math.max(0, y); row < Math.min(HEIGHT, y + height); row += 1) { + for (let col = Math.max(0, x); col < Math.min(WIDTH, x + width); col += 1) { + const offset = (row * WIDTH + col) * 3; + data[offset] = r; + data[offset + 1] = g; + data[offset + 2] = b; + } + } +} + +function writePpm(filePath, data) { /* writes a P6 header for WIDTH x HEIGHT with max value 255, followed by the raw pixel bytes */ + fs.writeFileSync(filePath, Buffer.concat([Buffer.from(`P6\n${WIDTH} ${HEIGHT}\n255\n`), data])); +} + +function renderFrame(filePath, result, frame) { /* paints one frame and writes it to filePath: card border, header and risk bar, one bar per entity type, one block per blocker finding; each frame step shortens the bars and grows the blocker blocks upward by 3 px */ + const data = canvas("#f7f8fa"); + rect(data, 36, 36, 1208, 648, "#ffffff"); + rect(data, 36, 36, 1208, 3, "#d0d5dd"); + rect(data, 36, 681, 1208, 3, "#d0d5dd"); + rect(data, 36, 36, 3, 648, "#d0d5dd"); + rect(data, 1241, 36, 3, 648, "#d0d5dd"); + + rect(data, 80, 82, 800, 34, "#101828"); + rect(data, 80, 132, Math.round(800 * result.riskScore / 100), 42, "#b42318"); /* risk bar: 800 px corresponds to a riskScore of 100 */ + rect(data, 910, 82, 250, 92, "#fee4e2"); + rect(data, 944, 112, 182, 32, "#b42318"); + + const types = Object.keys(result.entityCounts).sort(); + const maxCount = Math.max(...types.map((type) => result.entityCounts[type]), 1); /* the trailing 1 avoids dividing by zero when there are no entity types */ + types.forEach((type, index) => { + const y = 230 + index * 58; + const width = Math.round(760 * result.entityCounts[type] / maxCount); + rect(data, 120, y, 760, 34, "#eaecf0"); + rect(data, 120, y, Math.max(10, width - frame * 3), 34, "#12b76a"); /* never narrower than 10 px */ + rect(data, 910, y - 6, 110, 46, "#dcfae6"); + rect(data, 950, y + 10, 30, 14, "#12b76a"); + }); + + const blockerCount = result.findings.filter((item) => item.severity === "blocker").length; + for (let i = 0; i < blockerCount; i += 1) { + rect(data, 120 + i * 82, 610 - frame * 3, 56, 36 + frame * 3, "#7a271a"); + } + rect(data, 80, 652, 1080, 12, "#98a2b3"); + writePpm(filePath, data); +} + +const result = auditKnowledgeGraph(riskyGraph, graphPolicy); /* audit the bundled risky sample once; every frame is drawn from this result */ +for (let frame = 0; frame < 4; frame += 1) { + renderFrame(path.join(frameDir, 
`frame-${String(frame + 1).padStart(3, "0")}.ppm`), result, frame); /* file names frame-001.ppm to frame-004.ppm match the frame-%03d.ppm input pattern below */ +} + +const outputPath = path.join(outputDir, "demo.mp4"); +execFileSync("ffmpeg", [ /* runs an external ffmpeg executable; its output is shown through the inherited stdio */ + "-y", + "-framerate", + "1", + "-i", + path.join(frameDir, "frame-%03d.ppm"), + "-vf", + "fps=12,format=yuv420p", + "-movflags", + "+faststart", + outputPath, +], { stdio: "inherit" }); + +fs.rmSync(frameDir, { recursive: true, force: true }); /* removes the intermediate PPM frames after the ffmpeg call returns */ +console.log(`demo video=${outputPath}`); diff --git a/scientific-knowledge-graph/graph-ingestion-auditor/reports/demo.mp4 b/scientific-knowledge-graph/graph-ingestion-auditor/reports/demo.mp4 new file mode 100644 index 00000000..bb99eb30 Binary files /dev/null and b/scientific-knowledge-graph/graph-ingestion-auditor/reports/demo.mp4 differ diff --git a/scientific-knowledge-graph/graph-ingestion-auditor/reports/graph-ingestion-audit.json b/scientific-knowledge-graph/graph-ingestion-auditor/reports/graph-ingestion-audit.json new file mode 100644 index 00000000..9ced4520 --- /dev/null +++ b/scientific-knowledge-graph/graph-ingestion-auditor/reports/graph-ingestion-audit.json @@ -0,0 +1,176 @@ +{ + "projectId": "proj-crispr-neuro-graph", + "reviewedAt": "2026-05-28T00:00:00Z", + "decision": "block-graph-publication", + "riskScore": 100, + "entityCounts": { + "paper": 1, + "author": 1, + "dataset": 1, + "method": 1, + "concept": 1 + }, + "relationshipCount": 2, + "recommendationCount": 1, + "exportFormat": "csv", + "findings": [ + { + "severity": "high", + "code": "REQUIRED_ENTITY_TYPE_MISSING", + "title": "Required knowledge graph entity type is missing", + "evidence": "proj-crispr-neuro-graph has no extracted software entities.", + "action": "Add at least one reviewed software entity before publishing this graph slice." 
+ }, + { + "severity": "high", + "code": "ENTITY_IDENTIFIER_INVALID", + "title": "Entity identifier does not match the expected namespace", + "evidence": "paper:paper-1 identifier=missing-doi.", + "action": "Normalize the entity to DOI, ORCID, DataCite, MeSH, PubChem, UniProt, Wikidata, SWH, or GitHub form." + }, + { + "severity": "high", + "code": "ENTITY_IDENTIFIER_INVALID", + "title": "Entity identifier does not match the expected namespace", + "evidence": "author:author-1 identifier=orcid-pending.", + "action": "Normalize the entity to DOI, ORCID, DataCite, MeSH, PubChem, UniProt, Wikidata, SWH, or GitHub form." + }, + { + "severity": "high", + "code": "ENTITY_CONFIDENCE_BELOW_THRESHOLD", + "title": "Extracted entity confidence is below threshold", + "evidence": "dataset:dataset-1 confidence=0.7, threshold=0.78.", + "action": "Route the entity to human review or hide it from recommendations until verified." + }, + { + "severity": "blocker", + "code": "PRIVATE_ENTITY_SCOPE_MISSING", + "title": "Private entity lacks a project scope", + "evidence": "dataset:dataset-1 is private without a privateProjectId.", + "action": "Attach private graph nodes to a project scope or exclude them from shared graph exports." + }, + { + "severity": "high", + "code": "RELATIONSHIP_PROVENANCE_MISSING", + "title": "Relationship lacks extraction provenance", + "evidence": "edge-1 has incomplete source document or extractor metadata.", + "action": "Store source document, span, extractor version, and review state for every edge." + }, + { + "severity": "medium", + "code": "RELATIONSHIP_EVIDENCE_WEAK", + "title": "Relationship has weak evidence support", + "evidence": "edge-1 has 1 evidence items, below 2.", + "action": "Require more evidence or mark the edge as tentative in graph navigation." 
+ }, + { + "severity": "blocker", + "code": "PRIVATE_CONTENT_EDGE_PUBLIC", + "title": "Public relationship is derived from private content", + "evidence": "edge-1 is public but cites private source content.", + "action": "Downgrade the edge to private scope or re-extract it from public evidence." + }, + { + "severity": "blocker", + "code": "RELATIONSHIP_ENDPOINT_MISSING", + "title": "Relationship references a missing entity", + "evidence": "edge-2 links paper:paper-1 -> software:software-missing, but at least one endpoint is absent.", + "action": "Create the missing entity or remove the relationship before graph export." + }, + { + "severity": "high", + "code": "RECOMMENDATION_EVIDENCE_INSUFFICIENT", + "title": "AI recommendation lacks enough graph evidence", + "evidence": "rec-1 has 1 evidence edges.", + "action": "Hide or label the recommendation until enough verified graph evidence supports it." + }, + { + "severity": "blocker", + "code": "PRIVATE_SIGNAL_IN_PUBLIC_RECOMMENDATION", + "title": "Public recommendation uses private user or project signals", + "evidence": "rec-1 combines private signals with public visibility.", + "action": "Keep private-signal recommendations in the user workspace only." + }, + { + "severity": "medium", + "code": "GRAPH_EXPORT_FORMAT_UNSUPPORTED", + "title": "Graph export format is missing or unsupported", + "evidence": "proj-crispr-neuro-graph export format=csv.", + "action": "Choose jsonld, rdf, or graphml for downstream graph consumers." + }, + { + "severity": "high", + "code": "PUBLIC_GRAPH_LICENSE_MISSING", + "title": "Public graph export lacks a license", + "evidence": "proj-crispr-neuro-graph is configured for public graph export without a license.", + "action": "Set a graph metadata license before publishing reusable linked data." 
+ }, + { + "severity": "medium", + "code": "GRAPH_SCHEMA_VERSION_MISSING", + "title": "Graph export lacks a schema version", + "evidence": "proj-crispr-neuro-graph has no schema version in exportConfig.", + "action": "Pin a schema version so downstream consumers can validate graph payloads." + } + ], + "remediationActions": [ + { + "code": "REQUIRED_ENTITY_TYPE_MISSING", + "action": "Add at least one reviewed software entity before publishing this graph slice." + }, + { + "code": "ENTITY_IDENTIFIER_INVALID", + "action": "Normalize the entity to DOI, ORCID, DataCite, MeSH, PubChem, UniProt, Wikidata, SWH, or GitHub form." + }, + { + "code": "ENTITY_IDENTIFIER_INVALID", + "action": "Normalize the entity to DOI, ORCID, DataCite, MeSH, PubChem, UniProt, Wikidata, SWH, or GitHub form." + }, + { + "code": "ENTITY_CONFIDENCE_BELOW_THRESHOLD", + "action": "Route the entity to human review or hide it from recommendations until verified." + }, + { + "code": "PRIVATE_ENTITY_SCOPE_MISSING", + "action": "Attach private graph nodes to a project scope or exclude them from shared graph exports." + }, + { + "code": "RELATIONSHIP_PROVENANCE_MISSING", + "action": "Store source document, span, extractor version, and review state for every edge." + }, + { + "code": "RELATIONSHIP_EVIDENCE_WEAK", + "action": "Require more evidence or mark the edge as tentative in graph navigation." + }, + { + "code": "PRIVATE_CONTENT_EDGE_PUBLIC", + "action": "Downgrade the edge to private scope or re-extract it from public evidence." + }, + { + "code": "RELATIONSHIP_ENDPOINT_MISSING", + "action": "Create the missing entity or remove the relationship before graph export." + }, + { + "code": "RECOMMENDATION_EVIDENCE_INSUFFICIENT", + "action": "Hide or label the recommendation until enough verified graph evidence supports it." + }, + { + "code": "PRIVATE_SIGNAL_IN_PUBLIC_RECOMMENDATION", + "action": "Keep private-signal recommendations in the user workspace only." 
+ }, + { + "code": "GRAPH_EXPORT_FORMAT_UNSUPPORTED", + "action": "Choose jsonld, rdf, or graphml for downstream graph consumers." + }, + { + "code": "PUBLIC_GRAPH_LICENSE_MISSING", + "action": "Set a graph metadata license before publishing reusable linked data." + }, + { + "code": "GRAPH_SCHEMA_VERSION_MISSING", + "action": "Pin a schema version so downstream consumers can validate graph payloads." + } + ], + "generatedFrom": "synthetic-knowledge-graph-only", + "auditDigest": "0589aebec7ba5969" +} diff --git a/scientific-knowledge-graph/graph-ingestion-auditor/reports/graph-ingestion-audit.md b/scientific-knowledge-graph/graph-ingestion-auditor/reports/graph-ingestion-audit.md new file mode 100644 index 00000000..6e5f915e --- /dev/null +++ b/scientific-knowledge-graph/graph-ingestion-auditor/reports/graph-ingestion-audit.md @@ -0,0 +1,48 @@ +# Knowledge Graph Ingestion Audit + +Project: proj-crispr-neuro-graph +Decision: block-graph-publication +Risk score: 100 +Relationships: 2 +Recommendations: 1 +Audit digest: 0589aebec7ba5969 + +## Entity Counts +- author: 1 +- concept: 1 +- dataset: 1 +- method: 1 +- paper: 1 + +## Findings +- [high] Required knowledge graph entity type is missing: proj-crispr-neuro-graph has no extracted software entities. + Action: Add at least one reviewed software entity before publishing this graph slice. +- [high] Entity identifier does not match the expected namespace: paper:paper-1 identifier=missing-doi. + Action: Normalize the entity to DOI, ORCID, DataCite, MeSH, PubChem, UniProt, Wikidata, SWH, or GitHub form. +- [high] Entity identifier does not match the expected namespace: author:author-1 identifier=orcid-pending. + Action: Normalize the entity to DOI, ORCID, DataCite, MeSH, PubChem, UniProt, Wikidata, SWH, or GitHub form. +- [high] Extracted entity confidence is below threshold: dataset:dataset-1 confidence=0.7, threshold=0.78. + Action: Route the entity to human review or hide it from recommendations until verified. 
+- [blocker] Private entity lacks a project scope: dataset:dataset-1 is private without a privateProjectId. + Action: Attach private graph nodes to a project scope or exclude them from shared graph exports. +- [high] Relationship lacks extraction provenance: edge-1 has incomplete source document or extractor metadata. + Action: Store source document, span, extractor version, and review state for every edge. +- [medium] Relationship has weak evidence support: edge-1 has 1 evidence items, below 2. + Action: Require more evidence or mark the edge as tentative in graph navigation. +- [blocker] Public relationship is derived from private content: edge-1 is public but cites private source content. + Action: Downgrade the edge to private scope or re-extract it from public evidence. +- [blocker] Relationship references a missing entity: edge-2 links paper:paper-1 -> software:software-missing, but at least one endpoint is absent. + Action: Create the missing entity or remove the relationship before graph export. +- [high] AI recommendation lacks enough graph evidence: rec-1 has 1 evidence edges. + Action: Hide or label the recommendation until enough verified graph evidence supports it. +- [blocker] Public recommendation uses private user or project signals: rec-1 combines private signals with public visibility. + Action: Keep private-signal recommendations in the user workspace only. +- [medium] Graph export format is missing or unsupported: proj-crispr-neuro-graph export format=csv. + Action: Choose jsonld, rdf, or graphml for downstream graph consumers. +- [high] Public graph export lacks a license: proj-crispr-neuro-graph is configured for public graph export without a license. + Action: Set a graph metadata license before publishing reusable linked data. +- [medium] Graph export lacks a schema version: proj-crispr-neuro-graph has no schema version in exportConfig. + Action: Pin a schema version so downstream consumers can validate graph payloads. 
+ +## Safety +- Synthetic graph packet only; no private research content, user activity, live ontology services, credentials, or external APIs. diff --git a/scientific-knowledge-graph/graph-ingestion-auditor/reports/graph-ingestion-summary.svg b/scientific-knowledge-graph/graph-ingestion-auditor/reports/graph-ingestion-summary.svg new file mode 100644 index 00000000..4d237aa7 --- /dev/null +++ b/scientific-knowledge-graph/graph-ingestion-auditor/reports/graph-ingestion-summary.svg @@ -0,0 +1,30 @@ + + + + Knowledge Graph Ingestion Audit + + block-graph-publication + Risk score: 100 + Audit digest: 0589aebec7ba5969 + author + + + 1 +concept + + + 1 +dataset + + + 1 +method + + + 1 +paper + + + 1 + Synthetic graph packet only. No ontology or recommendation service calls. + diff --git a/scientific-knowledge-graph/graph-ingestion-auditor/sample-data.js b/scientific-knowledge-graph/graph-ingestion-auditor/sample-data.js new file mode 100644 index 00000000..acfb071f --- /dev/null +++ b/scientific-knowledge-graph/graph-ingestion-auditor/sample-data.js @@ -0,0 +1,221 @@ +const graphPolicy = { + reviewDate: "2026-05-28T00:00:00Z", + minimumEntityConfidence: 0.78, + minimumRelationshipEvidence: 2, + minimumRecommendationEvidence: 3, +}; + +const riskyGraph = { + projectId: "proj-crispr-neuro-graph", + entities: [ + { + id: "paper-1", + type: "paper", + label: "CRISPR screen in neural organoids", + identifier: "missing-doi", + confidence: 0.91, + visibility: "public", + }, + { + id: "author-1", + type: "author", + label: "R. 
Singh", + identifier: "orcid-pending", + confidence: 0.86, + visibility: "public", + }, + { + id: "dataset-1", + type: "dataset", + label: "Organoid expression matrix", + identifier: "zenodo:123456", + confidence: 0.7, + visibility: "private", + }, + { + id: "method-1", + type: "method", + label: "single-cell clustering", + identifier: "mesh:D012345", + confidence: 0.81, + visibility: "public", + }, + { + id: "concept-1", + type: "concept", + label: "dopamine receptor", + identifier: "uniprot:P14416", + confidence: 0.9, + visibility: "public", + }, + ], + relationships: [ + { + id: "edge-1", + sourceType: "paper", + sourceId: "paper-1", + targetType: "dataset", + targetId: "dataset-1", + predicate: "usesDataset", + evidenceCount: 1, + visibility: "public", + sourcePrivateContent: true, + provenance: { + sourceDocumentId: "doc-1", + extractorVersion: "", + }, + }, + { + id: "edge-2", + sourceType: "paper", + sourceId: "paper-1", + targetType: "software", + targetId: "software-missing", + predicate: "usesSoftware", + evidenceCount: 2, + visibility: "public", + sourcePrivateContent: false, + provenance: { + sourceDocumentId: "doc-1", + extractorVersion: "kg-extract-0.4.1", + }, + }, + ], + recommendations: [ + { + id: "rec-1", + label: "Recommend related CRISPR dataset", + evidenceEdges: ["edge-1"], + usesPrivateSignals: true, + visibility: "public", + }, + ], + exportConfig: { + public: true, + format: "csv", + license: "", + schemaVersion: "", + }, +}; + +const readyGraph = { + projectId: "proj-catalyst-kg", + entities: [ + { + id: "paper-1", + type: "paper", + label: "Catalyst reuse in flow reactors", + identifier: "10.1234/scibase.synthetic.42", + confidence: 0.95, + visibility: "public", + }, + { + id: "author-1", + type: "author", + label: "Morgan Rivera", + identifier: "0000-0001-5000-0007", + confidence: 0.93, + visibility: "public", + }, + { + id: "dataset-1", + type: "dataset", + label: "Catalyst reuse yields", + identifier: "doi:10.5281/zenodo.1234567", + 
confidence: 0.91, + visibility: "public", + }, + { + id: "method-1", + type: "method", + label: "flow chemistry", + identifier: "wikidata:Q902470", + confidence: 0.89, + visibility: "public", + }, + { + id: "software-1", + type: "software", + label: "scibase-analysis", + identifier: "github:SCIBASE-AI/scibase-analysis", + confidence: 0.88, + visibility: "public", + }, + { + id: "concept-1", + type: "concept", + label: "palladium", + identifier: "pubchem:23938", + confidence: 0.94, + visibility: "public", + }, + ], + relationships: [ + { + id: "edge-1", + sourceType: "paper", + sourceId: "paper-1", + targetType: "dataset", + targetId: "dataset-1", + predicate: "usesDataset", + evidenceCount: 3, + visibility: "public", + sourcePrivateContent: false, + provenance: { + sourceDocumentId: "doc-42", + extractorVersion: "kg-extract-1.0.0", + }, + }, + { + id: "edge-2", + sourceType: "paper", + sourceId: "paper-1", + targetType: "software", + targetId: "software-1", + predicate: "usesSoftware", + evidenceCount: 2, + visibility: "public", + sourcePrivateContent: false, + provenance: { + sourceDocumentId: "doc-42", + extractorVersion: "kg-extract-1.0.0", + }, + }, + { + id: "edge-3", + sourceType: "paper", + sourceId: "paper-1", + targetType: "concept", + targetId: "concept-1", + predicate: "studiesConcept", + evidenceCount: 4, + visibility: "public", + sourcePrivateContent: false, + provenance: { + sourceDocumentId: "doc-42", + extractorVersion: "kg-extract-1.0.0", + }, + }, + ], + recommendations: [ + { + id: "rec-1", + label: "Recommend related flow chemistry method", + evidenceEdges: ["edge-1", "edge-2", "edge-3"], + usesPrivateSignals: false, + visibility: "public", + }, + ], + exportConfig: { + public: true, + format: "jsonld", + license: "CC-BY-4.0", + schemaVersion: "scibase-kg-v1", + }, +}; + +module.exports = { + graphPolicy, + readyGraph, + riskyGraph, +}; diff --git a/scientific-knowledge-graph/graph-ingestion-auditor/test.js 
b/scientific-knowledge-graph/graph-ingestion-auditor/test.js new file mode 100644 index 00000000..d7e7067a --- /dev/null +++ b/scientific-knowledge-graph/graph-ingestion-auditor/test.js @@ -0,0 +1,45 @@ +const assert = require("assert"); +const { + auditKnowledgeGraph, + digest, + entityKey, + renderMarkdownReport, + renderSvgSummary, +} = require("./index"); +const { graphPolicy, readyGraph, riskyGraph } = require("./sample-data"); + +const risky = auditKnowledgeGraph(riskyGraph, graphPolicy); +assert.strictEqual(risky.decision, "block-graph-publication"); +assert.ok(risky.riskScore >= 90, "risky graph should produce a strong block score"); +assert.ok(risky.findings.some((item) => item.code === "REQUIRED_ENTITY_TYPE_MISSING")); +assert.ok(risky.findings.some((item) => item.code === "ENTITY_IDENTIFIER_INVALID")); +assert.ok(risky.findings.some((item) => item.code === "ENTITY_CONFIDENCE_BELOW_THRESHOLD")); +assert.ok(risky.findings.some((item) => item.code === "PRIVATE_ENTITY_SCOPE_MISSING")); +assert.ok(risky.findings.some((item) => item.code === "RELATIONSHIP_ENDPOINT_MISSING")); +assert.ok(risky.findings.some((item) => item.code === "RELATIONSHIP_PROVENANCE_MISSING")); +assert.ok(risky.findings.some((item) => item.code === "PRIVATE_CONTENT_EDGE_PUBLIC")); +assert.ok(risky.findings.some((item) => item.code === "PRIVATE_SIGNAL_IN_PUBLIC_RECOMMENDATION")); +assert.ok(risky.findings.some((item) => item.code === "GRAPH_EXPORT_FORMAT_UNSUPPORTED")); +assert.ok(risky.findings.some((item) => item.code === "PUBLIC_GRAPH_LICENSE_MISSING")); + +const repeat = auditKnowledgeGraph(riskyGraph, graphPolicy); +assert.strictEqual(risky.auditDigest, repeat.auditDigest, "audit digest must be deterministic"); + +const ready = auditKnowledgeGraph(readyGraph, graphPolicy); +assert.strictEqual(ready.decision, "ready-for-graph-publication"); +assert.strictEqual(ready.findings.length, 0); +assert.strictEqual(ready.entityCounts.software, 1); + +assert.strictEqual(entityKey({ type: 
"paper", id: "p1" }), "paper:p1"); +assert.strictEqual(digest({ b: 2, a: 1 }), digest({ a: 1, b: 2 })); + +const markdown = renderMarkdownReport(risky); +assert.ok(markdown.includes("Knowledge Graph Ingestion Audit")); +assert.ok(markdown.includes("Synthetic graph packet only")); +assert.ok(markdown.includes("block-graph-publication")); + +const svg = renderSvgSummary(risky); +assert.ok(svg.includes("