diff --git a/scientific-artifact-hosting-governance/README.md b/scientific-artifact-hosting-governance/README.md new file mode 100644 index 00000000..fd8d8b71 --- /dev/null +++ b/scientific-artifact-hosting-governance/README.md @@ -0,0 +1,52 @@ +# Scientific Artifact Hosting Governance + +This module adds a dependency-free governance layer for the Scientific Data & Code Hosting bounty in issue #14. It focuses on the reviewable data/code package contracts that make scientific artifacts findable, reusable, versioned, and executable. + +## What It Covers + +- Deterministic artifact manifests with SHA-256 content hashes. +- File-type detection for datasets, notebooks, code, figures, documents, and supplements. +- Metadata completeness checks for title, creators, description, keywords, and license. +- Metadata-aware preview descriptors for tables, notebooks, images, and source files. +- Version diff records for changed content, license changes, and metadata changes. +- JSON-LD and DataCite export payloads for FAIR discovery and DOI workflows. +- Execution environment readiness checks for reproducible analysis commands. +- Audit hash for review and long-term traceability. + +## Run The Demo + +```bash +node scientific-artifact-hosting-governance/demo.js +``` + +The demo prints a reviewer-ready JSON record with the artifact manifest, version diffs, runtime readiness, FAIR score, JSON-LD export, DataCite export, blockers, and audit hash. + +## Visual Demo + +Open `scientific-artifact-hosting-governance/docs/demo.svg` for a privacy-safe walkthrough of the module flow. It uses synthetic sample data only. + +For bounty review, the same walkthrough is also available as a short WebM demo video at `scientific-artifact-hosting-governance/docs/demo.webm`. 
+ +## Run The Tests + +```bash +node scientific-artifact-hosting-governance/test.js +``` + +The tests cover type detection, manifest previews and hashes, metadata blockers, version diffs, runtime readiness, JSON-LD/DataCite exports, and audit hashing. + +## Requirement Mapping + +| Issue #14 requirement | Implementation | +| --- | --- | +| Scalable storage engine | `buildArtifactManifest()` records artifact type, hash, size, version, access, and preview metadata. | +| Metadata-aware previews | `buildPreviewDescriptor()` emits table, notebook, image, source, or download preview contracts. | +| Upload versioning and diffing | `diffArtifactVersions()` and `buildVersionDiffs()` capture content, license, and metadata changes. | +| JSON-LD/schema.org metadata | `buildJsonLd()` emits schema.org-compatible dataset distribution metadata. | +| DataCite metadata | `buildDataCite()` emits DOI-oriented identifier, creator, title, publisher, and related artifact records. | +| FAIR compliance | `scoreFairReadiness()` checks metadata completeness, public access, and runtime readiness. | +| Executable environments | `validateRuntimeEnvironment()` verifies Docker/environment definition and reproducibility commands. | + +## Design Notes + +The module uses only Node.js built-ins and synthetic sample data. It does not require package installation, credentials, live files, or external APIs. 
// Demo entry point: evaluates the bundled synthetic project and prints the
// resulting review record as pretty-printed JSON.
const governance = require("./index");
const sampleData = require("./sample-data");

const report = governance.evaluateArtifactHosting(sampleData.project);

console.log(JSON.stringify(report, null, 2));
/**
 * Lower-case file extension -> artifact category, used when an artifact does
 * not declare an explicit `type`.
 */
const EXTENSION_TYPES = {
  csv: "dataset",
  tsv: "dataset",
  json: "dataset",
  parquet: "dataset",
  xlsx: "dataset",
  ipynb: "notebook",
  py: "code",
  r: "code",
  jl: "code",
  png: "figure",
  jpg: "figure",
  jpeg: "figure",
  svg: "figure",
  mp4: "supplement",
  pdf: "document",
  md: "document",
};

/**
 * Serialises a value to a string with object keys sorted, so that two
 * structurally equal values always produce the same text.
 *
 * Values JSON cannot represent (undefined, functions, symbols) are written as
 * the literal text `undefined`. This keeps the return type a string at the top
 * level and stops `[undefined]` from serialising like `[]`; the text produced
 * for object members is unchanged (`{"a":undefined}`).
 *
 * @param {*} value - Value to serialise.
 * @returns {string} Deterministic serialisation of `value`.
 */
function stableStringify(value) {
  if (Array.isArray(value)) {
    return `[${value.map((item) => stableStringify(item)).join(",")}]`;
  }

  if (value && typeof value === "object") {
    const members = Object.keys(value)
      .sort()
      .map((key) => `${JSON.stringify(key)}:${stableStringify(value[key])}`);
    return `{${members.join(",")}}`;
  }

  const encoded = JSON.stringify(value);
  return encoded === undefined ? "undefined" : encoded;
}

/**
 * Hex-encoded SHA-256 digest of the stable serialisation of `value`.
 *
 * @param {*} value - Value to hash.
 * @returns {string} 64-character lowercase hex digest.
 */
function sha256(value) {
  return crypto.createHash("sha256").update(stableStringify(value)).digest("hex");
}

/**
 * Throws unless `value` is an array.
 *
 * @param {string} name - Label used in the error message.
 * @param {*} value - Value to check.
 * @throws {TypeError} When `value` is not an array.
 */
function assertArray(name, value) {
  if (!Array.isArray(value)) {
    throw new TypeError(`${name} must be an array`);
  }
}

/**
 * Returns the lower-cased extension of a file name, without the dot.
 *
 * The name is coerced to a string before slicing, and only the part after the
 * last path separator is inspected, so `archive.v2/README` has no extension.
 *
 * @param {*} name - File name or path; falsy values are treated as "".
 * @returns {string} Extension, or "" when the base name contains no dot.
 */
function getExtension(name) {
  const text = String(name || "");
  const separator = Math.max(text.lastIndexOf("/"), text.lastIndexOf("\\"));
  const baseName = text.slice(separator + 1);
  const index = baseName.lastIndexOf(".");
  return index === -1 ? "" : baseName.slice(index + 1).toLowerCase();
}

/**
 * Resolves the artifact category: an explicit `artifact.type` wins, otherwise
 * the file extension is looked up, falling back to "supplement".
 *
 * @param {{type?: string, name?: string}} artifact - Artifact record.
 * @returns {string} Artifact category.
 */
function detectArtifactType(artifact) {
  if (artifact.type) {
    return artifact.type;
  }

  const extension = getExtension(artifact.name);
  // Own-property check: extensions such as "constructor" or "__proto__" must
  // not resolve to members inherited from Object.prototype.
  return Object.prototype.hasOwnProperty.call(EXTENSION_TYPES, extension)
    ? EXTENSION_TYPES[extension]
    : "supplement";
}

/**
 * Size in bytes of the artifact content. Strings are measured directly; other
 * values are measured through their stable serialisation.
 *
 * @param {*} content - Artifact content.
 * @returns {number} Byte length, or 0 for null/undefined content.
 */
function byteSize(content) {
  if (content === undefined || content === null) {
    return 0;
  }

  return Buffer.byteLength(typeof content === "string" ? content : stableStringify(content));
}

/**
 * Builds the preview contract for an artifact.
 *
 * String datasets get a table preview whose header is split on tabs for
 * `.tsv` files and on commas otherwise. Quoted fields containing the
 * delimiter are not parsed specially.
 *
 * @param {{name?: string, content?: *, language?: string}} artifact - Artifact record.
 * @param {string} type - Category as returned by detectArtifactType().
 * @returns {object} Preview descriptor with a `kind` of table, notebook,
 *   thumbnail, source or download.
 */
function buildPreviewDescriptor(artifact, type) {
  if (type === "dataset" && typeof artifact.content === "string") {
    const delimiter = getExtension(artifact.name) === "tsv" ? "\t" : ",";
    const lines = artifact.content.split(/\r?\n/);

    // Drop blank lines at both ends without trimming inside the first line,
    // so a leading tab (empty first header cell) survives.
    let start = 0;
    while (start < lines.length && lines[start].trim() === "") {
      start += 1;
    }
    let end = lines.length;
    while (end > start && lines[end - 1].trim() === "") {
      end -= 1;
    }

    const rows = lines.slice(start, end).filter(Boolean);
    const header = rows[0] ? rows[0].split(delimiter).map((value) => value.trim()) : [];
    return {
      kind: "table",
      rowCount: Math.max(rows.length - 1, 0),
      columns: header,
    };
  }

  if (type === "notebook") {
    return {
      kind: "notebook",
      renderer: "jupyter",
    };
  }

  if (type === "figure") {
    return {
      kind: "thumbnail",
      renderer: "image",
    };
  }

  if (type === "code") {
    return {
      kind: "source",
      language: artifact.language || getExtension(artifact.name),
    };
  }

  return {
    kind: "download",
  };
}

/**
 * Checks that an artifact carries title, creators, description and keywords
 * in `metadata` (empty arrays count as missing) and a top-level `license`.
 *
 * @param {{metadata?: object, license?: string}} artifact - Artifact record.
 * @returns {{ok: boolean, missing: string[]}} Missing field names in check order.
 */
function validateArtifactMetadata(artifact) {
  const metadata = artifact.metadata || {};
  const missing = [];

  for (const field of ["title", "creators", "description", "keywords"]) {
    if (!metadata[field] || (Array.isArray(metadata[field]) && metadata[field].length === 0)) {
      missing.push(field);
    }
  }

  if (!artifact.license) {
    missing.push("license");
  }

  return {
    ok: missing.length === 0,
    missing,
  };
}
/**
 * Compares two versions of one artifact and lists what changed.
 *
 * `previousHash` and `currentHash` cover name, type, content and metadata,
 * the same inputs buildArtifactManifest() hashes for `contentHash`. The
 * "content_changed" flag is decided separately from name, type and content
 * only, so a metadata-only edit is reported as "metadata_changed" alone.
 *
 * @param {object} previousArtifact - Earlier version of the artifact.
 * @param {object} currentArtifact - Newer version of the artifact.
 * @returns {{artifactId: *, previousVersion: string, currentVersion: string,
 *   previousHash: string, currentHash: string, changes: string[]}} Diff record;
 *   `changes` holds any of "content_changed", "license_changed",
 *   "metadata_changed", in that order.
 */
function diffArtifactVersions(previousArtifact, currentArtifact) {
  const payloadOf = (artifact) => ({
    name: artifact.name,
    type: detectArtifactType(artifact),
    content: artifact.content,
  });

  const previousPayload = payloadOf(previousArtifact);
  const currentPayload = payloadOf(currentArtifact);
  const previousMetadata = previousArtifact.metadata || {};
  const currentMetadata = currentArtifact.metadata || {};

  const previousHash = sha256({...previousPayload, metadata: previousMetadata});
  const currentHash = sha256({...currentPayload, metadata: currentMetadata});
  const changes = [];

  if (stableStringify(previousPayload) !== stableStringify(currentPayload)) {
    changes.push("content_changed");
  }
  // Missing and null licenses are equivalent (the manifest also stores
  // `license || null`), so only a real change is flagged.
  if ((previousArtifact.license || null) !== (currentArtifact.license || null)) {
    changes.push("license_changed");
  }
  if (stableStringify(previousMetadata) !== stableStringify(currentMetadata)) {
    changes.push("metadata_changed");
  }

  return {
    artifactId: currentArtifact.id,
    previousVersion: previousArtifact.version || "1",
    currentVersion: currentArtifact.version || "1",
    previousHash,
    currentHash,
    changes,
  };
}

/**
 * Produces a diff record for every current artifact whose id also appears in
 * `project.previousArtifacts`; artifacts without a predecessor are skipped.
 *
 * @param {{artifacts: object[], previousArtifacts?: object[]}} project - Project record.
 * @returns {object[]} Diff records in the order of `project.artifacts`.
 */
function buildVersionDiffs(project) {
  const previousById = new Map((project.previousArtifacts || []).map((artifact) => [artifact.id, artifact]));

  return project.artifacts
    .filter((artifact) => previousById.has(artifact.id))
    .map((artifact) => diffArtifactVersions(previousById.get(artifact.id), artifact));
}
/**
 * Builds a schema.org `Dataset` JSON-LD document for the project, with one
 * `DataDownload` distribution entry per manifest artifact.
 *
 * @param {object} project - Project record (id, persistentId, title,
 *   description, creators, keywords, license are read).
 * @param {object[]} manifest - Manifest entries as produced by buildArtifactManifest().
 * @returns {object} JSON-LD payload; `contentUrl` is null for non-public artifacts.
 */
function buildJsonLd(project, manifest) {
  return {
    "@context": "https://schema.org",
    "@type": "Dataset",
    "@id": project.persistentId || project.id,
    name: project.title,
    description: project.description,
    creator: project.creators,
    keywords: project.keywords || [],
    license: project.license,
    distribution: manifest.map((artifact) => ({
      "@type": "DataDownload",
      identifier: artifact.persistentId || artifact.id,
      name: artifact.name,
      encodingFormat: artifact.type,
      contentUrl: artifact.access === "public" ? `/artifacts/${artifact.id}` : null,
      sha256: artifact.contentHash,
    })),
  };
}

// Bare DOI name: "10." + registrant code of 4-9 digits (optionally with
// dotted sub-elements), a slash, then a non-empty suffix without whitespace.
const DOI_PATTERN = /^10\.\d{4,9}(?:\.\d+)*\/\S+$/;

/**
 * Classifies an identifier as a DOI or a local id.
 *
 * Only a complete bare DOI name counts as "DOI"; strings that merely begin
 * with "10." (for example the file name `10.results.csv`) are "LocalId".
 *
 * @param {*} identifier - Identifier to classify; falsy values are treated as "".
 * @returns {"DOI"|"LocalId"} Identifier type label.
 */
function metadataIdentifierType(identifier) {
  return DOI_PATTERN.test(String(identifier || "").trim()) ? "DOI" : "LocalId";
}

/**
 * Builds a DataCite-style metadata record for the project, listing every
 * manifest artifact as a `HasPart` related identifier.
 *
 * @param {object} project - Project record; `project.creators` must be an array.
 * @param {object[]} manifest - Manifest entries as produced by buildArtifactManifest().
 * @returns {object} Record with identifiers, creators, titles, publisher
 *   (defaults to "SCIBASE.AI"), publicationYear, resourceType and relatedIdentifiers.
 */
function buildDataCite(project, manifest) {
  const projectIdentifier = project.persistentId || project.id;

  return {
    identifiers: [{identifier: projectIdentifier, identifierType: metadataIdentifierType(projectIdentifier)}],
    creators: project.creators.map((creator) => ({name: creator})),
    titles: [{title: project.title}],
    publisher: project.publisher || "SCIBASE.AI",
    publicationYear: project.publicationYear,
    resourceType: {resourceTypeGeneral: "Dataset", resourceType: "Scientific artifact package"},
    relatedIdentifiers: manifest.map((artifact) => {
      const artifactIdentifier = artifact.persistentId || artifact.id;

      return {
        relatedIdentifier: artifactIdentifier,
        relatedIdentifierType: metadataIdentifierType(artifactIdentifier),
        relationType: "HasPart",
      };
    }),
  };
}
0 : 20; + const score = maxScore - metadataMissing * 8 - privateArtifacts * 4 - runtimePenalty; + + return Math.max(score, 0); +} + +function evaluateArtifactHosting(project) { + assertArray("project.creators", project.creators); + assertArray("project.artifacts", project.artifacts); + + const manifest = buildArtifactManifest(project); + const runtime = validateRuntimeEnvironment(project.executionEnvironment); + const versionDiffs = buildVersionDiffs(project); + const blockers = []; + + for (const artifact of manifest) { + if (!artifact.metadataCheck.ok) { + blockers.push(`artifact_metadata_incomplete:${artifact.id}`); + } + } + + if (!runtime.ok) { + blockers.push("runtime_environment_incomplete"); + } + + if (!project.license) { + blockers.push("project_license_missing"); + } + + const fairScore = scoreFairReadiness(manifest, runtime); + + return { + projectId: project.id, + manifest, + versionDiffs, + runtimeReadiness: runtime, + fairScore, + hostingReadiness: blockers.length === 0 ? 
"ready_for_review" : "blocked", + blockers, + jsonLd: buildJsonLd(project, manifest), + dataCite: buildDataCite(project, manifest), + auditHash: sha256({ + projectId: project.id, + manifest, + versionDiffs, + runtime, + fairScore, + blockers, + }), + }; +} + +module.exports = { + buildArtifactManifest, + buildDataCite, + buildJsonLd, + detectArtifactType, + diffArtifactVersions, + evaluateArtifactHosting, + metadataIdentifierType, + sha256, + stableStringify, + validateRuntimeEnvironment, +}; diff --git a/scientific-artifact-hosting-governance/sample-data.js b/scientific-artifact-hosting-governance/sample-data.js new file mode 100644 index 00000000..58ca23f4 --- /dev/null +++ b/scientific-artifact-hosting-governance/sample-data.js @@ -0,0 +1,81 @@ +const previousDataset = { + id: "artifact-cell-counts", + name: "cell-counts.csv", + version: "1.0", + license: "CC-BY-4.0", + access: "public", + persistentId: "10.5555/scibase.cell-counts.v1", + metadata: { + title: "Cell count measurements", + creators: ["Demo Lab"], + description: "Baseline cell-count measurements for a synthetic reproducibility package.", + keywords: ["cell-count", "reproducibility", "synthetic"], + }, + content: "sample,condition,count\nA,control,10\nB,treatment,12\n", +}; + +const project = { + id: "project-reproducible-cells-001", + title: "Synthetic cell-count reproducibility package", + description: "A privacy-safe sample package showing data, code, notebook, metadata, versioning, and runtime readiness.", + creators: ["Demo Lab", "SCIBASE Reviewer"], + keywords: ["FAIR", "data hosting", "code hosting", "reproducibility"], + license: "CC-BY-4.0", + publisher: "SCIBASE.AI", + publicationYear: 2026, + persistentId: "10.5555/scibase.reproducible-cells", + previousArtifacts: [previousDataset], + artifacts: [ + { + ...previousDataset, + version: "1.1", + content: "sample,condition,count\nA,control,10\nB,treatment,13\nC,treatment,15\n", + }, + { + id: "artifact-analysis-code", + name: "analysis.py", + 
// Covers extension-based detection plus the cases the lookup depends on:
// case-insensitive extensions, the explicit `type` override, and names
// without an extension (or no name at all) falling back to "supplement".
function testTypeDetection() {
  assert.strictEqual(detectArtifactType({name: "measurements.csv"}), "dataset");
  assert.strictEqual(detectArtifactType({name: "analysis.ipynb"}), "notebook");
  assert.strictEqual(detectArtifactType({name: "workflow.py"}), "code");
  assert.strictEqual(detectArtifactType({name: "figure.svg"}), "figure");
  assert.strictEqual(detectArtifactType({name: "paper.pdf"}), "document");
  assert.strictEqual(detectArtifactType({name: "unknown.bin"}), "supplement");
  assert.strictEqual(detectArtifactType({name: "MEASUREMENTS.CSV"}), "dataset");
  assert.strictEqual(detectArtifactType({name: "unknown.bin", type: "dataset"}), "dataset");
  assert.strictEqual(detectArtifactType({name: "README"}), "supplement");
  assert.strictEqual(detectArtifactType({}), "supplement");
}
// An artifact with no metadata and no license must block hosting and report
// every required field as missing.
function testMissingMetadataBlocksReadiness() {
  const incomplete = {
    ...project,
    artifacts: [
      {
        id: "artifact-bad",
        name: "bad.csv",
        content: "id,value\n1,2\n",
      },
    ],
  };
  const result = evaluateArtifactHosting(incomplete);

  assert.strictEqual(result.hostingReadiness, "blocked");
  assert(result.blockers.includes("artifact_metadata_incomplete:artifact-bad"));
  assert.deepStrictEqual(result.manifest[0].metadataCheck.missing, [
    "title",
    "creators",
    "description",
    "keywords",
    "license",
  ]);
}

// The sample dataset changes content only, so the change list is pinned
// exactly and an unchanged artifact must produce no changes at all.
function testVersionDiffs() {
  const diff = diffArtifactVersions(previousDataset, project.artifacts[0]);

  assert.strictEqual(diff.artifactId, "artifact-cell-counts");
  assert.strictEqual(diff.previousVersion, "1.0");
  assert.strictEqual(diff.currentVersion, "1.1");
  assert.deepStrictEqual(diff.changes, ["content_changed"]);
  assert.notStrictEqual(diff.previousHash, diff.currentHash);

  const unchanged = diffArtifactVersions(previousDataset, previousDataset);

  assert.deepStrictEqual(unchanged.changes, []);
  assert.strictEqual(unchanged.previousHash, unchanged.currentHash);
}

// Covers a complete environment, a partially specified one, and no
// environment at all.
function testRuntimeReadiness() {
  assert.strictEqual(validateRuntimeEnvironment(project.executionEnvironment).ok, true);
  assert.deepStrictEqual(validateRuntimeEnvironment({type: "docker"}).missing, ["image_or_definition_file", "commands"]);
  assert.deepStrictEqual(validateRuntimeEnvironment(null), {
    ok: false,
    missing: ["environment"],
    commands: [],
  });
}
// Bare DOI names are "DOI"; local names, empty or missing identifiers and
// URL-form identifiers are "LocalId".
function testMetadataIdentifierTypes() {
  assert.strictEqual(metadataIdentifierType("10.5555/scibase.artifact"), "DOI");
  assert.strictEqual(metadataIdentifierType("analysis.py"), "LocalId");
  assert.strictEqual(metadataIdentifierType(""), "LocalId");
  assert.strictEqual(metadataIdentifierType(undefined), "LocalId");
  assert.strictEqual(metadataIdentifierType(null), "LocalId");
  assert.strictEqual(metadataIdentifierType("https://doi.org/10.5555/scibase.artifact"), "LocalId");
}