diff --git a/knowledge-graph-author-affiliation-disambiguation/README.md b/knowledge-graph-author-affiliation-disambiguation/README.md new file mode 100644 index 0000000..ff3ae7f --- /dev/null +++ b/knowledge-graph-author-affiliation-disambiguation/README.md @@ -0,0 +1,30 @@ +# Knowledge Graph Author Affiliation Disambiguation + +Self-contained module for issue `#17` Scientific Knowledge Graph Integration. + +It adds a deterministic trust layer for author and affiliation entities before they are used in entity pages, collaboration maps, semantic search, or AI recommendations. + +## What It Does + +- Normalizes author names, initials, ORCID values, email domains, affiliations, and scientific concepts. +- Merges author mentions when there is strong evidence from ORCID, affiliation, domain, and topic overlap. +- Sends homonyms and low-confidence merges to a curator queue instead of polluting the graph. +- Builds collaboration edges from shared document evidence. +- Produces recommendation guards so uncertain identities do not drive cross-lab suggestions. +- Exports schema.org-compatible creator metadata. + +## Demo + +```bash +npm run check +npm test +npm run demo +``` + +The sample dataset intentionally contains two different `Maya Chen` authors. The Stanford CRISPR author is merged across paper, dataset, and protocol mentions, while the MIT materials-science homonym is routed to curator review. + +## Why This Belongs in the Knowledge Graph + +The issue calls out authors, affiliations, entity pages, lab-to-lab collaboration maps, graph navigation, and personalized recommendations. Those features depend on author identity quality. A graph that merges homonyms or splits the same author across affiliations will produce misleading entity pages, false collaboration paths, and bad recommendation digests. + +This slice complements broad extractors and navigators by adding identity-quality controls before graph edges are trusted. 
diff --git a/knowledge-graph-author-affiliation-disambiguation/data/sample-mentions.json b/knowledge-graph-author-affiliation-disambiguation/data/sample-mentions.json new file mode 100644 index 0000000..2d79660 --- /dev/null +++ b/knowledge-graph-author-affiliation-disambiguation/data/sample-mentions.json @@ -0,0 +1,56 @@ +[ + { + "mentionId": "m1", + "documentId": "paper-crispr-01", + "name": "Dr. Maya A. Chen", + "orcid": "0000-0002-1825-0097", + "email": "maya.chen@stanford.edu", + "affiliation": "Stanford University, Department of Bioengineering", + "concepts": ["CRISPR", "single-cell RNA-seq"], + "doi": "10.5555/crispr.01" + }, + { + "mentionId": "m2", + "documentId": "dataset-crispr-01", + "name": "Maya Chen", + "email": "mchen@stanford.edu", + "affiliation": "Dept. of Bioengineering, Stanford Univ.", + "concepts": ["CRISPR", "perturb-seq"], + "doi": "10.5555/data.01" + }, + { + "mentionId": "m3", + "documentId": "protocol-crispr-02", + "name": "M. Chen", + "affiliation": "Stanford Bioengineering", + "concepts": ["CRISPR", "protocol"], + "doi": "10.5555/protocol.02" + }, + { + "mentionId": "m4", + "documentId": "paper-materials-07", + "name": "Maya Chen", + "email": "maya.chen@mit.edu", + "affiliation": "MIT Materials Science and Engineering", + "concepts": ["perovskite", "thin films"], + "doi": "10.5555/materials.07" + }, + { + "mentionId": "m5", + "documentId": "paper-crispr-01", + "name": "Luis Ortega", + "orcid": "0000-0003-1111-2222", + "email": "lortega@ucsf.edu", + "affiliation": "UCSF Computational Biology", + "concepts": ["single-cell RNA-seq", "trajectory inference"], + "doi": "10.5555/crispr.01" + }, + { + "mentionId": "m6", + "documentId": "dataset-crispr-01", + "name": "L. 
Ortega", + "affiliation": "University of California San Francisco", + "concepts": ["trajectory inference", "perturb-seq"], + "doi": "10.5555/data.01" + } +] diff --git a/knowledge-graph-author-affiliation-disambiguation/docs/demo.mp4 b/knowledge-graph-author-affiliation-disambiguation/docs/demo.mp4 new file mode 100644 index 0000000..be954aa Binary files /dev/null and b/knowledge-graph-author-affiliation-disambiguation/docs/demo.mp4 differ diff --git a/knowledge-graph-author-affiliation-disambiguation/docs/demo.svg b/knowledge-graph-author-affiliation-disambiguation/docs/demo.svg new file mode 100644 index 0000000..8430a26 --- /dev/null +++ b/knowledge-graph-author-affiliation-disambiguation/docs/demo.svg @@ -0,0 +1,20 @@ + + + Author Affiliation Disambiguation + Trust layer for scientific knowledge graph author and collaboration nodes + + Maya A. Chen + Stanford Bioengineering + + Luis Ortega + UCSF Computational Bio + + Maya Chen + MIT Materials Science + + collaboration edge + + homonym review guard + + Output: author nodes, collaboration edges, schema.org export, curator queue, recommendation suppression guards + diff --git a/knowledge-graph-author-affiliation-disambiguation/docs/requirement-map.md b/knowledge-graph-author-affiliation-disambiguation/docs/requirement-map.md new file mode 100644 index 0000000..fcf84fd --- /dev/null +++ b/knowledge-graph-author-affiliation-disambiguation/docs/requirement-map.md @@ -0,0 +1,16 @@ +# Requirement Map + +This module targets issue `#17` Scientific Knowledge Graph Integration. + +| Issue capability | Implementation | +| --- | --- | +| Parse authors and affiliations from uploaded content | `createAuthorGraph()` accepts author mentions from papers, datasets, notebooks, or protocols and normalizes names, affiliations, ORCID values, email domains, concepts, and DOI evidence. 
| +| Build author graphs and collaboration maps | `buildCollaborationEdges()` creates weighted collaboration edges from shared document evidence and shared concept context. | +| Aggregate usage contexts for entity pages | Author nodes include mention IDs, source documents, concepts, affiliations, and merge evidence for entity-page rendering. | +| Support semantic graph recommendations | `recommendationGuards` suppresses recommendations when identity confidence is not sufficient, preventing unsafe cross-lab suggestions. | +| Output linked data / schema.org metadata | `toSchemaOrg()` exports creator metadata with schema.org `Person` and `Organization` structures. | +| Human review for ambiguous graph edges | `curatorQueue` captures homonyms and low-confidence merge candidates with transparent scoring reasons. | + +## Distinctness + +This slice is intentionally narrower than broad knowledge graph extractors and navigators. It focuses on author identity, affiliation normalization, homonym safety, and collaboration-edge trust, which are prerequisites for accurate author graphs, lab-to-lab maps, and personalized recommendations. 
diff --git a/knowledge-graph-author-affiliation-disambiguation/package.json b/knowledge-graph-author-affiliation-disambiguation/package.json new file mode 100644 index 0000000..786d59f --- /dev/null +++ b/knowledge-graph-author-affiliation-disambiguation/package.json @@ -0,0 +1,11 @@ +{ + "name": "knowledge-graph-author-affiliation-disambiguation", + "version": "1.0.0", + "private": true, + "type": "commonjs", + "scripts": { + "check": "node --check src/author-affiliation-disambiguation.js && node --check test/author-affiliation-disambiguation.test.js && node --check scripts/demo.js", + "test": "node --test test/author-affiliation-disambiguation.test.js", + "demo": "node scripts/demo.js" + } +} diff --git a/knowledge-graph-author-affiliation-disambiguation/scripts/demo.js b/knowledge-graph-author-affiliation-disambiguation/scripts/demo.js new file mode 100644 index 0000000..3f9c374 --- /dev/null +++ b/knowledge-graph-author-affiliation-disambiguation/scripts/demo.js @@ -0,0 +1,25 @@ +const mentions = require("../data/sample-mentions.json"); +const {createAuthorGraph, toSchemaOrg} = require("../src/author-affiliation-disambiguation"); + +const graph = createAuthorGraph(mentions); + +console.log("Author nodes"); +console.table(graph.authorNodes.map((node) => ({ + id: node.id, + name: node.name, + mentions: node.mentionIds.length, + documents: node.documents.length, + concepts: node.concepts.length +}))); + +console.log("\nCollaboration edges"); +console.table(graph.collaborationEdges); + +console.log("\nCurator queue"); +console.table(graph.curatorQueue); + +console.log("\nRecommendation guards"); +console.log(JSON.stringify(graph.recommendationGuards, null, 2)); + +console.log("\nSchema.org export"); +console.log(JSON.stringify(toSchemaOrg(graph), null, 2)); diff --git a/knowledge-graph-author-affiliation-disambiguation/src/author-affiliation-disambiguation.js b/knowledge-graph-author-affiliation-disambiguation/src/author-affiliation-disambiguation.js new file mode 
100644 index 0000000..0102a5d --- /dev/null +++ b/knowledge-graph-author-affiliation-disambiguation/src/author-affiliation-disambiguation.js @@ -0,0 +1,247 @@ +const crypto = require("crypto"); + +const STOPWORDS = new Set(["dr", "prof", "phd", "md"]); +const AFFILIATION_ALIASES = new Map([ + ["univ", "university"], + ["dept", "department"], + ["mit", "massachusetts institute of technology"], + ["ucsf", "university of california san francisco"] +]); + +function normalizeText(value) { + return String(value || "") + .toLowerCase() + .replace(/&/g, " and ") + .replace(/[^a-z0-9\s.-]/g, " ") + .replace(/\s+/g, " ") + .trim(); +} + +function normalizeName(name) { + const tokens = normalizeText(name) + .replace(/\./g, " ") + .split(" ") + .filter((token) => token && !STOPWORDS.has(token)); + return { + normalized: tokens.join(" "), + tokens, + initials: tokens.map((token) => token[0]).join(""), + surname: tokens.at(-1) || "" + }; +} + +function normalizeAffiliation(affiliation) { + const tokens = normalizeText(affiliation) + .split(" ") + .map((token) => token.replace(/\./g, "")) + .flatMap((token) => (AFFILIATION_ALIASES.has(token) ? AFFILIATION_ALIASES.get(token).split(" ") : [token])) + .filter(Boolean); + return [...new Set(tokens)].sort(); +} + +function emailDomain(email) { + const parts = String(email || "").toLowerCase().split("@"); + return parts.length === 2 ? 
parts[1] : ""; +} + +function jaccard(left, right) { + const a = new Set(left); + const b = new Set(right); + if (a.size === 0 && b.size === 0) { + return 0; + } + const intersection = [...a].filter((item) => b.has(item)).length; + return intersection / new Set([...a, ...b]).size; +} + +function conceptOverlap(left, right) { + return jaccard((left.concepts || []).map(normalizeText), (right.concepts || []).map(normalizeText)); +} + +function nameSimilarity(left, right) { + const a = normalizeName(left.name); + const b = normalizeName(right.name); + if (a.normalized === b.normalized) { + return 1; + } + if (a.surname && a.surname === b.surname && (a.initials[0] === b.initials[0] || a.tokens[0] === b.tokens[0])) { + return 0.82; + } + return jaccard(a.tokens, b.tokens); +} + +function compareMentions(left, right) { + if (left.orcid && right.orcid && left.orcid === right.orcid) { + return { + score: 0.99, + decision: "same-author", + reasons: ["orcid-exact-match"] + }; + } + + const nameScore = nameSimilarity(left, right); + const affiliationScore = jaccard(normalizeAffiliation(left.affiliation), normalizeAffiliation(right.affiliation)); + const domainScore = emailDomain(left.email) && emailDomain(left.email) === emailDomain(right.email) ? 
1 : 0; + const topicScore = conceptOverlap(left, right); + const score = Number((nameScore * 0.45 + affiliationScore * 0.25 + domainScore * 0.2 + topicScore * 0.1).toFixed(3)); + + const reasons = [ + `name:${nameScore.toFixed(2)}`, + `affiliation:${affiliationScore.toFixed(2)}`, + `email-domain:${domainScore.toFixed(2)}`, + `concepts:${topicScore.toFixed(2)}` + ]; + + if (score >= 0.7 || (nameScore >= 0.82 && affiliationScore >= 0.35 && topicScore >= 0.25)) { + return {score, decision: "same-author", reasons}; + } + if (nameScore >= 0.82 && affiliationScore < 0.18 && domainScore === 0) { + return {score, decision: "homonym-review", reasons}; + } + if (score >= 0.5) { + return {score, decision: "needs-review", reasons}; + } + return {score, decision: "different-author", reasons}; +} + +function clusterMentions(mentions) { + const clusters = []; + const reviewQueue = []; + + for (const mention of mentions) { + let best = null; + for (const cluster of clusters) { + const comparisons = cluster.mentions.map((existing) => compareMentions(existing, mention)); + const strongest = comparisons.reduce((max, item) => (item.score > max.score ? 
item : max), {score: 0}); + if (!best || strongest.score > best.comparison.score) { + best = {cluster, comparison: strongest}; + } + } + + if (best && best.comparison.decision === "same-author") { + best.cluster.mentions.push(mention); + best.cluster.evidence.push({mentionId: mention.mentionId, score: best.comparison.score, reasons: best.comparison.reasons}); + } else { + const clusterId = `author-${clusters.length + 1}`; + clusters.push({ + clusterId, + canonicalName: mention.name, + canonicalAffiliation: mention.affiliation, + mentions: [mention], + evidence: [{mentionId: mention.mentionId, score: 1, reasons: ["cluster-seed"]}] + }); + } + + if (best && best.comparison.decision !== "same-author" && best.comparison.score >= 0.45) { + reviewQueue.push({ + leftClusterId: best.cluster.clusterId, + mentionId: mention.mentionId, + decision: best.comparison.decision, + score: best.comparison.score, + reasons: best.comparison.reasons + }); + } + } + + return {clusters, reviewQueue}; +} + +function buildCollaborationEdges(clusters) { + const byDocument = new Map(); + for (const cluster of clusters) { + for (const mention of cluster.mentions) { + if (!byDocument.has(mention.documentId)) { + byDocument.set(mention.documentId, []); + } + byDocument.get(mention.documentId).push({cluster, mention}); + } + } + + const edges = new Map(); + for (const [documentId, authors] of byDocument.entries()) { + for (let i = 0; i < authors.length; i += 1) { + for (let j = i + 1; j < authors.length; j += 1) { + const pair = [authors[i].cluster.clusterId, authors[j].cluster.clusterId].sort(); + const edgeId = pair.join("--"); + const edge = edges.get(edgeId) || { + edgeId, + source: pair[0], + target: pair[1], + documents: [], + sharedConcepts: new Set(), + confidence: 0 + }; + edge.documents.push(documentId); + for (const concept of [...(authors[i].mention.concepts || []), ...(authors[j].mention.concepts || [])]) { + edge.sharedConcepts.add(normalizeText(concept)); + } + edge.confidence = 
Math.min(1, edge.confidence + 0.25); + edges.set(edgeId, edge); + } + } + } + + return [...edges.values()].map((edge) => ({ + ...edge, + sharedConcepts: [...edge.sharedConcepts].sort(), + confidence: Number(edge.confidence.toFixed(2)) + })); +} + +function createAuthorGraph(mentions) { + const {clusters, reviewQueue} = clusterMentions(mentions); + const collaborationEdges = buildCollaborationEdges(clusters); + const graphHash = crypto + .createHash("sha256") + .update(JSON.stringify({clusters, reviewQueue, collaborationEdges})) + .digest("hex"); + + return { + generatedAt: new Date("2026-05-15T00:00:00.000Z").toISOString(), + graphHash, + authorNodes: clusters.map((cluster) => ({ + id: cluster.clusterId, + name: cluster.canonicalName, + affiliation: cluster.canonicalAffiliation, + mentionIds: cluster.mentions.map((mention) => mention.mentionId), + documents: [...new Set(cluster.mentions.map((mention) => mention.documentId))].sort(), + concepts: [...new Set(cluster.mentions.flatMap((mention) => mention.concepts || []).map(normalizeText))].sort(), + evidence: cluster.evidence + })), + collaborationEdges, + curatorQueue: reviewQueue, + recommendationGuards: reviewQueue.map((item) => ({ + reason: "author-identity-uncertain", + action: "suppress cross-lab collaboration recommendations until reviewed", + mentionId: item.mentionId, + candidateClusterId: item.leftClusterId, + score: item.score + })) + }; +} + +function toSchemaOrg(graph) { + return { + "@context": "https://schema.org", + "@type": "Dataset", + name: "SCIBASE author-affiliation disambiguation graph", + identifier: graph.graphHash, + creator: graph.authorNodes.map((node) => ({ + "@type": "Person", + name: node.name, + affiliation: { + "@type": "Organization", + name: node.affiliation + }, + sameAs: node.mentionIds + })) + }; +} + +module.exports = { + compareMentions, + createAuthorGraph, + normalizeAffiliation, + normalizeName, + toSchemaOrg +}; diff --git 
// a/knowledge-graph-author-affiliation-disambiguation/test/author-affiliation-disambiguation.test.js b/knowledge-graph-author-affiliation-disambiguation/test/author-affiliation-disambiguation.test.js new file mode 100644 index 0000000..dc4a598 --- /dev/null +++ b/knowledge-graph-author-affiliation-disambiguation/test/author-affiliation-disambiguation.test.js @@ -0,0 +1,42 @@
const assert = require("assert/strict");
const test = require("node:test");
const mentions = require("../data/sample-mentions.json");
const {
  compareMentions,
  createAuthorGraph,
  normalizeAffiliation,
  normalizeName,
  toSchemaOrg
} = require("../src/author-affiliation-disambiguation");

// Fixture shortcuts: the first, second and fourth sample mentions, i.e. the
// Stanford paper author, the Stanford dataset author and the MIT namesake.
const [stanfordPaper, stanfordDataset, , mitNamesake] = mentions;

// Name tokens drop the honorific and dots; affiliation abbreviations are expanded.
test("normalizes author names and affiliation aliases", () => {
  const {tokens} = normalizeName("Dr. Maya A. Chen");
  assert.deepEqual(tokens, ["maya", "a", "chen"]);

  const affiliationTokens = normalizeAffiliation("Dept. of Bioengineering, Stanford Univ.");
  assert.ok(affiliationTokens.includes("university"));
});

// Same affiliation and e-mail domain with overlapping concepts must merge.
test("matches same author mentions with strong evidence", () => {
  const {decision, score} = compareMentions(stanfordPaper, stanfordDataset);
  assert.equal(decision, "same-author");
  assert.ok(score >= 0.72);
});

// Same name at an unrelated institution must be flagged, never merged.
test("sends homonyms to curator review instead of merging", () => {
  const {decision, score} = compareMentions(stanfordPaper, mitNamesake);
  assert.equal(decision, "homonym-review");
  assert.ok(score < 0.72);
});

// End-to-end shape of the graph built from the sample data set.
test("builds author nodes, collaboration edges, and recommendation guards", () => {
  const {authorNodes, collaborationEdges, curatorQueue, recommendationGuards} = createAuthorGraph(mentions);
  assert.equal(authorNodes.length, 3);
  assert.equal(collaborationEdges.length, 1);
  assert.equal(curatorQueue.length, 1);
  assert.equal(recommendationGuards[0].reason, "author-identity-uncertain");
});

// The export lists one creator per author node under the schema.org context.
test("exports schema.org-compatible creator metadata", () => {
  const graph = createAuthorGraph(mentions);
  const exported = toSchemaOrg(graph);
  assert.equal(exported["@context"], "https://schema.org");
  assert.equal(exported.creator.length, graph.authorNodes.length);
});