Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 57 additions & 0 deletions kg-entity-alias-disambiguation/demo.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
const fs = require("fs");
const path = require("path");
const { graphBatch } = require("./sample-data");
const { evaluateBatch } = require("./index");

// Make sure the output directory exists before any artifact is written.
const reportsDir = path.join(__dirname, "reports");
fs.mkdirSync(reportsDir, { recursive: true });

// Evaluate the bundled sample batch and persist the result as
// pretty-printed JSON terminated by a newline.
const report = evaluateBatch(graphBatch);
const jsonReportPath = path.join(reportsDir, "entity-disambiguation-report.json");
const jsonReportText = JSON.stringify(report, null, 2) + "\n";
fs.writeFileSync(jsonReportPath, jsonReportText);

// Fixed report header followed by the batch-level summary counters.
// A decision that never occurred has no key in the summary, hence the `|| 0`.
const summaryLines = [
  "# Knowledge Graph Entity Alias Disambiguation Report",
  "",
  `Batch: ${report.batchId}`,
  `Generated: ${report.generatedAt}`,
  "",
  "## Summary",
  "",
  `- Mentions evaluated: ${report.summary.total}`,
  `- Published canonical nodes: ${report.summary["publish-canonical-node"] || 0}`,
  `- Held for curation: ${report.summary["hold-for-curation"] || 0}`,
  `- Findings: ${report.summary.findingCount}`,
  "",
  "## Mention Decisions",
  ""
];

// One "###" section per evaluated mention, each followed by a blank line.
const mentionLines = [];
for (const entry of report.mentions) {
  const findingsText = entry.findings.length ? entry.findings.join("; ") : "none";
  mentionLines.push(
    `### ${entry.mention}`,
    "",
    `- Decision: ${entry.decision}`,
    `- Canonical candidate: ${entry.canonicalCandidate} (${entry.canonicalLabel})`,
    `- Margin: ${entry.margin}`,
    `- Findings: ${findingsText}`,
    `- Actions: ${entry.actions.join("; ")}`,
    ""
  );
}

// Trim the trailing blank line, then end the file with exactly one newline.
const markdown = summaryLines.concat(mentionLines).join("\n").trim();

fs.writeFileSync(path.join(reportsDir, "entity-disambiguation-report.md"), `${markdown}\n`);

// Escapes the five XML special characters so that report values (mention
// text, ontology IDs) cannot break the SVG markup when interpolated.
const escapeXml = (value) => String(value)
  .replace(/&/g, "&amp;")
  .replace(/</g, "&lt;")
  .replace(/>/g, "&gt;")
  .replace(/"/g, "&quot;")
  .replace(/'/g, "&apos;");

// Row i is drawn around y = 150 + i * 82 and its card ends at y + 24.
// Keep the original 540px canvas for small batches and grow it (with 48px of
// bottom padding) once the rows would otherwise be clipped.
const svgHeight = Math.max(540, 140 + report.mentions.length * 82);

// One card per mention: mention text, colour-coded decision, and the chosen
// candidate with its margin.
const svg = `<svg xmlns="http://www.w3.org/2000/svg" width="960" height="${svgHeight}" viewBox="0 0 960 ${svgHeight}">
<rect width="960" height="${svgHeight}" fill="#f8fafc"/>
<text x="48" y="64" font-family="Arial" font-size="28" font-weight="700" fill="#172033">KG Entity Disambiguation</text>
<text x="48" y="102" font-family="Arial" font-size="16" fill="#4f5d75">Alias collision guard before graph edge publication</text>
${report.mentions.map((mention, index) => {
const y = 150 + index * 82;
// Green for published mentions, amber for everything held for curation.
const color = mention.decision === "publish-canonical-node" ? "#047857" : "#b45309";
return `<rect x="48" y="${y - 34}" width="864" height="58" rx="6" fill="#ffffff" stroke="#d7dce6"/>
<text x="72" y="${y - 8}" font-family="Arial" font-size="18" font-weight="700" fill="#172033">${escapeXml(mention.mention)}</text>
<text x="72" y="${y + 16}" font-family="Arial" font-size="14" fill="${color}">${escapeXml(mention.decision)}</text>
<text x="300" y="${y + 16}" font-family="Arial" font-size="14" fill="#4f5d75">${escapeXml(mention.canonicalCandidate)}, margin ${escapeXml(mention.margin)}</text>`;
}).join("\n ")}
</svg>
`;

fs.writeFileSync(path.join(reportsDir, "summary.svg"), svg);
// Echo the batch summary so a demo run shows the outcome on stdout.
console.log(JSON.stringify(report.summary, null, 2));
78 changes: 78 additions & 0 deletions kg-entity-alias-disambiguation/index.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
// Minimum lead the top-ranked candidate must hold over the runner-up.
// evaluateMention records a finding when the margin falls below this value.
const BLOCK_MARGIN = 0.05;
// Minimum adjusted score for the top-ranked candidate.
// evaluateMention records a finding when the best score falls below this value.
const SAFE_SCORE = 0.86;

/**
 * Splits a value into its distinct lowercase ASCII alphanumeric tokens.
 *
 * The value is stringified and lowercased; every maximal run of `a-z0-9`
 * becomes one token and all other characters act as separators.
 *
 * @param {*} value - Text (or any value, coerced with String()) to tokenize.
 * @returns {Set<string>} Unique tokens in order of first appearance.
 */
function tokenize(value) {
  const lowered = String(value).toLowerCase();
  const words = lowered.match(/[a-z0-9]+/g);
  return new Set(words === null ? [] : words);
}

/**
 * Measures how much of a candidate label is echoed by the mention context.
 *
 * Both inputs are tokenized with tokenize(); the score is the fraction of
 * distinct label tokens that also occur among the context tokens.
 *
 * @param {*} context - Text surrounding the mention.
 * @param {*} label - Candidate label to look for in the context.
 * @returns {number} Value in [0, 1]; 0 when the label yields no tokens.
 */
function overlapScore(context, label) {
  const contextVocabulary = tokenize(context);
  const labelWords = Array.from(tokenize(label));

  if (labelWords.length === 0) {
    return 0;
  }

  let matched = 0;
  for (const word of labelWords) {
    if (contextVocabulary.has(word)) {
      matched += 1;
    }
  }
  return matched / labelWords.length;
}

/**
 * Ranks the candidates of one extracted mention and decides whether the
 * mention can be merged into a canonical node or must be held for curation.
 *
 * Each candidate's score is boosted by its context overlap, the candidates
 * are sorted by the adjusted score (highest first), and findings are
 * collected when the lead over the runner-up is too small, the best score is
 * too low, or several edges are proposed for a still-ambiguous mention.
 * Any finding results in "hold-for-curation".
 *
 * A mention without candidates is held for curation with a null canonical
 * candidate instead of throwing; a missing `proposedEdges` list is treated
 * as empty.
 *
 * @param {object} mention - Mention record; reads `mention`, `source`, `doi`,
 *   `context`, `candidates` (each with `id`, `label`, `score`) and
 *   `proposedEdges`.
 * @returns {object} Decision record with `mention`, `source`, `doi`,
 *   `decision`, `canonicalCandidate`, `canonicalLabel`, `margin`,
 *   `rankedCandidates`, `findings` and `actions`.
 */
function evaluateMention(mention) {
  // Weight of the context-overlap bonus added to a candidate's raw score.
  const CONTEXT_WEIGHT = 0.12;
  // Below this margin a mention with several proposed edges counts as ambiguous.
  const AMBIGUOUS_EDGE_MARGIN = 0.1;

  const candidates = Array.isArray(mention.candidates) ? mention.candidates : [];
  const proposedEdges = Array.isArray(mention.proposedEdges) ? mention.proposedEdges : [];

  const ranked = candidates
    .map((candidate) => {
      // Compute the overlap once and reuse it for the adjusted score.
      const contextOverlap = overlapScore(mention.context, candidate.label);
      return {
        ...candidate,
        contextOverlap,
        adjustedScore: Number((candidate.score + contextOverlap * CONTEXT_WEIGHT).toFixed(3))
      };
    })
    .sort((a, b) => b.adjustedScore - a.adjustedScore);

  // Nothing to merge into: fail safe by routing the mention to a curator.
  if (ranked.length === 0) {
    return {
      mention: mention.mention,
      source: mention.source,
      doi: mention.doi,
      decision: "hold-for-curation",
      canonicalCandidate: null,
      canonicalLabel: null,
      margin: 0,
      rankedCandidates: ranked,
      findings: ["no candidates available for mention"],
      actions: ["block graph edge publication", "route mention to ontology curator"]
    };
  }

  const [best, runnerUp] = ranked;
  // With a single candidate the margin is simply its adjusted score.
  const margin = runnerUp ? Number((best.adjustedScore - runnerUp.adjustedScore).toFixed(3)) : best.adjustedScore;
  const findings = [];

  if (runnerUp && margin < BLOCK_MARGIN) {
    findings.push(`candidate margin ${margin} is below ${BLOCK_MARGIN}`);
  }
  if (best.adjustedScore < SAFE_SCORE) {
    findings.push(`best adjusted score ${best.adjustedScore} is below ${SAFE_SCORE}`);
  }
  if (proposedEdges.length > 1 && runnerUp && margin < AMBIGUOUS_EDGE_MARGIN) {
    findings.push("multiple ontology edges proposed for an ambiguous mention");
  }

  const decision = findings.length > 0 ? "hold-for-curation" : "publish-canonical-node";
  const actions = decision === "publish-canonical-node"
    ? [`merge mention into ${best.id}`, "publish proposed graph edge"]
    : ["block graph edge publication", "route mention to ontology curator", `keep candidate ${best.id} as provisional`];

  return {
    mention: mention.mention,
    source: mention.source,
    doi: mention.doi,
    decision,
    canonicalCandidate: best.id,
    canonicalLabel: best.label,
    margin,
    rankedCandidates: ranked,
    findings,
    actions
  };
}

/**
 * Evaluates every mention of a batch and aggregates the outcome.
 *
 * @param {object} batch - Batch with `id`, `generatedAt` and a `candidates`
 *   array holding the mention records passed to evaluateMention().
 * @returns {object} Report with `batchId`, `generatedAt`, the per-mention
 *   results in `mentions`, and a `summary` holding `total`, `findingCount`
 *   and one counter per decision value that occurred.
 */
function evaluateBatch(batch) {
  const mentions = batch.candidates.map((entry) => evaluateMention(entry));

  // Decision counters are created lazily, so a decision that never occurred
  // has no key in the summary.
  const summary = { total: 0, findingCount: 0 };
  for (const evaluated of mentions) {
    summary.total += 1;
    summary[evaluated.decision] = (summary[evaluated.decision] || 0) + 1;
    summary.findingCount += evaluated.findings.length;
  }

  return {
    batchId: batch.id,
    generatedAt: batch.generatedAt,
    summary,
    mentions
  };
}

// Public API: both thresholds plus the two evaluators. tokenize and
// overlapScore are not exported.
module.exports = { BLOCK_MARGIN, SAFE_SCORE, evaluateMention, evaluateBatch };
32 changes: 32 additions & 0 deletions kg-entity-alias-disambiguation/readme.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Knowledge Graph Entity Alias Disambiguation

This module adds a focused guard for Scientific Knowledge Graph Integration. It evaluates ambiguous extracted entities before they are merged into canonical graph nodes or used to publish graph edges.

The scope is intentionally narrow. It does not implement broad entity extraction, semantic search, graph navigation UI, recommendations, author graph rendering, or ontology export. It handles the alias collision decision layer between extracted mentions and canonical graph nodes.

## What It Checks

- Candidate ontology IDs from sources such as MeSH, NCBI Gene, PubChem, Wikidata, and GeoNames
- Alias collisions where one mention can refer to different concepts
- Context overlap and confidence margins between top candidates
- Multiple proposed ontology edges for a single ambiguous mention
- Deterministic publish, hold, and curation actions

## Demo

Run:

```bash
node kg-entity-alias-disambiguation/test.js
node kg-entity-alias-disambiguation/demo.js
node kg-entity-alias-disambiguation/render-video.js
```

Generated artifacts:

- `reports/entity-disambiguation-report.json`
- `reports/entity-disambiguation-report.md`
- `reports/summary.svg`
- `reports/demo.mp4`

All data is synthetic. The module does not call external ontology APIs, DOI services, or graph databases, and it does not use private projects, credentials, or live SCIBASE accounts.
60 changes: 60 additions & 0 deletions kg-entity-alias-disambiguation/render-video.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
const fs = require("fs");
const path = require("path");
const { spawnSync } = require("child_process");

// Output directory for generated artifacts; created up front so the writes below cannot fail on a missing folder.
const reportsDir = path.join(__dirname, "reports");
fs.mkdirSync(reportsDir, { recursive: true });

// Frame size in pixels; used for the PPM header and the pixel buffer.
const width = 960;
const height = 540;
// Still frame written as a PPM image and passed to ffmpeg as its input.
const ppm = path.join(reportsDir, "demo-frame.ppm");
// Path of the video that ffmpeg writes.
const mp4 = path.join(reportsDir, "demo.mp4");

// Layout of the four cards in the still frame: top edge of each card and the
// colour of its decision badge (green for the first two, amber for the last two).
const PIXEL_CARD_ROWS = [
  { top: 126, badge: [4, 120, 87] },
  { top: 210, badge: [4, 120, 87] },
  { top: 294, badge: [180, 83, 9] },
  { top: 378, badge: [180, 83, 9] }
];

/**
 * Returns the RGB colour of one pixel of the demo frame.
 *
 * The frame has a dark header band, four white cards and a coloured badge
 * inside each card. The badge is tested before the card body because every
 * badge rectangle lies inside its card; testing the card first would make the
 * badge colours unreachable.
 *
 * @param {number} x - Column of the pixel.
 * @param {number} y - Row of the pixel.
 * @returns {number[]} `[r, g, b]` with components in 0-255.
 */
function pixel(x, y) {
  if (y < 112) return [23, 32, 51];

  for (const { top, badge } of PIXEL_CARD_ROWS) {
    // Cards are 62px tall; all bounds are exclusive.
    const insideCard = x > 46 && x < 914 && y > top && y < top + 62;
    if (!insideCard) continue;

    const insideBadge = x > 64 && x < 238 && y > top + 14 && y < top + 48;
    // Return a copy so callers never share the layout table's arrays.
    return insideBadge ? badge.slice() : [255, 255, 255];
  }

  return [248, 250, 252];
}

// Binary PPM ("P6"): a text header with dimensions and max channel value,
// followed by raw RGB triples in row-major order.
const ppmHeader = Buffer.from(`P6\n${width} ${height}\n255\n`);
const raster = Buffer.alloc(width * height * 3);

// Walk the frame row by row; `cursor` always points at the next RGB triple.
let cursor = 0;
for (let row = 0; row < height; row += 1) {
  for (let col = 0; col < width; col += 1) {
    const rgb = pixel(col, row);
    raster[cursor] = rgb[0];
    raster[cursor + 1] = rgb[1];
    raster[cursor + 2] = rgb[2];
    cursor += 3;
  }
}

fs.writeFileSync(ppm, Buffer.concat([ppmHeader, raster]));

// Loop the single PPM frame for 5 seconds at 24 fps and encode it as MP4.
// ffmpeg's own output is forwarded to this process's stdio.
const result = spawnSync("ffmpeg", [
  "-y",
  "-loop",
  "1",
  "-framerate",
  "24",
  "-i",
  ppm,
  "-t",
  "5",
  "-vf",
  "format=yuv420p",
  "-movflags",
  "+faststart",
  mp4
], { stdio: "inherit" });

// spawnSync reports a process that could not be started (for example ffmpeg
// is not installed) through `error`, with `status` left null. Surface that
// cause instead of a generic failure.
if (result.error) {
  const failure = new Error(`ffmpeg failed to render demo video: ${result.error.message}`);
  failure.cause = result.error;
  throw failure;
}

if (result.status !== 0) {
  // A null status with a signal means the process was killed, not that it exited.
  const detail = result.signal ? `terminated by signal ${result.signal}` : `exit code ${result.status}`;
  throw new Error(`ffmpeg failed to render demo video (${detail})`);
}

console.log(mp4);
Binary file added kg-entity-alias-disambiguation/reports/demo.mp4
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
{
"batchId": "kg-batch-2026-05-29",
"generatedAt": "2026-05-29T12:42:00.000Z",
"summary": {
"total": 4,
"findingCount": 3,
"publish-canonical-node": 2,
"hold-for-curation": 2
},
"mentions": [
{
"mention": "APC",
"source": "paper-neuro-17",
"doi": "10.5555/neuro.17",
"decision": "publish-canonical-node",
"canonicalCandidate": "NCBIGene:324",
"canonicalLabel": "APC gene",
"margin": 0.17,
"rankedCandidates": [
{
"id": "NCBIGene:324",
"label": "APC gene",
"ontology": "NCBI Gene",
"score": 0.91,
"contextOverlap": 0,
"adjustedScore": 0.91
},
{
"id": "MeSH:D000073878",
"label": "Antigen-presenting cells",
"ontology": "MeSH",
"score": 0.74,
"contextOverlap": 0,
"adjustedScore": 0.74
}
],
"findings": [],
"actions": [
"merge mention into NCBIGene:324",
"publish proposed graph edge"
]
},
{
"mention": "ALS",
"source": "paper-neuro-18",
"doi": "10.5555/neuro.18",
"decision": "publish-canonical-node",
"canonicalCandidate": "MeSH:D000690",
"canonicalLabel": "Amyotrophic Lateral Sclerosis",
"margin": 0.37,
"rankedCandidates": [
{
"id": "MeSH:D000690",
"label": "Amyotrophic Lateral Sclerosis",
"ontology": "MeSH",
"score": 0.89,
"contextOverlap": 1,
"adjustedScore": 1.01
},
{
"id": "CHEBI:456216",
"label": "Acetolactate synthase",
"ontology": "ChEBI",
"score": 0.64,
"contextOverlap": 0,
"adjustedScore": 0.64
}
],
"findings": [],
"actions": [
"merge mention into MeSH:D000690",
"publish proposed graph edge"
]
},
{
"mention": "Java",
"source": "software-methods-02",
"doi": "10.5555/software.02",
"decision": "hold-for-curation",
"canonicalCandidate": "GeoNames:1643084",
"canonicalLabel": "Java island",
"margin": 0.01,
"rankedCandidates": [
{
"id": "GeoNames:1643084",
"label": "Java island",
"ontology": "GeoNames",
"score": 0.82,
"contextOverlap": 0.5,
"adjustedScore": 0.88
},
{
"id": "Wikidata:Q251",
"label": "Java programming language",
"ontology": "Wikidata",
"score": 0.83,
"contextOverlap": 0.3333333333333333,
"adjustedScore": 0.87
}
],
"findings": [
"candidate margin 0.01 is below 0.05"
],
"actions": [
"block graph edge publication",
"route mention to ontology curator",
"keep candidate GeoNames:1643084 as provisional"
]
},
{
"mention": "Mercury",
"source": "materials-09",
"doi": "10.5555/materials.09",
"decision": "hold-for-curation",
"canonicalCandidate": "PubChem:23931",
"canonicalLabel": "Mercury element",
"margin": 0.03,
"rankedCandidates": [
{
"id": "PubChem:23931",
"label": "Mercury element",
"ontology": "PubChem",
"score": 0.88,
"contextOverlap": 0.5,
"adjustedScore": 0.94
},
{
"id": "MeSH:D008628",
"label": "Mercury compounds",
"ontology": "MeSH",
"score": 0.85,
"contextOverlap": 0.5,
"adjustedScore": 0.91
}
],
"findings": [
"candidate margin 0.03 is below 0.05",
"multiple ontology edges proposed for an ambiguous mention"
],
"actions": [
"block graph edge publication",
"route mention to ontology curator",
"keep candidate PubChem:23931 as provisional"
]
}
]
}
Loading