From 3c5d4c1a3e3535a41f4855d445cf3bcf6f1485e8 Mon Sep 17 00:00:00 2001 From: mountain Date: Mon, 11 May 2026 17:15:21 +0800 Subject: [PATCH 1/4] fix: stop silently dropping wiki updates when section/frontmatter drifts `_update_index`, `_backlink_summary`, `_backlink_concepts`, and `_add_related_link` all combined a loose `substring in text` check with a strict newline-anchored `str.replace`. When the user's wiki had a hand-edited heading (trailing space, missing section, etc.) or non-canonical frontmatter (`sources:[a]` with no space), the if-branch matched but the replace did nothing, silently dropping the new entry/link. Refactor all four to use line-based section parsing via a new `_ensure_h2_section` helper (logs a warning + appends when the section is missing), and extract `_prepend_source_to_frontmatter` that walks frontmatter lines instead of string-matching. Adds regression tests covering trailing-whitespace headings, missing sections, and `sources:` without a space. Fixes #26. --- openkb/agent/compiler.py | 97 +++++++++++++++++++++++++++++----------- tests/test_compiler.py | 82 +++++++++++++++++++++++++++++++++ 2 files changed, 154 insertions(+), 25 deletions(-) diff --git a/openkb/agent/compiler.py b/openkb/agent/compiler.py index bf15a1c3..e32bb8c8 100644 --- a/openkb/agent/compiler.py +++ b/openkb/agent/compiler.py @@ -326,6 +326,28 @@ def _get_section_bounds(lines: list[str], heading: str) -> tuple[int, int] | Non return None +def _ensure_h2_section(lines: list[str], heading: str) -> None: + """Ensure an H2 section ``heading`` exists in ``lines``; append if missing. + + Recovers from hand-edited or drifted index.md files where the expected + section was removed or renamed — without this, downstream inserts would + silently no-op and entries would be dropped. + """ + if _get_section_bounds(lines, heading) is not None: + return + logger.warning( + "index.md is missing %r section; appending it. " + "Check whether the file was hand-edited away from the canonical layout.", + heading, + ) + while lines and lines[-1] == "": + lines.pop() + if lines: + lines.append("") + lines.append(heading) + lines.append("") + + def _section_contains_link(lines: list[str], heading: str, link: str) -> bool: """Check whether an index entry already exists inside the named section.""" bounds = _get_section_bounds(lines, heading) @@ -455,6 +477,42 @@ def _write_concept(wiki_dir: Path, name: str, content: str, source_file: str, is path.write_text(frontmatter + content, encoding="utf-8") +def _prepend_source_to_frontmatter(text: str, source_file: str) -> str: + """Prepend ``source_file`` to the inline ``sources:`` list in YAML frontmatter. + + Creates the frontmatter or the ``sources:`` line if missing. Returns the + text unchanged if ``source_file`` is already present in the list, or if + the frontmatter is malformed (no closing ``---``). + """ + if not text.startswith("---"): + return f"---\nsources: [{source_file}]\n---\n\n" + text + + fm_end = text.find("---", 3) + if fm_end == -1: + return text + + fm_block = text[:fm_end] + body = text[fm_end:] + fm_lines = fm_block.split("\n") + + for i, line in enumerate(fm_lines): + if not line.lstrip().startswith("sources:"): + continue + lb = line.find("[") + rb = line.rfind("]") + if lb == -1 or rb == -1 or rb < lb: + return text + items = [s.strip() for s in line[lb + 1:rb].split(",") if s.strip()] + if source_file in items: + return text + items.insert(0, source_file) + fm_lines[i] = f"sources: [{', '.join(items)}]" + return "\n".join(fm_lines) + body + + fm_lines.insert(1, f"sources: [{source_file}]") + return "\n".join(fm_lines) + body + + def _add_related_link(wiki_dir: Path, concept_slug: str, doc_name: str, source_file: str) -> None: """Add a cross-reference link to an existing concept page (no LLM call).""" concepts_dir = wiki_dir / "concepts" @@ -467,20 +525,8 @@ def _add_related_link(wiki_dir: Path, concept_slug: str, doc_name: str, source_f if link in text: return - # Update sources in frontmatter if source_file not in text: - if text.startswith("---"): - end = text.find("---", 3) - if end != -1: - fm = text[:end + 3] - body = text[end + 3:] - if "sources:" in fm: - fm = fm.replace("sources: [", f"sources: [{source_file}, ") - else: - fm = fm.replace("---\n", f"---\nsources: [{source_file}]\n", 1) - text = fm + body - else: - text = f"---\nsources: [{source_file}]\n---\n\n" + text + text = _prepend_source_to_frontmatter(text, source_file) text += f"\n\nSee also: {link}" path.write_text(text, encoding="utf-8") @@ -505,13 +551,11 @@ def _backlink_summary(wiki_dir: Path, doc_name: str, concept_slugs: list[str]) - if not missing: return - new_links = "\n".join(f"- [[concepts/{s}]]" for s in missing) - if "## Related Concepts" in text: - # Append into existing section - text = text.replace("## Related Concepts\n", f"## Related Concepts\n{new_links}\n", 1) - else: - text += f"\n\n## Related Concepts\n{new_links}\n" - summary_path.write_text(text, encoding="utf-8") + lines = text.split("\n") + _ensure_h2_section(lines, "## Related Concepts") + for slug in reversed(missing): + _insert_section_entry(lines, "## Related Concepts", f"- [[concepts/{slug}]]") + summary_path.write_text("\n".join(lines), encoding="utf-8") def _backlink_concepts(wiki_dir: Path, doc_name: str, concept_slugs: list[str]) -> None: @@ -533,11 +577,10 @@ def _backlink_concepts(wiki_dir: Path, doc_name: str, concept_slugs: list[str]) text = path.read_text(encoding="utf-8") if link in text: continue - if "## Related Documents" in text: - text = text.replace("## Related Documents\n", f"## Related Documents\n- {link}\n", 1) - else: - text += f"\n\n## Related Documents\n- {link}\n" - path.write_text(text, encoding="utf-8") + lines = text.split("\n") + _ensure_h2_section(lines, "## Related Documents") + _insert_section_entry(lines, "## Related Documents", f"- {link}") + path.write_text("\n".join(lines), encoding="utf-8") def _update_index( wiki_dir: Path, doc_name: str, concept_names: list[str], @@ -565,6 +608,10 @@ def _update_index( lines = index_path.read_text(encoding="utf-8").split("\n") + _ensure_h2_section(lines, "## Documents") + if concept_names: + _ensure_h2_section(lines, "## Concepts") + doc_link = f"[[summaries/{doc_name}]]" if not _section_contains_link(lines, "## Documents", doc_link): doc_entry = f"- {doc_link} ({doc_type})" diff --git a/tests/test_compiler.py b/tests/test_compiler.py index cb02efc0..fb02ca24 100644 --- a/tests/test_compiler.py +++ b/tests/test_compiler.py @@ -289,6 +289,32 @@ def test_adds_concept_entry_when_link_exists_outside_concepts_section(self, tmp_ assert "- [[summaries/my-doc]] (short) — Mentions [[concepts/attention]] here" in text assert "- [[concepts/attention]] — New brief" in text + def test_recovers_when_documents_section_missing(self, tmp_path): + wiki = tmp_path / "wiki" + wiki.mkdir() + (wiki / "index.md").write_text( + "# Index\n\n## Concepts\n\n## Explorations\n", + encoding="utf-8", + ) + _update_index(wiki, "my-doc", [], doc_brief="Brief") + text = (wiki / "index.md").read_text() + assert "## Documents" in text + assert "[[summaries/my-doc]] (short) — Brief" in text + + def test_recovers_when_concepts_section_missing(self, tmp_path): + wiki = tmp_path / "wiki" + wiki.mkdir() + (wiki / "index.md").write_text( + "# Index\n\n## Documents\n\n## Explorations\n", + encoding="utf-8", + ) + _update_index(wiki, "my-doc", ["attention"], + concept_briefs={"attention": "Focus"}) + text = (wiki / "index.md").read_text() + assert "## Concepts" in text + assert "[[concepts/attention]] — Focus" in text + assert "[[summaries/my-doc]]" in text + class TestReadWikiContext: def test_empty_wiki(self, tmp_path): @@ -455,6 +481,19 @@ def test_merges_into_existing_section(self, tmp_path): assert "[[concepts/transformer]]" in text assert text.count("[[concepts/attention]]") == 1 + def test_section_with_trailing_whitespace_still_merges(self, tmp_path): + """Heading with trailing space must not cause silent drop of new links.""" + wiki = tmp_path / "wiki" + summaries = wiki / "summaries" + summaries.mkdir(parents=True) + (summaries / "paper.md").write_text( + "# Summary\n\nContent.\n\n## Related Concepts \n- [[concepts/attention]]\n", + encoding="utf-8", + ) + _backlink_summary(wiki, "paper", ["attention", "transformer"]) + text = (summaries / "paper.md").read_text() + assert "[[concepts/transformer]]" in text + class TestBacklinkConcepts: def test_adds_summary_link_to_concept(self, tmp_path): @@ -503,6 +542,20 @@ def test_skips_missing_concept_file(self, tmp_path): # Should not raise _backlink_concepts(wiki, "paper", ["nonexistent"]) + def test_section_with_trailing_whitespace_still_merges(self, tmp_path): + """Heading with trailing space must not cause silent drop of new link.""" + wiki = tmp_path / "wiki" + concepts = wiki / "concepts" + concepts.mkdir(parents=True) + (concepts / "attention.md").write_text( + "# Attention\n\n## Related Documents \n- [[summaries/old-paper]]\n", + encoding="utf-8", + ) + _backlink_concepts(wiki, "new-paper", ["attention"]) + text = (concepts / "attention.md").read_text() + assert "[[summaries/new-paper]]" in text + assert "[[summaries/old-paper]]" in text + class TestAddRelatedLink: def test_adds_see_also_link(self, tmp_path): @@ -536,6 +589,35 @@ def test_skips_if_file_missing(self, tmp_path): # Should not raise _add_related_link(wiki, "nonexistent", "doc", "file.pdf") + def test_frontmatter_without_space_after_colon_still_merges(self, tmp_path): + """sources:[a] (no space after colon) must still prepend new source.""" + wiki = tmp_path / "wiki" + concepts = wiki / "concepts" + concepts.mkdir(parents=True) + (concepts / "attention.md").write_text( + "---\nsources:[paper1.pdf]\n---\n\n# Attention\n", + encoding="utf-8", + ) + _add_related_link(wiki, "attention", "new-doc", "paper2.pdf") + text = (concepts / "attention.md").read_text() + assert "paper2.pdf" in text + assert "paper1.pdf" in text + assert "[[summaries/new-doc]]" in text + + def test_frontmatter_without_sources_line_gets_one_inserted(self, tmp_path): + wiki = tmp_path / "wiki" + concepts = wiki / "concepts" + concepts.mkdir(parents=True) + (concepts / "attention.md").write_text( + "---\nbrief: Focus mechanism\n---\n\n# Attention\n", + encoding="utf-8", + ) + _add_related_link(wiki, "attention", "new-doc", "paper.pdf") + text = (concepts / "attention.md").read_text() + assert "sources: [paper.pdf]" in text + assert "brief: Focus mechanism" in text + assert "[[summaries/new-doc]]" in text + def _mock_completion(responses: list[str]): """Create a mock for litellm.completion that returns responses in order.""" From 445da25744263151286994388a1541c13c2655fa Mon Sep 17 00:00:00 2001 From: mountain Date: Mon, 11 May 2026 17:52:57 +0800 Subject: [PATCH 2/4] fix: tolerate trailing whitespace on H2 headings in section lookup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `_get_section_bounds` used exact `line == heading` equality, so a heading like `## Documents ` (trailing space) was treated as a different section. `_ensure_h2_section` would then append a clean `## Documents` at the end of the file, stranding the original entries under the drifted heading — silently converting the previous "drop" failure mode into a "duplicate section" failure mode. Match on `line.rstrip() == heading` so drifted whitespace is recovered properly. Tighten the two regression tests with `text.count(...) == 1` so the duplicate-H2 case fails them. Generalize the warning text since `_ensure_h2_section` is also called for backlink sections. --- openkb/agent/compiler.py | 11 ++++++++--- tests/test_compiler.py | 8 ++++++-- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/openkb/agent/compiler.py b/openkb/agent/compiler.py index e32bb8c8..0670d683 100644 --- a/openkb/agent/compiler.py +++ b/openkb/agent/compiler.py @@ -313,9 +313,14 @@ def _read_concept_briefs(wiki_dir: Path) -> str: def _get_section_bounds(lines: list[str], heading: str) -> tuple[int, int] | None: - """Return the [start, end) bounds for a Markdown H2 section.""" + """Return the [start, end) bounds for a Markdown H2 section. + + Heading lookup tolerates trailing whitespace on the file's heading line + so a drifted ``## Documents `` still matches ``## Documents`` — otherwise + callers would treat the section as missing and append a duplicate H2. + """ for i, line in enumerate(lines): - if line == heading: + if line.rstrip() == heading: start = i + 1 end = len(lines) for j in range(start, len(lines)): @@ -336,7 +341,7 @@ def _ensure_h2_section(lines: list[str], heading: str) -> None: if _get_section_bounds(lines, heading) is not None: return logger.warning( - "index.md is missing %r section; appending it. " + "Wiki page is missing %r section; appending it. " "Check whether the file was hand-edited away from the canonical layout.", heading, ) diff --git a/tests/test_compiler.py b/tests/test_compiler.py index fb02ca24..881e8bea 100644 --- a/tests/test_compiler.py +++ b/tests/test_compiler.py @@ -482,7 +482,8 @@ def test_merges_into_existing_section(self, tmp_path): assert text.count("[[concepts/attention]]") == 1 def test_section_with_trailing_whitespace_still_merges(self, tmp_path): - """Heading with trailing space must not cause silent drop of new links.""" + """Heading with trailing space must merge into the existing section, + not append a duplicate H2.""" wiki = tmp_path / "wiki" summaries = wiki / "summaries" summaries.mkdir(parents=True) @@ -493,6 +494,7 @@ def test_section_with_trailing_whitespace_still_merges(self, tmp_path): _backlink_summary(wiki, "paper", ["attention", "transformer"]) text = (summaries / "paper.md").read_text() assert "[[concepts/transformer]]" in text + assert text.count("## Related Concepts") == 1 class TestBacklinkConcepts: @@ -543,7 +545,8 @@ def test_skips_missing_concept_file(self, tmp_path): _backlink_concepts(wiki, "paper", ["nonexistent"]) def test_section_with_trailing_whitespace_still_merges(self, tmp_path): - """Heading with trailing space must not cause silent drop of new link.""" + """Heading with trailing space must merge into the existing section, + not append a duplicate H2.""" wiki = tmp_path / "wiki" concepts = wiki / "concepts" concepts.mkdir(parents=True) @@ -555,6 +558,7 @@ def test_section_with_trailing_whitespace_still_merges(self, tmp_path): text = (concepts / "attention.md").read_text() assert "[[summaries/new-paper]]" in text assert "[[summaries/old-paper]]" in text + assert text.count("## Related Documents") == 1 class TestAddRelatedLink: From f829ba5167eacd93c336287ad2bec6422f8c01d4 Mon Sep 17 00:00:00 2001 From: mountain Date: Mon, 11 May 2026 18:04:58 +0800 Subject: [PATCH 3/4] refactor: extract _iter_h2_headings so section lookup shares one H2 scan MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously `_get_section_bounds` did exact-equality on the start line and a separate `startswith("## ")` walk to find the end. Two heading-detection rules in one function — easy to drift out of sync (the trailing-whitespace fix in the prior commit only touched the start side). Pull the H2 scan into `_iter_h2_headings`, which returns the normalized heading + index for every H2 line. `_get_section_bounds` then just indexes into that list: the target match and the "next H2" boundary use the same rule by construction. No observable behavior change. --- openkb/agent/compiler.py | 38 +++++++++++++++++++++++++++----------- 1 file changed, 27 insertions(+), 11 deletions(-) diff --git a/openkb/agent/compiler.py b/openkb/agent/compiler.py index 0670d683..b05f9f16 100644 --- a/openkb/agent/compiler.py +++ b/openkb/agent/compiler.py @@ -312,21 +312,37 @@ def _read_concept_briefs(wiki_dir: Path) -> str: return "\n".join(lines) or "(none yet)" +def _iter_h2_headings(lines: list[str]) -> list[tuple[int, str]]: + """Return ``[(line_index, normalized_heading), ...]`` for every ATX H2. + + A line counts as H2 when it starts with ``"## "`` (two hashes + space). + ``normalized_heading`` is the line with trailing whitespace stripped, so + ``"## Documents "`` normalizes to ``"## Documents"`` — letting callers + use exact-string comparison without tripping on stray whitespace. + + Used by ``_get_section_bounds`` so heading lookup and the next-section + boundary share one scan and one normalization rule. + """ + return [ + (i, line.rstrip()) + for i, line in enumerate(lines) + if line.startswith("## ") + ] + + def _get_section_bounds(lines: list[str], heading: str) -> tuple[int, int] | None: """Return the [start, end) bounds for a Markdown H2 section. - Heading lookup tolerates trailing whitespace on the file's heading line - so a drifted ``## Documents `` still matches ``## Documents`` — otherwise - callers would treat the section as missing and append a duplicate H2. + Uses ``_iter_h2_headings`` so the same H2 detection that finds the + target heading also determines the section's end (the next H2). A + drifted ``"## Documents "`` matches ``"## Documents"`` because both + sides are normalized. """ - for i, line in enumerate(lines): - if line.rstrip() == heading: - start = i + 1 - end = len(lines) - for j in range(start, len(lines)): - if lines[j].startswith("## "): - end = j - break + headings = _iter_h2_headings(lines) + for k, (idx, normalized) in enumerate(headings): + if normalized == heading: + start = idx + 1 + end = headings[k + 1][0] if k + 1 < len(headings) else len(lines) return start, end return None From f72b3328e7ecbacf8e5c69b05f8b4444bad79a5f Mon Sep 17 00:00:00 2001 From: mountain Date: Mon, 11 May 2026 18:23:16 +0800 Subject: [PATCH 4/4] fix: migrate _write_concept to _prepend_source_to_frontmatter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `_write_concept` (called on every concept update) still used the same `fm.replace("sources: [", ...)` anti-pattern this PR introduced `_prepend_source_to_frontmatter` to eliminate — leaving the bug fixed in `_add_related_link` but live on the higher-frequency path. Non-canonical frontmatter like `sources:[paper.pdf]` (no space after colon) silently dropped the new source on every subsequent compile. Replace the inline frontmatter manipulation with a single call to the helper, mirroring `_add_related_link`. Add a regression test exercising `sources:[a]` (no space) through `_write_concept`'s update path. --- openkb/agent/compiler.py | 13 +------------ tests/test_compiler.py | 16 ++++++++++++++++ 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/openkb/agent/compiler.py b/openkb/agent/compiler.py index b05f9f16..b5695d58 100644 --- a/openkb/agent/compiler.py +++ b/openkb/agent/compiler.py @@ -448,18 +448,7 @@ def _write_concept(wiki_dir: Path, name: str, content: str, source_file: str, is if is_update and path.exists(): existing = path.read_text(encoding="utf-8") if source_file not in existing: - if existing.startswith("---"): - end = existing.find("---", 3) - if end != -1: - fm = existing[:end + 3] - body = existing[end + 3:] - if "sources:" in fm: - fm = fm.replace("sources: [", f"sources: [{source_file}, ") - else: - fm = fm.replace("---\n", f"---\nsources: [{source_file}]\n", 1) - existing = fm + body - else: - existing = f"---\nsources: [{source_file}]\n---\n\n" + existing + existing = _prepend_source_to_frontmatter(existing, source_file) # Strip frontmatter from LLM content to avoid duplicate blocks clean = content if clean.startswith("---"): diff --git a/tests/test_compiler.py b/tests/test_compiler.py index 881e8bea..19fb6fcb 100644 --- a/tests/test_compiler.py +++ b/tests/test_compiler.py @@ -181,6 +181,22 @@ def test_update_concept_appends_source(self, tmp_path): assert "paper1.pdf" in text assert "New info from paper2." in text + def test_update_concept_merges_into_non_canonical_sources(self, tmp_path): + """sources:[a] (no space after colon) must still get paper2 prepended, + matching the helper's behavior in _add_related_link.""" + wiki = tmp_path / "wiki" + concepts = wiki / "concepts" + concepts.mkdir(parents=True) + (concepts / "attention.md").write_text( + "---\nsources:[paper1.pdf]\n---\n\n# Attention\n\nOld content.", + encoding="utf-8", + ) + _write_concept(wiki, "attention", "New info from paper2.", "paper2.pdf", True) + text = (concepts / "attention.md").read_text() + assert "paper1.pdf" in text + assert "paper2.pdf" in text + assert "New info from paper2." in text + class TestUpdateIndex: def test_appends_entries_with_briefs(self, tmp_path):