From 3c5d4c1a3e3535a41f4855d445cf3bcf6f1485e8 Mon Sep 17 00:00:00 2001
From: mountain <kose2livs@gmail.com>
Date: Mon, 11 May 2026 17:15:21 +0800
Subject: [PATCH 1/4] fix: stop silently dropping wiki updates when
 section/frontmatter drifts

`_update_index`, `_backlink_summary`, `_backlink_concepts`, and
`_add_related_link` all combined a loose `substring in text` check
with a strict newline-anchored `str.replace`. When the user's wiki
had a hand-edited heading (trailing space, missing section, etc.)
or non-canonical frontmatter (`sources:[a]` with no space), the
if-branch matched but the replace did nothing, silently dropping
the new entry/link.

Refactor all four to use line-based section parsing via a new
`_ensure_h2_section` helper (logs a warning + appends when the
section is missing), and extract `_prepend_source_to_frontmatter`
that walks frontmatter lines instead of string-matching. Adds
regression tests covering trailing-whitespace headings, missing
sections, and `sources:` without a space.

Fixes #26.
---
 openkb/agent/compiler.py | 97 +++++++++++++++++++++++++++++-----------
 tests/test_compiler.py   | 82 +++++++++++++++++++++++++++++++++
 2 files changed, 154 insertions(+), 25 deletions(-)

diff --git a/openkb/agent/compiler.py b/openkb/agent/compiler.py
index bf15a1c3..e32bb8c8 100644
--- a/openkb/agent/compiler.py
+++ b/openkb/agent/compiler.py
@@ -326,6 +326,28 @@ def _get_section_bounds(lines: list[str], heading: str) -> tuple[int, int] | Non
     return None
 
 
+def _ensure_h2_section(lines: list[str], heading: str) -> None:
+    """Ensure an H2 section ``heading`` exists in ``lines``; append if missing.
+
+    Recovers from hand-edited or drifted index.md files where the expected
+    section was removed or renamed — without this, downstream inserts would
+    silently no-op and entries would be dropped.
+    """
+    if _get_section_bounds(lines, heading) is not None:
+        return
+    logger.warning(
+        "index.md is missing %r section; appending it. "
+        "Check whether the file was hand-edited away from the canonical layout.",
+        heading,
+    )
+    while lines and lines[-1] == "":
+        lines.pop()
+    if lines:
+        lines.append("")
+    lines.append(heading)
+    lines.append("")
+
+
 def _section_contains_link(lines: list[str], heading: str, link: str) -> bool:
     """Check whether an index entry already exists inside the named section."""
     bounds = _get_section_bounds(lines, heading)
@@ -455,6 +477,42 @@ def _write_concept(wiki_dir: Path, name: str, content: str, source_file: str, is
         path.write_text(frontmatter + content, encoding="utf-8")
 
 
+def _prepend_source_to_frontmatter(text: str, source_file: str) -> str:
+    """Prepend ``source_file`` to the inline ``sources:`` list in YAML frontmatter.
+
+    Creates the frontmatter or the ``sources:`` line if missing. Returns the
+    text unchanged if ``source_file`` is already present in the list, or if
+    the frontmatter is malformed (no closing ``---``).
+    """
+    if not text.startswith("---"):
+        return f"---\nsources: [{source_file}]\n---\n\n" + text
+
+    fm_end = text.find("---", 3)
+    if fm_end == -1:
+        return text
+
+    fm_block = text[:fm_end]
+    body = text[fm_end:]
+    fm_lines = fm_block.split("\n")
+
+    for i, line in enumerate(fm_lines):
+        if not line.lstrip().startswith("sources:"):
+            continue
+        lb = line.find("[")
+        rb = line.rfind("]")
+        if lb == -1 or rb == -1 or rb < lb:
+            return text
+        items = [s.strip() for s in line[lb + 1:rb].split(",") if s.strip()]
+        if source_file in items:
+            return text
+        items.insert(0, source_file)
+        fm_lines[i] = f"sources: [{', '.join(items)}]"
+        return "\n".join(fm_lines) + body
+
+    fm_lines.insert(1, f"sources: [{source_file}]")
+    return "\n".join(fm_lines) + body
+
+
 def _add_related_link(wiki_dir: Path, concept_slug: str, doc_name: str, source_file: str) -> None:
     """Add a cross-reference link to an existing concept page (no LLM call)."""
     concepts_dir = wiki_dir / "concepts"
@@ -467,20 +525,8 @@ def _add_related_link(wiki_dir: Path, concept_slug: str, doc_name: str, source_f
     if link in text:
         return
 
-    # Update sources in frontmatter
     if source_file not in text:
-        if text.startswith("---"):
-            end = text.find("---", 3)
-            if end != -1:
-                fm = text[:end + 3]
-                body = text[end + 3:]
-                if "sources:" in fm:
-                    fm = fm.replace("sources: [", f"sources: [{source_file}, ")
-                else:
-                    fm = fm.replace("---\n", f"---\nsources: [{source_file}]\n", 1)
-                text = fm + body
-        else:
-            text = f"---\nsources: [{source_file}]\n---\n\n" + text
+        text = _prepend_source_to_frontmatter(text, source_file)
 
     text += f"\n\nSee also: {link}"
     path.write_text(text, encoding="utf-8")
@@ -505,13 +551,11 @@ def _backlink_summary(wiki_dir: Path, doc_name: str, concept_slugs: list[str]) -
     if not missing:
         return
 
-    new_links = "\n".join(f"- [[concepts/{s}]]" for s in missing)
-    if "## Related Concepts" in text:
-        # Append into existing section
-        text = text.replace("## Related Concepts\n", f"## Related Concepts\n{new_links}\n", 1)
-    else:
-        text += f"\n\n## Related Concepts\n{new_links}\n"
-    summary_path.write_text(text, encoding="utf-8")
+    lines = text.split("\n")
+    _ensure_h2_section(lines, "## Related Concepts")
+    for slug in reversed(missing):
+        _insert_section_entry(lines, "## Related Concepts", f"- [[concepts/{slug}]]")
+    summary_path.write_text("\n".join(lines), encoding="utf-8")
 
 
 def _backlink_concepts(wiki_dir: Path, doc_name: str, concept_slugs: list[str]) -> None:
@@ -533,11 +577,10 @@ def _backlink_concepts(wiki_dir: Path, doc_name: str, concept_slugs: list[str])
         text = path.read_text(encoding="utf-8")
         if link in text:
             continue
-        if "## Related Documents" in text:
-            text = text.replace("## Related Documents\n", f"## Related Documents\n- {link}\n", 1)
-        else:
-            text += f"\n\n## Related Documents\n- {link}\n"
-        path.write_text(text, encoding="utf-8")
+        lines = text.split("\n")
+        _ensure_h2_section(lines, "## Related Documents")
+        _insert_section_entry(lines, "## Related Documents", f"- {link}")
+        path.write_text("\n".join(lines), encoding="utf-8")
 
 def _update_index(
     wiki_dir: Path, doc_name: str, concept_names: list[str],
@@ -565,6 +608,10 @@ def _update_index(
 
     lines = index_path.read_text(encoding="utf-8").split("\n")
 
+    _ensure_h2_section(lines, "## Documents")
+    if concept_names:
+        _ensure_h2_section(lines, "## Concepts")
+
     doc_link = f"[[summaries/{doc_name}]]"
     if not _section_contains_link(lines, "## Documents", doc_link):
         doc_entry = f"- {doc_link} ({doc_type})"
diff --git a/tests/test_compiler.py b/tests/test_compiler.py
index cb02efc0..fb02ca24 100644
--- a/tests/test_compiler.py
+++ b/tests/test_compiler.py
@@ -289,6 +289,32 @@ def test_adds_concept_entry_when_link_exists_outside_concepts_section(self, tmp_
         assert "- [[summaries/my-doc]] (short) — Mentions [[concepts/attention]] here" in text
         assert "- [[concepts/attention]] — New brief" in text
 
+    def test_recovers_when_documents_section_missing(self, tmp_path):
+        wiki = tmp_path / "wiki"
+        wiki.mkdir()
+        (wiki / "index.md").write_text(
+            "# Index\n\n## Concepts\n\n## Explorations\n",
+            encoding="utf-8",
+        )
+        _update_index(wiki, "my-doc", [], doc_brief="Brief")
+        text = (wiki / "index.md").read_text()
+        assert "## Documents" in text
+        assert "[[summaries/my-doc]] (short) — Brief" in text
+
+    def test_recovers_when_concepts_section_missing(self, tmp_path):
+        wiki = tmp_path / "wiki"
+        wiki.mkdir()
+        (wiki / "index.md").write_text(
+            "# Index\n\n## Documents\n\n## Explorations\n",
+            encoding="utf-8",
+        )
+        _update_index(wiki, "my-doc", ["attention"],
+                       concept_briefs={"attention": "Focus"})
+        text = (wiki / "index.md").read_text()
+        assert "## Concepts" in text
+        assert "[[concepts/attention]] — Focus" in text
+        assert "[[summaries/my-doc]]" in text
+
 
 class TestReadWikiContext:
     def test_empty_wiki(self, tmp_path):
@@ -455,6 +481,19 @@ def test_merges_into_existing_section(self, tmp_path):
         assert "[[concepts/transformer]]" in text
         assert text.count("[[concepts/attention]]") == 1
 
+    def test_section_with_trailing_whitespace_still_merges(self, tmp_path):
+        """Heading with trailing space must not cause silent drop of new links."""
+        wiki = tmp_path / "wiki"
+        summaries = wiki / "summaries"
+        summaries.mkdir(parents=True)
+        (summaries / "paper.md").write_text(
+            "# Summary\n\nContent.\n\n## Related Concepts \n- [[concepts/attention]]\n",
+            encoding="utf-8",
+        )
+        _backlink_summary(wiki, "paper", ["attention", "transformer"])
+        text = (summaries / "paper.md").read_text()
+        assert "[[concepts/transformer]]" in text
+
 
 class TestBacklinkConcepts:
     def test_adds_summary_link_to_concept(self, tmp_path):
@@ -503,6 +542,20 @@ def test_skips_missing_concept_file(self, tmp_path):
         # Should not raise
         _backlink_concepts(wiki, "paper", ["nonexistent"])
 
+    def test_section_with_trailing_whitespace_still_merges(self, tmp_path):
+        """Heading with trailing space must not cause silent drop of new link."""
+        wiki = tmp_path / "wiki"
+        concepts = wiki / "concepts"
+        concepts.mkdir(parents=True)
+        (concepts / "attention.md").write_text(
+            "# Attention\n\n## Related Documents \n- [[summaries/old-paper]]\n",
+            encoding="utf-8",
+        )
+        _backlink_concepts(wiki, "new-paper", ["attention"])
+        text = (concepts / "attention.md").read_text()
+        assert "[[summaries/new-paper]]" in text
+        assert "[[summaries/old-paper]]" in text
+
 
 class TestAddRelatedLink:
     def test_adds_see_also_link(self, tmp_path):
@@ -536,6 +589,35 @@ def test_skips_if_file_missing(self, tmp_path):
         # Should not raise
         _add_related_link(wiki, "nonexistent", "doc", "file.pdf")
 
+    def test_frontmatter_without_space_after_colon_still_merges(self, tmp_path):
+        """sources:[a] (no space after colon) must still prepend new source."""
+        wiki = tmp_path / "wiki"
+        concepts = wiki / "concepts"
+        concepts.mkdir(parents=True)
+        (concepts / "attention.md").write_text(
+            "---\nsources:[paper1.pdf]\n---\n\n# Attention\n",
+            encoding="utf-8",
+        )
+        _add_related_link(wiki, "attention", "new-doc", "paper2.pdf")
+        text = (concepts / "attention.md").read_text()
+        assert "paper2.pdf" in text
+        assert "paper1.pdf" in text
+        assert "[[summaries/new-doc]]" in text
+
+    def test_frontmatter_without_sources_line_gets_one_inserted(self, tmp_path):
+        wiki = tmp_path / "wiki"
+        concepts = wiki / "concepts"
+        concepts.mkdir(parents=True)
+        (concepts / "attention.md").write_text(
+            "---\nbrief: Focus mechanism\n---\n\n# Attention\n",
+            encoding="utf-8",
+        )
+        _add_related_link(wiki, "attention", "new-doc", "paper.pdf")
+        text = (concepts / "attention.md").read_text()
+        assert "sources: [paper.pdf]" in text
+        assert "brief: Focus mechanism" in text
+        assert "[[summaries/new-doc]]" in text
+
 
 def _mock_completion(responses: list[str]):
     """Create a mock for litellm.completion that returns responses in order."""

From 445da25744263151286994388a1541c13c2655fa Mon Sep 17 00:00:00 2001
From: mountain <kose2livs@gmail.com>
Date: Mon, 11 May 2026 17:52:57 +0800
Subject: [PATCH 2/4] fix: tolerate trailing whitespace on H2 headings in
 section lookup
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`_get_section_bounds` used exact `line == heading` equality, so a heading
like `## Documents ` (trailing space) was treated as a different section.
`_ensure_h2_section` would then append a clean `## Documents` at the end
of the file, stranding the original entries under the drifted heading —
silently converting the previous "drop" failure mode into a "duplicate
section" failure mode.

Match on `line.rstrip() == heading` so drifted whitespace is recovered
properly. Tighten the two regression tests with `text.count(...) == 1`
so the duplicate-H2 case fails them. Generalize the warning text since
`_ensure_h2_section` is also called for backlink sections.
---
 openkb/agent/compiler.py | 11 ++++++++---
 tests/test_compiler.py   |  8 ++++++--
 2 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/openkb/agent/compiler.py b/openkb/agent/compiler.py
index e32bb8c8..0670d683 100644
--- a/openkb/agent/compiler.py
+++ b/openkb/agent/compiler.py
@@ -313,9 +313,14 @@ def _read_concept_briefs(wiki_dir: Path) -> str:
 
 
 def _get_section_bounds(lines: list[str], heading: str) -> tuple[int, int] | None:
-    """Return the [start, end) bounds for a Markdown H2 section."""
+    """Return the [start, end) bounds for a Markdown H2 section.
+
+    Heading lookup tolerates trailing whitespace on the file's heading line
+    so a drifted ``## Documents `` still matches ``## Documents`` — otherwise
+    callers would treat the section as missing and append a duplicate H2.
+    """
     for i, line in enumerate(lines):
-        if line == heading:
+        if line.rstrip() == heading:
             start = i + 1
             end = len(lines)
             for j in range(start, len(lines)):
@@ -336,7 +341,7 @@ def _ensure_h2_section(lines: list[str], heading: str) -> None:
     if _get_section_bounds(lines, heading) is not None:
         return
     logger.warning(
-        "index.md is missing %r section; appending it. "
+        "Wiki page is missing %r section; appending it. "
         "Check whether the file was hand-edited away from the canonical layout.",
         heading,
     )
diff --git a/tests/test_compiler.py b/tests/test_compiler.py
index fb02ca24..881e8bea 100644
--- a/tests/test_compiler.py
+++ b/tests/test_compiler.py
@@ -482,7 +482,8 @@ def test_merges_into_existing_section(self, tmp_path):
         assert text.count("[[concepts/attention]]") == 1
 
     def test_section_with_trailing_whitespace_still_merges(self, tmp_path):
-        """Heading with trailing space must not cause silent drop of new links."""
+        """Heading with trailing space must merge into the existing section,
+        not append a duplicate H2."""
         wiki = tmp_path / "wiki"
         summaries = wiki / "summaries"
         summaries.mkdir(parents=True)
@@ -493,6 +494,7 @@ def test_section_with_trailing_whitespace_still_merges(self, tmp_path):
         _backlink_summary(wiki, "paper", ["attention", "transformer"])
         text = (summaries / "paper.md").read_text()
         assert "[[concepts/transformer]]" in text
+        assert text.count("## Related Concepts") == 1
 
 
 class TestBacklinkConcepts:
@@ -543,7 +545,8 @@ def test_skips_missing_concept_file(self, tmp_path):
         _backlink_concepts(wiki, "paper", ["nonexistent"])
 
     def test_section_with_trailing_whitespace_still_merges(self, tmp_path):
-        """Heading with trailing space must not cause silent drop of new link."""
+        """Heading with trailing space must merge into the existing section,
+        not append a duplicate H2."""
         wiki = tmp_path / "wiki"
         concepts = wiki / "concepts"
         concepts.mkdir(parents=True)
@@ -555,6 +558,7 @@ def test_section_with_trailing_whitespace_still_merges(self, tmp_path):
         text = (concepts / "attention.md").read_text()
         assert "[[summaries/new-paper]]" in text
         assert "[[summaries/old-paper]]" in text
+        assert text.count("## Related Documents") == 1
 
 
 class TestAddRelatedLink:

From f829ba5167eacd93c336287ad2bec6422f8c01d4 Mon Sep 17 00:00:00 2001
From: mountain <kose2livs@gmail.com>
Date: Mon, 11 May 2026 18:04:58 +0800
Subject: [PATCH 3/4] refactor: extract _iter_h2_headings so section lookup
 shares one H2 scan
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Previously `_get_section_bounds` did exact-equality on the start line and
a separate `startswith("## ")` walk to find the end. Two heading-detection
rules in one function — easy to drift out of sync (the trailing-whitespace
fix in the prior commit only touched the start side).

Pull the H2 scan into `_iter_h2_headings`, which returns the normalized
heading + index for every H2 line. `_get_section_bounds` then just indexes
into that list: the target match and the "next H2" boundary use the same
rule by construction. No observable behavior change.
---
 openkb/agent/compiler.py | 38 +++++++++++++++++++++++++++-----------
 1 file changed, 27 insertions(+), 11 deletions(-)

diff --git a/openkb/agent/compiler.py b/openkb/agent/compiler.py
index 0670d683..b05f9f16 100644
--- a/openkb/agent/compiler.py
+++ b/openkb/agent/compiler.py
@@ -312,21 +312,37 @@ def _read_concept_briefs(wiki_dir: Path) -> str:
     return "\n".join(lines) or "(none yet)"
 
 
+def _iter_h2_headings(lines: list[str]) -> list[tuple[int, str]]:
+    """Return ``[(line_index, normalized_heading), ...]`` for every ATX H2.
+
+    A line counts as H2 when it starts with ``"## "`` (two hashes + space).
+    ``normalized_heading`` is the line with trailing whitespace stripped, so
+    ``"## Documents "`` normalizes to ``"## Documents"`` — letting callers
+    use exact-string comparison without tripping on stray whitespace.
+
+    Used by ``_get_section_bounds`` so heading lookup and the next-section
+    boundary share one scan and one normalization rule.
+    """
+    return [
+        (i, line.rstrip())
+        for i, line in enumerate(lines)
+        if line.startswith("## ")
+    ]
+
+
 def _get_section_bounds(lines: list[str], heading: str) -> tuple[int, int] | None:
     """Return the [start, end) bounds for a Markdown H2 section.
 
-    Heading lookup tolerates trailing whitespace on the file's heading line
-    so a drifted ``## Documents `` still matches ``## Documents`` — otherwise
-    callers would treat the section as missing and append a duplicate H2.
+    Uses ``_iter_h2_headings`` so the same H2 detection that finds the
+    target heading also determines the section's end (the next H2). A
+    drifted ``"## Documents "`` matches ``"## Documents"`` because both
+    sides are normalized.
     """
-    for i, line in enumerate(lines):
-        if line.rstrip() == heading:
-            start = i + 1
-            end = len(lines)
-            for j in range(start, len(lines)):
-                if lines[j].startswith("## "):
-                    end = j
-                    break
+    headings = _iter_h2_headings(lines)
+    for k, (idx, normalized) in enumerate(headings):
+        if normalized == heading:
+            start = idx + 1
+            end = headings[k + 1][0] if k + 1 < len(headings) else len(lines)
             return start, end
     return None
 

From f72b3328e7ecbacf8e5c69b05f8b4444bad79a5f Mon Sep 17 00:00:00 2001
From: mountain <kose2livs@gmail.com>
Date: Mon, 11 May 2026 18:23:16 +0800
Subject: [PATCH 4/4] fix: migrate _write_concept to
 _prepend_source_to_frontmatter
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`_write_concept` (called on every concept update) still used the same
`fm.replace("sources: [", ...)` anti-pattern this PR introduced
`_prepend_source_to_frontmatter` to eliminate — leaving the bug fixed in
`_add_related_link` but live on the higher-frequency path. Non-canonical
frontmatter like `sources:[paper.pdf]` (no space after colon) silently
dropped the new source on every subsequent compile.

Replace the inline frontmatter manipulation with a single call to the
helper, mirroring `_add_related_link`. Add a regression test exercising
`sources:[a]` (no space) through `_write_concept`'s update path.
---
 openkb/agent/compiler.py | 13 +------------
 tests/test_compiler.py   | 16 ++++++++++++++++
 2 files changed, 17 insertions(+), 12 deletions(-)

diff --git a/openkb/agent/compiler.py b/openkb/agent/compiler.py
index b05f9f16..b5695d58 100644
--- a/openkb/agent/compiler.py
+++ b/openkb/agent/compiler.py
@@ -448,18 +448,7 @@ def _write_concept(wiki_dir: Path, name: str, content: str, source_file: str, is
     if is_update and path.exists():
         existing = path.read_text(encoding="utf-8")
         if source_file not in existing:
-            if existing.startswith("---"):
-                end = existing.find("---", 3)
-                if end != -1:
-                    fm = existing[:end + 3]
-                    body = existing[end + 3:]
-                    if "sources:" in fm:
-                        fm = fm.replace("sources: [", f"sources: [{source_file}, ")
-                    else:
-                        fm = fm.replace("---\n", f"---\nsources: [{source_file}]\n", 1)
-                    existing = fm + body
-            else:
-                existing = f"---\nsources: [{source_file}]\n---\n\n" + existing
+            existing = _prepend_source_to_frontmatter(existing, source_file)
         # Strip frontmatter from LLM content to avoid duplicate blocks
         clean = content
         if clean.startswith("---"):
diff --git a/tests/test_compiler.py b/tests/test_compiler.py
index 881e8bea..19fb6fcb 100644
--- a/tests/test_compiler.py
+++ b/tests/test_compiler.py
@@ -181,6 +181,22 @@ def test_update_concept_appends_source(self, tmp_path):
         assert "paper1.pdf" in text
         assert "New info from paper2." in text
 
+    def test_update_concept_merges_into_non_canonical_sources(self, tmp_path):
+        """sources:[a] (no space after colon) must still get paper2 prepended,
+        matching the helper's behavior in _add_related_link."""
+        wiki = tmp_path / "wiki"
+        concepts = wiki / "concepts"
+        concepts.mkdir(parents=True)
+        (concepts / "attention.md").write_text(
+            "---\nsources:[paper1.pdf]\n---\n\n# Attention\n\nOld content.",
+            encoding="utf-8",
+        )
+        _write_concept(wiki, "attention", "New info from paper2.", "paper2.pdf", True)
+        text = (concepts / "attention.md").read_text()
+        assert "paper1.pdf" in text
+        assert "paper2.pdf" in text
+        assert "New info from paper2." in text
+
 
 class TestUpdateIndex:
     def test_appends_entries_with_briefs(self, tmp_path):