diff --git a/openkb/agent/compiler.py b/openkb/agent/compiler.py
index bf15a1c3..b5695d58 100644
--- a/openkb/agent/compiler.py
+++ b/openkb/agent/compiler.py
@@ -312,20 +312,63 @@ def _read_concept_briefs(wiki_dir: Path) -> str:
     return "\n".join(lines) or "(none yet)"


+def _iter_h2_headings(lines: list[str]) -> list[tuple[int, str]]:
+    """Return ``[(line_index, normalized_heading), ...]`` for every ATX H2.
+
+    A line counts as H2 when it starts with ``"## "`` (two hashes + space).
+    ``normalized_heading`` is the line with trailing whitespace stripped, so
+    ``"## Documents "`` normalizes to ``"## Documents"`` — letting callers
+    use exact-string comparison without tripping on stray whitespace.
+
+    Used by ``_get_section_bounds`` so heading lookup and the next-section
+    boundary share one scan and one normalization rule.
+    """
+    return [
+        (i, line.rstrip())
+        for i, line in enumerate(lines)
+        if line.startswith("## ")
+    ]
+
+
 def _get_section_bounds(lines: list[str], heading: str) -> tuple[int, int] | None:
-    """Return the [start, end) bounds for a Markdown H2 section."""
-    for i, line in enumerate(lines):
-        if line == heading:
-            start = i + 1
-            end = len(lines)
-            for j in range(start, len(lines)):
-                if lines[j].startswith("## "):
-                    end = j
-                    break
+    """Return the [start, end) bounds for a Markdown H2 section.
+
+    Uses ``_iter_h2_headings`` so the same H2 detection that finds the
+    target heading also determines the section's end (the next H2). A
+    drifted ``"## Documents "`` matches ``"## Documents"`` because both
+    sides are normalized.
+    """
+    headings = _iter_h2_headings(lines)
+    for k, (idx, normalized) in enumerate(headings):
+        if normalized == heading:
+            start = idx + 1
+            end = headings[k + 1][0] if k + 1 < len(headings) else len(lines)
             return start, end
     return None


+def _ensure_h2_section(lines: list[str], heading: str) -> None:
+    """Ensure an H2 section ``heading`` exists in ``lines``; append if missing.
+
+    Recovers from hand-edited or drifted index.md files where the expected
+    section was removed or renamed — without this, downstream inserts would
+    silently no-op and entries would be dropped.
+    """
+    if _get_section_bounds(lines, heading) is not None:
+        return
+    logger.warning(
+        "Wiki page is missing %r section; appending it. "
+        "Check whether the file was hand-edited away from the canonical layout.",
+        heading,
+    )
+    while lines and lines[-1] == "":
+        lines.pop()
+    if lines:
+        lines.append("")
+    lines.append(heading)
+    lines.append("")
+
+
 def _section_contains_link(lines: list[str], heading: str, link: str) -> bool:
     """Check whether an index entry already exists inside the named section."""
     bounds = _get_section_bounds(lines, heading)
@@ -405,18 +448,7 @@ def _write_concept(wiki_dir: Path, name: str, content: str, source_file: str, is
     if is_update and path.exists():
         existing = path.read_text(encoding="utf-8")
         if source_file not in existing:
-            if existing.startswith("---"):
-                end = existing.find("---", 3)
-                if end != -1:
-                    fm = existing[:end + 3]
-                    body = existing[end + 3:]
-                    if "sources:" in fm:
-                        fm = fm.replace("sources: [", f"sources: [{source_file}, ")
-                    else:
-                        fm = fm.replace("---\n", f"---\nsources: [{source_file}]\n", 1)
-                    existing = fm + body
-            else:
-                existing = f"---\nsources: [{source_file}]\n---\n\n" + existing
+            existing = _prepend_source_to_frontmatter(existing, source_file)
         # Strip frontmatter from LLM content to avoid duplicate blocks
         clean = content
         if clean.startswith("---"):
@@ -455,6 +487,42 @@ def _write_concept(wiki_dir: Path, name: str, content: str, source_file: str, is
     path.write_text(frontmatter + content, encoding="utf-8")


+def _prepend_source_to_frontmatter(text: str, source_file: str) -> str:
+    """Prepend ``source_file`` to the inline ``sources:`` list in YAML frontmatter.
+
+    Creates the frontmatter or the ``sources:`` line if missing. Returns the
+    text unchanged if ``source_file`` is already present in the list, or if
+    the frontmatter is malformed (no closing ``---``).
+    """
+    if not text.startswith("---"):
+        return f"---\nsources: [{source_file}]\n---\n\n" + text
+
+    fm_end = text.find("---", 3)
+    if fm_end == -1:
+        return text
+
+    fm_block = text[:fm_end]
+    body = text[fm_end:]
+    fm_lines = fm_block.split("\n")
+
+    for i, line in enumerate(fm_lines):
+        if not line.lstrip().startswith("sources:"):
+            continue
+        lb = line.find("[")
+        rb = line.rfind("]")
+        if lb == -1 or rb == -1 or rb < lb:
+            return text
+        items = [s.strip() for s in line[lb + 1:rb].split(",") if s.strip()]
+        if source_file in items:
+            return text
+        items.insert(0, source_file)
+        fm_lines[i] = f"sources: [{', '.join(items)}]"
+        return "\n".join(fm_lines) + body
+
+    fm_lines.insert(1, f"sources: [{source_file}]")
+    return "\n".join(fm_lines) + body
+
+
 def _add_related_link(wiki_dir: Path, concept_slug: str, doc_name: str, source_file: str) -> None:
     """Add a cross-reference link to an existing concept page (no LLM call)."""
     concepts_dir = wiki_dir / "concepts"
@@ -467,20 +535,8 @@ def _add_related_link(wiki_dir: Path, concept_slug: str, doc_name: str, source_f
     if link in text:
         return

-    # Update sources in frontmatter
     if source_file not in text:
-        if text.startswith("---"):
-            end = text.find("---", 3)
-            if end != -1:
-                fm = text[:end + 3]
-                body = text[end + 3:]
-                if "sources:" in fm:
-                    fm = fm.replace("sources: [", f"sources: [{source_file}, ")
-                else:
-                    fm = fm.replace("---\n", f"---\nsources: [{source_file}]\n", 1)
-                text = fm + body
-        else:
-            text = f"---\nsources: [{source_file}]\n---\n\n" + text
+        text = _prepend_source_to_frontmatter(text, source_file)

     text += f"\n\nSee also: {link}"
     path.write_text(text, encoding="utf-8")
@@ -505,13 +561,11 @@ def _backlink_summary(wiki_dir: Path, doc_name: str, concept_slugs: list[str]) -
     if not missing:
         return

-    new_links = "\n".join(f"- [[concepts/{s}]]" for s in missing)
-    if "## Related Concepts" in text:
-        # Append into existing section
-        text = text.replace("## Related Concepts\n", f"## Related Concepts\n{new_links}\n", 1)
-    else:
-        text += f"\n\n## Related Concepts\n{new_links}\n"
-    summary_path.write_text(text, encoding="utf-8")
+    lines = text.split("\n")
+    _ensure_h2_section(lines, "## Related Concepts")
+    for slug in reversed(missing):
+        _insert_section_entry(lines, "## Related Concepts", f"- [[concepts/{slug}]]")
+    summary_path.write_text("\n".join(lines), encoding="utf-8")


 def _backlink_concepts(wiki_dir: Path, doc_name: str, concept_slugs: list[str]) -> None:
@@ -533,11 +587,10 @@ def _backlink_concepts(wiki_dir: Path, doc_name: str, concept_slugs: list[str])
         text = path.read_text(encoding="utf-8")
         if link in text:
             continue
-        if "## Related Documents" in text:
-            text = text.replace("## Related Documents\n", f"## Related Documents\n- {link}\n", 1)
-        else:
-            text += f"\n\n## Related Documents\n- {link}\n"
-        path.write_text(text, encoding="utf-8")
+        lines = text.split("\n")
+        _ensure_h2_section(lines, "## Related Documents")
+        _insert_section_entry(lines, "## Related Documents", f"- {link}")
+        path.write_text("\n".join(lines), encoding="utf-8")

 def _update_index(
     wiki_dir: Path, doc_name: str, concept_names: list[str],
@@ -565,6 +618,10 @@ def _update_index(

     lines = index_path.read_text(encoding="utf-8").split("\n")

+    _ensure_h2_section(lines, "## Documents")
+    if concept_names:
+        _ensure_h2_section(lines, "## Concepts")
+
     doc_link = f"[[summaries/{doc_name}]]"
     if not _section_contains_link(lines, "## Documents", doc_link):
         doc_entry = f"- {doc_link} ({doc_type})"
diff --git a/tests/test_compiler.py b/tests/test_compiler.py
index cb02efc0..19fb6fcb 100644
--- a/tests/test_compiler.py
+++ b/tests/test_compiler.py
@@ -181,6 +181,22 @@ def test_update_concept_appends_source(self, tmp_path):
         assert "paper1.pdf" in text
         assert "New info from paper2." in text

+    def test_update_concept_merges_into_non_canonical_sources(self, tmp_path):
+        """sources:[a] (no space after colon) must still get paper2 prepended,
+        matching the helper's behavior in _add_related_link."""
+        wiki = tmp_path / "wiki"
+        concepts = wiki / "concepts"
+        concepts.mkdir(parents=True)
+        (concepts / "attention.md").write_text(
+            "---\nsources:[paper1.pdf]\n---\n\n# Attention\n\nOld content.",
+            encoding="utf-8",
+        )
+        _write_concept(wiki, "attention", "New info from paper2.", "paper2.pdf", True)
+        text = (concepts / "attention.md").read_text()
+        assert "paper1.pdf" in text
+        assert "paper2.pdf" in text
+        assert "New info from paper2." in text
+

 class TestUpdateIndex:
     def test_appends_entries_with_briefs(self, tmp_path):
@@ -289,6 +305,32 @@ def test_adds_concept_entry_when_link_exists_outside_concepts_section(self, tmp_
         assert "- [[summaries/my-doc]] (short) — Mentions [[concepts/attention]] here" in text
         assert "- [[concepts/attention]] — New brief" in text

+    def test_recovers_when_documents_section_missing(self, tmp_path):
+        wiki = tmp_path / "wiki"
+        wiki.mkdir()
+        (wiki / "index.md").write_text(
+            "# Index\n\n## Concepts\n\n## Explorations\n",
+            encoding="utf-8",
+        )
+        _update_index(wiki, "my-doc", [], doc_brief="Brief")
+        text = (wiki / "index.md").read_text()
+        assert "## Documents" in text
+        assert "[[summaries/my-doc]] (short) — Brief" in text
+
+    def test_recovers_when_concepts_section_missing(self, tmp_path):
+        wiki = tmp_path / "wiki"
+        wiki.mkdir()
+        (wiki / "index.md").write_text(
+            "# Index\n\n## Documents\n\n## Explorations\n",
+            encoding="utf-8",
+        )
+        _update_index(wiki, "my-doc", ["attention"],
+                      concept_briefs={"attention": "Focus"})
+        text = (wiki / "index.md").read_text()
+        assert "## Concepts" in text
+        assert "[[concepts/attention]] — Focus" in text
+        assert "[[summaries/my-doc]]" in text
+

 class TestReadWikiContext:
     def test_empty_wiki(self, tmp_path):
@@ -455,6 +497,21 @@ def test_merges_into_existing_section(self, tmp_path):
         assert "[[concepts/transformer]]" in text
         assert text.count("[[concepts/attention]]") == 1

+    def test_section_with_trailing_whitespace_still_merges(self, tmp_path):
+        """Heading with trailing space must merge into the existing section,
+        not append a duplicate H2."""
+        wiki = tmp_path / "wiki"
+        summaries = wiki / "summaries"
+        summaries.mkdir(parents=True)
+        (summaries / "paper.md").write_text(
+            "# Summary\n\nContent.\n\n## Related Concepts \n- [[concepts/attention]]\n",
+            encoding="utf-8",
+        )
+        _backlink_summary(wiki, "paper", ["attention", "transformer"])
+        text = (summaries / "paper.md").read_text()
+        assert "[[concepts/transformer]]" in text
+        assert text.count("## Related Concepts") == 1
+

 class TestBacklinkConcepts:
     def test_adds_summary_link_to_concept(self, tmp_path):
@@ -503,6 +560,22 @@ def test_skips_missing_concept_file(self, tmp_path):
         # Should not raise
         _backlink_concepts(wiki, "paper", ["nonexistent"])

+    def test_section_with_trailing_whitespace_still_merges(self, tmp_path):
+        """Heading with trailing space must merge into the existing section,
+        not append a duplicate H2."""
+        wiki = tmp_path / "wiki"
+        concepts = wiki / "concepts"
+        concepts.mkdir(parents=True)
+        (concepts / "attention.md").write_text(
+            "# Attention\n\n## Related Documents \n- [[summaries/old-paper]]\n",
+            encoding="utf-8",
+        )
+        _backlink_concepts(wiki, "new-paper", ["attention"])
+        text = (concepts / "attention.md").read_text()
+        assert "[[summaries/new-paper]]" in text
+        assert "[[summaries/old-paper]]" in text
+        assert text.count("## Related Documents") == 1
+

 class TestAddRelatedLink:
     def test_adds_see_also_link(self, tmp_path):
@@ -536,6 +609,35 @@ def test_skips_if_file_missing(self, tmp_path):
         # Should not raise
         _add_related_link(wiki, "nonexistent", "doc", "file.pdf")

+    def test_frontmatter_without_space_after_colon_still_merges(self, tmp_path):
+        """sources:[a] (no space after colon) must still prepend new source."""
+        wiki = tmp_path / "wiki"
+        concepts = wiki / "concepts"
+        concepts.mkdir(parents=True)
+        (concepts / "attention.md").write_text(
+            "---\nsources:[paper1.pdf]\n---\n\n# Attention\n",
+            encoding="utf-8",
+        )
+        _add_related_link(wiki, "attention", "new-doc", "paper2.pdf")
+        text = (concepts / "attention.md").read_text()
+        assert "paper2.pdf" in text
+        assert "paper1.pdf" in text
+        assert "[[summaries/new-doc]]" in text
+
+    def test_frontmatter_without_sources_line_gets_one_inserted(self, tmp_path):
+        wiki = tmp_path / "wiki"
+        concepts = wiki / "concepts"
+        concepts.mkdir(parents=True)
+        (concepts / "attention.md").write_text(
+            "---\nbrief: Focus mechanism\n---\n\n# Attention\n",
+            encoding="utf-8",
+        )
+        _add_related_link(wiki, "attention", "new-doc", "paper.pdf")
+        text = (concepts / "attention.md").read_text()
+        assert "sources: [paper.pdf]" in text
+        assert "brief: Focus mechanism" in text
+        assert "[[summaries/new-doc]]" in text
+

 def _mock_completion(responses: list[str]):
     """Create a mock for litellm.completion that returns responses in order."""