149 changes: 103 additions & 46 deletions openkb/agent/compiler.py
@@ -312,20 +312,63 @@ def _read_concept_briefs(wiki_dir: Path) -> str:
return "\n".join(lines) or "(none yet)"


def _iter_h2_headings(lines: list[str]) -> list[tuple[int, str]]:
"""Return ``[(line_index, normalized_heading), ...]`` for every ATX H2.

A line counts as H2 when it starts with ``"## "`` (two hashes + space).
``normalized_heading`` is the line with trailing whitespace stripped, so
``"## Documents "`` normalizes to ``"## Documents"`` — letting callers
use exact-string comparison without tripping on stray whitespace.

Used by ``_get_section_bounds`` so heading lookup and the next-section
boundary share one scan and one normalization rule.
"""
return [
(i, line.rstrip())
for i, line in enumerate(lines)
if line.startswith("## ")
]


def _get_section_bounds(lines: list[str], heading: str) -> tuple[int, int] | None:
"""Return the [start, end) bounds for a Markdown H2 section."""
for i, line in enumerate(lines):
if line == heading:
start = i + 1
end = len(lines)
for j in range(start, len(lines)):
if lines[j].startswith("## "):
end = j
break
"""Return the [start, end) bounds for a Markdown H2 section.

Uses ``_iter_h2_headings`` so the same H2 detection that finds the
target heading also determines the section's end (the next H2). A
drifted ``"## Documents "`` matches ``"## Documents"`` because both
sides are normalized.
"""
headings = _iter_h2_headings(lines)
for k, (idx, normalized) in enumerate(headings):
if normalized == heading:
start = idx + 1
end = headings[k + 1][0] if k + 1 < len(headings) else len(lines)
return start, end
return None
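
# A minimal sketch (illustration only, not part of the diff) of the shared
# scan: both H2s are detected and rstripped, so a drifted heading matches
# and the next H2 closes the section.
example = ["# Index", "", "## Documents ", "- [[summaries/a]]", "## Concepts"]
assert _iter_h2_headings(example) == [(2, "## Documents"), (4, "## Concepts")]
assert _get_section_bounds(example, "## Documents") == (3, 4)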


def _ensure_h2_section(lines: list[str], heading: str) -> None:
"""Ensure an H2 section ``heading`` exists in ``lines``; append if missing.

Recovers from hand-edited or drifted index.md files where the expected
section was removed or renamed — without this, downstream inserts would
silently no-op and entries would be dropped.
"""
if _get_section_bounds(lines, heading) is not None:
return
logger.warning(
"Wiki page is missing %r section; appending it. "
"Check whether the file was hand-edited away from the canonical layout.",
heading,
)
while lines and lines[-1] == "":
lines.pop()
if lines:
lines.append("")
lines.append(heading)
lines.append("")


def _section_contains_link(lines: list[str], heading: str, link: str) -> bool:
"""Check whether an index entry already exists inside the named section."""
bounds = _get_section_bounds(lines, heading)
@@ -405,18 +448,7 @@ def _write_concept(wiki_dir: Path, name: str, content: str, source_file: str, is
if is_update and path.exists():
existing = path.read_text(encoding="utf-8")
if source_file not in existing:
if existing.startswith("---"):
end = existing.find("---", 3)
if end != -1:
fm = existing[:end + 3]
body = existing[end + 3:]
if "sources:" in fm:
fm = fm.replace("sources: [", f"sources: [{source_file}, ")
else:
fm = fm.replace("---\n", f"---\nsources: [{source_file}]\n", 1)
existing = fm + body
else:
existing = f"---\nsources: [{source_file}]\n---\n\n" + existing
existing = _prepend_source_to_frontmatter(existing, source_file)
# Strip frontmatter from LLM content to avoid duplicate blocks
clean = content
if clean.startswith("---"):
Expand Down Expand Up @@ -455,6 +487,42 @@ def _write_concept(wiki_dir: Path, name: str, content: str, source_file: str, is
path.write_text(frontmatter + content, encoding="utf-8")


def _prepend_source_to_frontmatter(text: str, source_file: str) -> str:
"""Prepend ``source_file`` to the inline ``sources:`` list in YAML frontmatter.

    Creates the frontmatter block or the ``sources:`` line if missing. Returns
    the text unchanged if ``source_file`` is already in the list, or if the
    frontmatter is malformed (no closing ``---``, or a ``sources:`` line with
    no bracketed list).
"""
if not text.startswith("---"):
return f"---\nsources: [{source_file}]\n---\n\n" + text

fm_end = text.find("---", 3)
if fm_end == -1:
return text

fm_block = text[:fm_end]
body = text[fm_end:]
fm_lines = fm_block.split("\n")

for i, line in enumerate(fm_lines):
if not line.lstrip().startswith("sources:"):
continue
lb = line.find("[")
rb = line.rfind("]")
if lb == -1 or rb == -1 or rb < lb:
return text
items = [s.strip() for s in line[lb + 1:rb].split(",") if s.strip()]
if source_file in items:
return text
items.insert(0, source_file)
fm_lines[i] = f"sources: [{', '.join(items)}]"
return "\n".join(fm_lines) + body

fm_lines.insert(1, f"sources: [{source_file}]")
return "\n".join(fm_lines) + body


def _add_related_link(wiki_dir: Path, concept_slug: str, doc_name: str, source_file: str) -> None:
"""Add a cross-reference link to an existing concept page (no LLM call)."""
concepts_dir = wiki_dir / "concepts"
@@ -467,20 +535,8 @@ def _add_related_link(wiki_dir: Path, concept_slug: str, doc_name: str, source_f
if link in text:
return

# Update sources in frontmatter
if source_file not in text:
if text.startswith("---"):
end = text.find("---", 3)
if end != -1:
fm = text[:end + 3]
body = text[end + 3:]
if "sources:" in fm:
fm = fm.replace("sources: [", f"sources: [{source_file}, ")
else:
fm = fm.replace("---\n", f"---\nsources: [{source_file}]\n", 1)
text = fm + body
else:
text = f"---\nsources: [{source_file}]\n---\n\n" + text
text = _prepend_source_to_frontmatter(text, source_file)

text += f"\n\nSee also: {link}"
path.write_text(text, encoding="utf-8")
Expand All @@ -505,13 +561,11 @@ def _backlink_summary(wiki_dir: Path, doc_name: str, concept_slugs: list[str]) -
if not missing:
return

new_links = "\n".join(f"- [[concepts/{s}]]" for s in missing)
if "## Related Concepts" in text:
# Append into existing section
text = text.replace("## Related Concepts\n", f"## Related Concepts\n{new_links}\n", 1)
else:
text += f"\n\n## Related Concepts\n{new_links}\n"
summary_path.write_text(text, encoding="utf-8")
lines = text.split("\n")
_ensure_h2_section(lines, "## Related Concepts")
for slug in reversed(missing):
_insert_section_entry(lines, "## Related Concepts", f"- [[concepts/{slug}]]")
summary_path.write_text("\n".join(lines), encoding="utf-8")
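
# Ordering note (illustration; assumes _insert_section_entry, defined
# elsewhere in this module, places each new entry at the top of the section):
# inserting reversed(missing) one at a time preserves the original slug
# order, e.g. missing == ["a", "b"] renders as "a" then "b".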


def _backlink_concepts(wiki_dir: Path, doc_name: str, concept_slugs: list[str]) -> None:
@@ -533,11 +587,10 @@ def _backlink_concepts(wiki_dir: Path, doc_name: str, concept_slugs: list[str])
text = path.read_text(encoding="utf-8")
if link in text:
continue
if "## Related Documents" in text:
text = text.replace("## Related Documents\n", f"## Related Documents\n- {link}\n", 1)
else:
text += f"\n\n## Related Documents\n- {link}\n"
path.write_text(text, encoding="utf-8")
lines = text.split("\n")
_ensure_h2_section(lines, "## Related Documents")
_insert_section_entry(lines, "## Related Documents", f"- {link}")
path.write_text("\n".join(lines), encoding="utf-8")

def _update_index(
wiki_dir: Path, doc_name: str, concept_names: list[str],
@@ -565,6 +618,10 @@ def _update_index(

lines = index_path.read_text(encoding="utf-8").split("\n")

_ensure_h2_section(lines, "## Documents")
if concept_names:
_ensure_h2_section(lines, "## Concepts")

doc_link = f"[[summaries/{doc_name}]]"
if not _section_contains_link(lines, "## Documents", doc_link):
doc_entry = f"- {doc_link} ({doc_type})"
102 changes: 102 additions & 0 deletions tests/test_compiler.py
@@ -181,6 +181,22 @@ def test_update_concept_appends_source(self, tmp_path):
assert "paper1.pdf" in text
assert "New info from paper2." in text

def test_update_concept_merges_into_non_canonical_sources(self, tmp_path):
"""sources:[a] (no space after colon) must still get paper2 prepended,
matching the helper's behavior in _add_related_link."""
wiki = tmp_path / "wiki"
concepts = wiki / "concepts"
concepts.mkdir(parents=True)
(concepts / "attention.md").write_text(
"---\nsources:[paper1.pdf]\n---\n\n# Attention\n\nOld content.",
encoding="utf-8",
)
_write_concept(wiki, "attention", "New info from paper2.", "paper2.pdf", True)
text = (concepts / "attention.md").read_text()
assert "paper1.pdf" in text
assert "paper2.pdf" in text
assert "New info from paper2." in text


class TestUpdateIndex:
def test_appends_entries_with_briefs(self, tmp_path):
@@ -289,6 +305,32 @@ def test_adds_concept_entry_when_link_exists_outside_concepts_section(self, tmp_
assert "- [[summaries/my-doc]] (short) — Mentions [[concepts/attention]] here" in text
assert "- [[concepts/attention]] — New brief" in text

def test_recovers_when_documents_section_missing(self, tmp_path):
wiki = tmp_path / "wiki"
wiki.mkdir()
(wiki / "index.md").write_text(
"# Index\n\n## Concepts\n\n## Explorations\n",
encoding="utf-8",
)
_update_index(wiki, "my-doc", [], doc_brief="Brief")
text = (wiki / "index.md").read_text()
assert "## Documents" in text
assert "[[summaries/my-doc]] (short) — Brief" in text

def test_recovers_when_concepts_section_missing(self, tmp_path):
wiki = tmp_path / "wiki"
wiki.mkdir()
(wiki / "index.md").write_text(
"# Index\n\n## Documents\n\n## Explorations\n",
encoding="utf-8",
)
_update_index(wiki, "my-doc", ["attention"],
concept_briefs={"attention": "Focus"})
text = (wiki / "index.md").read_text()
assert "## Concepts" in text
assert "[[concepts/attention]] — Focus" in text
assert "[[summaries/my-doc]]" in text


class TestReadWikiContext:
def test_empty_wiki(self, tmp_path):
@@ -455,6 +497,21 @@ def test_merges_into_existing_section(self, tmp_path):
assert "[[concepts/transformer]]" in text
assert text.count("[[concepts/attention]]") == 1

def test_section_with_trailing_whitespace_still_merges(self, tmp_path):
"""Heading with trailing space must merge into the existing section,
not append a duplicate H2."""
wiki = tmp_path / "wiki"
summaries = wiki / "summaries"
summaries.mkdir(parents=True)
(summaries / "paper.md").write_text(
"# Summary\n\nContent.\n\n## Related Concepts \n- [[concepts/attention]]\n",
encoding="utf-8",
)
_backlink_summary(wiki, "paper", ["attention", "transformer"])
text = (summaries / "paper.md").read_text()
assert "[[concepts/transformer]]" in text
assert text.count("## Related Concepts") == 1


class TestBacklinkConcepts:
def test_adds_summary_link_to_concept(self, tmp_path):
@@ -503,6 +560,22 @@ def test_skips_missing_concept_file(self, tmp_path):
# Should not raise
_backlink_concepts(wiki, "paper", ["nonexistent"])

def test_section_with_trailing_whitespace_still_merges(self, tmp_path):
"""Heading with trailing space must merge into the existing section,
not append a duplicate H2."""
wiki = tmp_path / "wiki"
concepts = wiki / "concepts"
concepts.mkdir(parents=True)
(concepts / "attention.md").write_text(
"# Attention\n\n## Related Documents \n- [[summaries/old-paper]]\n",
encoding="utf-8",
)
_backlink_concepts(wiki, "new-paper", ["attention"])
text = (concepts / "attention.md").read_text()
assert "[[summaries/new-paper]]" in text
assert "[[summaries/old-paper]]" in text
assert text.count("## Related Documents") == 1


class TestAddRelatedLink:
def test_adds_see_also_link(self, tmp_path):
@@ -536,6 +609,35 @@ def test_skips_if_file_missing(self, tmp_path):
# Should not raise
_add_related_link(wiki, "nonexistent", "doc", "file.pdf")

def test_frontmatter_without_space_after_colon_still_merges(self, tmp_path):
"""sources:[a] (no space after colon) must still prepend new source."""
wiki = tmp_path / "wiki"
concepts = wiki / "concepts"
concepts.mkdir(parents=True)
(concepts / "attention.md").write_text(
"---\nsources:[paper1.pdf]\n---\n\n# Attention\n",
encoding="utf-8",
)
_add_related_link(wiki, "attention", "new-doc", "paper2.pdf")
text = (concepts / "attention.md").read_text()
assert "paper2.pdf" in text
assert "paper1.pdf" in text
assert "[[summaries/new-doc]]" in text

def test_frontmatter_without_sources_line_gets_one_inserted(self, tmp_path):
wiki = tmp_path / "wiki"
concepts = wiki / "concepts"
concepts.mkdir(parents=True)
(concepts / "attention.md").write_text(
"---\nbrief: Focus mechanism\n---\n\n# Attention\n",
encoding="utf-8",
)
_add_related_link(wiki, "attention", "new-doc", "paper.pdf")
text = (concepts / "attention.md").read_text()
assert "sources: [paper.pdf]" in text
assert "brief: Focus mechanism" in text
assert "[[summaries/new-doc]]" in text


def _mock_completion(responses: list[str]):
"""Create a mock for litellm.completion that returns responses in order."""