Experiments with splitting markdown documents into logical units.

In [None]:
# Read the note as UTF-8 explicitly so the result does not depend on the
# platform's default encoding.
with open("tmp/dnd-notes-main/Factions/The Institute.md", "r", encoding="utf-8") as f:
    lines = f.readlines()

# Remove YAML frontmatter from beginning of file.
# Frontmatter only exists when the very first line is the opening "---"
# delimiter; a "---" further down an ordinary document is a horizontal rule
# and must not be used as a cut point. If the opening delimiter is never
# closed, the file is left untouched instead of raising.
frontmatter_end = 0
if lines and lines[0].rstrip() == "---":
    for idx, candidate in enumerate(lines[1:], start=1):
        if candidate.rstrip() == "---":
            frontmatter_end = idx + 1
            break
lines = lines[frontmatter_end:]

In [None]:
import pprint

# Map each heading line (including its leading "#" markers) to the list of
# stripped, non-blank body lines that follow it. Text that appears before the
# first heading is ignored.
sections = {}
current_section = ""
paragraphs = []

for line in lines:
    if line.startswith("#"):
        # A new heading closes the section that was being collected.
        if current_section:
            sections[current_section] = paragraphs
            paragraphs = []
        current_section = line.strip()
    elif current_section and line.strip():
        paragraphs.append(line.strip())

# Flush the last section. It is stored even when it has no body, so a heading
# at the end of the file is treated the same way as an empty heading in the
# middle of the file (which is always recorded with an empty list).
if current_section:
    sections[current_section] = paragraphs

pprint.pprint(sections)

The above code splits the markdown document by heading; however, nested headings are not handled. At this stage, I'm unsure whether nested headings are useful for this tool.

To handle nested headings, we need to keep track of the current heading level. We can do this by counting the leading `#` characters in each heading.

In [None]:
# Nest the sections in the same way as the original file, based on the number
# of leading "#" characters in each section header.
#
# Every heading becomes a node of the form
#     {"paragraphs": [...], "subsections": {heading: node, ...}}
# so a heading can hold both its own text and its child headings. A heading is
# attached to the closest preceding heading with a smaller level; headings
# with no such ancestor go to the top level of `sections_dict`.
#
# This relies on `sections` preserving document order (dicts keep insertion
# order).


def _heading_level(heading):
    """Return the number of leading "#" characters in *heading*."""
    # Only the leading run counts: a "#" inside the title text is not a level.
    return len(heading) - len(heading.lstrip("#"))


sections_dict = {}
# Stack of (level, subsections dict) for the headings that enclose the
# heading currently being placed.
open_headings = []

for section, body in sections.items():
    level = _heading_level(section)
    node = {"paragraphs": body, "subsections": {}}
    # Close every heading that is at the same or a deeper level: it cannot be
    # an ancestor of the current one.
    while open_headings and open_headings[-1][0] >= level:
        open_headings.pop()
    siblings = open_headings[-1][1] if open_headings else sections_dict
    siblings[section] = node
    open_headings.append((level, node["subsections"]))

pprint.pprint(sections_dict)

## Attempt to use pandoc to split markdown

I now use the pandoc library to handle the markdown parsing.

In [1]:
import pandoc
import pprint

# Parse the note with pandoc. The file is decoded as UTF-8 explicitly so the
# text handed to pandoc does not depend on the platform's default encoding.
with open("tmp/dnd-notes-main/Factions/The Institute.md", "r", encoding="utf-8") as f:
    doc = pandoc.read(file=f, format="markdown")

In [3]:
from pandoc.types import Header, Para, BulletList, OrderedList

def split_doc_by_header(doc, include_orphans=True):
    sections = {}
    title_str = ""
    for elt in pandoc.iter(doc):
        if elt == doc:
            continue
        match elt:
            case Header(_, _, title):
                title_str = pandoc.write(title).strip()
                sections[pandoc.write(title).strip()] = []
            case Para(x):
                if not title_str and include_orphans:
                    title_str = "[No Header]"
                if title_str not in sections and include_orphans:
                    sections[title_str] = []
                sections[title_str].append(pandoc.write(x, options=["--wrap=none"]).strip())
            case BulletList(_) | OrderedList(_):
                if not title_str and include_orphans:
                    title_str = "[No Header]"
                if title_str not in sections and include_orphans:
                    sections[title_str] = []
                # split the list on newlines
                list_str = pandoc.write(elt, options=["--wrap=none"])
                list_items = list_str.splitlines()
                sections[title_str].extend(list_items)
    return sections

# Show the header -> text mapping for the parsed document; the wide width
# keeps long paragraphs from being wrapped into many short fragments.
pprint.pprint(split_doc_by_header(doc), width=200)

{'History': ["The Institute's presence on the Menagerie Coast is part of a trade agreement between the Dwendalian Empire and the Clovis Concord. As far as the Concord know, the Hearthstar Peaks "
             "facility's purpose is research on the peculiar arcane fire which alights the highest peak. Their true purpose remains a secret even to their own emperor, he believes they are working "
             'on a weapon.'],
 'Key Figures': ['Archmagus Syrus: Syrus is the leader of the Institute and the most powerful spellcaster among their ranks. He is a master of arcane magic and has a deep understanding of the other '
                 'planes. He is cunning and ruthless, and will stop at nothing to achieve the goals of the Institute.',
                 "Yef Golbloom: Golbloom is a sinister therapist and interrogator who is responsible for conducting interviews with the Institute's experiments. He is skilled in the art of "
                 "persuasion and manipulation, and is able to ex