From 56e8393bc9cf521ea6278339e008e9f47636042e Mon Sep 17 00:00:00 2001 From: octo-patch Date: Sun, 26 Apr 2026 11:19:14 +0800 Subject: [PATCH] fix: fix markdown parser edge cases in page_index_md.py (fixes #245) Three robustness fixes for extract_nodes_from_markdown and md_to_tree: 1. ATX closing hashes: update header regex to strip trailing ## markers so "## My Section ##" yields title "My Section" instead of "My Section ##" 2. Tilde + mismatched fence lengths: replace the simple backtick-only toggle with proper fence tracking that records the opening fence character ('`' or '~') and length. A closing fence must use the same character and be at least as long as the opening fence, so: - ~~~python...~~~ correctly suppresses headers inside tilde blocks - ````...``` does NOT close a 4-backtick block (3 < 4) 3. Headerless documents: when no headers are found, md_to_tree now creates a single root node containing the full document text instead of returning an empty structure, preventing silent data loss. Co-Authored-By: Octopus --- pageindex/page_index_md.py | 47 +++++++++++++++++++++++++++++--------- 1 file changed, 36 insertions(+), 11 deletions(-) diff --git a/pageindex/page_index_md.py b/pageindex/page_index_md.py index 5a5971690..ca1053222 100644 --- a/pageindex/page_index_md.py +++ b/pageindex/page_index_md.py @@ -30,25 +30,39 @@ async def generate_summaries_for_structure_md(structure, summary_token_threshold def extract_nodes_from_markdown(markdown_content): - header_pattern = r'^(#{1,6})\s+(.+)$' - code_block_pattern = r'^```' + # Strip optional trailing closing hashes from ATX headers (e.g. "## Title ##" -> "Title") + header_pattern = r'^(#{1,6})\s+(.+?)(?:\s+#+\s*)?$' + # Match fenced code blocks: backticks (3+) or tildes (3+) + code_block_pattern = r'^(`{3,}|~{3,})' node_list = [] - + lines = markdown_content.split('\n') in_code_block = False - + fence_char = None # fence character: '`' or '~' + fence_len = 0 # minimum fence length needed to close the block + for line_num, line in enumerate(lines, 1): stripped_line = line.strip() - - # Check for code block delimiters (triple backticks) - if re.match(code_block_pattern, stripped_line): - in_code_block = not in_code_block + + # Check for code block delimiters (3+ backticks or tildes) + fence_match = re.match(code_block_pattern, stripped_line) + if fence_match: + marker = fence_match.group(1) + if not in_code_block: + in_code_block = True + fence_char = marker[0] + fence_len = len(marker) + elif marker[0] == fence_char and len(marker) >= fence_len: + # Close only when same fence character with at least the opening length + in_code_block = False + fence_char = None + fence_len = 0 continue - + # Skip empty lines if not stripped_line: continue - + # Only look for headers when not inside a code block if not in_code_block: match = re.match(header_pattern, stripped_line) @@ -250,7 +264,18 @@ async def md_to_tree(md_path, if_thinning=False, min_token_threshold=None, if_ad print(f"Extracting text content from nodes...") nodes_with_content = extract_node_text_content(node_list, markdown_lines) - + + # Handle headerless documents: treat the entire content as a single node + if not nodes_with_content: + doc_name = os.path.splitext(os.path.basename(md_path))[0] + full_text = '\n'.join(markdown_lines).strip() + nodes_with_content = [{ + 'title': doc_name, + 'line_num': 1, + 'level': 1, + 'text': full_text, + }] + if if_thinning: nodes_with_content = update_node_list_with_text_token_count(nodes_with_content, model=model) print(f"Thinning nodes...")