# mdmanager

> Obsidian-compatible Markdown reader

In [None]:
#| default_exp mdmanager

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
from memexplatform_obsidian.settings import ObsidianConfig
from enum import Enum
import urllib
import mistletoe
from pathlib import Path
from mistletoe.ast_renderer import ASTRenderer
from mistletoe import Document
import yaml
import re
from typing import List
from mistletoe.block_token import BlockToken
from mistletoe import Document, HTMLRenderer
from mistletoe.ast_renderer import AstRenderer
from mistletoe import span_token, block_token, token
from mistletoe.span_token import Link, RawText, EscapeSequence, AutoLink
from mistletoe.span_tokenizer import tokenize
from mistletoe.token import Token


In [None]:
from mistletoe import markdown

In [None]:
#| export
# Module-level settings object (this cell is marked `#| export`).
# The trailing bare `config` expression only displays the value in the notebook.
config = ObsidianConfig(); config
# (config.PORTAL/'index.qmd').exists()

ObsidianConfig(PREFIX='/obsidian', OBSIDIAN_VAULT=Path('/Users/rahul1.saraf/rahuketu/programming/notesobs'))

In [None]:
#| export
def get_subdirs(vault: Path):
    """Return every visible directory below ``vault``.

    The vault is walked recursively with ``Path.rglob``. A directory is
    skipped when any path component *below the vault root* starts with
    ``.``, ``_`` or ``logseq``; everything nested inside such a directory is
    therefore skipped as well.

    Args:
        vault: Root directory to scan.

    Returns:
        A list of ``Path`` objects (each still prefixed with ``vault``), in
        the order ``rglob`` yields them.
    """
    skipped_prefixes = (".", "_", "logseq")
    subdirs = []
    for p in vault.rglob("*"):
        if not p.is_dir():
            continue
        # Look only at the components below the vault root: the location of
        # the vault itself (e.g. ``~/.notes/vault``) must not hide its content.
        relative_parts = p.relative_to(vault).parts
        if any(part.startswith(skipped_prefixes) for part in relative_parts):
            continue
        subdirs.append(p)
    return subdirs

In [None]:
# Exploration cell: list the vault's sub-directories and load one real note.
# NOTE(review): `.ls()` is not a standard `pathlib.Path` method -- presumably
# patched in by a library imported elsewhere; confirm.
# config.OBSIDIAN_VAULT.ls()
vault_path = Path(config.OBSIDIAN_VAULT)
subdirs = get_subdirs(vault_path); subdirs
pages_dir = config.OBSIDIAN_VAULT/"pages"; pages_dir.ls()[-1]

# Sample note used by the rendering experiments further down.
doc_real = pages_dir/"Product Mindset for RAG.md"; doc_real.read_text()

'---\ntitle: Product Mindset for RAG\ntags:\ncollection:\n  - "[[RAG]]"\n---\n## Introduction\n\nDoing correct things involves approaching the problem with the right mindset. Approaching RAG as a project with technical metrics like choosing vector embeddings, llm , chunk size often misses the point of building a RAG product in the first place, providing correct information to the user in shortest possible time. \n\nTo get things right we need to think of RAG as recommendation system and identify the ways to continuously and iteratively improve our system as we systematically collect and receive user feedback. Our main objective should be customer delight, retention and increased engagement. \n\nSo how do we get started. In a project mindset, most of the time people face issues with chicken and egg problem.  Without domain knowledge, without knowing anything about the users and their experience with your product; How do you get started? Should we spend huge amount of money in collecting

In [None]:
# Minimal mistletoe example: parse a single line into a Document.
doc = Document(["Hello *world*!"])
print(doc)  # prints the Document's repr (child count, line number), not the full tree

<mistletoe.block_token.Document with 1 child line_number=1>


In [None]:
#| export
class WikiLink(Link):
    """
    Span token for Obsidian wikilinks: ``[[Target]]`` or ``[[Target|Alias]]``.

    Subclasses mistletoe's ``Link`` and fills in the same attributes
    (``target``, ``title``, ``label``, ``dest_type``, ``title_delimiter``),
    plus ``alias`` for the display text.
    """
    pattern = re.compile(r"\[\[([^\]|]+)(?:\|([^\]]+))?\]\]")
    parse_inner = False
    parse_group = 0

    def __init__(self, match):
        raw_target, raw_alias = match.group(1), match.group(2)

        # Destination: trim surrounding whitespace, then pass the result
        # through EscapeSequence.strip.
        self.target = EscapeSequence.strip(raw_target.strip())
        # Display text: the part after "|" when present, else the target.
        self.alias = raw_alias if raw_alias else self.target

        # Remaining Link attributes are fixed values for a wikilink.
        self.title = ""
        self.label = None
        self.dest_type = "wikilink"
        self.title_delimiter = None

        # The alias is kept as plain text; it is not parsed as inline markdown.
        self.children = [RawText(self.alias)]

In [None]:
#| export
class AnyLink(span_token.SpanToken):
    """
    Bare URI with any scheme (obsidian://, logseq://, code://, http://,
    https://, ...).

    Intended for bare URIs only, not markdown links ``[text](...)`` or
    wikilinks ``[[Note]]``. The negative lookbehind rejects a URI that
    directly follows ``]``, ``)`` or ``"``. It also rejects a start position
    in the middle of a scheme, so a rejected ``"https://...`` cannot be
    re-matched one character later as ``ttps://...``. The match runs up to
    the next whitespace character.

    Attributes:
        target: The matched URI.
        title: Always the empty string.
        children: A 1-tuple holding the URI as ``RawText``.
    """
    # Group 1 must capture the whole URL (see parse_group). The inner,
    # redundant group 2 is kept so existing group indices stay valid.
    pattern = re.compile(r'(?<![\]\)"a-zA-Z0-9+.-])(([a-zA-Z][a-zA-Z0-9+.-]*://[^\s]+))')
    parse_group = 1

    def __init__(self, match):
        url = match.group(1)
        self.children = (RawText(url),)
        self.target = url
        self.title = ""

In [None]:
#| export

class Properties(Token):
    """
    One frontmatter entry: a key plus its value parsed into span tokens.

    Attributes:
        key: The frontmatter key, stored as given.
        children: Flat list of span tokens produced from the value.
    """
    repr_attributes = ("key", "children")

    def __init__(self, key, value):
        self.key = key
        # Parse value recursively into proper span tokens
        self.children = self._parse_value(value)

    def _parse_value(self, value):
        """Recursively parse ``value`` into a flat list of span tokens.

        - ``None`` (a key with no value) gives an empty list.
        - A string that consists solely of a URI becomes a single ``AnyLink``.
        - Any other string is tokenized as inline markdown; which constructs
          are recognised depends on the span tokens registered with
          mistletoe at call time.
        - Lists are flattened item by item. For dicts only the values are
          parsed; nested keys are not represented in the result.
        - Every other type is rendered via ``str()`` into a ``RawText``.
        """
        if value is None:
            # An empty YAML value must not show up as the literal text "None".
            return []

        if isinstance(value, str):
            stripped = value.strip()
            # fullmatch, not match: a value such as "https://x.org some note"
            # must keep its trailing text instead of collapsing to the link.
            m = AnyLink.pattern.fullmatch(stripped)
            if m:
                return [AnyLink(m)]
            # Otherwise parse as regular inline markdown
            return span_token.tokenize_inner(stripped)

        if isinstance(value, (list, dict)):
            items = value.values() if isinstance(value, dict) else value
            return [tok for item in items for tok in self._parse_value(item)]

        # Fallback for non-string scalars (numbers, booleans, dates, ...)
        return [RawText(str(value))]

In [None]:
#| export
class TagLink(Link):
    """
    Obsidian-style tag link ``#tag`` that behaves like a mistletoe ``Link``.

    Renders to ``<a href="/tags/tag">#tag</a>`` or similar. Nested tags
    written as ``#parent/child`` are matched as a whole and map to
    ``/tags/parent/child``.
    """
    # "#" not preceded by a word character, followed by one or more segments
    # of word characters/dashes separated by single "/" (nested tags).
    # A trailing "/" is not part of the tag.
    pattern = re.compile(r'(?<!\w)#([\w\-]+(?:/[\w\-]+)*)')
    parse_group = 0
    parse_inner = False

    def __init__(self, match):
        tag_name = match.group(1)

        # URL target (you can change the prefix to match your app)
        self.target = f"/tags/{tag_name}"
        self.title = ""
        self.label = None
        self.dest_type = "taglink"
        self.title_delimiter = None

        # Displayed text is "#tag"
        self.children = [RawText(f"#{tag_name}")]

In [None]:
#| export
class Frontmatter(BlockToken):
    """
    YAML frontmatter block token.

    A leaf block that captures the metadata written between an opening and a
    closing `---` line. After construction it exposes:

    - `content`: the raw text between the fences, stripped;
    - `data`: the mapping loaded with `yaml.safe_load`, or
      `{"error": <message>}` when the text is invalid YAML or not a mapping;
    - `children`: one `Properties` token per top-level key, in order.

    Values under a `tags` key (compared case-insensitively) are prefixed with
    `#` so that later inline parsing can turn them into tag links.

    NOTE(review): `start()` accepts any line that is exactly `---`, wherever
    it occurs, so a `---` rule in the body of a note is presumably also read
    as the start of a frontmatter block -- confirm against the tokenizer.
    """

    repr_attributes = BlockToken.repr_attributes + ("data",)

    # Match `---` followed by anything until the next `---` on its own line.
    # Not used by start()/read() below, which work line by line.
    pattern = re.compile(
        r'(?ms)^(?:---)\s*\n(.*?)(?:\n---\s*$)'
    )

    def __init__(self, content: str):
        """Parse the text returned by `read()` into `data` and `children`."""
        self.content = content.strip()
        try:
            loaded = yaml.safe_load(self.content) or {}
        except yaml.YAMLError as e:
            loaded = {"error": str(e)}
        if not isinstance(loaded, dict):
            # A bare scalar or a list between the fences is valid YAML but
            # has no keys to iterate; report it instead of failing on .items().
            loaded = {
                "error": f"frontmatter is not a YAML mapping (got {type(loaded).__name__})"
            }
        self.data = loaded

        properties = []
        for k, v in self.data.items():
            # YAML keys are not necessarily strings (e.g. `1: x`).
            if str(k).lower() == "tags":
                tags = v if isinstance(v, list) else [v]
                # Prepend "#" so later inline parsing makes TagLinks. A "#"
                # the author already wrote is dropped first to avoid "##tag";
                # non-string and empty entries are skipped.
                names = (t.strip().lstrip("#") for t in tags if isinstance(t, str))
                processed = [f"#{name}" for name in names if name]
                properties.append(Properties(k, processed))
            else:
                properties.append(Properties(k, v))
        self.children = properties

    @classmethod
    def start(cls, line: str) -> bool:
        """Return True when `line`, ignoring surrounding whitespace, is `---`."""
        return line.strip() == "---"

    @classmethod
    def check_interrupts_paragraph(cls, lines):
        """Return True when the next line would start a frontmatter block."""
        return cls.start(lines.peek())

    @classmethod
    def read(cls, lines):
        """Consume the block and return the text between the two fences.

        The opening `---` is skipped, then lines are collected until a closing
        `---` line, which is consumed but not returned. If no closing fence
        exists, every remaining line is collected.
        """
        # Consume first line (`---`)
        next(lines)
        buffer = []
        for line in lines:
            if line.strip() == "---":
                break
            buffer.append(line)
        return "".join(buffer)

In [None]:
#| export
class ObsidianAstRenderer(AstRenderer):
    """AST renderer with the Obsidian tokens registered.

    Passes `Frontmatter`, `WikiLink`, `TagLink` and `AnyLink` to the base
    renderer as extra tokens. The three link tokens are delegated to
    `render_link`.
    """

    def __init__(self, **kwargs):
        super().__init__(Frontmatter, WikiLink, TagLink, AnyLink, **kwargs)  # register custom tokens

    def render_wiki_link(self, token):
        """Render a `WikiLink` the same way as a regular link."""
        return self.render_link(token)

    def render_any_link(self, token):
        """Render an `AnyLink` the same way as a regular link."""
        return self.render_link(token)

    def render_tag_link(self, token):
        """Render a `TagLink` the same way as a regular link."""
        return self.render_link(token)

    def render_frontmatter(self, token: Frontmatter) -> list:
        """Return the frontmatter's children (its `Properties` tokens).

        This is the token list, not the parsed mapping, which lives in
        `token.data`.
        """
        return token.children

In [None]:
#| export

class ObsidianHTMLRenderer(HTMLRenderer):
    """HTML renderer with the Obsidian tokens registered.

    Passes `Frontmatter`, `WikiLink`, `TagLink` and `AnyLink` to the base
    renderer as extra tokens. The three link tokens are rendered through
    `render_link`; frontmatter becomes a two-column HTML table.
    """

    def __init__(self, **kwargs):
        super().__init__(Frontmatter, WikiLink, TagLink, AnyLink, **kwargs)  # register custom tokens

    def render_wiki_link(self, token):
        """Render a `WikiLink` the same way as a regular link."""
        return self.render_link(token)

    def render_any_link(self, token):
        """Render an `AnyLink` the same way as a regular link."""
        return self.render_link(token)

    def render_tag_link(self, token):
        """Render a `TagLink` the same way as a regular link."""
        return self.render_link(token)

    def render_frontmatter(self, token: Frontmatter) -> str:
        """Render frontmatter as `<div class="frontmatter">` wrapping a table.

        Each `Properties` child becomes one row: the key in a `<th>`, the
        rendered value tokens concatenated in a `<td>`. With no children an
        empty `<div class="frontmatter"></div>` is returned.
        """
        # Imported locally so the module's import block is left untouched.
        from html import escape

        if not token.children:
            return '<div class="frontmatter"></div>'

        rows = []
        for prop in token.children:  # each is a Properties token
            # Keys come straight from the note's YAML and are not passed
            # through the renderer, so they are escaped here. The values are
            # rendered token by token via self.render().
            key_html = f"<th>{escape(str(prop.key))}</th>"
            value_html = "".join(self.render(child) for child in prop.children)
            rows.append(f"<tr>{key_html}<td>{value_html}</td></tr>")

        return (
            '<div class="frontmatter">\n'
            '<table>\n'
            + "\n".join(rows)
            + "\n</table>\n</div>"
        )

In [None]:
text = """
---
title: Product Mindset for RAG
tags:
collection:
  - "[[RAG]]"
---

This is fun.
Check this "[[Note|My Alias]]".
2nd link ![Alias2](https://www.google.com/)

> This is a test
> my test
"""

In [None]:
#| export
def get_obsidianmd_ast(text):
    """Parse Obsidian-flavoured markdown into a mistletoe `Document`.

    Registers `WikiLink`, `Frontmatter` and `TagLink` with mistletoe's
    module-level token lists, then builds the document. The registration is
    global and stays in effect after the call.

    Args:
        text: Markdown source, in any form `Document` accepts.

    Returns:
        The parsed `Document`.
    """
    registrations = (
        (span_token, WikiLink),
        (block_token, Frontmatter),
        (span_token, TagLink),
    )
    for module, token_cls in registrations:
        # add_token() inserts unconditionally, so repeated calls would keep
        # growing the token lists with duplicates. `_token_types` is a private
        # mistletoe name; if it is ever missing, this falls back to always
        # adding the token.
        if token_cls not in getattr(module, "_token_types", ()):
            module.add_token(token_cls)
    return Document(text)

In [None]:
# Parse the sample text and inspect the first block (the frontmatter).
# The indexing expression after `;` is evaluated and discarded.
o = get_obsidianmd_ast(text); o.children[0].children[2].children
fm = o.children[0]
# Displayed as the cell result: the Properties tokens of the frontmatter.
fm.children

[<__main__.Properties with 1 child key='title' children=[<mistletoe.span_token.RawText content='Product Mindset for RAG'>] at 0x127b62b50>,
 <__main__.Properties with 0 children key='tags' children=[]>,
 <__main__.Properties with 1 child key='collection' children=[<__main__.WikiLink with 1 child target='RAG' title=''>] at 0x1278a9610>]

In [None]:
#| notest
# text = 'Check this "[[Note|My Alias]]".'
# html = markdown(text, renderer=ObsidianHTMLRenderer)
# print(html)

In [None]:

# `lines` (the sample text without blank lines) is only displayed here; the
# render below reads the real note from disk instead.
lines = [line for line in text.splitlines() if line.strip()]; lines
with ObsidianHTMLRenderer() as renderer:
    # html = renderer.render(Document(lines))
    html = renderer.render(Document(doc_real.read_text()))

# Cell result is the tuple (None, html): print() returns None.
print(html), html

<div class="frontmatter">
<table>
<tr><th>title</th><td>Product Mindset for RAG</td></tr>
<tr><th>tags</th><td></td></tr>
<tr><th>collection</th><td><a href="pages/RAG">RAG</a></td></tr>
</table>
</div>
<h2>Introduction</h2>
<p>Doing correct things involves approaching the problem with the right mindset. Approaching RAG as a project with technical metrics like choosing vector embeddings, llm , chunk size often misses the point of building a RAG product in the first place, providing correct information to the user in shortest possible time.</p>
<p>To get things right we need to think of RAG as recommendation system and identify the ways to continuously and iteratively improve our system as we systematically collect and receive user feedback. Our main objective should be customer delight, retention and increased engagement.</p>
<p>So how do we get started. In a project mindset, most of the time people face issues with chicken and egg problem.  Without domain knowledge, without knowing an

(None,
 '<div class="frontmatter">\n<table>\n<tr><th>title</th><td>Product Mindset for RAG</td></tr>\n<tr><th>tags</th><td></td></tr>\n<tr><th>collection</th><td><a href="pages/RAG">RAG</a></td></tr>\n</table>\n</div>\n<h2>Introduction</h2>\n<p>Doing correct things involves approaching the problem with the right mindset. Approaching RAG as a project with technical metrics like choosing vector embeddings, llm , chunk size often misses the point of building a RAG product in the first place, providing correct information to the user in shortest possible time.</p>\n<p>To get things right we need to think of RAG as recommendation system and identify the ways to continuously and iteratively improve our system as we systematically collect and receive user feedback. Our main objective should be customer delight, retention and increased engagement.</p>\n<p>So how do we get started. In a project mindset, most of the time people face issues with chicken and egg problem.  Without domain knowledge,

In [None]:
#| notest
text = """
This is fun
Check this [[Note|My Alias]].
"""
# Parse the two-line sample; the cell displays the document's top-level blocks.
ast = get_obsidianmd_ast(text); ast.children

[<mistletoe.block_token.Paragraph with 5 children line_number=2>]

In [None]:
#| export

def print_ast(token, indent=0):
    """Print ``token`` and its descendants as an indented tree.

    One line is printed per token: the class name followed by a dict of the
    instance attributes whose names do not start with an underscore. Children
    (the ``children`` attribute, when present and non-empty) are printed
    recursively, indented two further spaces per level.

    Args:
        token: Root object to print; objects without ``__dict__`` show ``{}``.
        indent: Nesting depth of ``token``.
    """
    attributes = getattr(token, "__dict__", {})
    visible = dict(item for item in attributes.items() if not item[0].startswith("_"))
    prefix = "  " * indent
    print(f"{prefix}{token.__class__.__name__}: {visible}")

    # A missing, None or empty `children` means there is nothing to descend into.
    for child in getattr(token, "children", None) or ():
        print_ast(child, indent + 1)

In [None]:
text = """
---
title: Product Mindset for RAG
tags:
source: https://www.youtube.com/
collection:
  - "[[RAG]]"
---
This is fun #some
Check this [[Note|My Alias]].
My link [Alias](note)
2nd link ![Alias2](https://www.google.com/)

> This is a test
> my test
"""
# Parse the sample note (frontmatter, tag, wikilink, links, quote) and dump
# the resulting token tree.
doc = get_obsidianmd_ast(text)
print_ast(doc)

Document: {'footnotes': {}, 'line_number': 1}
  Frontmatter: {'content': 'title: Product Mindset for RAG\ntags:\nsource: https://www.youtube.com/\ncollection:\n  - "[[RAG]]"', 'data': {'title': 'Product Mindset for RAG', 'tags': None, 'source': 'https://www.youtube.com/', 'collection': ['[[RAG]]']}, 'line_number': 2}
    Properties: {'key': 'title'}
      RawText: {'content': 'Product Mindset for RAG'}
    Properties: {'key': 'tags'}
    Properties: {'key': 'source'}
      AnyLink: {'target': 'https://www.youtube.com/', 'title': ''}
        RawText: {'content': 'https://www.youtube.com/'}
    Properties: {'key': 'collection'}
      WikiLink: {'target': 'RAG', 'alias': 'RAG', 'title': '', 'label': None, 'dest_type': 'wikilink', 'title_delimiter': None}
        RawText: {'content': 'RAG'}
  Paragraph: {'line_number': 9}
    RawText: {'content': 'This is fun '}
    TagLink: {'target': '/tags/some', 'title': '', 'label': None, 'dest_type': 'taglink', 'title_delimiter': None}
      RawText:

In [None]:
#| notest
from IPython.core.display import display_html


# doc = Document(doc_real.read_text().splitlines())
# `html` stays None: the render call inside the `with` block is commented out,
# so only the Document's repr is printed.
html = None
with ObsidianAstRenderer() as renderer:
    doc = get_obsidianmd_ast(text); print(doc)
    # doc = get_obsidianmd_ast(text); print(doc.children[0].children)
    # html = renderer.render(doc)

print(html)

<mistletoe.block_token.Document with 3 children line_number=1>
None


In [None]:
# doc = Document(text.splitlines())
# html = None
# a = Document(text.splitlines()).children[0]; a.children[0]

In [None]:
#| hide
import nbdev; nbdev.nbdev_export()