In [1]:
from pathlib import Path

# Data directories, given relative to the notebook's working directory.
# NOTE(review): SNAPSHOTS_DIR is not referenced by any cell visible here — confirm it is still needed.
SNAPSHOTS_DIR = Path("data/platform-docs-snapshots/")
# Root folder that the Markdown documents are loaded from below.
VERSIONS_DIR = Path("data/platform-docs-versions-english/")

from langchain_community.document_loaders import UnstructuredMarkdownLoader, DirectoryLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Load the Markdown files that sit one directory level below VERSIONS_DIR,
# skipping any directory or file whose name starts with a dot, and drop the
# top-level README if the loader happens to return it.
readme_path = VERSIONS_DIR / "README.md"
versions_loader = DirectoryLoader(VERSIONS_DIR, glob="[!.]*/[!.]*.md", loader_cls=TextLoader)
docs = [
    loaded
    for loaded in versions_loader.load()
    if Path(loaded.metadata['source']) != readme_path
]

In [5]:
# Keep the first loaded document as a sample to inspect in the following cells.
doc = docs[0]

In [7]:
# Show the sample document serialized as a JSON string (the output starts with its "page_content" field).
doc.json()

'{"page_content": "Pinterest has a longstanding commitment to creating a safe and positive place online. \\u00a0We continually invest in our policies, products and partnerships to support the safety and wellbeing of our community because it\\u2019s the right thing for the people on our platform, and we view the Digital Services Act (DSA) as a continuation of our commitment to making Pinterest an inspiring and welcoming place for everyone.\\n\\nThe DSA is a uniform legal framework for providers of digital services in the European Union (EU). \\u00a0In order to create a safe, predictable, and trustworthy online environment, the DSA aims to create a single set of rules across the EU\\u2019s Member States governing the transparency and accountability obligations of online platforms.\\n\\nIf there is content on Pinterest you think may be illegal under EU law, you can report it by using our in-product reporting features or by using the following form: https://pinterest.com/about/dsa/.\\n\\nI

In [11]:
import re

def replace_alternate_headings(text):
    """Rewrite setext-style (underlined) Markdown headings as ATX ('#') headings.

    A non-blank line followed by a line made only of '=' becomes ``# <line>``;
    one followed by a line made only of '-' becomes ``## <line>``. The underline
    itself is dropped and every other line is passed through unchanged.

    The underline may be indented by up to three spaces and may carry trailing
    whitespace, which includes the carriage return left on each line when
    CRLF text is split on newlines. Lines inside fenced code blocks (opened by
    three or more backticks or tildes) are never converted, so code samples
    containing '---' or '===' survive intact.

    Args:
        text: Markdown source as a single string.

    Returns:
        The converted Markdown; lines are re-joined with a newline character.
    """
    fence_pattern = r'^ {0,3}(`{3,}|~{3,})'
    h1_underline = r'^ {0,3}=+\s*$'
    h2_underline = r'^ {0,3}-+\s*$'

    lines = text.split('\n')
    new_lines = []
    # (fence character, run length) of the code fence we are currently inside, if any.
    open_fence = None
    i = 0
    while i < len(lines):
        line = lines[i]
        fence = re.match(fence_pattern, line)

        if open_fence is not None:
            # Only a bare fence of the same character, at least as long as the
            # opener, closes the block; everything else is code and is copied verbatim.
            if (
                fence
                and fence.group(1)[0] == open_fence[0]
                and len(fence.group(1)) >= open_fence[1]
                and not line[fence.end():].strip()
            ):
                open_fence = None
            new_lines.append(line)
            i += 1
            continue

        if fence:
            # Opening fence: never treat it as heading text, even if the next
            # line looks like an underline.
            open_fence = (fence.group(1)[0], len(fence.group(1)))
            new_lines.append(line)
            i += 1
            continue

        if i + 1 < len(lines) and line.strip():
            underline = lines[i + 1]
            # '=' underline headings
            if re.match(h1_underline, underline):
                new_lines.append(f'# {line}')
                i += 2  # skip the underline as well
                continue
            # '-' underline headings (table separators such as '|---|' do not match)
            if re.match(h2_underline, underline):
                new_lines.append(f'## {line}')
                i += 2
                continue

        new_lines.append(line)
        i += 1
    return '\n'.join(new_lines)

# Demo fixture for replace_alternate_headings: it contains '=' and '-' underlined
# headings, an underlined heading whose text includes a pipe, '#' headings that
# are already in the target form, and a table whose '|-------|' separator row
# must not be mistaken for a heading underline.
markdown_text = """
This is a document

Heading 1
=========

Some text under heading 1.

Heading 2
---------

Some text under heading 2.

Another Heading | With Pipe
----------------------------

# heading

## heading

### heading

| Table |
|-------|
| Value |
"""

# Convert the fixture and print it so the rewritten headings can be checked by eye;
# new_markdown is reused by the header-splitting cell further down.
new_markdown = replace_alternate_headings(markdown_text)
print(new_markdown)


This is a document

# Heading 1

Some text under heading 1.

## Heading 2

Some text under heading 2.

## Another Heading | With Pipe

# heading

## heading

### heading

| Table |
|-------|
| Value |



In [22]:
from langchain.text_splitter import MarkdownHeaderTextSplitter

# Heading markers to split on, each paired with the metadata key its text is stored under.
headers_to_split = [("#", "H1"), ("##", "H2"), ("###", "H3")]

# strip_headers=False keeps the heading line inside each chunk's page_content.
markdown_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=headers_to_split, strip_headers=False
)
md_header_splits = markdown_splitter.split_text(new_markdown)

# The splits that actually carry text, collected while each one is printed.
filtered_header_splits = []
for d in md_header_splits:
    if not d.page_content:
        continue
    filtered_header_splits.append(d)

    # TODO: prefix each chunk with its source document once real documents
    # (rather than the demo string) are being split.
    # Heading trail for this chunk, built from the H1/H2/H3 metadata entries.
    hierarchy = "Source location: " + "".join(
        f"{k}: {v}; " for k, v in d.metadata.items() if k.startswith('H')
    )
    content = "Content: \n" + d.page_content + '\n'

    # Deliberately not named `all`: that would shadow the builtin for every later cell.
    chunk_text = hierarchy + "\n" + content
    print(chunk_text)

len(md_header_splits)

Source location: 
Content: 
This is a document

Source location: H1: Heading 1; 
Content: 
# Heading 1  
Some text under heading 1.

Source location: H1: Heading 1; H2: Heading 2; 
Content: 
## Heading 2  
Some text under heading 2.

Source location: H1: Heading 1; H2: Another Heading | With Pipe; 
Content: 
## Another Heading | With Pipe

Source location: H1: heading; H2: heading; H3: heading; 
Content: 
# heading  
## heading  
### heading  
| Table |
|-------|
| Value |



5