In [211]:
from pathlib import Path

VERSIONS_DIR = "../data/platform-docs-versions/"

from langchain_community.document_loaders import UnstructuredMarkdownLoader, DirectoryLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

import os

def list_folders(directory):
    # Ensure the directory exists
    if not os.path.isdir(directory):
        print(f"Error: '{directory}' is not a directory.")
        return

    # List all items in the directory
    items = os.listdir(directory)

    # Filter out only the directories
    return [item for item in items if os.path.isdir(os.path.join(directory, item))]


In [53]:
import re

def replace_alternate_headings(text):
    lines = text.split('\n')
    new_lines = []
    i = 0
    while i < len(lines):
        if i + 1 < len(lines) and lines[i].strip():
            # '=' underline headings
            if re.match(r'^=+$', lines[i + 1]):
                new_lines.append(f'# {lines[i]}')
                i += 2
                continue
            # '-' underline headings
            elif re.match(r'^-+$', lines[i + 1]):
                new_lines.append(f'## {lines[i]}')
                i += 2
                continue
        new_lines.append(lines[i])
        i += 1
    return '\n'.join(new_lines)

markdown_text = """This is a document

Heading 1
=========

Some text under heading 1.

Heading 2
---------

Some text under heading 2.

Another Heading | With Pipe
----------------------------

# heading

## heading

### heading

| Table |
|-------|
| Value |
"""

new_markdown = replace_alternate_headings(markdown_text)
print(new_markdown)

This is a document

# Heading 1

Some text under heading 1.

## Heading 2

Some text under heading 2.

## Another Heading | With Pipe

# heading

## heading

### heading

| Table |
|-------|
| Value |



In [83]:
"".split('\n')

['']

In [166]:
def find_substring_line(content, substring):
    all_lines = content.split('\n')

    substring_lines = substring.split('\n')
    for line_num, line in enumerate(all_lines):
        if substring_lines[0].strip(' ') == line.strip(' '):
            line_end = line_num + len(substring_lines)
            return line_num, line_end
    return None, None

In [167]:
from langchain.text_splitter import MarkdownHeaderTextSplitter

def process_document(document, merge_h3=False):
    if merge_h3:
        headers_to_split = [("#", "H1"),("##", "H2")]
    
    headers_to_split = [("#", "H1"),("##", "H2"), ('###', "H3")]
    new_markdown = replace_alternate_headings(document.page_content)

    markdown_splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=headers_to_split, strip_headers=False
    )
    md_header_splits = markdown_splitter.split_text(new_markdown)
    filtered_header_splits = []
    threshold = 6000 # length of allowed threshold

    metadata = item = None

    for d in md_header_splits:
        # skip if empty
        if len(d.page_content) <= 50:
            continue

        # add to previous if short and have exactly the same metadata
        if metadata == d.metadata.items() and item is not None and len(item) + d.page_content < threshold:
            item = item + '\n' + d.page_content
            filtered_header_splits[-1][0] = item
            continue
        
        # create a new split
        source = "Source document: " + document.metadata['source']
        metadata = d.metadata.items()
        hierarchy = "Paragraph location: " + '\n'
        for k, v in metadata:
            if k.startswith('H'):
                hierarchy = hierarchy + f"\t{v}\n"

        content = "Content: \n" + d.page_content  + '\n'
        
        item = f"{source}\n{hierarchy}\n{content}" 

        if new_markdown is not None and d.page_content is not None:
            line_start, line_end = find_substring_line(new_markdown, d.page_content)
            metadata=(line_start, line_end)
        
        # if line_start is not None:
        #     print(f"The substring '{d.page_content[:100]}' starts at line {line_start}, end at line {line_end}.")
        # else:
        #     print(f"The substring '{d.page_content[:100]}' was not found in the file.")       

        filtered_header_splits.append((item, metadata))


    return filtered_header_splits

In [168]:
docs = TextLoader("data/pdf_outputs/Apple_API-Ad-Repository/Ad Repository API.md").load()
docs

[Document(page_content='## Ad Repository API\n\nJanuary 2024\n\n###### Contents\n\n* Getting Started................\n* 3 Versioning\n* 3 Usability\n* 3 Get a list of app and developer names\n* 4 Request example\n* 4 Request parameters\n* 4 Response example\n* 5 Response properties\n* 6 Get a list of countries or regions\n* 7 Response example\n* 7 Response properties\n* 8 Get a list of ads\n* 9 Request example\n* 9 Request parameters\n* 9 Response example\n* 10 Response properties\n* 11 Get ad variations\n* 14 Request parameters\n* 14 Response example\n* 15 Response properties\n* 17 Get advertising-restrictions\n* 20 Request example\n* 20 Request parameters\n* 20 Response example\n* 21 Response properties\n* 22 Response properties\n* 22 Changelog\n\n## Getting Started\n\nUse the Ad Repository API to look up Apple-delivered advertising on the App Store in select European countries and regions as well as information about qualifying advertising restrictions. You can manage API calls prog

In [209]:
import json

def save_splits(folder_name, splits):

    data = {}
    for idx, split in enumerate(splits):
        data[idx] = {
            'content': split[0],
            'line_start': split[1][0],
            'line_end': split[1][1]
        }

    if not os.path.exists(folder_name):
        os.makedirs(folder_name)

    with open(f"{folder_name}", "w") as f:
        json.dump(data, f, indent=4)
        print(f"Dumping in {folder_name}...")


In [189]:
splits = process_document(docs[0])
for split in splits:
    print(split)
    print('---------------------------')

('Source document: data/pdf_outputs/Apple_API-Ad-Repository/Ad Repository API.md\nParagraph location: \n\tAd Repository API\n\nContent: \n## Ad Repository API  \nJanuary 2024  \n###### Contents  \n* Getting Started................\n* 3 Versioning\n* 3 Usability\n* 3 Get a list of app and developer names\n* 4 Request example\n* 4 Request parameters\n* 4 Response example\n* 5 Response properties\n* 6 Get a list of countries or regions\n* 7 Response example\n* 7 Response properties\n* 8 Get a list of ads\n* 9 Request example\n* 9 Request parameters\n* 9 Response example\n* 10 Response properties\n* 11 Get ad variations\n* 14 Request parameters\n* 14 Response example\n* 15 Response properties\n* 17 Get advertising-restrictions\n* 20 Request example\n* 20 Request parameters\n* 20 Response example\n* 21 Response properties\n* 22 Response properties\n* 22 Changelog\n', (0, 30))
---------------------------
('Source document: data/pdf_outputs/Apple_API-Ad-Repository/Ad Repository API.md\nParagr

In [190]:
save_splits('data', 'test', splits) # test would be original folder name

In [213]:

folders = list_folders(VERSIONS_DIR)

for folder in folders:
    full_path = VERSIONS_DIR + folder
    print(f"Looking in folder {full_path}")
    docs = DirectoryLoader(Path(VERSIONS_DIR + folder), glob="[!.]*.md", loader_cls=TextLoader).load()

    print(f"Found {len(docs)}")
    for doc in docs:
        name = doc.metadata['source'].replace('data', '_jsons').replace('.md', '.json')

        splits = process_document(doc)

        save_splits(name, splits) # test would be original folder name

Looking in folder ../data/platform-docs-versions/Pinterest_DSA-Home-Page
Found 1
Dumping in ../_jsons/platform-docs-versions/Pinterest_DSA-Home-Page/DSA-Home-Page.md.json...
Looking in folder ../data/platform-docs-versions/Snapchat_Transparency-World
Found 1
Dumping in ../_jsons/platform-docs-versions/Snapchat_Transparency-World/Transparency Report.md.json...
Looking in folder ../data/platform-docs-versions/Google_Custom-Search-JSON-API
Found 1
Dumping in ../_jsons/platform-docs-versions/Google_Custom-Search-JSON-API/Custom Search JSON API.md.json...
Looking in folder ../data/platform-docs-versions/Booking.com_AD-API
Found 1
Dumping in ../_jsons/platform-docs-versions/Booking.com_AD-API/Ads Repository API Documentation.md.json...
Looking in folder ../data/platform-docs-versions/Facebook_Instagram-API
Found 2
Dumping in ../_jsons/platform-docs-versions/Facebook_Instagram-API/API Reference.md.json...
Dumping in ../_jsons/platform-docs-versions/Facebook_Instagram-API/Instagram API Documen