### Document Loading

In [4]:
import os
from langchain_community.document_loaders import DirectoryLoader, TextLoader

directory_path = "./arch-wiki/html/en/"

if not os.path.exists(directory_path):
    print(f"Directory not found: {directory_path}")
else:
    loader = DirectoryLoader(
        path=directory_path,
        glob="**/*.html", # all html files and subdirectories
        loader_cls=TextLoader, # keeps html tags for document splitting
        show_progress=True
    )

docs = loader.load() # compare it to lazy load

100%|██████████████████████████████████████████████████████████████████████▉| 2491/2493 [00:00<00:00, 8595.14it/s]


In [5]:
len(docs)

2491

### Document Splitting

In [8]:
from langchain_text_splitters import RecursiveCharacterTextSplitter, HTMLHeaderTextSplitter

headers_to_split_on = [
    ("h1", "Header 1"),
    ("h2", "Header 2"),
    ("h3", "Header 3"),
]

# initilaize header splitter
html_header_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

# initilaize text splitter to ensure chunks fit in local llm
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
    separators=["\n\n", "\n", " ", ""]
)

final_chunks=[]

for doc in docs:
    # if html file, split by header, then split again into smaller chunks
    if doc.metadata.get("source", "").endswith(".html"):
        header_splits = html_header_splitter.split_text(doc.page_content)

        # IMPORTANT: Manually carry over the 'source' from the original file
        for split in header_splits:
            split.metadata["source"] = doc.metadata.get("source")

        # DEBUG
        print(f"Found {len(header_splits)} header sections.")
        if len(header_splits) > 0:
            print(f"Metadata of first section: {header_splits[0].metadata}")

        
        final_chunks.extend(text_splitter.split_documents(header_splits))
    else:
        # non-html, use recursive splitter directly
        final_chunks.extend(text_splitter.split_documents([doc]))

Found 63 header sections.
Metadata of first section: {'source': 'arch-wiki/html/en/WireGuard.html'}
Found 15 header sections.
Metadata of first section: {'source': 'arch-wiki/html/en/Sugar.html'}
Found 26 header sections.
Metadata of first section: {'source': 'arch-wiki/html/en/Character_encoding.html'}
Found 13 header sections.
Metadata of first section: {'source': 'arch-wiki/html/en/Category:Hardware_detection_and_troubleshooting.html'}
Found 27 header sections.
Metadata of first section: {'source': 'arch-wiki/html/en/Intel_Quartus_Prime.html'}
Found 46 header sections.
Metadata of first section: {'source': 'arch-wiki/html/en/Dhcpcd.html'}
Found 27 header sections.
Metadata of first section: {'source': 'arch-wiki/html/en/Guitarix.html'}
Found 25 header sections.
Metadata of first section: {'source': 'arch-wiki/html/en/Bluetooth_mouse.html'}
Found 25 header sections.
Metadata of first section: {'source': 'arch-wiki/html/en/Command-line_shell.html'}
Found 17 header sections.
Metadata o

In [10]:
len(final_chunks)

75185