Разбиение на чанки при помощи MarkdownHeaderTextSplitter и RecursiveCharacterTextSplitter.

Здесь мы учитываем не только заголовки Markdown, но и длину чанков.

Также добавляем необходимые метаданные: заголовок документа, ссылки на YouTube и ссылки на исходные страницы docs.moodle.org.

In [1]:
from pathlib import Path
import re
from langchain_text_splitters import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Filtering thresholds read by is_bad_chunk().
# A chunk whose stripped text is shorter than this many characters is discarded.
MIN_CHARS = 220
# A chunk is discarded when the characters belonging to URLs make up more than
# this fraction of its stripped text.
MAX_URL_RATIO = 0.35

In [None]:
# NOTE(review): both paths are absolute and machine-specific; adjust them (or
# derive them from a project root) before running on another machine.
# Directory scanned for the "*.md" documents that get chunked.
input_dir = Path("/Users/sergey/Desktop/Moodle_RAG/data/moodle_docs/clean_markdown")
# Root of the generated output; each source document gets its own sub-directory.
output_dir = Path("/Users/sergey/Desktop/Moodle_RAG/data/moodle_docs/chunks_md")
# Create the output root (and any missing parents); a pre-existing directory is fine.
output_dir.mkdir(parents=True, exist_ok=True)

In [4]:
# Pairs of (markdown header marker, metadata key). The key is the name under
# which the header text is later looked up in chunk metadata (e.g. "h2").
headers_to_split_on = [
    ("#", "h1"),
    ("##", "h2"),
    ("###", "h3"),
]

# First-stage splitter: cuts a document into sections at the headers above.
# strip_headers=False is documented by the library as keeping the header line
# inside the section text — confirm against the installed version.
md_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=headers_to_split_on,
    strip_headers=False,
)


In [5]:
# Second-stage splitter applied to the header sections to bound chunk length.
# chunk_size / chunk_overlap are presumably measured in characters (the
# library's default length function) — confirm if a custom one is ever passed.
# The separators run from coarse (blank line) to fine (empty string).
char_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1100,
    chunk_overlap=160,
    separators=["\n\n", "\n", ". ", " ", ""],
)

In [6]:
# Punctuation that frequently follows a URL in running text or markup but is
# not part of the URL itself.
_URL_TRAILING_JUNK = ".,;:!?>\"'"


def _collect_links(pattern: str, text: str) -> list:
    """Return sorted, de-duplicated matches of *pattern* in *text*.

    Trailing sentence punctuation is trimmed before de-duplication so that
    "…/Foo" and "…/Foo." are reported as the same link.
    """
    cleaned = {match.rstrip(_URL_TRAILING_JUNK) for match in re.findall(pattern, text)}
    return sorted(cleaned)


def extract_meta(text: str, file_path: Path):
    """Collect document-level metadata from a markdown document.

    Args:
        text: Full markdown text of the document.
        file_path: Path of the source file; its stem is the fallback title.

    Returns:
        A dict with three keys:
        ``doc_title`` – text of the first non-empty ``# `` heading, or the
        file stem when there is none;
        ``youtube_links`` – sorted unique youtube.com / youtu.be URLs;
        ``source_links`` – sorted unique docs.moodle.org URLs.
    """
    title = file_path.stem
    for line in text.splitlines():
        if not line.startswith("# "):
            continue
        heading = line[2:].strip()
        # An empty "# " line carries no title; keep looking instead of
        # returning an empty string.
        if heading:
            title = heading
            break

    youtube_links = _collect_links(
        r"https?://(?:www\.)?(?:youtube\.com|youtu\.be)/[^\s)\]]+", text
    )
    source_links = _collect_links(r"https?://docs\.moodle\.org/[^\s)\]]+", text)

    return {
        "doc_title": title,
        "youtube_links": youtube_links,
        "source_links": source_links,
    }

In [None]:
# Lower-cased "h2" section titles; is_bad_chunk() discards every chunk whose
# h2 metadata matches one of them. Presumably these are wiki navigation/footer
# sections rather than documentation content — verify against the source pages.
SKIP_H2 = {
    "sources",
    "media",
    "tools",
    "in other languages",
    "what links here",
    "related changes",
    "special pages",
    "privacy",
    "about moodle docs",
    "disclaimers",
}

def is_bad_chunk(text: str, meta: dict) -> bool:
    """Tell whether a chunk should be thrown away as noise.

    Args:
        text: Chunk content; surrounding whitespace is ignored.
        meta: Chunk metadata; only the optional ``"h2"`` entry is consulted.

    Returns:
        True when the chunk is empty, sits under an h2 section listed in
        ``SKIP_H2``, is shorter than ``MIN_CHARS``, has a URL share above
        ``MAX_URL_RATIO``, or contains fewer than 80 alphabetic characters.
    """
    body = text.strip()
    if not body:
        return True

    # Sections that are always skipped, matched case-insensitively.
    section = (meta.get("h2") or "").strip().lower()
    if section in SKIP_H2:
        return True

    # Very short chunks are usually noise.
    size = len(body)
    if size < MIN_CHARS:
        return True

    # Chunks that are almost nothing but links/images.
    # `size` is non-zero here because empty bodies returned above.
    url_length = sum(map(len, re.findall(r"https?://\S+", body)))
    if url_length / size > MAX_URL_RATIO:
        return True

    # Too little alphabetic text.
    letter_count = sum(1 for symbol in body if symbol.isalpha())
    return letter_count < 80


# Chunks shorter than this many characters are treated as "tails" and appended
# to the chunk before them instead of being written on their own.
MERGE_TAIL_BELOW = 320

# Pipeline, per markdown file: read -> split by headers -> split by length ->
# drop noise -> merge short tails -> write one file per chunk.
for md_file in sorted(input_dir.glob("*.md")):
    text = md_file.read_text(encoding="utf-8")
    doc_meta = extract_meta(text, md_file)

    header_docs = md_splitter.split_text(text)
    chunks = char_splitter.split_documents(header_docs)

    # Filter out noise chunks.
    good_chunks = [
        ch for ch in chunks if not is_bad_chunk(ch.page_content.strip(), ch.metadata)
    ]

    # Merge small tails into the previous chunk, but only when both carry the
    # same header metadata: gluing a tail onto a chunk from another section
    # would store its text under the wrong h1/h2/h3 values.
    merged = []
    for ch in good_chunks:
        content = ch.page_content.strip()
        prev = merged[-1] if merged else None
        if (
            prev is not None
            and len(content) < MERGE_TAIL_BELOW
            and prev.metadata == ch.metadata
        ):
            prev.page_content = prev.page_content.rstrip() + "\n\n" + content
        else:
            merged.append(ch)

    doc_out_dir = output_dir / md_file.stem
    doc_out_dir.mkdir(parents=True, exist_ok=True)

    # Remove chunk files left by an earlier run; otherwise a re-run that
    # produces fewer chunks leaves stale higher-numbered files behind.
    for stale in doc_out_dir.glob("chunk_*.md"):
        stale.unlink()

    for i, chunk in enumerate(merged):
        # Chunk-level metadata overrides document-level keys of the same name.
        meta = {**doc_meta, **chunk.metadata, "chunk_index": i}
        content = chunk.page_content.strip()

        # NOTE(review): the header between the "---" fences is the Python repr
        # of the dict, not YAML/JSON; kept as-is because downstream readers of
        # these files are not visible here.
        chunk_text = (
            f"---\n"
            f"{meta}\n"
            f"---\n\n"
            f"{content}\n"
        )
        (doc_out_dir / f"chunk_{i:04d}.md").write_text(chunk_text, encoding="utf-8")

print("Done")

Done
