# 02 – Chunking Experiments

This notebook is for trying different chunking strategies.

- Import `FixedWindowChunker`, `MarkdownHeadingChunker`, and `PageMarkerChunker`.
- Compare chunk length distributions and metadata.



In [None]:
import sys
from pathlib import Path

# Add src directory to path so we can import modules
sys.path.insert(0, str(Path("../src").resolve()))

from loaders import get_loader_for_path
from cleaning import clean_text
from chunking.fixed_window import FixedWindowChunker

# Root folder that is searched (recursively) for a sample document.
DATA_DIR = Path("../data/raw")

# File extensions considered when picking the sample (compared lower-cased).
supported_suffixes = {".txt", ".md", ".pdf", ".json", ".csv"}

# Sorted so the same file is picked on every run for a given directory tree.
files = sorted(DATA_DIR.rglob("*"))

# Keep only regular files whose extension is in the supported set.
candidates = list(
    filter(
        lambda p: p.is_file() and p.suffix.lower() in supported_suffixes,
        files,
    )
)

# Fail early with a clear message instead of an IndexError on candidates[0].
if len(candidates) == 0:
    raise RuntimeError("No supported files found.")

# The first candidate in sorted order is the document used by the rest of the notebook.
sample_path = candidates[0]
print(f"Using sample file: {sample_path}")

# Only the first item returned by the loader is used here.
loader = get_loader_for_path(sample_path)
doc = loader.load(sample_path)[0]

# Replace the raw text with its cleaned version; chunking below works on this cleaned text.
doc.text = clean_text(doc.text)

print(f"Loaded Document: {doc.id}")
print(f"Length of cleaned text: {len(doc.text)} chars")



In [None]:
# Try different fixed-window configs on the sample document loaded above.
# Each entry holds the `size` and `overlap` arguments passed to FixedWindowChunker
# (units are whatever FixedWindowChunker defines — presumably characters; TODO confirm).
configs = [
    {"size": 300, "overlap": 50},
    {"size": 600, "overlap": 100},
    {"size": 1000, "overlap": 200},
]

for cfg in configs:
    chunker = FixedWindowChunker(size=cfg["size"], overlap=cfg["overlap"])
    chunks = chunker.chunk(doc)
    # Chunk lengths are measured in characters of each chunk's text.
    lengths = [len(c.text) for c in chunks]

    print("\n====================================================")
    print(f"Chunk config: size={cfg['size']} overlap={cfg['overlap']}")
    print(f"Number of chunks: {len(chunks)}")

    # An empty result (e.g. the cleaned text is empty) would make min()/max()
    # raise ValueError and chunks[0] raise IndexError; report it and move on
    # so the remaining configs are still compared.
    if len(chunks) == 0:
        print("No chunks produced; skipping length statistics and preview.")
        continue

    print(
        f"Min length: {min(lengths)}  Max length: {max(lengths)}  "
        f"Avg: {sum(lengths)/len(lengths):.1f}"
    )
    print("First chunk preview:")
    # Cap the preview at 300 characters to keep the cell output short.
    print(chunks[0].text[:300])

