### Mini: Document processing (PDF) — 5‑minute intro

Goal: show a minimal PDF → text → clean → chunk → save flow so you can plug it into RAG later.

Steps we’ll do now:
- Load a sample PDF (`data/sample.pdf`)
- Extract text with a lightweight library (`pypdf`)
- Clean whitespace and normalize
- Chunk into small slices with overlap
- Save chunks to JSONL for later indexing

Keep it simple: the goal is not perfect parsing, just a quick, practical baseline.


In [None]:
# Minimal deps; run once per environment
import sys, subprocess

def pip_install(package: str, import_name: "str | None" = None):
    """Install *package* with pip unless it can already be imported.

    Args:
        package: Distribution name handed to ``pip install`` (e.g. ``"pypdf"``).
        import_name: Module name used for the availability check. Defaults to
            *package* with hyphens replaced by underscores, because a
            distribution name such as ``"langchain-community"`` is not a valid
            module name and would otherwise never be found, forcing a
            reinstall on every run.

    Raises:
        subprocess.CalledProcessError: If the pip subprocess exits non-zero.
    """
    import importlib

    module_name = import_name or package.replace("-", "_")
    try:
        importlib.import_module(module_name)
    except ImportError:
        # Use the interpreter running this code so the package lands in the
        # same environment, not whatever `pip` happens to be on PATH.
        subprocess.check_call([sys.executable, "-m", "pip", "install", package])
        # Let the import system notice the freshly installed files.
        importlib.invalidate_caches()

# pypdf is the only third-party package the plain-Python cells below import.
for _dist in ("pypdf",):
    pip_install(_dist)

print("Ready ✔")


In [None]:
from pathlib import Path
from pypdf import PdfReader

# Location of the sample PDF read by this cell.
PDF_PATH = Path("data/sample.pdf")
# Raise a real exception instead of `assert`, which is skipped under `python -O`.
if not PDF_PATH.exists():
    raise FileNotFoundError(f"Missing file: {PDF_PATH}")

reader = PdfReader(str(PDF_PATH))
num_pages = len(reader.pages)

# One string per page; `or ""` substitutes an empty string when extraction
# yields a falsy result, so the join below never sees a non-string.
pages_text = [page.extract_text() or "" for page in reader.pages]

# Pages are separated by a blank line in the combined text.
raw_text = "\n\n".join(pages_text)
print({"pages": num_pages, "chars": len(raw_text)})
raw_text[:500]


In [None]:
import re

def basic_clean(text: str) -> str:
    """Return *text* with control characters removed and whitespace collapsed.

    Non-whitespace control characters are dropped first; afterwards every run
    of whitespace becomes a single space and both ends are trimmed. Running the
    steps in this order means a control character sitting between two spaces
    cannot leave a double space behind.
    """
    # strip nulls/control chars that are NOT whitespace; \t \n \v \f \r and
    # \x1C-\x1F are left for the next step so they still separate words
    text = re.sub(r"[\x00-\x08\x0E-\x1B\x7F]", "", text)
    # collapse whitespace
    text = re.sub(r"\s+", " ", text)
    # trim
    return text.strip()

# Normalize the extracted text and report its length before and after.
clean_text = basic_clean(raw_text)
size_report = {"chars_before": len(raw_text), "chars_after": len(clean_text)}
print(size_report)
clean_text[:500]


In [None]:
from typing import List, Dict

def chunk_text(text: str, chunk_size: int = 500, overlap: int = 50) -> List[str]:
    """Split *text* into fixed-size character windows that overlap.

    Args:
        text: String to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters each chunk shares with the previous one;
            must satisfy ``0 <= overlap < chunk_size``.

    Returns:
        The chunks in order. Empty *text* yields ``[]``; the final chunk may be
        shorter than *chunk_size*.

    Raises:
        ValueError: If *chunk_size* is not positive or *overlap* is outside
            ``[0, chunk_size)``. Without this check an overlap at least as
            large as the chunk size would never advance and loop forever.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must be in [0, chunk_size); got overlap={overlap}, "
            f"chunk_size={chunk_size}"
        )

    chunks: List[str] = []
    n = len(text)
    step = chunk_size - overlap  # distance between consecutive chunk starts
    start = 0
    while start < n:
        end = min(start + chunk_size, n)
        chunks.append(text[start:end])
        if end == n:
            # The tail is already covered; stopping here avoids emitting a
            # trailing chunk that only repeats the overlap.
            break
        start += step
    return chunks

# 700-character windows, each sharing 100 characters with the previous one.
chunks = chunk_text(clean_text, chunk_size=700, overlap=100)
first_len = len(chunks[0]) if chunks else 0
print({"num_chunks": len(chunks), "first_len": first_len})
chunks[:2]


In [None]:
import json
from pathlib import Path

# Destination file: one JSON object per line (JSONL).
OUT_PATH = Path("data/sample_chunks.jsonl")

# quick preview: first three chunks, each truncated to 300 characters
for idx, piece in enumerate(chunks[:3]):
    print(f"--- chunk {idx} ({len(piece)} chars) ---\n{piece[:300]}\n")

# ensure_ascii=False writes non-ASCII characters as-is into the UTF-8 file
with OUT_PATH.open("w", encoding="utf-8") as out_file:
    out_file.writelines(
        json.dumps({"id": f"sample-{idx}", "text": piece}, ensure_ascii=False) + "\n"
        for idx, piece in enumerate(chunks)
    )

print(f"Saved {len(chunks)} chunks -> {OUT_PATH}")


In [None]:
# LangChain loader setup
import sys, subprocess

def pip_install(package: str, import_name: "str | None" = None):
    """Install *package* with pip unless it can already be imported.

    Args:
        package: Distribution name handed to ``pip install``
            (e.g. ``"langchain-community"``).
        import_name: Module name used for the availability check. Defaults to
            *package* with hyphens replaced by underscores, because a
            hyphenated distribution name is not a valid module name and would
            otherwise never be found, forcing a reinstall on every run.

    Raises:
        subprocess.CalledProcessError: If the pip subprocess exits non-zero.
    """
    import importlib

    module_name = import_name or package.replace("-", "_")
    try:
        importlib.import_module(module_name)
    except ImportError:
        # Use the interpreter running this code so the package lands in the
        # same environment, not whatever `pip` happens to be on PATH.
        subprocess.check_call([sys.executable, "-m", "pip", "install", package])
        # Let the import system notice the freshly installed files.
        importlib.invalidate_caches()

# LangChain itself plus the community package that the PDF loader is imported from
for _dist in ("langchain", "langchain-community"):
    pip_install(_dist)

print("LangChain ready ✔")


In [None]:
from datetime import datetime
from pathlib import Path
from langchain_community.document_loaders import PyPDFLoader

# Imported here because this cell's import line only brings in `datetime`.
from datetime import timezone

PDF_PATH = Path("data/sample.pdf")
loader = PyPDFLoader(str(PDF_PATH))
docs = loader.load()  # one Document per page

# One timestamp for the whole run, so every page of this ingestion carries the
# same value. datetime.utcnow() is deprecated since Python 3.12; the timezone-
# aware clock is used instead and rendered in the same naive-ISO + "Z" form.
ingested_at = datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z"

# add simple metadata
for i, d in enumerate(docs):
    d.metadata.update({
        "doc_id": "sample_pdf",
        "source": str(PDF_PATH),
        # keep the page number already in the metadata; fall back to the index
        "page": d.metadata.get("page", i),
        "type": "pdf",
        "module": "doc_processing_min",
        "ingested_at": ingested_at,
    })

len(docs), docs[0].metadata, docs[0].page_content[:300]


In [None]:
# Peek at the first two pages: size, a subset of the metadata, then a one-line preview.
preview_keys = ["doc_id", "page", "type", "module"]
for i, d in enumerate(docs[:2]):
    meta_subset = {key: d.metadata[key] for key in preview_keys}
    print({"i": i, "chars": len(d.page_content), "meta": meta_subset})
    flattened = d.page_content[:300].replace("\n", " ")
    print(flattened + "\n---\n")


In [None]:
# Recursive chunking of the per-page documents, ready for the next step.
# NOTE(review): newer LangChain releases may expose this class from
# `langchain_text_splitters` instead — confirm against the installed version.
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Same 700/100 window and overlap sizes as the plain-Python chunker earlier in this notebook.
splitter_options = {"chunk_size": 700, "chunk_overlap": 100, "add_start_index": True}
splitter = RecursiveCharacterTextSplitter(**splitter_options)

chunked_docs = splitter.split_documents(docs)
summary = {"pages": len(docs), "chunks": len(chunked_docs)}
print(summary)
chunked_docs[:2]
