In [None]:
"""AWS Documentation → Chroma Ingestion

Interactive notebook for ingesting AWS PDF documentation into a Chroma vector store using Ollama embeddings, batching, metadata enrichment, and guarded timeouts around vector-store writes.
"""


In [None]:
import re
import signal
import uuid
from contextlib import contextmanager
from dataclasses import dataclass, field
from pathlib import Path
from typing import Iterable, Iterator

import httpx
import ollama
import pandas as pd
from langchain_chroma import Chroma
from langchain_docling import DoclingLoader
from langchain_docling.loader import ExportType
from langchain_ollama.embeddings import OllamaEmbeddings
from langchain_text_splitters import (
    MarkdownHeaderTextSplitter,
    RecursiveCharacterTextSplitter,
)


In [None]:
@dataclass(frozen=True)
class Config:
    """Immutable settings for one ingestion run.

    File locations are derived from ``data_root``; the remaining fields tune
    embedding, chunking, batching and timeouts used by the rest of the
    notebook.

    Raises:
        ValueError: If ``batch_size`` or ``add_timeout_seconds`` is below 1.
    """

    # Directory containing the input CSV and the Chroma persistence folder.
    data_root: Path = Path(".")
    # File name of the CSV inventory, resolved relative to ``data_root``.
    csv_filename: str = "AWSDocs.csv"
    # Folder name for the persisted Chroma store, relative to ``data_root``.
    chroma_dirname: str = "chroma_db_AWSDocs"
    # Name of the Chroma collection that receives the chunks.
    collection_name: str = "AWSDocs"
    # Model name handed to ``OllamaEmbeddings``.
    embedding_model: str = "nomic-embed-text"
    # Size/overlap passed to ``RecursiveCharacterTextSplitter``.
    chunk_size: int = 1000
    chunk_overlap: int = 200
    # Number of chunks written to the vector store per call.
    batch_size: int = 50
    # Wall-clock budget, in seconds, for a single vector-store write.
    add_timeout_seconds: int = 60
    # HTTP timeout for the Ollama client. Built per instance through a factory
    # rather than shared as one class-level default object.
    request_timeout: httpx.Timeout = field(
        default_factory=lambda: httpx.Timeout(30.0, connect=5.0)
    )
    # (markdown prefix, metadata key) pairs given to ``MarkdownHeaderTextSplitter``.
    header_splits: tuple[tuple[str, str], ...] = (
        ("#", "Header 1"),
        ("##", "Header 2"),
        ("###", "Header 3"),
    )

    def __post_init__(self) -> None:
        # Fail fast on values that would otherwise misbehave quietly later:
        # ``batch_size`` is used as a ``range`` step when batching (a negative
        # step yields no batches at all), and ``add_timeout_seconds`` is given
        # to ``signal.alarm``, where 0 cancels the alarm instead of arming it.
        for name in ("batch_size", "add_timeout_seconds"):
            value = getattr(self, name)
            if value < 1:
                raise ValueError(f"{name} must be >= 1, got {value!r}")

    @property
    def csv_path(self) -> Path:
        """Full path of the input CSV."""
        return self.data_root / self.csv_filename

    @property
    def chroma_path(self) -> Path:
        """Full path of the Chroma persistence directory."""
        return self.data_root / self.chroma_dirname


config = Config()


Loaded 46 rows from AWSDocs.csv


In [None]:
def load_documents_frame(config: Config) -> pd.DataFrame:
    """Read the CSV at ``config.csv_path`` into a DataFrame.

    Prints how many rows were loaded before returning the frame.

    Raises:
        FileNotFoundError: If no file exists at ``config.csv_path``.
    """
    if config.csv_path.exists():
        frame = pd.read_csv(config.csv_path)
        print(f"Loaded {len(frame)} rows from {config.csv_path}")
        return frame

    raise FileNotFoundError(f"Input CSV not found at {config.csv_path}")


df = load_documents_frame(config)


2025-11-12 22:09:23,790 - INFO - Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.


In [None]:
# Embedding model. The HTTP timeout is supplied through the public
# ``client_kwargs`` option instead of overwriting the private ``_client``
# attribute after construction, so the wrapper builds its own clients with
# the timeout already applied.
# NOTE(review): ``client_kwargs`` is forwarded to the underlying Ollama
# client(s) per the langchain-ollama docs — confirm for the installed version.
embeddings = OllamaEmbeddings(
    model=config.embedding_model,
    client_kwargs={"timeout": config.request_timeout},
)

# Persistent Chroma collection that embeds documents with the model above.
vector_store = Chroma(
    collection_name=config.collection_name,
    persist_directory=str(config.chroma_path),
    embedding_function=embeddings,
)

# Two-stage splitting: first by markdown headers (header text is recorded in
# chunk metadata under the configured keys), then by character budget.
markdown_splitter = MarkdownHeaderTextSplitter(list(config.header_splits))
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=config.chunk_size,
    chunk_overlap=config.chunk_overlap,
)


In [None]:
@contextmanager
def time_limit(seconds: float, timeout_message: str):
    """Abort the wrapped block with ``TimeoutError`` after ``seconds``.

    A temporary ``SIGALRM`` handler is installed and a real-time interval
    timer is armed for the duration of the ``with`` body; on exit the timer
    is cancelled and the previous handler is put back.

    NOTE: relies on ``SIGALRM``, so per the Python ``signal`` documentation it
    is only usable on Unix and from the main thread.

    Args:
        seconds: Positive time budget in seconds; fractional values work.
        timeout_message: Message carried by the raised ``TimeoutError``.

    Raises:
        ValueError: If ``seconds`` is not greater than zero.
        TimeoutError: Inside the ``with`` body once the budget is exhausted.
    """
    if seconds <= 0:
        # A zero delay disarms the timer instead of firing it, which would
        # silently turn the guard into a no-op.
        raise ValueError(f"seconds must be positive, got {seconds!r}")

    def _raise_timeout(_signum, _frame):
        raise TimeoutError(timeout_message)

    original_handler = signal.signal(signal.SIGALRM, _raise_timeout)
    try:
        # Armed inside the ``try`` so the previous handler is restored even
        # if arming the timer itself raises.
        signal.setitimer(signal.ITIMER_REAL, seconds)
        yield
    finally:
        signal.setitimer(signal.ITIMER_REAL, 0)
        signal.signal(signal.SIGALRM, original_handler)


def fetch_markdown(url: str) -> str:
    """Load the document at ``url`` with Docling and return it as Markdown.

    Only the first document returned by the loader is used.

    Raises:
        ValueError: If the loader returns nothing for ``url``.
    """
    documents = DoclingLoader(
        file_path=url,
        export_type=ExportType.MARKDOWN,
    ).load()

    if documents:
        first_document = documents[0]
        return first_document.page_content

    raise ValueError(f"No content returned for {url}")


def should_skip_chunk(chunk) -> tuple[bool, str]:
    """Decide whether ``chunk`` should be excluded from ingestion.

    Returns:
        ``(True, reason)`` when the chunk's header metadata mentions a table
        of contents, or when its text consists solely of pipes, dashes and
        whitespace; ``(False, "")`` otherwise.
    """
    header_values = [
        chunk.metadata.get(key, "")
        for key in ("Header 1", "Header 2", "Header 3")
    ]
    # Empty headers are dropped before joining; the rest are compared
    # case-insensitively as one space-separated string.
    lowered_headers = " ".join(value.lower() for value in header_values if value)
    if "table of contents" in lowered_headers:
        return True, "Table of Contents"

    body = chunk.page_content.strip()
    is_table_residue = bool(body) and re.fullmatch(r"[|\-\s]+", body) is not None
    if is_table_residue:
        return True, "Markdown table fragment"

    return False, ""


def split_markdown(markdown_content: str, metadata: dict) -> list:
    """Split markdown into chunks tagged with ``metadata``.

    The text is first split on markdown headers, each resulting section has
    ``metadata`` merged into its own metadata, and the sections are then
    split again by the character-based splitter. Chunks rejected by
    ``should_skip_chunk`` are reported on stdout and left out of the result.

    Returns:
        The chunks that were kept, in their original order.
    """
    header_sections = markdown_splitter.split_text(markdown_content)
    for section in header_sections:
        section.metadata.update(metadata)

    kept_chunks = []
    sized_chunks = text_splitter.split_documents(header_sections)
    # ``idx`` is 1-based and counts every chunk, including skipped ones.
    for idx, chunk in enumerate(sized_chunks, start=1):
        should_skip, reason = should_skip_chunk(chunk)
        if not should_skip:
            kept_chunks.append(chunk)
        else:
            print(f"Skipping chunk {idx} ({reason})")

    return kept_chunks


def chunk_batches(chunks: list, size: int) -> Iterator[list]:
    """Yield consecutive slices of ``chunks`` holding at most ``size`` items.

    The final slice may be shorter than ``size``; an empty input yields
    nothing.
    """
    offsets = range(0, len(chunks), size)
    yield from (chunks[offset : offset + size] for offset in offsets)


def store_chunks(chunks: list, *, source: str, config: Config) -> int:
    """Write ``chunks`` to the vector store in batches.

    Each batch is written under a ``config.add_timeout_seconds`` time limit.
    A batch that times out is reported and skipped; the remaining batches
    are still attempted. Any other exception propagates to the caller.

    Args:
        chunks: Documents to store; an empty list is a no-op.
        source: Identifier of the originating document, used in progress
            messages and to derive chunk IDs.
        config: Supplies ``batch_size`` and ``add_timeout_seconds``.

    Returns:
        Number of chunks in batches that were written without timing out.
    """
    if not chunks:
        return 0

    # IDs are derived from the source and the chunk's position, so the same
    # document produces the same IDs on every run instead of fresh random
    # ones that would pile up duplicate entries when ingestion is repeated.
    # NOTE(review): de-duplication relies on the vector store overwriting
    # entries whose ID already exists (langchain-chroma documents its add
    # path as an upsert) — confirm for the installed version.
    chunk_ids = [
        str(uuid.uuid5(uuid.NAMESPACE_URL, f"{source}#chunk-{position}"))
        for position in range(len(chunks))
    ]
    # Ceiling division without floats.
    total_batches = (len(chunks) + config.batch_size - 1) // config.batch_size
    stored_chunks = 0

    # Batching documents and IDs with the same helper keeps them aligned
    # without manual index arithmetic.
    paired_batches = zip(
        chunk_batches(chunks, config.batch_size),
        chunk_batches(chunk_ids, config.batch_size),
    )

    for batch_index, (batch_docs, batch_ids) in enumerate(paired_batches, start=1):
        print(
            f"  Adding batch {batch_index}/{total_batches} "
            f"({len(batch_docs)} chunks)..."
        )

        timeout_message = (
            f"Timed out adding batch {batch_index}/{total_batches} "
            f"for {source} after {config.add_timeout_seconds} seconds"
        )

        try:
            with time_limit(config.add_timeout_seconds, timeout_message):
                vector_store.add_documents(
                    documents=batch_docs,
                    ids=batch_ids,
                )
        except TimeoutError as exc:
            print(f"{exc}. Skipping batch.")
            continue

        stored_chunks += len(batch_docs)

    return stored_chunks


def normalize_str(value: object) -> str:
    """Return ``value`` as stripped text, or ``""`` when it carries no data.

    ``None``, blank/whitespace-only text, and any value whose string form is
    "nan" (case-insensitive) all map to the empty string.
    """
    text = "" if value is None else str(value).strip()
    return "" if text.lower() in ("", "nan") else text


def process_row(row: pd.Series, *, url: str, config: Config) -> int:
    """Fetch, split and store the document referenced by one CSV row.

    The row's ``Domain`` and ``Service`` values (normalised to text) and the
    ``url`` are attached to every chunk as ``domain``, ``service`` and
    ``source`` metadata.

    Returns:
        The count reported by ``store_chunks`` for this document.
    """
    metadata = {
        column.lower(): normalize_str(row.get(column))
        for column in ("Domain", "Service")
    }
    metadata["source"] = url

    chunks = split_markdown(fetch_markdown(url), metadata)
    return store_chunks(chunks, source=url, config=config)


def ingest_dataframe(dataframe: pd.DataFrame, *, config: Config) -> int:
    """Ingest every row of ``dataframe`` that has a ``PDF_URL``.

    Rows without a URL are skipped. A failure while processing one row is
    printed and does not stop the loop, so a single bad document cannot abort
    the whole run.

    Returns:
        Total number of chunks stored across all rows.
    """
    total_chunks = 0

    for index, row in dataframe.iterrows():
        url = normalize_str(row.get("PDF_URL"))
        if not url:
            print(f"Row {index}: missing PDF_URL. Skipping.")
            continue

        print(f"Processing {url}...")
        try:
            stored = process_row(row, url=url, config=config)
        except (httpx.ReadTimeout, httpx.ConnectTimeout, httpx.TimeoutException) as exc:
            print(f"HTTP timeout while processing {url}: {exc}. Skipping.")
        except ValueError as exc:
            print(f"Data issue for {url}: {exc}. Skipping.")
        except Exception as exc:
            # Deliberate catch-all: report the failure and move on to the next row.
            print(f"Error processing {url}: {exc}")
        else:
            # Reached only when the row was processed without raising.
            total_chunks += stored
            if not stored:
                print(f"Completed {url}: no chunks to store.")
            else:
                print(f"Completed {url}: stored {stored} chunks.")
            print()

    print(f"Ingestion complete. Stored {total_chunks} chunks in total.")
    return total_chunks


In [None]:
# Kick off ingestion when ready.
# total_chunks = ingest_dataframe(df, config=config)
# print(f"Stored {total_chunks} chunks in total.")


Processing https://docs.aws.amazon.com/pdfs/AWSEC2/latest/UserGuide/ec2-ug.pdf...


2025-11-12 22:09:54,360 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-11-12 22:09:54,620 - INFO - Going to convert document batch...
2025-11-12 22:09:54,622 - INFO - Initializing pipeline for StandardPdfPipeline with options hash 44ae89a68fc272bc7889292e9b5a1bad
2025-11-12 22:09:54,652 - INFO - Loading plugin 'docling_defaults'
2025-11-12 22:09:54,655 - INFO - Registered picture descriptions: ['vlm', 'api']
2025-11-12 22:09:54,661 - INFO - Loading plugin 'docling_defaults'
2025-11-12 22:09:54,666 - INFO - Registered ocr engines: ['auto', 'easyocr', 'ocrmac', 'rapidocr', 'tesserocr', 'tesseract']
2025-11-12 22:09:56,263 - INFO - Auto OCR model selected ocrmac.
2025-11-12 22:09:56,270 - INFO - Accelerator device: 'mps'
2025-11-12 22:09:59,356 - INFO - Accelerator device: 'mps'
2025-11-12 22:10:00,012 - INFO - Processing document ec2-ug.pdf


Timed out processing https://docs.aws.amazon.com/pdfs/AWSEC2/latest/UserGuide/ec2-ug.pdf after 60 seconds. Skipping.
Processing https://docs.aws.amazon.com/pdfs/lambda/latest/dg/lambda-dg.pdf...


2025-11-12 22:10:55,867 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-11-12 22:10:55,978 - INFO - Going to convert document batch...
2025-11-12 22:10:55,979 - INFO - Initializing pipeline for StandardPdfPipeline with options hash 44ae89a68fc272bc7889292e9b5a1bad
2025-11-12 22:10:55,980 - INFO - Auto OCR model selected ocrmac.
2025-11-12 22:10:55,980 - INFO - Accelerator device: 'mps'
2025-11-12 22:10:57,044 - INFO - Accelerator device: 'mps'
2025-11-12 22:10:57,646 - INFO - Processing document lambda-dg.pdf


Timed out processing https://docs.aws.amazon.com/pdfs/lambda/latest/dg/lambda-dg.pdf after 60 seconds. Skipping.
Processing https://docs.aws.amazon.com/pdfs/AmazonECS/latest/developerguide/ecs-dg.pdf...


2025-11-12 22:12:03,483 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-11-12 22:12:03,616 - INFO - Going to convert document batch...
2025-11-12 22:12:03,617 - INFO - Initializing pipeline for StandardPdfPipeline with options hash 44ae89a68fc272bc7889292e9b5a1bad
2025-11-12 22:12:03,618 - INFO - Auto OCR model selected ocrmac.
2025-11-12 22:12:03,618 - INFO - Accelerator device: 'mps'
2025-11-12 22:12:04,800 - INFO - Accelerator device: 'mps'
2025-11-12 22:12:05,402 - INFO - Processing document ecs-dg.pdf


Timed out processing https://docs.aws.amazon.com/pdfs/AmazonECS/latest/developerguide/ecs-dg.pdf after 60 seconds. Skipping.
Processing https://docs.aws.amazon.com/pdfs/eks/latest/userguide/eks-ug.pdf...


2025-11-12 22:13:05,406 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-11-12 22:13:05,607 - INFO - Going to convert document batch...
2025-11-12 22:13:05,609 - INFO - Initializing pipeline for StandardPdfPipeline with options hash 44ae89a68fc272bc7889292e9b5a1bad
2025-11-12 22:13:05,610 - INFO - Auto OCR model selected ocrmac.
2025-11-12 22:13:05,610 - INFO - Accelerator device: 'mps'
2025-11-12 22:13:06,854 - INFO - Accelerator device: 'mps'
2025-11-12 22:13:07,466 - INFO - Processing document eks-ug.pdf


Timed out processing https://docs.aws.amazon.com/pdfs/eks/latest/userguide/eks-ug.pdf after 60 seconds. Skipping.
Processing https://docs.aws.amazon.com/pdfs/elasticbeanstalk/latest/dg/awseb-dg.pdf...


2025-11-12 22:14:10,230 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-11-12 22:14:10,343 - INFO - Going to convert document batch...
2025-11-12 22:14:10,345 - INFO - Initializing pipeline for StandardPdfPipeline with options hash 44ae89a68fc272bc7889292e9b5a1bad
2025-11-12 22:14:10,352 - INFO - Auto OCR model selected ocrmac.
2025-11-12 22:14:10,353 - INFO - Accelerator device: 'mps'
2025-11-12 22:14:11,856 - INFO - Accelerator device: 'mps'
2025-11-12 22:14:12,513 - INFO - Processing document awseb-dg.pdf


Timed out processing https://docs.aws.amazon.com/pdfs/elasticbeanstalk/latest/dg/awseb-dg.pdf after 60 seconds. Skipping.
Processing https://docs.aws.amazon.com/pdfs/batch/latest/userguide/batch_user.pdf...


2025-11-12 22:15:23,007 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-11-12 22:15:23,066 - INFO - Going to convert document batch...
2025-11-12 22:15:23,067 - INFO - Initializing pipeline for StandardPdfPipeline with options hash 44ae89a68fc272bc7889292e9b5a1bad
2025-11-12 22:15:23,068 - INFO - Auto OCR model selected ocrmac.
2025-11-12 22:15:23,068 - INFO - Accelerator device: 'mps'
2025-11-12 22:15:24,478 - INFO - Accelerator device: 'mps'
2025-11-12 22:15:25,086 - INFO - Processing document batch_user.pdf


Timed out processing https://docs.aws.amazon.com/pdfs/batch/latest/userguide/batch_user.pdf after 60 seconds. Skipping.
Processing https://docs.aws.amazon.com/pdfs/AmazonS3/latest/userguide/s3-userguide.pdf...


2025-11-12 22:16:27,820 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-11-12 22:16:28,030 - INFO - Going to convert document batch...
2025-11-12 22:16:28,031 - INFO - Initializing pipeline for StandardPdfPipeline with options hash 44ae89a68fc272bc7889292e9b5a1bad
2025-11-12 22:16:28,033 - INFO - Auto OCR model selected ocrmac.
2025-11-12 22:16:28,033 - INFO - Accelerator device: 'mps'
2025-11-12 22:16:29,710 - INFO - Accelerator device: 'mps'
2025-11-12 22:16:30,454 - INFO - Processing document s3-userguide.pdf


Timed out processing https://docs.aws.amazon.com/pdfs/AmazonS3/latest/userguide/s3-userguide.pdf after 60 seconds. Skipping.
Processing https://docs.aws.amazon.com/pdfs/ebs/latest/userguide/ebs-ug.pdf...


KeyboardInterrupt: 