In [None]:
import re
import signal
import uuid
from contextlib import contextmanager

import httpx
import ollama
import pandas as pd
from langchain_chroma import Chroma
from langchain_docling import DoclingLoader
from langchain_docling.loader import ExportType
from langchain_ollama.embeddings import OllamaEmbeddings
from langchain_text_splitters import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter

In [3]:
# Catalogue of documents to ingest. Later cells read the "PDF_URL", "Domain"
# and "Service" columns from each row of this frame.
df = pd.read_csv("AWSDocs.csv")

In [None]:
# --- Vector store / embedding model -----------------------------------------
CHROMA_DB_PATH = "./chroma_db_AWSDocs"  # persist directory handed to Chroma
EMBEDDING_MODEL_NAME = "nomic-embed-text"  # model name given to OllamaEmbeddings

# --- Chunking and ingestion --------------------------------------------------
CHUNK_SIZE, CHUNK_OVERLAP = 1_000, 200  # RecursiveCharacterTextSplitter settings
BATCH_SIZE = 50  # number of chunks sent per add_documents() call

# Time budget, in seconds, for processing a single CSV row.
# NOTE(review): the run log captured in this notebook shows one PDF conversion
# taking ~1072 s, far above this limit -- confirm the value is intentional.
PROCESS_TIMEOUT_SECONDS = 60

In [5]:
# Embedding function backed by the Ollama model named in EMBEDDING_MODEL_NAME;
# it is passed to Chroma as `embedding_function` in a later cell.
embeddings = OllamaEmbeddings(model=EMBEDDING_MODEL_NAME)

In [6]:
# HTTP timeout settings for the Ollama client built in a later cell:
# 30.0 as the general value, with `connect` overridden to 5.0.
timeout = httpx.Timeout(30.0, connect=5.0)

In [7]:
# Swap in an ollama.Client that uses the explicit `timeout` defined earlier while
# keeping the host the embeddings object is already configured with.
# NOTE(review): `_client` is a private attribute of OllamaEmbeddings, so this
# override may silently stop working across library versions -- confirm, and
# prefer a supported constructor option if one exists.
embeddings._client = ollama.Client(host=embeddings.base_url, timeout=timeout)

In [8]:
# Chroma collection "AWSDocs", persisted under CHROMA_DB_PATH and embedding
# its documents through the Ollama-backed `embeddings` object.
vector_store = Chroma(
    embedding_function=embeddings,
    persist_directory=CHROMA_DB_PATH,
    collection_name="AWSDocs",
)

2025-11-12 21:36:52,611 - INFO - Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.


In [9]:
# Split on the first three Markdown heading levels ("#", "##", "###"), paired
# with the labels "Header 1" .. "Header 3" that later code looks up in each
# chunk's metadata.
headers_to_split_on = [("#" * level, f"Header {level}") for level in range(1, 4)]
markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on)

In [10]:
# Second-stage splitter applied to the header-level sections, configured with
# CHUNK_SIZE and CHUNK_OVERLAP from the config cell.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=CHUNK_OVERLAP,
    chunk_size=CHUNK_SIZE,
)

In [None]:
@contextmanager
def time_limit(seconds: float, timeout_message: str):
    """Raise ``TimeoutError(timeout_message)`` if the ``with`` body runs too long.

    A temporary ``SIGALRM`` handler is installed and a real-time interval timer
    is armed for ``seconds``; both are undone when the block exits, whether it
    finished, timed out, or raised something else.

    Args:
        seconds: Time budget for the enclosed block. Fractional values are
            accepted; ``0`` arms no timer at all (no limit is enforced).
        timeout_message: Message carried by the ``TimeoutError``.

    Raises:
        TimeoutError: Raised from the signal handler once the timer expires.

    Notes:
        * Relies on ``SIGALRM``/``setitimer``, which the standard library only
          provides on Unix, and ``signal.signal`` must be called from the main
          thread.
        * Not re-entrant: any alarm/timer already pending is overwritten.
        * NOTE(review): Python runs signal handlers only between bytecode
          instructions of the main thread, so time spent inside long-running
          native calls can overrun the limit. The run log captured in this
          notebook (one conversion of ~1072 s) suggests that may happen here;
          confirm before relying on this as a hard cap.
    """

    def _raise_timeout(_signum, _frame):
        raise TimeoutError(timeout_message)

    original_handler = signal.signal(signal.SIGALRM, _raise_timeout)
    try:
        # Arm the timer inside the try so that a failure while arming (e.g. a
        # negative value) still restores the previous handler. setitimer is
        # used instead of alarm() because alarm() rejects non-integer seconds.
        signal.setitimer(signal.ITIMER_REAL, seconds)
        yield
    finally:
        # Disarm first so the timer cannot fire after the handler is restored.
        signal.setitimer(signal.ITIMER_REAL, 0)
        signal.signal(signal.SIGALRM, original_handler)


def fetch_markdown(url: str) -> str:
    """Load ``url`` through ``DoclingLoader`` in Markdown export mode.

    Args:
        url: Location passed to the loader as ``file_path``.

    Returns:
        The ``page_content`` of the first document the loader returns.

    Raises:
        ValueError: If the loader returns no documents.
    """
    documents = DoclingLoader(file_path=url, export_type=ExportType.MARKDOWN).load()
    if documents:
        # Only the first loaded document is used.
        return documents[0].page_content
    raise ValueError(f"No content returned for {url}")


def should_skip_chunk(chunk):
    """Decide whether a chunk is noise that should not be stored.

    A chunk is skipped when:

    * any of its "Header 1" / "Header 2" / "Header 3" metadata values contains
      "table of contents" (case-insensitive), or
    * its stripped content consists only of pipes, dashes, colons and
      whitespace, i.e. a Markdown table separator row such as ``|---|---|``
      or an alignment row such as ``|:---|---:|`` with no cell text.

    Args:
        chunk: Object exposing a ``metadata`` mapping and a ``page_content``
            string.

    Returns:
        ``(True, reason)`` when the chunk should be skipped, otherwise
        ``(False, "")``.
    """
    header_values = (
        chunk.metadata.get(key, "") for key in ("Header 1", "Header 2", "Header 3")
    )
    # Falsy header values (missing, empty, None) are ignored.
    headers_combined = " ".join(value.lower() for value in header_values if value)
    if "table of contents" in headers_combined:
        return True, "Table of Contents"

    # ":" is part of the class so alignment separator rows are caught too.
    # fullmatch with "+" never matches an empty string, so empty content falls
    # through to "keep", as before.
    if re.fullmatch(r"[|\-:\s]+", chunk.page_content.strip()):
        return True, "Markdown table fragment"

    return False, ""


def prepare_chunks(markdown_content: str, metadata: dict):
    """Split Markdown into store-ready chunks tagged with ``metadata``.

    The text is first split by ``markdown_splitter``; every resulting section
    gets ``metadata`` merged into its own metadata, then the sections are split
    again by ``text_splitter``. Chunks rejected by ``should_skip_chunk`` are
    dropped, with a message printed for each one.

    Args:
        markdown_content: Markdown text of one document.
        metadata: Key/value pairs merged into every section's metadata.

    Returns:
        List of the chunks that were not skipped, in their original order.
    """
    sections = markdown_splitter.split_text(markdown_content)
    for section in sections:
        section.metadata.update(metadata)

    kept = []
    # Positions are 1-based and count every chunk, skipped or not.
    for position, piece in enumerate(text_splitter.split_documents(sections), start=1):
        skip, why = should_skip_chunk(piece)
        if not skip:
            kept.append(piece)
        else:
            print(f"Skipping chunk {position} ({why})")
    return kept


def add_chunks_to_store(chunks):
    """Add ``chunks`` to ``vector_store`` in batches of ``BATCH_SIZE``.

    Each chunk gets a deterministic ID (UUIDv5) derived from its ``source``
    metadata, its position in ``chunks`` and its text. Re-running the notebook
    on unchanged input therefore produces the same IDs instead of minting new
    random ones for the same content on every run.

    NOTE(review): avoiding duplicates relies on ``vector_store.add_documents``
    overwriting entries whose ID already exists -- confirm for the installed
    Chroma integration.

    Args:
        chunks: Sequence of documents exposing ``metadata`` and
            ``page_content``. May be empty or ``None``.

    Returns:
        Number of chunks submitted to the store (``0`` for empty input).
    """
    if not chunks:
        return 0

    # The position keeps IDs unique within one call even if two chunks of the
    # same document have identical text.
    chunk_ids = [
        str(
            uuid.uuid5(
                uuid.NAMESPACE_URL,
                f"{chunk.metadata.get('source', '')}#{position}:{chunk.page_content}",
            )
        )
        for position, chunk in enumerate(chunks)
    ]
    total_chunks = len(chunks)
    total_batches = (total_chunks + BATCH_SIZE - 1) // BATCH_SIZE  # ceiling division

    for batch_number, start in enumerate(range(0, total_chunks, BATCH_SIZE), start=1):
        end = start + BATCH_SIZE
        batch_docs = chunks[start:end]
        batch_ids = chunk_ids[start:end]

        print(
            f"  Adding batch {batch_number}/{total_batches} "
            f"({len(batch_docs)} chunks)..."
        )

        vector_store.add_documents(
            documents=batch_docs,
            ids=batch_ids,
        )

    return total_chunks


def _is_missing(value) -> bool:
    """Return True when ``value`` is None or a pandas missing scalar (e.g. NaN)."""
    if value is None:
        return True
    try:
        return bool(pd.isna(value))
    except (TypeError, ValueError):
        # Values pd.isna cannot reduce to a single bool are treated as present.
        return False


def process_row(row: pd.Series) -> int:
    """Fetch, chunk and store the document referenced by one CSV row.

    Args:
        row: Row with a "PDF_URL" entry and, optionally, "Domain" and
            "Service" entries.

    Returns:
        Number of chunks added to the vector store.

    Raises:
        ValueError: If "PDF_URL" is absent/None/NaN ("Missing PDF_URL value")
            or is blank or the literal text "nan" ("Empty PDF_URL value").
            Errors from fetching, chunking or storing propagate unchanged.
    """
    raw_url = row.get("PDF_URL")
    if _is_missing(raw_url):
        raise ValueError("Missing PDF_URL value")

    url = str(raw_url).strip()
    if not url or url.lower() == "nan":
        raise ValueError("Empty PDF_URL value")

    # A blank CSV cell is read back as NaN rather than hitting the "" default
    # of row.get(); normalise it so metadata never carries a float NaN.
    domain = row.get("Domain", "")
    service = row.get("Service", "")
    metadata = {
        "domain": "" if _is_missing(domain) else domain,
        "service": "" if _is_missing(service) else service,
        "source": url,
    }

    markdown_content = fetch_markdown(url)
    chunks = prepare_chunks(markdown_content, metadata)
    return add_chunks_to_store(chunks)

In [None]:
# Ingest every row of the CSV. A failure on one document is reported and the
# loop moves on to the next row.
for index, row in df.iterrows():
    raw_url = row.get("PDF_URL")
    url = str(raw_url).strip() if raw_url is not None else ""

    # NaN cells stringify to "nan", so that spelling is treated as missing too.
    if url == "" or url.lower() == "nan":
        print(f"Row {index}: missing PDF_URL. Skipping.")
        continue

    print(f"Processing {url}...")
    try:
        timeout_message = (
            f"Timed out processing {url} after {PROCESS_TIMEOUT_SECONDS} seconds"
        )
        with time_limit(PROCESS_TIMEOUT_SECONDS, timeout_message):
            chunks_added = process_row(row)

        if not chunks_added:
            print(f"Completed {url}: no chunks to store.")
        else:
            print(f"Completed {url}: stored {chunks_added} chunks.")

    except TimeoutError as exc:
        # Raised by time_limit() when the per-row budget is exceeded.
        print(f"{exc}. Skipping.")
    except httpx.TimeoutException as exc:
        # Base class of httpx.ReadTimeout and httpx.ConnectTimeout.
        print(f"HTTP timeout while processing {url}: {exc}. Skipping.")
    except Exception as exc:
        print(f"Error processing {url}: {exc}")


2025-11-12 21:37:07,548 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-11-12 21:37:07,831 - INFO - Going to convert document batch...
2025-11-12 21:37:07,832 - INFO - Initializing pipeline for StandardPdfPipeline with options hash 44ae89a68fc272bc7889292e9b5a1bad
2025-11-12 21:37:07,850 - INFO - Loading plugin 'docling_defaults'
2025-11-12 21:37:07,852 - INFO - Registered picture descriptions: ['vlm', 'api']
2025-11-12 21:37:07,858 - INFO - Loading plugin 'docling_defaults'
2025-11-12 21:37:07,865 - INFO - Registered ocr engines: ['auto', 'easyocr', 'ocrmac', 'rapidocr', 'tesserocr', 'tesseract']
2025-11-12 21:37:09,124 - INFO - Auto OCR model selected ocrmac.
2025-11-12 21:37:09,129 - INFO - Accelerator device: 'mps'
2025-11-12 21:37:10,817 - INFO - Accelerator device: 'mps'
2025-11-12 21:37:11,797 - INFO - Processing document ec2-ug.pdf
2025-11-12 21:54:55,650 - INFO - Finished converting document ec2-ug.pdf in 1071.71 sec.


[[Document(metadata={'Header 2': 'How standard burstable performance instances work'}, page_content='When a burstable performance instance configured as standard is in a running state, it continuously earns (at a millisecond-level resolution) a set rate of earned credits per hour. For T2 Standard, when the instance is stopped, it loses all its accrued credits, and its credit balance is reset to zero. When it is restarted, it receives a new set of launch credits, and begins to accrue earned credits. For T4g, T3a, and T3 Standard instances, the CPU credit balance persists for seven days after the instance stops and the credits are lost thereafter. If you start the instance within seven days, no credits are lost.'),
  Document(metadata={'Header 2': 'How standard burstable performance instances work'}, page_content='T2 Standard instances receive two types of CPU credits: earned credits and launch credits . When a T2 Standard instance is in a running state, it continuously earns (at a milli