In [95]:
import pandas as pd
from langchain_text_splitters import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter
from langchain_docling import DoclingLoader
from langchain_docling.loader import ExportType
import io
import os
import time
from langchain_chroma import Chroma
from langchain_ollama.embeddings import OllamaEmbeddings

In [96]:
df = pd.read_csv('AWSDocs.csv')

In [97]:
first_row = df.iloc[43]

In [98]:
first_row

Domain                                             Analytics
Service                                            sagemaker
PDF_URL    https://docs.aws.amazon.com/pdfs/next-generati...
Name: 43, dtype: object

In [99]:
loader = DoclingLoader(
    file_path=first_row['PDF_URL'],
    export_type=ExportType.MARKDOWN,
)

In [100]:
docs = loader.load()

2025-11-11 18:53:43,321 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-11-11 18:53:43,436 - INFO - Going to convert document batch...
2025-11-11 18:53:43,446 - INFO - Initializing pipeline for StandardPdfPipeline with options hash 44ae89a68fc272bc7889292e9b5a1bad
2025-11-11 18:53:43,462 - INFO - Auto OCR model selected ocrmac.
2025-11-11 18:53:43,469 - INFO - Accelerator device: 'mps'
2025-11-11 18:53:49,093 - INFO - Accelerator device: 'mps'
2025-11-11 18:53:50,343 - INFO - Processing document next-generation-sagemaker-ug.pdf
2025-11-11 18:55:25,649 - INFO - Finished converting document next-generation-sagemaker-ug.pdf in 102.75 sec.


In [101]:
len(docs)

1

In [102]:
markdown_content = docs[0].page_content

In [103]:
headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3")
]
markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on)
splits = markdown_splitter.split_text(markdown_content)

# 'splits' is now a list of Document objects split by headers
print(f"Split document into {len(splits)} chunks based on Markdown headers.")

Split document into 121 chunks based on Markdown headers.


In [104]:
BATCH_SIZE = 50  
CHUNK_SIZE = 2000 # Safety net: Max characters per *final* chunk
CHUNK_OVERLAP = 200

In [105]:
text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP
    )

In [109]:
CHROMA_DB_PATH = "./chroma_db_AWS_Docs"
EMBEDDING_MODEL_NAME = "nomic-embed-text"

In [110]:
embeddings = OllamaEmbeddings(model=EMBEDDING_MODEL_NAME)

In [111]:
vector_store = Chroma(
    collection_name="AWS_Docs",
    embedding_function=embeddings,
    persist_directory=CHROMA_DB_PATH,
)

2025-11-11 18:56:50,363 - INFO - Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.


In [116]:
def batch_list(input_list, batch_size):
    """Yield successive n-sized chunks from input_list."""
    for i in range(0, len(input_list), batch_size):
        yield input_list[i:i + batch_size]

In [118]:
for split in splits:

    chunks = text_splitter.split_text(split.page_content)

    if chunks:
        print(f"Split {split.page_content} into {len(chunks)} chunks")

        doc_batches = batch_list(chunks, BATCH_SIZE)

        batch_num = 0
        for batch in doc_batches:
            batch_num += 1
            vector_store.add_documents(batch)
            print(f"Added {len(batch)} documents to the vector store")
            time.sleep(5)


Split aws into 1 chunks


AttributeError: 'str' object has no attribute 'id'

In [61]:
for i in range(len(splits)):
    vector_store.add_documents(splits[i:i+1])
    print(f"Added {i+1} documents to the vector store")
    time.sleep(5)

2025-11-11 18:33:35,799 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


Added 1 documents to the vector store


2025-11-11 18:33:40,887 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


Added 2 documents to the vector store


2025-11-11 18:33:46,044 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


Added 3 documents to the vector store


2025-11-11 18:33:51,352 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 500 Internal Server Error"


ResponseError: do embedding request: Post "http://127.0.0.1:56969/embedding": EOF (status code: 500)

In [None]:









def load_and_process_pdfs(service_docs):
    """
    Downloads, loads, splits, and embeds all PDFs using Docling,
    then saves them to Chroma.
    """
    
    # 3. Initialize components
    
    # Embedding model (Using Ollama)


    # Chroma vector store
    vector_store = Chroma(
        persist_directory=CHROMA_DB_PATH,
        embedding_function=embeddings
    )
    
    # Initialize the two-stage splitters

    
    # Safety net splitter

    
    print(f"Initialized Chroma DB at {CHROMA_DB_PATH}")
    
    # 4. Process each service document
    total_services = len(service_docs)
    for i, service in enumerate(service_docs):
        domain = service['Domain']
        service_name = service['Service']
        url = service['PDF_URL']
        
        print(f"\n--- Processing {i+1}/{total_services}: {domain} - {service_name} ---")
        print(f"URL: {url}")
        
        try:
            start_time = time.time()
            
            # A. Load full PDF as Markdown
            print("Initializing DoclingLoader to export Markdown...")

            
            print("Loading and parsing with Docling (this may take a while)...")
            docs_as_markdown = loader.load()
            
            if not docs_as_markdown:
                print("Docling returned no content. Skipping.")
                continue
                
            markdown_content = docs_as_markdown[0].page_content
            
            # B. Stage 1 Split: Semantic Markdown splitting
            print("Splitting document by Markdown headers...")
            semantic_chunks = markdown_splitter.split_text(markdown_content)
            
            # C. Add our custom metadata to each semantic chunk
            for chunk in semantic_chunks:
                chunk.metadata["domain"] = domain
                chunk.metadata["service"] = service_name
                chunk.metadata["source"] = url

            # D. Stage 2 Split: Safety net
            print(f"Applying safety split to {len(semantic_chunks)} semantic chunks...")
            final_chunks = text_splitter.split_documents(semantic_chunks)
            
            # E. Embed and add final chunks to Chroma
            if final_chunks:
                print(f"Found {len(final_chunks)} final chunks to add.")
                
                # --- ## MODIFIED SECTION ## ---
                
                # 1. Create a list of unique IDs
                chunk_ids = [str(uuid.uuid4()) for _ in final_chunks]
                
                # 2. Batch both the documents and the IDs
                for i in range(0, len(final_chunks), BATCH_SIZE):
                    batch_docs = final_chunks[i:i + BATCH_SIZE]
                    batch_ids = chunk_ids[i:i + BATCH_SIZE]
                    
                    batch_num = (i // BATCH_SIZE) + 1
                    total_batches = (len(final_chunks) // BATCH_SIZE) + 1

                    print(f"  Adding batch {batch_num}/{total_batches}...")
                    
                    # 3. Pass both documents and ids to the vector store
                    vector_store.add_documents(
                        documents=batch_docs,
                        ids=batch_ids
                    )
                # --- ## END OF MODIFIED SECTION ## ---
                
                end_time = time.time()
                print(f"Successfully processed {service_name} in {end_time - start_time:.2f} seconds.")
            else:
                print(f"No text chunks extracted from {service_name}.")

        except Exception as e:
            print(f"Error: Failed to process {service_name} from {url}. Skipping. Details: {e}")

    print("\n--- All documents processed! ---")
    print(f"Vector database is persistent and saved in '{CHROMA_DB_PATH}'")
    
    return vector_store

In [124]:
OllamaEmbeddings(model=EMBEDDING_MODEL_NAME).embed_query("Initial connection test")
print("Ollama connection successful.")

2025-11-11 19:18:50,857 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


Ollama connection successful.


In [125]:
if os.path.exists(CHROMA_DB_PATH):
    print(f"Database already exists at {CHROMA_DB_PATH}.")
    print("To re-build, please delete this directory and run again.")
else:
    service_docs = parse_csv_data(CSV_DATA)
    load_and_process_pdfs(service_docs)

2025-11-11 19:18:52,074 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
2025-11-11 19:18:52,078 - INFO - Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.


Initializing Ollama embeddings...
Initialized Chroma DB at ./chroma_db_AWSDocs

--- Processing 1/1: Analytics - sagemaker ---
URL: https://docs.aws.amazon.com/pdfs/next-generation-sagemaker/latest/userguide/next-generation-sagemaker-ug.pdf
Initializing DoclingLoader to export Markdown...
Loading and parsing with Docling (this may take a while)...


2025-11-11 19:18:53,379 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-11-11 19:18:53,453 - INFO - Going to convert document batch...
2025-11-11 19:18:53,460 - INFO - Initializing pipeline for StandardPdfPipeline with options hash 44ae89a68fc272bc7889292e9b5a1bad
2025-11-11 19:18:53,473 - INFO - Auto OCR model selected ocrmac.
2025-11-11 19:18:53,478 - INFO - Accelerator device: 'mps'
2025-11-11 19:19:03,777 - INFO - Accelerator device: 'mps'
2025-11-11 19:19:05,341 - INFO - Processing document next-generation-sagemaker-ug.pdf
2025-11-11 19:19:45,321 - INFO - Finished converting document next-generation-sagemaker-ug.pdf in 53.07 sec.


Splitting document by Markdown headers...
Applying safety split to 121 semantic chunks...
Found 141 final chunks to add.
  Adding batch 1/3...


2025-11-11 19:19:46,846 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 500 Internal Server Error"


Error: Failed to process sagemaker from https://docs.aws.amazon.com/pdfs/next-generation-sagemaker/latest/userguide/next-generation-sagemaker-ug.pdf. Skipping. Details: do embedding request: Post "http://127.0.0.1:57664/embedding": EOF (status code: 500)

--- All documents processed! ---
Vector database is persistent and saved in './chroma_db_AWSDocs'


In [None]:
def main():
    print("Checking Ollama connection...")
    try:
        OllamaEmbeddings(model=EMBEDDING_MODEL_NAME).embed_query("Initial connection test")
        print("Ollama connection successful.")
    except Exception as e:
        print("Error: Could not connect to Ollama.")
        print("Please make sure the Ollama application is running and you have run:")
        print(f"ollama pull {EMBEDDING_MODEL_NAME}")
        return



if __name__ == "__main__":
    main()