In [22]:
import os
import uuid
import json
from dotenv import load_dotenv
from pathlib import Path

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from sentence_transformers import SentenceTransformer
from pinecone import Pinecone, ServerlessSpec

# === Load environment variables ===
# Reads a .env file (if present) so the os.getenv() lookup below can see it.
load_dotenv()

# === Configuration ===
# os.getenv returns None when the variable is unset; no check is made here.
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
# Directory listed for input PDFs by the processing loop further down.
PDF_DIR = "pdfs"
# Name of the Pinecone index that is dropped and re-created below.
INDEX_NAME = "guideline-rag"
# Passed as the index dimension; must equal the length of the vectors upserted.
EMBED_DIM = 768  # for intfloat/e5-base

# === Initialize Pinecone ===
pc = Pinecone(api_key=PINECONE_API_KEY)

# An index with this name is always deleted when it exists, so every run
# starts from scratch and previously uploaded vectors are discarded.
if INDEX_NAME in [i.name for i in pc.list_indexes()]:
    pc.delete_index(INDEX_NAME)


# Re-create the index (cosine metric, serverless on AWS us-east-1).
# NOTE(review): this second list_indexes() call presumably no longer reports
# the index deleted just above; if the listing can lag behind the deletion,
# creation is skipped and `index` refers to the old index - confirm.
if INDEX_NAME not in [i.name for i in pc.list_indexes()]:
    pc.create_index(
        name=INDEX_NAME,
        dimension=EMBED_DIM,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1")
    )
# Handle used by the upload loop for upserts.
index = pc.Index(INDEX_NAME)

# === Load embedding model ===
# The model name must stay in sync with EMBED_DIM above.
embedder = SentenceTransformer("intfloat/e5-base")

# === Helper: chunk text ===
def chunk_text(text, chunk_size=500):
    """Split ``text`` into consecutive, non-overlapping character windows.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk. Must be positive.

    Returns:
        A list of substrings that concatenate back to ``text``. Every chunk
        has exactly ``chunk_size`` characters except possibly the last one.
        An empty ``text`` yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is zero or negative.
    """
    # A negative step would make range() empty and silently drop all of the
    # text; a zero step fails with an unrelated-looking range() error.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")
    return [text[start:start + chunk_size] for start in range(0, len(text), chunk_size)]

# === Helper: batch vectors to stay within 2MB limit ===
def batch_vectors(vectors, max_bytes=2 * 1024 * 1024, max_count=1000):
    """Group ``vectors`` into batches bounded by serialized size and count.

    The size of a vector is estimated as the UTF-8 length of its JSON
    serialization; the estimate does not include any per-request envelope.

    Args:
        vectors: Iterable of JSON-serializable vector records.
        max_bytes: Upper bound on the summed estimated size of one batch.
            Defaults to 2 MB.
        max_count: Upper bound on the number of records in one batch.
            NOTE(review): 1000 is believed to be Pinecone's documented
            per-upsert record cap - confirm against the current limits.

    Yields:
        Non-empty lists of vectors, in input order. A single vector whose
        estimated size exceeds ``max_bytes`` is emitted alone in its own
        batch rather than being dropped.
    """
    batch = []
    batch_size = 0
    for vector in vectors:
        size = len(json.dumps(vector).encode("utf-8"))
        over_limit = batch_size + size > max_bytes or len(batch) >= max_count
        # Flush only when something is pending: an oversized first vector
        # must not cause an empty batch to be emitted.
        if batch and over_limit:
            yield batch
            batch = []
            batch_size = 0
        batch.append(vector)
        batch_size += size
    if batch:
        yield batch


def save_chunks_to_disk(filename, chunks):
    """Write ``chunks`` as JSON Lines under ``docling_chunks/``.

    The output file is named after the stem of ``filename`` with a
    ``_chunks.jsonl`` suffix and is overwritten if it already exists. Each
    line is an object with the chunk under ``"text"`` and the unmodified
    ``filename`` under ``"source"``.
    """
    target_dir = Path("docling_chunks")
    target_dir.mkdir(exist_ok=True)
    destination = target_dir / (Path(filename).stem + "_chunks.jsonl")

    records = ({"text": piece, "source": filename} for piece in chunks)
    with destination.open("w", encoding="utf-8") as handle:
        handle.writelines(json.dumps(record) + "\n" for record in records)

    print(f"💾 Saved chunks to {destination}")

# === Docling converter: PDF input only, OCR and table structure enabled ===
pipeline_options = PdfPipelineOptions()
# Table handling: structure recognition on, with cell matching switched on.
pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = True
# OCR is enabled as well.
pipeline_options.do_ocr = True

# The same options object is attached to the PDF format entry.
pdf_format_option = PdfFormatOption(pipeline_options=pipeline_options)
converter = DocumentConverter(
    allowed_formats=[InputFormat.PDF],
    format_options={InputFormat.PDF: pdf_format_option},
)

# === Convert, chunk, embed and upload every PDF found in PDF_DIR ===
for filename in os.listdir(PDF_DIR):
    # Only entries whose name ends in ".pdf" (any letter case) are handled.
    has_pdf_suffix = filename.lower().endswith(".pdf")
    if not has_pdf_suffix:
        continue

    path = os.path.join(PDF_DIR, filename)
    print(f"\n📄 Processing: {filename}")

    # A conversion failure is reported and the file skipped; the run goes on.
    try:
        result = converter.convert(path)
        full_text = result.document.export_to_markdown()
    except Exception as exc:
        print(f"❌ Error processing {filename}: {exc}")
        continue

    # The chunk file is written before the emptiness check, so a document
    # with no text still produces an (empty) .jsonl file.
    chunks = chunk_text(full_text)
    save_chunks_to_disk(filename, chunks)

    if not chunks:
        print(f"⚠️ No content found in {filename}")
        continue

    # E5 expects "passage: ..." format
    texts_to_embed = ["passage: " + chunk for chunk in chunks]
    embeddings = embedder.encode(texts_to_embed, show_progress_bar=True)

    # One record per chunk: random id, embedding values, and the raw chunk
    # text plus originating file name as metadata.
    vectors = []
    for chunk, emb in zip(chunks, embeddings):
        metadata = {"text": chunk, "source": filename}
        vectors.append({
            "id": str(uuid.uuid4()),
            "values": emb.tolist(),
            "metadata": metadata,
        })

    # Upload in size-bounded batches.
    for batch in batch_vectors(vectors):
        index.upsert(vectors=batch)

    print(f"✅ Uploaded {len(vectors)} chunks from {filename}")

INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: mps
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: intfloat/e5-base



📄 Processing: document.pdf


INFO:docling.document_converter:Going to convert document batch...
INFO:docling.document_converter:Initializing pipeline for StandardPdfPipeline with options hash 70041f74270850b7bedf7c8f5c2dcede
INFO:docling.utils.accelerator_utils:Accelerator device: 'mps'
INFO:docling.utils.accelerator_utils:Accelerator device: 'mps'
INFO:docling.utils.accelerator_utils:Accelerator device: 'mps'
INFO:docling.pipeline.base_pipeline:Processing document document.pdf
INFO:docling.document_converter:Finished converting document document.pdf in 120.68 sec.


Batches:   0%|          | 0/28 [00:00<?, ?it/s]

✅ Uploaded 887 chunks from document.pdf

📄 Processing: blood-transfusion-pdf-1837331897029.pdf


INFO:docling.document_converter:Going to convert document batch...
INFO:docling.pipeline.base_pipeline:Processing document blood-transfusion-pdf-1837331897029.pdf
INFO:docling.document_converter:Finished converting document blood-transfusion-pdf-1837331897029.pdf in 13.27 sec.


Batches:   0%|          | 0/3 [00:00<?, ?it/s]

✅ Uploaded 85 chunks from blood-transfusion-pdf-1837331897029.pdf

📄 Processing: iron-bc.pdf


INFO:docling.document_converter:Going to convert document batch...
INFO:docling.pipeline.base_pipeline:Processing document iron-bc.pdf
INFO:docling.document_converter:Finished converting document iron-bc.pdf in 23.50 sec.


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

INFO:docling.document_converter:Going to convert document batch...
INFO:docling.pipeline.base_pipeline:Processing document iron1.pdf


✅ Uploaded 124 chunks from iron-bc.pdf

📄 Processing: iron1.pdf


INFO:docling.document_converter:Finished converting document iron1.pdf in 23.66 sec.


Batches:   0%|          | 0/6 [00:00<?, ?it/s]

✅ Uploaded 174 chunks from iron1.pdf

📄 Processing: 112024db.pdf


INFO:docling.document_converter:Going to convert document batch...
INFO:docling.pipeline.base_pipeline:Processing document 112024db.pdf
INFO:docling.document_converter:Finished converting document 112024db.pdf in 59.97 sec.


Batches:   0%|          | 0/6 [00:00<?, ?it/s]

✅ Uploaded 164 chunks from 112024db.pdf

📄 Processing: Patient Blood Management Guideline_Final.pdf


INFO:docling.document_converter:Going to convert document batch...
INFO:docling.pipeline.base_pipeline:Processing document Patient Blood Management Guideline_Final.pdf
INFO:docling.document_converter:Finished converting document Patient Blood Management Guideline_Final.pdf in 45.89 sec.


Batches:   0%|          | 0/11 [00:00<?, ?it/s]

✅ Uploaded 340 chunks from Patient Blood Management Guideline_Final.pdf

📄 Processing: Bsh-Committee-for-standards-in-haematology-guidelines-on-the-identification-and-management-of-pre-operative-anaemia-Kotz-2015.pdf


INFO:docling.document_converter:Going to convert document batch...
INFO:docling.pipeline.base_pipeline:Processing document Bsh-Committee-for-standards-in-haematology-guidelines-on-the-identification-and-management-of-pre-operative-anaemia-Kotz-2015.pdf
INFO:docling.document_converter:Finished converting document Bsh-Committee-for-standards-in-haematology-guidelines-on-the-identification-and-management-of-pre-operative-anaemia-Kotz-2015.pdf in 16.20 sec.


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

✅ Uploaded 117 chunks from Bsh-Committee-for-standards-in-haematology-guidelines-on-the-identification-and-management-of-pre-operative-anaemia-Kotz-2015.pdf

📄 Processing: ezx325.pdf


INFO:docling.document_converter:Going to convert document batch...
INFO:docling.pipeline.base_pipeline:Processing document ezx325.pdf
INFO:docling.document_converter:Finished converting document ezx325.pdf in 106.05 sec.


Batches:   0%|          | 0/16 [00:00<?, ?it/s]

✅ Uploaded 486 chunks from ezx325.pdf

📄 Processing: Guidelines_UBWEN.pdf


INFO:docling.document_converter:Going to convert document batch...
INFO:docling.pipeline.base_pipeline:Processing document Guidelines_UBWEN.pdf
INFO:docling.document_converter:Finished converting document Guidelines_UBWEN.pdf in 5.22 sec.


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

✅ Uploaded 34 chunks from Guidelines_UBWEN.pdf

📄 Processing: Anaesthesia - 2015 - Mu%C3%B1oz - Pre%E2%80%90operative haematological assessment in patients scheduled for major surgery.pdf


INFO:docling.document_converter:Going to convert document batch...
INFO:docling.pipeline.base_pipeline:Processing document Anaesthesia - 2015 - Mu%C3%B1oz - Pre%E2%80%90operative haematological assessment in patients scheduled for major surgery.pdf
INFO:docling.document_converter:Finished converting document Anaesthesia - 2015 - Mu%C3%B1oz - Pre%E2%80%90operative haematological assessment in patients scheduled for major surgery.pdf in 15.41 sec.


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

✅ Uploaded 99 chunks from Anaesthesia - 2015 - Mu%C3%B1oz - Pre%E2%80%90operative haematological assessment in patients scheduled for major surgery.pdf


Save chunks only

In [23]:
import os
import json
from pathlib import Path
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

# === Config ===
# Directory whose entries are listed (via os.listdir) by the processing loop
# in this cell; only names ending in ".pdf" are converted.
PDF_DIR = "10pdfs"

# === Chunking helper ===
def chunk_text(text, chunk_size=500):
    """Split ``text`` into consecutive, non-overlapping character windows.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk. Must be positive.

    Returns:
        A list of substrings that concatenate back to ``text``. Every chunk
        has exactly ``chunk_size`` characters except possibly the last one.
        An empty ``text`` yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is zero or negative.
    """
    # A negative step would make range() empty and silently drop all of the
    # text; a zero step fails with an unrelated-looking range() error.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")
    return [text[start:start + chunk_size] for start in range(0, len(text), chunk_size)]

# === Save chunks to disk ===
def save_chunks_to_disk(filename, chunks):
    """Persist ``chunks`` to ``docling_chunks/<stem>_chunks.jsonl``.

    One JSON object per line, holding the chunk as ``"text"`` and the given
    ``filename`` as ``"source"``. An existing file of the same name is
    replaced.
    """
    out_root = Path("docling_chunks")
    out_root.mkdir(exist_ok=True)
    chunk_file = out_root / f"{Path(filename).stem}_chunks.jsonl"

    # Serialize everything first, then write the file in a single call.
    serialized = [json.dumps({"text": part, "source": filename}) for part in chunks]
    payload = "".join(line + "\n" for line in serialized)
    chunk_file.write_text(payload, encoding="utf-8")

    print(f"💾 Saved chunks to {chunk_file}")

# === Docling setup: PDF-only converter with OCR and table structure on ===
pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = True
# Table structure recognition, with cell matching switched on.
pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = True

# Per-format configuration: only PDF gets an entry, carrying the options above.
format_options = {
    InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
}
converter = DocumentConverter(
    allowed_formats=[InputFormat.PDF],
    format_options=format_options,
)

# === Convert each PDF in PDF_DIR and store its chunks on disk ===
for filename in os.listdir(PDF_DIR):
    # Only entries whose name ends in ".pdf" (any letter case) are handled.
    has_pdf_suffix = filename.lower().endswith(".pdf")
    if not has_pdf_suffix:
        continue

    path = os.path.join(PDF_DIR, filename)
    print(f"\n📄 Processing: {filename}")

    try:
        result = converter.convert(path)
        full_text = result.document.export_to_markdown()
    except Exception as exc:
        # Report the failure and move on to the next file.
        print(f"❌ Failed to process {filename}: {exc}")
    else:
        # Errors raised while chunking or saving are not caught here.
        chunks = chunk_text(full_text)
        save_chunks_to_disk(filename, chunks)


📄 Processing: document.pdf


INFO:docling.document_converter:Going to convert document batch...
INFO:docling.document_converter:Initializing pipeline for StandardPdfPipeline with options hash 70041f74270850b7bedf7c8f5c2dcede
INFO:docling.utils.accelerator_utils:Accelerator device: 'mps'
INFO:docling.utils.accelerator_utils:Accelerator device: 'mps'
INFO:docling.utils.accelerator_utils:Accelerator device: 'mps'
INFO:docling.pipeline.base_pipeline:Processing document document.pdf
INFO:docling.document_converter:Finished converting document document.pdf in 119.98 sec.


💾 Saved chunks to docling_chunks/document_chunks.jsonl

📄 Processing: blood-transfusion-pdf-1837331897029.pdf


INFO:docling.document_converter:Going to convert document batch...
INFO:docling.pipeline.base_pipeline:Processing document blood-transfusion-pdf-1837331897029.pdf
INFO:docling.document_converter:Finished converting document blood-transfusion-pdf-1837331897029.pdf in 13.74 sec.


💾 Saved chunks to docling_chunks/blood-transfusion-pdf-1837331897029_chunks.jsonl

📄 Processing: iron-bc.pdf


INFO:docling.document_converter:Going to convert document batch...
INFO:docling.pipeline.base_pipeline:Processing document iron-bc.pdf
INFO:docling.document_converter:Finished converting document iron-bc.pdf in 22.46 sec.


💾 Saved chunks to docling_chunks/iron-bc_chunks.jsonl

📄 Processing: iron1.pdf


INFO:docling.document_converter:Going to convert document batch...
INFO:docling.pipeline.base_pipeline:Processing document iron1.pdf
INFO:docling.document_converter:Finished converting document iron1.pdf in 18.04 sec.
INFO:docling.document_converter:Going to convert document batch...
INFO:docling.pipeline.base_pipeline:Processing document 112024db.pdf


💾 Saved chunks to docling_chunks/iron1_chunks.jsonl

📄 Processing: 112024db.pdf


INFO:docling.document_converter:Finished converting document 112024db.pdf in 34.93 sec.


💾 Saved chunks to docling_chunks/112024db_chunks.jsonl

📄 Processing: Patient Blood Management Guideline_Final.pdf


INFO:docling.document_converter:Going to convert document batch...
INFO:docling.pipeline.base_pipeline:Processing document Patient Blood Management Guideline_Final.pdf
INFO:docling.document_converter:Finished converting document Patient Blood Management Guideline_Final.pdf in 39.25 sec.
INFO:docling.document_converter:Going to convert document batch...
INFO:docling.pipeline.base_pipeline:Processing document Bsh-Committee-for-standards-in-haematology-guidelines-on-the-identification-and-management-of-pre-operative-anaemia-Kotz-2015.pdf


💾 Saved chunks to docling_chunks/Patient Blood Management Guideline_Final_chunks.jsonl

📄 Processing: Bsh-Committee-for-standards-in-haematology-guidelines-on-the-identification-and-management-of-pre-operative-anaemia-Kotz-2015.pdf


INFO:docling.document_converter:Finished converting document Bsh-Committee-for-standards-in-haematology-guidelines-on-the-identification-and-management-of-pre-operative-anaemia-Kotz-2015.pdf in 13.32 sec.
INFO:docling.document_converter:Going to convert document batch...
INFO:docling.pipeline.base_pipeline:Processing document ezx325.pdf


💾 Saved chunks to docling_chunks/Bsh-Committee-for-standards-in-haematology-guidelines-on-the-identification-and-management-of-pre-operative-anaemia-Kotz-2015_chunks.jsonl

📄 Processing: ezx325.pdf


INFO:docling.document_converter:Finished converting document ezx325.pdf in 100.49 sec.
INFO:docling.document_converter:Going to convert document batch...
INFO:docling.pipeline.base_pipeline:Processing document Guidelines_UBWEN.pdf


💾 Saved chunks to docling_chunks/ezx325_chunks.jsonl

📄 Processing: Guidelines_UBWEN.pdf


INFO:docling.document_converter:Finished converting document Guidelines_UBWEN.pdf in 5.50 sec.
INFO:docling.document_converter:Going to convert document batch...
INFO:docling.pipeline.base_pipeline:Processing document Anaesthesia - 2015 - Mu%C3%B1oz - Pre%E2%80%90operative haematological assessment in patients scheduled for major surgery.pdf


💾 Saved chunks to docling_chunks/Guidelines_UBWEN_chunks.jsonl

📄 Processing: Anaesthesia - 2015 - Mu%C3%B1oz - Pre%E2%80%90operative haematological assessment in patients scheduled for major surgery.pdf


INFO:docling.document_converter:Finished converting document Anaesthesia - 2015 - Mu%C3%B1oz - Pre%E2%80%90operative haematological assessment in patients scheduled for major surgery.pdf in 14.69 sec.


💾 Saved chunks to docling_chunks/Anaesthesia - 2015 - Mu%C3%B1oz - Pre%E2%80%90operative haematological assessment in patients scheduled for major surgery_chunks.jsonl
