## PDF to Docs Conversion

### Setup

In [1]:
import os
from typing import Any
from pydantic import BaseModel
from dotenv import load_dotenv
from tqdm import tqdm

from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain_community.document_loaders import PDFPlumberLoader
from langchain_community.vectorstores.utils import filter_complex_metadata
from langchain_docling.loader import ExportType
from langchain_docling import DoclingLoader
from langchain_docling.loader import MetaExtractor, BaseChunk
from docling.chunking import HybridChunker
from docling.datamodel.base_models import InputFormat
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.pipeline_options import PdfPipelineOptions

from ragapp.rag.med_agent_graph import MedTechAgent, DeviceEnum
from constants import (
    DATA_DIR,
    LP_DOCLING_CACHE_DIR,
    LP_PLUMBER_CACHE_DIR,
    LP_DOCLING_COLLECTION_NAME,
    LP_PLUMBER_COLLECTION_NAME,
    LP_DEVICES,
    EMBEDDINGS_MODEL_NAME,
    TOGETHER_META_LLAMA_70B_FREE,
    GROQ_GEMMA_9B,
)

In [2]:
# Load variables from a .env file (if one is found) into the process
# environment. Presumably these are the API keys for the LLM providers used
# further down -- TODO confirm.
load_dotenv()

# Disable parallelism in the HuggingFace `tokenizers` library. This works
# around the fork/parallelism warning discussed in
# https://github.com/huggingface/transformers/issues/5486
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [3]:
# Embedding model shared by both vector stores below, so the Docling and the
# PDFPlumber collections are embedded with the same model.
embedding = HuggingFaceEmbeddings(model_name=EMBEDDINGS_MODEL_NAME)

In [4]:
class PDFMetadata(BaseModel):
    """Per-file metadata that is merged into every document produced from a PDF."""

    # File name of the PDF (name only, not the full path).
    pdf_title: str
    # Device the PDF belongs to; populated from the lower-cased data directory name.
    device: DeviceEnum


# Map the full path of every PDF to its metadata. Each entry of LP_DEVICES is
# a sub-directory of DATA_DIR; its lower-cased name is passed as the `device`
# value of the files it contains.
file_path_to_metadata: dict[str, PDFMetadata] = {}

for pdf_data_dir in LP_DEVICES:
    pdf_files_path = os.path.join(DATA_DIR, pdf_data_dir)

    # os.listdir returns entries in arbitrary order; sort so the mapping (and
    # everything built from it) is reproducible between runs.
    for pdf_file_name in sorted(os.listdir(pdf_files_path)):
        full_file_path = os.path.join(pdf_files_path, pdf_file_name)

        # Only regular *.pdf files are registered. Sub-directories and stray
        # files (.DS_Store, notes, ...) would otherwise be handed to the PDF
        # loaders further down and make them fail.
        is_pdf = pdf_file_name.lower().endswith(".pdf")
        if not (is_pdf and os.path.isfile(full_file_path)):
            continue

        file_path_to_metadata[full_file_path] = PDFMetadata(
            pdf_title=pdf_file_name, device=pdf_data_dir.lower()
        )

### Docling

In [5]:
class MedTechMetaExtractor(MetaExtractor):
    """Meta extractor that enriches the metadata of Docling chunks.

    On top of whatever the base ``MetaExtractor`` returns, each chunk gets a
    ``page`` entry (first page the chunk appears on) and all fields of the
    ``PDFMetadata`` registered for the chunk's file.
    """

    def __init__(self, file_to_metadata: dict[str, PDFMetadata]):
        """Store the per-file metadata lookup.

        Args:
            file_to_metadata: Mapping from file path to the metadata to attach
                to every chunk of that file. It is indexed with the
                ``file_path`` passed to ``extract_chunk_meta``.
        """
        super().__init__()
        self.file_to_metadata = file_to_metadata

    def extract_chunk_meta(self, file_path: str, chunk: BaseChunk) -> dict[str, Any]:
        """Extract chunk meta.

        Args:
            file_path: Path of the file the chunk comes from.
            chunk: Chunk whose metadata is being built.

        Returns:
            The base extractor's metadata, plus ``page`` (lowest page number
            among the chunk's provenance items, as a string) and the dumped
            ``PDFMetadata`` fields. ``page`` is omitted when the chunk carries
            no provenance information.

        Raises:
            KeyError: If ``file_path`` has no entry in ``file_to_metadata``.
        """
        metadata = super().extract_chunk_meta(file_path, chunk)

        # Collect page numbers as integers: taking min() over their string
        # forms compares lexicographically ("10" < "9") and picks the wrong
        # page for chunks that span e.g. pages 9-10 or 99-100.
        page_numbers = [
            int(prov.page_no)
            for doc_item in chunk.meta.doc_items
            for prov in doc_item.prov
        ]
        # todo: may be select all pages, not only first one.
        if page_numbers:
            # Stored as a string, the same type this extractor always produced.
            metadata["page"] = str(min(page_numbers))

        # Applied last so the per-file fields win on a key clash, as before.
        metadata.update(self.file_to_metadata[file_path].model_dump())
        return metadata

In [6]:
# Build the Docling-based collection only when no persisted directory exists
# yet; otherwise open the collection that is already on disk.
if not os.path.exists(LP_DOCLING_CACHE_DIR):
    print("Creating new doc cache.")

    # PDF conversion with OCR switched off.
    pipeline_options = PdfPipelineOptions(do_ocr=False)
    pdf_format_option = PdfFormatOption(pipeline_options=pipeline_options)
    doc_converter = DocumentConverter(
        format_options={InputFormat.PDF: pdf_format_option}
    )

    # Load every registered PDF as chunks; the chunker is configured with the
    # embedding model's name as its tokenizer.
    docling_loader = DoclingLoader(
        file_path=file_path_to_metadata.keys(),
        converter=doc_converter,
        export_type=ExportType.DOC_CHUNKS,
        chunker=HybridChunker(tokenizer=EMBEDDINGS_MODEL_NAME),
        meta_extractor=MedTechMetaExtractor(file_path_to_metadata),
    )
    loaded_docling_documents = docling_loader.load()
    # Strip metadata values of types the vector store does not support.
    got_by_docling_documents = filter_complex_metadata(loaded_docling_documents)

    print("Got docs! \nStarting storage creating...")
    docling_storage = Chroma.from_documents(
        documents=got_by_docling_documents,
        embedding=embedding,
        collection_name=LP_DOCLING_COLLECTION_NAME,
        persist_directory=LP_DOCLING_CACHE_DIR,
    )
    print("Finished storage creating.")
else:
    print("Trying to load existing doc cache.")
    docling_storage = Chroma(
        collection_name=LP_DOCLING_COLLECTION_NAME,
        embedding_function=embedding,
        persist_directory=LP_DOCLING_CACHE_DIR,
    )
    print("Docling doc cache loaded.")

Trying to load existing doc cache.
Docling doc cache loaded.


### PDFPlumber

In [7]:
# Open the persisted PDFPlumber-based collection when its directory exists;
# otherwise load every registered PDF with PDFPlumber and build the collection.
if os.path.exists(LP_PLUMBER_CACHE_DIR):
    print("Trying to load existing pipeline cache.")
    plumber_storage = Chroma(
        collection_name=LP_PLUMBER_COLLECTION_NAME,
        embedding_function=embedding,
        persist_directory=LP_PLUMBER_CACHE_DIR,
    )
    print("Plumber doc cache loaded.")
else:
    print("Creating new pipeline cache.")
    got_by_plumber_documents = []

    for pdf_file_path, pdf_metadata in tqdm(file_path_to_metadata.items()):
        loader = PDFPlumberLoader(pdf_file_path)
        # Dump the per-file metadata once instead of once per loaded document.
        extra_metadata = pdf_metadata.model_dump()
        for doc in loader.load():
            doc.metadata.update(extra_metadata)
            got_by_plumber_documents.append(doc)

    print("Got docs! \nStarting storage creating.")
    plumber_storage = Chroma.from_documents(
        documents=got_by_plumber_documents,
        embedding=embedding,
        collection_name=LP_PLUMBER_COLLECTION_NAME,
        persist_directory=LP_PLUMBER_CACHE_DIR,
    )
    # This branch creates the store, so report creation (same wording as the
    # Docling cell) rather than claiming a cache was loaded.
    print("Finished storage creating.")

Trying to load existing pipeline cache.
Plumber doc cache loaded.


### Test Searching

In [8]:
# Smoke test of the Docling store: run a device-filtered similarity search and
# show the metadata of the top hit.
lifepak_15_hits = docling_storage.similarity_search(
    "How do I troubleshoot low volume on Lifepak 15?",
    k=2,
    filter={"device": DeviceEnum.lifepak_15.value},
)
top_hit = lifepak_15_hits[0]
top_hit.metadata

{'device': 'lifepak 15',
 'page': '226',
 'pdf_title': 'Stryker_Physio_Control_LIFEPAK_15_Monitor_Defibrillator_3314911_036_202105.pdf',
 'source': '/home/tikhon/PycharmProjects/RAGapp/data/LIFEPAK 15/Stryker_Physio_Control_LIFEPAK_15_Monitor_Defibrillator_3314911_036_202105.pdf'}

In [10]:
# Agent under test. Note that it retrieves from the PDFPlumber store, not from
# the Docling store queried in the previous cell.
# NOTE(review): presumably `rag_model_name` is the answering LLM and
# `device_model_name` the LLM that classifies the device -- confirm in MedTechAgent.
med_agent = MedTechAgent(
    vector_storage=plumber_storage,
    rag_model_name=TOGETHER_META_LLAMA_70B_FREE,
    device_model_name=GROQ_GEMMA_9B,
)

In [11]:
# Ask the agent a question, passing `k` = 5 alongside it.
question = "How do I troubleshoot low volume on LP 20?"
inputs = dict(question=question, k=5)
response = med_agent.run(**inputs)

# Use the retrieved answer when there is one; otherwise fall back to the
# device classifier's reasoning.
res = (
    response.retrieval_result.answer
    if response.retrieval_result is not None
    else response.device_classification.reasoning
)

In [12]:
# Display the answer text, or the device classifier's reasoning when the
# response carried no retrieval result.
res

'To troubleshoot low volume on LP 20, you can try adjusting the pulse tone volume by highlighting and selecting SPO2 on the home screen, then selecting SPO2 VOLUME, and rotating the Speed Dial to the desired volume. Press the Speed Dial to set the volume.'

In [15]:
# Sources backing the answer. `retrieval_result` can be None (the cell that
# computes `res` already handles that case), so guard the attribute access
# instead of failing with AttributeError; no retrieval means no sources.
(
    response.retrieval_result.sources
    if response.retrieval_result is not None
    else []
)

[SourceDocument(title='Stryker_Physio_Control_Lifepak_20_Defibrillator_Monitor_3200750_039_201404_eq00.pdf', pages=[50])]