In [None]:
# Import necessary modules from LangChain, ChromaDB, and Python standard library
from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain_experimental.text_splitter import SemanticChunker
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain_community.vectorstores.utils import filter_complex_metadata
import os
import re
import chromadb

In [None]:
# Embedding model shared by the semantic chunker and the Chroma index.
# The checkpoint is a clinical-domain BERT; the recorded cell output reports
# that it is not a native sentence-transformers model, so a mean-pooling
# wrapper is created around it when it is loaded.
EMBEDDING_MODEL_NAME = "emilyalsentzer/Bio_ClinicalBERT"

embedding_model = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

  embedding_model = HuggingFaceEmbeddings(


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'




No sentence-transformers model found with name emilyalsentzer/Bio_ClinicalBERT. Creating a new one with mean pooling.


In [None]:
# Directory where PDF files are stored
data_folder = "./data"

# Collect every PDF that sits directly inside the data folder.
# - os.listdir returns entries in arbitrary order, so the listing is sorted to
#   make the ingestion order reproducible across runs and platforms.
# - os.path.isfile skips directories whose names happen to end in ".pdf".
# Paths are still built with os.path.join(data_folder, name), so the resulting
# strings have the same form as before.
pdf_files = [
    os.path.join(data_folder, name)
    for name in sorted(os.listdir(data_folder))
    if name.lower().endswith(".pdf")
    and os.path.isfile(os.path.join(data_folder, name))
]


In [4]:

# Pre-flight pass: announce each discovered PDF and fail fast if one of them
# has disappeared since the directory listing was taken.
for path in pdf_files:
    print(f"Ingesting file: {path}")

    file_is_present = os.path.exists(path)
    if file_is_present:
        continue
    raise FileNotFoundError(f"PDF file not found: {path}")

Ingesting file: ./data\discharge.pdf
Ingesting file: ./data\general-discharge.pdf
Ingesting file: ./data\history.pdf
Ingesting file: ./data\mri.pdf


In [None]:
def extract_patient_name(text: str) -> str:
    """Return the patient name that follows a "Patient name:" label in ``text``.

    The label is matched case-insensitively. The name may be on the same line
    as the label or on the first non-empty line after it; the captured value
    is the rest of that line with surrounding whitespace removed.

    Args:
        text: Raw text to search (for example the content of a PDF element).

    Returns:
        The stripped name, or the string ``"unknown"`` when the label is
        absent, the input is empty, or the label is followed only by blanks.
    """
    if not text:
        return "unknown"

    match = re.search(r"Patient name:\s*\n*\s*([^\n]+)", text, re.IGNORECASE)
    if not match:
        return "unknown"

    # When only blanks follow the label, the regex backtracks and captures a
    # single space; stripping that gives "", which must not be treated as a
    # real name.
    name = match.group(1).strip()
    return name or "unknown"

# Accumulates the chunks produced from every PDF (before metadata filtering)
all_chunks = []

# The splitter is configured only from the embedding model and the buffer size,
# both of which are loop-invariant, so it is built once rather than once per PDF.
splitter = SemanticChunker(embeddings=embedding_model, buffer_size=20)

# Ingestion pipeline, run once per PDF: load -> tag metadata -> chunk -> store
for path in pdf_files:
    print(f"Ingesting file: {path}")  # Print which file is being processed

    # Fail fast if the file disappeared after the directory listing was taken
    if not os.path.exists(path):
        raise FileNotFoundError(f"PDF file not found: {path}")

    source_name = os.path.basename(path)

    # "elements" mode yields one Document per layout element; "hi_res" is the
    # layout-aware partitioning strategy.
    loader = UnstructuredPDFLoader(path, mode="elements", strategy="hi_res")
    docs = loader.load()

    # In "elements" mode the "Patient name:" label usually lives in a single
    # element (or the label and the value are split across two elements), so a
    # per-element lookup alone leaves almost every element tagged "unknown".
    # Resolve a file-level name from the concatenated text and use it as the
    # fallback for elements that do not carry a name themselves.
    file_patient_name = (
        extract_patient_name("\n".join(doc.page_content for doc in docs))
        or "unknown"
    )

    for doc in docs:
        doc.metadata["source"] = source_name
        element_patient_name = extract_patient_name(doc.page_content)
        if element_patient_name in ("", "unknown"):
            element_patient_name = file_patient_name
        doc.metadata["patient_name"] = element_patient_name

    # Print a summary of loaded documents for this file.
    # NOTE(review): this writes patient names and up to 500 characters of
    # clinical text per element into the saved notebook output; clear the
    # outputs before sharing or committing the notebook.
    print(f"\nLoaded {len(docs)} documents from {source_name}:\n")
    for i, doc in enumerate(docs, 1):
        source = doc.metadata.get("source", "unknown")
        patient = doc.metadata.get("patient_name", "unknown")
        snippet = doc.page_content[:500].replace("\n", " ")  # First 500 chars as snippet
        print(f"Document {i}:")
        print(f"  Source: {source}")
        print(f"  Patient Name: {patient}")
        print(f"  Content snippet: {snippet}\n{'-'*60}")

    # Split the elements into semantically coherent chunks
    chunks = splitter.split_documents(docs)
    print(f"Split into {len(chunks)} semantically-coherent chunks.")
    print(f"Completed chunking for {path}")

    # Add the chunks from this file to the list accumulating all chunks
    all_chunks.extend(chunks)

    # Drop metadata values the vector store cannot persist. The filtered list
    # is what must be stored; previously it was computed and then ignored in
    # favour of the unfiltered `chunks`.
    filtered_chunks = filter_complex_metadata(chunks)
    if not filtered_chunks:
        # Nothing to embed for this file; skip storage rather than handing the
        # vector store an empty batch.
        print(f"No chunks to store for {path}; skipping Chroma storage")
        continue

    # Use the PDF filename without extension as the Chroma collection name
    doc_name = os.path.splitext(source_name)[0]

    # Deterministic ids make a re-run of this cell overwrite the entries
    # written by the previous run instead of appending duplicates to the
    # persisted collection (relies on the store upserting by id).
    # NOTE(review): if a later run yields fewer chunks, higher-numbered entries
    # from the earlier run remain in the collection.
    chunk_ids = [f"{doc_name}-{index}" for index in range(len(filtered_chunks))]

    # Store chunks into a Chroma collection, embedding them with the clinical model
    db = Chroma.from_documents(
        documents=filtered_chunks,
        embedding=embedding_model,
        ids=chunk_ids,
        collection_name=doc_name,
        persist_directory="./chroma_db",  # Directory to persist the Chroma database
    )
    print(f"Completed storage to Chroma for {path}")



Ingesting file: ./data\discharge.pdf


Loaded 54 document elements from ./data\discharge.pdf
Split into 54 semantically-coherent chunks.
Completed chunking for ./data\discharge.pdf
Total chunks from all PDFs: 54
Completed storage to Chroma for ./data\discharge.pdf
Ingesting file: ./data\general-discharge.pdf


Cannot set gray non-stroke color because /'P32' is an invalid float value
Cannot set gray non-stroke color because /'P40' is an invalid float value
Cannot set gray non-stroke color because /'P53' is an invalid float value
Cannot set gray non-stroke color because /'P71' is an invalid float value
Cannot set gray non-stroke color because /'P79' is an invalid float value
Cannot set gray non-stroke color because /'P87' is an invalid float value
Cannot set gray non-stroke color because /'P99' is an invalid float value
Cannot set gray non-stroke color because /'P107' is an invalid float value
Cannot set gray non-stroke color because /'P115' is an invalid float value




Cannot set gray non-stroke color because /'P32' is an invalid float value
Cannot set gray non-stroke color because /'P40' is an invalid float value
Cannot set gray non-stroke color because /'P53' is an invalid float value
Cannot set gray non-stroke color because /'P71' is an invalid float value
Cannot set gray non-stroke color because /'P79' is an invalid float value
Cannot set gray non-stroke color because /'P87' is an invalid float value
Cannot set gray non-stroke color because /'P99' is an invalid float value
Cannot set gray non-stroke color because /'P107' is an invalid float value
Cannot set gray non-stroke color because /'P115' is an invalid float value


Loaded 81 document elements from ./data\general-discharge.pdf
Split into 82 semantically-coherent chunks.
Completed chunking for ./data\general-discharge.pdf
Total chunks from all PDFs: 136
Completed storage to Chroma for ./data\general-discharge.pdf
Ingesting file: ./data\history.pdf
Loaded 135 document elements from ./data\history.pdf
Split into 136 semantically-coherent chunks.
Completed chunking for ./data\history.pdf
Total chunks from all PDFs: 272
Completed storage to Chroma for ./data\history.pdf
Ingesting file: ./data\mri.pdf
Loaded 110 document elements from ./data\mri.pdf
Split into 110 semantically-coherent chunks.
Completed chunking for ./data\mri.pdf
Total chunks from all PDFs: 382
Completed storage to Chroma for ./data\mri.pdf


In [6]:

# Re-open the persisted Chroma database and show which collections it holds
client = chromadb.PersistentClient(path="./chroma_db")
stored_collections = client.list_collections()
print(stored_collections)

[Collection(name=history), Collection(name=general-discharge), Collection(name=discharge), Collection(name=mri)]
