In [1]:

from langchain.embeddings import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain_community.retrievers.bm25 import BM25Retriever
from langchain.retrievers.merger_retriever import MergerRetriever
from langchain_community.cross_encoders import HuggingFaceCrossEncoder
from langchain.retrievers.document_compressors import CrossEncoderReranker
from langchain.retrievers import ContextualCompressionRetriever
from langchain.prompts import PromptTemplate
from langchain.schema import Document
import langchain_community.retrievers as retrievers
import requests
import langextract as lx
from langextract import data as lx_data
from concurrent.futures import ThreadPoolExecutor, as_completed
import json
#from langchain_community.retrievers import RedundancyFilterRetriever
import chromadb
import os


In [None]:
# Load PDF files from the data directory
data_folder = "./data"
pdf_files = [
    os.path.join(data_folder, f)
    for f in os.listdir(data_folder)
    if f.lower().endswith(".pdf")
]

In [None]:
# Configuration for the vector database and embedding
db_path = "./chroma_db"  # Persistent Chroma database directory
embedding_model = HuggingFaceEmbeddings(model_name="emilyalsentzer/Bio_ClinicalBERT")  # Clinical embedding model

# Initialize persistent Chroma client to access collections
client = chromadb.PersistentClient(path=db_path)

# Collect collection names based on PDF filenames (without extensions)
collection_names = [os.path.splitext(os.path.basename(p))[0] for p in pdf_files]

  embedding_model = HuggingFaceEmbeddings(model_name="emilyalsentzer/Bio_ClinicalBERT")


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'




No sentence-transformers model found with name emilyalsentzer/Bio_ClinicalBERT. Creating a new one with mean pooling.


In [None]:
# Groq API settings for LLM calls
GROQ_API_URL = "https://api.groq.com/openai/v1/chat/completions"
GROQ_API_KEY = "your-api-key"  # Secure your API key appropriately
MODEL = "meta-llama/llama-4-scout-17b-16e-instruct"


In [None]:
# Prompt template to guide LLM responses specifically for healthcare context
prompt_template = """
You are a helpful, precise, and compassionate healthcare assistant. ONLY use the medical context provided below to answer the question accurately and completely.

If the information needed to answer the question is NOT present in the context, reply with "I don't know." Do NOT guess or add any information that is not explicitly stated in the context.

If you don't know the exact answer, you may provide any relevant information related to the question that is contained within the context to help guide the user.

When asked about multiple documents, provide a clear, structured summary or overview of each document separately, including key information such as document type, patient(s) involved, dates, diagnoses, and other important content.

Read the context carefully and synthesize a concise but comprehensive answer addressing the question. Avoid repeating information verbatim and focus on clarity and helpfulness.

After providing the answer, suggest 3 relevant follow-up questions that someone might ask based on the same document to help explore related information. Format these suggestions as a numbered list prefixed by:

"Related questions you might ask:"

Medical Context:
{context}

Question:
{question}

Answer:

---

Related questions you might ask:
1.
2.
3.

"""


In [None]:
# Create a LangChain prompt template instance with context and question as input variables
PROMPT = PromptTemplate(template=prompt_template, input_variables=["context", "question"])

In [None]:
# Function to get response from the Groq LLM API given the formatted prompt
def get_llama4_response(formatted_prompt):
    """
    Calls Groq API with the prompt and returns the LLM response

    Args:
        formatted_prompt (str): The full prompt including question and medical context.

    Returns:
        str: The response generated by the LLM.
    """
    headers = {
        "Authorization": f"Bearer {GROQ_API_KEY}",
        "Content-Type": "application/json"
    }
    data = {
        "model": MODEL,
        "messages": [
            {"role": "system", "content": "You are a helpful healthcare assistant."},
            {"role": "user", "content": formatted_prompt}
        ],
        "max_tokens": 5000,
        "temperature": 0.3  # Low temperature for focused, precise answers
    }
    response = requests.post(GROQ_API_URL, json=data, headers=headers)
    response.raise_for_status()
    result = response.json()
    return result["choices"][0]["message"]["content"].strip()

In [8]:
query = input("Enter the question? ")



In [None]:
all_top_chunks = []  # Initialize an empty list to collect the top relevant document chunks from all document collections

# Initialize the cross-encoder model for reranking the relevance of document chunks.
# Cross-encoders jointly score the query and document pairs for better fine-grained relevance estimation.
cross_encoder = HuggingFaceCrossEncoder(model_name="BAAI/bge-reranker-base")

# Wrap the cross-encoder in a reranker utility that applies reranking and limits results to top 12 documents.
# This helps trim down retrieved documents to the most relevant subset.
reranker = CrossEncoderReranker(model=cross_encoder, top_n=12)

# Loop over each document collection stored in the Chroma vector store.
# Each collection corresponds to one PDF document indexed separately.
for doc_name in collection_names:
    # Load the Chroma embedding retriever for this collection.
    # Configure it to perform MMR (Maximal Marginal Relevance) search to promote diverse, relevant retrieval.
    db_doc = Chroma(collection_name=doc_name, persist_directory=db_path, embedding_function=embedding_model)
    dense_retriever = db_doc.as_retriever(search_type="mmr", search_kwargs={"k": 30, "fetch_k": 50})

    # Fetch all raw documents and their metadata from the current Chroma collection.
    # We'll use these to construct a BM25 retriever - a sparse text matching method.
    all_docs = db_doc.get(include=["documents", "metadatas"])

    # Convert retrieved raw documents and metadata into LangChain Document objects.
    bm25_docs = [Document(page_content=text, metadata=metadata) for text, metadata in zip(all_docs["documents"], all_docs["metadatas"])]

    # Initialize the BM25 retriever from these documents.
    # BM25 is a classic keyword-based retrieval technique that complements dense vector retrieval.
    bm25_retriever = BM25Retriever.from_documents(bm25_docs)
    bm25_retriever.k = 30  # Set BM25 to retrieve top 30 documents by relevance

    # Create an ensemble retriever that merges results from both dense retriever and BM25 retriever.
    # This leverages both semantic embeddings and keyword matching for better coverage.
    ensemble_retriever = MergerRetriever(retrievers=[dense_retriever, bm25_retriever])

    # Use the ensemble retriever to fetch candidate document chunks relevant to the user query.
    # These candidates come from merging dense and sparse retrieval outputs.
    candidate_docs = ensemble_retriever.get_relevant_documents(query)

    # Wrap the ensemble retriever results with a ContextualCompressionRetriever.
    # This component compresses and reranks documents using the cross-encoder reranker for precision.
    # The idea is to remove redundancy and focus on the most relevant slices of text.
    final_retriever = ContextualCompressionRetriever(base_compressor=reranker, base_retriever=ensemble_retriever)

    # Retrieve compressed and reranked relevant documents, keeping only the top 15 for efficiency.
    compressed_docs = final_retriever.get_relevant_documents(query)
    top_compressed = compressed_docs[:15]

    # From the top compressed results, extract key metadata and content and add them to the global accumulator.
    # This prepares a combined set of best candidate chunks from all collections for downstream processing.
    all_top_chunks.extend([
        (
            doc.metadata.get("source", doc_name),               # Prefer extracted 'source' metadata; fallback to collection name
            doc.metadata.get("patient_name", "unknown"),        # Prefer patient name metadata if present
            doc.page_content.strip()                             # Actual textual content of the chunk, stripped of excess whitespace
        )
        for doc in top_compressed
    ])

# Finally, after processing all collections, report how many top relevant chunks were accumulated globally.
print(f"Total reranked top chunks aggregated: {len(all_top_chunks)}")


  candidate_docs = ensemble_retriever.get_relevant_documents(query)


Total reranked top chunks aggregated: 48


In [10]:
print("\n--- User Question ---\n")
print(query)


--- User Question ---

what is the history of patient name john doe


In [None]:
# Convert aggregated tuples into Document objects for further processing
all_docs = [Document(page_content=text, metadata={"source": source, "patient_name": patient})
            for source, patient, text in all_top_chunks]

In [None]:
# Remove duplicate document chunks based on content to avoid redundancy
unique_texts = set()
unique_docs = []
for doc in all_docs:
    text = doc.page_content.strip()
    if text not in unique_texts:
        unique_docs.append(doc)
        unique_texts.add(text)

In [None]:
# Print snippets of top unique chunks for inspection
print(f"Top {len(unique_docs)} unique retrieved chunks:")
for i, doc in enumerate(unique_docs, 1):
    source = doc.metadata.get("source", "unknown")
    patient = doc.metadata.get("patient_name", "unknown")
    snippet = doc.page_content[:500].replace("\n", " ")
    print(f"[Chunk {i}] Source: {source} Patient: {patient}")
    print(f"Content snippet: {snippet}\n{'-'*80}")

Top 34 unique retrieved chunks:
[Chunk 1] Source: discharge.pdf Patient: John Doe Medical Record Number: 123456789 Date admitted: July 1, 2050 Date discharged: July 3, 2050 Attending Physician: Dr. Will Teachwell Resident Physician: Dr. Bea Goodoc Diagnosis: Left-sided systolic congestive heart failure EF of 35% on echo performed 3 months ago Other diagnoses: Type 2 Diabetes Stage 2 Hypertension Osteoarthritis
Content snippet: Patient name: John Doe Medical Record Number: 123456789 Date admitted: July 1, 2050 Date discharged: July 3, 2050 Attending Physician: Dr. Will Teachwell Resident Physician: Dr. Bea Goodoc Diagnosis: Left-sided systolic congestive heart failure EF of 35% on echo performed 3 months ago Other diagnoses: Type 2 Diabetes Stage 2 Hypertension Osteoarthritis
--------------------------------------------------------------------------------
[Chunk 2] Source: discharge.pdf Patient: unknown
Content snippet: Mr. Doe is a 72 year old gentleman with a history of CHF who presen

In [14]:
cross_encoder = HuggingFaceCrossEncoder(model_name="BAAI/bge-reranker-base")

# Create a list of (query, document_text) pairs for each unique document chunk
# This prepares inputs for the cross-encoder which scores the relevance of each chunk to the query
pairs = [(query, doc.page_content) for doc in unique_docs]

# Use the cross-encoder to score each (query, document) pair
# The output 'scores' is a list of relevance scores, one per document chunk
scores = cross_encoder.score(pairs)

# Sort the documents in descending order based on their relevance scores
# Pair each score with its corresponding document, sort by score descending, then extract documents
reranked_docs = [doc for _, doc in sorted(zip(scores, unique_docs), key=lambda x: x[0], reverse=True)]

# Select the top n most relevant document chunks after reranking
top_n_docs = reranked_docs[:25]

In [None]:
# Group final top chunks by their source document for structuring context to pass to LLM
context_by_doc = {}
for doc in top_n_docs:
    source = doc.metadata.get("source", "unknown")
    patient = doc.metadata.get("patient_name", "unknown")
    context_by_doc.setdefault(source, []).append(f"Patient: {patient}\n{doc.page_content.strip()}")

# Construct final context string combining all top chunks grouped by document source
context_text = "\n\n".join([
    f"Source: {source}\n" + "\n---\n".join(chunk_texts)
    for source, chunk_texts in context_by_doc.items()
])

In [None]:
# Format prompt for LLM with question and structured context
final_prompt = PROMPT.format(context=context_text, question=query)

In [None]:
# Show the exact prompt sent to the LLM for transparency/debugging
print("\n--- Final Prompt Sent to LLM ---\n")
print(final_prompt)


--- Final Prompt Sent to LLM ---


You are a helpful, precise, and compassionate healthcare assistant. ONLY use the medical context provided below to answer the question accurately and completely.

If the information needed to answer the question is NOT present in the context, reply with "I don't know." Do NOT guess or add any information that is not explicitly stated in the context.

If you don't know the exact answer, you may provide any relevant information related to the question that is contained within the context to help guide the user.

When asked about multiple documents, provide a clear, structured summary or overview of each document separately, including key information such as document type, patient(s) involved, dates, diagnoses, and other important content.

Read the context carefully and synthesize a concise but comprehensive answer addressing the question. Avoid repeating information verbatim and focus on clarity and helpfulness.

After providing the answer, suggest 3 

In [None]:
# Call LLM through Groq API to get final answer and print it
print("\nGenerating answer from Groq API...\n--- Answer ---\n")
answer = get_llama4_response(final_prompt)
print(answer)


Generating answer from Groq API...
--- Answer ---

The history of patient John Doe includes a 72-year-old gentleman with a history of congestive heart failure (CHF), Type 2 Diabetes, Stage 2 Hypertension, and Osteoarthritis. He presented with a 3-day history of gradually worsening lower extremity edema, weight gain, and shortness of breath. He reported good understanding and adherence to his medications but mentioned a recent dietary indiscretion, consuming several baskets of chips and salsa, which likely led to his exacerbation.

He was admitted on July 1, 2050, and discharged on July 3, 2050, under the care of Dr. Will Teachwell and Resident Physician Dr. Bea Goodoc. His diagnosis included left-sided systolic congestive heart failure with an ejection fraction (EF) of 35% on an echo performed 3 months ago.

Related questions you might ask:
1. What are the specific dietary recommendations provided to John Doe to manage his CHF and other conditions?
2. How will John Doe's volume status