In [36]:
import os
from langchain_community.document_loaders import PDFMinerLoader
from langchain_community.document_loaders.parsers import RapidOCRBlobParser
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain.retrievers import ParentDocumentRetriever, ContextualCompressionRetriever
from langchain.storage import InMemoryStore, create_kv_docstore, LocalFileStore
from langchain_community.document_compressors import FlashrankRerank
from langchain_mistralai import ChatMistralAI
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory

# -----------------------------
# 1. Load documents
# -----------------------------
def load_documents(pdf_path: str):
    """Load the PDF at ``pdf_path`` through ``PDFMinerLoader``.

    The loader is configured with ``mode="single"``, the ``"markdown-img"``
    inner format for images, and a ``RapidOCRBlobParser`` as the image parser.

    Args:
        pdf_path: Path of the PDF file to load.

    Returns:
        Whatever ``PDFMinerLoader.load()`` returns (the pipeline later hands
        this to the retriever's ``add_documents``).
    """
    ocr_parser = RapidOCRBlobParser()
    pdf_loader = PDFMinerLoader(
        pdf_path,
        mode="single",
        images_inner_format="markdown-img",
        images_parser=ocr_parser,
    )
    documents = pdf_loader.load()
    return documents

# -----------------------------
# 2. Create embeddings & text splitters
# -----------------------------
def get_embeddings():
    """Create the HuggingFace embedding model used by the vector store.

    Returns:
        HuggingFaceEmbeddings: wrapper around the
        ``sentence-transformers/all-MiniLM-L6-v2`` model.
    """
    model_name = "sentence-transformers/all-MiniLM-L6-v2"
    return HuggingFaceEmbeddings(model_name=model_name)


def get_splitters():
    """Create the two text splitters used for parent/child chunking.

    Returns:
        tuple: ``(parent_splitter, child_splitter)`` where the parent splitter
        uses ``chunk_size=2000`` and the child splitter ``chunk_size=400``.
    """
    return (
        RecursiveCharacterTextSplitter(chunk_size=2000),  # parent chunks
        RecursiveCharacterTextSplitter(chunk_size=400),  # child chunks
    )

# -----------------------------
# 3. Create / Load Vectorstore (persistent)
# -----------------------------
def get_vectorstore(persist_dir: str, embeddings):
    """Open the Chroma collection ``"split_parents"`` under ``persist_dir``.

    Args:
        persist_dir: Directory handed to Chroma as ``persist_directory``.
        embeddings: Embedding function used by the collection.

    Returns:
        Chroma: the vector store instance.
    """
    chroma_config = {
        "collection_name": "split_parents",
        "embedding_function": embeddings,
        "persist_directory": persist_dir,
    }
    return Chroma(**chroma_config)

# -----------------------------
# 4. Create retriever with compression
# -----------------------------
# def get_retriever(vectorstore, parent_splitter, child_splitter,persist_dir, docs=None):
#     store = InMemoryStore()  
#     retriever = ParentDocumentRetriever(
#         vectorstore=vectorstore,
#         docstore=store,
#         child_splitter=child_splitter,
#         parent_splitter=parent_splitter,
#         search_kwargs={"k": 10},
#     )
#     if docs:
#         retriever.add_documents(docs)
#     compressor = FlashrankRerank()
#     return ContextualCompressionRetriever(
#         base_compressor=compressor,
#         base_retriever=retriever,
#     )

from langchain.storage import LocalFileStore
from langchain.storage import create_kv_docstore

def get_retriever(vectorstore, parent_splitter, child_splitter, persist_dir, docs=None):
    """Build a reranking retriever on top of a parent-document retriever.

    Args:
        vectorstore: Vector store passed to ``ParentDocumentRetriever``.
        parent_splitter: Splitter passed as ``parent_splitter``.
        child_splitter: Splitter passed as ``child_splitter``.
        persist_dir: Directory backing the ``LocalFileStore`` docstore.
        docs: Optional documents to index; when falsy nothing is added and
            the retriever relies on what is already stored.

    Returns:
        ContextualCompressionRetriever: wraps the parent-document retriever
        (``k=10``) with a ``FlashrankRerank`` compressor.
    """
    # The file store gives persistence; create_kv_docstore wraps it so that
    # Document objects can be serialized into it.
    docstore = create_kv_docstore(LocalFileStore(persist_dir))

    parent_retriever = ParentDocumentRetriever(
        vectorstore=vectorstore,
        docstore=docstore,
        child_splitter=child_splitter,
        parent_splitter=parent_splitter,
        search_kwargs={"k": 10},
    )

    # Index only when the caller actually supplied documents.
    if docs:
        parent_retriever.add_documents(docs)

    return ContextualCompressionRetriever(
        base_compressor=FlashrankRerank(),
        base_retriever=parent_retriever,
    )
# -----------------------------
# 5. Setup LLM
# -----------------------------
def get_llm():
    """Create the Mistral chat model used to answer questions.

    The API key is taken from the ``MISTRAL_API_KEY`` environment variable
    instead of being embedded in the notebook, so the secret never ends up in
    version control or in shared notebook files.

    Returns:
        ChatMistralAI: client for the ``"mistral-small"`` model with
        ``temperature=0.7``.

    Raises:
        RuntimeError: if ``MISTRAL_API_KEY`` is unset or blank.
    """
    api_key = os.environ.get("MISTRAL_API_KEY", "").strip()
    if not api_key:
        raise RuntimeError(
            "MISTRAL_API_KEY is not set. Export it in the environment (or load it "
            "from a secrets store / getpass into os.environ) before calling get_llm()."
        )
    return ChatMistralAI(model="mistral-small", temperature=0.7)

# -----------------------------
# 6. Setup QA Chain
# -----------------------------
def get_qa_chain(llm, retriever):
    """Build a "stuff" RetrievalQA chain with a medical-domain prompt.

    Args:
        llm: Chat/LLM object used to generate the answer.
        retriever: Retriever supplying the context documents.

    Returns:
        A ``RetrievalQA`` chain that also returns its source documents.
    """
    # Imported locally: the notebook's import cell does not provide these two
    # names, so without this the function raises NameError on a fresh kernel.
    from langchain.chains import RetrievalQA
    from langchain.prompts import PromptTemplate

    prompt = PromptTemplate(
        input_variables=["context", "question"],
        template="""
Use the following context to answer the question in detail (medical domain):

Context:
{context}

Question:
{question}

Answer:
"""
    )
    return RetrievalQA.from_chain_type(
        llm=llm,
        retriever=retriever,
        chain_type="stuff",
        return_source_documents=True,
        chain_type_kwargs={"prompt": prompt},
    )

# -----------------------------
# 7. Query execution
# -----------------------------
def run_query(qa_chain, query: str):
    """Run ``query`` through ``qa_chain`` and print the answer and its sources.

    Args:
        qa_chain: Chain exposing ``invoke(inputs)``; a plain callable taking
            the same inputs dict is accepted as a fallback.
        query: The user question, sent under the ``"query"`` key.

    Returns:
        The mapping returned by the chain; it must contain ``"result"`` and
        ``"source_documents"`` (items exposing ``page_content``).
    """
    payload = {"query": query}
    # Calling a chain object directly is deprecated in LangChain; prefer the
    # ``invoke`` entry point and only fall back to a direct call for objects
    # that do not provide it.
    invoke = getattr(qa_chain, "invoke", None)
    result = invoke(payload) if callable(invoke) else qa_chain(payload)

    print("Answer:", result["result"])
    print("\nSource Chunks Used:")
    for src in result["source_documents"]:
        print("-", src.page_content, "...")
        print("*" * 80)
    return result

# -----------------------------
# Pipeline Runner with modes
# -----------------------------
def build_pipeline(pdf_path: str, persist_dir: str, query: str, init_mode: str = "auto"):
    """Assemble the retrieval-QA pipeline and answer ``query``.

    Args:
        pdf_path: PDF to index; only read when the index is (re)built.
        persist_dir: Directory used for both the Chroma store and the
            parent-document file store.
        query: Question to ask.
        init_mode: One of
            ``"build"`` - always load the PDF and add it to the index
            (NOTE: this presumably duplicates entries if the store is already
            populated - confirm before re-running against an existing store);
            ``"load"``  - use the persisted index as is;
            ``"auto"``  - build only when the vector store has no entries.

    Returns:
        The result mapping produced by ``run_query``.

    Raises:
        ValueError: if ``init_mode`` is not one of the supported modes.
    """
    valid_modes = ("build", "load", "auto")
    # Fail fast, before loading models: an unknown mode used to leave the
    # retriever unset and only blew up later while building the QA chain.
    if init_mode not in valid_modes:
        raise ValueError(
            f"Unknown init_mode {init_mode!r}; expected one of {', '.join(valid_modes)}."
        )

    embeddings = get_embeddings()
    parent_splitter, child_splitter = get_splitters()
    vectorstore = get_vectorstore(persist_dir, embeddings)

    if init_mode == "auto":
        # A single id is enough to tell whether anything was indexed, so do
        # not pull the whole collection into memory.
        already_indexed = bool(vectorstore.get(limit=1)["ids"])
        if already_indexed:
            print("Loading existing vectorstore...")
        else:
            print("No existing vectorstore found. Building new one...")
        needs_build = not already_indexed
    else:
        needs_build = init_mode == "build"

    docs = load_documents(pdf_path) if needs_build else None
    retriever = get_retriever(vectorstore, parent_splitter, child_splitter, persist_dir, docs)

    llm = get_llm()
    qa_chain = get_qa_chain(llm, retriever)
    return run_query(qa_chain, query)


# -----------------------------
# Example Run
# -----------------------------



In [37]:
if __name__ == "__main__":
    # Inputs for the example run.
    pdf_path = "/home/thiru/draft-oasis-e1-manual-04-28-2024_edited.pdf"
    persist_dir = "./chroma_store"
    query = "what is this document about?"

    # init_mode accepts "build", "load", or "auto"; "load" reuses the
    # persisted index without reading the PDF.
    results = build_pipeline(
        pdf_path=pdf_path,
        persist_dir=persist_dir,
        query=query,
        init_mode="load",
    )

INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cuda:0
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2
INFO:httpx:HTTP Request: POST https://api.mistral.ai/v1/chat/completions "HTTP/1.1 200 OK"


Answer: This document appears to be the Outcome and Assessment Information Set (OASIS-E1) manual for home health agencies (HHAs) that will be effective starting from January 1, 2025, issued by the Centers for Medicare & Medicaid Services (CMS). The manual provides guidance for HHAs on collecting and reporting high-quality OASIS data related to the care they provide to their patients.

The manual includes general data collection conventions and item-specific guidance, as well as links to resources for agencies. It also discusses the importance of data accuracy, OASIS data correction, and the implications for reporting. The manual is organized into several chapters and appendices, including a glossary and common acronyms, OASIS items, time points and uses, OASIS instruments, a description of changes from OASIS-E to OASIS-E1, references and resources, and OASIS and quality improvement.

Specifically, the excerpt provided includes information on the administrative requirements for electron

In [20]:
# Display the result dict returned by build_pipeline for inspection.
results

{'query': 'what is this document about?',
 'result': "This document appears to be the Outcome and Assessment Information Set (OASIS) Guidance Manual for home health agencies (HHAs) that provide services to patients under the Centers for Medicare & Medicaid Services (CMS). The manual is designed to help HHAs collect and report high-quality OASIS data, which includes standard data elements related to patient outcomes and assessments.\n\nThe manual includes an introduction to the OASIS system, the importance of data accuracy, and item-specific guidance for HHAs. The item-specific guidance is subdivided into sections and covers various aspects of patient assessment, such as hearing, speech, and vision.\n\nOne of the items in this manual is B0200: Hearing, which aims to identify the patient's ability to hear, including with assistive devices if they are used. Problems with hearing can contribute to sensory deprivation, social isolation, and mood and behavior disorders. Unaddressed communica