In [1]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain.llms import Ollama



In [2]:
def load_documents(path):
    """Read the PDF at ``path`` with PyPDFLoader and return what its ``load()`` produces."""
    return PyPDFLoader(path).load()

In [3]:
def split_documents(docs, chunk_size=1000, chunk_overlap=200):
    """Break ``docs`` into chunks with a RecursiveCharacterTextSplitter.

    ``chunk_size`` and ``chunk_overlap`` are forwarded unchanged to the
    splitter's constructor, and the result of the splitter's
    ``split_documents`` call is returned as-is.
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    chunker = RecursiveCharacterTextSplitter(**splitter_options)
    chunks = chunker.split_documents(docs)
    return chunks

In [4]:
def create_vectorstore(splits, model_name="sentence-transformers/all-MiniLM-L6-v2"):
    """Embed the document chunks and return a Chroma vector store.

    Args:
        splits: Document chunks to index; passed straight to
            ``Chroma.from_documents``.
        model_name: Name of the embedding model handed to
            ``HuggingFaceEmbeddings``. Defaults to the MiniLM model this
            function always used, so existing one-argument calls are unchanged.

    Returns:
        The store produced by ``Chroma.from_documents``.
    """
    embeddings = HuggingFaceEmbeddings(model_name=model_name)
    return Chroma.from_documents(splits, embedding=embeddings)

In [6]:
def build_rag_chain(retriever, model="llama3", temperature=0):
    """Build a RAG pipeline: Retriever → Prompt → LLM → Output.

    Args:
        retriever: Runnable piped into the document formatter to fill the
            ``{context}`` slot of the prompt.
        model: Model name passed to ``Ollama``. Defaults to ``"llama3"``.
        temperature: Temperature passed to ``Ollama``. Defaults to ``0``.

    Returns:
        The composed chain. The value it is invoked with is forwarded by
        ``RunnablePassthrough`` into the ``{question}`` slot, and the LLM
        output is passed through ``StrOutputParser``.
    """
    prompt = PromptTemplate.from_template("""
You are a helpful assistant. Use the following context to answer the question.

Context:
{context}

Question:
{question}

Answer:
""")
    llm = Ollama(model=model, temperature=temperature)

    def format_docs(docs):
        # Concatenate the retrieved chunks, separated by a blank line.
        return "\n\n".join(doc.page_content for doc in docs)

    # The leading dict fans the input out to both prompt slots.
    prompt_inputs = {
        "context": retriever | format_docs,
        "question": RunnablePassthrough(),
    }
    return prompt_inputs | prompt | llm | StrOutputParser()

In [7]:
def main(
    pdf_path="/Users/kryptonempyrean/Desktop/TS3043166.pdf",
    query="What is CLALIT and how is it involved in the document?",
    k=3,
):
    """Run the full pipeline on one PDF and print the question and the answer.

    Steps: load the PDF, split it into chunks, index the chunks in a vector
    store, build the RAG chain over a retriever, invoke it with ``query``.

    Args:
        pdf_path: Path of the PDF to index. The default is the machine-specific
            absolute path this function originally hard-coded; pass your own
            path when running anywhere else.
        query: Question to ask about the document.
        k: Value placed under ``"k"`` in the retriever's ``search_kwargs``.

    Returns:
        None. The query and the answer are written with ``print``.
    """
    docs = load_documents(pdf_path)
    splits = split_documents(docs)
    vectorstore = create_vectorstore(splits)
    retriever = vectorstore.as_retriever(search_kwargs={"k": k})
    rag_chain = build_rag_chain(retriever)
    response = rag_chain.invoke(query)

    print("🔎 Query:", query)
    print("\n🤖 Answer:\n", response)

# Entry point: run the pipeline when this code is executed directly.
# The guard is false when the code is imported as a module, so main() is not
# triggered by an import.
if __name__ == "__main__":
    main()

Ignoring wrong pointing object 169 0 (offset 0)
Ignoring wrong pointing object 178 0 (offset 0)
  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
  llm = Ollama(model="llama3", temperature=0)


🔎 Query: What is CLALIT and how is it involved in the document?

🤖 Answer:
 According to the context, CLALIT is Israel's largest provider of public and semi-private health services. It is a not-for-profit entity that runs under Israeli law. The dataset used for analysis was retrieved from the CLALIT Health Services Israeli database, which provides comprehensive records for randomly selected members who receive routine healthcare services.
