In [None]:
pip install -U langchain langchain-community langchain-chroma langchain-text-splitters chromadb pypdf

In [None]:
# langchain-classic supplies the `langchain_classic.chains` helpers imported in the next cell.
%pip install -U langchain-classic

In [None]:
import os
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_ollama import OllamaEmbeddings, ChatOllama
from langchain_chroma import Chroma
from langchain_classic.chains import create_retrieval_chain
from langchain_classic.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# -------------------- CONFIG --------------------
# Input PDF. Set the RAG_PDF_FILE environment variable to use a different file
# without editing the notebook; the original local path remains the fallback.
PDF_FILE = os.environ.get("RAG_PDF_FILE", "C:/Users/jaybr/Downloads/RAG.pdf")
DB_DIR = "./rag_db"               # passed to Chroma as persist_directory
EMBED_MODEL = "nomic-embed-text"  # model name given to OllamaEmbeddings
LLM_MODEL = "llama3.2"            # model name given to ChatOllama

# -------------------- LOAD PDF --------------------
# Read the PDF named by PDF_FILE into a list of LangChain documents.
loader = PyPDFLoader(file_path=PDF_FILE)
documents = loader.load()

# -------------------- SPLIT TEXT --------------------
# Separators are listed from coarsest (blank line) to finest (single space).
# Chunks are capped at 1500 characters, with 300 characters of overlap between neighbours.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ".", " "],
    chunk_overlap=300,
    chunk_size=1500,
)
chunks = text_splitter.split_documents(documents)

# -------------------- VECTOR STORE --------------------
embeddings = OllamaEmbeddings(model=EMBED_MODEL)

# Open the store persisted under DB_DIR and rebuild its collection on every run.
# Adding documents to a persisted collection appends to what is already there, so
# without the reset each re-run of this cell would store the same chunks again and
# similarity search would hand back duplicates.
vectorstore = Chroma(
    embedding_function=embeddings,
    persist_directory=DB_DIR,
)
vectorstore.reset_collection()
vectorstore.add_documents(chunks)

# -------------------- RETRIEVER --------------------
# Plain similarity search; each query asks the store for its 15 closest chunks.
retriever = vectorstore.as_retriever(
    search_kwargs={"k": 15},
    search_type="similarity",
)

# -------------------- LLM --------------------
# The retriever feeds up to 15 chunks of up to 1500 characters each into a single
# prompt. Ollama's default context window is only a few thousand tokens, and a prompt
# that exceeds it is truncated, so request a larger window explicitly.
# NOTE(review): 8192 is an estimate for ~22k characters of context plus the answer;
# raise it if answers appear to ignore retrieved text.
llm = ChatOllama(model=LLM_MODEL, temperature=0, num_ctx=8192)

# -------------------- PDF-ONLY PROMPT --------------------
# The system message restricts the model to the supplied context and gives it a fixed
# refusal sentence; {context} and {input} are the template's placeholders.
prompt = ChatPromptTemplate.from_messages([
    (
        "system",
        "\n".join([
            "You are a PDF question-answering assistant.",
            "Answer ONLY using the provided context below.",
            "Do NOT use any external knowledge or assumptions.",
            "If the answer is not in the context, say exactly:",
            "'I could not find the answer in the provided document.'",
            "",
            "Context:",
            "{context}",
        ]),
    ),
    ("human", "{input}"),
])

# -------------------- RAG CHAIN --------------------
# qa_chain pairs the LLM with the prompt; rag_chain puts the retriever in front of it.
qa_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)
rag_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=qa_chain,
)

# -------------------- QUERY --------------------
query = "Explain the results section in the PDF"

# The question is passed under the "input" key (matching the prompt's {input}
# placeholder); the generated text is read back from the "answer" key.
response = rag_chain.invoke({"input": query})

# The heading previously contained a mis-decoded emoji (UTF-8 bytes shown as cp1252).
print("\n🤖 AI Answer (strictly PDF only):")
print(response["answer"])


In [None]:
# Retrieval-only variant: embed the PDF and print the closest chunks without calling an LLM.
import os
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_ollama import OllamaEmbeddings
from langchain_chroma import Chroma

# ---------------- CONFIG ----------------
# Input PDF. Set the RAG_PDF_FILE environment variable to use a different file
# without editing the notebook; the original local path remains the fallback.
PDF_FILE = os.environ.get("RAG_PDF_FILE", "C:/Users/jaybr/Downloads/RAG.pdf")
DB_DIR = "./rag_db"               # passed to Chroma as persist_directory
EMBED_MODEL = "nomic-embed-text"  # model name given to OllamaEmbeddings

# ---------------- LOAD PDF ----------------
# Read the PDF named by PDF_FILE into a list of LangChain documents.
loader = PyPDFLoader(file_path=PDF_FILE)
documents = loader.load()

# ---------------- SPLIT TEXT ----------------
# Separators run from coarsest (blank line) to finest (single space); chunks are
# capped at 2000 characters with 300 characters of overlap.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,
    chunk_overlap=300,
    separators=["\n\n", "\n", ".", " "]
)
chunks = text_splitter.split_documents(documents)

# The message previously began with a mis-decoded check-mark emoji.
print(f"✅ Split PDF into {len(chunks)} chunks.")

# ---------------- VECTOR STORE ----------------
embeddings = OllamaEmbeddings(model=EMBED_MODEL)

# Use a dedicated collection so these 2000-character chunks are kept apart from
# anything else persisted under DB_DIR, and rebuild it on every run: adding to a
# persisted collection appends, so re-running the cell would otherwise store the
# same chunks again and the search below would print duplicates.
vectorstore = Chroma(
    collection_name="rag_retrieval_only",
    embedding_function=embeddings,
    persist_directory=DB_DIR,
)
vectorstore.reset_collection()
vectorstore.add_documents(chunks)

# ---------------- RETRIEVE (NO LLM) ----------------
query = "Explain the results described in the PDF"

# Ask the store directly for the 5 chunks closest to the query.
results = vectorstore.similarity_search(query, k=5)

# The heading previously contained a mis-decoded emoji (UTF-8 bytes shown as cp1252).
print("\n📚 Retrieved chunks (NO LLM):\n")
for i, doc in enumerate(results, 1):
    print(f"--- Chunk {i} ---")
    print(doc.page_content)
    print("-" * 80)
