In [1]:
from dotenv import load_dotenv
import os

In [2]:
load_dotenv()

GOOGLE_API_KEY = os.environ['API_KEY']
os.environ['HF_HOME'] = 'D:\\Projects'

__DOCUMENT LOADING__

In [3]:
from langchain_community.document_loaders import PyPDFLoader

In [4]:
file_path = 'document/pdf_document.pdf'
loader = PyPDFLoader(file_path)

pages = []
for doc in loader.lazy_load():
    pages.append(doc)

In [5]:
len(pages)
# pages[127].page_content[:100]

258

__CHUNK SPLITTING__

In [6]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [30]:
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=350,
    length_function=len,
    is_separator_regex=False,
)

In [31]:
splits = splitter.split_documents(pages)

In [32]:
len(splits)

286

__EMBEDDING__

In [33]:
from langchain_huggingface import HuggingFaceEmbeddings

In [34]:
embeddings = HuggingFaceEmbeddings(
    model="BAAI/bge-small-en-v1.5"
)

In [35]:
from langchain_chroma import Chroma

In [36]:
vector_store = Chroma.from_documents(
    documents=splits,
    embedding=embeddings,
    persist_directory='./chroma-vectordb',
)

In [37]:
vector_store._collection.count()

1616

__RETRIEVAL PHASE__

In [38]:
retriever = vector_store.as_retriever(
    search_type='mmr',
    search_kwargs={'k':20, 'fetch_k':10}
)

In [39]:
def pretty_print_docs(docs):
    print(f"\n{'-' * 100}\n".join([f"Document {i+1}:\n\n" + d.page_content for i, d in enumerate(docs)]))

In [53]:
from langchain_community.retrievers import BM25Retriever
from langchain_core.prompts import PromptTemplate
from langchain_google_genai import GoogleGenerativeAI

In [54]:
llm = GoogleGenerativeAI(
    model="gemini-2.5-flash-lite",
    google_api_key=GOOGLE_API_KEY
)

template = """
You are a helpful AI agent. You have to answer questions based on given context. Try to give answers in 50-100 words. Use the context to generate the best possible answer for the question.
Question: {question}
Context : {context}

If the context is irrelevant to the question or it does not help you, write a one line apology to explain you don't have the required context.
"""

In [60]:
# question = input("question: ")
question = "who all published this book?"

In [70]:
relevant_splits = retriever.invoke(question)

retriever = BM25Retriever.from_documents(relevant_splits)

# pretty_print_docs(retriever.invoke(question))

context = retriever.invoke(question)

# pretty_print_docs(context)

prompt = PromptTemplate.from_template(template)
chain = prompt | llm
# print(chain.invoke({'question':question,'context':context}))