In [85]:
# One-time setup: uncomment and run to install the dependencies into the kernel's environment.
# %pip install -qU "langchain[google-genai]"
# %pip install langchain_community
# %pip install pypdf
# %pip install sentence-transformers
# %pip install faiss-cpu

# Gemini API Call

In [89]:
import os
from dotenv import load_dotenv

from langchain_google_genai import ChatGoogleGenerativeAI

# Load variables from a local .env file (if present) into the process environment.
load_dotenv()

# os.environ only accepts str values, so re-assigning os.getenv(...) blows up with an
# opaque TypeError when the key is missing. Check explicitly and fail with a clear message.
if not os.getenv("GOOGLE_API_KEY"):
    raise EnvironmentError(
        "GOOGLE_API_KEY is not set. Add it to your .env file or export it before running this notebook."
    )

# Chat model used for the smoke test below and for the RAG chain at the end of the notebook.
model = ChatGoogleGenerativeAI(model="gemini-2.5-flash", max_retries=2)

# Smoke test: confirms the API key and model name are valid.
model.invoke("What is the capital of India?")

AIMessage(content='The capital of India is **New Delhi**.', additional_kwargs={}, response_metadata={'prompt_feedback': {'block_reason': 0, 'safety_ratings': []}, 'finish_reason': 'STOP', 'model_name': 'gemini-2.5-flash', 'safety_ratings': []}, id='run--45fbe569-c984-4485-a91c-a4081134693b-0', usage_metadata={'input_tokens': 8, 'output_tokens': 26, 'total_tokens': 34, 'input_token_details': {'cache_read': 0}, 'output_token_details': {'reasoning': 17}})

# Load Document

In [87]:
# Load the holiday-calendar PDF from the notebook's working directory.
from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader

loader = DirectoryLoader('./', glob="./ABI GCC - Holiday Calendar 2024.pdf", loader_cls=PyPDFLoader)
docs = loader.load()

# The loader can return an empty list when the glob matches nothing; stop here with a
# clear message instead of failing later when the vector store is built.
if not docs:
    raise FileNotFoundError(
        "No documents were loaded: 'ABI GCC - Holiday Calendar 2024.pdf' was not found in the current directory."
    )

# Document Chunking
LangChain's text splitter breaks the document into chunks. `chunk_size=100` means each chunk contains at most 100 characters. `chunk_overlap=10` means consecutive chunks may share up to 10 characters, which helps retain context and semantic continuity across chunk boundaries.
In practice, a chunk size of 500-2000 characters is more common; a small value is used here because the source document is a short table.

The recursive strategy makes the chunking smarter. Instead of cutting the text at exact character counts, it tries a sequence of separators in order: double newlines (`\n\n`), single newlines (`\n`), spaces (` `) and finally individual characters (`""`). This preserves paragraphs, sentences and words wherever possible, which makes the chunks more meaningful.

In [88]:
# Break the loaded pages into small, slightly overlapping chunks.
from langchain.text_splitter import RecursiveCharacterTextSplitter

# 100-character chunks with a 10-character overlap between consecutive chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=10)

texts = text_splitter.split_documents(docs)
texts

[Document(metadata={'producer': 'Microsoft® Word for Microsoft 365', 'creator': 'Microsoft® Word for Microsoft 365', 'creationdate': '2023-12-15T16:38:18+05:30', 'author': 'DUTTA SUBHASREE', 'moddate': '2023-12-15T16:38:18+05:30', 'source': 'ABI GCC - Holiday Calendar 2024.pdf', 'total_pages': 1, 'page': 0, 'page_label': '1'}, page_content='ABI GCC Holiday Calendar - Year 2024 (5 fixed holidays) \nMonth Date Day Holiday Comment'),
 Document(metadata={'producer': 'Microsoft® Word for Microsoft 365', 'creator': 'Microsoft® Word for Microsoft 365', 'creationdate': '2023-12-15T16:38:18+05:30', 'author': 'DUTTA SUBHASREE', 'moddate': '2023-12-15T16:38:18+05:30', 'source': 'ABI GCC - Holiday Calendar 2024.pdf', 'total_pages': 1, 'page': 0, 'page_label': '1'}, page_content='January 26-01-24 Friday Republic Day Statutory Holiday'),
 Document(metadata={'producer': 'Microsoft® Word for Microsoft 365', 'creator': 'Microsoft® Word for Microsoft 365', 'creationdate': '2023-12-15T16:38:18+05:30', 'a

# Generate Embeddings and store in Vector Store

In [None]:
# Embed every chunk and index the vectors in an in-memory FAISS store.

# The `langchain.embeddings` / `langchain.vectorstores` paths are deprecated re-exports;
# the implementations live in langchain_community, which this notebook already uses.
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

# NOTE: the previous `os.environ["HUGGINGFACEHUB_API_TOKEN"] = os.getenv(...)` line was removed.
# It was a no-op when the token was set and raised TypeError when it was not, and nothing in
# this cell passes the token explicitly. If a gated model is used later, export
# HUGGINGFACEHUB_API_TOKEN (or put it in .env) before starting the kernel.

db = FAISS.from_documents(
    texts,
    HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5"),
)

In [93]:
# Sanity check: query the vector store directly and show the text of the first hit.
query = "What holidays are on 15th Aug?"
relevant_doc = db.similarity_search(query)

top_hit = relevant_doc[0]
print(top_hit.page_content)

August 15-08-24 Thursday Independence Day Statutory Holiday


# Define Retriever
The retriever fetches documents from the vector store created above, ranked by similarity to the query. Here it returns the 5 most relevant chunks.

In [97]:
# Wrap the FAISS store as a retriever that returns the 5 most similar chunks per query.
retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 5})

# `get_relevant_documents` is deprecated; retrievers are Runnables, so `invoke` is the
# supported entry point and returns the same list of Documents.
retriever.invoke(query)

[Document(id='9f245dde-f330-4dcf-b141-c2b85f5418a4', metadata={'producer': 'Microsoft® Word for Microsoft 365', 'creator': 'Microsoft® Word for Microsoft 365', 'creationdate': '2023-12-15T16:38:18+05:30', 'author': 'DUTTA SUBHASREE', 'moddate': '2023-12-15T16:38:18+05:30', 'source': 'ABI GCC - Holiday Calendar 2024.pdf', 'total_pages': 1, 'page': 0, 'page_label': '1'}, page_content='August 15-08-24 Thursday Independence Day Statutory Holiday'),
 Document(id='15bcda2c-bbe9-4d3f-a150-98af047abc2a', metadata={'producer': 'Microsoft® Word for Microsoft 365', 'creator': 'Microsoft® Word for Microsoft 365', 'creationdate': '2023-12-15T16:38:18+05:30', 'author': 'DUTTA SUBHASREE', 'moddate': '2023-12-15T16:38:18+05:30', 'source': 'ABI GCC - Holiday Calendar 2024.pdf', 'total_pages': 1, 'page': 0, 'page_label': '1'}, page_content='August 19-08-24 Monday Raksha Bandhan Optional Holiday'),
 Document(id='43aeda0c-29c5-471b-a65b-e34a319634b4', metadata={'producer': 'Microsoft® Word for Microsoft 3

# Structured Prompt Template
- `prompt_template` contains the instructions the LLM must follow.
- `{context}` is the set of relevant chunks retrieved from the vector store by similarity.
- `{question}` is the user's question.
- `{author_name}` is the writer or poet whose style the answer should imitate.

In [98]:
from langchain_core.prompts import PromptTemplate

# Instruction block sent to the LLM. The three placeholders are filled in at invoke time:
#   {author_name} - writer/poet whose style the answer should imitate
#   {context}     - text of the chunks retrieved from the vector store
#   {question}    - the user's question
prompt_template = (
    "Based on provided context, respond to the question below while adhering to these guidelines:\n"
    "1. If the answer is unclear or not found, do not speculate. Instead, state, \"I do not know the answer\"\n"
    "2. If the answer is found provide a response in {author_name} writer or poet's style of writing.\n"
    "\n"
    "Context: {context}\n"
    "\n"
    "Question: {question}\n"
)

prompt = PromptTemplate(
    input_variables=["author_name", "context", "question"],
    template=prompt_template,
)

# Complete RAG Pipeline execution

In [None]:
# End-to-end RAG run: retrieve -> build context -> prompt -> LLM.
human_query = "When is Independence Day?"
poet_name = "Agatha Christie"

# Retrieve the chunks most similar to the question. `invoke` replaces the deprecated
# `get_relevant_documents` and returns the same list of Documents.
relevant_doc = retriever.invoke(human_query)

# Concatenate the chunk texts, separated by blank lines, into one context string.
context = "\n\n".join(doc.page_content for doc in relevant_doc)

# Pipe the filled-in prompt into the chat model and generate the answer.
chain = prompt | model
result = chain.invoke({
    "question": human_query,
    "context": context,
    "author_name": poet_name,
})
result.content

'Indeed, the calendar, much like a well-kept ledger, reveals its secrets quite plainly. Independence Day, a statutory holiday, is clearly marked for the fifteenth of August, two thousand and twenty-four.'