In [None]:
pip install -U langchain-google-genai

In [1]:
import os
from dotenv import load_dotenv

# LangChain components
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
from langchain.vectorstores import FAISS
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.prompts import ChatPromptTemplate
from langchain.chains import create_retrieval_chain

# Load environment variables from .env file
load_dotenv()

# Read the Gemini API key and fail fast with a clear message when it is missing
api_key = os.getenv("GEMINI_API_KEY")
if not api_key:
    raise ValueError("Gemini API Key not found. Please set it in the .env file.")

# Set up the Gemini model for generation.
# The key is passed explicitly: it is read from GEMINI_API_KEY above, and without
# this argument the validated value would not be used by this constructor call.
llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash",
    google_api_key=api_key,
    temperature=0.5,
    max_tokens=500,
    timeout=None,
    max_retries=2,
)

In [2]:
# 1. PDF Processing and Text Extraction
# Build a loader for the source PDF and read it into a list of documents
loader = PyPDFLoader("RAG_TECHNIQUES/data/Understanding_Climate_Change.pdf")
docs = loader.load()

# 2. Text Chunking
# Configure the splitter (chunk size 1000, overlap 200 between consecutive chunks)
# and apply it to the loaded documents
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=200,
    chunk_size=1000,
)
split_documents = text_splitter.split_documents(docs)

# Report how many chunks the splitter produced
print(f"Successfully split the document into {len(split_documents)} chunks.")

Successfully split the document into 97 chunks.


In [3]:
# 3. Vector Store Creation using FAISS and Gemini Embeddings
# Create embeddings using the Google Generative AI model.
# The GEMINI_API_KEY value read during setup (`api_key`) is passed explicitly so
# that this client is constructed with the key that was validated there.
embeddings = GoogleGenerativeAIEmbeddings(
    model="gemini-embedding-001",
    google_api_key=api_key,
)

# Create a FAISS vector store from the document chunks and their embeddings
# This process can take a moment as it's embedding all the chunks.
print("Creating vector store...")
vector_store = FAISS.from_documents(split_documents, embeddings)
print("Vector store created successfully.")

Creating vector store...
Vector store created successfully.


In [4]:
# Report how many entries the FAISS index currently holds
stored_chunk_count = vector_store.index.ntotal
print(f"Vector store contains {stored_chunk_count} document chunks.")

Vector store contains 97 document chunks.


In [5]:
# Reconstruct all ntotal stored vectors from the FAISS index, starting at id 0,
# and print them
faiss_index = vector_store.index
print(faiss_index.reconstruct_n(0, faiss_index.ntotal))

[[ 0.00718226 -0.00422319  0.02249423 ...  0.00161143 -0.00949374
   0.00066858]
 [ 0.00765999 -0.0034277   0.02191414 ... -0.00620803 -0.00244515
   0.0029413 ]
 [ 0.0169386  -0.00793521  0.02004413 ... -0.00686916 -0.01498914
   0.00635582]
 ...
 [-0.00571316 -0.01667989  0.03072413 ... -0.01253151  0.01249439
   0.00364568]
 [-0.007865    0.00577848  0.02488202 ... -0.00015541  0.00481908
   0.0127038 ]
 [-0.01798255 -0.00700301  0.02617744 ... -0.00503353 -0.0052387
   0.00881986]]


In [6]:
# 4. Retriever Setup
# Expose the vector store as a retriever; k=3 requests the top 3 most
# relevant chunks for each query
retriever = vector_store.as_retriever(search_kwargs=dict(k=3))

# 5. RAG Chain for Generation
# Prompt that tells the model to answer from the retrieved context only,
# to admit when it does not know, and to stay within three sentences.
# {context} receives the retrieved documents, {input} the user question.
prompt_template = ChatPromptTemplate.from_template(
    """
    You are an assistant for question-answering tasks.
    Use the following pieces of retrieved context to answer the question.
    If you don't know the answer, just say that you don't know.
    Use three sentences maximum and keep the answer concise.

    Context: {context}

    Question: {input}

    Answer:
    """
)

# Chain that places the retrieved documents into the prompt's {context}
# slot and sends the filled prompt to the LLM
document_chain = create_stuff_documents_chain(llm=llm, prompt=prompt_template)

# Final chain: retrieve documents for the input first, then hand them to
# the document chain to generate the answer
retrieval_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=document_chain,
)

In [9]:
# 6. Query the RAG system
question = "What is the main cause of climate change?" # Change this to a question relevant to your PDF
print(f"\nQuerying the system with: '{question}'")

# Run retrieval + generation; the result is indexed below by "answer" and "context"
response = retrieval_chain.invoke({"input": question})

# Show the generated answer
print("\nAnswer:")
print(response["answer"])

# Show the chunks that were retrieved as context, numbered from 1
print("\nRetrieved Context:")
for position, retrieved_doc in enumerate(response["context"], start=1):
    print(f"--- Document {position} ---\n{retrieved_doc.page_content}\n")


Querying the system with: 'What is the main cause of climate change?'

Answer:
The main cause of recent climate change is the increase in greenhouse gases in the atmosphere. These gases, such as carbon dioxide and methane, trap heat and intensify the natural greenhouse effect. This increase is primarily driven by human activities, particularly the burning of fossil fuels.

Retrieved Context:
--- Document 1 ---
Chapter 2: Causes of Climate Change 
Greenhouse Gases 
The primary cause of recent climate change is the increase in greenhouse gases in the 
atmosphere. Greenhouse gases, such as carbon dioxide (CO2), methane (CH4), and nitrous 
oxide (N2O), trap heat from the sun, creating a "greenhouse effect." This effect is essential 
for life on Earth, as it keeps the planet warm enough to support life. However, human 
activities have intensified this natural process, leading to a warmer climate. 
Fossil Fuels 
Burning fossil fuels for energy releases large amounts of CO2. This includes co