In [12]:
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain.prompts import PromptTemplate
from langchain.vectorstores import FAISS
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.hyde.base import HypotheticalDocumentEmbedder
import os
from dotenv import load_dotenv 

# Load variables from a local .env file into the process environment.
# Presumably this is where the Google API credentials come from -- TODO confirm.
# The call is the cell's last expression, so its return value is echoed as output.
load_dotenv()

True

In [22]:
# Chat model: used by the HyDE wrapper to draft hypothetical passages and by
# the RAG chain to write the final answer. No request timeout is set.
llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash", temperature=0, timeout=None, max_retries=2)

# Embedding model shared by the standard vector store and the HyDE wrapper.
base_embeddings = GoogleGenerativeAIEmbeddings(
    model="gemini-embedding-001",
)

In [None]:
# Read the source PDF. The path is relative, so it resolves against the
# directory the kernel was started in.
loader = PyPDFLoader("RAG_TECHNIQUES/data/Understanding_Climate_Change.pdf")
docs = loader.load()

# Chunk the loaded documents with a tiktoken-based length function:
# chunk_size=500 and chunk_overlap=50.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=500,
    chunk_overlap=50,
)
split_documents = text_splitter.split_documents(docs)

print(f"Successfully split the document into {len(split_documents)} chunks.")

In [23]:
# Baseline store: every chunk is embedded with the plain embedding model.
print("Creating the standard FAISS vector store...")
vectorstore_standard = FAISS.from_documents(
    documents=split_documents,
    embedding=base_embeddings,
)

# Retriever over the baseline store, asking for 3 results per query.
standard_retriever = vectorstore_standard.as_retriever(
    search_kwargs={"k": 3},
)

print("Standard retriever is ready.")

Creating the standard FAISS vector store...
Standard retriever is ready.


In [25]:
print("Instantiating HypotheticalDocumentEmbedder...")

# 1. Create the HyDE embeddings object.
# It wraps the LLM and the base embeddings model. "web_search" selects the
# built-in "Please write a passage to answer the question" prompt.
hyde_embeddings = HypotheticalDocumentEmbedder.from_llm(
    llm=llm,
    base_embeddings=base_embeddings,
    prompt_key="web_search",
)

# 2. Create the HyDE vector store.
# HypotheticalDocumentEmbedder.embed_documents() simply delegates to
# base_embeddings, so FAISS.from_documents(..., embedding=hyde_embeddings)
# would recompute the same document vectors that vectorstore_standard already
# holds (a second embedding pass over every chunk). Reuse that index and
# docstore instead and swap only the embedding function, which is what FAISS
# calls to embed the *query*; that is where the HyDE step happens.
# NOTE: both stores now share one index/docstore, so documents added through
# either store become visible in the other.
print("Creating the HyDE-powered FAISS vector store...")
vectorstore_hyde = FAISS(
    embedding_function=hyde_embeddings,
    index=vectorstore_standard.index,
    docstore=vectorstore_standard.docstore,
    index_to_docstore_id=vectorstore_standard.index_to_docstore_id,
)

# 3. Create the HyDE retriever (same k as the standard retriever so the two
# result lists are directly comparable).
hyde_retriever = vectorstore_hyde.as_retriever(search_kwargs={'k': 3})

print("HyDE retriever is ready.")

Instantiating HypotheticalDocumentEmbedder...
Creating the HyDE-powered FAISS vector store...
HyDE retriever is ready.


In [26]:
query = "What is the main cause of climate change?"


def _print_results(results, preview_chars=300):
    """Print how many documents were retrieved, then a short view of each.

    Args:
        results: Retrieved documents; each must expose ``metadata`` (a mapping)
            and ``page_content`` (a string).
        preview_chars: Number of leading characters of ``page_content`` shown
            per document. An ellipsis is always appended after the preview.
    """
    print(f"\nFound {len(results)} documents:")
    for position, doc in enumerate(results, start=1):
        print(f"\n[Result {position}]")
        print(f"Source: {doc.metadata.get('source')}")
        print(f"Content: {doc.page_content[:preview_chars]}...")


# --- Test the Standard Retriever ---
# The raw question is embedded and matched against the chunk vectors.
print("--- 1. Testing Standard Retriever ---")
standard_results = standard_retriever.invoke(query)
_print_results(standard_results)

# --- Test the HyDE Retriever ---
# Same question, but retrieval goes through the HyDE embeddings object.
print("\n\n--- 2. Testing HyDE Retriever ---")
hyde_results = hyde_retriever.invoke(query)
_print_results(hyde_results)

--- 1. Testing Standard Retriever ---

Found 3 documents:

[Result 1]
Source: RAG_TECHNIQUES/data/Understanding_Climate_Change.pdf
Content: Understanding Climate Change 
Chapter 1: Introduction to Climate Change 
Climate change refers to significant, long-term changes in the global climate. The term 
"global climate" encompasses the planet's overall weather patterns, including temperature, 
precipitation, and wind patterns, over an exte...

[Result 2]
Source: RAG_TECHNIQUES/data/Understanding_Climate_Change.pdf
Content: Fossil Fuels 
Burning fossil fuels for energy releases large amounts of CO2. This includes coal, oil, and 
natural gas used for electricity, heating, and transportation. The industrial revolution marked 
the beginning of a significant increase in fossil fuel consumption, which continues to rise 
tod...

[Result 3]
Source: RAG_TECHNIQUES/data/Understanding_Climate_Change.pdf
Content: Agriculture 
Agriculture contributes to climate change through methane emissions from li

In [30]:
# Take the chain the HyDE embedder uses to write its hypothetical passage.
# Leaving the bare name as the last expression makes the notebook echo the
# chain's repr (prompt | chat model | output parser) as the cell output.
hypothetical_document_chain = hyde_embeddings.llm_chain
hypothetical_document_chain

PromptTemplate(input_variables=['QUESTION'], input_types={}, partial_variables={}, template='Please write a passage to answer the question\nQuestion: {QUESTION}\nPassage:')
| ChatGoogleGenerativeAI(model='models/gemini-2.5-flash', google_api_key=SecretStr('**********'), temperature=0.0, max_retries=2, client=<google.ai.generativelanguage_v1beta.services.generative_service.client.GenerativeServiceClient object at 0x0000022862C3CCD0>, default_metadata=(), model_kwargs={})
| StrOutputParser()

In [29]:
# HypotheticalDocumentEmbedder exposes its passage-writing chain as `llm_chain`.
hypothetical_document_chain = hyde_embeddings.llm_chain

# Run that chain on the query to see the passage HyDE would embed.
# The chain's prompt takes a single input variable named 'QUESTION'.
hypothetical_document = hypothetical_document_chain.invoke({"QUESTION": query})

print(
    "--- Hypothetical Document Generated by HyDE ---",
    hypothetical_document,
    "---------------------------------------------",
    sep="\n",
)

--- Hypothetical Document Generated by HyDE ---
The main cause of climate change is **human activity**, primarily the emission of greenhouse gases into the Earth's atmosphere.

Specifically, the burning of fossil fuels—coal, oil, and natural gas—for energy production, transportation, and industrial processes releases vast quantities of carbon dioxide (CO2). Deforestation also contributes significantly, as trees absorb CO2, and their removal or burning releases stored carbon. Other human activities, such as agriculture (especially livestock farming and rice cultivation, which produce methane) and industrial processes, release other potent greenhouse gases like methane (CH4) and nitrous oxide (N2O).

These gases trap heat in the Earth's atmosphere, leading to a gradual increase in global temperatures, a phenomenon known as the enhanced greenhouse effect. While natural factors like volcanic eruptions and solar variations can influence climate, scientific consensus overwhelmingly points to

In [31]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

# --- 1. Define the Answering Prompt ---
# Two-message chat prompt: the system message restricts the model to the
# supplied context, the human message carries the {context} and {question} slots.
prompt_template = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """You are an expert assistant. Your task is to synthesize an answer to the user's question based *only* on the provided context.
    If the context does not contain the answer, state that you cannot answer. Do not use any external knowledge.""",
        ),
        (
            "human",
            "Context:\n{context}\n\n"
            "Question: {question}\n\n"
            "Answer:",
        ),
    ]
)

# --- 2. Create a Helper Function to Format Documents ---
def format_docs(docs):
    """Join the documents' text into one context string.

    Each document's ``page_content`` is taken in order, and consecutive pieces
    are separated by a ``---`` rule with a blank line on either side.
    """
    separator = "\n\n---\n\n"
    contents = [doc.page_content for doc in docs]
    return separator.join(contents)

# --- 3. Build the Generation Chain ---
# Data flow for one question:
#   question -> hyde_retriever -> format_docs   => "context"
#   question (passed through unchanged)         => "question"
#   {context, question} -> prompt_template -> llm -> StrOutputParser
rag_chain = (
    dict(context=hyde_retriever | format_docs, question=RunnablePassthrough())
    | prompt_template
    | llm
    | StrOutputParser()
)

# --- 4. Run the Full RAG Chain ---
print("--- Running the Full RAG Chain (Retrieval + Generation) ---")

# `query` is the same question used for the retriever comparison earlier in
# the notebook. A single invoke runs HyDE retrieval and then generation.
final_answer = rag_chain.invoke(query)

print(f"\nQuestion: {query}", f"\nFinal Answer:\n{final_answer}", sep="\n")

--- Running the Full RAG Chain (Retrieval + Generation) ---

Question: What is the main cause of climate change?

Final Answer:
The primary cause of recent climate change is the increase in greenhouse gases in the atmosphere.
