In [27]:
import os
from typing import Optional

from dotenv import load_dotenv
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings

# Load variables from a local .env file into the process environment.
load_dotenv()

# Fail fast with a clear message when the key is missing.
api_key = os.getenv("GEMINI_API_KEY")
if not api_key:
    raise ValueError("Gemini API Key not found. Please set it in the .env file.")

# Set up the Gemini model for generation.
# The key is passed explicitly: otherwise the value validated above is never
# used and the client falls back to its own environment lookup.
llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash",
    temperature=0,
    timeout=None,
    max_retries=2,
    google_api_key=api_key,
)

# Embedding model used to vectorize the document chunks; it is given the same
# validated key for the same reason.
base_embeddings = GoogleGenerativeAIEmbeddings(
    model="gemini-embedding-001",
    google_api_key=api_key,
)

In [28]:
# Read the source PDF; `docs` holds whatever the loader returns for it.
pdf_path = "data/Understanding_Climate_Change.pdf"
loader = PyPDFLoader(pdf_path)
docs = loader.load()

In [39]:
# Split the loaded documents into overlapping chunks using the
# tiktoken-based splitter (size 400, overlap 100).
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=400,
    chunk_overlap=100,
)
split_documents = text_splitter.split_documents(docs)

print(f"Successfully split the document into {len(split_documents)} chunks.")

# Map each chunk's position to the chunk itself, then record that position in
# the chunk's metadata so a retrieved chunk can be traced back to its neighbors.
docstore = dict(enumerate(split_documents))
for i, doc in docstore.items():
    doc.metadata["index"] = i

Successfully split the document into 62 chunks.


In [30]:
# Show the second chunk (position 1): a 100-character preview of its text and
# its metadata, which now includes the "index" key added when the docstore was built.
print("\n--- Example of a modified chunk ---")
print(
    f"Content: {split_documents[1].page_content[:100]}...",
    f"Metadata: {split_documents[1].metadata}",
    sep="\n",
)


--- Example of a modified chunk ---
Content: predict future trends. The evidence overwhelmingly shows that recent changes are primarily 
driven b...
Metadata: {'producer': 'Microsoft® Word 2021', 'creator': 'Microsoft® Word 2021', 'creationdate': '2024-07-13T20:17:34+03:00', 'author': 'Nir', 'moddate': '2024-07-13T20:17:34+03:00', 'source': 'data/Understanding_Climate_Change.pdf', 'total_pages': 33, 'page': 0, 'page_label': '1', 'index': 1}


In [31]:
# Embed every chunk and index the resulting vectors in FAISS.
vectorstore = FAISS.from_documents(
    documents=split_documents,
    embedding=base_embeddings,
)
# k=1: the retriever returns only the single top-matching chunk per query.
chunks_query_retriever = vectorstore.as_retriever(search_kwargs=dict(k=1))

In [None]:
def get_chunk_by_index(docstore: dict, target_index: int) -> Optional["Document"]:
    """
    Retrieve a chunk directly from the docstore based on its index.

    The lookup is a single dictionary access, so a missing index yields
    ``None`` instead of raising ``KeyError``. Callers must handle that case.

    Args:
    docstore (dict): A dictionary mapping index to Document object.
    target_index (int): The index of the chunk to retrieve.

    Returns:
    Optional[Document]: The retrieved chunk, or None if the index is not found.
    """
    return docstore.get(target_index)

In [45]:
# Look up the chunk stored under index 1 and preview up to 1000 characters of it.
chunk = get_chunk_by_index(docstore, 1)
if chunk is None:
    # The lookup returns None for a missing index; report that instead of
    # failing on attribute access.
    print("No chunk found at index 1.")
else:
    print(chunk.page_content[:1000])

predict future trends. The evidence overwhelmingly shows that recent changes are primarily 
driven by human activities, particularly the emission of greenhouse gases. 
Chapter 2: Causes of Climate Change 
Greenhouse Gases 
The primary cause of recent climate change is the increase in greenhouse gases in the 
atmosphere. Greenhouse gases, such as carbon dioxide (CO2), methane (CH4), and nitrous 
oxide (N2O), trap heat from the sun, creating a "greenhouse effect." This effect is essential 
for life on Earth, as it keeps the planet warm enough to support life. However, human 
activities have intensified this natural process, leading to a warmer climate. 
Fossil Fuels 
Burning fossil fuels for energy releases large amounts of CO2. This includes coal, oil, and 
natural gas used for electricity, heating, and transportation. The industrial revolution marked 
the beginning of a significant increase in fossil fuel consumption, which continues to rise 
today. 
Coal


In [46]:
def retrieve_and_enrich_context_optimized(
    query: str, 
    retriever, 
    docstore: dict, 
    num_neighbors: int = 1
) -> str:
    """
    Retrieves relevant chunks, expands a window of neighboring chunk indices around
    each hit, and joins the resulting chunks into one context string.

    Windows of different hits are merged through a set, so a chunk that falls inside
    several windows appears only once. Chunks are emitted in ascending index order.

    Args:
    query (str): The query to search for.
    retriever: The retriever object; must expose ``invoke(query)`` returning documents
        whose ``metadata`` contains an ``"index"`` key.
    docstore (dict): A dictionary mapping chunk index to Document object for fast lookup.
    num_neighbors (int): The number of chunks to include before and after each relevant chunk.

    Returns:
    str: A single, concatenated context string ready for the LLM. Empty when the
    retriever returns nothing or none of the indices exist in the docstore.

    Raises:
    ValueError: If ``num_neighbors`` is negative (the window would be empty and the
        retrieved chunk itself would silently be dropped).
    """
    if num_neighbors < 0:
        raise ValueError("num_neighbors must be non-negative.")

    retrieved_docs = retriever.invoke(query)

    # Collect every index covered by any hit's window; the set removes overlaps.
    expanded_indices = set()
    for doc in retrieved_docs:
        hit_index = doc.metadata["index"]
        # Clamp the lower bound at 0; indices past the end are filtered out below.
        window_start = max(0, hit_index - num_neighbors)
        window_end = hit_index + num_neighbors + 1
        expanded_indices.update(range(window_start, window_end))

    # Look each index up exactly once (same dict lookup get_chunk_by_index performs)
    # and skip indices that are not present in the docstore.
    context_chunks = []
    for chunk_index in sorted(expanded_indices):
        chunk = docstore.get(chunk_index)
        if chunk is not None:
            context_chunks.append(chunk)

    return "\n\n---\n\n".join(chunk.page_content for chunk in context_chunks)

In [50]:
# Side-by-side demo: the plain top-1 retrieval versus the same retrieval
# expanded with one neighboring chunk on each side.
query = "Explain the role of deforestation and fossil fuels in climate change."
section_break = "\n" + "="*70 + "\n"

print("=======================================================================")
print("                      COMPARISON OF RETRIEVAL RESULTS                  ")
print("=======================================================================")
print(f"\nQUERY: '{query}'\n")


# 1. Baseline: whatever the retriever returns on its own.
print("--- 1. Baseline Retrieval (Top 1 Chunk Only) ---")
baseline_results = chunks_query_retriever.invoke(query)

if not baseline_results:
    print("No relevant documents found for baseline.")
else:
    baseline_content = baseline_results[0].page_content
    print("Content Retrieved:")
    print(baseline_content)

print(section_break)


# 2. Enriched: the retrieved chunk plus its neighbors from the docstore.
print("--- 2. Enriched Retrieval (Optimized Context Window) ---")
enriched_context = retrieve_and_enrich_context_optimized(
    query,
    chunks_query_retriever,
    docstore,
    num_neighbors=1,
)

if not enriched_context:
    print("No enriched context could be constructed.")
else:
    print("Content Retrieved:")
    print(enriched_context)

print(section_break)

                      COMPARISON OF RETRIEVAL RESULTS                  

QUERY: 'Explain the role of deforestation and fossil fuels in climate change.'

--- 1. Baseline Retrieval (Top 1 Chunk Only) ---
Content Retrieved:
greenhouse effect. 
Tropical Deforestation 
Tropical rainforests are particularly important for carbon storage. Deforestation in the 
Amazon, Congo Basin, and Southeast Asia has significant impacts on global carbon cycles 
and biodiversity. These regions are often cleared for agriculture, logging, and mining, leading 
to habitat loss and species extinction. 
Boreal Forests 
Boreal forests, found in the northern regions of North America, Europe, and Asia, also play a 
crucial role in sequestering carbon. Logging and land-use changes in these regions contribute 
to climate change. These forests are vital for regulating the Earth's climate and supporting 
indigenous communities and wildlife. 
Agriculture 
Agriculture contributes to climate change through methane emissions