In [12]:
import os
from tqdm import tqdm
from dotenv import load_dotenv

from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.output_parsers import PydanticOutputParser
from langchain_core.exceptions import OutputParserException

from pydantic import BaseModel, Field
from typing import List



In [2]:
# Load variables from a local .env file (if one exists) into the process
# environment before any Google client is constructed. `load_dotenv` is
# imported at the top of the notebook but was never called. By default it does
# not override variables that are already set in the environment.
load_dotenv()

# Chat model used for text generation in this notebook.
llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash",
    temperature=0,
    timeout=None,  # NOTE(review): presumably means "no client-side timeout" - confirm
    max_retries=2,
)

# Embedding model, created alongside the chat model for later use.
embeddings = GoogleGenerativeAIEmbeddings(model="gemini-embedding-001")

In [3]:
# Location of the source PDF that is loaded and chunked below.
path = "RAG_TECHNIQUES/data/Understanding_Climate_Change.pdf"

In [None]:
def load_pdf_document(file_path):
    """
    Read a PDF with PyPDFLoader and hand back its pages as documents.

    Any exception raised while loading is reported on stdout and an empty
    list is returned instead, so callers can simply test the result for
    truthiness.
    """
    pages = []
    try:
        pages = PyPDFLoader(file_path).load()
        print(f"Loaded {len(pages)} pages from PDF")
    except Exception as err:
        print(f"Error loading PDF: {err}")
        return []
    return pages
    
def create_document_chunks(documents, chunk_size=1000, chunk_overlap=200):
    """
    Split documents into overlapping chunks for processing.

    Chunk length is measured in characters (``length_function=len``), so
    ``chunk_size`` and ``chunk_overlap`` are character counts.

    Args:
        documents: Documents to split (e.g. the pages returned by
            ``load_pdf_document``).
        chunk_size: Maximum length of a chunk, in characters.
        chunk_overlap: Number of characters shared by consecutive chunks.

    Returns:
        The list of chunks, each with its ``page_content`` cleaned up
        (tabs replaced by spaces, triple newlines collapsed once, surrounding
        whitespace stripped). An empty list when there is nothing to split.
    """
    # Nothing to split: return early instead of failing on the average below.
    if not documents:
        print("Created 0 chunks")
        return []

    # Use the plain constructor. `from_tiktoken_encoder` supplies its own
    # token-based length_function, so also passing `length_function=len`
    # to it raises a TypeError (duplicate keyword argument).
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,  # Use character count for splitting
    )
    chunks = text_splitter.split_documents(documents)

    # Clean up the text (chunks are modified in place)
    for chunk in chunks:
        chunk.page_content = chunk.page_content.replace('\t', ' ')
        chunk.page_content = chunk.page_content.replace('\n\n\n', '\n\n')
        chunk.page_content = chunk.page_content.strip()

    print(f"Created {len(chunks)} chunks")
    # Guard the average: documents with no text can still yield zero chunks.
    if chunks:
        total_chars = sum(len(chunk.page_content) for chunk in chunks)
        print(f"Average chunk size: {total_chars // len(chunks)} characters")
    return chunks

In [6]:
# Load and chunk the document
documents = load_pdf_document(path)
if documents:
    chunks = create_document_chunks(documents, chunk_size=1000, chunk_overlap=100)
else:
    # Always (re)bind `chunks` so later cells never pick up a stale value left
    # in the kernel by an earlier successful run, and never hit a NameError.
    chunks = []
    print("Failed to load document. Please check your PDF path.")

Loaded 33 pages from PDF
Created 97 chunks
Average chunk size: 799 characters


In [17]:
# Preview the first three chunks, printing a divider line after each one.
for chunk in chunks[:3]:
    print(chunk.page_content, '----------------------------', sep='\n')

Understanding Climate Change 
Chapter 1: Introduction to Climate Change 
Climate change refers to significant, long-term changes in the global climate. The term 
"global climate" encompasses the planet's overall weather patterns, including temperature, 
precipitation, and wind patterns, over an extended period. Over the past century, human 
activities, particularly the burning of fossil fuels and deforestation, have significantly 
contributed to climate change. 
Historical Context 
The Earth's climate has changed throughout history. Over the past 650,000 years, there have 
been seven cycles of glacial advance and retreat, with the abrupt end of the last ice age about 
11,700 years ago marking the beginning of the modern climate era and human civilization. 
Most of these climate changes are attributed to very small variations in Earth's orbit that 
change the amount of solar energy our planet receives. During the Holocene epoch, which
----------------------------
change the amount of so

In [None]:
# Output schema for the question-generation step: the model that
# PydanticOutputParser is built from, holding one list of question strings.
class HypotheticalQuestions(BaseModel):
    """A list of hypothetical questions that a user might ask about a text chunk."""
    # NOTE(review): the class docstring above and the field description below
    # presumably end up in the schema text that the parser's format
    # instructions inject into the prompt - confirm before rewording either.
    questions: List[str] = Field(
        ...,  # Ellipsis as the default marks the field as required
        description="A list of 3-5 varied, natural, and conversational questions based on the text."
    )

# Create the parser from our Pydantic model
pydantic_parser = PydanticOutputParser(pydantic_object=HypotheticalQuestions)

# Prompt for question generation. `format_instructions` is filled in once from
# the parser, so only `chunk_text` has to be supplied at invoke time.
question_prompt_with_schema = PromptTemplate(
    template="""You are an expert at generating questions. Your task is to analyze the following text and generate 3-5 essential questions that a user might ask about this content.

Requirements:
- Each question should capture a key concept or fact from the text.
- Questions should be varied (factual, conceptual, comparative).
- Questions should be natural and conversational.

Text:
{chunk_text}

{format_instructions}
""",
    input_variables=["chunk_text"],
    partial_variables={"format_instructions": pydantic_parser.get_format_instructions()}
)

# Pipeline: fill the prompt -> call the LLM -> parse into HypotheticalQuestions
question_chain = question_prompt_with_schema | llm | pydantic_parser

# Try the chain on the first chunk. `chunk_text` was previously never defined,
# so this call raised a NameError before reaching the API.
chunk_text = chunks[0].page_content
response_object = question_chain.invoke({"chunk_text": chunk_text})
# Original author's note: not enough API calls were available to generate a
# response, so this call was skipped when the notebook was saved.