In [9]:
import os
import sys
from dotenv import load_dotenv
from langchain.retrievers.document_compressors import LLMChainExtractor
from langchain.retrievers import ContextualCompressionRetriever
from langchain.chains import RetrievalQA
from langchain.document_loaders import PyPDFLoader
from langchain_cohere import CohereEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_google_genai import ChatGoogleGenerativeAI

load_dotenv()

True

In [6]:
# Embedding model used to build the vector index further down.
base_embeddings = CohereEmbeddings(model="embed-english-light-v3.0")

In [7]:
# Ingestion settings, named so they can be tuned in one place.
PDF_PATH = "data/Understanding_Climate_Change.pdf"
CHUNK_SIZE = 400     # passed to the tiktoken-based splitter below
CHUNK_OVERLAP = 100  # overlap between consecutive chunks

# Load the source PDF into LangChain documents.
loader = PyPDFLoader(PDF_PATH)
docs = loader.load()

# Split the loaded documents into overlapping chunks.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP
)

texts = text_splitter.split_documents(docs)

# Fail early with a readable message: an empty chunk list would otherwise
# surface as an obscure error inside the embedding / index build.
if not texts:
    raise ValueError(
        f"No text chunks were produced from {PDF_PATH!r}; "
        "check that the file exists and contains extractable text."
    )

for doc in texts:
    doc.page_content = doc.page_content.replace('\t', ' ')  # Replace tabs with spaces

# Embed every chunk and build the FAISS index used for retrieval.
vectorstore = FAISS.from_documents(texts, base_embeddings)

In [11]:
# Chat model shared by the compressor and the QA chain.
llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash", temperature=0, max_tokens=4000)

# Plain retriever over the vector store (default search settings).
retriever = vectorstore.as_retriever()

# Contextual compressor driven by the same chat model.
compressor = LLMChainExtractor.from_llm(llm)

# Wrap the plain retriever so its results pass through the compressor.
compression_retriever = ContextualCompressionRetriever(
    base_retriever=retriever,
    base_compressor=compressor,
)

# Question-answering chain on top of the compressing retriever; the source
# documents are returned alongside the answer.
qa_chain = RetrievalQA.from_chain_type(
    retriever=compression_retriever,
    llm=llm,
    return_source_documents=True,
)

In [12]:
# Run a sample question through the RetrievalQA chain.
query = "What is the main topic of the document?"
qa_inputs = {"query": query}
result = qa_chain.invoke(qa_inputs)

# Show the generated answer first, then the documents it was based on.
print(result["result"])
print("Source documents:", result["source_documents"])

The main topic of the document is **Climate Change**.
Source documents: [Document(metadata={'producer': 'Microsoft® Word 2021', 'creator': 'Microsoft® Word 2021', 'creationdate': '2024-07-13T20:17:34+03:00', 'author': 'Nir', 'moddate': '2024-07-13T20:17:34+03:00', 'source': 'data/Understanding_Climate_Change.pdf', 'total_pages': 33, 'page': 30, 'page_label': '31'}, page_content='Chapter 21: Climate Change and Cultural Shifts'), Document(metadata={'producer': 'Microsoft® Word 2021', 'creator': 'Microsoft® Word 2021', 'creationdate': '2024-07-13T20:17:34+03:00', 'author': 'Nir', 'moddate': '2024-07-13T20:17:34+03:00', 'source': 'data/Understanding_Climate_Change.pdf', 'total_pages': 33, 'page': 23, 'page_label': '24'}, page_content='It was the first major international treaty to address climate change.'), Document(metadata={'producer': 'Microsoft® Word 2021', 'creator': 'Microsoft® Word 2021', 'creationdate': '2024-07-13T20:17:34+03:00', 'author': 'Nir', 'moddate': '2024-07-13T20:17:34+0

In [13]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough, RunnableParallel
from langchain_core.output_parsers import StrOutputParser

# Prompt text with two placeholders: {context} and {question}.
# Built from explicit pieces so the whitespace-only lines and the trailing
# indentation of the template stay visible.
RAG_PROMPT_TEXT = (
    "Answer the question based only on the following context:\n"
    "    \n"
    "    Context:\n"
    "    {context}\n"
    "    \n"
    "    Question: {question}\n"
    "    "
)

prompt_template = ChatPromptTemplate.from_template(RAG_PROMPT_TEXT)

def format_documents(docs):
    """Join the ``page_content`` of every document into a single string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string attribute.

    Returns:
        The page contents in iteration order, separated by one blank line
        (``"\\n\\n"``). An empty iterable yields an empty string.
    """
    page_texts = [document.page_content for document in docs]
    separator = "\n\n"
    return separator.join(page_texts)

# Answer stage shared by both chains below.
# Input:  {"source_documents": [...], "question": <str>}
# Output: the model's answer as a plain string.
answer_from_documents = (
    {
        "context": lambda inputs: format_documents(inputs["source_documents"]),
        "question": lambda inputs: inputs["question"],
    }
    | prompt_template
    | llm
    | StrOutputParser()
)

# Retrieval stage: takes the question string, runs the compressing retriever
# once, and forwards the question unchanged next to the retrieved documents.
retrieve_with_question = RunnableParallel(
    {"source_documents": compression_retriever, "question": RunnablePassthrough()}
)

# Question string in, answer string out.
rag_chain = retrieve_with_question | answer_from_documents

# Question string in, dict out with "answer" and "source_documents" (plus the
# forwarded "question"). The answer is derived from the documents already
# retrieved, so the retriever runs only once per query and the returned
# sources are exactly the ones the answer was generated from.
full_rag_chain = retrieve_with_question.assign(answer=answer_from_documents)

# Ask the same sample question through the LCEL chain.
query = "What is the main topic of the document?"
result = full_rag_chain.invoke(query)

answer, source_documents = result["answer"], result["source_documents"]
print("Answer:", answer)
print("Source Documents:", source_documents)

Answer: The main topic of the document is Climate Change.
Source Documents: [Document(metadata={'producer': 'Microsoft® Word 2021', 'creator': 'Microsoft® Word 2021', 'creationdate': '2024-07-13T20:17:34+03:00', 'author': 'Nir', 'moddate': '2024-07-13T20:17:34+03:00', 'source': 'data/Understanding_Climate_Change.pdf', 'total_pages': 33, 'page': 30, 'page_label': '31'}, page_content='Chapter 21: Climate Change and Cultural Shifts'), Document(metadata={'producer': 'Microsoft® Word 2021', 'creator': 'Microsoft® Word 2021', 'creationdate': '2024-07-13T20:17:34+03:00', 'author': 'Nir', 'moddate': '2024-07-13T20:17:34+03:00', 'source': 'data/Understanding_Climate_Change.pdf', 'total_pages': 33, 'page': 23, 'page_label': '24'}, page_content='It was the first major international treaty to address climate change.'), Document(metadata={'producer': 'Microsoft® Word 2021', 'creator': 'Microsoft® Word 2021', 'creationdate': '2024-07-13T20:17:34+03:00', 'author': 'Nir', 'moddate': '2024-07-13T20:17: