In [8]:
from langgraph.graph import START, END, StateGraph, MessagesState
from langgraph.checkpoint.memory import MemorySaver
from langgraph.prebuilt import ToolNode
from langchain.messages import HumanMessage, AIMessage, SystemMessage
from langchain_core.tools import tool
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_chroma import Chroma
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

from dotenv import load_dotenv
from IPython.display import Image, display
from typing import Literal
import os

print("All imports successful✅")

All imports successful✅


In [9]:
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")

if not api_key:
    raise ValueError("API_Key not found. Please set it in your .env file")
print("API key loaded successfully ✅")

API key loaded successfully ✅


In [10]:
## Initialize LLM
llm = ChatOpenAI(
    model = "gpt-5-nano",
    temperature=0.5,
    api_key = api_key
)
print(f"LLM initialized: {llm.model_name}")

LLM initialized: gpt-5-nano


In [11]:
file_path = r"c:\\Users\\Admin\\Desktop\\Tasks\\task3\\env\\docs"

loader = PyPDFDirectoryLoader(file_path)
pages = []

pages = loader.load()
    
print("Documents loaded successfully.✅")

Documents loaded successfully.✅


In [12]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
    length_function=len
)

doc_splits = text_splitter.split_documents(pages)

print(f"Sample chunk: \n{doc_splits[0].page_content[:200]}...")
print("Documents chunked ✅")

Sample chunk: 
ACCESS GROUP  
SUSTAINABILITY REPORT
2023
Shaping a Sustainable Future...
Documents chunked ✅


In [13]:
embeddings = OpenAIEmbeddings(
    model= "text-embedding-3-small",
    api_key = api_key
)
print("Embeddings model initialized ✅")


chroma_path = "./chroma_db_personal_rag"
vectorstore = Chroma(
    collection_name="agentic_rag_docs",
    persist_directory=chroma_path,
    embedding_function=embeddings
)

# Make sure directory exists
import os
os.makedirs(chroma_path, exist_ok=True)

# Add documents in batches
batch_size = 1000
for i in range(0, len(doc_splits), batch_size):
    batch = doc_splits[i:i+batch_size]
    vectorstore.add_documents(documents=batch)
    print(f"Added batch {i} to {i+len(batch)}")

print(f"Vector store created with {len(doc_splits)} chunks")
print(f"Persisted to: {chroma_path}")

Embeddings model initialized ✅
Added batch 0 to 1000
Added batch 1000 to 2000
Added batch 2000 to 3000
Added batch 3000 to 4000
Added batch 4000 to 5000
Added batch 5000 to 6000
Added batch 6000 to 6076
Vector store created with 6076 chunks
Persisted to: ./chroma_db_personal_rag


In [14]:
@tool
def retrieve_documents(query: str) -> str:
    """
    Search for relevant documents in the knowledge base.
    
    Use this tool when you need information from the document collection
    to answer the user's question. Do NOT use this for:
    - General knowledge questions
    - Greetings or small talk
    - Simple calculations
    
    Args:
        query: The search query describing what information is needed
        
    Returns:
        Relevant document excerpts that can help answer the question
    """
    # Using MMR for diverse results
    retriever = vectorstore.as_retriever(
        search_type = "mmr",
        search_kwargs = {"k":5, "fetch_k":10}
    )

    results = retriever.invoke(query)
    if not results:
        return "No relevant documents found"
    
    formatted = "\n\n---\n\n".join(
        f"Document {i+1}:\n{doc.page_content}"
        for i, doc in enumerate(results)
    )
    return formatted

print("Retrieval tool created.✅")

Retrieval tool created.✅


In [23]:
system_prompt = SystemMessage(content="""You are PowerBot, a helpfulassistant with access to a document retrieval tool.

RETRIEVAL DECISION RULES:

If the question does not require documents, answer directly using general knowledge.

DO NOT retrieve for:
- Greetings: "Hello", "Hi", "How are you"
- Questions about your capabilities: "What can you help with?", "What do you do?"
- Simple math: "What is 2+2?"
- General knowledge
- Casual conversation: "Thank you", "Goodbye"

DO retrieve for:
- Questions asking for specific information that would be in documents
- Requests for facts, definitions, or explanations about specialized topics
- Any question where citing sources would improve the answer

Rule of thumb: If the user is asking for information (not just chatting), retrieve first.

When you retrieve documents, cite them in your answer. If documents don't contain the answer, say so.
""")

print("System prompt configured.✅")

System prompt configured.✅


In [24]:
# 1️⃣ Make sure you already have:
# - vectorstore (Chroma with doc_splits)
# - llm (ChatOpenAI initialized)
# - system_prompt (SystemMessage)

# 2️⃣ Define retrieval function
def retrieve_documents(query: str, k: int = 3):
    """
    Retrieve top-k relevant chunks from the Chroma vector store.
    Returns a list of text chunks.
    """
    return [doc.page_content for doc in vectorstore.similarity_search(query, k=k)]


# 3️⃣ Bind tool to LLM
tools = [retrieve_documents]
llm_with_tools = llm.bind_tools(tools)

# 4️⃣ Assistant node
def assistant(state):
    messages = [system_prompt] + state["messages"]
    response = llm_with_tools.invoke(messages)
    return {"messages": [response]}

# 5️⃣ Decide whether to continue
def should_continue(state):
    last_message = state["messages"][-1]
    if getattr(last_message, "tool_calls", None):
        return "tools"
    return "__end__"

print("Agent nodes defined ✅")


Agent nodes defined ✅


In [25]:
builder = StateGraph(MessagesState)

builder.add_node("assistant", assistant)
builder.add_node("tools", ToolNode(tools))

builder.add_edge(START, "assistant")
builder.add_conditional_edges(
    "assistant",
    should_continue,
    {"tools": "tools", "__end__": END}
)
builder.add_edge("tools", "assistant")

# Conversation memory
memory = MemorySaver()
agent = builder.compile(checkpointer=memory)

print("Agentic RAG system compiled✅")

Agentic RAG system compiled✅


In [26]:
def query_agent(test_queries: str, thread_id: str = "default"):
    for query in test_queries:
        print(f"\n{'='*70}")
        print(f"Query: {query}")
        print(f"{'='*70}")

        result = agent.invoke(
            {"messages": [HumanMessage(content=query)]},
            config={"configurable": {"thread_id": thread_id}}
        )

        #Check if Retrieval was used
        used_retrieval = any(
            isinstance(message, AIMessage) and message.tool_calls
            for message in result["messages"]
        )

        final_answer = result["messages"][-1].content
        print(f"Agent: {final_answer}")
        print(f"Decision: {'RETRIEVED' if used_retrieval else 'ANSWERED DIRECTLY'}")
        print(f"\n{'='*70}\n")

In [None]:
test1 = [
        "Hi",
        "Give me a comprehensive summary of what Access Bank's sustainability strategy is?",
        "Why do doctors wear white?"]
query_agent(test1, thread_id="test1")


Query: Hi
Agent: Hi there! What can I help you with today? If you’re looking for specific information from a document, tell me what you need and I’ll assist.
Decision: ANSWERED DIRECTLY



Query: Give me a comprehensive summary of what Access Bank's sustainability strategy is?
Agent: Here’s a comprehensive summary of Access Bank’s sustainability strategy based on the retrieved materials:

- Strategic framing and governance
  - Alignment with the Principles for Responsible Banking (PRB): Access Bank positions itself as a leader in sustainable finance by aligning its strategy with the PRB framework and applying robust self-assessment practices to evaluate its social and environmental impact.
  - Participation in the 2030 Core Group: The bank is engaged with this group, signaling a commitment to long-term, ambitious sustainability goals.
  - Integration into core operations: Sustainability considerations are embedded into the bank’s day-to-day operations rather than treated as a separate

In [28]:
test2 = ["What's up?",
        "What is the difference between IFRS S1 and S2 standards?",
        "What is Python?"]
query_agent(test2, thread_id="test2__")


Query: What's up?
Agent: Not much—I’m here and ready to help. What would you like to do or chat about? If you want ideas, I can help with explanations, research, brainstorming, or solving a problem.
Decision: ANSWERED DIRECTLY



Query: What is the difference between IFRS S1 and S2 standards?
Agent: Short answer: IFRS S1 and IFRS S2 are separate pieces of the IFRS Sustainability Disclosure Standards. S1 covers general sustainability-related financial disclosures; S2 is a climate-specific disclosure standard.

Key differences:

- Scope
  - S1 (General Requirements): Applies to sustainability-related financial disclosures across all material topics (environment, social, governance, etc.). It sets the overarching requirements for reporting such information.
  - S2 (Climate-Related Disclosures): Focuses specifically on climate-related risks and opportunities and is aligned with the TCFD framework.

- Level of prescriptiveness
  - S1: Provides the general framework and materiality guidance

In [29]:
test3 = ["What is 2 + 2?",
         "What did MTN report as their governance structure?",
         "What does Oluwaseyi mean?",
        "What are the GRI standards good for?"
    
]
query_agent(test3, thread_id="test3")


Query: What is 2 + 2?
Agent: 4
Decision: ANSWERED DIRECTLY



Query: What did MTN report as their governance structure?
Agent: MTN reported that their governance structure centers on the Board of Directors and the Executive Team, with key governance areas including Finance and Investment and Digital and Innovation. As of 31 December 2024, the Board consisted of 14 members (11 males and 3 females). MTN said its governance policies and practices are designed to create long-term value while fulfilling responsibilities to shareholders and stakeholders, upholding the highest standards of corporate governance.

Source: MTN Nigeria governance section (as of December 31, 2024).
Decision: RETRIEVED



Query: What does Oluwaseyi mean?
Agent: Oluwaseyi is a Yoruba given name meaning "God has done this" or "This is the work of God." It combines:
- Oluwa = God
- se = has done/made
- yi = this

It's a unisex name, common among Yoruba speakers. Pronunciation (approx): oh-LOO-wah-SEY-ee.
Decision: RE

Domain Choice and Rationale
I chose sustainability and corporate governance reporting as the domain for this agentic RAG system because it aligns directly with my current role as a sustainability intern. In practice, I regularly interact with frameworks such as IFRS Sustainability Disclosure Standards, GRI Standards, and ESG-related corporate disclosures. These materials are often lengthy, technical, and scattered across reports, which makes them a strong candidate for a retrieval-augmented system. Building the agent around this domain allowed me to test how well it could support real-world sustainability work, such as summarising company strategies or clarifying reporting standards, rather than relying on abstract or purely academic examples.


Chunk Size Tuning
Chunk size was tuned through a balance of practicality and experimentation. Sustainability and governance documents often contain closely related ideas across governance, strategy, and metrics, so overly small chunks risked breaking context, while very large chunks reduced retrieval accuracy. The final setup produced just over 6,000 chunks, which proved to be a good middle ground. This size allowed the agent to retrieve coherent sections—such as governance structures or sustainability strategies—without pulling in excessive, unrelated content. The results suggest that the chosen chunk size preserved meaning while still enabling precise similarity search.


Retrieval Decision Quality
The agent generally made good retrieval decisions. In the tests, conversational or general knowledge questions such as “What’s up?”, “What is Python?”, and “What is 2 + 2?” were answered directly without retrieval, which is the desired behaviour. Questions that clearly required document grounding—such as “What did MTN report as their governance structure?” and “Give me a comprehensive summary of Access Bank’s sustainability strategy”—correctly triggered retrieval. This shows that the routing logic is largely effective. However, a few general knowledge questions (for example, “Why do doctors wear white?”) were routed to retrieval when a direct answer would have been sufficient, indicating room for refinement.


What Worked Well and What Needs Improvement
What worked particularly well was the agent’s ability to summarise sustainability-related content in a structured and readable way once relevant documents were retrieved. This is especially useful in a sustainability context, where clarity and accuracy matter. The main area for improvement is retrieval selectivity. Introducing a stronger distinction between general knowledge and document-dependent questions would reduce unnecessary retrieval calls and improve efficiency. Overall, the system performed well for sustainability-focused use cases and shows clear potential as a practical support tool for sustainability and ESG analysis.