## An Agentic RAG System
An agentic RAG system on Computer Science domain knowledge


#### Importing required libraries

In [2]:
import os
from dotenv import load_dotenv
from IPython.display import Image, display
from langgraph.graph import START, END, StateGraph, MessagesState
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langgraph.prebuilt import ToolNode
from langgraph.checkpoint.memory import MemorySaver
from langchain_core.tools import tool
from langchain_core.messages import AIMessage, ToolMessage, HumanMessage, SystemMessage, ToolMessage
from typing import Literal
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_chroma import Chroma
from pathlib import Path
# Size ceiling in megabytes: the PDF loading loop skips any file larger than this.
MAX_MB = 350

#### Load API key and initialize the LLM

In [3]:
# Pull variables from the .env file into the environment before the chat
# model client is constructed.
load_dotenv()

# Directory later handed to Chroma as its persist_directory.
persist_dir = "./chroma_db"

# Chat model used by the rest of the notebook (temperature 0).
llm = ChatOpenAI(temperature=0, model="gpt-4o-mini")

### Load PDFs as Documents with metadata

In [4]:
def _load_pdf(file_path: str):
    """Open ``file_path`` with PyMuPDFLoader and return the result of its ``load()`` call."""
    return PyMuPDFLoader(file_path).load()

In [5]:
def load_pdf_with_metadata(file_path: str, document_name: str, institution: str = "University of Ibadan"):
    """Load a PDF and tag every page with provenance metadata.

    Each page's ``metadata`` dict is updated in place with:

    * ``document_name`` - the name passed in by the caller
    * ``institution``   - the owning institution
    * ``page_no``       - the loader's ``page`` value plus one
                          (a missing or falsy ``page`` counts as 0)
    * ``source_file``   - ``file_path`` converted to a string

    Returns the pages produced by ``_load_pdf``.
    """
    loaded_pages = _load_pdf(file_path)
    source = str(file_path)
    for current in loaded_pages:
        meta = current.metadata
        raw_index = meta.get("page") or 0
        meta["document_name"] = document_name
        meta["institution"] = institution
        meta["page_no"] = raw_index + 1
        meta["source_file"] = source
    return loaded_pages

# Load every PDF found in the documents directory.
documents = []
skipped = []  # (file name, reason) for each PDF that was not loaded

# os.getenv returns None when DOCS_DIR is unset and Path(None) raises a
# TypeError, so fall back to the local "docs" folder in that case.
doc_dir = Path(os.getenv("DOCS_DIR", "docs"))
pdfs = sorted(doc_dir.glob("*.pdf"))

if not pdfs:
    print ("No pdf file found")

for i, pdf in enumerate(pdfs, 1):
    size_mb = pdf.stat().st_size / (1024 * 1024)
    print(f"[{i}/{len(pdfs)}] Loading: {pdf.name} ({size_mb:.1f} MB)...", flush=True)
    # Oversized files are recorded and skipped rather than loaded.
    if size_mb > MAX_MB:
        skipped.append((pdf.name, f"Too large ({size_mb:.1f} MB)"))
        print("   SKIP (too large)", flush=True)
        continue
    try:
        pages = load_pdf_with_metadata(pdf, pdf.name)
        documents.extend(pages)
        print(f"Loads: {len(pages)} pages", flush=True)
    except Exception as e:
        # Best effort: one unreadable PDF must not abort the whole batch.
        skipped.append((pdf.name, str(e)))
        print(f"    SKIP: {pdf.name} -> {e}", flush=True)

# Review loaded docs
print(f"✅ Total loaded pages: {len(documents)}")
if skipped:
    print("\nSkipped files:")
    for s in skipped:
        print(" -", s)

# Show the metadata of the first few pages as a sanity check.
for doc in documents[:3]:
    print(doc.metadata)



[1/5] Loading: FINAL_Student_Hand_Book.pdf (302.5 MB)...
Loads: 216 pages
[2/5] Loading: GENDER_POLICY_DOMMY.pdf (1.0 MB)...
Loads: 23 pages
[3/5] Loading: handbook.pdf (29.1 MB)...
Loads: 84 pages
[4/5] Loading: MANUAL_OF_STYLE.pdf (0.2 MB)...
Loads: 27 pages
[5/5] Loading: SEXUAL_HASSASSMENT_HANDOUT_DOMMY.pdf (0.9 MB)...
Loads: 12 pages
✅ Total loaded pages: 362
{'producer': 'Corel PDF Engine Version 22.1.1.523', 'creator': 'CorelDRAW 2020', 'creationdate': '2025-03-18T09:55:05-07:00', 'source': 'docs\\FINAL_Student_Hand_Book.pdf', 'file_path': 'docs\\FINAL_Student_Hand_Book.pdf', 'total_pages': 216, 'format': 'PDF 1.7', 'title': 'FINAL Student Hand Book.cdr', 'author': 'UI PRESS', 'subject': '', 'keywords': '', 'moddate': '2025-03-18T09:55:05-07:00', 'trapped': '', 'modDate': "D:20250318095505-07'00'", 'creationDate': "D:20250318095505-07'00'", 'page': 0, 'document_name': 'FINAL_Student_Hand_Book.pdf', 'institution': 'University of Ibadan', 'page_no': 1, 'source_file': 'docs\\FINAL_

### Chunking the documents

In [6]:
## Chunk document
def chunking(documents):
    """Split ``documents`` into overlapping text chunks.

    A RecursiveCharacterTextSplitter is configured with a chunk size of 1000,
    an overlap of 100 and the separator list given below; the result of its
    ``split_documents`` call is returned.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", ". ", " ", ""],
        chunk_overlap=100,
        chunk_size=1000,
    )
    return text_splitter.split_documents(documents=documents)
chunks = chunking(documents)
print(f"✅ Complete! with total {len(chunks)} chunks")
# chunks[0] raises IndexError when no chunks exist, so only preview a sample
# when there is at least one.
if chunks:
    print("\nSample chunk:")
    print(f"{chunks[0].page_content[:200]}...")
else:
    print("\nNo chunks were produced - check that the PDFs were loaded.")

✅ Complete! with total 459 chunks

Sample chunk:
Student 
Information 
Handbook 
S t u d e n t I n f o r m a Ɵ o n H a n d b o o k   
2 0 2 3 / 2 0 2 4...


In [7]:
# add ids to metadata
import hashlib
def add_chunk_id(chunks):
    """Build a deterministic SHA-1 id for a single chunk.

    Args:
        chunks: One chunk (the plural name is kept for compatibility) that
            exposes a ``metadata`` mapping and a ``page_content`` string.

    Returns:
        The hexadecimal SHA-1 digest of the UTF-8 encoded string
        ``"<document_name>|<page_no>|<page_content>"``. Missing metadata
        keys are rendered as ``None``.
    """
    # Extract the values first: reusing double quotes inside a double-quoted
    # f-string is a SyntaxError on Python versions before 3.12.
    document_name = chunks.metadata.get("document_name")
    page_no = chunks.metadata.get("page_no")
    key = f"{document_name}|{page_no}|{chunks.page_content}"
    return hashlib.sha1(key.encode("utf-8")).hexdigest()

# One deterministic id per chunk, in the same order as `chunks`.
ids = list(map(add_chunk_id, chunks))
print("✅ Ids add to the chunks successfully!")


✅ Ids add to the chunks successfully!


### Vector Store

In [8]:
# Embedding model setup
load_dotenv()

embed = OpenAIEmbeddings(model="text-embedding-3-small")

# Vector store setup: Chroma collection "UI_Policies" with persist_dir as its
# persist directory; every chunk is added under its precomputed id.
vectorstore = Chroma(
    persist_directory=persist_dir,
    collection_name="UI_Policies",
    embedding_function=embed,
)
vectorstore.add_documents(documents=chunks, ids=ids)

# Report the collection's record count after the insert.
vec_cont = vectorstore._collection.count()
print(f" {len(chunks)} chunks added to Chroma and created {vec_cont} vector store")

 459 chunks added to Chroma and created 459 vector store


In [9]:
# Sanity check: run a similarity search (k=4) for a sample question and
# display the text of the first hit.
query = "What is full meaning of UI"
result = vectorstore.similarity_search(query, k=4)
result[0].page_content

"WELCOME TO UI!\nUNIVERSITY MOTTO\nRECTE SAPERE FONS\n(For Knowledge and Sound Judgment)\nVision\nTo be a world-class institution for academic excellence geared towards \nmeeting societal needs\nMission\n \nTo expand the frontiers of knowledge through provision of \nexcellent conditions for learning and research.\n \nTo produce graduates who are worthy in character and sound \n \njudgment.\n \nTo contribute to the transformation of society through  \n \ncreativity and innovation. \n \nTo serve as a dynamic custodian of society's salutary values and \nthus sustain its integrity.\n1"

### Creating the Retriever tool

In [10]:
@tool
def doc_retriever(query: str)->str:
    """Search the knowledge base for relevant documents.

    Use this tool to answer any question about the University of Ibadan.
    Do not use it for general-knowledge questions or ordinary greetings.
    Use it when keywords in the query align with words in the documents.

    Args:
        query: The question to search for.

    Returns:
        The contents of the retrieved documents joined into one string, or a
        short notice when nothing is retrieved.
    """
    # The store is opened inside the tool, so the tool only depends on
    # persist_dir and not on variables created by other cells.
    embedding = OpenAIEmbeddings(model="text-embedding-3-small")
    vectorstore = Chroma(
        collection_name="UI_Policies",
        persist_directory=persist_dir,
        embedding_function=embedding,
    )
    retriever = vectorstore.as_retriever(
        search_type="mmr",
        # The keyword is "lambda_mult"; the previous spelling "lamda_mult" is
        # not a parameter name the MMR search recognises.
        search_kwargs={"k": 4, "fetch_k": 20, "lambda_mult": 0.5},
    )
    response = retriever.invoke(query)

    if not response:
        return "No relevance document found."

    # Number the documents from 1 and separate them with a horizontal rule.
    return "\n\n---\n\n".join(
        f"Document {i+1}:\n{doc.page_content}"
        for i, doc in enumerate(response)
    )

### Testing the retriever function

In [11]:
# Open the "UI_Policies" collection and record how many entries it holds
embedding = OpenAIEmbeddings(model="text-embedding-3-small")
vectorstore = Chroma(
    embedding_function=embedding,
    persist_directory=persist_dir,
    collection_name="UI_Policies",
)
db_count = vectorstore._collection.count()

# Run the retriever tool on a sample question
query = "University of ibadan was found when?"
results = doc_retriever.invoke(query)

print("Total count is:", db_count)
# `results` is the single string returned by the tool, so it is printed as-is.
print(results)

Total count is: 459
Document 1:
ABOUT THE UNIVERSITY OF IBADAN (UI)
UI Main Gate
Congratulations on your admission to the University of Ibadan, the Great 
UI! Welcome to the paths trodden by several generations of great women and 
men. Here is a brief history of UI which is meant to connect you with our rich 
past, link you with our richer present and prepare you to be part of our 
glorious future.
 
Established in 1948, the University of Ibadan, UI as it is fondly referred 
to, is the first University in Nigeria. Until 1962 when it became a full-fledged 
independent University, it was a College of the University of London in a 
special relationship scheme. Today, the University has a total enrolment of 
over 26,000 students shared among 17 different faculties: Arts, Science, 
Basic Medical Sciences, Clinical Sciences, Agriculture, The Social Sciences, 
Education, Veterinary Medicine, Technology, Pharmacy, Law, Public Health, 
Dentistry, Economics and Management Sciences, Renewable Nat

### Binding the LLM with the tool


In [15]:
# Bind the retriever tool to the chat model.
tools = [doc_retriever]
llm_with_tool = llm.bind_tools(tools)

## Design the Graph

In [None]:
# StateGraph takes the state schema as its argument. MemorySaver is a
# checkpointer, not a schema; pass it to builder.compile(checkpointer=...).
builder = StateGraph(MessagesState)