## An Agentic RAG System
An agentic RAG system on Computer Science domain knowledge


#### Importing require libraries

In [23]:
import os,time
from dotenv import load_dotenv
from IPython.display import Image, display
from langgraph.graph import START, END, StateGraph, MessagesState
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langgraph.prebuilt import ToolNode
from langgraph.checkpoint.memory import MemorySaver
from langchain_core.tools import tool
from langchain_core.messages import AIMessage, ToolMessage, HumanMessage, SystemMessage, ToolMessage
from typing import Literal
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_chroma import Chroma
from pathlib import Path
MAX_MB=350

#### Load api key and intialize llm

In [24]:
load_dotenv()
llm= ChatOpenAI(model='gpt-4o-mini', temperature=0)

### Load PDF with metadata Document

In [25]:
def _load_pdf(file_path:str):
    loader= PyMuPDFLoader(file_path)
    return loader.load()

In [26]:
def load_pdf_with_metadata(file_path:str,document_name:str, institution:str="University of Ibadan"):
    pages= _load_pdf(file_path)
    for page in pages:
        page.metadata.update({
            "document_name": document_name,
            "institution": institution,
            "page_no": (page.metadata.get("page") or 0)+1,
            "source_file": str(file_path),
        })
    return pages

# Now using the function to load the pdf files
documents=[]
skipped=[]
doc_dir= Path(os.getenv("DOCS_DIR"))
pdfs= sorted(doc_dir.glob("*.pdf"))

if not pdfs:
    print ("No pdf file found")

for i, pdf in enumerate(pdfs,1):
    size_mb=pdf.stat().st_size/(1024 * 1024)
    print(f"[{i}/{len(pdfs)}] Loading: {pdf.name} ({size_mb:.1f} MB)...", flush=True)
    if size_mb>MAX_MB:
        skipped.append((pdf.name, f"Too large ({size_mb:.1f} MB)"))
        print(f"   SKIP (too large)", flush=True)
        continue 
    try:
        pages=load_pdf_with_metadata(pdf,pdf.name)
        documents+=pages
        print(f"Loads: {len(pages)} pages", flush=True)
    except Exception as e:
        skipped.append((pdf.name, str(e)))
        print(f"    SKIP: {pdf.name} -> {e}", flush=True)
# Review loaded docs
print(f"✅ Total loaded pages: {len(documents)}")
if skipped:
    print("\nSkipped files:")
    for s in skipped:
        print(" -", s)

for doc in documents[:3]:
    print(doc.metadata)



TypeError: argument should be a str or an os.PathLike object where __fspath__ returns a str, not 'NoneType'

### Chunking the documents

In [None]:
## Chunk document
def chunking (documents):
    splitter= RecursiveCharacterTextSplitter(
        chunk_size= 1000,
        chunk_overlap= 100,
        separators=["\n\n","\n",". "," ",""],
    )
    chunks= splitter.split_documents(documents=documents)
    return chunks
chunks= chunking(documents)
print(f"✅ Complete! with total {len(chunks)} chunks")
print(f"\nSample chunk:")
print(f"{chunks[0].page_content[:200]}...")

In [None]:
# add ids to metadata
import hashlib
def add_chunk_id(chunks):
    key=f"{chunks.metadata.get("document_name")}|{chunks.metadata.get("page_no")}|{chunks.page_content}"
    return hashlib.sha1(key.encode('utf-8')).hexdigest()

ids=[add_chunk_id(id) for id in chunks]
print("✅ Ids add to the chunks successfully!")


### Vector Store

In [None]:
# initializing emmbed
load_dotenv()
persist_dir= "./chroma_db"
embed= OpenAIEmbeddings(model="text-embedding-3-small")

# initializing vectorstore

vectorstore= Chroma(collection_name="UI_Policies",embedding_function=embed, persist_directory=persist_dir)
vectorstore.add_documents(documents=chunks, ids=ids)

vec_cont= vectorstore._collection.count()
print(f" {len(chunks)} chunks added to Chroma and created {vec_cont} vector store", )

In [None]:
# Testing the retriever
query="What is full meaning of UI"
result= vectorstore.similarity_search(query=query,k=4)

### Creating the Retriever tool

In [None]:
@tool
def doc_retriever(query: str)->str:
    """Search knowledge base for relevant documents.

    Use this tool when need to answer any information about University of Ibadan.
    Do not use the tool for general knowledge question or normal greeting

    agrs:
        input: 
            query: question to search for
        output: the fetched documents (query results)
    """
    retriever= Chroma()

_IncompleteInputError: incomplete input (647455926.py, line 2)