In [None]:
# paper used for this project

"""

file:///C:/Users/gobin/Documents/Agentic%20AI/Analytics%20Vidhya/section%204,%20RAG%20Systems%20Essentials/Notebooks/Module%207/Agentic_AI_Frameworks.pdf

"""

In [None]:
"""

# Study Title:
System Architecture, Rollout, and Evaluation of an End-to-End Retrieval-Augmented Generation (RAG) Pipeline with Agentic AI

# System Overview:
End-to-end RAG project is completed in Jupyter Notebook with Anaconda Navigator. The project covers:  
•	Upload pdf paper,
•	 Make chunks from the paper
•	Do semantic embeddings
•	Use LLM for knowledge processing with relevant information
•	Analyze the precision and truthfulness of the outcomes
This notebook includes user questions, real answers from the paper, and artificial responses used for building an end-to-end RAG pipeline. The artificial responses stand in for the RAG output and are meant to be replaced with real responses from the RAG system. This pipeline is applicable to Python ML/Generative AI and Agentic AI projects.

# Tech Stack:
•	LLM
•	Agentic AI
•	Prompt engineering
•	LangChain
•	RecursiveCharacterTextSplitter
•	Embedding model
•	DeepEval
•	pandas
•	Dataset created for RAG system
•	Evaluate RAG pipeline
•	Python jupyter Notebook
•	RAGAS

# Study Aim:
The objectives, aimed at a RAG pipeline with quality analysis that others can easily replicate, include:
•	Make an end-to-end RAG pipeline
•	Run the pipeline
•	Use an AI-generated answer set for RAG evaluation.
# Critical Points:
•	Design RAG pipeline framework 
•	Pipeline supported with Agentic AI
•	Artificial data creation
•	RAG Accuracy and Faithfulness Scores: 51.80 % (answer correctness), 72.22 % (faithfulness)


"""

In [1]:
import sys
#print(sys.executable)


In [None]:
# Need to load these versions

# !pip install langchain==0.0.351
# !pip install langchain-community==0.0.12
# !pip install langchain-core==0.1.3
# !pip install langchain-openai==0.0.2
# !pip install langsmith==0.1.0


In [5]:


!pip install chromadb sentence-transformers




In [7]:
# RAGAS evaluation

!pip install ragas==0.1.6 datasets evaluate



In [9]:
!pip install tqdm dill deepeval ipykernel




In [1]:
!pip install python-dotenv




In [2]:
import os

In [3]:
from dotenv import load_dotenv

In [4]:

# Read variables from a local .env file into the process environment.
load_dotenv()

openapi_key = os.getenv('OPENAI_API_KEY')

# Fail early with a readable message; slicing None below would otherwise
# raise an unhelpful TypeError.
if not openapi_key:
    raise RuntimeError(
        'OPENAI_API_KEY is not set; add it to your .env file or environment.'
    )

# check: print only the first three characters, never the full key
print('part of key:', openapi_key[:3])

part of key: sk-


In [11]:
from langchain_community.document_loaders import PyPDFLoader


In [12]:
# Load the paper and find the number of pages in the paper.

# paper_location is not assigned in any other visible cell, so it is defined
# here to keep the notebook runnable on a fresh kernel. Set the PAPER_LOCATION
# environment variable to point at the PDF; by default the file is expected
# next to the notebook.
paper_location = os.getenv('PAPER_LOCATION', 'Agentic_AI_Frameworks.pdf')

pdf_loader = PyPDFLoader(paper_location)
paper = pdf_loader.load()
print(f'Number of pages: {len(paper)}.')


Number of pages: 8.


In [13]:
# Preview the start of the last page. Index -1 selects the last page for any
# page count, instead of assuming the document has exactly 8 pages.
print("Some text from the last page:\n", paper[-1].page_content[:700])

Some text from the last page:
 International Conference on Robot and Human Interactive Com-
munication (RO-MAN). IEEE, 2019, pp. 1–8.
[24] C. DeChant, “Episodic memory in ai agents poses risks
that should be studied and mitigated,” arXiv preprint
arXiv:2501.11739, 2025.
[25] A. M. Nuxoll and J. E. Laird, “Enhancing intelligent agents with
episodic memory,” Cognitive Systems Research, vol. 17, pp. 34–
48, 2012.
[26] S. Joshi, “A comprehensive survey of ai agent frameworks
and their applications in financial services,” Available at SSRN
5252182, 2025.
[27] I. Okpala, A. Golgoon, and A. R. Kannan, “Agentic ai systems
applied to tasks in financial services: Modeling and model risk
management crews,” arXiv preprint arXiv:2502.


In [17]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [22]:
# Configure the recursive splitter that prepares the paper text for the RAG system.
recursive_character_text_splitter_paper = RecursiveCharacterTextSplitter(
    chunk_size=400,
    chunk_overlap=70,
)

# Break the loaded pages into chunks for semantic search.
document_chunks = recursive_character_text_splitter_paper.split_documents(paper)

# Report how many chunks were produced.
chunk_count = len(document_chunks)
print('number of chunks:\n', chunk_count)

print()

# Preview the text of one chunk (index 20).
sample_chunk_text = document_chunks[20].page_content
print('pick one of the chunks :\n', sample_chunk_text[:1100])


number of chunks:
 116

pick one of the chunks :
 (tool use) in an iterative loop.
To break it down, we believe that modern agents
fundamentally differ from classical agents (e.g., Belief-
Desire-Intention (BDI) agents) by leveraging LLMs and
advanced technologies as versatile reasoning engines
and dynamic tool portfolios. Table I presents a compar-
ison between traditional and modern AI agents.
Given this broad evolution, it is now necessary to


In [25]:
# Pull the raw text out of every chunk.
segment_texts = []
for chunk in document_chunks:
    segment_texts.append(chunk.page_content)

# Display the first three chunk texts.
segment_texts[:3]



['Agentic AI Frameworks: Architectures,\nProtocols, and Design Challenges\nHana Derouiche\nUniversity of Kairouan\nSMART Lab, University of Tunis , Tunisia\nhana.darouiche@gmail.com, 0009-0009-4162-5633\nZaki Brahmi\nUniversity of Sousse\nRiadi Lab, Compus Manouba , Tunisia\nzakibrahmi@gmail.com, 0000-0002-0432-4817\nHaithem Mazeni\nUniversity of Jandouba , Tunisia\nhaithem.mezni@gmail.com, 0000-0001-9932-8433',
 'haithem.mezni@gmail.com, 0000-0001-9932-8433\nAbstract—The emergence of Large Language Models\n(LLMs) has ushered in a transformative paradigm in\nartificial intelligence, Agentic AI, where intelligent agents\nexhibit goal-directed autonomy, contextual reasoning, and\ndynamic multi-agent coordination. This paper provides\na systematic review and comparative analysis of lead-',
 'a systematic review and comparative analysis of lead-\ning Agentic AI frameworks, including CrewAI, Lang-\nGraph, AutoGen, Semantic Kernel, Agno, Google ADK,\nand MetaGPT, evaluating their architectur

In [27]:
# Turn every line break into a single space so each chunk reads as one line.
clean_text_segments = [' '.join(text.split('\n')) for text in segment_texts]

# Display the first three cleaned texts.
clean_text_segments[:3]


['Agentic AI Frameworks: Architectures, Protocols, and Design Challenges Hana Derouiche University of Kairouan SMART Lab, University of Tunis , Tunisia hana.darouiche@gmail.com, 0009-0009-4162-5633 Zaki Brahmi University of Sousse Riadi Lab, Compus Manouba , Tunisia zakibrahmi@gmail.com, 0000-0002-0432-4817 Haithem Mazeni University of Jandouba , Tunisia haithem.mezni@gmail.com, 0000-0001-9932-8433',
 'haithem.mezni@gmail.com, 0000-0001-9932-8433 Abstract—The emergence of Large Language Models (LLMs) has ushered in a transformative paradigm in artificial intelligence, Agentic AI, where intelligent agents exhibit goal-directed autonomy, contextual reasoning, and dynamic multi-agent coordination. This paper provides a systematic review and comparative analysis of lead-',
 'a systematic review and comparative analysis of lead- ing Agentic AI frameworks, including CrewAI, Lang- Graph, AutoGen, Semantic Kernel, Agno, Google ADK, and MetaGPT, evaluating their architectural principles, commun

In [46]:
#!pip install deepeval dill tqdm ipykernel

In [30]:
!pip install pypdf




In [31]:
from deepeval.synthesizer import Synthesizer

In [32]:
 # Activate the Synthesizer with gpt-4o for making synthetic questions and answers. 
# NOTE(review): this synthesizer is not referenced by any other visible cell.
synthesizer_questions_answers = Synthesizer(
    model='gpt-4o',
)


In [33]:

from deepeval.synthesizer.types import Evolution


In [34]:
# Hand-written evaluation set. Each record holds, in order:
#   1. a user question for the RAG system,
#   2. the reference answer written by hand from the paper,
#   3. an artificial response that stands in for the output of a RAG model.
evaluation_records = [
    (
        'What are the Agentic AI Frameworks: Architectures, Protocols, and Design Challenges?',
        'Agentic AI Frameworks are architectures, protocols, and design workflows for self-operating AI systems that shows goal-directed autonomy, contextual reasoning, and dynamic multi-agent coordination.',
        'Agentic AI Frameworks are architectures and protocols for self-generated AI systems.',
    ),
    (
        'What are the main limitations and challenges in this paper?',
        'key limitations and challenges of agentic AI frameworks cover architectural rigidity, dynamic collaboration constraints, safety risks, and lack of interoperability.',
        'Design challenges cover some reasoning, and protocol limitations.',
    ),
    (
        'Provide all authors involved in this paper.',
        'Hana Derouiche, Haithem Mazeni, Zaki Brahmi.',
        'Zaki Brahmi',
    ),
]

# Split the records into the three parallel lists used by the later cells.
questions = [record[0] for record in evaluation_records]
real_answer_paper = [record[1] for record in evaluation_records]
artificial_responses = [record[2] for record in evaluation_records]

In [36]:
import pandas as pd

# Assemble the evaluation table column by column.
# Every question is paired with the same full list of chunk texts as its context.
evaluation_columns = {
    'question': questions,
    'ground_truth': real_answer_paper,
    'answer': artificial_responses,
    'contexts': [segment_texts for _ in questions],
}
data_frame = pd.DataFrame(evaluation_columns)

# Display the first rows of the table.
data_frame.head()


Unnamed: 0,question,ground_truth,answer,contexts
0,What are the Agentic AI Frameworks: Architectu...,"Agentic AI Frameworks are architectures, proto...",Agentic AI Frameworks are architectures and pr...,"[Agentic AI Frameworks: Architectures,\nProtoc..."
1,What are the main limitations and challenges i...,key limitations and challenges of agentic AI f...,"Design challenges cover some reasoning, and pr...","[Agentic AI Frameworks: Architectures,\nProtoc..."
2,Provide all authors involved in this paper.,"Hana Derouiche, Haithem Mazeni, Zaki Brahmi.",Zaki Brahmi,"[Agentic AI Frameworks: Architectures,\nProtoc..."


In [38]:
from datasets import Dataset

# Convert the pandas frame into a Hugging Face Dataset for the RAG evaluation.
build_dataset = Dataset.from_pandas
hugging_face_dataset = build_dataset(data_frame)

# Emit a blank line in the cell output.
print()


  from .autonotebook import tqdm as notebook_tqdm





In [40]:
# Import the two ragas metric classes and create one instance of each.
from ragas.metrics import AnswerCorrectness, Faithfulness

ragas_answer_correctness, ragas_faithfulness = AnswerCorrectness(), Faithfulness()

In [41]:
from ragas import evaluate

# Run the ragas evaluation on the dataset with both metrics.
evaluation_metrics = [ragas_answer_correctness, ragas_faithfulness]
rag_outcomes = evaluate(hugging_face_dataset, metrics=evaluation_metrics)


Evaluating: 100%|██████████| 6/6 [00:04<00:00,  1.34it/s]


In [42]:
# Print the evaluation result under a header line.
outcome_header = 'outcomes from RAG evaluation:\n'
print(outcome_header, rag_outcomes)


outcomes from RAG evaluation:
 {'answer_correctness': 0.5180, 'faithfulness': 0.7222}


In [None]:
"""

# Summary of Findings:
An end-to-end RAG system, from document loading to LLM-based evaluation, is shown. The results indicate that the system is only partially faithful, so
there is room for improvement. In this project, I have demonstrated solid experience with RAG system frameworks using NLP and LLMs,
Agentic AI systems, and analytical evaluation pipelines.

# Optimization of RAG Performance Metrics:
We can do the following things to get better scores:
•	Experiment with different chunk sizes and chunk overlap sizes.
•	Find better embedding models
•	Consider using a reranker model
•	Refine structured prompt patterns
•	Force the LLM to look only at the retrieved text
•	Tune k (the number of retrieved chunks) to find the best value.
•	Drop noisy chunks
•	Do chunk quality filtering
•	Use the best available LLM evaluator

"""