# Build a Simple RAG System

## Install OpenAI and LangChain dependencies

In [1]:
!pip install langchain==0.3.10
!pip install langchain-openai==0.2.12
!pip install langchain-community==0.3.11
!pip install dill

Collecting langchain-openai==0.2.12
  Downloading langchain_openai-0.2.12-py3-none-any.whl.metadata (2.7 kB)
Collecting openai<2.0.0,>=1.55.3 (from langchain-openai==0.2.12)
  Downloading openai-1.57.3-py3-none-any.whl.metadata (24 kB)
Collecting tiktoken<1,>=0.7 (from langchain-openai==0.2.12)
  Downloading tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Downloading langchain_openai-0.2.12-py3-none-any.whl (50 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.7/50.7 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading openai-1.57.3-py3-none-any.whl (390 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m390.2/390.2 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m21.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collec

## Install Chroma Vector DB and LangChain wrapper

In [2]:
!pip install langchain-chroma==0.1.4

Collecting langchain-chroma==0.1.4
  Downloading langchain_chroma-0.1.4-py3-none-any.whl.metadata (1.6 kB)
Collecting chromadb!=0.5.4,!=0.5.5,<0.6.0,>=0.4.0 (from langchain-chroma==0.1.4)
  Downloading chromadb-0.5.23-py3-none-any.whl.metadata (6.8 kB)
Collecting fastapi<1,>=0.95.2 (from langchain-chroma==0.1.4)
  Downloading fastapi-0.115.6-py3-none-any.whl.metadata (27 kB)
Collecting build>=1.0.3 (from chromadb!=0.5.4,!=0.5.5,<0.6.0,>=0.4.0->langchain-chroma==0.1.4)
  Downloading build-1.2.2.post1-py3-none-any.whl.metadata (6.5 kB)
Collecting chroma-hnswlib==0.7.6 (from chromadb!=0.5.4,!=0.5.5,<0.6.0,>=0.4.0->langchain-chroma==0.1.4)
  Downloading chroma_hnswlib-0.7.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (252 bytes)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb!=0.5.4,!=0.5.5,<0.6.0,>=0.4.0->langchain-chroma==0.1.4)
  Downloading uvicorn-0.32.1-py3-none-any.whl.metadata (6.6 kB)
Collecting posthog>=2.4.0 (from chromadb!=0.5.4,!=0.

## Install RAG Evaluation Libraries

In [3]:
!pip install ragas==0.2.8
!pip install deepeval==1.4.7

Collecting ragas==0.2.8
  Downloading ragas-0.2.8-py3-none-any.whl.metadata (9.1 kB)
Collecting datasets (from ragas==0.2.8)
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting appdirs (from ragas==0.2.8)
  Downloading appdirs-1.4.4-py2.py3-none-any.whl.metadata (9.0 kB)
Collecting pysbd>=0.3.4 (from ragas==0.2.8)
  Downloading pysbd-0.3.4-py3-none-any.whl.metadata (6.1 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets->ragas==0.2.8)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets->ragas==0.2.8)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets->ragas==0.2.8)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets->ragas==0.2.8)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading ragas-0.2.8-py3-n

## Enter OpenAI API Key

In [1]:
from getpass import getpass

OPENAI_KEY = getpass('Enter Open AI API Key: ')

Enter Open AI API Key: ··········


## Setup Environment Variables

In [2]:
import os

os.environ['OPENAI_API_KEY'] = OPENAI_KEY

### OpenAI Embedding Models

LangChain gives us access to OpenAI's embedding models, including the newest ones: the smaller, highly efficient `text-embedding-3-small` model and the larger, more powerful `text-embedding-3-large` model.

In [3]:
from langchain_openai import OpenAIEmbeddings

# details here: https://openai.com/blog/new-embedding-models-and-api-updates
openai_embed_model = OpenAIEmbeddings(model='text-embedding-3-small')

## Loading and Processing the Data

### Get the dataset

In [4]:
# Download the evaluation corpus (saved as rag_eval_docs.csv in the working directory).
# If the gdown command below fails, download the file manually from
# https://drive.google.com/file/d/1QkSY9W5RyaBnY8c5FLIsmpPVXoHTQ-fb/view?usp=sharing
# and upload it to the Colab working directory.
!gdown 1QkSY9W5RyaBnY8c5FLIsmpPVXoHTQ-fb

Downloading...
From: https://drive.google.com/uc?id=1QkSY9W5RyaBnY8c5FLIsmpPVXoHTQ-fb
To: /content/rag_eval_docs.csv
  0% 0.00/2.66k [00:00<?, ?B/s]100% 2.66k/2.66k [00:00<00:00, 8.13MB/s]


### Load and Process CSV Documents

In [5]:
import pandas as pd

df = pd.read_csv('./rag_eval_docs.csv')
df

Unnamed: 0,id,title,context
0,1,Machine Learning,Machine learning is a field of artificial inte...
1,2,Deep Learning,Deep learning is a subset of machine learning ...
2,3,Natural Language Processing (NLP),NLP is a branch of AI that enables computers t...
3,4,Pyramids,"Pyramids are ancient structures, often serving..."
4,5,Photosynthesis,Photosynthesis is the process plants use to co...
5,6,Biology,"Biology is the study of living organisms, cove..."
6,7,Quantum Mechanics,Quantum mechanics is a branch of physics that ...
7,8,Cryptocurrency,Cryptocurrency is a digital currency that uses...
8,9,Renewable Energy,"Renewable energy sources, such as solar and wi..."
9,10,Artificial Intelligence,Artificial intelligence refers to machines mim...


In [6]:
docs = df.to_dict(orient='records')
docs[:3]

[{'id': 1,
  'title': 'Machine Learning',
  'context': 'Machine learning is a field of artificial intelligence focused on enabling systems to learn patterns from data. Algorithms analyze past data to make predictions or classify information. Popular applications include recommendation systems and image recognition.'},
 {'id': 2,
  'title': 'Deep Learning',
  'context': 'Deep learning is a subset of machine learning utilizing neural networks with many layers. It excels in complex tasks like image and speech recognition. Convolutional and recurrent neural networks are among the common architectures used.'},
 {'id': 3,
  'title': 'Natural Language Processing (NLP)',
  'context': 'NLP is a branch of AI that enables computers to understand, interpret, and generate human language. Techniques include tokenization, stemming, and sentiment analysis. Applications range from chatbots to language translation services.'}]

In [7]:
from langchain.docstore.document import Document

# Wrap every CSV record in a LangChain Document: the `context` text becomes
# the page content, while `title` and `id` travel along as metadata.
processed_docs = [
    Document(
        page_content=record['context'],
        metadata={"title": record['title'], "id": record['id']},
    )
    for record in docs
]
processed_docs[:3]

[Document(metadata={'title': 'Machine Learning', 'id': 1}, page_content='Machine learning is a field of artificial intelligence focused on enabling systems to learn patterns from data. Algorithms analyze past data to make predictions or classify information. Popular applications include recommendation systems and image recognition.'),
 Document(metadata={'title': 'Deep Learning', 'id': 2}, page_content='Deep learning is a subset of machine learning utilizing neural networks with many layers. It excels in complex tasks like image and speech recognition. Convolutional and recurrent neural networks are among the common architectures used.'),
 Document(metadata={'title': 'Natural Language Processing (NLP)', 'id': 3}, page_content='NLP is a branch of AI that enables computers to understand, interpret, and generate human language. Techniques include tokenization, stemming, and sentiment analysis. Applications range from chatbots to language translation services.')]

## Index Document Chunks and Embeddings in Vector DB

Here we create a Chroma vector DB from the processed documents and their embeddings. Because we also want to save the index to disk, we pass the directory where the data should be persisted (`persist_directory`).

In [None]:
from langchain_chroma import Chroma

# create vector DB of docs and embeddings - takes < 30s on Colab
chroma_db = Chroma.from_documents(documents=processed_docs,
                                  collection_name='my_db',
                                  embedding=openai_embed_model,
                                  # stable IDs taken from each document's `id` metadata: re-running this
                                  # cell against the persisted ./my_db then overwrites the same records
                                  # instead of appending duplicates under fresh random IDs
                                  ids=[str(doc.metadata['id']) for doc in processed_docs],
                                  # need to set the distance function to cosine else it uses euclidean by default
                                  # check https://docs.trychroma.com/guides#changing-the-distance-function
                                  collection_metadata={"hnsw:space": "cosine"},
                                  persist_directory="./my_db")

### Load Vector DB from disk

This shows that once a vector database has been saved to disk, you can reconnect to it at any time without re-indexing the documents.

In [None]:
# load from disk
chroma_db = Chroma(persist_directory="./my_db",
                   collection_name='my_db',
                   embedding_function=openai_embed_model)

In [None]:
chroma_db

<langchain_chroma.vectorstores.Chroma at 0x7a5b1dfca560>

### Semantic Similarity based Retrieval

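We retrieve up to the top 3 most similar documents for the user's query, keeping only those whose similarity score is at least 0.3 (the collection was created with the cosine distance function), so a query may return fewer than 3 documents.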

In [None]:
# Retriever over the Chroma collection: returns at most k=3 documents and keeps
# only those whose score reaches the 0.3 threshold, so a query can yield fewer than 3 results.
similarity_retriever = chroma_db.as_retriever(search_type="similarity_score_threshold",
                                              search_kwargs={"k": 3, "score_threshold": 0.3})

In [None]:
from IPython.display import display, Markdown

def display_docs(docs):
    """Show each document: print its metadata, then render its text as Markdown.

    Args:
        docs: iterable of objects exposing `metadata` and `page_content`.
    """
    for entry in docs:
        print('Metadata:', entry.metadata)
        print('Content Brief:')
        rendered = Markdown(entry.page_content)
        display(rendered)
        # blank line between consecutive documents
        print()

In [None]:
query = "what is AI?"
top_docs = similarity_retriever.invoke(query)
display_docs(top_docs)

Metadata: {'id': 10, 'title': 'Artificial Intelligence'}
Content Brief:


Artificial intelligence refers to machines mimicking human intelligence, like problem-solving and learning. AI includes applications like virtual assistants, robotics, and autonomous vehicles. It's evolving rapidly with advancements in machine learning and deep learning.


Metadata: {'id': 3, 'title': 'Natural Language Processing (NLP)'}
Content Brief:


NLP is a branch of AI that enables computers to understand, interpret, and generate human language. Techniques include tokenization, stemming, and sentiment analysis. Applications range from chatbots to language translation services.


Metadata: {'id': 1, 'title': 'Machine Learning'}
Content Brief:


Machine learning is a field of artificial intelligence focused on enabling systems to learn patterns from data. Algorithms analyze past data to make predictions or classify information. Popular applications include recommendation systems and image recognition.




In [None]:
query = "how do plants survive?"
top_docs = similarity_retriever.invoke(query)
display_docs(top_docs)

Metadata: {'id': 5, 'title': 'Photosynthesis'}
Content Brief:


Photosynthesis is the process plants use to convert sunlight into energy. This process produces glucose and releases oxygen as a byproduct. It is crucial for sustaining life on Earth by providing food and oxygen.




## Build the RAG Pipeline

In [None]:
from langchain_core.prompts import ChatPromptTemplate

rag_prompt = """You are an assistant who is an expert in question-answering tasks.
                Answer the following question using only the following pieces of retrieved context.
                If the answer is not in the context, do not make up answers, just say that you don't know.
                Keep the answer to the point based on the information from the context.

                Question:
                {question}

                Context:
                {context}

                Answer:
            """

rag_prompt_template = ChatPromptTemplate.from_template(rag_prompt)

In [None]:
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_core.runnables import RunnableLambda
from operator import itemgetter


chatgpt = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)

def format_docs(docs):
    """Concatenate the `page_content` of every document, separated by a blank line.

    Args:
        docs: iterable of objects exposing a `page_content` string.

    Returns:
        A single string; empty when `docs` is empty.
    """
    contents = [item.page_content for item in docs]
    return "\n\n".join(contents)

# Answer-generation chain. Expects a dict with 'context' (retrieved documents)
# and 'question': the documents are joined into one string by format_docs, both
# values fill rag_prompt_template, the prompt is sent to the chat model, and
# StrOutputParser turns the model reply into a plain string.
src_rag_response_chain = (
    {
        "context": (itemgetter('context')
                        |
                    RunnableLambda(format_docs)),
        "question": itemgetter("question")
    }
        |
    rag_prompt_template
        |
    chatgpt
        |
    StrOutputParser()
)

# Retrieval + generation chain. The input query is sent to similarity_retriever
# (result stored under 'context') and also passed through unchanged (stored under
# 'question'); RunnablePassthrough.assign then adds a 'response' key computed by
# src_rag_response_chain, so the output keeps the source documents next to the answer.
rag_chain_w_sources = (
    {
        "context": similarity_retriever,
        "question": RunnablePassthrough()
    }
        |
    RunnablePassthrough.assign(response=src_rag_response_chain)
)

In [None]:
query = "What is AI?"
result = rag_chain_w_sources.invoke(query)
result

{'context': [Document(metadata={'id': 10, 'title': 'Artificial Intelligence'}, page_content="Artificial intelligence refers to machines mimicking human intelligence, like problem-solving and learning. AI includes applications like virtual assistants, robotics, and autonomous vehicles. It's evolving rapidly with advancements in machine learning and deep learning."),
  Document(metadata={'id': 3, 'title': 'Natural Language Processing (NLP)'}, page_content='NLP is a branch of AI that enables computers to understand, interpret, and generate human language. Techniques include tokenization, stemming, and sentiment analysis. Applications range from chatbots to language translation services.'),
  Document(metadata={'id': 1, 'title': 'Machine Learning'}, page_content='Machine learning is a field of artificial intelligence focused on enabling systems to learn patterns from data. Algorithms analyze past data to make predictions or classify information. Popular applications include recommendation 

In [None]:
query = "How do plants survive?"
result = rag_chain_w_sources.invoke(query)
result

{'context': [Document(metadata={'id': 5, 'title': 'Photosynthesis'}, page_content='Photosynthesis is the process plants use to convert sunlight into energy. This process produces glucose and releases oxygen as a byproduct. It is crucial for sustaining life on Earth by providing food and oxygen.')],
 'question': 'How do plants survive?',
 'response': 'Plants survive by using photosynthesis to convert sunlight into energy, producing glucose and releasing oxygen as a byproduct.'}

# Create End-to-End RAG Evaluation Workflow

![](https://i.imgur.com/GUIkpjy.png)

## Create a Synthetic RAG Golden Reference Dataset

In [8]:
doc_contexts = [doc.page_content for doc in processed_docs]
doc_contexts[:3]

['Machine learning is a field of artificial intelligence focused on enabling systems to learn patterns from data. Algorithms analyze past data to make predictions or classify information. Popular applications include recommendation systems and image recognition.',
 'Deep learning is a subset of machine learning utilizing neural networks with many layers. It excels in complex tasks like image and speech recognition. Convolutional and recurrent neural networks are among the common architectures used.',
 'NLP is a branch of AI that enables computers to understand, interpret, and generate human language. Techniques include tokenization, stemming, and sentiment analysis. Applications range from chatbots to language translation services.']

In [9]:
from deepeval.synthesizer import Synthesizer
from deepeval.synthesizer import types



In [None]:
# Generate synthetic "golden" question/answer pairs with gpt-4o.
# Each document is passed as its own single-item context, and
# max_goldens_per_context=1 asks for at most one golden per document.
synthesizer = Synthesizer(model='gpt-4o',
                          embedder=OpenAIEmbeddings())

eval_data = synthesizer.generate_goldens(
    # Provide a list of context for synthetic data generation
    contexts=[[doc] for doc in doc_contexts],
    include_expected_output=True,
    max_goldens_per_context=1,
    num_evolutions=1,
    scenario="Retrieval Augmented Generation",
    task="Question Answering",
    # Only REASONING and MULTICONTEXT carry a non-zero weight; the other evolution types are disabled.
    evolutions={
        types.Evolution.REASONING: 0.1,     # Evolves the input to require multi-step logical thinking.
        types.Evolution.MULTICONTEXT: 0.9,  # Ensures that all relevant information from the context is utilized.
        types.Evolution.CONCRETIZING: 0.0,  # Makes abstract ideas more concrete and detailed.
        types.Evolution.CONSTRAINED: 0.0,   # Introduces a condition or restriction, testing the model's ability to operate within specific limits.
        types.Evolution.COMPARATIVE: 0.0,   # Requires a response that involves a comparison between options or contexts.
        types.Evolution.HYPOTHETICAL: 0.0,  # Forces the model to consider and respond to a hypothetical scenario.
        types.Evolution.IN_BREADTH: 0.0,    # Broadens the input to touch on related or adjacent topics.
    }
)

Event loop is already running. Applying nest_asyncio patch to allow async execution...


✨ Generating up to 10 goldens using DeepEval (using gpt-4o, use case=QA, method=default): 100%|██████████| 10/10 [00:20<00:00,  2.00s/it]


In [None]:
eval_data[0]

Golden(input='How can machine learning algorithms use data patterns effectively for predictions and applications such as recommendations?', actual_output=None, expected_output='Machine learning algorithms use data patterns effectively by analyzing historical data to identify trends and relationships. This enables them to make accurate predictions and classifications. For applications like recommendation systems, these algorithms assess user behavior to suggest relevant items, while in image recognition, they identify distinct features to classify images accurately.', context=['Machine learning is a field of artificial intelligence focused on enabling systems to learn patterns from data. Algorithms analyze past data to make predictions or classify information. Popular applications include recommendation systems and image recognition.'], retrieval_context=None, additional_metadata={'evolutions': ['Multi-context'], 'synthetic_input_quality': 0.8, 'context_quality': None}, comments=None, t

## Save the Synthetic RAG Golden Reference Dataset

In [None]:
import dill

In [None]:
with open('golden_ref_data.bin', 'wb') as f:
    dill.dump(eval_data, f)

## Create RAG Evaluation Dataset

In [None]:
from deepeval.dataset import EvaluationDataset

eval_dataset = EvaluationDataset()

# Load the golden reference data that was serialized with dill into golden_ref_data.bin.
# NOTE: dill.load can execute arbitrary code while unpickling - only load files you created yourself.
with open('golden_ref_data.bin', 'rb') as f:
    golden_docs = dill.load(f)

# Attach the loaded goldens to the (initially empty) evaluation dataset.
eval_dataset.goldens = golden_docs

In [None]:
eval_dataset.goldens[0]

Golden(input='How can machine learning algorithms use data patterns effectively for predictions and applications such as recommendations?', actual_output=None, expected_output='Machine learning algorithms use data patterns effectively by analyzing historical data to identify trends and relationships. This enables them to make accurate predictions and classifications. For applications like recommendation systems, these algorithms assess user behavior to suggest relevant items, while in image recognition, they identify distinct features to classify images accurately.', context=['Machine learning is a field of artificial intelligence focused on enabling systems to learn patterns from data. Algorithms analyze past data to make predictions or classify information. Popular applications include recommendation systems and image recognition.'], retrieval_context=None, additional_metadata={'evolutions': ['Multi-context'], 'synthetic_input_quality': 0.8, 'context_quality': None}, comments=None, t

In [None]:
eval_dataset.goldens[0].input

'How can machine learning algorithms use data patterns effectively for predictions and applications such as recommendations?'

In [None]:
rag_chain_w_sources.invoke(eval_dataset.goldens[0].input)

{'context': [Document(metadata={'id': 1, 'title': 'Machine Learning'}, page_content='Machine learning is a field of artificial intelligence focused on enabling systems to learn patterns from data. Algorithms analyze past data to make predictions or classify information. Popular applications include recommendation systems and image recognition.'),
  Document(metadata={'id': 2, 'title': 'Deep Learning'}, page_content='Deep learning is a subset of machine learning utilizing neural networks with many layers. It excels in complex tasks like image and speech recognition. Convolutional and recurrent neural networks are among the common architectures used.'),
  Document(metadata={'id': 10, 'title': 'Artificial Intelligence'}, page_content="Artificial intelligence refers to machines mimicking human intelligence, like problem-solving and learning. AI includes applications like virtual assistants, robotics, and autonomous vehicles. It's evolving rapidly with advancements in machine learning and d

In [None]:
from typing import List
from deepeval.test_case import LLMTestCase
from deepeval.dataset import Golden
from tqdm import tqdm

def convert_goldens_to_test_cases(goldens: List[Golden], rag_chain=None) -> List[LLMTestCase]:
    """Run a RAG chain on every golden and wrap the outcome in an LLMTestCase.

    Args:
        goldens: golden records providing `input`, `expected_output` and `context`.
        rag_chain: optional runnable whose `invoke(question)` returns a dict with a
            'response' string and a 'context' list of documents. Defaults to the
            notebook-level `rag_chain_w_sources`, so existing calls behave as before.

    Returns:
        One LLMTestCase per golden, in the same order. `actual_output` holds the
        chain's response and `retrieval_context` the text of the retrieved documents.
    """
    if rag_chain is None:
        # resolved at call time so the notebook's default chain is still used when omitted
        rag_chain = rag_chain_w_sources
    test_cases = []
    for golden in tqdm(goldens):
        response_obj = rag_chain.invoke(golden.input)
        test_case = LLMTestCase(
            input=golden.input,
            actual_output=response_obj['response'],
            expected_output=golden.expected_output,
            context=golden.context,
            # the metrics need plain strings, not Document objects
            retrieval_context=[doc.page_content for doc in response_obj['context']]
        )
        test_cases.append(test_case)
    return test_cases

In [None]:
eval_dataset.test_cases = convert_goldens_to_test_cases(eval_dataset.goldens)

100%|██████████| 10/10 [00:13<00:00,  1.31s/it]


In [None]:
eval_dataset.test_cases[0]

LLMTestCase(input='How can machine learning algorithms use data patterns effectively for predictions and applications such as recommendations?', actual_output='Machine learning algorithms use data patterns effectively for predictions and applications such as recommendations by analyzing past data to make predictions or classify information.', expected_output='Machine learning algorithms use data patterns effectively by analyzing historical data to identify trends and relationships. This enables them to make accurate predictions and classifications. For applications like recommendation systems, these algorithms assess user behavior to suggest relevant items, while in image recognition, they identify distinct features to classify images accurately.', context=['Machine learning is a field of artificial intelligence focused on enabling systems to learn patterns from data. Algorithms analyze past data to make predictions or classify information. Popular applications include recommendation s

## Run and View RAG Evaluations on the Evaluation Dataset

In [None]:
from deepeval import evaluate
from deepeval.metrics import ContextualPrecisionMetric, ContextualRecallMetric, ContextualRelevancyMetric
from deepeval.metrics import AnswerRelevancyMetric, FaithfulnessMetric, HallucinationMetric
from deepeval.metrics.ragas import RAGASAnswerRelevancyMetric

# Every DeepEval metric below uses gpt-4o as the evaluation model and a threshold of 0.5.
# NOTE(review): the Hallucination score is presumably "lower is better", unlike the
# other metrics - confirm against the DeepEval docs before interpreting the summary.
contextual_precision = ContextualPrecisionMetric(threshold=0.5, include_reason=True, model="gpt-4o")
contextual_recall = ContextualRecallMetric(threshold=0.5, include_reason=True, model="gpt-4o")
contextual_relevancy = ContextualRelevancyMetric(threshold=0.5, include_reason=True, model="gpt-4o")
answer_relevancy = AnswerRelevancyMetric(threshold=0.5, include_reason=True, model="gpt-4o")
faithfulness = FaithfulnessMetric(threshold=0.5, include_reason=True, model="gpt-4o")
hallucination = HallucinationMetric(threshold=0.5, include_reason=True, model="gpt-4o")
ragas_answer_relevancy = RAGASAnswerRelevancyMetric(threshold=0.5, embeddings=OpenAIEmbeddings(), model="gpt-4o")

# Score every test case against all seven metrics.
eval_results = evaluate(test_cases=eval_dataset.test_cases,
                        metrics=[contextual_precision, contextual_recall, contextual_relevancy,
                                 answer_relevancy, ragas_answer_relevancy, faithfulness, hallucination])

Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating 10 test case(s) in parallel: |          |  0% (0/10) [Time Taken: 00:00, ?test case/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

None


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

None


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

None


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

None


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

None


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

None


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

None


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

None


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

None


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

None


Evaluating 10 test case(s) in parallel: |██████████|100% (10/10) [Time Taken: 00:49,  4.95s/test case]



Metrics Summary

  - ✅ Contextual Precision (score: 1.0, threshold: 0.5, strict: False, evaluation model: gpt-4o, reason: The score is 1.00 because the relevant node is perfectly ranked at the top, providing pertinent information on how 'Deep learning is a subset of machine learning utilizing neural networks with many layers. It excels in complex tasks like image and speech recognition.' Kudos for getting it right!, error: None)
  - ✅ Contextual Recall (score: 1.0, threshold: 0.5, strict: False, evaluation model: gpt-4o, reason: The score is 1.00 because every sentence in the expected output is perfectly supported by the information in the nodes in the retrieval context. Great job capturing all the details accurately!, error: None)
  - ✅ Contextual Relevancy (score: 0.5, threshold: 0.5, strict: False, evaluation model: gpt-4o, reason: The score is 0.50 because while the retrieval context briefly mentions neural networks and their effectiveness in image and speech recognition, most of




In [None]:
eval_results.test_results[0]

TestResult(success=False, metrics_data=[MetricData(name='Contextual Precision', threshold=0.5, success=True, score=1.0, reason="The score is 1.00 because the relevant node is perfectly ranked at the top, providing pertinent information on how 'Deep learning is a subset of machine learning utilizing neural networks with many layers. It excels in complex tasks like image and speech recognition.' Kudos for getting it right!", strict_mode=False, evaluation_model='gpt-4o', error=None, evaluation_cost=0.0046825, verbose_logs='Verdicts:\n[\n    {\n        "verdict": "yes",\n        "reason": "The context mentions \'Deep learning is a subset of machine learning utilizing neural networks with many layers. It excels in complex tasks like image and speech recognition.\' This is relevant as it addresses the effectiveness of neural network architectures in enhancing deep learning for these tasks."\n    },\n    {\n        "verdict": "no",\n        "reason": "The context discusses \'NLP as a branch o

In [None]:
# Flatten every test result into one dict (one future DataFrame row).
# Metric columns are grouped by kind: all *_Score keys first, then all
# *_Success keys, then all *_Reason keys.
eval_metrics = []
for result in eval_results.test_results:
    row = {
        'Input': result.input,
        'Expected Output': result.expected_output,
        'Actual Output': result.actual_output,
        'Context': result.context,
        'Retrieval Context': result.retrieval_context,
        'Success': result.success,
    }
    metrics = result.metrics_data
    for suffix, attr in (('_Score', 'score'), ('_Success', 'success'), ('_Reason', 'reason')):
        row.update({metric.name + suffix: getattr(metric, attr) for metric in metrics})
    eval_metrics.append(row)

In [None]:
eval_metrics[0]

{'Input': 'In what ways do neural network architectures improve the effectiveness of deep learning in image and speech recognition tasks?',
 'Expected Output': 'Neural network architectures, such as convolutional neural networks (CNNs) and recurrent neural networks (RNNs), enhance the effectiveness of deep learning in image and speech recognition by efficiently processing data with multiple layers. CNNs are particularly adept at handling spatial hierarchies in images, allowing for the automatic detection of features like edges and textures. RNNs, on the other hand, excel in processing sequential data, making them suitable for tasks involving speech recognition, where temporal patterns are crucial. Together, these architectures improve accuracy and efficiency in recognizing complex patterns in images and speech.',
 'Actual Output': "I don't know.",
 'Context': ['Deep learning is a subset of machine learning utilizing neural networks with many layers. It excels in complex tasks like imag

In [None]:
import pandas as pd

eval_results_df = pd.DataFrame(eval_metrics)
eval_results_df.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Input,In what ways do neural network architectures i...,"How do tokenization, stemming, and sentiment a...",What are the architectural achievements and my...,"How does modern biology, particularly in the a...",What are the implications of wave-particle dua...,In what ways do solar and wind energy contribu...,How is glucose and oxygen produced through pho...,In what ways does cryptography contribute to t...,How can machine learning algorithms use data p...,What are some AI applications that demonstrate...
Expected Output,"Neural network architectures, such as convolut...",Tokenization breaks text into manageable units...,The architectural achievements of the pyramids...,"Modern biology, through cell and DNA studies, ...",Wave-particle duality implies that particles a...,Solar and wind energy contribute to reducing e...,"Photosynthesis converts sunlight into energy, ...",Cryptography ensures the security of peer-to-p...,Machine learning algorithms use data patterns ...,AI applications that demonstrate the use of ma...
Actual Output,I don't know.,I don't know.,The architectural achievements of the pyramids...,"Modern biology, particularly in the areas of c...",The implications of wave-particle duality in q...,Solar and wind energy contribute to reducing e...,Photosynthesis produces glucose by converting ...,Cryptography contributes to the security of pe...,Machine learning algorithms use data patterns ...,Some AI applications that demonstrate the use ...
Context,[Deep learning is a subset of machine learning...,[NLP is a branch of AI that enables computers ...,"[Pyramids are ancient structures, often servin...","[Biology is the study of living organisms, cov...",[Quantum mechanics is a branch of physics that...,"[Renewable energy sources, such as solar and w...",[Photosynthesis is the process plants use to c...,[Cryptocurrency is a digital currency that use...,[Machine learning is a field of artificial int...,[Artificial intelligence refers to machines mi...
Retrieval Context,[Deep learning is a subset of machine learning...,[NLP is a branch of AI that enables computers ...,"[Pyramids are ancient structures, often servin...","[Biology is the study of living organisms, cov...",[Quantum mechanics is a branch of physics that...,"[Renewable energy sources, such as solar and w...",[Photosynthesis is the process plants use to c...,[Cryptocurrency is a digital currency that use...,[Machine learning is a field of artificial int...,[Artificial intelligence refers to machines mi...
Success,False,False,False,True,True,True,True,True,True,True
Contextual Precision_Score,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
Contextual Recall_Score,1.0,1.0,0.333333,0.666667,0.5,1.0,0.5,0.5,1.0,1.0
Contextual Relevancy_Score,0.5,0.5,1.0,1.0,0.666667,0.6,1.0,0.666667,0.666667,1.0
Answer Relevancy_Score,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [None]:
eval_results_df[['Contextual Precision_Score', 'Contextual Recall_Score', 'Contextual Relevancy_Score',
                 'Answer Relevancy_Score', 'Answer Relevancy (ragas)_Score',
                 'Faithfulness_Score', 'Hallucination_Score']].describe()

Unnamed: 0,Contextual Precision_Score,Contextual Recall_Score,Contextual Relevancy_Score,Answer Relevancy_Score,Answer Relevancy (ragas)_Score,Faithfulness_Score,Hallucination_Score
count,10.0,10.0,10.0,10.0,10.0,10.0,10.0
mean,1.0,0.75,0.76,1.0,0.781898,1.0,0.2
std,0.0,0.274986,0.215338,0.0,0.412537,0.0,0.421637
min,1.0,0.333333,0.5,1.0,0.0,1.0,0.0
25%,1.0,0.5,0.616667,1.0,0.943923,1.0,0.0
50%,1.0,0.833333,0.666667,1.0,0.972235,1.0,0.0
75%,1.0,1.0,1.0,1.0,0.991908,1.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0
