In [1]:
import numpy as np
import pandas as pd
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the neural-bridge RAG dataset. The cell output below shows it is a
# DatasetDict with `train` and `test` splits, each having the columns
# `context`, `question` and `answer`.
ds = load_dataset("neural-bridge/rag-dataset-12000")

In [3]:
ds 

DatasetDict({
    train: Dataset({
        features: ['context', 'question', 'answer'],
        num_rows: 9600
    })
    test: Dataset({
        features: ['context', 'question', 'answer'],
        num_rows: 2400
    })
})

In [9]:
# Number of rows in the training split. Asking the split for its row count
# avoids fetching the whole `context` text column just to measure its length.
ds["train"].num_rows

9600

# [RAG (Retrieval-Augmented Generation) metrics](https://medium.com/@med.el.harchaoui/rag-evaluation-metrics-explained-a-complete-guide-dbd7a3b571a8)

![RAG_scheme](https://miro.medium.com/v2/resize:fit:1400/format:webp/0*hA2RpiXjL3dvm--v.png)

### For now we will concentrate on evaluating the Generation part

## Faithfulness

This metric measures how faithful the LLM's answer is to the provided context, i.e. whether it respects what was given as input. An answer is considered faithful if the claims it makes can be inferred from the provided context. To calculate it, we first extract all claims from the LLM's answer; then, for each claim, we check whether it can be inferred from the retrieved context. The score is the fraction of claims that are supported by the context. Its value ranges from 0 to 1; higher is better.

![Faithfulness](https://miro.medium.com/v2/resize:fit:1400/format:webp/0*g8cCq5m5Fz2XOPBy.png)

## Answer Relevance

This metric measures the quality of the generated answer given the user query: how pertinent the answer is with respect to the user's question. To assess this, we need to know whether the answer is complete and whether it contains redundant information.

To calculate this metric, we generate N questions based on the answer. If the provided answer is relevant to the original question, those generated questions should be similar to the original question; if not, they will be different. To compare the N generated questions with the original question, we use a vector similarity measure such as cosine similarity or the dot product. The value should range between 0 and 1.
The formula for determining answer relevance is as follows:

![Answer Relevance](https://miro.medium.com/v2/resize:fit:1400/format:webp/0*QffodGkNYSRzcH52.png)

In [16]:
import requests
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter

In [15]:
import os

# Never paste the OpenAI key into the notebook: read it from the environment.
# Export OPENAI_API_KEY in the shell before starting the kernel.
# NOTE: any key that was ever saved in a notebook file should be treated as
# leaked and rotated in the provider's dashboard.
OPEN_API_KEY = os.environ.get("OPENAI_API_KEY")

In [32]:
import os

from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings

# Read the key from the environment in this cell, so that it runs on a fresh
# kernel without relying on a variable assigned somewhere else in the notebook
# and without a secret being stored in the file.
OPEN_API_KEY = os.environ.get("OPENAI_API_KEY")

# OpenAI-backed evaluator LLM and embeddings, wrapped for use by ragas metrics.
evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-3.5-turbo", api_key=OPEN_API_KEY))
evaluator_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings(api_key=OPEN_API_KEY))

In [4]:
from ragas import SingleTurnSample
from ragas.metrics import BleuScore

# Reference-based check: score the generated summary (`response`) against a
# hand-written target summary (`reference`). BleuScore is built without an LLM.
test_data = SingleTurnSample(
    user_input="summarise given text\nThe company reported an 8% rise in Q3 2024, driven by strong performance in the Asian market. Sales in this region have significantly contributed to the overall growth. Analysts attribute this success to strategic marketing and product localization. The positive trend in the Asian market is expected to continue into the next quarter.",
    response="The company experienced an 8% increase in Q3 2024, largely due to effective marketing strategies and product adaptation, with expectations of continued growth in the coming quarter.",
    reference="The company reported an 8% growth in Q3 2024, primarily driven by strong sales in the Asian market, attributed to strategic marketing and localized products, with continued growth anticipated in the next quarter.",
)
metric = BleuScore()

# Last expression of the cell: the score is shown as the cell output.
metric.single_turn_score(test_data)


0.13718598426177148

In [11]:
import os

# Settings passed to the Gemini chat model when the evaluator LLM is built.
config = {
    "model": "gemini-1.5-flash",  # or other model IDs
    "temperature": 0.4,
    "max_tokens": None,
    "top_p": 0.8,
    # For Vertex AI only:
    #"project": "your-project-id",  # Required for Vertex AI
    #"location": "us-central1",     # Required for Vertex AI
}

# Never hardcode the Google AI Studio key in the notebook: read it from the
# environment (export GOOGLE_API_KEY before starting the kernel).
# NOTE: any key that was ever saved in a notebook file should be treated as
# leaked and rotated in the provider's console.
Gemini_OPEN_API_KEY = os.environ.get("GOOGLE_API_KEY")

In [13]:
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper

# Choose the appropriate import based on your API:
from langchain_google_genai import ChatGoogleGenerativeAI

# Build the Gemini chat model (Google AI Studio) from the `config` values,
# then wrap it so ragas metrics can use it as their evaluator LLM.
gemini_chat_model = ChatGoogleGenerativeAI(
    api_key=Gemini_OPEN_API_KEY,
    model=config["model"],
    temperature=config["temperature"],
    top_p=config["top_p"],
    max_tokens=config["max_tokens"],
)
evaluator_llm = LangchainLLMWrapper(gemini_chat_model)

In [17]:
# Google AI Studio Embeddings
from langchain_google_genai import GoogleGenerativeAIEmbeddings

# Create the Gemini embedding model first, then wrap it for ragas.
gemini_embedding_model = GoogleGenerativeAIEmbeddings(
    model="models/embedding-001",  # Google's text embedding model
    task_type="retrieval_document",  # Optional: specify the task type
    google_api_key=Gemini_OPEN_API_KEY,
)
evaluator_embeddings = LangchainEmbeddingsWrapper(gemini_embedding_model)

In [18]:
from ragas import SingleTurnSample
from ragas.metrics import AspectCritic

test_data = {
    "user_input": "summarise given text\nThe company reported an 8% rise in Q3 2024, driven by strong performance in the Asian market. Sales in this region have significantly contributed to the overall growth. Analysts attribute this success to strategic marketing and product localization. The positive trend in the Asian market is expected to continue into the next quarter.",
    "response": "The company experienced an 8% increase in Q3 2024, largely due to effective marketing strategies and product adaptation, with expectations of continued growth in the coming quarter.",
}

metric = AspectCritic(name="summary_accuracy",llm=evaluator_llm, definition="Verify if the summary is accurate.")
test_data = SingleTurnSample(**test_data)
await metric.single_turn_ascore(test_data)

1