In [1]:
import numpy as np
import pandas as pd
import os 

import warnings

from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
ds = load_dataset("neural-bridge/rag-dataset-12000")

In [3]:
ds

DatasetDict({
    train: Dataset({
        features: ['context', 'question', 'answer'],
        num_rows: 9600
    })
    test: Dataset({
        features: ['context', 'question', 'answer'],
        num_rows: 2400
    })
})

In [4]:
def convert_dataset_to_list_format(dataset_dict):
    train_ds_, test_ds_ = [], []
    
    # Process train split
    for sample in dataset_dict['train']:
        if sample['answer'] is not None:
            train_ds_.append({
                "user_input": sample['question'],
                "response": sample['answer'],
                "reference": sample['answer'][:len(sample['answer'])-5],  # Using answer as reference for now
                "retrieved_contexts": [sample['context']],
                "reference_contexts": [sample['context']]
            })
    
    # Process test split
    for sample in dataset_dict['test']:
        if sample['answer'] is not None:
            test_ds_.append({
                "user_input": sample['question'],
                "response": sample['answer'],
                "reference": sample['answer'][:len(sample['answer'])-5],  # Using answer as reference for now
                "retrieved_contexts": [sample['context']],
                "reference_contexts": [sample['context']]
            })
    
    return train_ds_, test_ds_

In [5]:
# Convert the dataset
train_ds, test_ds = convert_dataset_to_list_format(ds)

In [6]:
len(test_ds) 

2399

# [RAG (Retrieval-Augmented Generation) metrics (LLM based)](https://medium.com/@med.el.harchaoui/rag-evaluation-metrics-explained-a-complete-guide-dbd7a3b571a8)

![RAG_scheme](https://miro.medium.com/v2/resize:fit:1400/format:webp/0*hA2RpiXjL3dvm--v.png)

### For now we will more concentrate on evaluation of Generation part

## Faithfulness

This metric measure how the LLM answer is faithful to the provided context, does it respect what was given as input or not. Its considered as faithful if the claims made in the answer can be extracted from the provided context. To calculate it, we start by extracting all claims from the LLM provided answer first. Then for each claim we check if this one claim can be inferred from the retrieved context. It value range from 0 to 1. Higher is better.

![Faithulness](https://miro.medium.com/v2/resize:fit:1400/format:webp/0*g8cCq5m5Fz2XOPBy.png)

## Answer Relevance

This metric measure the quality of the generated answer given the user query, how pertinent is the answer with respect the the user question. To assess this we need to know if the answer is complete or not, does it contain redundant information ?

To calculate this metric, we generate N question based on the answer, does questions should be normally similar the the original question if the provided answer is relevant to the original question, if not they will be different. To compare the N generated question, we use cosine or dot product vector similarity operators. The value should range between 0 and 1.
The formula for determining answer relevance is as follows:

![Answer Relevance](https://miro.medium.com/v2/resize:fit:1400/format:webp/0*QffodGkNYSRzcH52.png)

# Traditional NLP metrics

## String simillarity 

Metric measures the similarity between the reference and the response using traditional string distance measures such as Levenshtein, Hamming, and [Jaro](https://srinivas-kulkarni.medium.com/jaro-winkler-vs-levenshtein-distance-2eab21832fd6)

## [BLEU](https://medium.com/nlplanet/two-minutes-nlp-learn-the-bleu-metric-by-examples-df015ca73a86)(Bilingual Evaluation Understudy)



In [7]:
from ragas import SingleTurnSample, EvaluationDataset, evaluate

#Traditional metrics
from ragas.metrics._string import NonLLMStringSimilarity, DistanceMeasure
from ragas.metrics import (
    answer_relevancy,
    faithfulness,
    context_recall,
    context_precision,
    
)
from ragas.metrics import BleuScore, RougeScore

# LLM metrics
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.metrics import Faithfulness, FaithfulnesswithHHEM, ResponseRelevancy

# Choose the appropriate import based on your API:
from langchain_community.chat_models import ChatOllama
from ragas import evaluate
from langchain_community.embeddings import OllamaEmbeddings

In [8]:
# to uload Dashboard on ragas website
os.environ["RAGAS_APP_TOKEN"] = "apt.4955-d3328fbcd0ba-45a7-8140-094d1ba2-fb3b7"

In [9]:
eval_ds = EvaluationDataset.from_list(test_ds[:4])

In [10]:
langchain_llm = ChatOllama(model="llama3.2:1b")
langchain_embeddings = OllamaEmbeddings(model="llama3.2:1b")

  langchain_llm = ChatOllama(model="llama3.2:1b")
  langchain_embeddings = OllamaEmbeddings(model="llama3.2:1b")


In [11]:
result = evaluate(
  eval_ds,
  metrics=[
    # BleuScore(),
    # RougeScore(),
    # NonLLMStringSimilarity(distance_measure=DistanceMeasure.LEVENSHTEIN),
    #ResponseRelevancy(),
    # context_precision,
    faithfulness,
    # answer_relevancy,
    # context_recall
  ], 
    llm=langchain_llm,
    embeddings=langchain_embeddings)

Evaluating:  50%|████████████████▌                | 2/4 [00:23<00:22, 11.48s/it]Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt statement_generator_prompt failed to parse output: The output parser failed to parse the output including retries.
Exception raised in Job[3]: RagasOutputParserException(The output parser failed to parse the output including retries.)
Evaluating:  75%|████████████████████████▊        | 3/4 [00:29<00:09,  9.18s/it]Exception raised in Job[2]: TimeoutError()
Evaluating: 100%|█████████████████████████████████| 4/4 [03:00<00:00, 45.00s/it]


In [12]:
result

{'faithfulness': 0.2500}

In [13]:
# Initialize with Google AI Studio
Gemini_OPEN_API_KEY = "AIzaSyAej7Q9cR5zcFginFZ16o2LBf14mxhG4Ok"

config = {
    "model": "gemini-1.5-flash",  # or other model IDs
    "temperature": 0.4,
    "max_tokens": None,
    "top_p": 0.8,
}

evaluator_llm = LangchainLLMWrapper(ChatGoogleGenerativeAI(
    model=config["model"],
    api_key = Gemini_OPEN_API_KEY,
    temperature=config["temperature"],
    max_tokens=config["max_tokens"],
    top_p=config["top_p"],
))

In [7]:
config = {
    "credentials_profile_name": "your-profile-name",  # E.g "default"
    "region_name": "your-region-name",  # E.g. "us-east-1"
    "llm": "your-llm-model-id",  # E.g "anthropic.claude-3-5-sonnet-20241022-v2:0"
    "embeddings": "your-embedding-model-id",  # E.g "amazon.titan-embed-text-v2:0"
    "temperature": 0.4,
}

In [11]:
len(eval_ds) 

2399

In [None]:
warnings.filterwarnings('ignore')

results = evaluate(dataset=eval_ds, metrics=[
    BleuScore(),
    RougeScore(),
    NonLLMStringSimilarity(distance_measure=DistanceMeasure.LEVENSHTEIN),
    #FaithfulnesswithHHEM(llm=evaluator_llm)
    Faithfulness(llm=evaluator_llm),
]
)

Evaluating:   0%|                            | 1/9596 [00:01<4:08:33,  1.55s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-

In [None]:
results

In [97]:
scorer = Faithfulness(llm=evaluator_llm)
await scorer.single_turn_ascore(eval_ds[0])

1.0

In [11]:
# Google AI Studio Embeddings
from langchain_google_genai import GoogleGenerativeAIEmbeddings

evaluator_embeddings = LangchainEmbeddingsWrapper(GoogleGenerativeAIEmbeddings(
    google_api_key = Gemini_OPEN_API_KEY,
    model="models/embedding-001",  # Google's text embedding model
    task_type="retrieval_document"  # Optional: specify the task type
))

In [12]:
from ragas import SingleTurnSample
from ragas.metrics import AspectCritic

test_data = {
    "user_input": "summarise given text\nThe company reported an 8% rise in Q3 2024, driven by strong performance in the Asian market. Sales in this region have significantly contributed to the overall growth. Analysts attribute this success to strategic marketing and product localization. The positive trend in the Asian market is expected to continue into the next quarter.",
    "response": "The company experienced an 8% increase in Q3 2024, largely due to effective marketing strategies and product adaptation, with expectations of continued growth in the coming quarter.",
}

metric = AspectCritic(name="summary_accuracy", llm=evaluator_llm, definition="Verify if the summary is accurate.")
test_data = SingleTurnSample(**test_data)
await metric.single_turn_ascore(test_data)

1

In [15]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained("microsoft/phi-2")
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2")

def evaluate_rag_result(query, context, answer):
    inputs = tokenizer(f"{query}\nContext: {context}\nAnswer: {answer}", return_tensors="pt")
    scores = model(**inputs).logits
    return scores.mean()

Fetching 2 files: 100%|██████████████████████████| 2/2 [28:47<00:00, 863.79s/it]
Loading checkpoint shards: 100%|██████████████████| 2/2 [00:12<00:00,  6.33s/it]


In [18]:
evaluate_rag_result(test_ds[0]["user_input"], 
                    test_ds[0]["retrieved_contexts"],
                    test_ds[0]["response"],
                    )

tensor(-0.0366, grad_fn=<MeanBackward0>)

In [19]:
from transformers import pipeline 

In [20]:
from langchain.llms import HuggingFacePipeline
from transformers import TextGenerationPipeline

# Create the generation pipeline
generation_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=512,
    temperature=0.4,
    top_p=0.8,
    do_sample=True,
    return_full_text=False
)

# Wrap in LangChain's LLM interface
langchain_llm = HuggingFacePipeline(pipeline=generation_pipeline)


Device set to use mps:0


In [21]:
from ragas.llms import LangchainLLMWrapper

evaluator_llm = LangchainLLMWrapper(langchain_llm)
