# RAGBench Dataset Evaluation - Capstone

In [None]:
!pip install llama-index
!pip install openai
!pip install faiss-cpu --quiet
!pip install llama-index-vector-stores-faiss
!pip install llama-index-llms-groq
!pip install llama-index-llms-openai
!pip install llama-index-embeddings-huggingface
!pip install datasets -U
!pip install llama-index-postprocessor-cohere-rerank
!pip install qdrant-client
!pip install "llama-index-vector-stores-qdrant"
!pip install json5
!pip install gradio json5
!pip install llama-index-embeddings-langchain
!pip install langchain-experimental
!pip install langchain-openai
!pip install rank_bm25
!pip install llama-index-retrievers-bm25
!pip install langchain-huggingface
!pip install voyageai langchain-voyageai
!pip install python-dotenv

In [3]:
from datasets import load_dataset
import faiss
from llama_index.core import Document, VectorStoreIndex, StorageContext
from llama_index.vector_stores.faiss import FaissVectorStore
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.node_parser import SentenceSplitter
from sklearn.metrics import mean_squared_error,roc_auc_score
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.postprocessor.cohere_rerank import CohereRerank
from typing import List
# Import Qdrant client
from qdrant_client import QdrantClient, models
from llama_index.core.node_parser import SentenceSplitter
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai import OpenAIEmbeddings # Or any other embedding model
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.documents import Document as LangchainDocument
from llama_index.core.schema import TextNode
from llama_index.vector_stores.qdrant import QdrantVectorStore
from dotenv import load_dotenv
import os

# Load secrets from sample_data/.env into the process environment.
dotenv_path = os.path.join('sample_data', '.env')
load_dotenv(dotenv_path)

# SECURITY: API keys must never be written into the notebook. Put QDRANT_API_KEY,
# OPENAI_API_KEY and GROQ_API_KEY in the .env file above (or export them in the shell).
# The keys that were previously hardcoded in this cell must be treated as leaked
# and rotated with the respective providers.

# Access the environment variables.
# The OpenAI key is published as OPENAI_API_KEY; the legacy OPENAI_KEY name is still
# accepted as a fallback so existing .env files keep working.
openai_key = os.getenv("OPENAI_API_KEY") or os.getenv("OPENAI_KEY")
groq_api_key = os.getenv("GROQ_API_KEY")
qdrant_api_key = os.getenv("QDRANT_API_KEY")

# Warn early (without crashing) so a missing key is noticed before the first API call.
_missing_api_keys = [
    key_name
    for key_name, key_value in (
        ("OPENAI_API_KEY", openai_key),
        ("GROQ_API_KEY", groq_api_key),
        ("QDRANT_API_KEY", qdrant_api_key),
    )
    if not key_value
]
if _missing_api_keys:
    print(f"Warning: missing API keys in environment/.env: {', '.join(_missing_api_keys)}")

In [4]:
from datasets import load_dataset, get_dataset_config_names
def print_ragbench_domains():
    """
    Print and return the configuration names of the 'rungalileo/ragbench' dataset.

    Each configuration name is treated as one domain (subset). If anything goes
    wrong while fetching or printing them, an error message is printed and an
    empty list is returned instead of raising.
    """
    try:
        # The dataset's configuration names are what this notebook calls "domains".
        domain_names = get_dataset_config_names("rungalileo/ragbench")
        bullet_lines = (f"- {name}" for name in domain_names)
        print(
            "Available domains (subsets) in 'rungalileo/ragbench' dataset:",
            *bullet_lines,
            sep="\n",
        )
        return domain_names
    except Exception as e:
        for message in (
            f"Error retrieving RAGBench domains: {e}",
            "Please ensure you have an active internet connection.",
        ):
            print(message)
        return []

print_ragbench_domains()

README.md: 0.00B [00:00, ?B/s]

Available domains (subsets) in 'rungalileo/ragbench' dataset:
- covidqa
- cuad
- delucionqa
- emanual
- expertqa
- finqa
- hagrid
- hotpotqa
- msmarco
- pubmedqa
- tatqa
- techqa


['covidqa',
 'cuad',
 'delucionqa',
 'emanual',
 'expertqa',
 'finqa',
 'hagrid',
 'hotpotqa',
 'msmarco',
 'pubmedqa',
 'tatqa',
 'techqa']

In [5]:
from datasets import load_dataset, concatenate_datasets
from typing import List

def load_ragbench(domains: List[str], num_documents_per_domain: int = 50):
    """Load the RAGBench ``test`` split for several domains and concatenate them.

    Args:
        domains: RAGBench configuration names to load.
        num_documents_per_domain: Upper bound on the samples kept per domain;
            only the first ``num_documents_per_domain`` rows are retained.

    Returns:
        The concatenation of all successfully loaded (and truncated) splits.

    Raises:
        ValueError: If not a single domain could be loaded.
    """

    def _load_one(domain_name):
        # A failing domain is reported and skipped (returns None) rather than aborting.
        try:
            split = load_dataset("rungalileo/ragbench", domain_name, split="test")
            if len(split) > num_documents_per_domain:
                split = split.select(range(num_documents_per_domain))
            print(f"Loaded {len(split)} samples from RAGBench '{domain_name}' domain.")
            return split
        except Exception as e:
            print(f"Error loading dataset for domain '{domain_name}': {e}")
            return None

    loaded_splits = []
    for domain_name in domains:
        split = _load_one(domain_name)
        if split is not None:
            loaded_splits.append(split)

    if len(loaded_splits) == 0:
        raise ValueError("No datasets were loaded successfully.")

    merged = concatenate_datasets(loaded_splits)
    print(f"Combined dataset contains {len(merged)} samples.")
    return merged

In [7]:
# @title RAG retriever (Qdrant-backed vector index)
from llama_index.retrievers.bm25 import BM25Retriever
from llama_index.core.retrievers import QueryFusionRetriever

class RAGRetriever:
    """Retriever over RAGBench documents stored in a Qdrant collection.

    On construction the retriever connects to Qdrant and then either rebuilds the
    collection ``ragbench_<subset>`` from the RAGBench data (``load_data=True``)
    or attaches to the already existing collection (``load_data=False``).

    Args:
        subset: Suffix of the Qdrant collection name (``ragbench_<subset>``).
        subset_input: RAGBench domains whose documents are indexed when
            ``load_data`` is true.
        embedding_model: Hugging Face model name used to embed chunks and queries.
        top_k: Number of chunks returned per query.
        load_data: When true, the collection is recreated (existing content is
            dropped) and re-indexed.
        chunking_strategy: ``"sentence_splitter"`` or ``"semantic_chunker"``.
    """

    _CHUNKING_STRATEGIES = ("sentence_splitter", "semantic_chunker")

    def __init__(self, subset="finqa", subset_input = ["finqa", "tatqa"], embedding_model="sentence-transformers/all-mpnet-base-v2", top_k=7, load_data=False, chunking_strategy="semantic_chunker"):
        self.top_k = top_k
        self.subset = subset
        self.chunking_strategy = chunking_strategy
        self.embed_model = HuggingFaceEmbedding(model_name=embedding_model)
        # Copy so the shared default list can never be mutated through an instance.
        self.subset_input = list(subset_input)
        print(f"Embedding model : {self.embed_model}")
        print(f"Top K {top_k}")
        # Probe the model once to learn the vector size needed for the collection.
        self.embedding_dim = len(self.embed_model.get_text_embedding("test"))

        # Qdrant Cloud Configuration
        self.qdrant_url = "https://9fd7db02-83d7-4d3d-adcb-c141aa17b113.eu-west-1-0.aws.cloud.qdrant.io"
        # Read the key from the environment directly instead of relying on a
        # module-level variable defined in another cell.
        self.qdrant_api_key = os.getenv("QDRANT_API_KEY")
        self.collection_name = f"ragbench_{subset}"

        self._initialize_qdrant_client()
        self.index = None
        if load_data:
            self._load_index()
        else:
            self._load_existing_index()
        # NOTE(review): `use_sparse` does not look like a recognised retriever option and
        # the vector store below is created without hybrid mode, so this probably runs a
        # dense-only search — confirm against the LlamaIndex Qdrant hybrid-search docs.
        self.retriever = self.index.as_retriever(similarity_top_k=self.top_k, use_sparse=True,  alpha=0.1)
        print("Initialized Vector Retriever.")

    def _initialize_qdrant_client(self):
        """Create the Qdrant client from the configured URL and API key."""
        self.qdrant_client = QdrantClient(
            url=self.qdrant_url,
            api_key=self.qdrant_api_key,
        )
        print("Initialized Qdrant client.")

    def _load_index(self):
        """Recreate the collection, chunk the RAGBench documents and index them.

        Raises:
            ValueError: If ``chunking_strategy`` is unknown. This is checked before
                the collection is touched, because recreating it drops its content.
        """
        if self.chunking_strategy not in self._CHUNKING_STRATEGIES:
            raise ValueError(f"Invalid chuking strategy: {self.chunking_strategy}")

        # Create collection (dense vectors plus a "bm25" sparse vector slot).
        try:
            self.qdrant_client.recreate_collection(
                collection_name=self.collection_name,
                vectors_config=models.VectorParams(size=self.embedding_dim, distance=models.Distance.COSINE),
                sparse_vectors_config={"bm25": models.SparseVectorParams(modifier=models.Modifier.IDF)},
            )
            print(f"Collection '{self.collection_name}' created with dimension {self.embedding_dim}.")
        except Exception as e:
            print(f"Error creating collection '{self.collection_name}': {e}")
            raise

        print("loading dataset")
        ragbench_dataset = load_ragbench(domains=self.subset_input)
        # Every sample carries a list of source documents; flatten them all.
        doc_texts = [doc_text for sample in ragbench_dataset for doc_text in sample["documents"]]

        nodes = self._chunk_documents(doc_texts)

        vector_store = QdrantVectorStore(client=self.qdrant_client, collection_name=self.collection_name)
        storage_context = StorageContext.from_defaults(vector_store=vector_store)

        print("Created VectorStoreIndex")
        self.index = VectorStoreIndex(
            nodes,
            storage_context=storage_context,
            embed_model=self.embed_model,
            show_progress=True
        )

    def _chunk_documents(self, doc_texts):
        """Split raw document texts into LlamaIndex nodes using the configured strategy."""
        if self.chunking_strategy == "sentence_splitter":
            parser = SentenceSplitter(chunk_size=1024, chunk_overlap=20)
            # SentenceSplitter is a LlamaIndex parser, so it must be fed LlamaIndex
            # Documents (not LangChain ones).
            return parser.get_nodes_from_documents([Document(text=text) for text in doc_texts])

        if self.chunking_strategy == "semantic_chunker":
            # The LangChain embedding wrapper is only needed by the semantic chunker,
            # so it is loaded here instead of unconditionally.
            embeddings = HuggingFaceEmbeddings(model_name=self.embed_model.model_name)
            print("loaded HuggingFaceEmbeddings")
            parser = SemanticChunker(embeddings=embeddings, buffer_size=5, breakpoint_threshold_amount=0.8)
            langchain_documents = [LangchainDocument(page_content=text) for text in doc_texts]
            langchain_chunks = parser.split_documents(langchain_documents)
            return [TextNode(text=chunk.page_content) for chunk in langchain_chunks]

        raise ValueError(f"Invalid chuking strategy: {self.chunking_strategy}")

    def _load_existing_index(self):
        """Attach to the already populated collection without re-indexing."""
        try:
            vector_store = QdrantVectorStore(client=self.qdrant_client, collection_name=self.collection_name)
            storage_context = StorageContext.from_defaults(vector_store=vector_store)
            self.index = VectorStoreIndex.from_vector_store(vector_store=vector_store, storage_context=storage_context, embed_model=self.embed_model)
            print(f"Loaded existing index from collection '{self.collection_name}'.")
        except Exception as e:
            print(f"Error loading existing index from collection '{self.collection_name}': {e}")
            raise

    def retrieve(self, question: str):
        """Return the text of the top-k chunks retrieved for ``question``.

        Raises:
            ValueError: If ``question`` is empty or not a string.
        """
        if not question or not isinstance(question, str):
            raise ValueError("Invalid question string.")

        retrieved_nodes = self.retriever.retrieve(question)
        return [node.text for node in retrieved_nodes]

In [8]:
from llama_index.llms.groq import Groq
from llama_index.llms.openai import OpenAI
from typing import List
class RAGGenerator:
    """Answers a question from retrieved context using a Groq or OpenAI LLM.

    ``llm="groq"`` selects the Groq client; any other value selects OpenAI.
    """

    def __init__(self, model, llm="groq", temperature = 1.0):
        print(f"RAGGenerator model : {model} with LLM: {llm} & temperature: {temperature}")
        self.model = model
        wants_groq = llm == "groq"
        self.llm = (
            Groq(api_key=groq_api_key,  model=model, temperature=temperature)
            if wants_groq
            else OpenAI(api_key=openai_key, model=model, temperature=temperature)
        )

    def generate(self, question: str, contexts: List[str]) -> str:
        """Build the context-grounded prompt and return the stripped completion text."""
        # One retrieved chunk per line inside the prompt.
        context_block = "\n".join(contexts)
        prompt = f"""
        Using ONLY the following pieces of context, answer the question.
        Carefully read the context and extract any specific details, limitations, or exceptions mentioned that are relevant to the question.
        If the context provides information that partially answers the question or mentions related concepts, include that information in your answer.
        If the complete answer is found, provide it directly.
        If the answer cannot be found in the context provided, please state clearly that you cannot answer the question with the given information.

        Context:
        {context_block}

        Question: {question}

        Answer:
        """
        completion = self.llm.complete(prompt)
        answer_text = completion.text
        return answer_text.strip()

In [9]:
from llama_index.llms.openai import OpenAI
from typing import Dict
import json
import re
class RAGJudge:
    """LLM judge that annotates a generated answer against its source documents.

    The judge prompt asks the model for a JSON trace (relevant / utilized sentence
    keys, per-sentence support information, overall support). ``llm="groq"``
    selects the Groq client; any other value selects OpenAI.
    """

    def __init__(self, model, llm="openAI", temperature = 1.0):
        print(f"RAGJudge model : {model} with LLM: {llm} & temperature: {temperature}")
        self.model = model
        if llm == "groq":
            self.llm = Groq(api_key=groq_api_key,  model=model, temperature=temperature)
        else:
            self.llm = OpenAI(api_key=openai_key, model=model, temperature=temperature)

    @staticmethod
    def _parse_embedded_json_object(text: str):
        """Try to parse the span from the first '{' to the last '}'; return None on failure."""
        start, end = text.find("{"), text.rfind("}")
        if start == -1 or end <= start:
            return None
        try:
            return json.loads(text[start:end + 1])
        except json.JSONDecodeError:
            return None

    def clean_and_load_llm_json(self, response_text_or_object) -> dict:
        """Turn an LLM reply into a dictionary.

        Accepts an already parsed ``dict``/``list`` or anything whose ``str()`` is the
        reply text. Markdown code fences are stripped; if the remaining text is not
        valid JSON, the outermost ``{...}`` span is tried so that replies with
        leading/trailing prose still parse.

        Returns:
            The parsed dictionary (a one-element list holding a dictionary is unwrapped).

        Raises:
            ValueError: If no JSON can be decoded, or the decoded value is neither a
                dictionary nor a list containing exactly one dictionary.
        """
        if isinstance(response_text_or_object, (dict, list)):
            parsed_json_initial = response_text_or_object
        else:
            cleaned_response_text = str(response_text_or_object).strip()

            # Prefer an explicit ```json fence, then fall back to a bare ``` fence.
            match = re.search(r'```json\s*(.*?)\s*```', cleaned_response_text, re.DOTALL)
            if not match:
                match = re.search(r'```(?!\S)(.*?)\s*```', cleaned_response_text, re.DOTALL)

            json_candidate_str = match.group(1).strip() if match else cleaned_response_text

            try:
                parsed_json_initial = json.loads(json_candidate_str)
            except json.JSONDecodeError as e:
                # Models sometimes wrap the JSON in extra prose; salvage the object if possible.
                parsed_json_initial = self._parse_embedded_json_object(json_candidate_str)
                if parsed_json_initial is None:
                    raise ValueError(
                        f"Failed to decode JSON from LLM response. "
                        f"Check for malformed JSON or extra text. Error: {e}\n"
                        f"Attempted to parse: '{json_candidate_str}'\n"
                        f"Original response: '{response_text_or_object}'"
                    ) from e

        if isinstance(parsed_json_initial, list) and len(parsed_json_initial) == 1 and isinstance(parsed_json_initial[0], dict):
            return parsed_json_initial[0]
        elif isinstance(parsed_json_initial, dict):
            return parsed_json_initial
        else:
            raise ValueError(
                f"LLM response parsed to an unexpected JSON type: {type(parsed_json_initial).__name__}. "
                "Expected a dictionary or a list containing a single dictionary."
            )

    @staticmethod
    def _split_sentences(text: str) -> List[str]:
        """Split text after sentence-ending punctuation or at line breaks; drop empty pieces."""
        pieces = re.split(r'(?<=[.!?])\s+|\s*\n+\s*', str(text).strip())
        return [piece for piece in pieces if piece]

    @staticmethod
    def _alpha_key(index: int) -> str:
        """Map 0, 1, ..., 25, 26, ... to 'a', 'b', ..., 'z', 'aa', ..."""
        key = ""
        remaining = index + 1
        while remaining > 0:
            remaining, offset = divmod(remaining - 1, 26)
            key = chr(ord("a") + offset) + key
        return key

    def _key_documents(self, contexts: List[str]) -> str:
        """Prefix every document sentence with '<doc index><letter>.' (e.g. '0a.')."""
        keyed_lines = []
        for doc_index, context in enumerate(contexts):
            for sentence_index, sentence in enumerate(self._split_sentences(context)):
                keyed_lines.append(f"{doc_index}{self._alpha_key(sentence_index)}. {sentence}")
        return "\n".join(keyed_lines)

    def _key_response(self, predicted_answer: str) -> str:
        """Prefix every response sentence with '<letter>.' (e.g. 'a.')."""
        return "\n".join(
            f"{self._alpha_key(sentence_index)}. {sentence}"
            for sentence_index, sentence in enumerate(self._split_sentences(predicted_answer))
        )

    def judge(self, question: str, predicted_answer: str, contexts: List[str]) -> Dict:
        """Ask the judge LLM to annotate ``predicted_answer`` and return the parsed trace.

        The prompt tells the model that document and response sentences carry keys
        such as '0a.' and 'a.', so both are split into sentences and keyed here
        before being inserted into the prompt.

        Raises:
            ValueError: If the model reply cannot be parsed into a dictionary.
        """
        keyed_documents = self._key_documents(contexts)
        keyed_response = self._key_response(predicted_answer)
        prompt = f"""
I asked someone to answer a question based on one or more documents.
Your task is to review their response and assess whether or not each sentence
in that response is supported by text in the documents. And if so, which
sentences in the documents provide that support. You will also tell me which
of the documents contain useful information for answering the question, and
which of the documents the answer was sourced from.
Here are the documents, each of which is split into sentences. Alongside each
sentence is associated key, such as '0a.' or '0b.' that you can use to refer
to it:
'''
{keyed_documents}
'''
The question was:
'''
{question}
'''
Here is their response, split into sentences. Alongside each sentence is
associated key, such as 'a.' or 'b.' that you can use to refer to it. Note
that these keys are unique to the response, and are not related to the keys
in the documents:
'''
{keyed_response}
'''
You must respond with a JSON object matching this schema:
'''
{{
"relevance_explanation": string,
"all_relevant_sentence_keys": [string],
"overall_supported_explanation": string,
"overall_supported": boolean,
"sentence_support_information": [
{{
"response_sentence_key": string,
"explanation": string,
"supporting_sentence_keys": [string],
"fully_supported": boolean
}},
],
"all_utilized_sentence_keys": [string]
}}
'''
The relevance_explanation field is a string explaining which documents
contain useful information for answering the question. Provide a step-by-step
breakdown of information provided in the documents and how it is useful for
answering the question.
The all_relevant_sentence_keys field is a list of all document sentences keys
(e.g. ’0a’) that are relevant to the question. Include every sentence that is
useful and relevant to the question, even if it was not used in the response,
or if only parts of the sentence are useful. Ignore the provided response when
making this judgement and base your judgement solely on the provided documents
and question. Omit sentences that, if removed from the document, would not
impact someone’s ability to answer the question.
The overall_supported_explanation field is a string explaining why the response
*as a whole* is or is not supported by the documents. In this field, provide a
step-by-step breakdown of the claims made in the response and the support (or
lack thereof) for those claims in the documents. Begin by assessing each claim
separately, one by one; don’t make any remarks about the response as a whole
until you have assessed all the claims in isolation.
The overall_supported field is a boolean indicating whether the response as a
whole is supported by the documents. This value should reflect the conclusion
you drew at the end of your step-by-step breakdown in overall_supported_explanation.
In the sentence_support_information field, provide information about the support
*for each sentence* in the response.
The sentence_support_information field is a list of objects, one for each sentence
in the response. Each object MUST have the following fields:
- response_sentence_key: a string identifying the sentence in the response.
This key is the same as the one used in the response above.
- explanation: a string explaining why the sentence is or is not supported by the
documents.
- supporting_sentence_keys: keys (e.g. ’0a’) of sentences from the documents that
support the response sentence. If the sentence is not supported, this list MUST
be empty. If the sentence is supported, this list MUST contain one or more keys.
In special cases where the sentence is supported, but not by any specific sentence,
you can use the string "supported_without_sentence" to indicate that the sentence
is generally supported by the documents. Consider cases where the sentence is
expressing inability to answer the question due to lack of relevant information in
the provided contex as "supported_without_sentence". In cases where the sentence
is making a general statement (e.g. outlining the steps to produce an answer, or
summarizing previously stated sentences, or a transition sentence), use the
sting "general".In cases where the sentence is correctly stating a well-known fact,
like a mathematical formula, use the string "well_known_fact". In cases where the
sentence is performing numerical reasoning (e.g. addition, multiplication), use
the string "numerical_reasoning".
- fully_supported: a boolean indicating whether the sentence is fully supported by
the documents.
- This value should reflect the conclusion you drew at the end of your step-by-step
breakdown in explanation.
- If supporting_sentence_keys is an empty list, then fully_supported must be false.
- Otherwise, use fully_supported to clarify whether everything in the response
sentence is fully supported by the document text indicated in supporting_sentence_keys
(fully_supported = true), or whether the sentence is only partially or incompletely
supported by that document text (fully_supported = false).
The all_utilized_sentence_keys field is a list of all sentences keys (e.g. ’0a’) that
were used to construct the answer. Include every sentence that either directly supported
the answer, or was implicitly used to construct the answer, even if it was not used
in its entirety. Omit sentences that were not used, and could have been removed from
the documents without affecting the answer.
You must respond with a valid JSON string. Use escapes for quotes, e.g. ‘\\"‘, and
newlines, e.g. ‘\\n‘. Do not write anything before or after the JSON string. Do not
wrap the JSON string in backticks like ‘‘‘ or ‘‘‘json.
As a reminder: your task is to review the response and assess which documents contain
useful information pertaining to the question, and how each sentence in the response
is supported by the text in the documents.
"""
        response = self.llm.complete(prompt)
        # str(response) is used by the parser, so the raw completion object can be passed on.
        return self.clean_and_load_llm_json(response)

In [10]:
from llama_index.llms.openai import OpenAI
from typing import Dict
import json
import re
from sklearn.metrics import mean_squared_error, roc_auc_score
import numpy as np

class RAGEvaluator:
    """Runs retrieve -> generate -> judge per sample and scores the judge's trace.

    Predicted metrics are derived from the judge trace and compared against the
    annotation columns of the sample. RMSE is computed over all metric pairs
    pooled together; AUC-ROC is computed on adherence only.

    NOTE(review): relevance is computed as (relevant sentence keys) / (retrieved
    chunks) and its ground truth is read from ``ragas_context_relevance`` — confirm
    that both choices match the intended metric definitions.
    """

    def __init__(self, retriever: "RAGRetriever", generator: "RAGGenerator", judge: "RAGJudge"):
        self.retriever = retriever
        self.generator = generator
        self.judge = judge

    def _compute_completeness(self, trace: dict) -> float:
        """Fraction of response sentences the judge marked ``fully_supported``."""
        # `or []` also covers a trace that contains the key with a null value.
        support_info = trace.get("sentence_support_information") or []
        if not support_info:
            return 0.0
        fully_supported = sum(
            1 for s in support_info if isinstance(s, dict) and s.get("fully_supported", False)
        )
        return fully_supported / len(support_info)

    @staticmethod
    def _is_usable_score(value) -> bool:
        """True when ``value`` is a number that is neither missing (None) nor NaN."""
        if value is None:
            return False
        try:
            return not np.isnan(value)
        except TypeError:
            return False

    def evaluate_sample(self, sample: dict):
        """Evaluate one sample.

        Returns:
            ``(predicted, ground_truth, trace)`` where the first two are dictionaries
            with the keys ``relevance``, ``utilization``, ``completeness`` and
            ``adherence``, and ``trace`` is the judge's parsed output.
        """
        question = sample["question"]
        original_answer = sample["response"]
        contexts = self.retriever.retrieve(question)
        predicted_answer = self.generator.generate(question, contexts)
        trace = self.judge.judge(question, predicted_answer, contexts)

        # Compute metrics from trace output
        utilized_count = len(trace.get("all_utilized_sentence_keys") or [])
        relevant_count = len(trace.get("all_relevant_sentence_keys") or [])
        # The max(..., 1) only protects the division; it must not inflate relevance.
        predicted_utilization = utilized_count / max(relevant_count, 1)
        # No retrieved context means nothing relevant was retrieved.
        predicted_relevance = relevant_count / len(contexts) if contexts else 0.0

        predicted_completeness = self._compute_completeness(trace)
        predicted_adherence = 1 if trace.get("overall_supported", False) else 0

        predicted = {
            "relevance": predicted_relevance,
            "utilization": predicted_utilization,
            "completeness": predicted_completeness,
            "adherence": predicted_adherence
        }

        ground_truth = {
            "relevance": sample.get("ragas_context_relevance", 0),
            "utilization": sample.get("utilization_score", 0),
            "completeness": sample.get("completeness_score", 0),
            "adherence": 1 if sample.get("adherence_score", False) else 0
        }

        print(f"Question: {question}")
        print(f"Predicted Answer: {predicted_answer}")
        print(f"Context: {contexts}")
        print(f"Original Answer: {original_answer}")
        print(f"Trace Output: {trace}")
        print(f"Predicted Metrics: {predicted}")
        print(f"Ground Truth Metrics: {ground_truth}")

        return predicted, ground_truth, trace

    def evaluate_dataset(self, dataset: List[dict], number_of_samples = 30):
        """Evaluate up to ``number_of_samples`` samples and report RMSE and AUC-ROC.

        Args:
            dataset: Iterable of RAGBench samples.
            number_of_samples: Maximum number of successfully evaluated samples.
                The default of 30 matches the limit that used to be hardcoded;
                ``None`` disables the limit.

        Returns:
            ``(trace_outputs, rmse, auc)``; ``rmse``/``auc`` are NaN when they cannot
            be computed. Samples that raise are reported and skipped.
        """
        pred_all, gt_all = [], []
        pred_adherence, gt_adherence = [], []
        trace_outputs = []
        sample_count = 0

        print(f"Type of dataset {type(dataset)}")

        for sample in dataset:
            if number_of_samples is not None and sample_count >= number_of_samples:
                print(f"Limit {number_of_samples} reached. Exiting")
                break
            try:
                pred, gt, trace = self.evaluate_sample(sample)
            except Exception as e:
                print(f"Skipping sample due to error: {e}")
                continue

            # Keep a metric pair only when both sides are real numbers
            # (ground-truth columns may be missing/None or NaN).
            for k in pred:
                if self._is_usable_score(pred[k]) and self._is_usable_score(gt[k]):
                    pred_all.append(pred[k])
                    gt_all.append(gt[k])
            if self._is_usable_score(pred["adherence"]) and self._is_usable_score(gt["adherence"]):
                pred_adherence.append(pred["adherence"])
                gt_adherence.append(gt["adherence"])

            trace_outputs.append(trace)
            sample_count += 1

        rmse = float('nan')
        auc = float('nan')

        if len(gt_all) > 0:
            rmse = mean_squared_error(gt_all, pred_all) ** 0.5
            # AUC-ROC is undefined unless both classes occur in the ground truth.
            if len(set(gt_adherence)) > 1:
                auc = roc_auc_score(gt_adherence, pred_adherence)
            else:
                print("AUC cannot be calculated with only one class in ground truth adherence.")

            print(f"\n✅ RMSE: {rmse:.4f}")
            print(f"✅ AUC-ROC (Adherence): {auc:.4f}")
        else:
            print("\n❌ No samples were successfully evaluated. Cannot compute metrics.")

        return trace_outputs, rmse, auc

In [12]:
import pprint

# Step 2: Initialize generator and judge
def rag_pipeline(subset = "finqa", judgeModel = "llama-3.3-70b-versatile", embedding_model_input = "FinLang/finance-embeddings-investopedia", summary_model_input = "mistral-saba-24b", subset_input = ["finqa", "tatqa"], load_data = False, top_k=30, chunking_strategy="semantic_chunker"):
    """Build retriever, generator and judge, evaluate RAGBench and return ``(rmse, auc)``.

    Args:
        subset: Name used for the Qdrant collection (``ragbench_<subset>``).
        judgeModel: Groq model used by the judge.
        embedding_model_input: Hugging Face embedding model for the retriever.
        summary_model_input: Groq model used to generate answers.
        subset_input: RAGBench domains to index and evaluate.
        load_data: When true, the retriever rebuilds its collection first.
        top_k: Number of chunks retrieved per question.
        chunking_strategy: Chunking strategy forwarded to the retriever.

    Returns:
        Tuple ``(rmse, auc)`` as produced by ``RAGEvaluator.evaluate_dataset``.
    """
    # Copy so the shared default list is never handed out for mutation.
    domains = list(subset_input)

    # Step 1: retriever — `subset` is now forwarded instead of the hardcoded "finqa".
    retriever = RAGRetriever(
        subset=subset,
        top_k=top_k,
        load_data=load_data,
        embedding_model=embedding_model_input,
        subset_input=domains,
        chunking_strategy=chunking_strategy,
    )

    # Step 2: Initialize generator and judge (temperature 0 for repeatable output)
    generator = RAGGenerator(model=summary_model_input, llm="groq", temperature=0)
    ragbench_dataset = load_ragbench(domains)
    judge = RAGJudge(model=judgeModel, llm="groq", temperature=0)

    # Step 3: Evaluate the dataset
    evaluator = RAGEvaluator(retriever, generator, judge)
    trace_outputs, rmse, auc = evaluator.evaluate_dataset(ragbench_dataset)
    return rmse, auc

In [None]:
# Run the evaluation on the "finqa" and "tatqa" domains with the qwen/qwen3-32b generator.
# NOTE: load_data=True makes RAGRetriever recreate its Qdrant collection and re-index
# the documents before evaluating.
finance_rmse, finance_auroc = rag_pipeline(load_data = True, top_k=25, subset_input = ["finqa", "tatqa"], subset = "finance", summary_model_input = "qwen/qwen3-32b")
print(f"RMSE: {finance_rmse}, AUC: {finance_auroc}")

# Disabled run: "cuad" domain with a legal-domain embedding model.
#legal_rmse, legal_auroc = rag_pipeline(load_data = True, top_k=30, subset_input = ["cuad"], subset = "legal", embedding_model_input = 'nlpaueb/legal-bert-base-uncased')
#print(f"RMSE: {legal_rmse}, AUC: {legal_auroc}")

# Disabled run: "covidqa" and "pubmedqa" domains with a biomedical embedding model.
#med_rmse, med_auroc = rag_pipeline(load_data = True, top_k=30, subset_input = ["covidqa", "pubmedqa"], subset = "biomedical", embedding_model_input = 'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract')
#print(f"RMSE: {med_rmse}, AUC: {med_auroc}")

In [None]:
import gradio as gr

# Create Gradio input components.
# A checkbox yields a real boolean (the previous dropdown defaulted to the string
# "False", which is truthy).
reload_data = gr.Checkbox(
    label="Do you want to reload the data ?",
    value=False
)

subset_input = gr.Dropdown(
    choices=["finqa", "hotpotqa"], # Add other subsets if available
    label="Select RAGBench Subset",
    value="finqa"
)

# "lama3-70b-8192" was a misspelt duplicate; it is replaced by rag_pipeline's default judge.
judge_model_input = gr.Dropdown(
    choices=["llama-3.3-70b-versatile", "llama3-70b-8192"],
    label="Select RAGJudge Model",
    value="llama3-70b-8192"
)

# Embedding model used by the retriever
embedding_model_input = gr.Dropdown(
    choices=["sentence-transformers/all-mpnet-base-v2", "ProsusAI/finbert"],
    label="Select Embedding Model",
    value="ProsusAI/finbert"
)

# Model used to generate the answers
summary_model_input = gr.Dropdown(
    choices=["llama3-8b-8192", "llama3-70b-8192"],
    label="Select Summary Model",
    value="llama3-8b-8192"
)

output_rmse = gr.Number(label="RMSE")
output_auc = gr.Number(label="AUC-ROC (Adherence)")


def run_evaluation(subset, judge_model, embedding_model, summary_model, reload):
    """Adapter between the Gradio form and ``rag_pipeline``.

    Gradio passes the inputs positionally, which does not line up with the
    parameter order of ``rag_pipeline`` (the reload flag would land on
    ``subset_input``), so every value is forwarded by keyword here.

    Returns:
        ``(rmse, auc)``; NaN values are turned into ``None`` so the number
        fields are left empty instead of showing NaN.
    """
    rmse, auc = rag_pipeline(
        subset=subset,
        judgeModel=judge_model,
        embedding_model_input=embedding_model,
        summary_model_input=summary_model,
        subset_input=[subset],
        load_data=bool(reload),
    )
    # value != value is true only for NaN.
    return tuple(None if value != value else value for value in (rmse, auc))


# Create the Gradio app. rag_pipeline returns only (rmse, auc), so exactly two
# outputs are declared; a third "trace" output would never receive a value.
app = gr.Interface(
    fn=run_evaluation,
    inputs=[subset_input, judge_model_input, embedding_model_input, summary_model_input, reload_data],
    outputs=[output_rmse, output_auc],
    title="RAGBench Dataset Evaluation",
    description="Evaluate RAG performance on a selected RAGBench subset.",
)

# Launch the app
if __name__ == "__main__":
    app.launch(debug=True)