# CUBO Evaluation & Benchmarking

A professional RAG system isn't just about getting answers—it's about measuring quality.
This notebook demonstrates how to benchmark CUBO using Information Retrieval (IR) metrics and Generative Metrics (RAGAS).

## What You'll Learn
1. Create a synthetic Ground Truth dataset.
2. Run retrieval evaluation (Recall@K).
3. Run generative evaluation (RAGAS).

In [None]:
import sys
from pathlib import Path

# Make the CUBO packages importable: take the parent of the current working
# directory and put it at the front of sys.path unless it is already listed.
# NOTE(review): assumes the notebook is run from a folder directly below the
# repository root — confirm.
cubo_root = Path(".").resolve().parent
cubo_root_str = str(cubo_root)
if cubo_root_str not in sys.path:
    sys.path.insert(0, cubo_root_str)

from cubo.core import CuboCore
from evaluation.metrics import IRMetricsEvaluator
import pandas as pd

# Initialize core engine
# Create the CuboCore instance and call initialize_components() on it before
# the later cells add documents and run queries through `core`.
core = CuboCore()
core.initialize_components()
# Simple progress marker so the notebook output shows setup finished.
print("CUBO Core Initialized")

## 1. Create Ground Truth Dataset

In [None]:
# Tiny sample corpus: every entry carries its text, a stable id that the
# ground-truth mapping refers to, and a file_path label.
documents = [
    {
        "text": "The Eiffel Tower is located in Paris, France.",
        "id": "doc_paris",
        "file_path": "paris.txt",
    },
    {
        "text": "The Colosseum is an ancient amphitheater in Rome, Italy.",
        "id": "doc_rome",
        "file_path": "rome.txt",
    },
    {
        "text": "Sushi is a traditional dish from Japan.",
        "id": "doc_japan",
        "file_path": "japan.txt",
    },
    {
        "text": "Pizza originated in Naples, Italy.",
        "id": "doc_naples",
        "file_path": "naples.txt",
    },
]

# Hand the sample documents to the engine so the queries below have content
# to retrieve (presumably this indexes them — confirm against CuboCore).
core.add_documents(documents)

# Ground truth for the evaluation: each query string maps to the list of
# document ids (the "id" values of the sample documents) that count as
# relevant for that query. Insertion order matters, because the RAGAS cell
# pairs these queries with its reference answers by position.
ground_truth = {
    "Where is the Eiffel Tower?": ["doc_paris"],
    "Tell me about Italian food": ["doc_rome", "doc_naples"],
    "Famous buildings in Italy": ["doc_rome"],
    "Japanese cuisine": ["doc_japan"],
}

## 2. Evaluate Retrieval (Recall@K)

In [None]:
# Score retrieval for every ground-truth query, collecting one row per query.
results = []
for query, relevant_ids in ground_truth.items():
    # Fetch the top 3 hits and keep only the id stored in each hit's metadata.
    hits = core.query_retrieve(query, top_k=3)
    hit_ids = [hit["metadata"].get("id") for hit in hits]

    # The query text doubles as the question id, so the ground-truth mapping
    # handed to the evaluator is keyed by the same string.
    ir_metrics = IRMetricsEvaluator.evaluate_retrieval(
        question_id=query,
        retrieved_ids=hit_ids,
        ground_truth={query: relevant_ids},
        k_values=[1, 3],
    )

    row = {
        "query": query,
        "recall@3": ir_metrics["recall_at_k"][3],
        "mrr": ir_metrics["mrr"],
    }
    results.append(row)

# Tabulate the per-query scores and report the mean Recall@3 across queries.
df = pd.DataFrame(results)
print(df)
avg_recall_at_3 = df["recall@3"].mean()
print(f"Avg Recall@3: {avg_recall_at_3}")

## 3. Evaluate Generation (RAGAS)

In [None]:
# RAGAS evaluation is best-effort: any failure inside this cell (import,
# generation, scoring) is reported with a message and the step is skipped
# rather than aborting the notebook.
try:
    from evaluation.ragas_evaluator import run_ragas_evaluation

    # Questions come from the ground-truth keys; the reference answers below
    # are matched to them by position.
    questions = list(ground_truth)
    ground_truths_list = ["Paris", "Italian food", "Colosseum", "Sushi"]

    contexts = []
    answers = []

    for question in questions:
        # Generate an answer from the top 2 retrieved sources, then record the
        # answer and the source texts it was built from.
        response = core.query_and_generate(question, top_k=2)
        answers.append(response["answer"])
        source_texts = [source["document"] for source in response["sources"]]
        contexts.append(source_texts)

    scores = run_ragas_evaluation(questions, contexts, ground_truths_list, answers)
    print("RAGAS Scores:", scores)
except Exception as exc:
    print("RAGAS Eval skipped:", str(exc))