### Evaluation

In [25]:
import pandas as pd
from src.eval import hit_rate_at_k, recall_at_k, mrr_at_k


def evaluate(facts, retrieved, metric, k):
    """Score one example's retrieved facts against its gold facts.

    Args:
        facts: The gold facts, either as a string holding a Python literal
            (a sequence of dicts that each carry an ``"id"`` key) or as the
            already-parsed sequence itself.
        retrieved: Sequence of dicts, each with an ``"id"`` key.
        metric: Callable invoked as ``metric(gold=..., retrieved=..., k=...)``.
        k: Cut-off forwarded unchanged to ``metric``.

    Returns:
        Whatever ``metric`` returns for this example.

    Raises:
        ValueError, SyntaxError: If ``facts`` is a string that is not a valid
            Python literal.
    """
    import ast  # local import so this cell does not depend on edits elsewhere

    # SECURITY: the original used eval() here. `facts` comes from a downloaded
    # dataset, so eval() would execute arbitrary code embedded in a row.
    # ast.literal_eval only accepts literals (lists, dicts, strings, numbers...).
    parsed = ast.literal_eval(facts) if isinstance(facts, str) else facts

    gold = [f["id"] for f in parsed]
    retr = [f["id"] for f in retrieved]
    return metric(gold=gold, retrieved=retr, k=k)


def calculate_score(data, kset=(1, 2, 4, 8)):
    """Summarise retrieval metrics over a dataset at several cut-offs.

    ``data`` is converted to a DataFrame and must provide ``facts`` and
    ``retrieved_facts`` columns. For every cut-off in ``kset`` and every
    metric (hit rate, recall, MRR) a per-row score is computed through
    ``evaluate``; the per-row scores are then averaged.

    Returns:
        A DataFrame with one row per cut-off and the columns ``k``,
        ``hit_rate``, ``recall`` and ``mrr``. Each metric value is the column
        mean multiplied by 100 and rounded to two decimals.
    """
    frame = pd.DataFrame(data)

    scorers = {
        "hit_rate": hit_rate_at_k,
        "recall": recall_at_k,
        "mrr": mrr_at_k,
    }

    def column_name(metric_name, cutoff):
        # Single source of truth for the per-row score column labels.
        return f"{metric_name}_at_k-{cutoff}"

    # Per-row scores: one new column per (cut-off, metric) pair.
    for cutoff in kset:
        for metric_name, scorer in scorers.items():
            # Bind scorer/cutoff as defaults so the lambda does not pick up
            # later loop values.
            per_row = frame.apply(
                lambda row, scorer=scorer, cutoff=cutoff: evaluate(
                    row["facts"], row["retrieved_facts"], scorer, cutoff
                ),
                axis=1,
            )
            frame[column_name(metric_name, cutoff)] = per_row

    # Aggregate: mean of each score column, expressed as a percentage.
    summary_rows = []
    for cutoff in kset:
        row = {"k": cutoff}
        for metric_name in scorers:
            mean_pct = frame[column_name(metric_name, cutoff)].mean() * 100
            row[metric_name] = mean_pct.round(2)
        summary_rows.append(row)

    return pd.DataFrame(summary_rows)

### Pipeline

In [26]:
from datasets import load_dataset

# Fetch the dataset by its hub identifier.
dataset = load_dataset("weerayut/iq-data")

# Score the validation split at the default cut-offs.
validation_split = dataset["validation"]
valid_scores = calculate_score(validation_split)

In [27]:
# Show the per-cut-off summary (k, hit_rate, recall, mrr) for the validation split.
valid_scores

Unnamed: 0,k,hit_rate,recall,mrr
0,1,84.0,27.03,84.0
1,2,92.0,45.28,88.0
2,4,95.0,64.68,88.92
3,8,98.0,85.65,89.46
