# Step 0. set up

In [81]:
#Step 0 — Setup
import sys
from pathlib import Path

# Adjust if needed — point to your repo
REPO_DIR = Path("/Users/ywxiu/jasp-multimodal-rag")
sys.path.append(str(REPO_DIR))



In [82]:
# import data
import pandas as pd
import numpy as np

# Load the file
path = "../data/evaluation/retrieval_results_sheet1.csv"

df = pd.read_csv(path)

df.head()
df.dtypes


QID                   int64
Question             object
Difficulty Level     object
Gold Standard Set    object
Retrieval Mode       object
Rank                  int64
Retrieved Doc        object
Relevance             int64
dtype: object

# Step 1: Calculate 4 Metrics

## 1. Success@k (Binary Accuracy)

Success@k indicates whether the system retrieves at least one fully relevant document within the top k results. 

For each question and retrieval mode, Success@k is assigned a value of 1 if any item in the top k has a relevance label of 2; otherwise, it is assigned 0.


In [83]:
def compute_success_at_k(group):
    """
    Success@k = 1 if ANY fully relevant doc (Relevance == 2)
    appears within the top k retrieved results.
    Otherwise 0.
    """

    # Success if at least one doc in top-k is fully relevant
    success = 1 if (group["Relevance"] == 2).any() else 0

    return pd.Series({"Success@k": success})

results_success = (
    df[df["Difficulty Level"].isin(["easy", "difficult"])]
    .groupby(["QID","Question", "Difficulty Level", "Retrieval Mode"])
    .apply(compute_success_at_k)
    .reset_index()
)
print (results_success)

   QID                                      Question Difficulty Level  \
0    1  How do I open a dataset and view it in JASP?             easy   
1    1  How do I open a dataset and view it in JASP?             easy   
2    1  How do I open a dataset and view it in JASP?             easy   
3    1  How do I open a dataset and view it in JASP?             easy   

            Retrieval Mode  Success@k  
0                     BM25          0  
1         bm_vector_fusion          1  
2  bm_vector_fusion_rerank          1  
3                   vector          1  


  .apply(compute_success_at_k)


In [84]:
# optional: save
out_path = f"success_at_k.csv"
results_success.to_csv(out_path, index=False)

print("Saved to:", out_path)
results_success.head()


Saved to: success_at_k.csv


Unnamed: 0,QID,Question,Difficulty Level,Retrieval Mode,Success@k
0,1,How do I open a dataset and view it in JASP?,easy,BM25,0
1,1,How do I open a dataset and view it in JASP?,easy,bm_vector_fusion,1
2,1,How do I open a dataset and view it in JASP?,easy,bm_vector_fusion_rerank,1
3,1,How do I open a dataset and view it in JASP?,easy,vector,1


## 2. Precision@k (Exact Accuracy)

Precision@k measures the proportion of retrieved documents within the top k that are relevant according to the gold-standard annotations. 

Whereas Succuss@5 evaluates binary accuracy, Precision@5 evaluates exactness—how many of the returned results are correctly relevant.

For each question–mode pair, each of the top 5 retrieved documents was compared against the gold-standard relevance labels (0 = irrelevant, 1 = partially relevant but not in the gold standard set, 2 = fully relevant and listed in the gold standard set ). 

A value of 1 indicates that all retrieved items in the top 5 are relevant; a value of 0 indicates no relevant items were retrieved.


In [85]:
def compute_precision_at_k(group, k=5):
    """
    Precision@k = (# fully relevant docs in top-k) / k
    If fewer than k docs are retrieved, divide by number retrieved.
    
    """
    n_retrieved = len(group)
    n_rel = (group["Relevance"] == 2).sum()

    denom = min(k, n_retrieved)
    precision = n_rel / denom

    return pd.Series({"Precision@k": precision})

results_precision = (
    df[df["Difficulty Level"].isin(["easy", "difficult"])]
    .groupby(["QID","Question", "Difficulty Level", "Retrieval Mode"])
    .apply(compute_precision_at_k, k=5)
    .reset_index()
)

results_precision.head()


  .apply(compute_precision_at_k, k=5)


Unnamed: 0,QID,Question,Difficulty Level,Retrieval Mode,Precision@k
0,1,How do I open a dataset and view it in JASP?,easy,BM25,0.0
1,1,How do I open a dataset and view it in JASP?,easy,bm_vector_fusion,0.2
2,1,How do I open a dataset and view it in JASP?,easy,bm_vector_fusion_rerank,0.2
3,1,How do I open a dataset and view it in JASP?,easy,vector,0.4


In [86]:
# optional: save
out_path = f"precision_at_k.csv"
results_precision.to_csv(out_path, index=False)

print("Saved to:", out_path)
results_precision.head()


Saved to: precision_at_k.csv


Unnamed: 0,QID,Question,Difficulty Level,Retrieval Mode,Precision@k
0,1,How do I open a dataset and view it in JASP?,easy,BM25,0.0
1,1,How do I open a dataset and view it in JASP?,easy,bm_vector_fusion,0.2
2,1,How do I open a dataset and view it in JASP?,easy,bm_vector_fusion_rerank,0.2
3,1,How do I open a dataset and view it in JASP?,easy,vector,0.4


## 3. Recall@k (Completeness)

Recall@k quantifies how completely the system recovers all relevant information for a given query within the top k retrieved documents. 

This metric was computed only for answerable questions (difficulty levels easy and difficult). 

A value of 1 indicates that all gold-relevant documents appear within the top k retrieved items, whereas a value of 0 indicates that none of them were retrieved.


In [87]:
import numpy as np
import pandas as pd

df_answerable = df[df["Difficulty Level"].isin(["easy", "difficult"])]

def parse_gold_set(s):
    if pd.isna(s):
        return []
    return list({x.strip() for x in str(s).split(",") if x.strip()})

def compute_recall_variable_k(group, doc_col="Retrieved Doc"):
    """
    Recall = (# gold-standard docs that were retrieved) / (# gold-standard docs)
    where 'retrieved' = all rows in this group (after your score threshold).
    """
    gold_string = group["Gold Standard Set"].iloc[0]
    gold_docs = set(parse_gold_set(gold_string))
    n_gold = len(gold_docs)
    if n_gold == 0:
        return pd.Series({"Recall@k": np.nan})

    retrieved_docs = group[doc_col].astype(str)

    # count how many *distinct* gold docs were retrieved
    hits = {doc for doc in retrieved_docs if doc in gold_docs}
    n_rel_retrieved = len(hits)

    recall = n_rel_retrieved / n_gold
    return pd.Series({"Recall@k": recall})

results_recall = (
    df_answerable
      .groupby(["QID","Question", "Difficulty Level", "Retrieval Mode"])
      .apply(lambda g: compute_recall_variable_k(g, doc_col="Retrieved Doc"))
      .reset_index()
)
results_recall.head()

  .apply(lambda g: compute_recall_variable_k(g, doc_col="Retrieved Doc"))


Unnamed: 0,QID,Question,Difficulty Level,Retrieval Mode,Recall@k
0,1,How do I open a dataset and view it in JASP?,easy,BM25,0.0
1,1,How do I open a dataset and view it in JASP?,easy,bm_vector_fusion,1.0
2,1,How do I open a dataset and view it in JASP?,easy,bm_vector_fusion_rerank,1.0
3,1,How do I open a dataset and view it in JASP?,easy,vector,1.0


In [88]:
# optional: save
out_path = f"recall_at_k.csv"
results_recall.to_csv(out_path, index=False)

print("Saved to:", out_path)
results_recall.head()

Saved to: recall_at_k.csv


Unnamed: 0,QID,Question,Difficulty Level,Retrieval Mode,Recall@k
0,1,How do I open a dataset and view it in JASP?,easy,BM25,0.0
1,1,How do I open a dataset and view it in JASP?,easy,bm_vector_fusion,1.0
2,1,How do I open a dataset and view it in JASP?,easy,bm_vector_fusion_rerank,1.0
3,1,How do I open a dataset and view it in JASP?,easy,vector,1.0


## 4. nDCG@k(Ranking Quality) 


nDCG@k (Normalized Discounted Cumulative Gain) evaluates the ranking quality of the retrieved documents based on the graded relevance labels assigned during annotation. 

The metric accounts for both relevance and ranking position, which rewards systems that not only retrieve relevant documents but also order them appropriately, with the strongest emphasis placed on highly relevant items appearing early.

 nDCG@k is normalized between 0 and 1, where 1 represents a perfect ranking that matches the ideal ordering, and 0 represents the poorest possible ranking given the relevance configuration. 


In [None]:
import numpy as np
import pandas as pd

def compute_ndcg_variable_k(group):
    """
    nDCG@k with variable k:
      k = number of retrieved docs in this (QID, Difficulty, Mode) group.
    Uses graded relevance in column 'Relevance' (e.g. 0,1,2).
    """

    # sort by rank just in case
    g = group.sort_values("Rank")

    # relevance scores and ranks as arrays
    rel = g["Relevance"].to_numpy(dtype=float)
    ranks = g["Rank"].to_numpy(dtype=float)

    if len(rel) == 0:
        return pd.Series({"nDCG@k": np.nan})

    # ------- DCG -------
    # Compute DCG@k (Discounted Cumulative Gain): 
    # DCG assigns a gain to each retrieved document depending on its relevance
    # So relevant results at the top give much more DCG than at the bottom.

    gains = (2 ** rel) - 1
    dcg = np.sum(gains / np.log2(ranks + 1.0))

    # ------- IDCG (ideal DCG) -------
    # Compute IDCG@k (Ideal DCG):
    # IDCG is the maximum possible DCG for the same results if they were perfectly sorted.
    # To compute this:Sort the relevance values from highest → lowest;Compute DCG on this perfect ordering
    # IDCG represents the best case score.

    ideal_rel = np.sort(rel)[::-1]                     # best ranking
    ideal_ranks = np.arange(1, len(ideal_rel) + 1)
    ideal_gains = (2 ** ideal_rel) - 1
    idcg = np.sum(ideal_gains / np.log2(ideal_ranks + 1.0))

    # -------Normalise: nDCG = DCG / IDCG-------
    # Range:1.0 = perfect ranking; 0.0 = no useful ranking signal;
    # Always between 0 and 1
    # This allows comparing across different questions, datasets, and retrieval modes.
    if idcg == 0:
        ndcg = 0.0
    else:
        ndcg = dcg / idcg

    return pd.Series({"nDCG@k": ndcg})

results_ndcg = (
    df_answerable
      .groupby(["QID", "Question","Difficulty Level", "Retrieval Mode"])
      .apply(compute_ndcg_variable_k)
      .reset_index()
)

results_ndcg.head()


  .apply(compute_ndcg_variable_k)


Unnamed: 0,QID,Question,Difficulty Level,Retrieval Mode,nDCG@k
0,1,How do I open a dataset and view it in JASP?,easy,BM25,0.850345
1,1,How do I open a dataset and view it in JASP?,easy,bm_vector_fusion,0.707579
2,1,How do I open a dataset and view it in JASP?,easy,bm_vector_fusion_rerank,1.0
3,1,How do I open a dataset and view it in JASP?,easy,vector,0.912878


In [90]:
# optional: save
out_path = f"ndcg_at_k.csv"
results_ndcg.to_csv(out_path, index=False)

print("Saved to:", out_path)
results_ndcg.head()

Saved to: ndcg_at_k.csv


Unnamed: 0,QID,Question,Difficulty Level,Retrieval Mode,nDCG@k
0,1,How do I open a dataset and view it in JASP?,easy,BM25,0.850345
1,1,How do I open a dataset and view it in JASP?,easy,bm_vector_fusion,0.707579
2,1,How do I open a dataset and view it in JASP?,easy,bm_vector_fusion_rerank,1.0
3,1,How do I open a dataset and view it in JASP?,easy,vector,0.912878


In [None]:
# Combine all metrics into one dataframe

keys = ["QID","Question", "Difficulty Level", "Retrieval Mode"]

RAG_metrics_combined = (
    results_success
        .merge(results_precision, on=keys, how="inner")
        .merge(results_recall,    on=keys, how="inner")
        .merge(results_ndcg,      on=keys, how="inner")
)

RAG_metrics_combined.head()

Unnamed: 0,QID,Question,Difficulty Level,Retrieval Mode,Success@k,Precision@k,Recall@k,nDCG@k
0,1,How do I open a dataset and view it in JASP?,easy,BM25,0,0.0,0.0,0.850345
1,1,How do I open a dataset and view it in JASP?,easy,bm_vector_fusion,1,0.2,1.0,0.707579
2,1,How do I open a dataset and view it in JASP?,easy,bm_vector_fusion_rerank,1,0.2,1.0,1.0
3,1,How do I open a dataset and view it in JASP?,easy,vector,1,0.4,1.0,0.912878


In [93]:
# optional: save
out_path = f"RAG_metrics_combined.csv"
RAG_metrics_combined.to_csv(out_path, index=False)

print("Saved to:", out_path)


Saved to: RAG_metrics_combined.csv


# Step 2.Show results table per metric