## Configs

In [7]:
from pathlib import Path
import pandas as pd
import numpy as np
import re
from tqdm import tqdm
from rank_bm25 import BM25Okapi

# Root folder holding one sub-directory per dataset; adjust if the data was saved elsewhere.
ROOT = Path("./data").resolve()

# BEIR datasets from the paper:
DATASETS = ["scifact", "fiqa", "nfcorpus"]
TOPK = 100  # default number of documents retrieved per query during evaluation

# Let pandas show up to 200 characters per cell so queries/titles are not truncated early.
pd.set_option("display.max_colwidth", 200)

## Utils I/O

In [2]:
def load_parquet_or_jsonl(path_parquet: Path, path_jsonl: Path) -> pd.DataFrame:
    """Load a table from whichever of the two candidate files exists.

    The Parquet file takes precedence; the JSON Lines file is only read when
    the Parquet file is absent.

    Args:
        path_parquet: Location of the Parquet version of the table.
        path_jsonl: Location of the JSON Lines version of the table.

    Returns:
        The table as a DataFrame.

    Raises:
        FileNotFoundError: If neither file exists.
    """
    # Candidates in priority order, each with the reader and options it needs.
    candidates = (
        (path_parquet, pd.read_parquet, {}),
        (path_jsonl, pd.read_json, {"lines": True}),
    )
    for location, reader, options in candidates:
        if location.exists():
            return reader(location, **options)
    raise FileNotFoundError(f"Faltam arquivos: {path_parquet} | {path_jsonl}")

def load_beir_processed(ds_name: str):
    """Load the processed corpus, queries and qrels tables of one dataset.

    Files are looked up under ``ROOT / ds_name / "processed" / "beir"``; each
    table is read from ``<table>.parquet`` when present, otherwise from
    ``<table>.jsonl``.

    Light normalisation applied before returning:
      * ``doc_id`` / ``query_id`` columns are cast to ``str`` in every table;
      * qrels get a ``split`` column filled with ``"test"`` when missing;
      * qrels get a ``score`` column filled with ``1`` when missing.

    Args:
        ds_name: Name of the dataset folder under ``ROOT``.

    Returns:
        Tuple ``(df_corpus, df_queries, df_qrels)``.
    """
    beir_dir = ROOT / ds_name / "processed" / "beir"

    def _read(stem: str) -> pd.DataFrame:
        # Same Parquet-first / JSONL-fallback lookup for every table.
        return load_parquet_or_jsonl(beir_dir / f"{stem}.parquet", beir_dir / f"{stem}.jsonl")

    df_corpus, df_queries, df_qrels = (
        _read(stem) for stem in ("corpus", "queries", "qrels")
    )

    # Identifiers are compared across tables, so force one common dtype.
    df_corpus["doc_id"] = df_corpus["doc_id"].astype(str)
    df_queries["query_id"] = df_queries["query_id"].astype(str)

    qrels_columns = df_qrels.columns
    if "split" not in qrels_columns:
        df_qrels["split"] = "test"  # fallback when no split information is stored
    for id_column in ("query_id", "doc_id"):
        df_qrels[id_column] = df_qrels[id_column].astype(str)
    if "score" not in df_qrels.columns:
        df_qrels["score"] = 1  # treat every judgement as relevant

    return df_corpus, df_queries, df_qrels

def pick_split_available(qrels: pd.DataFrame, prefer="test"):
    """Choose which qrels split to use.

    The candidates are tried in the order ``prefer``, ``"dev"``,
    ``"validation"``, ``"train"``; the first one found in the ``split`` column
    wins. When none of them is present, the split of the first qrels row is
    returned.

    Args:
        qrels: Relevance judgements with a ``split`` column.
        prefer: Split name tried before the built-in fallbacks.

    Returns:
        The name of the selected split.
    """
    available = set(qrels["split"].unique().tolist())
    candidates = (prefer, "dev", "validation", "train")
    found = [candidate for candidate in candidates if candidate in available]
    if found:
        return found[0]
    # No known split name: fall back to whatever the first row carries.
    return qrels["split"].iloc[0]

## Tokenização

In [3]:
# Simple tokenization: a token is a run of ASCII letters, digits or underscores.
_tok_re = re.compile(r"[A-Za-z0-9_]+")
def tokenize(text: str):
    """Split ``text`` into lower-cased alphanumeric tokens.

    Args:
        text: Text to tokenize. ``None`` and float NaN (the marker pandas uses
            for missing text) are treated as empty; any other non-string value
            is converted with ``str()`` first.

    Returns:
        List of lower-cased tokens in order of appearance; empty when there is
        nothing to tokenize.
    """
    # NaN is the only float that differs from itself. Without this guard a
    # missing value would be stringified to "nan" and produce a bogus token.
    if text is None or (isinstance(text, float) and text != text):
        return []
    if not isinstance(text, str):
        text = str(text)
    return [t.lower() for t in _tok_re.findall(text)]

## Métricas (Top-k)

In [4]:
# Métricas
def mrr_at_k(ranked, gold_set, k=10):
    """Reciprocal rank of the first relevant document within the top ``k``.

    Args:
        ranked: Document ids ordered from best to worst.
        gold_set: Ids of the relevant documents.
        k: Cut-off; only the first ``k`` entries of ``ranked`` are inspected.

    Returns:
        ``1 / rank`` (1-based) of the first relevant document, or ``0.0`` when
        none appears in the top ``k``.
    """
    first_hit = next(
        (position for position, doc_id in enumerate(ranked[:k], start=1) if doc_id in gold_set),
        None,
    )
    return 0.0 if first_hit is None else 1.0 / first_hit

def dcg_at_k(ranked, gains, k=10):
    """Discounted cumulative gain of the top ``k`` results.

    Uses the exponential gain ``2**g - 1`` with a ``log2(rank + 1)`` discount.
    Documents missing from ``gains`` or with a non-positive gain contribute
    nothing.

    Args:
        ranked: Document ids ordered from best to worst.
        gains: Mapping ``doc_id -> graded relevance``.
        k: Cut-off; only the first ``k`` entries of ``ranked`` are scored.

    Returns:
        The DCG value (``0.0`` when no scored document is relevant).
    """
    dcg = 0.0
    for rank, doc_id in enumerate(ranked[:k], start=1):
        gain = gains.get(doc_id, 0.0)
        if gain > 0:
            dcg += (2**gain - 1) / np.log2(rank + 1)
    return dcg

def ndcg_at_k(ranked, gains, k=10):
    """Normalised DCG of the top ``k`` results.

    The ideal ranking is built from the positive gains only, mirroring
    ``dcg_at_k``. Including non-positive judgements in the ideal would let
    negative gains shrink the normaliser and push the score above 1.

    Args:
        ranked: Document ids ordered from best to worst.
        gains: Mapping ``doc_id -> graded relevance``.
        k: Cut-off applied to both the ranking and the ideal ranking.

    Returns:
        ``DCG@k / IDCG@k``, or ``0.0`` when there is no positive gain at all.
    """
    ideal = sorted((g for g in gains.values() if g > 0), reverse=True)[:k]
    idcg = 0.0
    for rank, gain in enumerate(ideal, start=1):
        idcg += (2**gain - 1) / np.log2(rank + 1)
    if idcg == 0:
        return 0.0
    return dcg_at_k(ranked, gains, k) / idcg

def average_precision_at_k(ranked, gold_set, k=10):
    """Average precision over the top ``k`` results.

    Precision is accumulated at every rank where a relevant document appears
    and the total is divided by ``min(len(gold_set), k)``.

    Args:
        ranked: Document ids ordered from best to worst.
        gold_set: Ids of the relevant documents.
        k: Cut-off; only the first ``k`` entries of ``ranked`` are inspected.

    Returns:
        The AP@k value, or ``0.0`` when ``gold_set`` is empty.
    """
    relevant_seen = 0
    precision_total = 0.0
    for position, doc_id in enumerate(ranked[:k], start=1):
        if doc_id not in gold_set:
            continue
        relevant_seen += 1
        precision_total += relevant_seen / position
    if not gold_set:
        return 0.0
    return precision_total / min(len(gold_set), k)

def recall_at_k(ranked, gold_set, k=10):
    """Fraction of the relevant documents found in the top ``k`` results.

    Args:
        ranked: Document ids ordered from best to worst.
        gold_set: Set with the ids of the relevant documents.
        k: Cut-off; only the first ``k`` entries of ``ranked`` are inspected.

    Returns:
        ``|top-k ∩ gold_set| / |gold_set|``, or ``0.0`` when ``gold_set`` is
        empty.
    """
    if gold_set:
        retrieved = set(ranked[:k])
        found = retrieved & gold_set
        return len(found) / len(gold_set)
    return 0.0

## Índice BM25

In [8]:
def build_bm25_index(df_corpus: pd.DataFrame):
    """Build a BM25 index over the corpus.

    Each document is represented by its title and text joined with a single
    space (missing values count as empty strings) and tokenized with
    ``tokenize``.

    Args:
        df_corpus: Corpus table with ``title`` and ``text`` columns.

    Returns:
        Tuple ``(bm25, tokenized)``: the ``BM25Okapi`` index and the list of
        token lists it was built from, in corpus row order.
    """
    titles = df_corpus["title"].fillna("")
    bodies = df_corpus["text"].fillna("")
    documents = (titles + " " + bodies).tolist()
    tokenized = list(map(tokenize, documents))
    return BM25Okapi(tokenized), tokenized

def rank_with_bm25(bm25: BM25Okapi, df_corpus: pd.DataFrame, query_text: str, topk=TOPK):
    """Retrieve the best-scoring documents for a query.

    Args:
        bm25: Index to score against. Assumed to have been built from
            ``df_corpus`` in the same row order, because scores are mapped
            back to ``doc_id`` by position.
        df_corpus: Corpus table with a ``doc_id`` column.
        query_text: Raw query; it is tokenized with ``tokenize``.
        topk: Maximum number of documents to return. Values larger than the
            number of scored documents are clamped; values ``<= 0`` yield an
            empty result.

    Returns:
        Tuple ``(doc_ids, scores)`` sorted by descending score: a list of
        document ids and the matching array of scores.
    """
    qtok = tokenize(query_text)
    scores = bm25.get_scores(qtok)
    # np.argpartition raises ValueError when kth is out of bounds, and a
    # `[-0:]` slice would return everything, so clamp and special-case k.
    k = min(topk, len(scores))
    if k <= 0:
        return [], scores[:0]
    # Select the k best indices without sorting the full score array ...
    top_idx = np.argpartition(scores, -k)[-k:]
    # ... then order only those k by descending score.
    top_idx = top_idx[np.argsort(scores[top_idx])[::-1]]
    return df_corpus["doc_id"].iloc[top_idx].tolist(), scores[top_idx]

## Loop de avaliação

In [9]:
def evaluate_dataset(name: str, topk=TOPK, only_test=True, show_examples=3):
    """Evaluate BM25 retrieval on one processed dataset.

    Loads the dataset, picks a qrels split ("test" is always preferred, with
    the fallbacks of ``pick_split_available``), builds a BM25 index over the
    corpus, ranks every judged query and averages the top-10 metrics. It also
    prints progress information and displays a few example queries.

    Args:
        name: Dataset folder name, forwarded to ``load_beir_processed``.
        topk: Number of documents retrieved per query before the metrics are
            cut at 10.
        only_test: When true, print a warning if the "test" split is missing
            and another split is used instead. The split selection itself is
            the same either way.
        show_examples: How many of the first judged queries to include in the
            qualitative sample table.

    Returns:
        Dict with the mean ``"MRR@10"``, ``"nDCG@10"``, ``"MAP@10"`` and
        ``"Recall@10"`` over the evaluated queries (``0.0`` for each when no
        query could be evaluated).
    """
    print(f"\n====== {name} ======")
    df_corpus, df_queries, df_qrels = load_beir_processed(name)
    print("corpus:", df_corpus.shape, "| queries:", df_queries.shape, "| qrels:", df_qrels.shape)

    # "test" is preferred unconditionally; `only_test` only decides whether a
    # fallback to another split is announced.
    split = pick_split_available(df_qrels, prefer="test")
    if only_test and split != "test":
        print(f"⚠️ Split 'test' não encontrado em {name}; usando '{split}' para rodar mesmo assim.")
    qrels_split = df_qrels[df_qrels["split"] == split].copy()
    print("usando split:", split, "| qrels:", qrels_split.shape)

    # Map query_id -> {doc_id: score}
    qrels_map = {}
    for row in qrels_split.itertuples(index=False):
        qrels_map.setdefault(row.query_id, {})[row.doc_id] = int(getattr(row, "score", 1))

    # Build the lexical index
    bm25, _ = build_bm25_index(df_corpus)

    # Text lookup restricted to the queries judged in the selected split
    qdf = df_queries[df_queries["query_id"].isin(qrels_split["query_id"].unique())]
    q_lookup = dict(zip(qdf["query_id"], qdf["query"]))

    metrics = {"MRR@10": [], "nDCG@10": [], "MAP@10": [], "Recall@10": []}
    skipped = 0  # judged queries that have no text in the queries table

    for qid, gold_gains in tqdm(qrels_map.items(), desc=f"BM25 {name} ({split})"):
        qtxt = q_lookup.get(qid)
        if qtxt is None:
            skipped += 1
            continue
        # Binary relevance view of the graded judgements
        gold_set = {d for d, s in gold_gains.items() if s > 0}
        ranked, _ = rank_with_bm25(bm25, df_corpus, qtxt, topk=topk)

        metrics["MRR@10"].append(mrr_at_k(ranked, gold_set, k=10))
        metrics["nDCG@10"].append(ndcg_at_k(ranked, gold_gains, k=10))
        metrics["MAP@10"].append(average_precision_at_k(ranked, gold_set, k=10))
        metrics["Recall@10"].append(recall_at_k(ranked, gold_set, k=10))

    # Dropped queries change what the averages mean, so make them visible.
    if skipped:
        print(f"⚠️ {skipped} consulta(s) do split '{split}' sem texto em queries; ignoradas nas médias.")

    # Aggregate
    results = {m: float(np.mean(v)) if v else 0.0 for m, v in metrics.items()}
    print("Resultados (médias):", results)

    # Qualitative examples: top-1 document and rank of the first hit in the top 10
    examples = []
    for qid in list(qrels_map.keys())[:show_examples]:
        qtxt = q_lookup.get(qid)
        if not qtxt:
            continue
        ranked, _ = rank_with_bm25(bm25, df_corpus, qtxt, topk=10)
        gold = {d for d, s in qrels_map[qid].items() if s > 0}
        hit_rank = next((i+1 for i, did in enumerate(ranked) if did in gold), None)
        # Title of the top-1 document
        top1 = ranked[0] if ranked else None
        title_top1 = df_corpus.loc[df_corpus["doc_id"] == top1, "title"].head(1).tolist()
        title_top1 = title_top1[0] if title_top1 else None
        examples.append({
            "query_id": qid,
            "query": qtxt,
            "gold_size": len(gold),
            "hit@10_rank": hit_rank,
            "top1_doc_id": top1,
            "top1_title": title_top1
        })
    ex_df = pd.DataFrame(examples)
    if not ex_df.empty:
        print("\nAmostras de consultas (top-1 e acerto@10):")
        display(ex_df)
    return results


In [10]:
# Run the BM25 evaluation on every configured dataset; results are keyed by dataset name.
all_results = {}
for ds in DATASETS:
    all_results[ds] = evaluate_dataset(ds, only_test=True)

# Summary table: transposed so that each dataset is a row and each metric a column.
print("\n== Resumo (médias por dataset) ==")
display(pd.DataFrame(all_results).T)


corpus: (5183, 4) | queries: (1109, 2) | qrels: (1258, 4)
usando split: test | qrels: (339, 4)


BM25 scifact (test): 100%|██████████| 300/300 [00:04<00:00, 68.11it/s]

Resultados (médias): {'MRR@10': 0.6183650793650793, 'nDCG@10': 0.6522849449944166, 'MAP@10': 0.607113492063492, 'Recall@10': 0.7756666666666666}

Amostras de consultas (top-1 e acerto@10):





Unnamed: 0,query_id,query,gold_size,hit@10_rank,top1_doc_id,top1_title
0,1,0-dimensional biomaterials show inductive properties.,1,,10608397,High-performance neuroprosthetic control by an individual with tetraplegia.
1,3,"1,000 genomes project enables mapping of genetic sequence variation consisting of rare variants with larger penetrance effects than common variants.",1,1.0,14717500,Rare Variants Create Synthetic Genome-Wide Associations
2,5,1/2000 in UK have abnormal PrP positivity.,1,1.0,13734012,Prevalent abnormal prion protein in human appendixes after bovine spongiform encephalopathy epizootic: large scale survey



corpus: (57638, 4) | queries: (6648, 2) | qrels: (17110, 4)
usando split: test | qrels: (1706, 4)


BM25 fiqa (test): 100%|██████████| 648/648 [04:50<00:00,  2.23it/s]


Resultados (médias): {'MRR@10': 0.270334852047815, 'nDCG@10': 0.21670365375682496, 'MAP@10': 0.15995581497345165, 'Recall@10': 0.2780129849574294}

Amostras de consultas (top-1 e acerto@10):


Unnamed: 0,query_id,query,gold_size,hit@10_rank,top1_doc_id,top1_title
0,8,How to deposit a cheque issued to an associate in my business into my business account?,2,1.0,65404,
1,15,Can I send a money order from USPS as a business?,1,,420483,
2,18,1 EIN doing business under multiple business names,1,10.0,377152,



corpus: (3633, 4) | queries: (3237, 2) | qrels: (134294, 4)
usando split: test | qrels: (12334, 4)


BM25 nfcorpus (test): 100%|██████████| 323/323 [00:02<00:00, 108.39it/s]


Resultados (médias): {'MRR@10': 0.5084549609317411, 'nDCG@10': 0.30729352479751965, 'MAP@10': 0.21908235516428795, 'Recall@10': 0.15222522666353797}

Amostras de consultas (top-1 e acerto@10):


Unnamed: 0,query_id,query,gold_size,hit@10_rank,top1_doc_id,top1_title
0,PLAIN-2,Do Cholesterol Statin Drugs Cause Breast Cancer?,24,1.0,MED-14,Statin use after diagnosis of breast cancer and survival: a population-based cohort study.
1,PLAIN-12,Exploiting Autophagy to Live Longer,30,,MED-4711,Licorice and licochalcone-A induce autophagy in LNCaP prostate cancer cells by suppression of Bcl-2 expression and the mTOR pathway.
2,PLAIN-23,How to Reduce Exposure to Alkylphenols Through Your Diet,90,2.0,MED-1221,Clostridium difficile in foods and animals: history and measures to reduce exposure.



== Resumo (médias por dataset) ==


Unnamed: 0,MRR@10,nDCG@10,MAP@10,Recall@10
scifact,0.618365,0.652285,0.607113,0.775667
fiqa,0.270335,0.216704,0.159956,0.278013
nfcorpus,0.508455,0.307294,0.219082,0.152225


In [11]:
# Sanity check: for each dataset print the corpus size, the total number of queries,
# and the number of qrels rows / distinct query ids in the "test" split.
# NOTE(review): this reloads every dataset from disk and only looks at rows whose
# split is exactly "test", so it reports zeros for a dataset without that split.
for name in DATASETS:
    df_corpus, df_queries, df_qrels = load_beir_processed(name)
    qrels_test = df_qrels[df_qrels["split"] == "test"]
    print(
        f"{name:9s} | corpus={len(df_corpus):6d} | queries_total={len(df_queries):5d} "
        f"| qrels_test_linhas={len(qrels_test):6d} | qrels_test_queries_unicas={qrels_test['query_id'].nunique():5d}"
    )

scifact   | corpus=  5183 | queries_total= 1109 | qrels_test_linhas=   339 | qrels_test_queries_unicas=  300
fiqa      | corpus= 57638 | queries_total= 6648 | qrels_test_linhas=  1706 | qrels_test_queries_unicas=  648
nfcorpus  | corpus=  3633 | queries_total= 3237 | qrels_test_linhas= 12334 | qrels_test_queries_unicas=  323
