## Configs

In [67]:
from pathlib import Path
import pandas as pd
import numpy as np
import re
from tqdm import tqdm
import sys

# Put the current working directory on sys.path so the `src` package can be imported.
sys.path.insert(0, str(Path(".").resolve()))

# Project classes: dataset schema objects and the TF-IDF retriever.
from src.datasets.schema import Document, Query
from src.retrievers.tfidf_faiss import TFIDFRetriever


# Root folder of the prepared datasets; adjust if they were saved elsewhere.
ROOT = Path("./data").resolve()

# BEIR datasets from the paper.
DATASETS = ["scifact", "fiqa", "nfcorpus"]
TOPK = 100  # default number of documents retrieved per query for evaluation

# Show up to 200 characters per column when displaying DataFrames.
pd.set_option("display.max_colwidth", 200)

## Utils I/O

In [68]:
def load_parquet_or_jsonl(path_parquet: Path, path_jsonl: Path) -> pd.DataFrame:
    """Read a table from Parquet, falling back to a JSON Lines file.

    Args:
        path_parquet: preferred location; read with ``pd.read_parquet`` if it exists.
        path_jsonl: fallback location; read with ``pd.read_json(..., lines=True)``.

    Returns:
        The DataFrame loaded from the first of the two paths that exists.

    Raises:
        FileNotFoundError: when neither path exists.
    """
    # Ordered (path, reader) pairs: the first existing path wins.
    readers = (
        (path_parquet, pd.read_parquet),
        (path_jsonl, lambda source: pd.read_json(source, lines=True)),
    )
    for candidate, read in readers:
        if candidate.exists():
            return read(candidate)
    raise FileNotFoundError(f"Faltam arquivos: {path_parquet} | {path_jsonl}")

def load_beir_processed(ds_name: str):
    """Load the processed BEIR tables (corpus, queries, qrels) of one dataset.

    Files are looked up under ``ROOT / ds_name / "processed" / "beir"``, each
    one as Parquet first and JSON Lines second.

    Light normalisation applied to the loaded frames:
      * ``doc_id`` / ``query_id`` columns are cast to ``str`` in all tables;
      * qrels gets a ``split`` column set to ``"test"`` when it has none;
      * qrels gets a ``score`` column set to ``1`` when it has none.

    Args:
        ds_name: dataset folder name under ``ROOT``.

    Returns:
        Tuple ``(df_corpus, df_queries, df_qrels)``.
    """
    base = ROOT / ds_name / "processed" / "beir"
    sources = (
        (base / "corpus.parquet", base / "corpus.jsonl"),
        (base / "queries.parquet", base / "queries.jsonl"),
        (base / "qrels.parquet", base / "qrels.jsonl"),
    )
    df_corpus, df_queries, df_qrels = (
        load_parquet_or_jsonl(parquet, jsonl) for parquet, jsonl in sources
    )

    # Identifiers are compared as strings everywhere downstream.
    df_corpus["doc_id"] = df_corpus["doc_id"].astype(str)
    df_queries["query_id"] = df_queries["query_id"].astype(str)
    for id_column in ("query_id", "doc_id"):
        df_qrels[id_column] = df_qrels[id_column].astype(str)

    # Fill in qrels columns that the file may not provide.
    for column, fallback in (("split", "test"), ("score", 1)):
        if column not in df_qrels.columns:
            df_qrels[column] = fallback

    return df_corpus, df_queries, df_qrels

def pick_split_available(qrels: pd.DataFrame, prefer="test"):
    """Choose which qrels split to use.

    Tries ``prefer`` first, then ``"dev"``, ``"validation"`` and ``"train"``;
    the first one present in ``qrels["split"]`` is returned. If none of them is
    present, the split of the first qrels row is returned.

    Args:
        qrels: relevance judgements with a ``split`` column.
        prefer: split name tried before the fixed fallbacks.

    Returns:
        The selected split name.
    """
    available = set(qrels["split"].unique().tolist())
    not_found = object()  # sentinel, so any real split value can be returned
    chosen = next(
        (name for name in (prefer, "dev", "validation", "train") if name in available),
        not_found,
    )
    if chosen is not_found:
        return qrels["split"].iloc[0]
    return chosen

## Tokenização

In [69]:
# Simple tokenization: maximal runs of ASCII letters, digits and underscores.
_tok_re = re.compile(r"[A-Za-z0-9_]+")
def tokenize(text: str):
    """Split ``text`` into lowercase tokens.

    ``None`` is treated as the empty string and any other non-string value is
    converted with ``str()`` before matching. Characters outside
    ``[A-Za-z0-9_]`` act as separators and are dropped.

    Returns:
        List of lowercased tokens in order of appearance.
    """
    if text is None:
        source = ""
    elif isinstance(text, str):
        source = text
    else:
        source = str(text)
    return list(map(str.lower, _tok_re.findall(source)))

## Métricas (Top-k)

In [70]:
# M√©tricas
def mrr_at_k(ranked, gold_set, k=10):
    """Reciprocal rank of the first relevant document within the top ``k``.

    Args:
        ranked: document ids ordered from best to worst.
        gold_set: ids considered relevant.
        k: cutoff; only ``ranked[:k]`` is inspected.

    Returns:
        ``1 / rank`` (1-based) of the first hit, or ``0.0`` when there is none.
    """
    reciprocal_ranks = (
        1.0 / position
        for position, doc_id in enumerate(ranked[:k], start=1)
        if doc_id in gold_set
    )
    return next(reciprocal_ranks, 0.0)

def dcg_at_k(ranked, gains, k=10):
    """Discounted cumulative gain over the top ``k`` with exponential gain.

    Each document at 1-based position ``i`` with gain ``g > 0`` contributes
    ``(2**g - 1) / log2(i + 1)``; documents missing from ``gains`` or with a
    non-positive gain contribute nothing.

    Args:
        ranked: document ids ordered from best to worst.
        gains: mapping ``doc_id -> gain``.
        k: cutoff; only ``ranked[:k]`` is scored.

    Returns:
        The accumulated DCG (``0.0`` when nothing in the top ``k`` has gain).
    """
    positioned_gains = (
        (position, gains.get(doc_id, 0.0))
        for position, doc_id in enumerate(ranked[:k], start=1)
    )
    return sum(
        ((2**gain - 1) / np.log2(position + 1) for position, gain in positioned_gains if gain > 0),
        0.0,
    )

def ndcg_at_k(ranked, gains, k=10):
    """Normalised DCG at ``k`` (exponential gain ``2**g - 1``).

    The ideal DCG is computed from the ``k`` largest values in ``gains`` placed
    at positions ``1..k``; the result is ``dcg_at_k(ranked, gains, k)`` divided
    by that ideal value.

    Args:
        ranked: document ids ordered from best to worst.
        gains: mapping ``doc_id -> gain``.
        k: cutoff applied to both the ranking and the ideal ordering.

    Returns:
        nDCG, or ``0.0`` when the ideal DCG is zero.
    """
    best_gains = sorted(gains.values(), reverse=True)[:k]
    ideal_dcg = sum(
        ((2**gain - 1) / np.log2(position + 1) for position, gain in enumerate(best_gains, start=1)),
        0.0,
    )
    return dcg_at_k(ranked, gains, k) / ideal_dcg if ideal_dcg != 0 else 0.0

def average_precision_at_k(ranked, gold_set, k=10):
    """Average precision over the top ``k`` results.

    Precision is taken at every position in ``ranked[:k]`` that holds a
    relevant document; the sum is divided by ``min(len(gold_set), k)``.

    Args:
        ranked: document ids ordered from best to worst.
        gold_set: ids considered relevant.
        k: cutoff.

    Returns:
        AP@k, or ``0.0`` when ``gold_set`` is empty.
    """
    hit_positions = [
        position
        for position, doc_id in enumerate(ranked[:k], start=1)
        if doc_id in gold_set
    ]
    # The n-th hit found at position p has precision n / p.
    precision_sum = sum(
        (hit_count / position for hit_count, position in enumerate(hit_positions, start=1)),
        0.0,
    )
    if not gold_set:
        return 0.0
    return precision_sum / min(len(gold_set), k)

def recall_at_k(ranked, gold_set, k=10):
    """Fraction of the relevant documents that appear in the top ``k``.

    Args:
        ranked: document ids ordered from best to worst.
        gold_set: set of ids considered relevant.
        k: cutoff; only ``ranked[:k]`` is inspected.

    Returns:
        Recall@k, or ``0.0`` when ``gold_set`` is empty.
    """
    if gold_set:
        retrieved = set(ranked[:k])
        return len(gold_set & retrieved) / len(gold_set)
    return 0.0

## Índice TF-IDF

In [71]:
def build_tfidf_index(df_corpus: pd.DataFrame, dataset_name: str):
    """Build a TF-IDF index over a corpus with the project's ``TFIDFRetriever``.

    Args:
        df_corpus: corpus table with a ``doc_id`` column. ``title`` and ``text``
            columns are optional; a missing column or a missing cell
            (None / NaN / pd.NA) becomes an empty string.
        dataset_name: used to name the artifact directory
            ``./outputs/artifacts/{dataset_name}_baseline_tfidf``.

    Returns:
        Tuple ``(retriever, documents)``: the retriever after ``build_index``
        was called, and the list of ``Document`` objects passed to it.
    """

    def _as_text(value) -> str:
        # NaN is truthy, so ``str(value or "")`` alone would turn a missing cell
        # into the literal text "nan" and index it as a real token.
        if isinstance(value, str):
            return value
        if value is None or (pd.api.types.is_scalar(value) and pd.isna(value)):
            return ""
        return str(value or "")

    def _column_values(column: str) -> list:
        # Optional columns default to one empty string per document.
        if column in df_corpus.columns:
            return df_corpus[column].tolist()
        return [""] * len(df_corpus)

    # Iterate column-wise instead of ``iterrows()``: faster, and values are not
    # passed through a per-row Series that may upcast mixed dtypes.
    documents = [
        Document(doc_id=str(doc_id), title=_as_text(title), text=_as_text(text))
        for doc_id, title, text in zip(
            df_corpus["doc_id"].tolist(),
            _column_values("title"),
            _column_values("text"),
        )
    ]

    # Per-dataset artifact directory handed to the retriever.
    # NOTE(review): presumably the retriever caches/reloads the index from here;
    # confirm, since a stale artifact would silently be reused.
    artifact_dir = f"./outputs/artifacts/{dataset_name}_baseline_tfidf"
    retriever = TFIDFRetriever(
        dim=1000,
        min_df=2,
        backend="sklearn",
        use_faiss=True,
        artifact_dir=artifact_dir,
        index_name="tfidf.index"
    )

    retriever.build_index(documents)

    return retriever, documents

def rank_with_tfidf(retriever: TFIDFRetriever, query_text: str, topk=TOPK):
    """Rank documents for a single query text with a ``TFIDFRetriever``.

    The text is wrapped in a ``Query`` with the fixed id ``"tmp"``, sent to
    ``retriever.retrieve`` as a one-element batch, and the entry stored under
    that id is unpacked.

    Args:
        retriever: retriever whose ``retrieve`` method is called.
        query_text: raw query string.
        topk: value passed as ``k`` to ``retrieve``.

    Returns:
        Tuple ``(doc_ids, scores)`` of parallel lists, in the order returned by
        the retriever; both are empty when there is no entry for the query.
    """
    single_query = Query(query_id="tmp", text=query_text)
    hits = retriever.retrieve([single_query], k=topk).get("tmp", [])

    # Split the (doc_id, score) pairs into two parallel lists.
    doc_ids, scores = [], []
    for hit_id, hit_score in hits:
        doc_ids.append(hit_id)
        scores.append(hit_score)

    return doc_ids, scores

## Loop de avaliação

In [72]:
def evaluate_dataset(name: str, topk=TOPK, only_test=True, show_examples=3):
    """Evaluate the TF-IDF baseline on one processed BEIR dataset.

    Steps: load corpus/queries/qrels via ``load_beir_processed``, choose a
    qrels split (``"test"`` preferred), build the index, rank every judged
    query that has text, and average MRR/nDCG/MAP/Recall at cutoff 10.
    Progress, the aggregated metrics and a small table of sample queries are
    printed/displayed along the way.

    Args:
        name: dataset name, forwarded to the loader and the index builder.
        topk: number of documents requested from the retriever per query.
        only_test: when True, a warning is printed if the split actually used
            is not ``"test"``. The preferred split is ``"test"`` either way.
        show_examples: how many of the first judged queries are considered for
            the qualitative sample table (queries without text are skipped).

    Returns:
        Dict mapping metric name to its mean over the evaluated queries
        (``0.0`` for every metric when no query was evaluated).
    """
    print(f"\n====== {name} ======")
    df_corpus, df_queries, df_qrels = load_beir_processed(name)
    print("corpus:", df_corpus.shape, "| queries:", df_queries.shape, "| qrels:", df_qrels.shape)

    # The previous `"test" if only_test else "test"` had identical branches,
    # so the preferred split has always been "test".
    split = pick_split_available(df_qrels, prefer="test")
    if only_test and split != "test":
        print(f"‚ö†Ô∏è Split 'test' n√£o encontrado em {name}; usando '{split}' para rodar mesmo assim.")
    qrels_split = df_qrels[df_qrels["split"] == split].copy()
    print("usando split:", split, "| qrels:", qrels_split.shape)

    # query_id -> {doc_id: score}
    qrels_map = {}
    for row in qrels_split.itertuples(index=False):
        qrels_map.setdefault(row.query_id, {})[row.doc_id] = int(getattr(row, "score", 1))

    # Only the retriever is needed here; the Document list is discarded.
    retriever, _ = build_tfidf_index(df_corpus, name)

    # Text lookup restricted to the queries judged in the chosen split.
    qdf = df_queries[df_queries["query_id"].isin(qrels_split["query_id"].unique())]
    q_lookup = dict(zip(qdf["query_id"], qdf["query"]))

    metrics = {"MRR@10": [], "nDCG@10": [], "MAP@10": [], "Recall@10": []}
    examples = []
    skipped = 0  # judged queries that have no text in the queries table

    for qid, gold_gains in tqdm(qrels_map.items(), desc=f"TF-IDF {name} ({split})"):
        qtxt = q_lookup.get(qid)
        if qtxt is None:
            skipped += 1
            continue
        gold_set = {d for d, s in gold_gains.items() if s > 0}

        ranked, _ = rank_with_tfidf(retriever, qtxt, topk=topk)

        metrics["MRR@10"].append(mrr_at_k(ranked, gold_set, k=10))
        metrics["nDCG@10"].append(ndcg_at_k(ranked, gold_gains, k=10))
        metrics["MAP@10"].append(average_precision_at_k(ranked, gold_set, k=10))
        metrics["Recall@10"].append(recall_at_k(ranked, gold_set, k=10))

    # Make it visible when the averages cover fewer queries than the qrels do.
    if skipped:
        print(f"queries sem texto ignoradas: {skipped}")

    # Aggregate: mean per metric, 0.0 when nothing was evaluated.
    results = {m: float(np.mean(v)) if v else 0.0 for m, v in metrics.items()}
    print("Resultados (m√©dias):", results)

    # Qualitative examples: top-1 document and rank of the first hit in the top 10.
    # The title column is optional in the corpus, so guard the lookup.
    has_title = "title" in df_corpus.columns
    for qid in list(qrels_map.keys())[:show_examples]:
        qtxt = q_lookup.get(qid)
        if not qtxt:
            continue
        ranked, _ = rank_with_tfidf(retriever, qtxt, topk=10)
        gold = {d for d, s in qrels_map[qid].items() if s > 0}
        hit_rank = next((i + 1 for i, did in enumerate(ranked) if did in gold), None)
        top1 = ranked[0] if ranked else None
        title_top1 = None
        if has_title and top1 is not None:
            matches = df_corpus.loc[df_corpus["doc_id"] == top1, "title"].head(1).tolist()
            title_top1 = matches[0] if matches else None
        examples.append({
            "query_id": qid,
            "query": qtxt,
            "gold_size": len(gold),
            "hit@10_rank": hit_rank,
            "top1_doc_id": top1,
            "top1_title": title_top1
        })
    ex_df = pd.DataFrame(examples)
    if not ex_df.empty:
        print("\nAmostras de consultas (top-1 e acerto@10):")
        display(ex_df)
    return results

In [73]:
# Evaluate every configured dataset, keeping one metrics dict per dataset name.
# Results are stored as each dataset finishes, so a failure partway through
# leaves the earlier datasets' results in `all_results`.
all_results = {}
for ds in DATASETS:
    all_results[ds] = evaluate_dataset(ds, only_test=True)

# Summary table: one row per dataset, one column per metric.
print("\n== Resumo (m√©dias por dataset) ==")
display(pd.DataFrame(all_results).T)


corpus: (5183, 4) | queries: (1109, 2) | qrels: (1258, 4)
usando split: test | qrels: (339, 4)
2025-11-01 21:02:33 | INFO     | retriever.tfidf | [tfidf_faiss.py:66] | üöÄ Building TF-IDF Index (5183 documentos)
2025-11-01 21:02:33 | INFO     | retriever.tfidf | [logging.py:199] | ‚è±Ô∏è  Fit TF-IDF no corpus - iniciando...
2025-11-01 21:02:33 | INFO     | tfidf.vectorizer | [logging.py:199] | ‚è±Ô∏è  Fit TF-IDF - iniciando...
2025-11-01 21:02:33 | INFO     | tfidf.vectorizer | [logging.py:220] | ‚úì Fit TF-IDF - conclu√≠do em [32m333.8ms[0m
2025-11-01 21:02:33 | INFO     | tfidf.vectorizer | [tfidf_vectorizer.py:19] | ‚úì TF-IDF fitted: vocab_size=1000
2025-11-01 21:02:33 | INFO     | retriever.tfidf | [logging.py:220] | ‚úì Fit TF-IDF no corpus - conclu√≠do em [32m336.3ms[0m
2025-11-01 21:02:33 | INFO     | retriever.tfidf | [logging.py:199] | ‚è±Ô∏è  Encoding documents (TF-IDF) - iniciando...
2025-11-01 21:02:34 | INFO     | retriever.tfidf | [logging.py:220] | ‚úì Encoding do

TF-IDF scifact (test):   0%|          | 0/300 [00:00<?, ?it/s]


AssertionError: 

In [45]:
# Sanity check: for each dataset print the corpus size, the total number of
# queries, and the number of test-split qrels rows and distinct test queries.
# NOTE(review): this cell's execution count (45) is lower than that of the cells
# above it (67-73), i.e. it was last run before them; it also rebinds
# df_corpus / df_queries / df_qrels at notebook level on every iteration.
for name in DATASETS:
    df_corpus, df_queries, df_qrels = load_beir_processed(name)
    qrels_test = df_qrels[df_qrels["split"] == "test"]
    print(
        f"{name:9s} | corpus={len(df_corpus):6d} | queries_total={len(df_queries):5d} "
        f"| qrels_test_linhas={len(qrels_test):6d} | qrels_test_queries_unicas={qrels_test['query_id'].nunique():5d}"
    )

scifact   | corpus=  5183 | queries_total= 1109 | qrels_test_linhas=   339 | qrels_test_queries_unicas=  300
fiqa      | corpus= 57638 | queries_total= 6648 | qrels_test_linhas=  1706 | qrels_test_queries_unicas=  648
nfcorpus  | corpus=  3633 | queries_total= 3237 | qrels_test_linhas= 12334 | qrels_test_queries_unicas=  323
