In [None]:
# %%
from pathlib import Path
import pandas as pd
import hashlib
from collections import Counter

# Keep rendered frames compact: few rows, but wide enough text columns to read.
pd.set_option("display.max_rows", 5)
pd.set_option("display.max_colwidth", 120)

# Folder holding the processed BEIR-style export (adjust here if needed).
DATA_ROOT = Path("./data/squad/processed/beir")
corpus_path = DATA_ROOT / "corpus.parquet"
queries_path = DATA_ROOT / "queries.parquet"
qrels_path = DATA_ROOT / "qrels.parquet"

# Fail fast on the first missing input, reporting its absolute path.
for p in (corpus_path, queries_path, qrels_path):
    if p.exists():
        continue
    raise FileNotFoundError(f"Arquivo não encontrado: {p.resolve()}")

# Load the three tables in a fixed order: corpus, queries, qrels.
corpus, queries, qrels = (
    pd.read_parquet(path) for path in (corpus_path, queries_path, qrels_path)
)

# Report (rows, columns) for each table.
print(
    "Tamanhos:",
    f" - corpus : {corpus.shape}",
    f" - queries: {queries.shape}",
    f" - qrels  : {qrels.shape}",
    sep="\n",
)

Tamanhos:
 - corpus : (20958, 4)
 - queries: (98169, 2)
 - qrels  : (98169, 4)


In [2]:
# %%
# Column names each table is required to expose (checked as sets, order-free).
expected_corpus_cols = {
    "doc_id",
    "title",
    "text",
    "metadata",
}
expected_queries_cols = {
    "query_id",
    "query",
}
expected_qrels_cols = {
    "query_id",
    "doc_id",
    "score",
    "split",
}

def assert_has_columns(df, expected, name):
    """Check that ``df`` exposes every column listed in ``expected``.

    Args:
        df: Frame whose ``columns`` are inspected.
        expected: Set of required column names.
        name: Label used as the prefix of the error / warning text.

    Raises:
        AssertionError: If at least one expected column is absent.

    Columns present beyond ``expected`` are not an error; they only produce
    a printed warning.
    """
    present = set(df.columns)

    missing = expected - present
    if missing:
        raise AssertionError(f"{name}: faltando colunas {missing}")

    extra = present - expected
    if extra:
        print(f"[AVISO] {name}: colunas extras {extra} (ok se forem ignoradas na pipeline)")

# Validate the schema of each table in turn; the first missing column aborts the cell.
for _frame, _required, _label in (
    (corpus, expected_corpus_cols, "corpus"),
    (queries, expected_queries_cols, "queries"),
    (qrels, expected_qrels_cols, "qrels"),
):
    assert_has_columns(_frame, _required, _label)

print("Colunas OK ✅")


Colunas OK ✅


In [3]:
# %%
def check_unique_non_null(df, col, name):
    """Verify that column ``col`` of ``df`` has no nulls and no repeated values.

    Args:
        df: Frame to inspect.
        col: Name of the column expected to act as a key.
        name: Label used as the prefix of the messages.

    Raises:
        AssertionError: If the column contains nulls (checked first) or if
            its number of distinct values is not equal to the number of rows.

    Prints a confirmation line when both checks pass.
    """
    column = df[col]
    nulls = column.isna().sum()
    uniq = column.nunique()

    if nulls:
        raise AssertionError(f"{name}: {nulls} valores nulos em {col}")

    # Rows beyond the number of distinct values are repeats of an earlier key.
    dup = len(df) - uniq
    if dup != 0:
        raise AssertionError(f"{name}: {dup} duplicatas em {col}")

    print(f"{name}: {col} é único e sem nulos ✅")

# Each table's id column must be usable as a key: unique and never null.
for _frame, _key, _label in (
    (corpus, "doc_id", "corpus"),
    (queries, "query_id", "queries"),
):
    check_unique_non_null(_frame, _key, _label)


corpus: doc_id é único e sem nulos ✅
queries: query_id é único e sem nulos ✅


In [4]:
# %%
# Referential integrity: mark the qrels rows whose doc_id / query_id also
# appears in the corpus / queries tables.
doc_in_corpus = qrels["doc_id"].isin(corpus["doc_id"])
qid_in_queries = qrels["query_id"].isin(queries["query_id"])

print(f"qrels → corpus OK? {doc_in_corpus.all()}")
print(f"qrels → queries OK? {qid_in_queries.all()}")

# On failure, show up to five offending ids followed by the number of distinct ones.
if (~doc_in_corpus).any():
    missing_docs = qrels["doc_id"][~doc_in_corpus].unique()
    print("doc_ids faltando no corpus:", missing_docs[:5], "...", len(missing_docs))

if (~qid_in_queries).any():
    missing_qids = qrels["query_id"][~qid_in_queries].unique()
    print("query_ids faltando nas queries:", missing_qids[:5], "...", len(missing_qids))

# Number of qrels rows per split value.
print("Splits em qrels:", Counter(qrels["split"]))


qrels → corpus OK? True
qrels → queries OK? True
Splits em qrels: Counter({'train': 87599, 'test': 10570})


In [5]:
# %%
# SQuAD v1.1 has no unanswerable questions, so each query should have exactly
# one relevant document.
gr = qrels.groupby("query_id").size()
ok_one = gr.eq(1).mean()
print(f"Proporção de queries com exatamente 1 qrel: {ok_one:.4f}")

# Report the two possible violations: too many judgments, or none at all.
multi = gr.loc[gr.gt(1)]
zero = set(queries["query_id"]).difference(qrels["query_id"])
print(f"Queries com mais de 1 qrel: {len(multi)}")
print(f"Queries sem qrel: {len(zero)}")
if not multi.empty:
    display(multi.head())


Proporção de queries com exatamente 1 qrel: 1.0000
Queries com mais de 1 qrel: 0
Queries sem qrel: 0


In [6]:
# %%
def make_doc_id(title: str, text: str) -> str:
    """Build a document id of the form ``<title>::p_<hash>``.

    Args:
        title: Article title. A falsy value (``None`` or ``""``) is treated as
            empty. Surrounding whitespace is stripped and every tab / newline
            is replaced by a single space. If nothing remains, the literal
            ``"article"`` is used.
        text: Paragraph text. A falsy value is hashed as the empty string.

    Returns:
        The cleaned title, the separator ``::p_`` and the first 12 hex digits
        of the MD5 digest of the UTF-8 encoded text.
    """
    label = (title or "").strip()
    for whitespace in ("\t", "\n"):
        label = label.replace(whitespace, " ")
    label = label or "article"

    digest = hashlib.md5((text or "").encode("utf-8")).hexdigest()
    return f"{label}::p_{digest[:12]}"

# Recompute ids for a reproducible random subset (at most 10,000 rows) and
# compare them with the ids stored in the corpus.
sample = corpus.sample(n=min(len(corpus), 10000), random_state=42)
recomputed = sample.apply(lambda row: make_doc_id(row["title"], row["text"]), axis=1)

match_rate = recomputed.eq(sample["doc_id"]).mean()
print(f"Taxa de match doc_id (amostra): {match_rate:.4f}")

# Rows whose stored id differs from the recomputed one; show a few if any.
mismatch = sample.loc[recomputed.ne(sample["doc_id"])]
if not mismatch.empty:
    print("Exemplos de mismatch (mostrando 3):")
    display(mismatch[["doc_id", "title", "text"]].head(3))


Taxa de match doc_id (amostra): 1.0000


In [7]:
# %%
# Headline counts: corpus size, total queries, and distinct queries per split.
n_docs, n_queries_total = len(corpus), len(queries)
n_queries_train = qrels.loc[qrels["split"].eq("train"), "query_id"].nunique()
n_queries_test = qrels.loc[qrels["split"].eq("test"), "query_id"].nunique()

print(
    f"Parágrafos (docs): {n_docs:,}",
    f"Queries total     : {n_queries_total:,}",
    f"Queries train     : {n_queries_train:,}",
    f"Queries test      : {n_queries_test:,}",
    sep="\n",
)

# Since each query has a single qrel, the number of distinct queries per split
# is expected to equal the number of qrels rows in that split.
gr_split = qrels.groupby("split")["query_id"].nunique()
print("\n#Queries por split segundo qrels:")
display(gr_split.to_frame("#queries").T)


Parágrafos (docs): 20,958
Queries total     : 98,169
Queries train     : 87,599
Queries test      : 10,570

#Queries por split segundo qrels:


split,test,train
#queries,10570,87599


In [8]:
# %%
try:
    # Optional cross-check against the Hugging Face copy of SQuAD. Any failure
    # here (e.g. the `datasets` package is not installed) skips the audit
    # instead of stopping the notebook.
    from datasets import load_dataset

    ds_dev = load_dataset("rajpurkar/squad", split="validation")

    # Bounded, seeded sample so the check runs quickly and reproducibly.
    hf_sample = ds_dev.shuffle(seed=42).select(range(min(len(ds_dev), 1000)))

    def make_id_from_hf(ex):
        """Derive the ``<title>::p_<hash>`` id for one Hugging Face example.

        Reads the optional ``title`` key and the required ``context`` key of
        ``ex``; the hash is the first 12 hex digits of the MD5 of the context.
        """
        raw_title = ex.get("title") or ""
        context = ex["context"]
        clean_title = raw_title.strip().replace("\t", " ").replace("\n", " ")
        if not clean_title:
            clean_title = "article"
        digest = hashlib.md5(context.encode("utf-8")).hexdigest()
        return f"{clean_title}::p_{digest[:12]}"

    hf_ids = list(map(make_id_from_hf, hf_sample))

    # Fraction of sampled ids that are present among the corpus doc_ids.
    exist_mask = pd.Series(hf_ids).isin(set(corpus["doc_id"]))
    print(f"HF→corpus cobertura (amostra): {exist_mask.mean():.4f}  ({exist_mask.sum()}/{len(hf_ids)})")
except Exception as e:
    print("Pulei a auditoria com HF (instale 'datasets' se quiser rodar). Motivo:", repr(e))


  from .autonotebook import tqdm as notebook_tqdm


HF→corpus cobertura (amostra): 1.0000  (1000/1000)


In [9]:
# %%
# A paragraph is "empty" when its text is missing (None/NaN) or whitespace-only.
# The null test has to be explicit: casting a missing value to str does not
# produce "", so the blank-string comparison alone would let null texts pass
# and this cell would wrongly report success.
empty_text = corpus["text"].isna() | (corpus["text"].astype(str).str.strip() == "")
if empty_text.any():
    print(f"Parágrafos com texto vazio: {empty_text.sum()} (mostrando 3)")
    display(corpus.loc[empty_text].head(3))
else:
    print("Nenhum parágrafo com texto vazio ✅")


Nenhum parágrafo com texto vazio ✅
