# Análise dos datasets

## Configs

In [1]:
from pathlib import Path
import pandas as pd
import json
import random
from textwrap import shorten

ROOT = Path("./data").resolve()  
random.seed(42)

pd.set_option("display.max_colwidth", 200)

## Utils

In [2]:
def load_parquet_or_jsonl(path_parquet: Path, path_jsonl: Path) -> pd.DataFrame:
    """Carrega Parquet se existir, senão JSONL; levanta erro se ambos faltarem."""
    if path_parquet.exists():
        return pd.read_parquet(path_parquet)
    if path_jsonl.exists():
        return pd.read_json(path_jsonl, lines=True)
    raise FileNotFoundError(f"Nem Parquet nem JSONL encontrados: {path_parquet} | {path_jsonl}")

def preview_df(df: pd.DataFrame, n=5):
    """Mostra N linhas com truncagem leve de textos longos."""
    if df.empty:
        print("DataFrame vazio.")
        return
    df2 = df.copy()
    for col in df2.columns:
        if df2[col].dtype == "object":
            # Trunca strings muito longas para leitura rápida
            df2[col] = df2[col].apply(lambda x: shorten(str(x), width=160, placeholder="…"))
    display(df2.head(n))

def parse_json_if_str(x):
    if isinstance(x, str):
        try:
            return json.loads(x)
        except Exception:
            return x
    return x

def pick_split_available(qrels: pd.DataFrame, order=("test","dev","validation","train")) -> str:
    """Escolhe o primeiro split disponível conforme a ordem desejada."""
    present = {s for s in qrels["split"].unique().tolist()}
    for s in order:
        if s in present:
            return s
    # fallback: qualquer um
    return qrels["split"].iloc[0]

def snippet(text: str, width: int = 500) -> str:
    return shorten(str(text).replace("\n", " "), width=width, placeholder="…")

## Análise dataset scifact

In [3]:
scifact_proc = ROOT / "scifact" / "processed" / "original"

paths_scifact = {
    "corpus": (scifact_proc / "corpus.parquet", scifact_proc / "corpus.jsonl"),
    "claims_train": (scifact_proc / "claims_train.parquet", scifact_proc / "claims_train.jsonl"),
    "claims_dev":   (scifact_proc / "claims_dev.parquet",   scifact_proc / "claims_dev.jsonl"),
    "claims_test":  (scifact_proc / "claims_test.parquet",  scifact_proc / "claims_test.jsonl"),
}

df_corpus_scifact = load_parquet_or_jsonl(*paths_scifact["corpus"])
df_tr_scifact     = load_parquet_or_jsonl(*paths_scifact["claims_train"])
df_dev_scifact    = load_parquet_or_jsonl(*paths_scifact["claims_dev"])
df_te_scifact     = load_parquet_or_jsonl(*paths_scifact["claims_test"])

In [4]:
print("SciFact original — corpus:", df_corpus_scifact.shape)
print("SciFact original — train/dev/test:", df_tr_scifact.shape, df_dev_scifact.shape, df_te_scifact.shape)

SciFact original — corpus: (5183, 4)
SciFact original — train/dev/test: (809, 6) (300, 6) (300, 6)


In [5]:
print("Colunas corpus:", df_corpus_scifact.columns.tolist())
print("Colunas claims:", df_tr_scifact.columns.tolist())

Colunas corpus: ['doc_id', 'title', 'text', 'metadata']
Colunas claims: ['claim_id', 'split', 'claim', 'label', 'evidences', 'metadata']


In [6]:
print("Amostra corpus:")
preview_df(df_corpus_scifact, n=3)

print("Amostra claims (train):")
preview_df(df_tr_scifact, n=3)

Amostra corpus:


Unnamed: 0,doc_id,title,text,metadata
0,4983,Microstructural development of human newborn cerebral white matter assessed in vivo by diffusion tensor magnetic resonance imaging.,"[""Alterations of the architecture of cerebral white matter in the developing human brain can affect cortical development and result in functional…","{""structured"": false}"
1,5836,Induction of myelodysplasia by myeloid-derived suppressor cells.,"[""Myelodysplastic syndromes (MDS) are age-dependent stem cell malignancies that share biological features of activated adaptive immune response and ineffective…","{""structured"": false}"
2,7912,"BC1 RNA, the transcript from a master gene for ID element amplification, is able to prime its own reverse transcription.","[""ID elements are short interspersed elements (SINEs) found in high copy number in many rodent genomes."", ""BC1 RNA, an ID-related transcript, is derived from…","{""structured"": false}"


Amostra claims (train):


Unnamed: 0,claim_id,split,claim,label,evidences,metadata
0,0,train,0-dimensional biomaterials lack inductive properties.,,,"{""cited_doc_ids"": [31715818], ""evidence"": {}}"
1,2,train,1 in 5 million in UK have abnormal PrP positivity.,,,"{""cited_doc_ids"": [13734012], ""evidence"": {""13734012"": [{""label"": ""CONTRADICT"", ""sentences"": [4]}]}}"
2,4,train,1-1% of colorectal cancer patients are diagnosed with regional or distant metastases.,,,"{""cited_doc_ids"": [22942787], ""evidence"": {}}"


In [7]:
# Pode haver evidences armazenado como string JSON no Parquet; converter pra dict/list
df_tr_scifact["_evidences_parsed"] = df_tr_scifact["evidences"].apply(parse_json_if_str)

# Seleciona um claim com evidências não vazias
def extract_doc_ids_from_evidences(evs):
    if evs is None:
        return set()
    dids = set()
    try:
        # formatos comuns: lista de dicts com chaves "doc_id" ou objetos aninhados
        if isinstance(evs, list):
            for e in evs:
                if isinstance(e, dict):
                    if "doc_id" in e:
                        dids.add(e["doc_id"])
                    # alguns formatos trazem "evidence": [{"doc_id": X, "sentences":[...]}]
                    if "evidence" in e and isinstance(e["evidence"], list):
                        for sub in e["evidence"]:
                            if isinstance(sub, dict) and "doc_id" in sub:
                                dids.add(sub["doc_id"])
        elif isinstance(evs, dict) and "doc_id" in evs:
            dids.add(evs["doc_id"])
    except Exception:
        pass
    return {str(x) for x in dids}

row = None
for _, r in df_tr_scifact.iterrows():
    dids = extract_doc_ids_from_evidences(r["_evidences_parsed"])
    if dids:
        row = r
        row_dids = dids
        break

if row is None:
    print("⚠️ Não encontrei claim com evidências parseáveis. Mostrando um claim qualquer.")
    row = df_tr_scifact.sample(1, random_state=42).iloc[0]
    row_dids = set()

print("\n== Claim selecionado ==")
print("claim_id:", row["claim_id"])
print("label   :", row.get("label"))
print("claim   :", row["claim"])
print("doc_ids :", list(row_dids)[:5])

if row_dids:
    # junção com corpus (doc_id pode ser numérico/string; forçamos string)
    df_corpus_scifact["_doc_id_str"] = df_corpus_scifact["doc_id"].astype(str)
    subset = df_corpus_scifact[df_corpus_scifact["_doc_id_str"].isin(row_dids)].copy()
    print(f"\nDocumentos encontrados no corpus ({len(subset)}):")
    for i, rr in subset.head(3).iterrows():
        print(f"\n[doc_id={rr['doc_id']}] {rr.get('title')}")
        print(snippet(rr.get("text", ""), 600))
else:
    print("\nSem evidências parseadas para este claim.")

⚠️ Não encontrei claim com evidências parseáveis. Mostrando um claim qualquer.

== Claim selecionado ==
claim_id: 1215
label   : None
claim   : The effect of Lipopolysaccharides on kidney barrier function is dependent on inflammation levels.
doc_ids : []

Sem evidências parseadas para este claim.


In [8]:
def load_beir_processed(ds_name: str):
    base = ROOT / ds_name / "processed" / "beir"
    paths = {
        "corpus":  (base / "corpus.parquet",  base / "corpus.jsonl"),
        "queries": (base / "queries.parquet", base / "queries.jsonl"),
        "qrels":   (base / "qrels.parquet",   base / "qrels.jsonl"),
    }
    df_corpus  = load_parquet_or_jsonl(*paths["corpus"])
    df_queries = load_parquet_or_jsonl(*paths["queries"])
    df_qrels   = load_parquet_or_jsonl(*paths["qrels"])
    return df_corpus, df_queries, df_qrels

def beir_quick_report(name: str):
    print(f"\n====== BEIR/{name} ======")
    df_corpus, df_queries, df_qrels = load_beir_processed(name)
    print("corpus:", df_corpus.shape, "| queries:", df_queries.shape, "| qrels:", df_qrels.shape)
    print("colunas corpus:", df_corpus.columns.tolist())
    print("colunas queries:", df_queries.columns.tolist())
    print("colunas qrels:", df_qrels.columns.tolist())

    print("\nAmostra corpus:")
    preview_df(df_corpus, n=3)
    print("\nAmostra queries:")
    preview_df(df_queries, n=3)
    print("\nAmostra qrels:")
    preview_df(df_qrels, n=5)

    # Escolhe split disponível e faz um "join" para ver um par query-doc relevante
    split = "split" if "split" in df_qrels.columns else None
    if split:
        chosen = pick_split_available(df_qrels)
        df_q = df_qrels[df_qrels["split"] == chosen]
    else:
        df_q = df_qrels
        chosen = "<sem split>"

    if df_q.empty:
        print("⚠️ qrels vazio.")
        return

    pair = df_q.sample(1, random_state=42).iloc[0]
    qid, did, score = str(pair["query_id"]), str(pair["doc_id"]), pair.get("score")
    qtxt = df_queries.loc[df_queries["query_id"].astype(str) == qid, "query"]
    dtxt = df_corpus.loc[df_corpus["doc_id"].astype(str) == did, ["title","text"]]

    print(f"\nExemplo relevante (split={chosen}):")
    print("query_id:", qid)
    print("doc_id  :", did, "score:", score)
    print("query   :", snippet(qtxt.iloc[0] if not qtxt.empty else "<não encontrado>"))
    if not dtxt.empty:
        title = dtxt.iloc[0].get("title")
        text  = dtxt.iloc[0].get("text")
        print("title  :", snippet(title, 200))
        print("text   :", snippet(text, 600))
    else:
        print("⚠️ doc_id não encontrado no corpus (checar integridade).")

# %%
for name in ["scifact", "fiqa", "nfcorpus"]:
    beir_quick_report(name)



corpus: (5183, 4) | queries: (1109, 2) | qrels: (1258, 4)
colunas corpus: ['doc_id', 'title', 'text', 'metadata']
colunas queries: ['query_id', 'query']
colunas qrels: ['query_id', 'doc_id', 'score', 'split']

Amostra corpus:


Unnamed: 0,doc_id,title,text,metadata
0,4983,Microstructural development of human newborn cerebral white matter assessed in vivo by diffusion tensor magnetic resonance imaging.,Alterations of the architecture of cerebral white matter in the developing human brain can affect cortical development and result in functional disabilities. A…,{}
1,5836,Induction of myelodysplasia by myeloid-derived suppressor cells.,Myelodysplastic syndromes (MDS) are age-dependent stem cell malignancies that share biological features of activated adaptive immune response and ineffective…,{}
2,7912,"BC1 RNA, the transcript from a master gene for ID element amplification, is able to prime its own reverse transcription.","ID elements are short interspersed elements (SINEs) found in high copy number in many rodent genomes. BC1 RNA, an ID-related transcript, is derived from the…",{}



Amostra queries:


Unnamed: 0,query_id,query
0,0,0-dimensional biomaterials lack inductive properties.
1,2,1 in 5 million in UK have abnormal PrP positivity.
2,4,1-1% of colorectal cancer patients are diagnosed with regional or distant metastases.



Amostra qrels:


Unnamed: 0,query_id,doc_id,score,split
0,0,31715818,1,train
1,2,13734012,1,train
2,4,22942787,1,train
3,6,2613775,1,train
4,9,44265107,1,train



Exemplo relevante (split=test):
query_id: 1199
doc_id  : 16760369 score: 1
query   : The benefits of colchicine were achieved with effective widespread use of secondary prevention strategies such as high-dose statins.
title  : Comparative determinants of 4-year cardiovascular event rates in stable outpatients at risk of or with atherothrombosis.
text   : CONTEXT Clinicians and trialists have difficulty with identifying which patients are highest risk for cardiovascular events. Prior ischemic events, polyvascular disease, and diabetes mellitus have all been identified as predictors of ischemic events, but their comparative contributions to future risk remain unclear. OBJECTIVE To categorize the risk of cardiovascular events in stable outpatients with various initial manifestations of atherothrombosis using simple clinical descriptors. DESIGN, SETTING, AND PATIENTS Outpatients with coronary artery disease, cerebrovascular disease, or peripheral…

corpus: (57638, 4) | queries: (6648, 2) 

Unnamed: 0,doc_id,title,text,metadata
0,3,,"I'm not saying I don't like the idea of on-the-job training too, but you can't expect the company to do that. Training workers is not their job - they're…",{}
1,31,,"So nothing preventing false ratings besides additional scrutiny from the market/investors, but there are some newer controls in place to prevent institutions…",{}
2,56,,"You can never use a health FSA for individual health insurance premiums. Moreover, FSA plan sponsors can limit what they are will to reimburse. While you can't…",{}



Amostra queries:


Unnamed: 0,query_id,query
0,0,What is considered a business expense on a business trip?
1,4,Business Expense - Car Insurance Deductible For Accident That Occurred During a Business Trip
2,5,Starting a new online business



Amostra qrels:


Unnamed: 0,query_id,doc_id,score,split
0,0,18850,1,train
1,4,196463,1,train
2,5,69306,1,train
3,6,560251,1,train
4,6,188530,1,train



Exemplo relevante (split=test):
query_id: 3490
doc_id  : 420529 score: 1
query   : Tax Witholding for Stock Sale
title  : 
text   : I assume US as mhoran_psprep edited, although I'm not sure IRS necessarily means US. (It definitely used to also include Britain's Inland Revenue, but they changed.) (US) Stockbrokers do not normally withhold on either dividends/interest/distributions or realized capital gains, especially since gains might be reduced or eliminated by later losses. (They can be required to apply backup withholding to dividends and interest; don't ask how I know :-) You are normally required to pay most of your tax during the year, defined as within 10% or $1000 whichever is more, by withholding and/or…

corpus: (3633, 4) | queries: (3237, 2) | qrels: (134294, 4)
colunas corpus: ['doc_id', 'title', 'text', 'metadata']
colunas queries: ['query_id', 'query']
colunas qrels: ['query_id', 'doc_id', 'score', 'split']

Amostra corpus:


Unnamed: 0,doc_id,title,text,metadata
0,MED-10,Statin Use and Breast Cancer Survival: A Nationwide Cohort Study from Finland,"Recent studies have suggested that statins, an established drug group in the prevention of cardiovascular mortality, could delay or prevent breast cancer…","{""url"": ""http://www.ncbi.nlm.nih.gov/pubmed/25329299""}"
1,MED-14,Statin use after diagnosis of breast cancer and survival: a population-based cohort study.,"BACKGROUND: Preclinical studies have shown that statins, particularly simvastatin, can prevent growth in breast cancer cell lines and animal models. We…","{""url"": ""http://www.ncbi.nlm.nih.gov/pubmed/25304447""}"
2,MED-118,Alkylphenols in human milk and their relations to dietary habits in central Taiwan.,The aims of this study were to determine the concentrations of 4-nonylphenol (NP) and 4-octylphenol (OP) in 59 human milk samples and to examine related…,"{""url"": ""http://www.ncbi.nlm.nih.gov/pubmed/20435081%20""}"



Amostra queries:


Unnamed: 0,query_id,query
0,PLAIN-3,Breast Cancer Cells Feed on Cholesterol
1,PLAIN-4,Using Diet to Treat Asthma and Eczema
2,PLAIN-5,Treating Asthma With Plants vs. Pills



Amostra qrels:


Unnamed: 0,query_id,doc_id,score,split
0,PLAIN-3,MED-2436,1,train
1,PLAIN-3,MED-2437,1,train
2,PLAIN-3,MED-2438,1,train
3,PLAIN-3,MED-2439,1,train
4,PLAIN-3,MED-2440,1,train



Exemplo relevante (split=test):
query_id: PLAIN-3221
doc_id  : MED-744 score: 1
query   : Dietary Theory of Alzheimer's
title  : Therapy with saffron and the goddess at Thera.
text   : This paper presents a new interpretation of a unique Bronze Age (c. 3000-1100 BCE) Aegean wall painting in the building of Xeste 3 at Akrotiri,Thera. Crocus carturightianus and its active principle, saffron, are the primary subjects at Xeste 3. Several lines of evidence suggest that the meaning of these frescoes concerns saffron and healing: (1) the unusual degree of visual attention given to the crocus, including the variety of methods for display of the stigmas; (2) the painted depiction of the line of saffron production from plucking blooms to the collection of stigmas; and (3) the sheer…


In [9]:
def beir_sanity(name: str):
    df_corpus, df_queries, df_qrels = load_beir_processed(name)
    corpus_ids = set(df_corpus["doc_id"].astype(str).unique())
    query_ids  = set(df_queries["query_id"].astype(str).unique())
    q_doc_ids  = set(df_qrels["doc_id"].astype(str).unique())
    q_qry_ids  = set(df_qrels["query_id"].astype(str).unique())

    missing_docs  = q_doc_ids - corpus_ids
    missing_qries = q_qry_ids - query_ids

    print(f"\n== Sanidade BEIR/{name} ==")
    print("Total doc_id em corpus:", len(corpus_ids), "| doc_id referenciados em qrels:", len(q_doc_ids))
    print("Total query_id em queries:", len(query_ids), "| query_id referenciados em qrels:", len(q_qry_ids))
    print("doc_id faltando no corpus:", len(missing_docs))
    print("query_id faltando em queries:", len(missing_qries))
    if missing_docs:
        print("Exemplos doc_id faltantes:", list(sorted(list(missing_docs))[:10]))
    if missing_qries:
        print("Exemplos query_id faltantes:", list(sorted(list(missing_qries))[:10]))

for name in ["scifact", "fiqa", "nfcorpus"]:
    beir_sanity(name)


== Sanidade BEIR/scifact ==
Total doc_id em corpus: 5183 | doc_id referenciados em qrels: 667
Total query_id em queries: 1109 | query_id referenciados em qrels: 1109
doc_id faltando no corpus: 0
query_id faltando em queries: 0

== Sanidade BEIR/fiqa ==
Total doc_id em corpus: 57638 | doc_id referenciados em qrels: 17110
Total query_id em queries: 6648 | query_id referenciados em qrels: 6648
doc_id faltando no corpus: 0
query_id faltando em queries: 0

== Sanidade BEIR/nfcorpus ==
Total doc_id em corpus: 3633 | doc_id referenciados em qrels: 3633
Total query_id em queries: 3237 | query_id referenciados em qrels: 3237
doc_id faltando no corpus: 0
query_id faltando em queries: 0
