In [None]:
# 0) Setup ‚Äî Facts Creation Gold (GPT-4o-mini)

In [16]:
import ast
import re
import os
import pandas as pd
from tqdm import tqdm
from dotenv import load_dotenv
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
from utils_notebook import SemanticCleaner, extract_answer

load_dotenv()

# Dataset
train_df = pd.read_csv("dataset/arc_challenge_train_processed.csv")

# Modelo √∫nico: gpt-4o-mini para tudo
gpt4 = ChatOpenAI(model="gpt-4o-mini-2024-07-18", temperature=0)

cleaner = SemanticCleaner()

LABELS  = ['A', 'B', 'C', 'D']
number  = {'A': 0, 'B': 1, 'C': 2, 'D': 3}

In [17]:
def format_choices(choices_dict):
    if isinstance(choices_dict, str):
        choices_dict = ast.literal_eval(choices_dict)
    return "\n".join(f"({l}) {t}" for l, t in zip(choices_dict['label'], choices_dict['text']))

def get_choice_text(choices_dict, label):
    if isinstance(choices_dict, str):
        choices_dict = ast.literal_eval(choices_dict)
    idx = number.get(label, 0)
    return choices_dict['text'][idx]

def parse_gold_response(text):
    """Extrai Answer, Reflection e lista de Facts do output do Prompt 1.
    
    Returns: (answer: str, facts: list[str], reflection: str)
    """
    answer_match = re.search(r"Answer\s*:\s*([A-D])", text, re.IGNORECASE)
    refl_match   = re.search(r"Reflection\s*:\s*(.*?)(?=\nFacts?|$)", text, re.IGNORECASE | re.DOTALL)
    facts_match  = re.search(r"Facts?\s*:?\s*\n(.*?)$", text, re.IGNORECASE | re.DOTALL)

    answer     = answer_match.group(1).upper() if answer_match else ""
    reflection = refl_match.group(1).strip()   if refl_match   else ""

    facts = []
    if facts_match:
        lines = facts_match.group(1).split('\n')
        for line in lines:
            clean = line.strip().lstrip('-‚Ä¢*1234567890.)').strip()
            if len(clean.split()) >= 4:
                facts.append(clean)
    facts = facts[:2]  # m√°ximo 2

    return answer, facts, reflection

def parse_score(text):
    """Extrai score (-1, 0 ou 1) do output do Prompt 2."""
    matches = re.findall(r"(?<!\d)-?[01](?!\d)", text)
    if matches:
        sc = int(matches[-1])
        return max(-1, min(1, sc))
    return 0

# Prompts

**Prompt 1** ‚Äî O modelo responde, explica seu **racioc√≠nio** (Reflection) e extrai os **fatos cient√≠ficos universais** usados (1 ou 2, preferencialmente 1). Reflex√£o e fatos s√£o campos separados e independentes.  
**Prompt 2** ‚Äî Score por fato: curto e direto.

In [23]:
# Prompt 1 ‚Äî Responde + reflex√£o do racioc√≠nio + fato(s) cient√≠fico(s)
answer_fact_template = """Answer the multiple-choice question below.

Question:
{question}

Respond in this exact format:
Answer: [A/B/C/D]
Reflection: [1-2 sentences: the step-by-step reasoning chain that led to the answer]
Facts:
- [one universal scientific principle, max 30 words, no specific objects or names]
- [second principle ONLY if truly independent and essential ‚Äî otherwise omit this line]"""

answer_fact_prompt = PromptTemplate.from_template(answer_fact_template)
answer_fact_chain  = answer_fact_prompt | gpt4 | StrOutputParser()

In [24]:
# Prompt 2 ‚Äî Score curto e direto
score_template = """Did this Fact help answer the question correctly?

Q: {question}
Correct: {correct} | Chosen: {chosen} | {outcome}
Fact: {fact}

-1 = fact led to wrong answer | 0 = irrelevant/unused | 1 = essential for correct answer
Score:"""

score_prompt = PromptTemplate.from_template(score_template)
score_chain  = score_prompt | gpt4 | StrOutputParser()

In [25]:
# Testando em uma quest√£o

In [26]:
sample        = train_df.iloc[0]
choices       = ast.literal_eval(sample['choices'])
options_text  = format_choices(choices)
full_question = f"{sample['question']}\n{options_text}"
target        = sample['answerKey']

print(f"Pergunta : {sample['question']}")
print(f"Op√ß√µes   :\n{options_text}")
print(f"Resposta : {target}")

Pergunta : George wants to warm his hands quickly by rubbing them. Which skin surface will produce the most heat?
Op√ß√µes   :
(A) dry palms
(B) wet palms
(C) palms covered with oil
(D) palms covered with lotion
Resposta : A


In [27]:
# ‚Äî Prompt 1: responde + reflex√£o + fato(s)
raw = answer_fact_chain.invoke({"question": full_question})
print("=== Prompt 1 ===")
print(raw)

pred, facts, reflection = parse_gold_response(raw)
facts      = [cleaner.clean(f) for f in facts if f and len(f.split()) >= 4]
reflection = cleaner.clean(reflection)

# fallback se nenhum fato foi extra√≠do
if not facts:
    facts = [reflection]

is_correct = (pred == target)
status     = "CORRECT" if is_correct else "WRONG"

print(f"\n‚Üí pred={pred} | target={target} | {'‚úÖ' if is_correct else '‚ùå'}")
print(f"‚Üí Reflection : {reflection}")
for i, f in enumerate(facts, 1):
    print(f"‚Üí Fact {i}     : {f}")

=== Prompt 1 ===
Answer: A  
Reflection: Rubbing dry palms together generates friction, which produces heat. Wet palms, oil, or lotion can reduce friction and thus decrease heat generation.  
Facts: Friction between surfaces generates heat through kinetic energy conversion.

‚Üí pred=A | target=A | ‚úÖ
‚Üí Reflection : Rubbing dry palms together generates friction, which produces heat. Wet palms, oil, or lotion can reduce friction and thus decrease heat generation.
‚Üí Fact 1     : Rubbing dry palms together generates friction, which produces heat. Wet palms, oil, or lotion can reduce friction and thus decrease heat generation.


In [28]:
# ‚Äî Prompt 2: score por fato
print("=== Prompt 2 ===")
scores = []
for i, fact in enumerate(facts, 1):
    score_raw = score_chain.invoke({
        "question": full_question,
        "correct":  get_choice_text(choices, target),
        "chosen":   get_choice_text(choices, pred),
        "outcome":  status,
        "fact":     fact,
    })
    sc = parse_score(score_raw)
    scores.append(sc)
    print(f"\nFato {i}: {fact}")
    print(f"Resposta modelo: {score_raw.strip()}")
    print(f"‚Üí Score: {sc:+d}")

=== Prompt 2 ===

Fato 1: Rubbing dry palms together generates friction, which produces heat. Wet palms, oil, or lotion can reduce friction and thus decrease heat generation.
Resposta modelo: 1 = essential for correct answer
‚Üí Score: +1


# Cria√ß√£o do Dataset Gold

Processo em **dois prompts por quest√£o**:
1. GPT-4o-mini responde a pergunta, descreve a **reflex√£o** (racioc√≠nio que levou √† resposta) e lista **1 ou 2 fatos cient√≠ficos universais** (independentes entre si)
2. GPT-4o-mini atribui um **score** (-1 / 0 / 1) para cada fato individualmente

> **Reflex√£o** = processo de racioc√≠nio do agente (pode mencionar contexto espec√≠fico da quest√£o)  
> **Fatos** = princ√≠pios cient√≠ficos universais independentes (preferencialmente 1, no m√°ximo 2)

In [29]:
def create_facts_gold_dataset(
    df,
    output_csv="scientific_facts_gold.csv",
    backup_frequency=50,
    start_from=0,
):
    """
    Para cada quest√£o, 2 prompts:
      1. answer_fact_chain  ‚Üí responde + reflex√£o de racioc√≠nio + 1 ou 2 fatos cient√≠ficos
      2. score_chain        ‚Üí score por fato (-1/0/1)

    Colunas do CSV:
      id, question, choices, correct_answer, model_prediction, is_correct,
      reflection, scientific_fact, quality_score, fact_number, origin
    """
    results = []
    correct_count = 0
    total = len(df)

    loop = tqdm(df.iloc[start_from:].iterrows(),
                total=total - start_from,
                desc="Gold facts")

    for idx, row in loop:
        try:
            q_text        = row['question']
            choices       = ast.literal_eval(row['choices'])
            options_text  = format_choices(choices)
            full_question = f"{q_text}\n{options_text}"
            target        = row['answerKey']

            # ‚Äî Prompt 1: responde + reflex√£o + fato(s)
            raw            = answer_fact_chain.invoke({"question": full_question})
            pred, facts, reflection = parse_gold_response(raw)

            # fallback: se o parser n√£o encontrou a letra, tenta extrair_answer
            if pred not in LABELS:
                pred = extract_answer(small_llm_model=gpt4,
                                      model_text_output=raw,
                                      question=full_question)

            reflection = cleaner.clean(reflection)
            facts      = [cleaner.clean(f) for f in facts if f and len(f.split()) >= 4]

            # fallback se nenhum fato foi extra√≠do
            if not facts:
                facts = [reflection]

            is_correct = (pred == target)
            status     = "CORRECT" if is_correct else "WRONG"
            if is_correct:
                correct_count += 1

            # ‚Äî Prompt 2: score por fato
            for fi, fact in enumerate(facts, 1):
                try:
                    score_raw = score_chain.invoke({
                        "question": full_question,
                        "correct":  get_choice_text(choices, target),
                        "chosen":   get_choice_text(choices, pred),
                        "outcome":  status,
                        "fact":     fact,
                    })
                    score = parse_score(score_raw)
                except Exception:
                    score = 0

                results.append({
                    'id':               row['id'],
                    'question':         q_text,
                    'choices':          str(choices),
                    'correct_answer':   target,
                    'model_prediction': pred,
                    'is_correct':       is_correct,
                    'reflection':       reflection,
                    'scientific_fact':  fact,
                    'quality_score':    score,
                    'fact_number':      fi,
                    'origin':           'train',
                })

            # progresso
            n_done = idx - start_from + 1
            acc    = correct_count / n_done * 100
            loop.set_postfix(acc=f"{acc:.1f}%", facts=len(results), status="‚úÖ" if is_correct else "‚ùå")

            # backup peri√≥dico
            if n_done % backup_frequency == 0:
                pd.DataFrame(results).to_csv(output_csv, index=False)
                print(f"\nüíæ Backup ‚Üí {output_csv} ({len(results)} registros)")

        except Exception as e:
            print(f"\n‚ùå Erro Q{idx}: {e}")
            continue

    df_final = pd.DataFrame(results)
    df_final.to_csv(output_csv, index=False)

    n_done = total - start_from
    print(f"\n{'='*60}")
    print(f"‚úÖ CONCLU√çDO")
    print(f"   Quest√µes  : {n_done}")
    print(f"   Acur√°cia  : {correct_count / n_done * 100:.2f}%")
    print(f"   Registros : {len(results)}")
    print(f"   Arquivo   : {output_csv}")
    print(f"{'='*60}")

    return df_final

In [30]:
df_gold = create_facts_gold_dataset(
    df=train_df,
    output_csv="scientific_facts_gold.csv",
    backup_frequency=25,
    start_from=0,
)

Gold facts:   2%|‚ñè         | 25/1119 [00:57<46:41,  2.56s/it, acc=92.0%, facts=25, status=‚ùå]


üíæ Backup ‚Üí scientific_facts_gold.csv (25 registros)


Gold facts:   4%|‚ñç         | 50/1119 [01:54<41:32,  2.33s/it, acc=94.0%, facts=50, status=‚úÖ]


üíæ Backup ‚Üí scientific_facts_gold.csv (50 registros)


Gold facts:   7%|‚ñã         | 75/1119 [02:48<38:25,  2.21s/it, acc=96.0%, facts=75, status=‚úÖ]


üíæ Backup ‚Üí scientific_facts_gold.csv (75 registros)


Gold facts:   9%|‚ñâ         | 100/1119 [03:52<41:37,  2.45s/it, acc=96.0%, facts=100, status=‚úÖ]


üíæ Backup ‚Üí scientific_facts_gold.csv (100 registros)


Gold facts:  11%|‚ñà         | 125/1119 [05:01<44:36,  2.69s/it, acc=96.0%, facts=125, status=‚úÖ]  


üíæ Backup ‚Üí scientific_facts_gold.csv (125 registros)


Gold facts:  13%|‚ñà‚ñé        | 150/1119 [06:04<47:32,  2.94s/it, acc=94.7%, facts=150, status=‚úÖ]


üíæ Backup ‚Üí scientific_facts_gold.csv (150 registros)


Gold facts:  16%|‚ñà‚ñå        | 175/1119 [07:00<39:06,  2.49s/it, acc=95.4%, facts=175, status=‚úÖ]


üíæ Backup ‚Üí scientific_facts_gold.csv (175 registros)


Gold facts:  18%|‚ñà‚ñä        | 200/1119 [07:59<35:10,  2.30s/it, acc=94.5%, facts=200, status=‚úÖ]


üíæ Backup ‚Üí scientific_facts_gold.csv (200 registros)


Gold facts:  20%|‚ñà‚ñà        | 225/1119 [08:56<30:34,  2.05s/it, acc=95.1%, facts=225, status=‚úÖ]


üíæ Backup ‚Üí scientific_facts_gold.csv (225 registros)


Gold facts:  22%|‚ñà‚ñà‚ñè       | 250/1119 [09:54<33:01,  2.28s/it, acc=95.6%, facts=250, status=‚úÖ]


üíæ Backup ‚Üí scientific_facts_gold.csv (250 registros)


Gold facts:  25%|‚ñà‚ñà‚ñç       | 275/1119 [10:54<31:37,  2.25s/it, acc=95.3%, facts=275, status=‚úÖ]


üíæ Backup ‚Üí scientific_facts_gold.csv (275 registros)


Gold facts:  27%|‚ñà‚ñà‚ñã       | 300/1119 [12:00<29:15,  2.14s/it, acc=94.7%, facts=301, status=‚úÖ]


üíæ Backup ‚Üí scientific_facts_gold.csv (301 registros)


Gold facts:  29%|‚ñà‚ñà‚ñâ       | 325/1119 [13:56<1:31:31,  6.92s/it, acc=95.1%, facts=328, status=‚úÖ]


üíæ Backup ‚Üí scientific_facts_gold.csv (328 registros)


Gold facts:  31%|‚ñà‚ñà‚ñà‚ñè      | 350/1119 [14:55<29:10,  2.28s/it, acc=95.1%, facts=353, status=‚úÖ]  


üíæ Backup ‚Üí scientific_facts_gold.csv (353 registros)


Gold facts:  34%|‚ñà‚ñà‚ñà‚ñé      | 375/1119 [16:06<30:21,  2.45s/it, acc=95.2%, facts=379, status=‚úÖ]


üíæ Backup ‚Üí scientific_facts_gold.csv (379 registros)


Gold facts:  36%|‚ñà‚ñà‚ñà‚ñå      | 400/1119 [17:18<31:39,  2.64s/it, acc=95.5%, facts=404, status=‚úÖ]


üíæ Backup ‚Üí scientific_facts_gold.csv (404 registros)


Gold facts:  38%|‚ñà‚ñà‚ñà‚ñä      | 425/1119 [18:37<41:39,  3.60s/it, acc=95.1%, facts=429, status=‚úÖ]


üíæ Backup ‚Üí scientific_facts_gold.csv (429 registros)


Gold facts:  40%|‚ñà‚ñà‚ñà‚ñà      | 450/1119 [19:38<25:19,  2.27s/it, acc=95.1%, facts=454, status=‚úÖ]


üíæ Backup ‚Üí scientific_facts_gold.csv (454 registros)


Gold facts:  42%|‚ñà‚ñà‚ñà‚ñà‚ñè     | 475/1119 [20:45<27:01,  2.52s/it, acc=95.2%, facts=479, status=‚úÖ]


üíæ Backup ‚Üí scientific_facts_gold.csv (479 registros)


Gold facts:  45%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 500/1119 [22:50<32:02,  3.11s/it, acc=95.0%, facts=504, status=‚ùå]  


üíæ Backup ‚Üí scientific_facts_gold.csv (504 registros)


Gold facts:  47%|‚ñà‚ñà‚ñà‚ñà‚ñã     | 525/1119 [24:04<26:08,  2.64s/it, acc=95.2%, facts=530, status=‚úÖ]


üíæ Backup ‚Üí scientific_facts_gold.csv (530 registros)


Gold facts:  49%|‚ñà‚ñà‚ñà‚ñà‚ñâ     | 550/1119 [25:20<27:10,  2.87s/it, acc=95.1%, facts=555, status=‚úÖ]


üíæ Backup ‚Üí scientific_facts_gold.csv (555 registros)


Gold facts:  51%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè    | 575/1119 [26:31<25:30,  2.81s/it, acc=95.0%, facts=581, status=‚úÖ]


üíæ Backup ‚Üí scientific_facts_gold.csv (581 registros)


Gold facts:  54%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé    | 600/1119 [27:39<26:04,  3.02s/it, acc=94.8%, facts=606, status=‚úÖ]


üíæ Backup ‚Üí scientific_facts_gold.csv (606 registros)


Gold facts:  56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå    | 625/1119 [28:45<19:32,  2.37s/it, acc=95.0%, facts=631, status=‚úÖ]


üíæ Backup ‚Üí scientific_facts_gold.csv (631 registros)


Gold facts:  58%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä    | 650/1119 [29:54<23:22,  2.99s/it, acc=94.9%, facts=657, status=‚úÖ]


üíæ Backup ‚Üí scientific_facts_gold.csv (657 registros)


Gold facts:  60%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 675/1119 [31:01<19:11,  2.59s/it, acc=95.0%, facts=682, status=‚úÖ]


üíæ Backup ‚Üí scientific_facts_gold.csv (682 registros)


Gold facts:  63%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 700/1119 [32:15<18:51,  2.70s/it, acc=94.9%, facts=707, status=‚úÖ]


üíæ Backup ‚Üí scientific_facts_gold.csv (707 registros)


Gold facts:  65%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç   | 725/1119 [33:16<15:27,  2.35s/it, acc=95.0%, facts=732, status=‚úÖ]


üíæ Backup ‚Üí scientific_facts_gold.csv (732 registros)


Gold facts:  67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 750/1119 [34:30<18:08,  2.95s/it, acc=94.8%, facts=757, status=‚ùå]


üíæ Backup ‚Üí scientific_facts_gold.csv (757 registros)


Gold facts:  69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 775/1119 [35:40<13:28,  2.35s/it, acc=94.7%, facts=782, status=‚úÖ]


üíæ Backup ‚Üí scientific_facts_gold.csv (782 registros)


Gold facts:  71%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè  | 800/1119 [36:41<12:46,  2.40s/it, acc=94.8%, facts=807, status=‚úÖ]


üíæ Backup ‚Üí scientific_facts_gold.csv (807 registros)


Gold facts:  74%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé  | 825/1119 [37:41<13:23,  2.73s/it, acc=94.8%, facts=833, status=‚úÖ]


üíæ Backup ‚Üí scientific_facts_gold.csv (833 registros)


Gold facts:  76%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå  | 850/1119 [38:41<11:26,  2.55s/it, acc=94.8%, facts=859, status=‚úÖ]


üíæ Backup ‚Üí scientific_facts_gold.csv (859 registros)


Gold facts:  78%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä  | 875/1119 [39:44<10:04,  2.48s/it, acc=94.9%, facts=884, status=‚úÖ]


üíæ Backup ‚Üí scientific_facts_gold.csv (884 registros)


Gold facts:  80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 900/1119 [40:45<09:23,  2.57s/it, acc=94.9%, facts=910, status=‚úÖ]


üíæ Backup ‚Üí scientific_facts_gold.csv (910 registros)


Gold facts:  83%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé | 925/1119 [41:49<08:15,  2.55s/it, acc=94.8%, facts=935, status=‚úÖ]


üíæ Backup ‚Üí scientific_facts_gold.csv (935 registros)


Gold facts:  85%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç | 950/1119 [42:46<06:03,  2.15s/it, acc=94.8%, facts=960, status=‚úÖ]


üíæ Backup ‚Üí scientific_facts_gold.csv (960 registros)


Gold facts:  87%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã | 975/1119 [43:50<06:30,  2.71s/it, acc=94.6%, facts=985, status=‚úÖ]


üíæ Backup ‚Üí scientific_facts_gold.csv (985 registros)


Gold facts:  89%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ | 1000/1119 [44:45<04:17,  2.16s/it, acc=94.6%, facts=1010, status=‚úÖ]


üíæ Backup ‚Üí scientific_facts_gold.csv (1010 registros)


Gold facts:  92%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè| 1025/1119 [45:54<03:36,  2.30s/it, acc=94.6%, facts=1035, status=‚úÖ]


üíæ Backup ‚Üí scientific_facts_gold.csv (1035 registros)


Gold facts:  94%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç| 1050/1119 [47:01<03:17,  2.87s/it, acc=94.7%, facts=1060, status=‚úÖ]


üíæ Backup ‚Üí scientific_facts_gold.csv (1060 registros)


Gold facts:  96%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå| 1075/1119 [48:10<01:58,  2.70s/it, acc=94.6%, facts=1086, status=‚úÖ]


üíæ Backup ‚Üí scientific_facts_gold.csv (1086 registros)


Gold facts:  98%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä| 1100/1119 [49:13<00:54,  2.88s/it, acc=94.6%, facts=1111, status=‚úÖ]


üíæ Backup ‚Üí scientific_facts_gold.csv (1111 registros)


Gold facts: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1119/1119 [50:06<00:00,  2.69s/it, acc=94.6%, facts=1130, status=‚úÖ]


‚úÖ CONCLU√çDO
   Quest√µes  : 1119
   Acur√°cia  : 94.64%
   Registros : 1130
   Arquivo   : scientific_facts_gold.csv





In [31]:
df_gold

Unnamed: 0,id,question,choices,correct_answer,model_prediction,is_correct,reflection,scientific_fact,quality_score,fact_number,origin
0,Mercury_SC_415702,George wants to warm his hands quickly by rubb...,"{'text': ['dry palms', 'wet palms', 'palms cov...",A,A,True,"Rubbing dry palms together generates friction,...","Rubbing dry palms together generates friction,...",1,1,train
1,MCAS_2009_5_6516,Which of the following statements best explain...,"{'text': ['The refrigerator door is smooth.', ...",B,B,True,Magnets stick to the refrigerator door because...,Magnets stick to the refrigerator door because...,1,1,train
2,Mercury_7233695,A fold observed in layers of sedimentary rock ...,"{'text': ['cooling of flowing magma.', 'conver...",B,B,True,Folds in sedimentary rock layers are typically...,Folds in sedimentary rock layers are typically...,1,1,train
3,Mercury_7041615,Which of these do scientists offer as the most...,"{'text': ['worldwide disease', 'global mountai...",D,D,True,The most widely accepted explanation for the m...,Mass extinctions can be caused by sudden envir...,1,1,train
4,Mercury_7041860,A boat is acted on by a river current flowing ...,"{'text': ['west', 'east', 'north', 'south'], '...",B,B,True,"The boat is moving northeast, which means the ...","The boat is moving northeast, which means the ...",1,1,train
...,...,...,...,...,...,...,...,...,...,...,...
1125,Mercury_SC_415480,Which change would most likely increase the nu...,"{'text': ['flood', 'drought', 'fire', 'landsli...",A,A,True,Flooding can create new habitats and increase ...,Flooding can create new habitats and increase ...,1,1,train
1126,Mercury_7172795,The skin is the largest organ in the human bod...,"{'text': ['It is made of cells.', 'It acts as ...",C,C,True,Skin is considered an organ because it is made...,Skin is considered an organ because it is made...,1,1,train
1127,NCEOGA_2013_8_59,Which food provides the most energy for the bo...,"{'text': ['potato', 'meat', 'milk', 'fruit'], ...",D,D,True,"Fruits, particularly those high in sugars like...","Fruits, particularly those high in sugars like...",1,1,train
1128,Mercury_7219643,Screech owls have two color variations-red and...,"{'text': ['nesting', 'feeding', 'reproduction'...",D,D,True,The grey screech owl has an advantage in camou...,Camouflage helps animals avoid predation and i...,1,1,train


In [None]:
print("üìã Primeiros registros:")
print(df_gold[['question', 'model_prediction', 'correct_answer', 'is_correct',
               'scientific_fact', 'reflection', 'quality_score', 'fact_number']].head())

print("\nüìä Distribui√ß√£o de scores:")
print(df_gold['quality_score'].value_counts().sort_index())
print(f"\nScore m√©dio : {df_gold['quality_score'].mean():.3f}")

print(f"\nAcur√°cia    : {df_gold['is_correct'].mean()*100:.2f}%")

print(f"\nüìä Fatos por quest√£o:")
print(df_gold.groupby('id')['fact_number'].max().value_counts().sort_index())

print(f"\nTotal registros: {df_gold.shape}")