# 0) Setup

In [3]:
# Standard library imports
import os
import re
import sys
from typing import List, Optional

# Put the directory two levels above the current working directory at the
# front of sys.path so modules located there can be imported by this notebook.
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '..', '..')))
# NOTE(review): a comment here used to say "add vectorstore & prompts folders
# to path", but no code does that — confirm whether the step is still needed.

# Third-party imports
import pandas as pd
from dotenv import load_dotenv
from tqdm import tqdm

# LangChain imports
from langchain_core.prompts import PromptTemplate
from langchain_ollama import ChatOllama
from langchain_openai import ChatOpenAI

# Local imports
from utils_notebook import (
    format_choices,
    make_question
)

# Load variables from a .env file into the process environment.
# NOTE(review): presumably this supplies the API key used by ChatOpenAI — confirm.
load_dotenv()

True

In [4]:
# Option letters accepted as a valid prediction during evaluation.
LABELS = list("ABCD")
# Zero-based position of each option letter within LABELS.
number = {letter: position for position, letter in enumerate(LABELS)}

In [5]:
# ARC dataset. (If you do not have it, download it and save it as CSV files
# in the dataset folder two levels above this notebook.)
# train_df = pd.read_csv("dataset/arc_challenge_train_processed.csv")
valid_df = pd.read_csv("../../dataset/arc_challenge_valid_processed.csv")
test_df = pd.read_csv("../../dataset/arc_challenge_test_processed.csv")

In [6]:
# OpenAI chat model; the evaluation loop passes it to extract_answer as the
# fallback "judge" that reads the option letter out of a free-text answer.
gpt5_nano = ChatOpenAI(model='gpt-5-nano-2025-08-07', temperature=0)

In [7]:
# Model under evaluation, served through Ollama under the tag "phi".
phi2 = ChatOllama(model="phi", temperature=0) # phi2

### C) Prompt and Chain

In [8]:
# Prompt for the baseline without external memory: the model is asked to
# reason from its own knowledge and to finish with an "Answer" line holding a
# single option letter. {question} is the only template variable.
no_memory_template = """Instruction: You are an expert science tutor. Your goal is to answer the target multiple-choice question below.

Apply relevant scientific principles - fundamental definitions and laws from your knowledge.

Use scientific principles to ground your facts and guide your logic.

Structure your response strictly as:
1. **Reasoning:** Explain the step-by-step logic to reach the correct answer.
2. **Principles:** List the scientific concepts applied.
3. **Answer:** State only the correct option letter (A, B, C, or D).

### TARGET QUESTION:
{question}
"""


# Build the prompt object from the template string, then pipe its output into
# the phi2 chat model to form the chain used by the evaluation loop.
no_memory_prompt = PromptTemplate.from_template(no_memory_template)
no_memory_chain = no_memory_prompt | phi2

# 1) Utility Functions

In [12]:
def extract_answer(
    small_llm_model,
    model_text_output: str,
    valid_labels: Optional[List[str]] = None,
    debug: bool = False,
    question: Optional[str] = None,
) -> str:
    """
    Extract the chosen option letter from a model's free-text answer.

    High-confidence regex patterns are tried in priority order; for the first
    pattern that matches, the LAST occurrence in the text is taken. When no
    pattern yields a letter contained in ``valid_labels`` (or when the output
    looks like source code), the decision is delegated to ``_llm_judge``.

    Args:
        small_llm_model: Object exposing ``invoke(prompt)``; only used by the
            LLM-judge fallback.
        model_text_output: Raw text produced by the evaluated model.
        valid_labels: Letters accepted as a result. Defaults to A-E.
        debug: When True, print which extraction path produced the result.
        question: Optional question text forwarded to the judge as context.

    Returns:
        The extracted letter (upper case), "N/A" when the output is empty, or
        whatever ``_llm_judge`` returns on fallback.
    """
    if valid_labels is None:
        valid_labels = ['A', 'B', 'C', 'D', 'E']

    if not model_text_output:
        return "N/A"

    text = model_text_output.strip()

    # Code-like outputs tend to contain stray single letters, so regexes are
    # skipped entirely for them.
    if "```python" in text or "def solution" in text:
        if debug: print("⚠ [CODE DETECTED] Enviando para LLM Judge.")
        return _llm_judge(small_llm_model, text, valid_labels, debug, question)

    strong_patterns = [
        r"\\boxed\s*\{\s*([A-H])\s*\}",
        r"(?:Final|Correct)\s+Answer\s*[:\-]?\s*(?:is)?\s*(?:Option)?\s*[\(\[]([A-H])[\)\]]",
        r"The\s+(?:correct\s+)?(?:answer|option|choice)\s+is\s*(?:Option)?\s*[:\-]?\s*[\(\[]([A-H])[\)\]]",
        r"(?:Final|Correct)\s+Answer\s*[:\-]\s*(?:is\s+)?(?:Option\s+)?([A-H])(?=\s|\.|,|!|\?|$)",
        r"(?:Therefore|Thus|Hence|So),\s*(?:the\s+answer\s+is\s*)?(?:Option)?\s*[\(\[]([A-H])[\)\]]",
        r"(?:^|\n)\s*Answer\s*:\s*([A-H])(?=\s|$|\.|\,)",
        # Markdown-bold label such as "**Answer:** B" or "**Answer**: (B)".
        # This is the format the evaluation prompt asks for, so it must be
        # recognised here instead of falling through to the LLM judge. It is
        # placed before the bare "**X**" pattern because an explicit label is
        # stronger evidence than any bolded letter.
        r"\*\*\s*Answer\s*:?\s*\*\*\s*:?\s*\*{0,2}[\(\[]?([A-H])[\)\]]?(?=\s|$|\.|,|\*)",
        r"\*\*([A-H])\*\*",
        r"^([A-H])$",
    ]

    for pattern in strong_patterns:
        matches = list(re.finditer(pattern, text, re.IGNORECASE))
        if matches:
            # The last occurrence is treated as the model's final conclusion.
            candidate = matches[-1].group(1).upper()
            if candidate in valid_labels:
                if debug: print(f"✓ [REGEX] Padrão: '{pattern}' -> {candidate}")
                return candidate

    if debug: print("✗ [REGEX] Falha. Chamando LLM Judge.")
    return _llm_judge(small_llm_model, text, valid_labels, debug, question)


def _llm_judge(
    small_llm_model,
    text: str,
    valid_labels: List[str],
    debug: bool = False,
    question: Optional[str] = None,
) -> str:
    context_block = f"\n### CONTEXT:\n{question}\n" if question else ""

    prompt = f"""You are an Answer Extraction Bot.
Your ONLY job is to identify which option the following "Model Output" concluded is correct.
{context_block}
### Model Output to Analyze:
{text}

### Instructions:
1. Look for an explicit answer (e.g., "Answer: A").
2. If NO explicit letter is found, infer from the conclusion and the Context options.
3. Do NOT solve the problem yourself. Trust the "Model Output".
4. If the model refuses to answer or is unclear, return "E".

Output format: Just the single letter (A, B, C, D, or E). No other text."""

    try:
        response = small_llm_model.invoke(prompt)
        output = response.content if hasattr(response, 'content') else str(response)
        clean_cand = re.sub(r"[^A-E]", "", output.strip().upper())
        if len(clean_cand) > 1:
            clean_cand = clean_cand[-1]
        if clean_cand in valid_labels:
            if debug: print(f"✓ [LLM JUIZ] Inferido: {clean_cand}")
            return clean_cand
        return "N/A"
    except Exception as e:
        if debug: print(f"✗ [ERRO JUIZ] {e}")
        return "N/A"


# 2) Evaluation

In [13]:
def avaliar_dataset_no_memory(
    df,
    chain=no_memory_chain,
    desc="Avaliando sem Memória Externa",
    full_prompt=True,
):
    """
    Evaluate the model on every row of ``df`` without external memory.

    Args:
        df: DataFrame of questions; each row is passed to ``make_question``
            and must provide an ``answerKey`` column.
        chain: Runnable invoked with ``{"question": ...}`` when
            ``full_prompt`` is True.
        desc: Label shown on the progress bar and stored in every result row.
        full_prompt: True sends the question through ``chain`` (instruction
            template); False sends only the question text directly to
            ``phi2`` and ignores ``chain``.

    Returns:
        DataFrame with one row per input row. Successful rows hold the
        question, raw output, prediction, target and correctness; rows that
        raised hold the error message and ``is_correct=False``.
    """
    resultados = []
    acertos = 0
    total = len(df)
    erros = 0  # predictions that could not be mapped to one of LABELS
    loop = tqdm(df.iterrows(), total=total, desc=desc)

    # Rows are counted explicitly: tqdm only refreshes its ``n`` attribute on
    # display updates, so it is not a reliable denominator for the running
    # accuracy.
    for processed, (idx, row) in enumerate(loop, start=1):
        try:
            full_question = make_question(row, inline=False)[0]

            if full_prompt:
                response_obj = chain.invoke({"question": full_question})
            else:
                response_obj = phi2.invoke(full_question)

            response_text = response_obj.content if hasattr(response_obj, "content") else str(response_obj)

            pred = extract_answer(
                small_llm_model=gpt5_nano,
                model_text_output=response_text,
                question=full_question,
            )
            # Anything outside LABELS (including "N/A") is recorded as 'E',
            # which never equals a target in LABELS.
            if pred not in LABELS:
                pred = 'E'
                erros += 1

            target = row['answerKey']
            is_correct = (pred == target)
            acertos += int(is_correct)

            acc_atual = (acertos / processed) * 100
            loop.set_postfix(acc=f"{acc_atual:.2f}%", last=pred, erros=erros)

            resultados.append({
                'index': idx,
                'question': full_question,
                'raw_output': response_text,
                'pred': pred,
                'target': target,
                'is_correct': is_correct,
                'source': desc
            })

        except Exception as e:
            # Keep the run going: a failed row is logged and stored as wrong.
            tqdm.write(f"Erro no índice {idx}: {e}")
            resultados.append({
                'index': idx,
                'error': str(e),
                'is_correct': False,
                'source': desc
            })

    df_res = pd.DataFrame(resultados)
    # Guard against an empty input frame, which would otherwise divide by zero.
    acc_final = (acertos / total) * 100 if total else 0.0
    print(f"\n✅ {desc} — {acertos}/{total} acertos ({acc_final:.2f}%)")
    return df_res


# 3) Running

## 3.1) With instructions (full prompt)

In [None]:
# Evaluate the validation split with the instruction template (full prompt).
df_valid_with_prompt = avaliar_dataset_no_memory(
    df=valid_df,
    chain=no_memory_chain,
    desc="Validação (with prompt)",
    full_prompt=True,
)


Dataset Validação: 100%|██████████| 299/299 [14:04<00:00,  2.82s/it, acc=69.57%, erros=21, last=A]


✅ Dataset Validação finalizado com 208/299 acertos (69.57%)





In [None]:
# Evaluate the test split with the instruction template (full prompt).
df_test_with_prompt = avaliar_dataset_no_memory(
    df=test_df,
    chain=no_memory_chain,
    desc="Teste (with prompt)",
    full_prompt=True,
)


Dataset Teste: 100%|██████████| 1172/1172 [1:55:08<00:00,  5.89s/it, acc=65.96%, erros=86, last=B] 


✅ Dataset Teste finalizado com 773/1172 acertos (65.96%)





In [None]:
# Save the per-question results of both runs as CSV files (paths are relative
# to the current working directory); the DataFrame index is not written.
df_test_with_prompt.to_csv("no_memory_test.csv", index=False)
df_valid_with_prompt.to_csv("no_memory_valid.csv", index=False)