# Reflection RAG System — Simple Retrieval

Sistema de memória reflexiva para comparação com sistemas cognitivos:
- **Memórias Reflexivas**: Raciocínios (reasoning traces) de questões resolvidas
- **Retrieval Clássico**: Apenas similaridade semântica (sem score cognitivo, sem decay, sem geração/refinamento)


# 0) Setup

### A) Imports

In [None]:
# Standard library imports
import ast
import os
import re
import time
import sys

# Make the directory two levels above the current working directory importable.
# Every entry is inserted at position 0, so the path added last is searched first.
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '..', '..')))
# Make the sibling `vectorstore` and `prompts` folders importable as well
# (presumably where `simple_vector_memory` and `templates` live — confirm).
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '..', 'vectorstore')))
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '..', 'prompts')))

# Third-party imports
import pandas as pd
from dotenv import load_dotenv
from tqdm import tqdm

# LangChain imports
from langchain_chroma import Chroma
from langchain_core.documents import Document
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_ollama import ChatOllama
from langchain_openai import ChatOpenAI

# Memory managers & metadata builders
from simple_vector_memory import SimpleVectorMemory, build_metadata_reflection

# Prompt templates
from templates import REFLECTION_TEMPLATE

# Local imports
from utils_notebook import (
    SemanticCleaner,
    calcular_metricas_memoria,
    format_choices,
    make_question
)
# Shared SemanticCleaner instance (not referenced again in the visible cells).
cleaner = SemanticCleaner()

# Load variables from a .env file into the process environment
# (presumably the API credentials needed by ChatOpenAI — confirm).
load_dotenv()


### B) Language Models and Datasets

In [None]:
# Answer letters, in option order.
LABELS = list("ABCD")
# Zero-based position of each answer letter.
number = {label: position for position, label in enumerate(LABELS)}

In [3]:
# ARC dataset. (If you do not have it, download it and store it in the dataset folder as CSV.)
# train_df = pd.read_csv("dataset/arc_challenge_train_processed.csv")
# Validation and test splits, read relative to the notebook's working directory.
valid_df = pd.read_csv("../../dataset/arc_challenge_valid_processed.csv")
test_df = pd.read_csv("../../dataset/arc_challenge_test_processed.csv")

In [None]:
# LLM judge: passed to `extract_answer` as the fallback model that reads the
# answer letter out of a response when the regex patterns find nothing.
# NOTE(review): confirm this model accepts temperature=0.
gpt5_nano = ChatOpenAI(model='gpt-5-nano-2025-08-07', temperature=0)

In [5]:
# Model under evaluation; it is the LLM end of `rag_chain`.
phi2 = ChatOllama(model="phi", temperature=0) # phi2

### C) Prompts and Chains

In [None]:
# Prompt built from REFLECTION_TEMPLATE; the chain is invoked below with the
# keys "question" and "reflection_context".
reflection_prompt = PromptTemplate.from_template(REFLECTION_TEMPLATE)
# No output parser is attached, so callers read `.content` from the result
# when that attribute exists.
rag_chain = reflection_prompt | phi2


# 1) Utils Functions

### A) Context Formatter

In [None]:
def format_reflection_context(reflection_items, show_scores=False):
    """Render retrieved reflections as a block of solved examples for the prompt.

    Args:
        reflection_items: Sequence of dicts. Each must have a ``'metadata'``
            dict (its ``'question'`` entry is optional) and may have
            ``'content'`` and ``'similarity'`` entries.
        show_scores: When True, append the similarity (two decimals) to each
            example header.

    Returns:
        The formatted text, or a fixed fallback sentence when
        ``reflection_items`` is empty or None.
    """
    if not reflection_items:
        return "No specific reference information found."

    sections = ["### SOLVED EXAMPLES (PRACTICE)"]
    for position, item in enumerate(reflection_items, start=1):
        # Only the first line of the stored question is shown as context.
        first_line = item['metadata'].get('question', '').split('\n')[0].strip()
        analysis = item.get('content', '').strip()
        score_suffix = f" (Sim: {item.get('similarity', 0):.2f})" if show_scores else ""
        sections.append(
            f"\n    * **Example Case #{position}**{score_suffix}\n"
            f"        > Context: {first_line}\n"
            f"        > Analysis: {analysis}\n"
        )

    return "\n".join(sections)

### B) Answer Extractor

In [None]:
import re
from typing import List, Optional

def extract_answer(
    small_llm_model,
    model_text_output: str,
    valid_labels: Optional[List[str]] = None,
    debug: bool = False,
    question: Optional[str] = None,
) -> str:
    """
    Extract the chosen option letter from a model's free-text output.

    High-confidence regex patterns are tried in priority order. Within a
    pattern, the last match whose letter is in ``valid_labels`` wins, so a
    trailing out-of-range letter no longer hides an earlier valid one. When
    no pattern yields a valid letter, or the output looks like code, the
    decision is delegated to the LLM judge.

    Args:
        small_llm_model: Model handed to ``_llm_judge`` for the fallback.
        model_text_output: Raw text produced by the evaluated model.
        valid_labels: Accepted letters; defaults to A-E.
        debug: When True, print which path produced the answer.
        question: Optional question text forwarded to the judge as context.

    Returns:
        An upper-case letter from ``valid_labels``, whatever the judge
        returns, or "N/A" when ``model_text_output`` is empty.
    """
    if valid_labels is None:
        valid_labels = ['A', 'B', 'C', 'D', 'E']

    if not model_text_output:
        return "N/A"

    text = model_text_output.strip()

    # Code-like outputs are not pattern-matched; the judge reads them directly.
    if "```python" in text or "def solution" in text:
        if debug: print("âš  [CODE DETECTED] Enviando para LLM Judge.")
        return _llm_judge(small_llm_model, text, valid_labels, debug, question)

    # Ordered from most to least explicit; matching is case-insensitive.
    strong_patterns = [
        r"\\boxed\s*\{\s*([A-H])\s*\}",
        r"(?:Final|Correct)\s+Answer\s*[:\-]?\s*(?:is)?\s*(?:Option)?\s*[\(\[]([A-H])[\)\]]",
        r"The\s+(?:correct\s+)?(?:answer|option|choice)\s+is\s*(?:Option)?\s*[:\-]?\s*[\(\[]([A-H])[\)\]]",
        r"(?:Final|Correct)\s+Answer\s*[:\-]\s*(?:is\s+)?(?:Option\s+)?([A-H])(?=\s|\.|,|!|\?|$)",
        r"(?:Therefore|Thus|Hence|So),\s*(?:the\s+answer\s+is\s*)?(?:Option)?\s*[\(\[]([A-H])[\)\]]",
        r"(?:^|\n)\s*Answer\s*:\s*([A-H])(?=\s|$|\.|\,)",
        r"\*\*([A-H])\*\*",
        r"^([A-H])$"
    ]

    for pattern in strong_patterns:
        found = [m.group(1).upper() for m in re.finditer(pattern, text, re.IGNORECASE)]
        # Walk backwards so the final statement wins, but skip letters that
        # are not valid options instead of abandoning the whole pattern.
        for candidate in reversed(found):
            if candidate in valid_labels:
                if debug:
                    print(f"âœ“ [REGEX] PadrÃ£o encontrado: '{pattern}' -> {candidate}")
                return candidate

    if debug: print("âœ— [REGEX] Falha nos padrÃµes fortes. Chamando LLM Judge.")
    return _llm_judge(small_llm_model, text, valid_labels, debug, question)


def _parse_judge_letter(output: str) -> str:
    """Return the option letter stated in a judge reply, or "" when none is found."""
    reply = output.strip()

    # A reply that is just one letter, optionally wrapped in punctuation or
    # whitespace ("B", "b", "(B)", "**C**."), is accepted in either case.
    sole = re.fullmatch(r"[\W_]*([A-Ea-e])[\W_]*", reply)
    if sole:
        return sole.group(1).upper()

    # In a longer reply only stand-alone upper-case letters count, so the
    # letters inside ordinary words ("answer", "the") are never mistaken for
    # an option. The last one is taken as the conclusion.
    standalone = re.findall(r"(?<![A-Za-z])[A-E](?![A-Za-z])", reply)
    return standalone[-1] if standalone else ""


def _llm_judge(
    small_llm_model,
    text: str,
    valid_labels: List[str],
    debug: bool = False,
    question: Optional[str] = None,
) -> str:
    """
    Ask a small LLM which option letter a model output concluded.

    Args:
        small_llm_model: Object exposing ``invoke(prompt)``; the result may be
            a message with a ``content`` attribute or anything ``str()`` can
            render.
        text: The model output to analyse.
        valid_labels: Letters accepted as a result.
        debug: When True, print the inferred letter or the error.
        question: Optional question text inserted as context in the prompt.

    Returns:
        A letter from ``valid_labels``, or "N/A" when the reply contains no
        usable letter or the call raises.
    """
    context_block = ""
    if question:
        context_block = f"""
### CONTEXT (Use this to infer the answer letter):
{question}
"""

    prompt = f"""You are an Answer Extraction Bot.
Your ONLY job is to identify which option the following "Model Output" concluded is correct.

{context_block}

### Model Output to Analyze:
{text}

### Instructions:
1. Look for an explicit answer (e.g., "Answer: A").
2. If NO explicit letter is found, read the conclusion of the "Model Output" and match it against the Options in the Context. **Infer the letter.**
   - Example: If Context has "(A) 5 (B) 10" and Model Output says "The result is 10", you must output B.
3. Do NOT calculate or solve the problem yourself. Trust the "Model Output".
4. If the model refuses to answer or is unclear, return "E".

Output format: Just the single letter (A, B, C, D, or E). No other text."""

    # Best-effort by design: any failure of the judge yields "N/A" rather
    # than aborting the caller's evaluation loop.
    try:
        response = small_llm_model.invoke(prompt)
        output = response.content if hasattr(response, 'content') else str(response)
        clean_cand = _parse_judge_letter(output)

        if clean_cand in valid_labels:
            if debug: print(f"âœ“ [LLM JUIZ] Inferido: {clean_cand}")
            return clean_cand

        return "N/A"

    except Exception as e:
        if debug: print(f"âœ— [ERRO JUIZ] {e}")
        return "N/A"


# 2) Evaluation

In [None]:
def avaliar_dataset_rag_reflection(
    df,
    chain=rag_chain,
    manager_reflection=None,
    k_reflection=3,
    threshold_reflection=0.24,
    backup_frequency=160,
    backup_path=None,
    desc="RAG Reflection"
):
    """
    Evaluate a multiple-choice dataset with the simple reflection memory.

    For each row the question is used to retrieve reflections from
    ``manager_reflection``, the formatted reflections plus the full question
    are sent through ``chain``, and the letter extracted from the output is
    compared with ``row['answerKey']``. Retrieval relies only on what
    ``retrieve_memories`` returns for the given ``k`` and ``threshold``.

    Args:
        df: DataFrame with ``question`` and ``answerKey`` columns (plus
            whatever ``make_question`` reads from a row).
        chain: Runnable invoked with the keys ``question`` and
            ``reflection_context``.
        manager_reflection: Store exposing
            ``retrieve_memories(query=..., k=..., threshold=...)``. Required.
        k_reflection: Passed to ``retrieve_memories`` as ``k``.
        threshold_reflection: Passed to ``retrieve_memories`` as ``threshold``.
        backup_frequency: Write a partial-results CSV every this many
            processed rows; a falsy or non-positive value disables backups.
        backup_path: Path whose ``.csv`` is replaced by ``_backup.csv`` to
            name the backup file; None disables backups.
        desc: Progress-bar label, also stored in the ``source`` column.

    Returns:
        DataFrame with one row per input row. Rows that raised carry an
        ``error`` message and ``is_correct`` False.

    Raises:
        ValueError: If ``manager_reflection`` is None.
    """
    # Fail fast: without a store every row would be logged as an error.
    if manager_reflection is None:
        raise ValueError("manager_reflection is required to retrieve memories.")

    resultados = []
    acertos = 0
    erros = 0
    backup_counter = 0

    # Prepare the backup directory and file name once.
    backup_file = None
    if backup_path:
        backup_dir = os.path.dirname(backup_path)
        if backup_dir:
            os.makedirs(backup_dir, exist_ok=True)
        backup_file = backup_path.replace('.csv', '_backup.csv')

    loop = tqdm(df.iterrows(), total=len(df), desc=desc)

    # `position` counts processed rows; `idx` is the DataFrame label, which
    # need not be a contiguous integer.
    for position, (idx, row) in enumerate(loop, start=1):
        count = 0
        try:
            full_question = make_question(row, inline=False)[0]
            # Drop "(A) "-style markers so they do not influence retrieval.
            question_for_retriever = re.sub(r'\([A-Z]\)\s*', '', row['question'])

            # Retrieve reflective memories
            items_reflection = manager_reflection.retrieve_memories(
                query=question_for_retriever,
                k=k_reflection,
                threshold=threshold_reflection
            )

            # Format the reflection context
            reflection_context_formatted = format_reflection_context(items_reflection)

            count, avg_sim, _ = calcular_metricas_memoria(items_reflection)

            response_obj = chain.invoke({
                "question": full_question,
                "reflection_context": reflection_context_formatted,
            })

            response_text = response_obj.content if hasattr(response_obj, "content") else str(response_obj)

            pred = extract_answer(
                small_llm_model=gpt5_nano,
                model_text_output=response_text,
                question=full_question,
            )

            # Anything other than A-D counts as an extraction failure.
            if pred not in ['A', 'B', 'C', 'D']:
                pred = 'E'
                erros += 1

            target = row['answerKey']
            is_correct = (pred == target)
            acertos += int(is_correct)

            resultados.append({
                'index': idx,
                'question': full_question,
                'retrieved_context': reflection_context_formatted,
                'retrieved_count': count,
                'avg_similarity': avg_sim,
                'raw_output': response_text,
                'pred': pred,
                'target': target,
                'is_correct': is_correct,
                'source': desc,
                'erros': erros
            })

        except Exception as e:
            tqdm.write(f"Erro no Ã­ndice {idx}: {e}")
            resultados.append({
                'index': idx,
                'error': str(e),
                'is_correct': False,
                'retrieved_count': 0,
                'avg_similarity': 0.0,
            })

        # === BACKUP ===
        # Runs after the current row is recorded, for success and error rows
        # alike. A failed write is reported but never alters the row results.
        if backup_file and backup_frequency and backup_frequency > 0 and position % backup_frequency == 0:
            try:
                pd.DataFrame(resultados).to_csv(backup_file, index=False)
                backup_counter += 1
            except OSError as e:
                tqdm.write(f"Falha ao salvar backup ({backup_file}): {e}")

        # Update the progress bar
        acc_atual = (acertos / position) * 100
        loop.set_postfix(
            acc=f"{acc_atual:.2f}%",
            ref=count,
            bkp=backup_counter,
            erros=erros
        )

    if backup_path and backup_counter > 0:
        print(f"ðŸ’¾ Total de backups salvos: {backup_counter}")

    return pd.DataFrame(resultados)


# 3) Construção dos Bancos de Dados

In [None]:
# Load the expanded scientific-facts table; empty cells become the "N/A" sentinel.
scientific_facts = pd.read_csv("../../scientific_facts_expanded.csv").fillna("N/A")

# Keep one row per distinct reasoning trace and drop rows that have none.
df_reflection = (
    scientific_facts
    .loc[scientific_facts['clean_reasoning'].ne("N/A")]
    .drop_duplicates(subset=['clean_reasoning'])
)


In [None]:
# Forwarded as `reset_db`; presumably True wipes and rebuilds the persisted
# store — confirm in SimpleVectorMemory.
FORCE_RESET = False

# Build the reflection memory from the de-duplicated reasoning traces.
# NOTE(review): `embedding_model` is not defined in any visible cell (only the
# HuggingFaceEmbeddings class is imported); it must exist before this cell
# runs, otherwise this raises NameError.
# `manager_reflection` is whatever `init_from_dataframe` returns — presumably
# the SimpleVectorMemory instance itself; confirm.
manager_reflection = SimpleVectorMemory(
    db_path="vectorstores/reflection_rag/chroma_reflection",
    embedding_model=embedding_model
).init_from_dataframe(df=df_reflection, content_col='clean_reasoning',
                      id_prefix='reflection', metadata_func=build_metadata_reflection, reset_db=FORCE_RESET)


🧹 [Sistema] Reset total solicitado. Apagando vectorstores/reflection/chroma_reflection...
📂 Carregando base de vetores de: vectorstores/reflection/chroma_reflection
✅ Base carregada com 1084 itens.


<__main__.SimpleVectorMemory at 0x23816049d50>

# 4) Testing in one question

In [None]:
# Position of the test-set row used for the single-question smoke test.
i = 0
# Number of reflections requested from the memory store.
k_reflection = 3
row = test_df.iloc[i]

In [None]:
# Question text for the selected row (first element returned by make_question).
q_text = make_question(row, inline=False)[0]
# Remove "(A) "-style option markers to form the retrieval query.
# NOTE(review): the batch evaluation builds its query from row['question'] and
# uses threshold 0.24, while this cell starts from q_text and uses 0.33 —
# confirm the difference is intentional.
question_clean = re.sub(r'\([A-Z]\)\s*', '', q_text)

# Retrieve reflective memories
items_reflection = manager_reflection.retrieve_memories(
    query=question_clean,
    k=k_reflection,
    threshold=0.33
)

# Format the reflection context (with similarity scores, for inspection)
reflection_context = format_reflection_context(items_reflection, show_scores=True)

print("Question: \n", q_text)
print("\nReflection Context: \n", reflection_context)

# Summary metrics for the retrieved items; raw_sims is not used in this cell.
count, avg_sim, raw_sims = calcular_metricas_memoria(items_reflection)
print("\nMÃ©tricas Reflexivas: Count:", count, "AVG_SIM:", avg_sim)


Question: 
 Students visited the Morris W. Offit telescope located at the Maryland Space Grant Observatory in Baltimore. They learned about the stars, planets, and moon. The students recorded the information below. • Star patterns stay the same, but their locations in the sky seem to change. • The sun, planets, and moon appear to move in the sky. • Proxima Centauri is the nearest star to our solar system. • Polaris is a star that is part of a pattern of stars called the Little Dipper. Which statement best explains why the sun appears to move across the sky each day?
(A) The sun revolves around Earth.
(B) Earth rotates around the sun.
(C) The sun revolves on its axis.
(D) Earth rotates on its axis.

Contexto:  
        ### Reference Example 1
        **Question:** Stars are organized into patterns called constellations. One constellation is named Leo. Which statement best explains why Leo appears in different areas of the sky throughout the year?
        **Reference Logic:** The

In [None]:
# `time` is already imported in the setup cell; repeated here so the cell can run on its own.
import time

# Time the chain invocation only (context formatting included, extraction excluded).
a = time.time()
# The prompt receives the context without similarity scores.
reflection_context_clean = format_reflection_context(items_reflection, show_scores=False)
response_obj = rag_chain.invoke({
    "question": q_text,
    "reflection_context": reflection_context_clean,
})
print("Time to answer: ", round(time.time()-a, 2))
# The chain has no output parser, so read `.content` when the result has one.
response_text = response_obj.content if hasattr(response_obj, "content") else str(response_obj)

# Extract the chosen letter; debug=True prints which extraction path was used.
pred = extract_answer(
    small_llm_model=gpt5_nano,
    model_text_output=response_text,
    question=q_text,
    debug=True
)

target = row['answerKey']
is_correct = (pred == target)

print("# Raw Response: \n", response_text)
print(f"# Alternative chosen: {pred} (Correta: {target})")
print(f"# Resultado: {'CORRETO' if is_correct else 'ERRO'}")


✗ [REGEX] Falha nos padrões fortes. Chamando LLM Judge.
✓ [LLM JUIZ] Inferido: B
# Raw Response: 
  (B) Earth rotates around the sun.

# Alternative chosen:  B (Correta: D)
----------------------------------------------
Tempo para rodar extraction:  1.55
Tempo para rodar invoke:  0.53


# 5) Running

In [None]:
# VALIDATION
# Run the full evaluation on the validation split; a partial-results backup is
# written every 130 rows next to `backup_path` (with a `_backup.csv` suffix).
df_resultado_valid = avaliar_dataset_rag_reflection(
    df=valid_df,
    chain=rag_chain,
    manager_reflection=manager_reflection,
    k_reflection=3,
    threshold_reflection=0.24,
    backup_frequency=130,
    backup_path="../../results/rag_reflection_valid.csv",
    desc="ValidaÃ§Ã£o (RAG Reflection)"
)


Dataset Validação: 100%|██████████| 299/299 [10:58<00:00,  2.20s/it, acc=70.23%, avg_sim=0.6164, erros=0, n_mem=3]


In [None]:
# Persist the validation results.
# NOTE(review): this writes to the current working directory, whereas the
# backups above go under ../../results — confirm the intended location.
df_resultado_valid.to_csv("rag_reflection_valid.csv", index=False)


In [None]:
# TEST
# Run the full evaluation on the test split with the same retrieval settings
# as validation; a partial-results backup is written every 150 rows.
df_resultado_test = avaliar_dataset_rag_reflection(
    df=test_df,
    chain=rag_chain,
    manager_reflection=manager_reflection,
    k_reflection=3,
    threshold_reflection=0.24,
    backup_frequency=150,
    backup_path="../../results/rag_reflection_test.csv",
    desc="Teste (RAG Reflection)"
)


Dataset Teste: 100%|██████████| 1172/1172 [33:58<00:00,  1.74s/it, acc=68.86%, avg_sim=0.5434, erros=0, n_mem=3] 


In [None]:
# Persist the test results.
# NOTE(review): this writes to the current working directory, whereas the
# backups above go under ../../results — confirm the intended location.
df_resultado_test.to_csv("rag_reflection_test.csv", index=False)


💾 Salvando resultados finais...
   ✓ reflection_test.csv salvo
   ✓ reflection_valid.csv salvo

✅ Todos os arquivos salvos com sucesso!


In [None]:
# Overall test accuracy: the mean of the boolean `is_correct` column
# (rows that raised during evaluation are recorded as False).
df_resultado_test['is_correct'].mean()