# Hybrid RAG System (Semantic + Reflection) — Simple Retrieval

Sistema híbrido de recuperação simples, sem mecanismos cognitivos:
- **Memórias Semânticas**: Fatos científicos extraídos
- **Memórias Reflexivas**: Raciocínios de problemas resolvidos
- **Sem update de score, sem decay, sem geração/refinamento de fatos**

# 0) Setup

### A) Imports

In [None]:
# Standard library imports
import ast
import os
import re
import time
import sys

# Add parent directory to path for imports
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '..', '..')))
# Add vectorstore & prompts folders to path
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '..', 'vectorstore')))
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '..', 'prompts')))

# Third-party imports
import pandas as pd
from dotenv import load_dotenv
from tqdm import tqdm

# LangChain imports
from langchain_chroma import Chroma
from langchain_core.documents import Document
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_ollama import ChatOllama
from langchain_openai import ChatOpenAI

# Memory managers & metadata builders
from simple_vector_memory import SimpleVectorMemory, build_metadata_reflection

# Prompt templates
from templates import HYBRID_TEMPLATE

# Local imports
from utils_notebook import (
    SemanticCleaner,
    calcular_metricas_memoria,
    format_choices,
    make_question
)
# Shared SemanticCleaner instance (class imported from utils_notebook above).
cleaner = SemanticCleaner()

# Load environment variables from a .env file (python-dotenv).
# NOTE(review): presumably this is where the OpenAI API key used by ChatOpenAI comes from — confirm.
load_dotenv()

### B) Language Models and Datasets

In [None]:
# Option letters of the multiple-choice questions, and each letter's zero-based position.
LABELS = ['A','B','C','D']
number = {'A':0,'B':1,'C':2,'D':3}

In [None]:
# Embedding model handed to both SimpleVectorMemory stores built later in the notebook.
embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

In [None]:
# ARC-Challenge dataset: pre-processed validation and test splits.
valid_df = pd.read_csv("../../dataset/arc_challenge_valid_processed.csv")
test_df = pd.read_csv("../../dataset/arc_challenge_test_processed.csv")

In [None]:
# LLM judge: passed to extract_answer as `small_llm_model`, where it is used as the
# fallback extractor when no high-confidence regex pattern matches.
gpt5_nano = ChatOpenAI(model='gpt-5-nano-2025-08-07', temperature=0)

In [None]:
# Answering model served through Ollama; it is the LLM at the end of rag_chain.
phi2 = ChatOllama(model="phi", temperature=0)

### C) Prompts and Chains

In [None]:
# Prompt built from HYBRID_TEMPLATE, piped into phi2.
# The chain is invoked elsewhere in this notebook with the keys "question" and "hybrid_context".
hybrid_prompt = PromptTemplate.from_template(HYBRID_TEMPLATE)
rag_chain = hybrid_prompt | phi2

# 1) Utils Functions

### A) Context Formatter

In [None]:
def format_hybrid_context(semantic_items, reflection_items, show_scores=False):
    """Render retrieved memories as a single context string for the prompt.

    The output has up to two sections: "PART 1" lists semantic facts
    ("principles") and "PART 2" lists reflections ("solved examples").
    Each entry shows the first line of the originating question plus the
    memory content.

    Args:
        semantic_items: Iterable of dicts with optional keys ``metadata``
            (dict holding ``question``), ``content`` and ``similarity``.
            May be empty or None.
        reflection_items: Same shape as ``semantic_items``; ``similarity``
            is not used for reflections. May be empty or None.
        show_scores: When True, each principle header also shows its
            similarity with two decimals (and the entry uses the layout
            without inner bullets).

    Returns:
        The formatted context, or a fixed "no information" sentence when
        both inputs are empty.
    """

    def _question_and_content(item):
        # Missing / None metadata, question or content are treated as empty
        # text instead of raising, so one incomplete memory cannot break the
        # whole context.
        metadata = item.get('metadata') or {}
        raw_question = str(metadata.get('question') or '')
        # Only the first line is kept: later lines of the stored question are
        # dropped from the displayed context.
        question_line = raw_question.split('\n')[0].strip()
        content = str(item.get('content') or '').strip()
        return question_line, content

    formatted_parts = []

    if semantic_items:
        formatted_parts.append("### PART 1: SCIENTIFIC PRINCIPLES (THEORY)")
        for i, fact in enumerate(semantic_items, 1):
            q_text, f_text = _question_and_content(fact)
            if show_scores:
                sim = fact.get('similarity') or 0
                block = (
                    f'\n    * **Principle #{i}** (Sim: {sim:.2f})\n'
                    f'        Context: "{q_text}"\n'
                    f'        Fact: "{f_text}"\n'
                )
            else:
                block = (
                    f'\n    * **Principle #{i}**\n'
                    f'        * Context: "{q_text}"\n'
                    f'        * Fact: "{f_text}"\n'
                )
            formatted_parts.append(block)

    if reflection_items:
        formatted_parts.append("\n### PART 2: SOLVED EXAMPLES (PRACTICE)")
        for i, reflection in enumerate(reflection_items, 1):
            q_text, r_text = _question_and_content(reflection)
            formatted_parts.append(
                f'\n    * **Example Case #{i}**\n'
                f'        > Context: {q_text}\n'
                f'        > Analysis: {r_text}\n'
            )

    if not formatted_parts:
        return "No specific reference information found."

    return "\n".join(formatted_parts)

### B) Answer Extractor

In [None]:
import re
from typing import List, Optional

def extract_answer(
    small_llm_model,
    model_text_output: str,
    valid_labels: Optional[List[str]] = None,
    debug: bool = False,
    question: Optional[str] = None,
) -> str:
    """
    Extract the chosen option letter from a model's free-text answer.

    High-confidence regex patterns are tried first, in order; for each
    pattern only its last match in the text is considered. If the text
    contains code, or no pattern yields a valid label, the decision is
    delegated to an LLM judge.

    Args:
        small_llm_model: Object with an ``invoke(prompt)`` method, used as judge.
        model_text_output: Raw text produced by the answering model.
        valid_labels: Accepted letters; defaults to A-E.
        debug: Print which extraction path was taken.
        question: Optional question text given to the judge as context.

    Returns:
        One of ``valid_labels``, or "N/A" when nothing could be extracted.
    """
    if valid_labels is None:
        valid_labels = ['A', 'B', 'C', 'D', 'E']

    if not model_text_output:
        return "N/A"

    text = model_text_output.strip()

    # Code in the output makes letter patterns unreliable, so go straight to the judge.
    if "```python" in text or "def solution" in text:
        if debug:
            print("‚ö† [CODE DETECTED] Enviando para LLM Judge.")
        return _llm_judge(small_llm_model, text, valid_labels, debug, question)

    strong_patterns = [
        r"\\boxed\s*\{\s*([A-H])\s*\}",
        r"(?:Final|Correct)\s+Answer\s*[:\-]?\s*(?:is)?\s*(?:Option)?\s*[\(\[]([A-H])[\)\]]",
        r"The\s+(?:correct\s+)?(?:answer|option|choice)\s+is\s*(?:Option)?\s*[:\-]?\s*[\(\[]([A-H])[\)\]]",
        r"(?:Final|Correct)\s+Answer\s*[:\-]\s*(?:is\s+)?(?:Option\s+)?([A-H])(?=\s|\.|,|!|\?|$)",
        r"(?:Therefore|Thus|Hence|So),\s*(?:the\s+answer\s+is\s*)?(?:Option)?\s*[\(\[]([A-H])[\)\]]",
        r"(?:^|\n)\s*Answer\s*:\s*([A-H])(?=\s|$|\.|\,)",
        r"\*\*([A-H])\*\*",
        r"^([A-H])$"
    ]

    for pattern in strong_patterns:
        matches = list(re.finditer(pattern, text, re.IGNORECASE))
        if not matches:
            continue
        # The last occurrence wins: the conclusion usually comes at the end.
        candidate = matches[-1].group(1).upper()
        if candidate in valid_labels:
            if debug:
                print(f"‚úì [REGEX] Padr√£o encontrado: '{pattern}' -> {candidate}")
            return candidate

    if debug:
        print("‚úó [REGEX] Falha nos padr√µes fortes. Chamando LLM Judge.")
    return _llm_judge(small_llm_model, text, valid_labels, debug, question)


def _llm_judge(
    small_llm_model,
    text: str,
    valid_labels: List[str],
    debug: bool = False,
    question: Optional[str] = None,
) -> str:
    """
    Ask an LLM which option letter the given model output concluded.

    Never raises: any failure while calling the judge yields "N/A".

    Args:
        small_llm_model: Object with an ``invoke(prompt)`` method; the reply is
            read from ``.content`` when present, otherwise from ``str(reply)``.
        text: The answering model's output to analyze.
        valid_labels: Accepted letters.
        debug: Print the inferred letter or the error.
        question: Optional question text inserted in the prompt as context.

    Returns:
        A letter from ``valid_labels``, or "N/A".
    """
    context_block = ""
    if question:
        context_block = f"""
### CONTEXT (Use this to infer the answer letter):
{question}
"""

    prompt = f"""You are an Answer Extraction Bot.
Your ONLY job is to identify which option the following "Model Output" concluded is correct.

{context_block}

### Model Output to Analyze:
{text}

### Instructions:
1. Look for an explicit answer (e.g., "Answer: A").
2. If NO explicit letter is found, read the conclusion of the "Model Output" and match it against the Options in the Context. **Infer the letter.**
   - Example: If Context has "(A) 5 (B) 10" and Model Output says "The result is 10", you must output B.
3. Do NOT calculate or solve the problem yourself. Trust the "Model Output".
4. If the model refuses to answer or is unclear, return "E".

Output format: Just the single letter (A, B, C, D, or E). No other text."""

    try:
        response = small_llm_model.invoke(prompt)
        output = response.content if hasattr(response, 'content') else str(response)
        candidate = _parse_judge_letter(output if isinstance(output, str) else str(output))
    except Exception as e:
        # Best effort: a failing judge must not abort the caller's evaluation loop.
        if debug:
            print(f"‚úó [ERRO JUIZ] {e}")
        return "N/A"

    if candidate in valid_labels:
        if debug:
            print(f"‚úì [LLM JUIZ] Inferido: {candidate}")
        return candidate

    return "N/A"


def _parse_judge_letter(output: str) -> str:
    """Return the option letter (A-E) stated in the judge's reply, or "" if none.

    Letters that are merely part of a word are ignored: deleting every
    non-A-E character from a verbose reply such as "Answer: B is correct"
    would otherwise leave the "C" of "correct" as the last letter.
    """
    stripped = output.strip()

    # Expected case: the reply is just the letter, possibly wrapped in punctuation/markup.
    bare = re.fullmatch(r"[\W_]*([A-Ea-e])[\W_]*", stripped)
    if bare:
        return bare.group(1).upper()

    # Verbose reply: take the last letter standing on its own. Capitals are
    # preferred so the English article "a" is not mistaken for option A.
    standalone = r"(?<![A-Za-z0-9])[A-E](?![A-Za-z0-9])"
    hits = re.findall(standalone, stripped) or re.findall(standalone, stripped, re.IGNORECASE)
    return hits[-1].upper() if hits else ""

# 2) Evaluation

In [None]:
def avaliar_dataset_rag_hybrid(
    df,
    chain=rag_chain,
    manager_semantic=None,
    manager_reflection=None,
    k_semantic=3,
    k_reflection=3,
    threshold_semantic=0.24,
    threshold_reflection=0.24,
    backup_frequency=160,
    backup_path=None,
    desc="RAG Hybrid (Semantic + Reflection)"
):
    """
    Evaluate a multiple-choice dataset with the simple hybrid RAG pipeline.

    For every row: retrieve semantic and reflection memories, format them into
    one context, invoke ``chain``, extract the predicted letter (``gpt5_nano``
    is the fallback judge) and compare it with ``row['answerKey']``.
    Retrieval is plain similarity: no cognitive score, no EMA, no decay.

    Args:
        df: DataFrame with at least the columns ``question`` and ``answerKey``
            (plus whatever ``make_question`` reads).
        chain: Runnable invoked with ``{"question", "hybrid_context"}``.
        manager_semantic: Memory store exposing ``retrieve_memories``; when
            None, no semantic memories are retrieved.
        manager_reflection: Same, for reflections; skipped when falsy.
        k_semantic / k_reflection: ``k`` passed to each retrieval.
        threshold_semantic / threshold_reflection: ``threshold`` passed to
            each retrieval.
        backup_frequency: Write a partial CSV every this many processed rows
            (disabled when not positive).
        backup_path: Base path of the backup; "_backup" is inserted before
            the file extension. No backup when None.
        desc: Progress-bar label, also stored in the ``source`` column.

    Returns:
        DataFrame with one record per input row. Rows that raised carry an
        ``error`` message and ``is_correct=False`` instead of the prediction
        fields.
    """
    resultados = []
    acertos = 0
    erros = 0  # predictions that could not be mapped to A-D
    backup_counter = 0
    total = len(df)

    # Prepare the backup target once, up front.
    backup_file = None
    if backup_path:
        backup_dir = os.path.dirname(backup_path)
        if backup_dir:
            os.makedirs(backup_dir, exist_ok=True)
        # splitext touches only the real extension; str.replace('.csv', ...)
        # would also rewrite a '.csv' appearing earlier in the path.
        root, ext = os.path.splitext(backup_path)
        backup_file = f"{root}_backup{ext}"
    backups_enabled = bool(backup_file) and bool(backup_frequency) and backup_frequency > 0

    loop = tqdm(df.iterrows(), total=total, desc=desc)

    # `position` counts processed rows (1-based). It is used instead of the
    # DataFrame label `idx` (which need not be a sequential integer) and instead
    # of `loop.n` (which tqdm only refreshes periodically).
    for position, (idx, row) in enumerate(loop, start=1):
        count_sem = count_ref = 0
        try:
            full_question = make_question(row, inline=False)[0]
            question_for_retriever = re.sub(r'\([A-Z]\)\s*', '', row['question'])

            # Retrieve semantic memories (plain similarity)
            items_semantic = []
            if manager_semantic is not None:
                items_semantic = manager_semantic.retrieve_memories(
                    query=question_for_retriever,
                    k=k_semantic,
                    threshold=threshold_semantic
                )

            # Retrieve reflection memories
            items_reflection = []
            if manager_reflection:
                items_reflection = manager_reflection.retrieve_memories(
                    query=question_for_retriever,
                    k=k_reflection,
                    threshold=threshold_reflection
                )

            # Merge both memory types into a single prompt context
            hybrid_context_formatted = format_hybrid_context(items_semantic, items_reflection)

            count_sem, avg_sim_sem, _ = calcular_metricas_memoria(items_semantic)
            count_ref, avg_sim_ref, _ = calcular_metricas_memoria(items_reflection)

            response_obj = chain.invoke({
                "question": full_question,
                "hybrid_context": hybrid_context_formatted,
            })
            response_text = response_obj.content if hasattr(response_obj, "content") else str(response_obj)

            pred = extract_answer(
                small_llm_model=gpt5_nano,
                model_text_output=response_text,
                question=full_question,
            )

            # Anything outside A-D (including "N/A") is recorded as 'E' = no usable answer.
            if pred not in ('A', 'B', 'C', 'D'):
                pred = 'E'
                erros += 1

            target = row['answerKey']
            is_correct = (pred == target)
            acertos += int(is_correct)

            resultados.append({
                'index': idx,
                'question': full_question,
                'retrieved_context': hybrid_context_formatted,
                'retrieved_count_semantic': count_sem,
                'retrieved_count_reflection': count_ref,
                'avg_similarity_semantic': avg_sim_sem,
                'avg_similarity_reflection': avg_sim_ref,
                'raw_output': response_text,
                'pred': pred,
                'target': target,
                'is_correct': is_correct,
                'source': desc,
                'erros': erros
            })

        except Exception as e:
            # One failing row must not abort the run: log it and record a placeholder.
            count_sem = count_ref = 0
            tqdm.write(f"Erro no √≠ndice {idx}: {e}")
            resultados.append({
                'index': idx,
                'error': str(e),
                'is_correct': False,
                'retrieved_count_semantic': 0,
                'retrieved_count_reflection': 0,
                'avg_similarity_semantic': 0.0,
                'avg_similarity_reflection': 0.0,
            })

        # Backup AFTER the current row was recorded, so the file includes it.
        if backups_enabled and position % backup_frequency == 0:
            try:
                pd.DataFrame(resultados).to_csv(backup_file, index=False)
                backup_counter += 1
            except OSError as e:
                # A failed backup is not a failed row; report it and keep going.
                tqdm.write(f"Backup failed at row {position}: {e}")

        # Update the progress bar
        loop.set_postfix(
            acc=f"{(acertos / position) * 100:.2f}%",
            sem=count_sem,
            ref=count_ref,
            bkp=backup_counter,
            erros=erros
        )

    if backup_path and backup_counter > 0:
        print(f"üíæ Total de backups salvos: {backup_counter}")

    return pd.DataFrame(resultados)

# 3) Construção dos Bancos de Dados

In [None]:
# Prepare the data: load the extracted facts and replace missing values with the sentinel "N/A".
scientific_facts = pd.read_csv("../../scientific_facts_expanded.csv").fillna("N/A")

# Keep only rows that have a fact (resp. a reasoning), one row per distinct text.
df_semantic = scientific_facts[scientific_facts['scientific_fact'] != "N/A"].drop_duplicates(subset=['scientific_fact'])
df_reflection = scientific_facts[scientific_facts['clean_reasoning'] != "N/A"].drop_duplicates(subset=['clean_reasoning'])

In [None]:
# Passed as `reset_db` to both init_from_dataframe calls below.
# NOTE(review): presumably True rebuilds the persisted stores from scratch — confirm in SimpleVectorMemory.
FORCE_RESET = False

def build_metadata_semantic(index, row, doc_id):
    """Assemble the metadata dict stored with one semantic-memory document.

    Args:
        index: Unused here; part of the signature expected of a ``metadata_func``.
        row: Mapping-like record with ``id``, ``question``, ``scientific_fact``
            and (optionally) ``correct_answer``.
        doc_id: Identifier stored under ``chroma_id``.

    Returns:
        Dict with the row's identifying fields plus fixed ``origin`` / ``type`` tags.
    """
    answer = row.get('correct_answer')
    # A missing/NaN answer is stored as the literal string "None".
    answer_text = "None" if not pd.notna(answer) else str(row['correct_answer'])

    metadata = {"chroma_id": doc_id}
    metadata["original_id"] = str(row['id'])
    metadata["question"] = str(row['question'])
    metadata["correct_answer"] = answer_text
    metadata["scientific_fact"] = row['scientific_fact']
    metadata["origin"] = "original_training"
    metadata["type"] = "semantic_memory"
    return metadata

# Semantic store: one document per distinct scientific fact, with metadata from build_metadata_semantic.
manager_semantic = SimpleVectorMemory(
    db_path="vectorstores/hybrid_rag/chroma_semantic",
    embedding_model=embedding_model
).init_from_dataframe(df=df_semantic, content_col='scientific_fact',
                      id_prefix='semantic', metadata_func=build_metadata_semantic, reset_db=FORCE_RESET)

# Reflection store: one document per distinct cleaned reasoning, with metadata from the
# imported build_metadata_reflection.
manager_reflection = SimpleVectorMemory(
    db_path="vectorstores/hybrid_rag/chroma_reflection",
    embedding_model=embedding_model
).init_from_dataframe(df=df_reflection, content_col='clean_reasoning',
                      id_prefix='reflection', metadata_func=build_metadata_reflection, reset_db=FORCE_RESET)

# 4) Testing in one question

In [None]:
# Pick one test question (by position) and the retrieval sizes for the smoke test below.
i = 0
k_semantic = 3
k_reflection = 3
row = test_df.iloc[i]

In [None]:
q_text = make_question(row, inline=False)[0]
# Strip option markers such as "(A) " before querying the stores.
# NOTE(review): the query here is built from the full formatted question, whereas
# avaliar_dataset_rag_hybrid builds it from row['question'], and the threshold here is
# 0.33 versus 0.24 in the evaluation runs — confirm this difference is intended.
question_clean = re.sub(r'\([A-Z]\)\s*', '', q_text)

# Retrieve semantic memories
items_semantic = manager_semantic.retrieve_memories(
    query=question_clean,
    k=k_semantic,
    threshold=0.33
)

# Retrieve reflection memories
items_reflection = manager_reflection.retrieve_memories(
    query=question_clean,
    k=k_reflection,
    threshold=0.33
)

# Merge both contexts (similarity scores shown for inspection only)
hybrid_context = format_hybrid_context(items_semantic, items_reflection, show_scores=True)

print("Question: \n", q_text)
print("\nHybrid Context: \n", hybrid_context)

# Retrieval metrics: count and average similarity per memory type.
count_sem, avg_sim_sem, raw_sims_sem = calcular_metricas_memoria(items_semantic)
count_ref, avg_sim_ref, raw_sims_ref = calcular_metricas_memoria(items_reflection)
print("\nM√©tricas Sem√¢nticas: Count:", count_sem, "AVG_SIM:", avg_sim_sem)
print("M√©tricas Reflexivas: Count:", count_ref, "AVG_SIM:", avg_sim_ref)

In [None]:
import time

# Time a single chain invocation, using the context without similarity scores
# (the same format the evaluation loop sends to the model).
a = time.time()
hybrid_context_clean = format_hybrid_context(items_semantic, items_reflection, show_scores=False)
response_obj = rag_chain.invoke({
    "question": q_text,
    "hybrid_context": hybrid_context_clean,
})
print("Time to answer: ", round(time.time()-a, 2))
response_text = response_obj.content if hasattr(response_obj, "content") else str(response_obj)

# Extract the letter; debug=True prints which extraction path was used.
pred = extract_answer(
    small_llm_model=gpt5_nano,
    model_text_output=response_text,
    question=q_text,
    debug=True
)

target = row['answerKey']
is_correct = (pred == target)

print("# Raw Response: \n", response_text)
print(f"# Alternative chosen: {pred} (Correta: {target})")
print(f"# Resultado: {'CORRETO' if is_correct else 'ERRO'}")

# 5) Running

In [None]:
# VALIDATION: run the full evaluation on the validation split, with a partial backup every 130 rows.
df_resultado_valid = avaliar_dataset_rag_hybrid(
    df=valid_df,
    chain=rag_chain,
    manager_semantic=manager_semantic,
    manager_reflection=manager_reflection,
    k_semantic=3,
    k_reflection=3,
    threshold_semantic=0.24,
    threshold_reflection=0.24,
    backup_frequency=130,
    backup_path="../../results/rag_hybrid_valid.csv",
    desc="Valida√ß√£o (RAG Hybrid)"
)

In [None]:
# Persist the validation results. Note this relative path is the current working directory,
# not the ../../results/ directory used for the periodic backups.
df_resultado_valid.to_csv("rag_hybrid_valid.csv", index=False)

In [None]:
# TEST: run the full evaluation on the test split, with a partial backup every 150 rows.
df_resultado_test = avaliar_dataset_rag_hybrid(
    df=test_df,
    chain=rag_chain,
    manager_semantic=manager_semantic,
    manager_reflection=manager_reflection,
    k_semantic=3,
    k_reflection=3,
    threshold_semantic=0.24,
    threshold_reflection=0.24,
    backup_frequency=150,
    backup_path="../../results/rag_hybrid_test.csv",
    desc="Teste (RAG Hybrid)"
)

In [None]:
# Persist the test results. Note this relative path is the current working directory,
# not the ../../results/ directory used for the periodic backups.
df_resultado_test.to_csv("rag_hybrid_test.csv", index=False)

In [None]:
# Test accuracy: rows that raised during evaluation are stored with is_correct=False,
# so they count as wrong answers here.
df_resultado_test['is_correct'].mean()