# Semantic RAG System — Simple Retrieval

Sistema de memória semântica para comparação com sistemas cognitivos:
- **Memórias Semânticas**: Fatos científicos extraídos de questões resolvidas
- **Retrieval Clássico**: Apenas similaridade semântica (sem score cognitivo, sem decay, sem geração/refinamento)

# 0) Setup

### A) Imports

In [None]:
# Standard library imports
import ast
import os
import re
import time
import sys

# Add parent directory to path for imports
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '..', '..')))
# Add vectorstore & prompts folders to path
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '..', 'vectorstore')))
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '..', 'prompts')))

# Third-party imports
import pandas as pd
from dotenv import load_dotenv
from tqdm import tqdm

# LangChain imports
from langchain_chroma import Chroma
from langchain_core.documents import Document
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_ollama import ChatOllama
from langchain_openai import ChatOpenAI

# Memory managers & metadata builders
from simple_vector_memory import SimpleVectorMemory, build_metadata_semantic

# Prompt templates
from templates import SEMANTIC_TEMPLATE

# Local imports
from utils_notebook import (
    SemanticCleaner,
    calcular_metricas_memoria,
    format_choices,
    make_question
)
# Shared text cleaner; applied to the `scientific_fact` column before the
# facts are indexed in the vector store.
cleaner = SemanticCleaner()

# Load environment variables from a local .env file (python-dotenv).
# NOTE(review): presumably this is where the OpenAI API key for the judge
# model comes from - confirm.
load_dotenv()


### B) Language Models and Datasets

In [2]:
# Answer letters accepted as valid predictions by the evaluation loop.
LABELS = ['A','B','C','D']
# Letter -> zero-based option position.
# NOTE(review): not referenced anywhere in the visible cells.
number = {'A':0,'B':1,'C':2,'D':3}

In [3]:
# ARC dataset. (If you do not have it, download it and store it in the
# dataset folder as CSV.)
# train_df = pd.read_csv("dataset/arc_challenge_train_processed.csv")
# Paths are relative to the notebook's working directory.
valid_df = pd.read_csv("../../dataset/arc_challenge_valid_processed.csv")
test_df = pd.read_csv("../../dataset/arc_challenge_test_processed.csv")

In [None]:
# LLM judge: passed to extract_answer() as the fallback model that reads the
# answer letter out of a free-form response when the regex patterns fail.
gpt5_nano = ChatOpenAI(model='gpt-5-nano-2025-08-07', temperature=0)


In [None]:
# Model under evaluation, served through Ollama; it is the model piped into
# rag_chain. temperature=0 is requested for both models in this notebook.
phi2 = ChatOllama(model="phi", temperature=0)


In [None]:
# Sentence-embedding model handed to SimpleVectorMemory for the vector store.
embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")


### C) Prompts and Chains

In [None]:
# Prompt built from SEMANTIC_TEMPLATE; the chain is invoked in this notebook
# with the keys "question" and "similar_facts".
semantic_prompt = PromptTemplate.from_template(SEMANTIC_TEMPLATE)
# No output parser is attached, so callers unwrap `.content` from the
# returned message themselves.
rag_chain = semantic_prompt | phi2


# 1) Utils Functions

### A) Context Formatter

In [None]:
def format_facts_clean(semantic_items, show_scores=False):
    """Render retrieved facts as a text block for the prompt.

    Args:
        semantic_items: Iterable of dict-like items. Each item is read via
            ``.get('content', '')`` and ``.get('similarity', 0)``.
        show_scores: When True, the principle header also carries the
            similarity formatted with two decimals.

    Returns:
        ``"No relevant scientific facts found."`` when ``semantic_items`` is
        empty or None; otherwise a header line followed by one block per
        fact, joined with newlines.
    """
    if not semantic_items:
        return "No relevant scientific facts found."

    formatted_parts = ["### SCIENTIFIC FACTS (THEORY)"]
    for i, fact in enumerate(semantic_items, 1):
        f_text = fact.get('content', '').strip()
        # Read the similarity unconditionally: BOTH layouts interpolate it.
        # It used to be assigned only inside the show_scores branch, which
        # made the default (show_scores=False) path raise UnboundLocalError.
        sim = fact.get('similarity', 0)
        # NOTE(review): the "Context" line renders the similarity value.
        # It was probably meant to show the question the fact came from -
        # confirm against the retriever's item schema before changing it.

        if show_scores:
            block = (
                f'\n    * **Principle #{i}** (Sim: {sim:.2f})\n'
                f'        Context: "{sim}"\n'
                f'        Fact: "{f_text}"\n'
            )
        else:
            block = (
                f'\n    * **Principle #{i}**\n'
                f'        * Context: "{sim}"\n'
                f'        * Fact: "{f_text}"\n'
            )
        formatted_parts.append(block)

    return "\n".join(formatted_parts)


### B) Answer Extractor

In [None]:
import re
from typing import List, Optional

def extract_answer(
    small_llm_model,
    model_text_output: str,
    valid_labels: List[str] = None,
    debug: bool = False,
    question: Optional[str] = None,
) -> str:
    """Extract the chosen answer letter from a model's free-form response.

    High-confidence regex patterns are tried in order; for the first pattern
    that matches, the LAST occurrence in the text is used. If no pattern
    yields a letter in ``valid_labels`` (or the response looks like code),
    the decision is delegated to ``_llm_judge``.

    Args:
        small_llm_model: Model forwarded to ``_llm_judge`` for the fallback.
        model_text_output: Raw response text to analyse.
        valid_labels: Accepted letters; defaults to A-E when None.
        debug: Print which path produced the answer.
        question: Optional question text forwarded to the judge as context.

    Returns:
        The extracted letter, ``"N/A"`` for an empty response, or whatever
        ``_llm_judge`` returns on the fallback path.
    """
    labels = ['A', 'B', 'C', 'D', 'E'] if valid_labels is None else valid_labels

    if not model_text_output:
        return "N/A"

    answer_text = model_text_output.strip()

    # Code-like responses are never parsed with the regexes.
    code_markers = ("```python", "def solution")
    if any(marker in answer_text for marker in code_markers):
        if debug:
            print("âš  [CODE DETECTED] Enviando para LLM Judge.")
        return _llm_judge(small_llm_model, answer_text, labels, debug, question)

    # Ordered from most to least specific; each pattern has exactly one
    # capturing group holding the letter.
    tier_one_patterns = (
        r"\\boxed\s*\{\s*([A-H])\s*\}",
        r"(?:Final|Correct)\s+Answer\s*[:\-]?\s*(?:is)?\s*(?:Option)?\s*[\(\[]([A-H])[\)\]]",
        r"The\s+(?:correct\s+)?(?:answer|option|choice)\s+is\s*(?:Option)?\s*[:\-]?\s*[\(\[]([A-H])[\)\]]",
        r"(?:Final|Correct)\s+Answer\s*[:\-]\s*(?:is\s+)?(?:Option\s+)?([A-H])(?=\s|\.|,|!|\?|$)",
        r"(?:Therefore|Thus|Hence|So),\s*(?:the\s+answer\s+is\s*)?(?:Option)?\s*[\(\[]([A-H])[\)\]]",
        r"(?:^|\n)\s*Answer\s*:\s*([A-H])(?=\s|$|\.|\,)",
        r"\*\*([A-H])\*\*",
        r"^([A-H])$",
    )

    for regex in tier_one_patterns:
        # With a single group, findall returns the captured letters directly.
        letters = re.findall(regex, answer_text, re.IGNORECASE)
        if not letters:
            continue
        letter = letters[-1].upper()
        if letter not in labels:
            # A match outside the accepted labels does not stop the search.
            continue
        if debug:
            print(f"âœ“ [REGEX] PadrÃ£o encontrado: '{regex}' -> {letter}")
        return letter

    if debug:
        print("âœ— [REGEX] Falha nos padrÃµes fortes. Chamando LLM Judge.")
    return _llm_judge(small_llm_model, answer_text, labels, debug, question)


def _llm_judge(
    small_llm_model,
    text: str,
    valid_labels: List[str],
    debug: bool = False,
    question: Optional[str] = None,
) -> str:
    context_block = ""
    if question:
        context_block = f"""
### CONTEXT (Use this to infer the answer letter):
{question}
"""

    prompt = f"""You are an Answer Extraction Bot.
Your ONLY job is to identify which option the following "Model Output" concluded is correct.

{context_block}

### Model Output to Analyze:
{text}

### Instructions:
1. Look for an explicit answer (e.g., "Answer: A").
2. If NO explicit letter is found, read the conclusion of the "Model Output" and match it against the Options in the Context. **Infer the letter.**
   - Example: If Context has "(A) 5 (B) 10" and Model Output says "The result is 10", you must output B.
3. Do NOT calculate or solve the problem yourself. Trust the "Model Output".
4. If the model refuses to answer or is unclear, return "E".

Output format: Just the single letter (A, B, C, D, or E). No other text."""

    try:
        response = small_llm_model.invoke(prompt)
        output = response.content if hasattr(response, 'content') else str(response)
        clean_cand = re.sub(r"[^A-E]", "", output.strip().upper())

        if len(clean_cand) > 1:
            clean_cand = clean_cand[-1]

        if clean_cand in valid_labels:
            if debug: print(f"âœ“ [LLM JUIZ] Inferido: {clean_cand}")
            return clean_cand

        return "N/A"

    except Exception as e:
        if debug: print(f"âœ— [ERRO JUIZ] {e}")
        return "N/A"


# 2) Evaluation

In [None]:
def avaliar_dataset_semantic(
    df,
    chain=rag_chain,
    manager_semantic=None,
    k_retrieval=3,
    threshold=0.33,
    backup_frequency=160,
    backup_path=None,
    desc="Avaliando com Semantic Memory"
):
    """Evaluate a multiple-choice dataset using semantic-memory retrieval.

    For every row the function retrieves facts for the question, formats
    them, invokes ``chain`` with the question and the facts, extracts the
    predicted letter and compares it with ``row['answerKey']``.

    Args:
        df: DataFrame with at least the ``question`` and ``answerKey``
            columns, plus whatever ``make_question`` reads.
        chain: Runnable invoked with ``{"question", "similar_facts"}``.
        manager_semantic: Object exposing
            ``retrieve_memories(query=, k=, threshold=)``. When None, the
            notebook-level ``manager_semantic`` global is looked up at CALL
            time, so this function can be defined before the vector store
            is built.
        k_retrieval: Number of memories requested per question.
        threshold: Similarity threshold forwarded to the retriever.
        backup_frequency: Write a partial-results CSV every this many
            processed rows; ``0``/``None`` disables backups.
        backup_path: Base CSV path; the partial file is written next to it
            with a ``_backup`` suffix before the extension.
        desc: Progress-bar label, also stored in the ``source`` column.

    Returns:
        DataFrame with one record per row. Rows that raised carry an
        ``error`` column and ``is_correct=False``.

    Raises:
        ValueError: If no memory manager is passed and none exists globally.
    """
    if manager_semantic is None:
        manager_semantic = globals().get("manager_semantic")
        if manager_semantic is None:
            raise ValueError(
                "manager_semantic was not provided and no global "
                "'manager_semantic' has been created yet."
            )

    resultados = []
    acertos = 0
    erros = 0
    backup_counter = 0
    total = len(df)

    # Prepare the backup directory and resolve the backup file name once.
    backup_file = None
    if backup_path:
        backup_dir = os.path.dirname(backup_path)
        if backup_dir:
            os.makedirs(backup_dir, exist_ok=True)
        if backup_frequency and backup_frequency > 0:
            # splitext touches only the real extension; str.replace would
            # rewrite every ".csv" occurrence anywhere in the path.
            root, ext = os.path.splitext(backup_path)
            backup_file = f"{root}_backup{ext}"

    loop = tqdm(df.iterrows(), total=total, desc=desc)

    # `processed` is the 1-based count of rows handled so far. It replaces
    # both the DataFrame index label (not necessarily positional) and
    # tqdm's `n` (only refreshed on redraw) in the arithmetic below.
    for processed, (idx, row) in enumerate(loop, start=1):
        count, avg_sim = 0, 0.0
        try:
            full_question = make_question(row, inline=False)[0]
            # Drop "(A) "-style option markers before querying the retriever.
            question_for_retriever = re.sub(r'\([A-Z]\)\s*', '', row['question'])

            items = manager_semantic.retrieve_memories(
                query=question_for_retriever,
                k=k_retrieval,
                threshold=threshold
            )
            facts_formatted = format_facts_clean(items)

            count, avg_sim, _ = calcular_metricas_memoria(items)

            response_obj = chain.invoke({
                "question": full_question,
                "similar_facts": facts_formatted,
            })

            response_text = response_obj.content if hasattr(response_obj, "content") else str(response_obj)

            pred = extract_answer(
                small_llm_model=gpt5_nano,
                model_text_output=response_text,
                question=full_question,
            )

            if pred not in LABELS:
                pred = "Invalid"
                erros += 1

            target = row['answerKey']
            is_correct = (pred == target)
            acertos += int(is_correct)

            resultados.append({
                'index': idx,
                'question': full_question,
                'retrieved_context': facts_formatted,
                'retrieved_count': count,
                'avg_similarity': avg_sim,
                'raw_output': response_text,
                'pred': pred,
                'target': target,
                'is_correct': is_correct,
                'source': desc,
            })

        except Exception as e:
            # Best effort per row: record the failure and keep going.
            tqdm.write(f"Erro no Ã­ndice {idx}: {e}")
            resultados.append({
                'index': idx,
                'error': str(e),
                'is_correct': False,
                'retrieved_count': 0,
                'avg_similarity': 0.0,
            })

        # === BACKUP === (after the append, so the file includes this row)
        if backup_file and processed % backup_frequency == 0:
            backup_counter += 1
            pd.DataFrame(resultados).to_csv(backup_file, index=False)

        # Update the progress bar.
        acc_atual = (acertos / processed) * 100
        loop.set_postfix(acc=f"{acc_atual:.2f}%", n_mem=count, avg_sim=f"{avg_sim:.4f}", erros=erros, bkp=backup_counter)

    if backup_path and backup_counter > 0:
        print(f"ðŸ’¾ Total de backups salvos: {backup_counter}")

    return pd.DataFrame(resultados)


# 3) Construção dos Bancos de Dados

In [8]:
# Load the extracted facts; missing cells become the "N/A" sentinel so they
# can be filtered out with a plain string comparison.
scientific_facts = pd.read_csv("../../scientific_facts_expanded.csv").fillna("N/A")

# Keep train-origin rows that carry a fact, one row per distinct fact
# (first occurrence wins).
df_train_subset = (
    scientific_facts
    .loc[lambda d: (d['origin'] == 'train') & (d['scientific_fact'] != "N/A")]
    .drop_duplicates(subset=['scientific_fact'], keep='first')
    .copy()
)
# Cleaning runs after de-duplication, as in the step-by-step version.
df_train_subset['scientific_fact'] = df_train_subset['scientific_fact'].apply(cleaner.clean)

In [None]:
# Forwarded as `reset_db` below.
# NOTE(review): presumably True wipes and rebuilds the persisted store -
# confirm in SimpleVectorMemory.init_from_dataframe.
FORCE_RESET = False

# Vector memory persisted under db_path (relative to the working directory),
# embedding with the MiniLM model created earlier.
manager_semantic = SimpleVectorMemory(
    db_path="vectorstores/semantic/chroma_semantic",
    embedding_model=embedding_model
)

# Populate the store from the cleaned train facts: the text comes from the
# 'scientific_fact' column, metadata from build_metadata_semantic.
manager_semantic.init_from_dataframe(
    df=df_train_subset,
    content_col='scientific_fact',
    id_prefix='semantic',
    metadata_func=build_metadata_semantic,
    reset_db=FORCE_RESET
)


🧹 [Sistema] Reset total solicitado. Apagando vectorstores/semantic/chroma_semantic...
📂 Carregando base de vetores de: vectorstores/semantic/chroma_semantic
✅ Base carregada com 3286 itens.


<__main__.SimpleVectorMemory at 0x18197685850>

# 4) Testing in one question

In [11]:
# Position of the validation question used for the smoke test below.
i = 0
# Number of memories to retrieve for that question.
k = 3
row = valid_df.iloc[i]

In [None]:
# Full question text (first element of what make_question returns).
q_text = make_question(row, inline=False)[0]
# Strip "(A) "-style option markers so only the bare text is used as query.
question_clean = re.sub(r'\([A-Z]\)\s*', '', row['question'])

# Same threshold as the default of the evaluation function.
items = manager_semantic.retrieve_memories(
    query=question_clean,
    k=k,
    threshold=0.33
)
# Scores are shown here for inspection only.
facts_semantic = format_facts_clean(items, show_scores=True)

print("Question: \n", q_text)
print("\nContexto: \n", facts_semantic)

# Retrieval metrics for this single question.
count, avg_sim, raw_sims = calcular_metricas_memoria(items)
print("\nMÃ©tricas das memÃ³rias recuperadas: ")
print("Count: ", count, " AVG_SIM: ", avg_sim, " Raw_SIM: ", raw_sims)


Question: 
 Juan and LaKeisha roll a few objects down a ramp. They want to see which object rolls the farthest. What should they do so they can repeat their investigation?
(A) Put the objects in groups.
(B) Change the height of the ramp.
(C) Choose different objects to roll.
(D) Record the details of the investigation.

Contexto:  
    Question: Which situation would be considered observation and measurement?
    Knowledge: Observation involves using senses or tools to gather information.


    Question: What should be done when the results of an experiment do not support the hypothesis?
    Knowledge: Repeating the experiment and checking for errors can gather more data to refine understanding of the phenomenon under investigation.


    Question: Which type of force requires contact between two objects for one to push or pull the other?
    Knowledge: When an object rolls on a surface, microscopic irregularities interact and cause friction.


Métricas das memórias recuperadas: 
Cou

In [None]:
import time

# Time the generation step. perf_counter() is a monotonic, high-resolution
# clock meant for measuring intervals; time.time() is a wall clock that can
# jump when the system time is adjusted.
a = time.perf_counter()
facts_clean = format_facts_clean(items, show_scores=False)
response_obj = rag_chain.invoke({
    "question": q_text,
    "similar_facts": facts_clean,
})
b = time.perf_counter()
response_text = response_obj.content if hasattr(response_obj, "content") else str(response_obj)

# Time the answer extraction separately (regex first, LLM judge as fallback).
c = time.perf_counter()
pred = extract_answer(
    small_llm_model=gpt5_nano,
    model_text_output=response_text,
    question=q_text,
    debug=True
)
d = time.perf_counter()

target = row['answerKey']
is_correct = (pred == target)

print("# Raw Response: \n", response_text)
print("# Alternative chosen: ", pred, f"(Correta: {target})")
print(f"# Resultado: {'CORRETO' if is_correct else 'ERRO'}")
print("----------------------------------------------")
print("Tempo para rodar extraction: ", round(d - c, 2))
print("Tempo para rodar invoke: ", round(b - a, 2))


✓ [REGEX] Padrão encontrado: '(?:Final|Correct)\s+Answer\s*[:\-]?\s*(?:is)?\s*(?:Option)?\s*[\(\[]?([A-H])[\)\]]?' -> D
# Raw Response: 
  
1. Reasoning: The correct answer is (D). To repeat an investigation, it's important to record all relevant details such as the materials used, the setup, and any changes made during the experiment. This will help ensure that the same conditions are replicated in future experiments for accurate comparison of results.
2. Principles: Recording experimental details helps maintain consistency and allows for replication by other scientists or researchers. It also aids in identifying potential sources of error and refining the experimental design.
3. Answer: (D) Record the details of the investigation.

# Alternative chosen:  D (Correta: D)
----------------------------------------------
Tempo para rodar extraction:  0.0
Tempo para rodar invoke:  1.31


# 5) Running

In [None]:
# VALIDATION
# Runs the full evaluation on the validation split; a partial-results CSV is
# written every 130 rows, derived from backup_path.
df_resultado_valid_semantic = avaliar_dataset_semantic(
    df=valid_df,
    chain=rag_chain,
    manager_semantic=manager_semantic,
    k_retrieval=3,
    threshold=0.33,
    backup_frequency=130,
    backup_path="../../results/rag_semantic_valid.csv",
    desc="ValidaÃ§Ã£o (Semantic RAG)",
)


Dataset Validação: 100%|██████████| 299/299 [09:10<00:00,  1.84s/it, acc=70.90%, avg_sim=0.6393, erros=6, n_mem=3]


In [None]:
# TEST
# Same settings as the validation run, on the test split; a partial-results
# CSV is written every 150 rows, derived from backup_path.
df_resultado_test_semantic = avaliar_dataset_semantic(
    df=test_df,
    chain=rag_chain,
    manager_semantic=manager_semantic,
    k_retrieval=3,
    threshold=0.33,
    backup_frequency=150,
    backup_path="../../results/rag_semantic_test.csv",
    desc="Teste (Semantic RAG)",
)

Dataset Teste: 100%|██████████| 1172/1172 [36:50<00:00,  1.89s/it, acc=69.28%, avg_sim=0.6920, erros=27, n_mem=3]


In [None]:
# Persist the final result tables.
# NOTE(review): these land in the current working directory, while the
# backup files above go to ../../results/ - confirm this is intentional.
df_resultado_test_semantic.to_csv("rag_semantic_test.csv", index=False)
df_resultado_valid_semantic.to_csv("rag_semantic_valid.csv", index=False)
