# Hybrid Memory System (Semantic + Reflection) with Cognitive Scoring + Fact Generation & Refinement

**Phases:**
- **Validation**: score updates (EMA), decay, **fact generation**
- **Test**: memory is frozen (no updates)

# 0) Setup

### A) Imports

In [1]:
# Standard library imports
import ast
import os
import re
import time
import sys

# Add parent directory to path for imports
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '..', '..')))
# Add vectorstore & prompts folders to path
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '..', 'vectorstore')))
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '..', 'prompts')))

# Third-party imports
import pandas as pd
from dotenv import load_dotenv
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# LangChain imports
from langchain_chroma import Chroma
from langchain_core.documents import Document
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_ollama import ChatOllama
from langchain_openai import ChatOpenAI

# Memory managers & metadata builders
from cognitive_memory_manager import CognitiveMemoryManager, build_metadata_semantic
from simple_vector_memory import SimpleVectorMemory, build_metadata_reflection

# Prompt templates
from templates import (
    HYBRID_TEMPLATE,
    SCORE_TEMPLATE,
    EXTRACT_FACTS_TEMPLATE,
    REFINE_FACT_TEMPLATE,
)

# Local imports
from utils_notebook import (
    SemanticCleaner,
    calcular_metricas_memoria,
    format_choices,
    parse_simple_score,
    make_question
)
cleaner = SemanticCleaner()

load_dotenv()

True

### B) Language Models and Datasets

In [2]:
# Answer labels of the four-option multiple-choice questions.
LABELS = ['A','B','C','D']
# Maps an answer label to its 0-based position; used to index choices['text'].
number = {'A':0,'B':1,'C':2,'D':3}

In [3]:
# Embedding model shared by the semantic and reflection memory managers.
embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2") 

In [4]:
# ARC-Challenge validation and test splits, loaded from pre-processed CSV files
valid_df = pd.read_csv("../../dataset/arc_challenge_valid_processed.csv")
test_df = pd.read_csv("../../dataset/arc_challenge_test_processed.csv")

In [5]:
# LLM judge: fallback answer extractor in extract_answer; also drives extract_facts_chain
gpt5_nano = ChatOpenAI(model='gpt-5-nano-2025-08-07',temperature=0)

In [6]:
# Deterministic "phi" instance (temperature 0): used by rag_chain and score_chain
phi2 = ChatOllama(model="phi", temperature=0)
phi2_creative = ChatOllama(model="phi", temperature=0.6)  # higher temperature; used by refine_fact_chain

### C) Prompts and Chains

In [7]:
# Answering chain: hybrid-context prompt -> deterministic phi model (returns the raw model message)
hybrid_prompt = PromptTemplate.from_template(HYBRID_TEMPLATE)
rag_chain = hybrid_prompt | phi2

# Scoring chain: rates a fact against a question outcome; output parsed to a plain string
score_prompt = PromptTemplate.from_template(SCORE_TEMPLATE)
score_chain = score_prompt | phi2 | StrOutputParser()

# Fact-extraction chain: runs on the OpenAI judge model
extract_facts_prompt = PromptTemplate.from_template(EXTRACT_FACTS_TEMPLATE)
extract_facts_chain = extract_facts_prompt | gpt5_nano | StrOutputParser()

# Fact-refinement chain: runs on the higher-temperature phi instance
refine_fact_prompt = PromptTemplate.from_template(REFINE_FACT_TEMPLATE)
refine_fact_chain = refine_fact_prompt | phi2_creative | StrOutputParser()

In [8]:
def call_organizer(fact, temperature=0.1):
    """Condense raw text into one cleaned scientific sentence.

    A fresh ``ChatOllama`` "phi" model is asked to extract a single objective
    sentence from ``fact``; the reply is stripped of surrounding whitespace
    and quotes and then passed through the module-level ``cleaner``.

    Args:
        fact: Raw text to condense (surrounding whitespace is removed before
            it is placed in the prompt).
        temperature: Sampling temperature for the phi model.

    Returns:
        The value returned by ``cleaner.clean`` for the model's sentence.
    """
    organizer_prompt = f"""### Instruction:
Extract ONLY one objective scientific sentence from the text below. 
NO introductions, NO explanations, NO conversational filler.

Text: "{fact.strip()}"
Scientific Fact (Write it in only one sentence):"""

    organizer_llm = ChatOllama(model="phi", temperature=temperature)
    reply = organizer_llm.invoke(organizer_prompt)

    # Peel whitespace first, then double quotes, then single quotes.
    sentence = reply.content.strip()
    sentence = sentence.strip('"')
    sentence = sentence.strip("'")

    return cleaner.clean(sentence)

# 1) Utility Functions

### A) Context Formatter

In [21]:
def format_hybrid_context(semantic_items, reflection_items, show_scores=False):
    """Render retrieved memories as the text context given to the answering prompt.

    Args:
        semantic_items: Retrieved semantic memories; each is a dict with a
            ``'metadata'`` dict (optional ``'question'``) and ``'content'``.
            When ``show_scores`` is true, ``'similarity'``,
            ``'memory_strength'`` and ``'final_score'`` are also read
            (each defaulting to 0).
        reflection_items: Retrieved reflection memories with the same
            ``'metadata'`` / ``'content'`` layout.
        show_scores: Append the three scores to each principle header.

    Returns:
        The formatted context, or a fixed placeholder sentence when both
        inputs are empty.
    """
    def _question_headline(item):
        # Only the first line of the stored question is shown (the stem).
        return item['metadata'].get('question', '').split('\n')[0].strip()

    sections = []

    if semantic_items:
        sections.append("### PART 1: SCIENTIFIC PRINCIPLES (THEORY)")
        for i, fact in enumerate(semantic_items, 1):
            q_text = _question_headline(fact)
            f_text = fact.get('content', '').strip()

            if not show_scores:
                sections.append(f"""
    * **Principle #{i}**
        * Context: "{q_text}"
        * Fact: "{f_text}"
""")
                continue

            sim = fact.get('similarity', 0)
            mem = fact.get('memory_strength', 0)
            final = fact.get('final_score', 0)
            sections.append(f"""
    * **Principle #{i}** (Sim: {sim:.2f}, Mem: {mem:.2f}, Final: {final:.2f})
        Context: "{q_text}"
        Fact: "{f_text}"
""")

    if reflection_items:
        sections.append("\n### PART 2: SOLVED EXAMPLES (PRACTICE)")
        for i, reflection in enumerate(reflection_items, 1):
            q_text = _question_headline(reflection)
            r_text = reflection.get('content', '').strip()
            sections.append(f"""
    * **Example Case #{i}**
        > Context: {q_text}
        > Analysis: {r_text}
""")

    if sections:
        return "\n".join(sections)
    return "No specific reference information found."

### B) Answer Extractor 

In [22]:
# TODO: Split this function into smaller functions, each responsible for one specific aspect of the formatting process.
# TODO: Move this function into a utils file.

import re
from typing import List, Optional

# High-confidence answer patterns, compiled once. Order matters: they are
# tried top to bottom and the first pattern yielding a valid label wins.
_STRONG_ANSWER_PATTERNS = tuple(
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        r"\\boxed\s*\{\s*([A-H])\s*\}",
        r"(?:Final|Correct)\s+Answer\s*[:\-]?\s*(?:is)?\s*(?:Option)?\s*[\(\[]([A-H])[\)\]](?![a-z])",
        r"The\s+(?:correct\s+)?(?:answer|option|choice)\s+is\s*(?:Option)?\s*[:\-]?\s*[\(\[]([A-H])[\)\]](?![a-z])",
        r"(?:Final|Correct)\s+Answer\s*[:\-]\s*(?:is\s+)?(?:Option\s+)?([A-H])(?=\s|\.|,|!|\?|$)(?![A-Za-z])",
        r"(?:Therefore|Thus|Hence|So),\s*(?:the\s+answer\s+is\s*)?(?:Option)?\s*[\(\[]([A-H])[\)\]](?![a-z])",
        r"(?:^|\n)\s*Answer\s*:\s*([A-H])(?=\s|$|\.|\,)(?![A-Za-z])",
        r"\*\*([A-H])\*\*(?![a-z])",
        r"^([A-H])$",
    )
)


def extract_answer(
    small_llm_model, 
    model_text_output: str, 
    valid_labels: Optional[List[str]] = None,
    debug: bool = False,
    question: Optional[str] = None,
) -> str:
    """
    Extract the chosen answer letter from a model's free-text output.

    High-confidence regex patterns are tried first; when none of them yields
    a label from ``valid_labels`` (or when the output looks like code) the
    decision is delegated to the LLM judge.

    Args:
        small_llm_model: Model object handed to ``_llm_judge`` for the fallback.
        model_text_output: Raw text produced by the answering model.
        valid_labels: Accepted labels; defaults to ``['A', 'B', 'C', 'D', 'E']``.
        debug: Print which path produced the answer.
        question: Optional question text forwarded to the judge as context.

    Returns:
        One label from ``valid_labels``, or ``"N/A"`` when the output is
        empty/blank or the judge cannot decide.
    """
    if valid_labels is None:
        valid_labels = ['A', 'B', 'C', 'D', 'E']

    # Blank output (including whitespace-only) carries no answer; return early
    # instead of spending a judge call on it.
    text = model_text_output.strip() if model_text_output else ""
    if not text:
        return "N/A"

    # Code-like output is not regex-parsed; the judge reads it instead.
    if "```python" in text or "def solution" in text:
        if debug: print("‚ö† [CODE DETECTED] Enviando para LLM Judge.")
        return _llm_judge(small_llm_model, text, valid_labels, debug, question)

    for pattern in _STRONG_ANSWER_PATTERNS:
        matches = list(pattern.finditer(text))
        if not matches:
            continue

        # Use the last occurrence of this pattern: conclusions come at the end.
        candidate = matches[-1].group(1).upper()
        if candidate in valid_labels:
            if debug:
                print(f"‚úì [REGEX] Padr√£o encontrado: '{pattern.pattern}' -> {candidate}")
            return candidate

    # No strong pattern produced a valid label: let the LLM read and decide.
    if debug: print("‚úó [REGEX] Falha nos padr√µes fortes. Chamando LLM Judge.")
    return _llm_judge(small_llm_model, text, valid_labels, debug, question)


def _llm_judge(
    small_llm_model, 
    text: str, 
    valid_labels: List[str],
    debug: bool = False,
    question: Optional[str] = None,
) -> str:
    """
    Ask an LLM judge which option a model output concluded on.

    The judge is instructed to read the reasoning and map it to the right
    letter even when the model never stated the letter explicitly.

    Args:
        small_llm_model: Object exposing ``invoke(prompt)``; the reply is read
            from its ``content`` attribute when present, else via ``str()``.
        text: The model output to analyse.
        valid_labels: Labels accepted as a result.
        debug: Print the inferred label or the error.
        question: Optional question (with options) added to the prompt.

    Returns:
        A label from ``valid_labels``, or ``"N/A"`` when the reply holds no
        valid standalone letter or the call raises.
    """
    
    # The context section is included only when a question was supplied.
    context_block = ""
    if question:
        context_block = f"""
### CONTEXT (Use this to infer the answer letter):
{question}
"""

    prompt = f"""You are an Answer Extraction Bot. 
Your ONLY job is to identify which option the following "Model Output" concluded is correct.

{context_block}

### Model Output to Analyze:
{text}

### Instructions:
1. Look for an explicit answer (e.g., "Answer: A").
2. If NO explicit letter is found, read the conclusion of the "Model Output" and match it against the Options in the Context. **Infer the letter.**
   - Example: If Context has "(A) 5 (B) 10" and Model Output says "The result is 10", you must output B.
3. Do NOT calculate or solve the problem yourself. Trust the "Model Output".
4. If the model refuses to answer or is unclear, return "E".

Output format: Just the single letter (A, B, C, D, or E). No other text."""

    try:
        response = small_llm_model.invoke(prompt) 
        output = response.content if hasattr(response, 'content') else str(response)

        # Only standalone letters count. Stripping every non A-E character
        # from the upper-cased reply would also pick up letters from inside
        # ordinary words ("because" -> ...E), producing bogus labels.
        letter_tokens = re.findall(r"(?<![A-Za-z'])[A-Ea-e](?![A-Za-z])", output.strip())
        # Prefer upper-case tokens: a lone lower-case "a" is usually the article.
        uppercase_tokens = [token for token in letter_tokens if token.isupper()]
        preferred = uppercase_tokens or letter_tokens
        clean_cand = preferred[-1].upper() if preferred else ""

        if clean_cand in valid_labels:
            if debug: print(f"‚úì [LLM JUIZ] Inferido: {clean_cand}")
            return clean_cand
            
        return "N/A"

    except Exception as e:
        # Best effort: any judge failure (network, bad reply type) means "no answer".
        if debug: print(f"‚úó [ERRO JUIZ] {e}")
        return "N/A"

### C) Check Fact Validation

In [23]:
def is_invalid(question, fact, original_fact, check_size=True, check_content=True, 
               check_similarity=True, min_similarity=0.2, return_reason=False):
    """
    Decide whether a generated scientific fact must be rejected.

    Checks run in order and the first failure wins: structure (word count,
    line breaks, ``#``/``_`` characters, equality with the original fact),
    forbidden terms, and finally TF-IDF cosine similarity between the fact
    and ``question`` as a guard against semantic drift.

    Args:
        question: Reference text the fact is compared against.
        fact: Candidate fact; anything empty or not a ``str`` is invalid.
        original_fact: Fact being replaced; a case-insensitive duplicate is
            rejected. ``None`` (or any non-string) skips that comparison.
        check_size: Enable the structural checks.
        check_content: Enable the forbidden-term check.
        check_similarity: Enable the similarity check.
        min_similarity: Minimum cosine similarity required to pass.
        return_reason: Return ``(invalid, reason)`` instead of a bare bool.

    Returns:
        ``True`` when the fact is invalid, ``False`` otherwise; with
        ``return_reason`` a ``(bool, reason_or_None)`` tuple.
    """
    def _reject(reason):
        return (True, reason) if return_reason else True

    if not fact or not isinstance(fact, str):
        return _reject("Fato vazio ou inv√°lido")

    fact_lower = fact.lower()
    word_count = len(fact.split())

    # 1. Structural checks
    if check_size:
        if word_count > 100:
            return _reject(f"Muito longo ({word_count} tokens)")
        if word_count < 10:
            return _reject(f"Muito curto ({word_count} tokens)")
        if '\n' in fact:
            return _reject("Cont√©m quebras de linha")
        if '#' in fact or '_' in fact:
            return _reject("Cont√©m caracteres especiais (#/_)")
        # Guarded: a missing original must not crash the validator.
        if isinstance(original_fact, str) and fact_lower == original_fact.lower():
            return _reject("Id√™ntico ao original")

    # 2. Content checks (substring match, so 'correct' also hits 'incorrect')
    if check_content:
        forbidden_terms = ['score', 'correct', 'answer', 'instruction:', 'critique:']
        for term in forbidden_terms:
            if term in fact_lower:
                return _reject(f"Cont√©m termo proibido: {term}")

    # 3. Similarity check
    if check_similarity:
        try:
            vectorizer = TfidfVectorizer(stop_words=None)
            tfidf = vectorizer.fit_transform([fact_lower, question.lower()])
            sim = cosine_similarity(tfidf[0:1], tfidf[1:2])[0][0]
        except Exception:
            # Any failure here (e.g. empty vocabulary, non-string question)
            # is treated as "cannot vouch for this fact".
            return _reject("Erro no c√°lculo de similaridade")

        if sim < min_similarity:
            return _reject(f"Baixa similaridade ({sim:.2f}). Poss√≠vel alucina√ß√£o.")

    return (False, None) if return_reason else False

### D) Score Facts

In [24]:
def extract_and_score_new_facts(question, response_text, is_correct, pred, target, row, manager):
    """
    Extract scientific facts from the model's answer, score them and store them.

    At most two facts are taken from the extraction chain's output (one per
    line); facts shorter than ten words after cleaning are dropped. Each
    remaining fact is scored with ``score_chain`` and all scored facts are
    added to ``manager`` in a single call.

    Args:
        question: Full question text.
        response_text: Raw answer produced by the answering model.
        is_correct: Whether ``pred`` matched ``target``.
        pred: Predicted label.
        target: Correct label.
        row: Dataset row; ``row['choices']`` (a literal string) and
            ``row['id']`` are read.
        manager: Memory manager exposing ``add_new_memories``.

    Returns:
        List[dict]: the scored facts; empty when nothing usable was extracted
        or when any unexpected error occurred.
    """
    def _option_text(option_table, label):
        # Text of the option behind a label; fall back to the label itself
        # when it is unknown or the option table is malformed.
        try:
            return option_table['text'][number[label]] if label in number else str(label)
        except (IndexError, KeyError):
            return str(label)

    try:
        # 1. Fact extraction
        extraction_output = extract_facts_chain.invoke({
            "model_response": response_text,
            "question": question
        })

        # One fact per non-blank line
        candidate_lines = [line.strip() for line in extraction_output.split('\n') if line.strip()]
        if not candidate_lines:
            return []

        # 2. Clean the first two candidates; keep those with at least 10 words
        usable_facts = []
        for raw_line in candidate_lines[:2]:
            tidy = cleaner.clean(raw_line)
            if tidy and len(tidy.split()) >= 10:
                usable_facts.append(tidy)

        if not usable_facts:
            return []

        # 3. Score every fact
        choices = ast.literal_eval(row['choices'])
        correct_txt = _option_text(choices, target)
        chosen_txt = _option_text(choices, pred)
        status = "SUCCESS" if is_correct else "FAILURE"

        scored_facts = []
        for fact in usable_facts:
            try:
                raw_score = score_chain.invoke({
                    "question": question,
                    "chosen": chosen_txt,
                    "correct": correct_txt,
                    "outcome_status": status,
                    "fact": fact,
                    "reasoning": response_text
                })
                reflection, score = parse_simple_score(raw_score)
                scored_facts.append({
                    'content': fact,
                    'score': score,
                    'reflection': reflection,
                    'original_id': str(row['id']),
                    'question': question,
                    'correct_answer': target
                })
            except Exception as e:
                # A fact that cannot be scored is skipped; the others continue.
                print(f"‚ö†Ô∏è Erro ao calcular score: {e}")
                continue

        # 4. Persist the scored facts in the vector store
        if scored_facts:
            manager.add_new_memories(
                [
                    {
                        'content': entry['content'],
                        'metadata': {
                            'original_id': entry['original_id'],
                            'question': entry['question'],
                            'correct_answer': entry['correct_answer'],
                            'score': entry['score'],
                            'frequency': 0,
                            'scientific_fact': entry['content'],
                            'origin': 'validation_generated',
                            'type': 'semantic_memory',
                            'initial_score': entry['score']
                        }
                    }
                    for entry in scored_facts
                ],
                id_prefix='generated'
            )

        return scored_facts

    except Exception as e:
        print(f"‚ö†Ô∏è Erro na extra√ß√£o de fatos: {e}")
        return []

### E) Fact Refinement

In [25]:
def refine_negative_facts(used_facts, is_correct, question, pred, target, row, manager, 
                          refine_threshold=0.3, verbose=False):
    """
    Rewrite weak retrieved facts that took part in a wrong answer.

    Only runs after a failure (``is_correct`` false). For every used fact
    whose ``memory_strength`` is below ``refine_threshold`` a critique is
    generated, then up to three refinement attempts are made; the first
    refined fact that passes ``is_invalid`` is stored through ``manager``.

    Args:
        used_facts: Retrieved memories (dicts with ``'content'`` and an
            optional ``'memory_strength'``).
        is_correct: Whether the question was answered correctly.
        question: Full question text.
        pred: Predicted (wrong) label.
        target: Correct label.
        row: Dataset row; ``row['choices']`` and ``row['id']`` are read.
        manager: Memory manager exposing ``add_new_memories``.
        refine_threshold: Facts scoring below this value are refined
            (default: 0.3).
        verbose: Print progress information.

    Returns:
        int: number of facts refined and stored.
    """
    # Nothing to do after a correct answer or when no fact was used.
    if is_correct or not used_facts:
        return 0

    def _option_text(option_table, label):
        # Text of the option behind a label; fall back to the label itself
        # when it is unknown or the option table is malformed.
        try:
            return option_table['text'][number[label]] if label in number else str(label)
        except (IndexError, KeyError):
            return str(label)

    refined_count = 0
    candidates_count = 0
    max_attempts = 3

    choices = ast.literal_eval(row['choices'])
    correct_txt = _option_text(choices, target)
    wrong_txt = _option_text(choices, pred)

    if verbose:
        print(f"\nüîç Analisando {len(used_facts)} fatos para refinamento (threshold: {refine_threshold})...")

    for memory in used_facts:
        fact_text = memory['content']
        memory_score = memory.get('memory_strength', 0.0)

        # Facts at or above the threshold are left untouched.
        if memory_score >= refine_threshold:
            if verbose:
                print(f"   ‚äó Fato ignorado (score {memory_score:.2f} >= {refine_threshold})")
            continue

        candidates_count += 1
        if verbose:
            print(f"   ‚úì Candidato #{candidates_count} (score: {memory_score:.2f})")

        try:
            # Critique of the fact; only the reflection text is used.
            critique_output = score_chain.invoke({
                "question": question,
                "chosen": wrong_txt,
                "correct": correct_txt,
                "outcome_status": "FAILURE",
                "fact": fact_text,
                "reasoning": f"Used fact led to wrong answer: {wrong_txt}"
            })
            reflection, _ = parse_simple_score(critique_output)

            # Refined facts are validated against question + correct option text.
            validation_reference = f"{question} {correct_txt}"

            for attempt in range(max_attempts):
                draft_fact = refine_fact_chain.invoke({
                    "question": question,
                    "correct_answer": correct_txt,
                    "old_fact": fact_text,
                    "reflection": reflection,
                    "wrong_answer": wrong_txt
                })
                clean_new_fact = call_organizer(draft_fact, temperature=0.1)

                rejected = is_invalid(question=validation_reference, fact=clean_new_fact,
                                      original_fact=fact_text, check_size=True,
                                      check_content=True, check_similarity=True)
                if rejected:
                    continue

                # Store the refined fact with a neutral starting score.
                manager.add_new_memories([{
                    'content': clean_new_fact,
                    'metadata': {
                        'original_id': str(row['id']),
                        'question': question,
                        'correct_answer': target,
                        'score': 0,
                        'frequency': 0,
                        'scientific_fact': clean_new_fact,
                        'origin': 'validation_refined',
                        'type': 'semantic_memory',
                        'initial_score': 0,
                        'refined_from': fact_text
                    }
                }], id_prefix='refined')

                refined_count += 1
                if verbose:
                    print(f"   ‚úÖ Fato refinado com sucesso (tentativa {attempt+1})")
                break
            else:
                # Every attempt was rejected by the validator.
                if verbose:
                    print(f"   ‚ùå Falha ap√≥s {max_attempts} tentativas")

        except Exception as e:
            if verbose:
                print(f"   ‚ö†Ô∏è Erro ao refinar fato: {e}")
            continue

    if verbose:
        print(f"üìä Resultado: {candidates_count} candidatos, {refined_count} refinados\n")

    return refined_count

### F) Evaluation

In [26]:
def avaliar_dataset_cognitive_plus(
    df, 
    chain=rag_chain,
    manager_semantic=None,
    manager_reflection=None,  
    score_chain=score_chain,
    k_semantic=3,
    k_reflection=3,  # number of reflection memories to retrieve
    threshold_semantic=0.24,
    threshold_reflection=0.24,  # retrieval threshold for reflection memories
    semantic_weight=0.7,
    update_memory=False,
    decay_batch_size=False,
    decay_frequency=50,
    extract_new_facts=False,
    refine_facts=False,
    refine_threshold=0.3,
    refine_verbose=False,
    backup_frequency=160,
    backup_path=None,
    desc="Hybrid Memory System PLUS"
):
    """
    Evaluate a dataset with the hybrid memory (semantic + reflection), with
    optional memory feedback, decay, fact generation and fact refinement.

    For every row: retrieve memories, build the hybrid context, ask ``chain``
    for an answer, extract the chosen label, then (depending on the flags)
    generate new facts, update the used memories, refine weak facts, run a
    decay cycle and write a partial-results backup. A failure while
    processing a row is reported and recorded as an error row; the run
    continues.

    Args:
        df: Dataset; ``question``, ``choices`` and ``answerKey`` columns are read.
        chain: Answering chain invoked with ``question`` and ``hybrid_context``.
        manager_semantic: Semantic memory manager (ranked retrieval, feedback,
            decay, insertion).
        manager_reflection: Optional reflection memory; skipped when falsy.
        score_chain: Chain used to score each retrieved fact for feedback.
        k_semantic: Number of semantic memories to retrieve.
        k_reflection: Number of reflection memories to retrieve.
        threshold_semantic: Retrieval threshold for semantic memories.
        threshold_reflection: Retrieval threshold for reflection memories.
        semantic_weight: Weight forwarded to the ranked semantic retrieval.
        update_memory: Apply feedback scores to used memories and enable decay.
        decay_batch_size: When truthy (with ``update_memory``), run one extra
            decay cycle after the last row.
        decay_frequency: Run a decay cycle every this many rows (0 disables).
        extract_new_facts: Generate and store new facts from each answer.
        refine_facts: Refine weak facts after a wrong answer.
        refine_threshold: Threshold forwarded to ``refine_negative_facts``.
        refine_verbose: Verbosity forwarded to ``refine_negative_facts``.
        backup_frequency: Write a backup every this many rows (0 disables).
        backup_path: CSV path from which the backup file name is derived;
            ``None`` disables backups.
        desc: Progress-bar label, also stored in the ``source`` column.

    Returns:
        pandas.DataFrame with one row per processed question (error rows
        carry an ``error`` column).
    """
    resultados = []
    acertos = 0
    total = len(df)
    decay_counter = 0
    backup_counter = 0
    
    # Generation / refinement counters
    total_facts_generated = 0
    total_facts_refined = 0
    total_facts_errors = 0
    
    # Answers for which no valid label could be extracted
    erros = 0

    # Make sure the backup directory exists
    if backup_path:
        backup_dir = os.path.dirname(backup_path)
        if backup_dir:
            os.makedirs(backup_dir, exist_ok=True)
    
    loop = tqdm(df.iterrows(), total=total, desc=desc)

    # ``position`` is the 1-based count of processed rows. It drives the
    # decay/backup cadence and the running accuracy, so those do not depend
    # on the DataFrame's index labels or on tqdm's lazily updated counter.
    for position, (idx, row) in enumerate(loop, start=1):
        try:
            full_question = make_question(row, inline=False)[0]
            question_for_retriever = re.sub(r'\([A-Z]\)\s*', '', row['question'])
            
            # Semantic memories (ranked with the cognitive score)
            items_semantic, _ = manager_semantic.retrieve_ranked_memories(
                query=question_for_retriever, 
                k=k_semantic,
                threshold=threshold_semantic,
                semantic_weight=semantic_weight,
                show_scores=False
            )
            
            # Reflection memories (plain retrieval, no cognitive score)
            items_reflection = []
            if manager_reflection:
                items_reflection = manager_reflection.retrieve_memories(
                    query=question_for_retriever, 
                    k=k_reflection,
                    threshold=threshold_reflection
                )
            
            # Merge both memory types into a single context
            hybrid_context_formatted = format_hybrid_context(items_semantic, items_reflection)

            count_sem, avg_sim_sem, _ = calcular_metricas_memoria(items_semantic)
            count_ref, avg_sim_ref, _ = calcular_metricas_memoria(items_reflection)

            response_obj = chain.invoke({
                "question": full_question,
                "hybrid_context": hybrid_context_formatted,
            })

            response_text = response_obj.content if hasattr(response_obj, "content") else str(response_obj)
            
            pred = extract_answer(
                small_llm_model=gpt5_nano, 
                model_text_output=response_text,
                question=full_question,
            )

            # Anything outside A-D counts as "no answer" and is tracked as an error.
            if pred not in ['A', 'B', 'C', 'D']:
                pred = 'E'
                erros += 1
            
            target = row['answerKey']

            is_correct = (pred == target)
            acertos += int(is_correct)
            
            # === NEW FACT EXTRACTION ===
            if extract_new_facts and pred != 'E':
                try:
                    new_facts = extract_and_score_new_facts(
                        question=full_question,
                        response_text=response_text,
                        is_correct=is_correct,
                        pred=pred,
                        target=target,
                        row=row,
                        manager=manager_semantic
                    )
                    if new_facts:
                        total_facts_generated += len(new_facts)
                    else:
                        total_facts_errors += 1
                except Exception as e:
                    total_facts_errors += 1
                    tqdm.write(f"‚ùå Q{position}: Erro na extra√ß√£o de fatos: {e}")
            
            # === FEEDBACK ON EXISTING MEMORIES ===
            if update_memory and items_semantic and pred != 'E':
                used_ids = [m['doc_id'] for m in items_semantic]
                
                feedback_scores = []
                choices = ast.literal_eval(row['choices'])
                
                try:
                    correct_txt = choices['text'][number[target]] if target in number else str(target)
                except (IndexError, KeyError):
                    correct_txt = str(target)
                
                try:
                    chosen_txt = choices['text'][number[pred]] if pred in number else str(pred)
                except (IndexError, KeyError):
                    chosen_txt = str(pred)
                
                status = "SUCCESS" if is_correct else "FAILURE"
                
                for memory in items_semantic:
                    fact_text = memory['content']
                    
                    try:
                        score_response = score_chain.invoke({
                            "question": full_question,
                            "chosen": chosen_txt,
                            "correct": correct_txt,
                            "outcome_status": status,
                            "fact": fact_text,
                            "reasoning": response_text
                        })
                        
                        _, score = parse_simple_score(score_response)
                        feedback_scores.append(score)
                    except Exception:
                        # A scoring failure counts as neutral feedback so the
                        # scores stay aligned with used_ids.
                        feedback_scores.append(0)
                
                # Apply the feedback update to the used memories
                manager_semantic.update_memories_feedback(used_ids, feedback_scores)
            
            # === FACT REFINEMENT ===
            if refine_facts and not is_correct:
                refined = refine_negative_facts(
                    used_facts=items_semantic,
                    is_correct=is_correct,
                    question=full_question,
                    pred=pred,
                    target=target,
                    row=row,
                    manager=manager_semantic,
                    refine_threshold=refine_threshold,
                    verbose=refine_verbose
                )
                total_facts_refined += refined
            
            # === DECAY ===
            if update_memory and decay_frequency > 0 and position % decay_frequency == 0:
                decay_counter += 1
                tqdm.write(f"üåô [Q{position}/{total}] Aplicando ciclo de decay #{decay_counter}...")
                manager_semantic.apply_decay_cycle(decay_threshold=0.0)

            # Record the row before backing up so the backup includes it.
            resultados.append({
                'index': idx,
                'question': full_question,
                'retrieved_context': hybrid_context_formatted,
                'retrieved_count_semantic': count_sem,
                'retrieved_count_reflection': count_ref,
                'avg_similarity_semantic': avg_sim_sem,
                'avg_similarity_reflection': avg_sim_ref,
                'raw_output': response_text,
                'pred': pred,
                'target': target,
                'is_correct': is_correct,
                'source': desc,
                'erros': erros
            })

            # === BACKUP ===
            if backup_path and backup_frequency > 0 and position % backup_frequency == 0:
                backup_file = backup_path.replace('.csv', '_backup.csv')
                try:
                    pd.DataFrame(resultados).to_csv(backup_file, index=False)
                    backup_counter += 1
                except OSError as e:
                    # A failed backup must not discard or duplicate the row
                    # that was just recorded.
                    tqdm.write(f"Falha ao salvar backup em {backup_file}: {e}")
            
            # Refresh the progress bar
            acc_atual = (acertos / position) * 100
            loop.set_postfix(
                acc=f"{acc_atual:.2f}%", 
                sem=count_sem,
                ref=count_ref,
                decay=decay_counter,
                gen=total_facts_generated,
                refine=total_facts_refined,
                err=total_facts_errors,
                bkp=backup_counter,
                erros=erros
            )

        except Exception as e:
            # Keep going: the failing row is recorded as an error row.
            tqdm.write(f"Erro no √≠ndice {idx}: {e}")
            resultados.append({
                'index': idx, 
                'error': str(e), 
                'is_correct': False, 
                'retrieved_count_semantic': 0,
                'retrieved_count_reflection': 0, 
                'avg_similarity_semantic': 0.0,
                'avg_similarity_reflection': 0.0,
            })
    
    # === FINAL DECAY ===
    if update_memory and decay_batch_size:
        decay_counter += 1
        print(f"\nüåô [FINAL] Aplicando ciclo de decay final (#{decay_counter})...")
        manager_semantic.apply_decay_cycle(decay_threshold=0.0)
    
    # Final summary
    if update_memory:
        print(f"\nüìä SUM√ÅRIO DE MEM√ìRIA:")
        print(f"   Ciclos de decay: {decay_counter}")
        print(f"   Fatos gerados: {total_facts_generated}")
        print(f"   Fatos refinados: {total_facts_refined}")
        print(f"   Erros na extra√ß√£o: {total_facts_errors}")
        print(f"   Mem√≥rias sem√¢nticas finais: {manager_semantic._collection.count()}")
        if manager_reflection:
            print(f"   Mem√≥rias reflexivas: {manager_reflection._collection.count()}")
    
    if backup_path and backup_counter > 0:
        print(f"üíæ Total de backups salvos: {backup_counter}")

    return pd.DataFrame(resultados)

# 2) Construção dos Bancos de Dados

In [27]:
# Prepare the data: missing cells are replaced by the "N/A" sentinel
scientific_facts = pd.read_csv("../../scientific_facts_expanded.csv").fillna("N/A")

# One frame per memory type: drop rows without content, then de-duplicate on the content column
df_semantic = scientific_facts[scientific_facts['scientific_fact'] != "N/A"].drop_duplicates(subset=['scientific_fact'])
df_reflection = scientific_facts[scientific_facts['clean_reasoning'] != "N/A"].drop_duplicates(subset=['clean_reasoning'])

In [28]:
# Passed as reset_db to both managers below
FORCE_RESET = False

# Semantic memory: scientific facts, managed with cognitive scoring parameters
manager_semantic = CognitiveMemoryManager(
    db_path="vectorstores/hybrid_refinement_var/chroma_semantic",
    embedding_model=embedding_model, alpha=0.1, decay_lambda=0.99, forget_threshold=-0.6
).init_from_dataframe(df=df_semantic, content_col='scientific_fact',
                      id_prefix='semantic', metadata_func=build_metadata_semantic, reset_db=FORCE_RESET)

# Reflection memory: cleaned reasoning texts in a plain vector store
manager_reflection = SimpleVectorMemory(
    db_path="vectorstores/semantic_refinement_cognitive_aaa/chroma_reflection",
    embedding_model=embedding_model
).init_from_dataframe(df=df_reflection, content_col='clean_reasoning',
                      id_prefix='reflection', metadata_func=build_metadata_reflection, reset_db=FORCE_RESET)

📂 Carregando memória existente de: vectorstores/hybrid_refinement_var/chroma_semantic
✅ Memória carregada com 3993 itens.
📂 Carregando base de reflexão de: vectorstores/semantic_refinement_cognitive_aaa/chroma_reflection
✅ Base carregada com 1080 itens.


# 3) Testing on a Single Question

In [29]:
# Pick one test question (by position) and the number of memories to retrieve per type
i = 0
k_semantic = 3
k_reflection = 3
row = test_df.iloc[i]

In [30]:
q_text = make_question(row, inline=False)[0]
# Remove "(A) "-style option markers before using the text as a retrieval query
question_clean = re.sub(r'\([A-Z]\)\s*', '', q_text)

# Retrieve semantic memories (ranked with the cognitive score)
items_semantic, _ = manager_semantic.retrieve_ranked_memories(
    query=question_clean, 
    k=k_semantic, 
    threshold=0.33,
    semantic_weight=0.5,
    show_scores=True
)

# Retrieve reflection memories (plain retrieval)
items_reflection = manager_reflection.retrieve_memories(
    query=question_clean, 
    k=k_reflection,
    threshold=0.33
)

# Merge both memory types into one context (scores shown for inspection)
hybrid_context = format_hybrid_context(items_semantic, items_reflection, show_scores=True)

print("Question: \n", q_text)
print("\nHybrid Context: \n", hybrid_context)

# Retrieval metrics for each memory type
count_sem, avg_sim_sem, raw_sims_sem = calcular_metricas_memoria(items_semantic)
count_ref, avg_sim_ref, raw_sims_ref = calcular_metricas_memoria(items_reflection)
print("\nM√©tricas Sem√¢nticas: Count:", count_sem, "AVG_SIM:", avg_sim_sem)
print("M√©tricas Reflexivas: Count:", count_ref, "AVG_SIM:", avg_sim_ref)

Question: 
 An astronomer observes that a planet rotates faster after a meteorite impact. Which is the most likely effect of this increase in rotation?
(A) Planetary density will decrease.
(B) Planetary years will become longer.
(C) Planetary days will become shorter.
(D) Planetary gravity will become stronger.

Hybrid Context: 
 ### PART 1: SCIENTIFIC PRINCIPLES (THEORY)

    * **Principle #1** (Sim: 0.53, Mem: 1.00, Final: 0.77)
        Context: "Why is it winter in North America when it is summer in South America?"
        Fact: "Axial precession causes the Earth's axis to change its orientation relative to its orbit around the sun over a period of approximately 26,000 years."


    * **Principle #2** (Sim: 0.50, Mem: 1.00, Final: 0.75)
        Context: "Michael learned that the movement of Earth in the solar system causes changes that can be seen on the planet. Which change could be seen on Earth in the time it takes Earth to rotate once on its axis?"
        Fact: "This apparent m

In [31]:
import time  # also imported in the setup cell; repeated so this cell can run on its own

# Measure elapsed time with the monotonic high-resolution clock: time.time()
# follows the wall clock and can jump if the system clock is adjusted.
a = time.perf_counter()
# Use the hybrid context without score annotations for the actual answer call.
hybrid_context_clean = format_hybrid_context(items_semantic, items_reflection, show_scores=False)
response_obj = rag_chain.invoke({
    "question": q_text,
    "hybrid_context": hybrid_context_clean,  # prompt variable is `hybrid_context` (formerly `similar_facts`)
})
print("Time to answer: ", round(time.perf_counter() - a, 2))
# The chain may return a message-like object with `.content`; otherwise fall back to its string form.
response_text = response_obj.content if hasattr(response_obj, "content") else str(response_obj)

# Extract the chosen alternative from the raw model output.
pred = extract_answer(
    small_llm_model=gpt5_nano,
    model_text_output=response_text,
    question=q_text,
    debug=True
)

# Compare the prediction with the gold answer for this row.
target = row['answerKey']
is_correct = (pred == target)

print("# Raw Response: \n", response_text)
print(f"# Alternative chosen: {pred} (Correta: {target})")
print(f"# Resultado: {'CORRETO' if is_correct else 'ERRO'}")

ConnectError: [WinError 10061] Nenhuma conexão pôde ser feita porque a máquina de destino as recusou ativamente

In [None]:
# Fact-extraction smoke test: derive new facts from the answer produced above.
new_facts = extract_and_score_new_facts(
    question=q_text,
    row=row,
    response_text=response_text,
    pred=pred,
    target=target,
    is_correct=is_correct,
    manager=manager_semantic,
)

print(f"\n‚ú® Fatos extra√≠dos: {len(new_facts)}")
# Each fact is indexed by its 'score' and 'content' keys; numbering starts at 1.
for position, entry in enumerate(new_facts, start=1):
    print(f"\nFato {position} (Score: {entry['score']}): {entry['content']}")


‚ú® Fatos extra√≠dos: 2

Fato 1 (Score: 0): The length of a planet's day is determined by its rotation rate; faster rotation yields a shorter day.

Fato 2 (Score: 0): Changes in rotation rate occur when external torques act on a body, altering the length of its day.


In [None]:
# Fact-refinement smoke test: refinement only runs when the answer was wrong.
if is_correct:
    print("\n‚úì Resposta correta - refinamento n√£o aplicado")
else:
    print("\n" + "="*60)
    print("üîß TESTE DE REFINAMENTO DE FATOS")
    print("="*60)

    refined_count = refine_negative_facts(
        used_facts=items_semantic,
        question=q_text,
        row=row,
        pred=pred,
        target=target,
        is_correct=is_correct,
        manager=manager_semantic,
        refine_threshold=0.3,  # refine facts whose score is below 0.3
        verbose=True,  # print detailed logs
    )

    print(f"\n‚ú® Total de fatos refinados: {refined_count}")


‚úì Resposta correta - refinamento n√£o aplicado


# 4) Running

In [None]:
# VALIDATION: active memory + fact generation/refinement + reflection retrieval.
# NOTE(review): decay_batch_size receives a bool while decay_frequency=100, yet
# the logged run shows a decay cycle roughly every 33 questions — confirm how
# these two parameters interact inside avaliar_dataset_cognitive_plus.
df_resultado_valid_semantic_plus = avaliar_dataset_cognitive_plus(
    # Data, answering chain and progress-bar label
    df=valid_df,
    chain=rag_chain,
    desc="Valida√ß√£o (Hybrid Memory + Gen/Refine)",
    # Memory stores and retrieval settings
    manager_semantic=manager_semantic,
    manager_reflection=manager_reflection,
    k_semantic=3,
    k_reflection=3,
    semantic_weight=0.7,
    threshold_reflection=0.24,
    # Memory updates during the run
    update_memory=True,
    extract_new_facts=True,
    refine_facts=True,
    refine_threshold=0.3,
    decay_batch_size=True,
    decay_frequency=100,
    # Periodic result backups
    backup_frequency=130,
    backup_path="../../results/hybrid_refinement_cognitive_valid.csv",
)

Valida√ß√£o (Hybrid Memory + Gen/Refine):  11%|‚ñà         | 32/299 [12:03<1:39:26, 22.35s/it, acc=78.12%, bkp=0, decay=0, err=2, erros=1, gen=48, ref=3, refine=0, sem=3] 

üåô [Q33/299] Aplicando ciclo de decay #1...


Valida√ß√£o (Hybrid Memory + Gen/Refine):  11%|‚ñà         | 33/299 [12:03<1:50:13, 24.86s/it, acc=75.76%, bkp=0, decay=1, err=2, erros=1, gen=50, ref=3, refine=0, sem=3]

üóëÔ∏è Ciclo de Limpeza: 792 mem√≥rias esquecidas (Score < -0.6).


Valida√ß√£o (Hybrid Memory + Gen/Refine):  22%|‚ñà‚ñà‚ñè       | 65/299 [27:34<1:47:35, 27.59s/it, acc=83.33%, bkp=0, decay=2, err=8, erros=1, gen=94, ref=3, refine=1, sem=3]

üåô [Q66/299] Aplicando ciclo de decay #2...
üóëÔ∏è Ciclo de Limpeza: 12 mem√≥rias esquecidas (Score < -0.6).


Valida√ß√£o (Hybrid Memory + Gen/Refine):  33%|‚ñà‚ñà‚ñà‚ñé      | 98/299 [46:39<1:30:25, 26.99s/it, acc=78.79%, bkp=0, decay=3, err=13, erros=1, gen=143, ref=3, refine=2, sem=3] 

üåô [Q99/299] Aplicando ciclo de decay #3...
üóëÔ∏è Ciclo de Limpeza: 14 mem√≥rias esquecidas (Score < -0.6).


Valida√ß√£o (Hybrid Memory + Gen/Refine):  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 131/299 [1:02:33<1:09:46, 24.92s/it, acc=78.03%, bkp=1, decay=4, err=14, erros=1, gen=194, ref=3, refine=3, sem=3]

üåô [Q132/299] Aplicando ciclo de decay #4...
üóëÔ∏è Ciclo de Limpeza: 15 mem√≥rias esquecidas (Score < -0.6).


Valida√ß√£o (Hybrid Memory + Gen/Refine):  55%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç    | 164/299 [1:18:49<32:20, 14.38s/it, acc=79.39%, bkp=1, decay=5, err=18, erros=1, gen=239, ref=3, refine=3, sem=3]   

üåô [Q165/299] Aplicando ciclo de decay #5...
üóëÔ∏è Ciclo de Limpeza: 10 mem√≥rias esquecidas (Score < -0.6).


Valida√ß√£o (Hybrid Memory + Gen/Refine):  66%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå   | 197/299 [1:30:44<32:32, 19.14s/it, acc=79.29%, bkp=1, decay=6, err=20, erros=1, gen=294, ref=3, refine=3, sem=3]  

üåô [Q198/299] Aplicando ciclo de decay #6...
üóëÔ∏è Ciclo de Limpeza: 13 mem√≥rias esquecidas (Score < -0.6).


Valida√ß√£o (Hybrid Memory + Gen/Refine):  77%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã  | 230/299 [1:41:19<28:09, 24.48s/it, acc=78.35%, bkp=1, decay=7, err=27, erros=2, gen=335, ref=3, refine=4, sem=3]

üåô [Q231/299] Aplicando ciclo de decay #7...
üóëÔ∏è Ciclo de Limpeza: 10 mem√≥rias esquecidas (Score < -0.6).


Valida√ß√£o (Hybrid Memory + Gen/Refine):  88%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä | 264/299 [1:52:01<12:39, 21.70s/it, acc=78.79%, bkp=2, decay=8, err=28, erros=2, gen=386, ref=3, refine=4, sem=3]

üåô [Q264/299] Aplicando ciclo de decay #8...
üóëÔ∏è Ciclo de Limpeza: 10 mem√≥rias esquecidas (Score < -0.6).


Valida√ß√£o (Hybrid Memory + Gen/Refine):  99%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ| 296/299 [2:05:06<01:02, 20.69s/it, acc=78.38%, bkp=2, decay=8, err=32, erros=2, gen=431, ref=3, refine=4, sem=3]

üåô [Q297/299] Aplicando ciclo de decay #9...


Valida√ß√£o (Hybrid Memory + Gen/Refine):  99%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ| 297/299 [2:05:06<00:41, 20.73s/it, acc=78.45%, bkp=2, decay=9, err=32, erros=2, gen=433, ref=3, refine=4, sem=3]

üóëÔ∏è Ciclo de Limpeza: 19 mem√≥rias esquecidas (Score < -0.6).


Valida√ß√£o (Hybrid Memory + Gen/Refine): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 299/299 [2:05:42<00:00, 25.23s/it, acc=78.26%, bkp=2, decay=9, err=33, erros=2, gen=435, ref=3, refine=4, sem=3]


🌙 [FINAL] Aplicando ciclo de decay final (#10)...

📊 SUMÁRIO DE MEMÓRIA:
   Ciclos de decay: 10
   Fatos gerados: 435
   Fatos refinados: 4
   Erros na extração: 33
   Memórias semânticas finais: 3559
   Memórias reflexivas: 1080
💾 Total de backups salvos: 2





In [68]:
# Persist the validation results as CSV (relative path, so it lands in the
# notebook's working directory); the DataFrame index is not written.
df_resultado_valid_semantic_plus.to_csv("hybrid_refinement_cognitive_valid_var.csv", index=False)

In [69]:
# TEST: frozen memory + reflection retrieval (retrieval only, no memory writes).
df_resultado_test_semantic_plus07 = avaliar_dataset_cognitive_plus(
    # Data, chains and progress-bar label
    df=test_df,
    chain=rag_chain,
    score_chain=score_chain,
    desc="Teste (Hybrid Memory Frozen)",
    # Memory stores and retrieval settings
    manager_semantic=manager_semantic,
    manager_reflection=manager_reflection,  # reflection memory store
    k_semantic=3,
    k_reflection=3,  # number of reflections to retrieve
    semantic_weight=0.7,
    threshold_reflection=0.24,  # threshold applied to reflections
    # Every memory-mutating option is switched off for the test split
    update_memory=False,
    decay_batch_size=False,
    extract_new_facts=False,
    refine_facts=False,
    # Periodic result backups
    backup_frequency=150,
    backup_path="../../results/hybrid_refinement_cognitive_test.csv",
)

Teste (Hybrid Memory Frozen): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1172/1172 [46:43<00:00,  2.39s/it, acc=73.12%, bkp=7, decay=0, err=0, erros=0, gen=0, ref=3, refine=0, sem=3] 

üíæ Total de backups salvos: 7





In [78]:
# Persist the test results as CSV (relative path, so it lands in the
# notebook's working directory); the DataFrame index is not written.
df_resultado_test_semantic_plus07.to_csv("hybrid_refinement_cognitive_test_var_07.csv", index=False)

In [76]:
# Mean of the `is_correct` column, displayed as the cell output
# (presumably the overall test accuracy — assumes the column is boolean/0-1).
df_resultado_test_semantic_plus07['is_correct'].mean()

np.float64(0.7312286689419796)