In [7]:
# 03 - Evaluación del Sistema RAG (CON VISUALIZACIÓN DE RESPUESTAS Y GENERACIÓN)
# =============================================================================

# ===================================
# 1. Setup y carga de dependencias
# ===================================

import os
import sys
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from functools import lru_cache
from openai import AzureOpenAI
from dotenv import load_dotenv
import time

# Load environment variables (python-dotenv) before any os.getenv call below.
load_dotenv()

# Sanity check: prints only whether the key is set, never the key itself.
print("üîë Clave cargada:", bool(os.getenv("AZURE_OPENAI_API_KEY")))

# Azure OpenAI connection. Only the API key comes from the environment;
# the API version and the endpoint are hard-coded here.
client = AzureOpenAI(
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    api_version="2024-12-01-preview",
    azure_endpoint="https://pnl-maestria.openai.azure.com/"
)

# The project root is the parent of the current working directory.
# <project_root>/src is appended to sys.path once; presumably this is where
# the `utils` module imported further down lives — verify against the repo.
project_root = os.path.abspath("..")
sys_path = os.path.join(project_root, "src")
if sys_path not in sys.path:
    sys.path.append(sys_path)

print("‚úÖ Project root:", project_root)

# ===================================
# 2. Cargar datos e índice TF-IDF
# ===================================

from utils import load_chunks_from_folder
from sklearn.feature_extraction.text import TfidfVectorizer

BASE_PREPROCESSED = os.path.join(project_root, "data", "preprocessed")

# Collect every "processed_*" directory under the preprocessed data folder,
# in sorted order. Plain files that merely share the prefix are skipped,
# because each entry is handed to load_chunks_from_folder as a folder path.
folders = sorted(
    path
    for path in (
        os.path.join(BASE_PREPROCESSED, f)
        for f in os.listdir(BASE_PREPROCESSED)
        if f.startswith("processed_")
    )
    if os.path.isdir(path)
)

# Flatten the chunk records of all folders into a single list.
records = []
for folder in folders:
    recs = load_chunks_from_folder(folder)
    records.extend(recs)

# Fail early with a clear message: an empty record list would otherwise
# surface as an opaque KeyError on df["text"] just below.
if not records:
    raise ValueError(f"No se encontraron chunks en: {BASE_PREPROCESSED}")

df = pd.DataFrame.from_records(records)
# Every record is expected to carry a "text" field; it is coerced to str so
# the vectorizer only ever sees strings.
documents = df["text"].astype(str).tolist()

print(f"üìÑ Total chunks cargados: {len(documents)}")

# TF-IDF index over all chunks: English stop words removed, vocabulary capped
# at 5000 features. X has one row per chunk, in the same order as `documents`.
vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)
X = vectorizer.fit_transform(documents)

print(f"‚úÖ √çndice TF-IDF recreado: {X.shape[0]} documentos, {X.shape[1]} features")

# ===================================
# 3. Funciones de evaluación
# ===================================

@lru_cache(maxsize=128)
def get_similarities_cached(query, k=5):
    """Return the k best-matching chunk indices and their scores for *query*.

    The query is vectorized with the module-level ``vectorizer`` and compared
    against every row of the module-level matrix ``X`` by cosine similarity.
    Results are memoized per ``(query, k)``; the cache is keyed on the
    arguments only, so it is not refreshed if ``vectorizer`` or ``X`` are
    rebuilt while this function object stays alive.

    Args:
        query: Query string (must be hashable for the cache).
        k: Number of top results to keep.

    Returns:
        A pair of tuples ``(indices, scores)``, ordered from the highest
        similarity to the lowest.
    """
    scores = cosine_similarity(vectorizer.transform([query]), X).flatten()
    # argsort is ascending, so reverse it to rank the best matches first.
    ranking = np.argsort(scores)[::-1]
    best = ranking[:k]
    return tuple(best), tuple(scores[best])


def recall_at_k(top_k_indices, relevant_indices):
    """Return the fraction of relevant items that appear among the retrieved ones.

    Both inputs are treated as sets of ids, so a relevant id listed more than
    once counts once in the numerator and in the denominator. Converting to a
    set before the emptiness check also lets array-like inputs through, whose
    truth value would otherwise be ambiguous.

    Args:
        top_k_indices: Iterable of retrieved item ids.
        relevant_indices: Iterable of relevant item ids, or None.

    Returns:
        Recall in [0.0, 1.0]; 0.0 when there are no relevant items.
    """
    if relevant_indices is None:
        return 0.0
    relevant = set(relevant_indices)
    if not relevant:
        return 0.0
    hits = len(relevant.intersection(top_k_indices))
    return hits / len(relevant)


def precision_at_k(top_k_indices, relevant_indices, k=5):
    """Return the fraction of the top-k retrieved items that are relevant.

    Only the first ``k`` entries of ``top_k_indices`` are considered, so
    passing a longer ranking can never push the result above 1.0. The
    denominator is always ``k``, even when fewer than ``k`` items were
    retrieved.

    Args:
        top_k_indices: Retrieved item ids, best match first.
        relevant_indices: Iterable of relevant item ids, or None.
        k: Cut-off rank.

    Returns:
        Precision in [0.0, 1.0]; 0.0 when there are no relevant items or
        when ``k`` is not positive (precision is undefined there).
    """
    if relevant_indices is None or k <= 0:
        return 0.0
    relevant = set(relevant_indices)
    if not relevant:
        return 0.0
    # Materialize first so any iterable is accepted, then cut at rank k.
    considered = list(top_k_indices)[:k]
    hits = len(relevant.intersection(considered))
    return hits / k


def average_context_size(retrieved_indices, documents):
    """Return the mean length, in whitespace-separated words, of the retrieved chunks.

    Args:
        retrieved_indices: Positions into ``documents`` of the retrieved chunks.
        documents: Sequence of chunk texts.

    Returns:
        The mean word count, or 0 when nothing was retrieved.
    """
    selected = [documents[idx] for idx in retrieved_indices]
    if not selected:
        return 0
    word_counts = [len(text.split()) for text in selected]
    return np.mean(word_counts)


def search_tfidf(query, k=5):
    """Retrieve the top-k chunks for *query* as plain lists.

    Thin wrapper over ``get_similarities_cached``: the cached result is a
    pair of tuples, which is copied into new lists before being returned.

    Args:
        query: Query string.
        k: Number of results to return.

    Returns:
        ``(indices, scores)`` as two lists, best match first.
    """
    cached_indices, cached_scores = get_similarities_cached(query, k)
    return [*cached_indices], [*cached_scores]


# ===================================
# 4. Ground truth
# ===================================

def find_relevant_chunks_fast(keyword_list, documents, max_chunks=3):
    """Return the indices of the first chunks that mention any of the keywords.

    Matching is a case-insensitive substring test, and documents are scanned
    in order; the scan stops as soon as ``max_chunks`` matches were collected.

    Args:
        keyword_list: Keywords to look for.
        documents: Sequence of chunk texts.
        max_chunks: Maximum number of indices to return.

    Returns:
        List of matching document indices, in ascending order.
    """
    needles = [kw.lower() for kw in keyword_list]

    def _mentions_any(text):
        lowered = text.lower()
        for needle in needles:
            if needle in lowered:
                return True
        return False

    found = []
    for position, text in enumerate(documents):
        if not len(found) < max_chunks:
            break
        if _mentions_any(text):
            found.append(position)
    return found


# ===================================
# 5. Definir consultas
# ===================================

# Evaluation questions. The two lists below are parallel: queries[i] is
# labelled using keywords_per_query[i].
queries = [
    "Who saves Bella from the van?",
    "Which Cullen family member is a doctor?",
]

# Keywords used to auto-label the relevant chunks of the query at the same position.
keywords_per_query = [
    ["edward", "van", "save"],
    ["carlisle", "doctor"],
]

print(f"\nüîç Consultas definidas: {len(queries)}")
print("\nü§ñ Generando ground truth autom√°tico...")
# Automatic ground truth: for each keyword list, keep the indices of at most
# 3 chunks found by find_relevant_chunks_fast (keyword substring matching).
ground_truth = []
for i, keywords in enumerate(keywords_per_query):
    relevant = find_relevant_chunks_fast(keywords, documents, max_chunks=3)
    ground_truth.append(relevant)
    print(f"Query {i+1}: {len(relevant)} chunks relevantes")


# ===================================
# 6. Evaluación con visualización + generación RAG
# ===================================

def evaluate_with_generation(queries, ground_truth, documents, k=5, model="gpt-4.1-nano"):
    """Evaluate retrieval quality and generate an answer for every query.

    For each query the top-k chunks are retrieved with ``search_tfidf`` and
    scored against the ground truth (recall@k, precision@k, mean chunk length
    in words). The retrieved chunks are then joined into a context and sent
    to the module-level Azure OpenAI ``client`` to produce an answer.
    Progress, metrics and answers are printed along the way.

    Args:
        queries: Sequence of question strings.
        ground_truth: Sequence aligned with ``queries``; item i holds the
            indices of the chunks considered relevant for query i.
        documents: Sequence of chunk texts, indexed by the retrieved indices.
        k: Number of chunks to retrieve per query.
        model: Model name passed to the chat completion call.

    Returns:
        dict with the averaged "recall", "precision" and "context_size"
        (0.0 each when ``queries`` is empty) and "details", a list with one
        dict per query (query, retrieved_indices, recall, precision,
        context_size, answer).
    """
    # NOTE(review): several string literals below contain mis-encoded
    # characters (mojibake), including the apostrophe inside the prompt.
    # They are reproduced unchanged here; re-saving the file as UTF-8 with
    # the intended characters is the proper fix.
    recalls, precisions, context_sizes, all_responses = [], [], [], []

    print("\n" + "="*80)
    print("EVALUACI√ìN DEL SISTEMA RAG (RETRIEVAL + GENERACI√ìN)")
    print("="*80)

    for i, query in enumerate(queries):
        print(f"\n{'='*80}")
        print(f"üîé CONSULTA {i+1}: '{query}'")
        print("="*80)

        # --- Retrieval and retrieval metrics
        retrieved_indices, scores = search_tfidf(query, k=k)
        recall = recall_at_k(retrieved_indices, ground_truth[i])
        precision = precision_at_k(retrieved_indices, ground_truth[i], k=k)
        context_size = average_context_size(retrieved_indices, documents)

        recalls.append(recall)
        precisions.append(precision)
        context_sizes.append(context_size)

        print(f"\nüìä M√âTRICAS RETRIEVAL:")
        print(f"  ‚Ä¢ Recall@{k}:    {recall:.3f}")
        print(f"  ‚Ä¢ Precision@{k}: {precision:.3f}")
        print(f"  ‚Ä¢ Contexto:      {context_size:.1f} palabras")

        # --- Build the context handed to the model
        context = "\n".join([documents[j] for j in retrieved_indices])
        prompt = f"""
        You are a knowledgeable assistant.
        Use the following context to answer the question concisely and accurately.
        If the answer is not in the context, say you don‚Äôt know.

        Context:
        {context}

        Question: {query}
        Answer:
        """

        # --- Generation with Azure OpenAI
        response = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}]
        )

        # The message content may be None (it is optional in the API
        # response); fall back to an empty answer instead of crashing.
        generated_answer = (response.choices[0].message.content or "").strip()

        print("\nüß† RESPUESTA GENERADA POR EL MODELO:")
        print(generated_answer)
        print("-"*80)

        all_responses.append({
            "query": query,
            "retrieved_indices": retrieved_indices,
            "recall": recall,
            "precision": precision,
            "context_size": context_size,
            "answer": generated_answer
        })

    # Averaging an empty list would yield NaN plus a RuntimeWarning, so an
    # empty evaluation reports 0.0 instead.
    mean_recall = np.mean(recalls) if recalls else 0.0
    mean_precision = np.mean(precisions) if precisions else 0.0
    mean_context_size = np.mean(context_sizes) if context_sizes else 0.0

    print("\n" + "="*80)
    print("üìà RESULTADOS PROMEDIO")
    print("="*80)
    print(f"Recall@{k}:    {mean_recall:.3f}")
    print(f"Precision@{k}: {mean_precision:.3f}")
    print(f"Contexto:      {mean_context_size:.1f} palabras")
    print("="*80)

    return {
        "recall": mean_recall,
        "precision": mean_precision,
        "context_size": mean_context_size,
        "details": all_responses
    }


# ===================================
# 7. Ejecutar evaluación completa (RAG)
# ===================================

# Run the full retrieval + generation evaluation and time it with wall-clock
# timestamps; the measured span includes the model calls made inside
# evaluate_with_generation.
start_time = time.time()
results = evaluate_with_generation(
    queries=queries,
    ground_truth=ground_truth,
    documents=documents,
    k=5
)
elapsed_time = time.time() - start_time
print(f"\n‚è±Ô∏è Tiempo total de evaluaci√≥n: {elapsed_time:.2f} segundos")

# ===================================
# 8. Guardar resultados
# ===================================

# One row per query. "retrieved_indices" from the per-query details is left
# out of the table, and "answer" is stored under the column "generated_answer".
results_df = pd.DataFrame([{
    "query": r["query"],
    "recall": r["recall"],
    "precision": r["precision"],
    "context_size": r["context_size"],
    "generated_answer": r["answer"]
} for r in results["details"]])

# Write the table to <project_root>/data/evaluation_results_rag.csv without the index column.
output_path = os.path.join(project_root, "data", "evaluation_results_rag.csv")
results_df.to_csv(output_path, index=False)
print(f"\n‚úÖ Resultados guardados en: {output_path}")

print("\n" + "="*80)
print("‚úÖ EVALUACI√ìN COMPLETADA (RAG CON AZURE OPENAI)")
print("="*80)



üîë Clave cargada: True
‚úÖ Project root: c:\Users\USER\RAGModel_MineriaMultimedia_202520
üìÑ Total chunks cargados: 66
‚úÖ √çndice TF-IDF recreado: 66 documentos, 1538 features

üîç Consultas definidas: 2

ü§ñ Generando ground truth autom√°tico...
Query 1: 3 chunks relevantes
Query 2: 3 chunks relevantes

EVALUACI√ìN DEL SISTEMA RAG (RETRIEVAL + GENERACI√ìN)

üîé CONSULTA 1: 'Who saves Bella from the van?'

üìä M√âTRICAS RETRIEVAL:
  ‚Ä¢ Recall@5:    0.333
  ‚Ä¢ Precision@5: 0.200
  ‚Ä¢ Contexto:      625.8 palabras

üß† RESPUESTA GENERADA POR EL MODELO:
Edward Cullen saves Bella from the van.
--------------------------------------------------------------------------------

üîé CONSULTA 2: 'Which Cullen family member is a doctor?'

üìä M√âTRICAS RETRIEVAL:
  ‚Ä¢ Recall@5:    0.333
  ‚Ä¢ Precision@5: 0.200
  ‚Ä¢ Contexto:      640.0 palabras

üß† RESPUESTA GENERADA POR EL MODELO:
Carlisle Cullen is the family member who is a doctor.
-------------------------------------------