In [None]:
# 03 - Evaluación del Sistema RAG (CON VISUALIZACIÓN DE RESPUESTAS)

# ===================================
# 1. Setup y carga de dependencias
# ===================================

import os
import sys
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from functools import lru_cache

# Resolve the repository root as the parent of the current working directory
# and make the helper modules that live under ``src`` importable.
project_root = os.path.abspath(os.pardir)
sys_path = os.path.join(project_root, "src")
src_already_registered = sys_path in sys.path
if not src_already_registered:
    # Appended only once so re-running the cell does not keep growing sys.path.
    sys.path.append(sys_path)

print("‚úÖ Project root:", project_root)

# ===================================
# 2. Cargar datos e índice TF-IDF
# ===================================

from utils import load_chunks_from_folder
from sklearn.feature_extraction.text import TfidfVectorizer

# Collect every "processed_*" directory under data/preprocessed. Sorting keeps
# the folder order (and therefore the chunk indices) stable between runs.
BASE_PREPROCESSED = os.path.join(project_root, "data", "preprocessed")
folders = sorted(
    os.path.join(BASE_PREPROCESSED, entry)
    for entry in os.listdir(BASE_PREPROCESSED)
    # os.listdir also returns plain files; only real directories are handed
    # to the folder loader.
    if entry.startswith("processed_")
    and os.path.isdir(os.path.join(BASE_PREPROCESSED, entry))
)

# load_chunks_from_folder comes from the project's utils module (not visible
# here); presumably it returns a list of dict records carrying a 'text' field
# -- TODO confirm against utils.
records = []
for folder in folders:
    records.extend(load_chunks_from_folder(folder))

if not records:
    # Fail early with a readable message instead of a KeyError on df['text'].
    raise RuntimeError(f"No se encontraron chunks en {BASE_PREPROCESSED}")

df = pd.DataFrame.from_records(records)
# Row i of df and documents[i] describe the same chunk.
documents = df['text'].astype(str).tolist()

print(f"üìÑ Total chunks cargados: {len(documents)}")

# Rebuild the TF-IDF index over all chunks (English stop words removed,
# vocabulary capped at 5000 terms).
vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)
X = vectorizer.fit_transform(documents)

print(f"‚úÖ √çndice TF-IDF recreado: {X.shape[0]} documentos, {X.shape[1]} features")

# ===================================
# 3. Funciones de evaluación OPTIMIZADAS
# ===================================

@lru_cache(maxsize=128)
def get_similarities_cached(query, k=5):
    """Return the ``k`` best-matching chunk indices for ``query`` and their scores.

    The query is projected with the module-level ``vectorizer`` and compared
    by cosine similarity against every row of the module-level matrix ``X``.

    Results are memoised per ``(query, k)`` pair (up to 128 entries), so
    ``query`` must be hashable. The cache key does not include ``vectorizer``
    or ``X``: if either is rebuilt without redefining this function, cached
    answers from the previous index are still returned.

    Args:
        query: Query text.
        k: Maximum number of results to return.

    Returns:
        Two tuples of equal length: the row indices ordered from highest to
        lowest similarity, and the matching similarity scores.
    """
    all_scores = cosine_similarity(vectorizer.transform([query]), X).ravel()
    # argsort is ascending, so reverse it before keeping the first k entries.
    ranking = np.argsort(all_scores)[::-1]
    best = ranking[:k]
    return tuple(best), tuple(all_scores[best])


def recall_at_k(top_k_indices, relevant_indices):
    """Return the fraction of relevant chunks present among the retrieved ones.

    Both arguments are treated as sets of indices, so duplicates on either
    side count once. The denominator is the number of *distinct* relevant
    indices; this keeps it consistent with how hits are counted.

    Args:
        top_k_indices: Iterable of retrieved chunk indices.
        relevant_indices: Iterable of ground-truth chunk indices, or ``None``.

    Returns:
        ``hits / distinct_relevant`` as a float in ``[0, 1]``, or ``0.0`` when
        there are no relevant indices.
    """
    if relevant_indices is None:
        return 0.0
    relevant = set(relevant_indices)
    if not relevant:
        return 0.0
    hits = len(relevant.intersection(top_k_indices))
    return hits / len(relevant)


def precision_at_k(top_k_indices, relevant_indices, k=5):
    """Return the fraction of the ``k`` retrieval slots filled by relevant chunks.

    Both index collections are treated as sets, so duplicates count once.
    The retrieved collection is not truncated to ``k``; callers are expected
    to pass at most ``k`` indices.

    Args:
        top_k_indices: Iterable of retrieved chunk indices.
        relevant_indices: Iterable of ground-truth chunk indices, or ``None``.
        k: Number of retrieval slots used as the denominator.

    Returns:
        ``hits / k`` as a float, or ``0.0`` when there are no relevant indices
        or when ``k`` is not positive (nothing could have been retrieved).
    """
    # Guard against division by zero and meaningless negative precision.
    if k <= 0 or relevant_indices is None:
        return 0.0
    relevant = set(relevant_indices)
    if not relevant:
        return 0.0
    hits = len(relevant.intersection(top_k_indices))
    return hits / k


def average_context_size(retrieved_indices, documents):
    """Return the mean whitespace-delimited word count of the retrieved chunks.

    Args:
        retrieved_indices: Iterable of positions into ``documents``.
        documents: Indexable collection of chunk texts.

    Returns:
        The mean number of words per retrieved chunk, or ``0`` when nothing
        was retrieved.
    """
    word_counts = [len(documents[position].split()) for position in retrieved_indices]
    if not word_counts:
        return 0
    return np.mean(word_counts)


def search_tfidf(query, k=5):
    """Return the top-``k`` chunk indices and scores for ``query`` as lists.

    Delegates to ``get_similarities_cached`` and converts the two tuples it
    returns into fresh lists.

    Args:
        query: Query text.
        k: Maximum number of results.

    Returns:
        ``(indices, scores)``: two lists of equal length, in the order
        produced by ``get_similarities_cached``.
    """
    cached_indices, cached_scores = get_similarities_cached(query, k)
    return [*cached_indices], [*cached_scores]


# ===================================
# 4. Ground truth más rápido
# ===================================

def find_relevant_chunks_fast(keyword_list, documents, max_chunks=3):
    """Return the indices of the first documents that mention any keyword.

    Matching is a case-insensitive substring test, so a keyword also matches
    inside longer words. Scanning stops as soon as ``max_chunks`` matches
    have been collected.

    Args:
        keyword_list: Keywords to look for.
        documents: Iterable of document texts.
        max_chunks: Maximum number of indices to return.

    Returns:
        A list of at most ``max_chunks`` document indices, in document order.
    """
    needles = [keyword.lower() for keyword in keyword_list]
    matches = []

    for position, text in enumerate(documents):
        if len(matches) >= max_chunks:
            return matches
        haystack = text.lower()
        for needle in needles:
            if needle in haystack:
                matches.append(position)
                break

    return matches


# ===================================
# 5. Definir consultas
# ===================================

# Evaluation questions. Each one is paired, by position, with the keyword
# list at the same index of ``keywords_per_query``.
queries = [
    "Who saves Bella from the van?",
    "Which Cullen family member is a doctor?",
]

# Keywords used to pick the "relevant" chunks for the query at the same index.
keywords_per_query = [
    ["edward", "van", "save"],
    ["carlisle", "doctor"],
]

print(f"\nüîç Consultas definidas: {len(queries)}")

# Build the automatic ground truth: for every keyword list, keep the first
# (at most three) chunks that mention any of its keywords.
print("\n Generando ground truth autom√°tico...")
ground_truth = []
for query_number, keywords in enumerate(keywords_per_query, start=1):
    matched_chunks = find_relevant_chunks_fast(keywords, documents, max_chunks=3)
    print(f"Query {query_number}: {len(matched_chunks)} chunks relevantes")
    ground_truth.append(matched_chunks)

# ===================================
# 6. EVALUACIÓN CON VISUALIZACIÓN DE RESPUESTAS
# ===================================

def _mean_or_zero(values):
    """Return the arithmetic mean of ``values``, or ``0.0`` when it is empty."""
    return np.mean(values) if values else 0.0


def evaluate_with_responses(queries, ground_truth, documents, k=5):
    """Evaluate the TF-IDF retriever on every query and print a full report.

    For each query the top-``k`` chunks are fetched with ``search_tfidf`` and
    scored with ``recall_at_k``, ``precision_at_k`` and
    ``average_context_size``. The metrics are printed together with a
    300-character preview of every retrieved chunk.

    Chunk metadata is read from the module-level DataFrame ``df`` through
    ``df.iloc[idx]`` (columns ``book_name`` and ``chunk_number``, falling back
    to ``'N/A'``), so ``df`` must be row-aligned with ``documents``.

    Args:
        queries: Sequence of query strings.
        ground_truth: Sequence with one collection of relevant chunk indices
            per query, in the same order as ``queries``.
        documents: Chunk texts, indexable by the indices the retriever returns.
        k: Number of chunks to retrieve per query.

    Returns:
        A dict with the mean ``recall``, ``precision`` and ``context_size``,
        the per-query lists ``recalls``, ``precisions`` and ``context_sizes``,
        and ``responses``: one list per query of dicts with the keys ``rank``,
        ``score``, ``is_relevant``, ``text``, ``book`` and ``chunk_number``.
        The three means are ``0.0`` when ``queries`` is empty.

    Raises:
        ValueError: If ``queries`` and ``ground_truth`` differ in length.
    """
    # Validate up front rather than failing with an IndexError mid-report.
    if len(queries) != len(ground_truth):
        raise ValueError(
            f"queries ({len(queries)}) and ground_truth ({len(ground_truth)}) "
            "must have the same length"
        )

    recalls = []
    precisions = []
    context_sizes = []
    all_responses = []

    print("\n" + "=" * 80)
    print("EVALUACI√ìN DEL SISTEMA DE RECUPERACI√ìN")
    print("=" * 80)

    for number, (query, relevant_indices) in enumerate(zip(queries, ground_truth), 1):
        # One set per query: constant-time membership while labelling chunks.
        relevant_set = set(relevant_indices)

        print(f"\n{'='*80}")
        print(f"üîé CONSULTA {number}: '{query}'")
        print("=" * 80)

        # Retrieve the top-k chunks together with their similarity scores.
        retrieved_indices, scores = search_tfidf(query, k=k)

        # Per-query metrics.
        recall = recall_at_k(retrieved_indices, relevant_indices)
        precision = precision_at_k(retrieved_indices, relevant_indices, k=k)
        context_size = average_context_size(retrieved_indices, documents)

        recalls.append(recall)
        precisions.append(precision)
        context_sizes.append(context_size)

        print(f"\nüìä M√âTRICAS:")
        print(f"  ‚Ä¢ Recall@{k}:    {recall:.3f}")
        print(f"  ‚Ä¢ Precision@{k}: {precision:.3f}")
        print(f"  ‚Ä¢ Contexto:      {context_size:.1f} palabras")
        print(f"  ‚Ä¢ Ground truth:  {len(relevant_indices)} chunks relevantes")

        # Show every retrieved chunk with its rank, score and relevance flag.
        print(f"\nüìÑ TOP {k} CHUNKS RECUPERADOS:")
        print("-" * 80)

        responses = []
        for rank, (idx, score) in enumerate(zip(retrieved_indices, scores), 1):
            hit = idx in relevant_set
            is_relevant = "‚úÖ RELEVANTE" if hit else "‚ùå No relevante"
            chunk_text = documents[idx]

            # Metadata of the chunk comes from the matching DataFrame row.
            chunk_meta = df.iloc[idx]
            book = chunk_meta.get('book_name', 'N/A')
            chunk_num = chunk_meta.get('chunk_number', 'N/A')

            print(f"\nüèÜ RANK {rank} | Score: {score:.4f} | {is_relevant}")
            print(f"üìö Libro: {book} | Chunk #{chunk_num}")
            print(f"üìù Texto: {chunk_text[:300]}...")

            responses.append({
                'rank': rank,
                'score': score,
                'is_relevant': hit,
                'text': chunk_text,
                'book': book,
                'chunk_number': chunk_num
            })

        all_responses.append(responses)
        print("-" * 80)

    # Averages over all queries (0.0 instead of NaN when there were none).
    mean_recall = _mean_or_zero(recalls)
    mean_precision = _mean_or_zero(precisions)
    mean_context_size = _mean_or_zero(context_sizes)

    print("\n" + "=" * 80)
    print("üìà RESULTADOS PROMEDIO")
    print("=" * 80)
    print(f"Recall@{k}:    {mean_recall:.3f}")
    print(f"Precision@{k}: {mean_precision:.3f}")
    print(f"Contexto:      {mean_context_size:.1f} palabras")
    print("=" * 80)

    return {
        'recall': mean_recall,
        'precision': mean_precision,
        'context_size': mean_context_size,
        'recalls': recalls,
        'precisions': precisions,
        'context_sizes': context_sizes,
        'responses': all_responses
    }

# Run the evaluation and report how long it took.
import time

# perf_counter is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() follows the wall clock and can jump if the system
# clock is adjusted while the evaluation runs.
start_time = time.perf_counter()

results = evaluate_with_responses(
    queries=queries,
    ground_truth=ground_truth,
    documents=documents,
    k=5
)

elapsed_time = time.perf_counter() - start_time
print(f"\n‚è±Ô∏è Tiempo de evaluaci√≥n: {elapsed_time:.2f} segundos")

# ===================================
# 7. VISUALIZACIÓN CON MATPLOTLIB
# ===================================

try:
    import matplotlib.pyplot as plt
    import matplotlib
except ImportError:
    # Only a failed import means matplotlib itself is missing.
    print("\n matplotlib no disponible")
    print("Instalar con: pip install matplotlib")
else:
    try:
        # Prefer the TkAgg backend when it can be loaded, but do not fail if
        # it cannot (no Tk, no display): with force=False matplotlib keeps
        # the backend that is already active instead of raising ImportError.
        matplotlib.use('TkAgg', force=False)

        print("\nüìä Generando visualizaciones...")

        # One figure with three side-by-side bar charts.
        fig, axes = plt.subplots(1, 3, figsize=(15, 5))

        query_labels = [f"Q{i+1}" for i in range(len(queries))]

        # Chart 1: recall per query.
        axes[0].bar(query_labels, results['recalls'], color='#2ecc71', alpha=0.8)
        axes[0].set_ylabel('Score', fontsize=12)
        axes[0].set_title('Recall@5 por Consulta', fontsize=14, fontweight='bold')
        axes[0].set_ylim([0, 1])
        axes[0].grid(axis='y', alpha=0.3)
        for i, v in enumerate(results['recalls']):
            axes[0].text(i, v + 0.02, f'{v:.3f}', ha='center', fontweight='bold')

        # Chart 2: precision per query.
        axes[1].bar(query_labels, results['precisions'], color='#3498db', alpha=0.8)
        axes[1].set_ylabel('Score', fontsize=12)
        axes[1].set_title('Precision@5 por Consulta', fontsize=14, fontweight='bold')
        axes[1].set_ylim([0, 1])
        axes[1].grid(axis='y', alpha=0.3)
        for i, v in enumerate(results['precisions']):
            axes[1].text(i, v + 0.02, f'{v:.3f}', ha='center', fontweight='bold')

        # Chart 3: average context size per query.
        axes[2].bar(query_labels, results['context_sizes'], color='#e74c3c', alpha=0.8)
        axes[2].set_ylabel('Palabras', fontsize=12)
        axes[2].set_title('Tama√±o de Contexto', fontsize=14, fontweight='bold')
        axes[2].grid(axis='y', alpha=0.3)
        for i, v in enumerate(results['context_sizes']):
            axes[2].text(i, v + 5, f'{v:.0f}', ha='center', fontweight='bold')

        plt.tight_layout()

        # Save to disk.
        plot_path = os.path.join(project_root, 'data', 'evaluation_results.png')
        plt.savefig(plot_path, dpi=300, bbox_inches='tight')
        print(f"‚úÖ Gr√°fico guardado en: {plot_path}")

        # Display.
        plt.show()

        # Additional figure: similarity score by rank, one panel per query.
        fig2, axes2 = plt.subplots(1, len(queries), figsize=(12, 5))
        if len(queries) == 1:
            # With a single panel subplots returns a bare Axes; wrap it so the
            # indexing below works.
            axes2 = [axes2]

        for i, (query, responses) in enumerate(zip(queries, results['responses'])):
            ranks = [r['rank'] for r in responses]
            scores = [r['score'] for r in responses]
            colors = ['green' if r['is_relevant'] else 'red' for r in responses]

            axes2[i].bar(ranks, scores, color=colors, alpha=0.7)
            axes2[i].set_xlabel('Rank', fontsize=11)
            axes2[i].set_ylabel('Similarity Score', fontsize=11)
            axes2[i].set_title(f'Q{i+1}: Scores por Rank\n(Verde=Relevante)', fontsize=12)
            axes2[i].set_xticks(ranks)
            axes2[i].grid(axis='y', alpha=0.3)

        plt.tight_layout()

        plot_path2 = os.path.join(project_root, 'data', 'evaluation_scores.png')
        plt.savefig(plot_path2, dpi=300, bbox_inches='tight')
        print(f"‚úÖ Gr√°fico de scores guardado en: {plot_path2}")

        plt.show()

    except Exception as e:
        print(f"\n Error en visualizaci√≥n: {e}")
        try:
            # Drop any half-drawn figures and retry with the non-interactive
            # Agg backend: a reduced summary figure is saved without display.
            plt.close('all')
            matplotlib.use('Agg')

            fig, axes = plt.subplots(1, 3, figsize=(15, 5))
            query_labels = [f"Q{i+1}" for i in range(len(queries))]

            axes[0].bar(query_labels, results['recalls'], color='#2ecc71')
            axes[0].set_title('Recall@5')

            axes[1].bar(query_labels, results['precisions'], color='#3498db')
            axes[1].set_title('Precision@5')

            axes[2].bar(query_labels, results['context_sizes'], color='#e74c3c')
            axes[2].set_title('Contexto')

            plt.tight_layout()
            plot_path = os.path.join(project_root, 'data', 'evaluation_results.png')
            plt.savefig(plot_path, dpi=300)
            print(f"‚úÖ Gr√°fico guardado (sin visualizar) en: {plot_path}")
            plt.close()
        except Exception as fallback_error:
            # Report the cause instead of swallowing it; interrupts such as
            # KeyboardInterrupt are no longer caught here.
            print("‚ùå No se pudo generar gr√°ficos")
            print(f"   {fallback_error}")

# ===================================
# 8. Guardar resultados detallados
# ===================================

# CSV with the per-query metrics (one row per query).
metrics_by_query = {
    'query': queries,
    'recall': results['recalls'],
    'precision': results['precisions'],
    'context_size': results['context_sizes'],
}
results_df = pd.DataFrame(metrics_by_query)

data_dir = os.path.join(project_root, "data")
output_path = os.path.join(data_dir, "evaluation_results.csv")
results_df.to_csv(output_path, index=False)
print(f"\n‚úÖ Resultados guardados en: {output_path}")

# Detailed CSV: one row per retrieved chunk of every query, keeping only the
# first 200 characters of the chunk text as a preview.
detailed_results = []
for i, (query, responses) in enumerate(zip(queries, results['responses'])):
    detailed_results.extend(
        {
            'query_id': i + 1,
            'query': query,
            'rank': r['rank'],
            'score': r['score'],
            'is_relevant': r['is_relevant'],
            'book': r['book'],
            'chunk_number': r['chunk_number'],
            'text_preview': r['text'][:200],
        }
        for r in responses
    )

detailed_df = pd.DataFrame(detailed_results)
detailed_path = os.path.join(data_dir, "evaluation_detailed.csv")
detailed_df.to_csv(detailed_path, index=False)
print(f"‚úÖ Resultados detallados guardados en: {detailed_path}")

closing_rule = "=" * 80
print("\n" + closing_rule)
print("‚úÖ EVALUACI√ìN COMPLETADA")
print(closing_rule)

‚úÖ Project root: c:\Users\Sofia\RAGModel_MineriaMultimedia_202520
üìÑ Total chunks cargados: 66
‚úÖ √çndice TF-IDF recreado: 66 documentos, 1538 features

üîç Consultas definidas: 2

ü§ñ Generando ground truth autom√°tico...
Query 1: 3 chunks relevantes
Query 2: 3 chunks relevantes

EVALUACI√ìN DEL SISTEMA DE RECUPERACI√ìN

üîé CONSULTA 1: 'Who saves Bella from the van?'

üìä M√âTRICAS:
  ‚Ä¢ Recall@5:    0.333
  ‚Ä¢ Precision@5: 0.200
  ‚Ä¢ Contexto:      625.8 palabras
  ‚Ä¢ Ground truth:  3 chunks relevantes

üìÑ TOP 5 CHUNKS RECUPERADOS:
--------------------------------------------------------------------------------

üèÜ RANK 1 | Score: 0.1557 | ‚úÖ RELEVANTE
üìö Libro: data_summary | Chunk #1
üìù Texto: 1 Bella Swan moves from Phoenix, Arizona to the small town of Forks, Washington, to live with her father, Charlie. She soon notices the mysterious Cullen family at school, who seem pale, beautiful, and distant from everyone else. 2 Edward Cullen, one of the Cullens, save