## Prerequisites

1. ✅ foundation/00-setup-postgres-schema.ipynb
2. ✅ evaluation-lab/01-create-ground-truth-human-in-loop.ipynb (curated test set)

## Configuration

In [None]:
# Embedding model alias: passed to the retrieval step, where it is used both to
# load the sentence-transformers model and to filter chunks by embedding_model.
EMBEDDING_MODEL_ALIAS = "all-minilm-l6-v2"
# Cut-offs at which Precision/Recall/NDCG are computed; the largest value is
# also the number of chunks retrieved per question.
TOP_K_VALUES = [1, 3, 5, 10]  # Compute metrics at these K values

# Generation metrics (optional)
COMPUTE_BLEU_ROUGE = False  # Set to True if you have reference answers
USE_LLM_AS_JUDGE = False     # Set to True for answer quality scoring
# NOTE(review): presumably the model handed to compute_llm_judge_score();
# no call site is visible in this part of the notebook - confirm.
JUDGE_MODEL = "gpt-3.5-turbo"

# Labels describing this run.
# NOTE(review): presumably consumed by the export/report helpers - confirm.
EXPERIMENT_NAME = "metrics-framework-baseline"
TECHNIQUES_APPLIED = ["vector_retrieval"]  # Baseline

## Load Ground-Truth Test Set

In [None]:
import psycopg2
import psycopg2.extras
import json
import math
import pandas as pd
import numpy as np
from typing import List, Set, Dict, Tuple, Optional
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns

# Connect to the local demo database
db_connection = psycopg2.connect(
    host='localhost',
    port=5432,
    database='rag_wiki_demo',
    user='postgres',
    password='postgres'
)

# Ground-truth rows rated 'good', copied into plain dicts in id order
ground_truth_questions = []

with db_connection.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
    cur.execute('''
        SELECT 
            id,
            question,
            relevant_chunk_ids,
            quality_rating,
            source_type
        FROM evaluation_groundtruth
        WHERE quality_rating = 'good'
        ORDER BY id
    ''')

    # Copy only the selected columns out of each DictCursor row
    ground_truth_questions.extend(
        {
            column: record[column]
            for column in (
                'id',
                'question',
                'relevant_chunk_ids',
                'quality_rating',
                'source_type',
            )
        }
        for record in cur.fetchall()
    )

print(f"Loaded {len(ground_truth_questions)} ground truth questions")
if ground_truth_questions:
    # Show a truncated preview of the first entry as a sanity check
    print(f"Sample: {ground_truth_questions[0]['question'][:80]}...")
    print(f"Relevant chunks: {ground_truth_questions[0]['relevant_chunk_ids'][:3]}")

## Compute Retrieval Metrics

In [None]:
# ============================================================================
# PART 1: RETRIEVAL METRICS FUNCTIONS
# ============================================================================

def precision_at_k(retrieved_chunk_ids: List[int],
                   relevant_chunk_ids: List[int],
                   k: int = 5) -> float:
    """
    Precision@K: fraction of the top-K retrieved chunks that are relevant.

    Formula: |{relevant in top-K}| / K

    The denominator is always K, so a result list shorter than K is
    penalised for the missing slots.

    Reading the score:
    - 1.0 = every one of the K results is relevant
    - 0.5 = half of the K results are relevant
    - 0.0 = nothing relevant in the top K

    Answers the user-facing question "of what I was shown, how much was
    useful?".

    Args:
        retrieved_chunk_ids: Chunk IDs in ranked order (best first)
        relevant_chunk_ids: Ground-truth relevant chunk IDs (list or set)
        k: Number of top results to consider

    Returns:
        float: hits in the top K divided by K; 0.0 when k is 0

    Example:
        >>> precision_at_k([1, 2, 3, 4, 5], [1, 3, 5], k=5)
        0.6
    """
    if k == 0:
        return 0.0

    ground_truth = set(relevant_chunk_ids)
    hits = len([cid for cid in retrieved_chunk_ids[:k] if cid in ground_truth])

    return hits / k


def recall_at_k(retrieved_chunk_ids: List[int],
                relevant_chunk_ids: List[int],
                k: int = 5) -> float:
    """
    Recall@K: What fraction of all relevant chunks were found in the top-K?

    Formula: |{distinct relevant in top-K}| / |all distinct relevant|

    Interpretation:
    - 1.0 = Perfect: found all relevant chunks
    - 0.5 = Found half of relevant chunks
    - 0.0 = Found none of the relevant chunks

    Good for: Understanding coverage
    ("Did I find everything that's relevant?")

    Each relevant chunk is counted at most once, so a result list that
    repeats the same chunk ID cannot push the score above 1.0.

    Args:
        retrieved_chunk_ids: List of chunk IDs in ranked order
        relevant_chunk_ids: List/set of ground-truth relevant chunk IDs
            (None is treated as "no relevant chunks")
        k: Number of top results to consider

    Returns:
        float: Recall score between 0.0 and 1.0; 0.0 when there are no
        relevant chunks or when k is not positive

    Example:
        >>> retrieved = [1, 2, 3, 4, 5]
        >>> relevant = [1, 3, 5, 7, 9]  # 5 total relevant
        >>> recall_at_k(retrieved, relevant, k=5)
        0.6
    """
    relevant_set = set(relevant_chunk_ids or ())

    # A non-positive k means "look at nothing"; without this guard a negative
    # k would slice from the end of the list instead.
    if not relevant_set or k <= 0:
        return 0.0

    # Set intersection counts every relevant chunk once, even if retrieved twice
    found = relevant_set.intersection(retrieved_chunk_ids[:k])

    return len(found) / len(relevant_set)


def mean_reciprocal_rank(retrieved_chunk_ids: List[int],
                         relevant_chunk_ids: List[int]) -> float:
    """
    Reciprocal rank of the first relevant result for a single query.

    Formula: 1 / (1-based rank of the first relevant result)

    Averaging this value over many queries gives the MRR.

    Reading the score:
    - 1.0   = the top result is relevant
    - 0.5   = the first relevant result is in second place
    - 0.333 = the first relevant result is in third place
    - 0.0   = no relevant result was retrieved

    Answers "how far down the list did the user have to look?".

    Args:
        retrieved_chunk_ids: Chunk IDs in ranked order (best first)
        relevant_chunk_ids: Ground-truth relevant chunk IDs (list or set)

    Returns:
        float: Reciprocal rank between 0.0 and 1.0

    Example:
        >>> mean_reciprocal_rank([1, 2, 3, 4, 5], [3, 7, 9])
        0.3333333333333333
    """
    targets = set(relevant_chunk_ids)

    # Rank (1-based) of the earliest hit, or None if the list has no hit
    first_hit_rank = next(
        (
            position
            for position, cid in enumerate(retrieved_chunk_ids, start=1)
            if cid in targets
        ),
        None,
    )

    if first_hit_rank is None:
        return 0.0
    return 1.0 / first_hit_rank


def ndcg_at_k(retrieved_chunk_ids: List[int],
              relevant_chunk_ids: List[int],
              k: int = 5) -> float:
    """
    NDCG@K: Normalized Discounted Cumulative Gain
    How well-ranked are the results? (rewards relevant results at top)

    Formula: DCG@K / IDCG@K
    where DCG = sum((2^rel - 1) / log2(rank + 1)) with 1-based ranks and
    binary relevance (rel is 1 for a relevant chunk, otherwise 0).

    IDCG@K is the DCG of the best ranking the ground truth allows: the first
    min(K, number of relevant chunks) positions all relevant. It is derived
    from the ground truth, not from the retrieved list, so a ranking that
    misses relevant chunks scores below 1.0 even when its few hits sit at
    the top.

    Interpretation:
    - 1.0 = Perfect ranking (all relevant at top)
    - 0.8 = Good ranking (most relevant near top)
    - 0.5 = Mediocre ranking
    - 0.0 = No relevant results

    Good for: Understanding ranking quality
    ("Are the most relevant results at the top?")

    Args:
        retrieved_chunk_ids: List of chunk IDs in ranked order
        relevant_chunk_ids: List/set of ground-truth relevant chunk IDs
            (None is treated as "no relevant chunks")
        k: Number of top results to consider

    Returns:
        float: NDCG score between 0.0 and 1.0; 0.0 when there are no
        relevant chunks or when k is not positive

    Example:
        >>> retrieved = [1, 2, 3, 4, 5]
        >>> relevant = [1, 3, 5]
        >>> round(ndcg_at_k(retrieved, relevant, k=5), 3)
        0.885
    """

    def dcg_score(relevance_scores: List[float]) -> float:
        """Compute DCG from relevance scores listed in rank order."""
        # enumerate() is 0-based, hence log2(rank + 2) for 1-based log2(rank + 1)
        return sum(
            (2**rel - 1) / math.log2(rank + 2)
            for rank, rel in enumerate(relevance_scores)
        )

    relevant_set = set(relevant_chunk_ids or ())

    if k <= 0 or not relevant_set:
        return 0.0

    # Binary relevance per position. A chunk ID that shows up again further
    # down earns no extra gain, which keeps DCG <= IDCG.
    seen = set()
    relevance = []
    for chunk_id in retrieved_chunk_ids[:k]:
        is_new_hit = chunk_id in relevant_set and chunk_id not in seen
        relevance.append(1 if is_new_hit else 0)
        seen.add(chunk_id)

    dcg = dcg_score(relevance)

    # Ideal ranking: as many relevant chunks as exist (capped at k), all first.
    # relevant_set is non-empty and k >= 1 here, so idcg is strictly positive.
    idcg = dcg_score([1] * min(k, len(relevant_set)))

    return dcg / idcg


# ============================================================================
# PART 2: FRAMEWORK ORCHESTRATOR
# ============================================================================

def evaluate_rag_results(ground_truth_questions: List[Dict],
                        rag_results: List[Dict],
                        k_values: List[int] = [1, 3, 5, 10]) -> Dict:
    """
    Score retrieval results against the ground truth with every metric.

    Results are matched to ground truth by exact question text; a result
    whose question has no ground-truth entry is skipped.

    Args:
        ground_truth_questions: Dicts with 'question' and 'relevant_chunk_ids'
        rag_results: Dicts with 'question' and 'retrieved_chunk_ids'
        k_values: Cut-offs for Precision/Recall/NDCG

    Returns:
        dict with, in this order:
            'precision@K' for each K (mean over scored queries),
            'recall@K' for each K (mean),
            'ndcg@K' for each K (mean),
            'mrr' (mean),
            'per_query': list of per-query dicts ('question' plus each metric),
            'num_queries': number of queries that were scored.
        Every mean is 0.0 when no query was scored.
    """
    # question text -> ground-truth relevant chunk ids
    relevant_by_question = {}
    for entry in ground_truth_questions:
        relevant_by_question[entry['question']] = entry['relevant_chunk_ids']

    # One accumulator list per aggregate metric, in output order
    collected = {}
    for family in ('precision', 'recall', 'ndcg'):
        for k in k_values:
            collected[f'{family}@{k}'] = []
    collected['mrr'] = []

    per_query = []

    for result in rag_results:
        question = result['question']
        retrieved = result['retrieved_chunk_ids']

        if question not in relevant_by_question:
            continue  # nothing to score this result against

        relevant = relevant_by_question[question]
        row = {'question': question}

        for k in k_values:
            scores_at_k = {
                f'precision@{k}': precision_at_k(retrieved, relevant, k),
                f'recall@{k}': recall_at_k(retrieved, relevant, k),
                f'ndcg@{k}': ndcg_at_k(retrieved, relevant, k),
            }
            row.update(scores_at_k)
            for metric_name, score in scores_at_k.items():
                collected[metric_name].append(score)

        reciprocal_rank = mean_reciprocal_rank(retrieved, relevant)
        collected['mrr'].append(reciprocal_rank)
        row['mrr'] = reciprocal_rank

        per_query.append(row)

    # Collapse each accumulator to its mean (0.0 if nothing was scored)
    summary = {
        metric_name: (np.mean(scores) if len(scores) > 0 else 0.0)
        for metric_name, scores in collected.items()
    }
    summary['per_query'] = per_query
    summary['num_queries'] = len(per_query)

    return summary


# ============================================================================
# STEP 1: Retrieve results for all test questions
# ============================================================================

def retrieve_results_for_questions(db_connection, questions: List[Dict],
                                   embedding_model_alias: str = "all-minilm-l6-v2",
                                   top_k: int = 10) -> List[Dict]:
    """
    Retrieve top-K results for each test question using vector similarity.

    All question texts are embedded in one batch, then one query per
    question orders the `chunks` rows stored for the same embedding model by
    the `<->` distance to the question embedding and keeps the nearest
    `top_k` IDs.

    Args:
        db_connection: PostgreSQL connection
        questions: List of question dicts; each must have a 'question' key
        embedding_model_alias: Name used both to load the SentenceTransformer
            model and to filter `chunks.embedding_model`
        top_k: Number of results to retrieve per question

    Returns:
        List of dicts with 'question' and 'retrieved_chunk_ids' (nearest
        first), one per input question and in the same order. An empty
        input returns an empty list without loading the model.
    """
    # Nothing to embed: skip the model load entirely
    if not questions:
        return []

    # Imported here so the dependency is only needed when retrieval runs
    from sentence_transformers import SentenceTransformer

    # Load embedding model
    model = SentenceTransformer(embedding_model_alias)

    # Embed every question in a single batch
    question_texts = [q['question'] for q in questions]
    question_embeddings = model.encode(question_texts)

    rag_results = []

    with db_connection.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
        for question, question_embedding in zip(question_texts, question_embeddings):
            # Only the id is used downstream, so the chunk text and the stored
            # embedding are not fetched.
            # NOTE(review): the embedding is passed as returned by encode();
            # psycopg2 needs an adapter registered for that type (e.g.
            # pgvector's register_vector) - confirm it is set up on this
            # connection.
            cur.execute('''
                SELECT id
                FROM chunks
                WHERE embedding_model = %s
                ORDER BY embedding <-> %s
                LIMIT %s
            ''', (embedding_model_alias, question_embedding, top_k))

            rag_results.append({
                'question': question,
                'retrieved_chunk_ids': [row['id'] for row in cur.fetchall()]
            })

    return rag_results


# Retrieve results for all test questions.
# The largest K is fetched once; smaller cut-offs reuse the same ranked list.
print(f"Retrieving top-{max(TOP_K_VALUES)} results for {len(ground_truth_questions)} questions...")
rag_results = retrieve_results_for_questions(
    db_connection, 
    ground_truth_questions,
    embedding_model_alias=EMBEDDING_MODEL_ALIAS,
    top_k=max(TOP_K_VALUES)
)
print(f"Retrieved {len(rag_results)} result sets")


# ============================================================================
# STEP 2: Compute all retrieval metrics
# ============================================================================

print("\nComputing retrieval metrics...")
all_metrics = evaluate_rag_results(
    ground_truth_questions,
    rag_results,
    k_values=TOP_K_VALUES
)

# Print summary. Each row label is built first and then padded to the same
# 20-character column as the header, so the values line up under "Value".
print(f"\nMetrics for {all_metrics['num_queries']} queries:")
print(f"{'Metric':<20} {'Value':<10}")
print("-" * 30)

for k in TOP_K_VALUES:
    label = f"Precision@{k}"
    print(f"{label:<20} {all_metrics[f'precision@{k}']:.4f}")

print()
for k in TOP_K_VALUES:
    label = f"Recall@{k}"
    print(f"{label:<20} {all_metrics[f'recall@{k}']:.4f}")

print()
for k in TOP_K_VALUES:
    label = f"NDCG@{k}"
    print(f"{label:<20} {all_metrics[f'ndcg@{k}']:.4f}")

print()
# MRR = Mean Reciprocal Rank
print(f"{'MRR':<20} {all_metrics['mrr']:.4f}")

# Compute standard deviations for additional insight
print("\nStandard Deviations:")
print(f"{'Metric':<20} {'Std Dev':<10}")
print("-" * 30)

for k in TOP_K_VALUES:
    p_values = [q[f'precision@{k}'] for q in all_metrics['per_query']]
    label = f"Precision@{k}"
    print(f"{label:<20} {np.std(p_values):.4f}")

for k in TOP_K_VALUES:
    r_values = [q[f'recall@{k}'] for q in all_metrics['per_query']]
    label = f"Recall@{k}"
    print(f"{label:<20} {np.std(r_values):.4f}")

## Compute Generation Metrics (Optional)

BLEU and ROUGE require reference answers; the LLM-as-judge score needs only the question and a generated answer.

In [None]:
# ============================================================================
# PART 3: GENERATION METRICS (Optional)
# ============================================================================

def compute_bleu_score(reference: str, candidate: str) -> float:
    """
    Sentence-level BLEU-4 of a generated answer against one reference.

    BLEU (Bilingual Evaluation Understudy) is n-gram precision with a
    brevity penalty; it answers "how similar is the generated answer to the
    reference answer?". Both texts are lower-cased and tokenized with NLTK
    before scoring, and 1- to 4-grams are weighted equally.

    Useful when human-written reference answers are available.

    Args:
        reference: Ground-truth answer text
        candidate: Generated answer text

    Returns:
        float: BLEU score between 0.0 and 1.0. Any failure (NLTK missing,
        tokenizer error, ...) is printed and reported as 0.0.
    """
    uniform_4gram_weights = (0.25, 0.25, 0.25, 0.25)

    try:
        from nltk.translate.bleu_score import sentence_bleu
        from nltk.tokenize import word_tokenize

        reference_tokens = word_tokenize(reference.lower())
        hypothesis_tokens = word_tokenize(candidate.lower())

        # sentence_bleu expects a list of references; only one is supplied
        return sentence_bleu(
            [reference_tokens],
            hypothesis_tokens,
            weights=uniform_4gram_weights,
        )
    except Exception as e:
        print(f"Error computing BLEU: {e}")
        return 0.0


def compute_rouge_score(reference: str, candidate: str) -> float:
    """
    ROUGE-L F-measure of a generated answer against a reference answer.

    ROUGE (Recall-Oriented Understudy for Gisting Evaluation) answers "how
    much of the reference answer appears in the generated answer?". This
    function scores the 'rougeL' variant with stemming enabled and reports
    its F-measure.

    Useful for measuring coverage of key information.

    Args:
        reference: Ground-truth answer text
        candidate: Generated answer text

    Returns:
        float: ROUGE-L F-measure between 0.0 and 1.0. Any failure (package
        missing, bad input, ...) is printed and reported as 0.0.
    """
    try:
        from rouge_score import rouge_scorer

        rouge_l = rouge_scorer.RougeScorer(
            ['rougeL'], use_stemmer=True
        ).score(reference, candidate)['rougeL']

        return rouge_l.fmeasure
    except Exception as e:
        print(f"Error computing ROUGE: {e}")
        return 0.0


def _parse_judge_response(response_text: str) -> Tuple[int, str]:
    """Extract (score, reasoning) from a 'SCORE: n' / 'REASONING: text' reply."""
    score = 3  # Neutral default when no usable SCORE line is found
    reasoning = ""

    for raw_line in response_text.splitlines():
        line = raw_line.strip()  # tolerate indentation around the labels
        if line.startswith('SCORE:'):
            try:
                parsed = int(line.split(':', 1)[1].strip())
            except ValueError:
                # Not a plain integer (e.g. "4/5"): keep the current score
                continue
            # Only accept values on the documented 1-5 scale
            if 1 <= parsed <= 5:
                score = parsed
        elif line.startswith('REASONING:'):
            reasoning = line.split(':', 1)[1].strip()

    return score, reasoning


def compute_llm_judge_score(question: str, answer: str,
                            judge_model: str = "gpt-3.5-turbo") -> Tuple[int, str]:
    """
    LLM-as-Judge: Use an LLM to evaluate answer quality.

    Measures: "Is this a good answer to the question?"

    Good for: When you don't have reference answers but want quality scoring

    The judge is asked to reply with a 'SCORE: <number>' line and a
    'REASONING: <text>' line. A missing, non-integer or out-of-range score
    leaves the neutral default of 3; only the text on the REASONING line
    itself is returned.

    Args:
        question: The original question
        answer: The generated answer
        judge_model: Which LLM to use for judging

    Returns:
        Tuple of (score 1-5, reasoning). If the call fails for any reason
        (package missing, API error, ...) the error is printed and
        (3, "Error evaluating answer") is returned.
    """
    try:
        import openai
        
        prompt = f"""Rate the quality of this answer to the question on a scale of 1-5:

Question: {question}

Answer: {answer}

Provide your rating (1-5) and brief reasoning. Format as:
SCORE: <number>
REASONING: <your explanation>"""
        
        # NOTE(review): ChatCompletion.create is the legacy interface of the
        # openai package - confirm the installed version still provides it.
        response = openai.ChatCompletion.create(
            model=judge_model,
            messages=[
                {"role": "system", "content": "You are an expert evaluator of question-answering systems. Rate answers on how well they answer the question, accuracy, and completeness."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.0
        )
        
        response_text = response.choices[0].message.content
        
        return _parse_judge_response(response_text)
    except Exception as e:
        # Best-effort metric: report the failure and fall back to neutral
        print(f"Error with LLM judge: {e}")
        return 3, "Error evaluating answer"


# ============================================================================
# STEP 3: Optionally compute generation metrics
# ============================================================================

# Filled only when generation metrics are actually computed; stays empty here.
generation_metrics = {}

if COMPUTE_BLEU_ROUGE or USE_LLM_AS_JUDGE:
    print("\n" + "=" * 70)
    print("COMPUTING GENERATION METRICS")
    print("=" * 70)
    
    # This cell does not generate answers itself. It explains what each
    # enabled metric needs; the commented example below shows how to wire in
    # answers from your own RAG system.
    print("\nNote: Generation metrics require either:")
    print("  1. Reference answers in evaluation_groundtruth table")
    print("  2. Generated answers from your RAG system")
    
    if COMPUTE_BLEU_ROUGE:
        print("\n[BLEU/ROUGE] These metrics require reference answers.")
        print("Add 'reference_answer' column to evaluation_groundtruth if available.")
    
    if USE_LLM_AS_JUDGE:
        print("\n[LLM-as-Judge] This requires OpenAI API key in environment.")
        print("Set OPENAI_API_KEY environment variable to enable.")
    
    # Example: If you have reference answers and generated answers:
    # bleu_scores = []
    # rouge_scores = []
    # 
    # for i, question in enumerate(ground_truth_questions):
    #     reference = question.get('reference_answer', '')
    #     # In practice, generate answer from your RAG system:
    #     # generated = rag_system.generate_answer(question['question'], retrieved_docs)
    #     
    #     if reference and generated:
    #         bleu = compute_bleu_score(reference, generated)
    #         rouge = compute_rouge_score(reference, generated)
    #         bleu_scores.append(bleu)
    #         rouge_scores.append(rouge)
    # 
    # if bleu_scores:
    #     generation_metrics['bleu_score'] = np.mean(bleu_scores)
    #     generation_metrics['rouge_score'] = np.mean(rouge_scores)
    
    # The flags are already on in this branch; nothing is computed because no
    # generated answers are available here.
    print("\nGeneration metrics computation skipped "
          "(no generated answers available; see the commented example in this cell)")

else:
    print("\nGeneration metrics disabled. Focusing on retrieval evaluation.")
    print("Set COMPUTE_BLEU_ROUGE=True and/or USE_LLM_AS_JUDGE=True to enable.")

## Visualize Metrics

In [None]:
# ============================================================================
# PART 4: VISUALIZATION
# ============================================================================

def _plot_score_distribution(ax, scores, metric_label: str, color: str) -> None:
    """Draw a histogram of per-query scores with mean and median marker lines."""
    ax.hist(scores, bins=10, alpha=0.7, edgecolor='black',
            color=color, linewidth=1.5)

    mean_score = np.mean(scores)
    median_score = np.median(scores)

    ax.axvline(mean_score, color='red', linestyle='--', linewidth=2,
               label=f'Mean: {mean_score:.3f}')
    ax.axvline(median_score, color='green', linestyle='--', linewidth=2,
               label=f'Median: {median_score:.3f}')

    ax.set_xlabel(metric_label, fontsize=11, fontweight='bold')
    ax.set_ylabel('Number of Queries', fontsize=11, fontweight='bold')
    ax.set_title(f'Distribution of {metric_label} Across Queries',
                 fontsize=12, fontweight='bold')
    ax.legend(fontsize=10)
    ax.grid(True, alpha=0.3, axis='y')


def visualize_metrics(metrics: Dict, k_values: List[int] = [1, 3, 5, 10]):
    """
    Create comprehensive visualizations for metrics.

    Draws a 2x3 grid: Precision/Recall vs K, NDCG vs K, MRR, per-query
    distributions of Precision and Recall at one "focus" K, and a heatmap of
    the ten queries with the lowest precision at that K. A per-query
    statistics table is printed afterwards.

    The focus K is 5 when 5 is in `k_values`, otherwise the middle entry of
    `k_values`. The heatmap and the statistics table use the last three
    entries of `k_values`. With the default `k_values` this gives focus K = 5
    and detail Ks 3, 5 and 10. Per-query panels whose metric is not present
    in the per-query data are left empty instead of raising.

    Args:
        metrics: Dict returned by evaluate_rag_results(); must contain
            'precision@K', 'recall@K' and 'ndcg@K' for every K in
            `k_values`, plus 'mrr'
        k_values: K values to plot (not modified)
    """
    k_values = list(k_values)

    # Per-query rows as a DataFrame; empty (no columns) when none were scored
    per_query = metrics.get('per_query') or []
    df = pd.DataFrame(per_query)

    # K used for the distribution plots and for ranking the hardest queries
    if 5 in k_values:
        focus_k = 5
    elif k_values:
        focus_k = k_values[len(k_values) // 2]
    else:
        focus_k = None
    focus_precision_col = f'precision@{focus_k}'
    focus_recall_col = f'recall@{focus_k}'

    # Ks shown in the heatmap and in the statistics table
    detail_ks = k_values[-3:]

    plt.figure(figsize=(16, 12))
    
    # ========================================================================
    # Plot 1: Precision and Recall at different K values
    # ========================================================================
    ax1 = plt.subplot(2, 3, 1)
    
    precision_values = [metrics[f'precision@{k}'] for k in k_values]
    recall_values = [metrics[f'recall@{k}'] for k in k_values]
    
    ax1.plot(k_values, precision_values, marker='o', linewidth=2, 
             markersize=8, label='Precision@K', color='#2E86AB')
    ax1.plot(k_values, recall_values, marker='s', linewidth=2, 
             markersize=8, label='Recall@K', color='#A23B72')
    
    ax1.set_xlabel('K (Number of Results)', fontsize=11, fontweight='bold')
    ax1.set_ylabel('Score', fontsize=11, fontweight='bold')
    ax1.set_title('Precision and Recall at Different K', fontsize=12, fontweight='bold')
    ax1.legend(fontsize=10)
    ax1.grid(True, alpha=0.3)
    ax1.set_ylim(0, 1.0)
    ax1.set_xticks(k_values)
    
    # ========================================================================
    # Plot 2: NDCG at different K values
    # ========================================================================
    ax2 = plt.subplot(2, 3, 2)
    
    ndcg_values = [metrics[f'ndcg@{k}'] for k in k_values]
    
    bars = ax2.bar([str(k) for k in k_values], ndcg_values, alpha=0.8, 
                    color=['#06A77D', '#1FA24A', '#F18F01', '#C73E1D'],
                    edgecolor='black', linewidth=1.5)
    
    # Add value labels on bars
    for bar, value in zip(bars, ndcg_values):
        height = bar.get_height()
        ax2.text(bar.get_x() + bar.get_width()/2., height,
                f'{value:.3f}', ha='center', va='bottom', fontsize=10, fontweight='bold')
    
    ax2.set_xlabel('K (Number of Results)', fontsize=11, fontweight='bold')
    ax2.set_ylabel('NDCG@K', fontsize=11, fontweight='bold')
    ax2.set_title('Normalized Discounted Cumulative Gain', fontsize=12, fontweight='bold')
    ax2.set_ylim(0, 1.0)
    ax2.grid(True, alpha=0.3, axis='y')
    
    # ========================================================================
    # Plot 3: MRR Score
    # ========================================================================
    ax3 = plt.subplot(2, 3, 3)
    
    mrr_score = metrics['mrr']
    # Traffic-light colouring: green >= 0.7, orange >= 0.4, red below
    if mrr_score >= 0.7:
        mrr_color = '#06A77D'
    elif mrr_score >= 0.4:
        mrr_color = '#F18F01'
    else:
        mrr_color = '#C73E1D'
    
    ax3.bar(['MRR'], [mrr_score], alpha=0.8, color=mrr_color, 
            edgecolor='black', linewidth=2, width=0.5)
    
    ax3.text(0, mrr_score + 0.03, f'{mrr_score:.4f}', 
            ha='center', va='bottom', fontsize=12, fontweight='bold')
    
    ax3.set_ylabel('Mean Reciprocal Rank', fontsize=11, fontweight='bold')
    ax3.set_title('First Relevant Result Position', fontsize=12, fontweight='bold')
    ax3.set_ylim(0, 1.0)
    ax3.grid(True, alpha=0.3, axis='y')
    ax3.set_xticklabels(['MRR'], fontsize=11)
    
    # ========================================================================
    # Plot 4: Distribution of Precision@focus_k Across Queries
    # ========================================================================
    ax4 = plt.subplot(2, 3, 4)
    
    if focus_precision_col in df.columns:
        _plot_score_distribution(ax4, df[focus_precision_col].tolist(),
                                 f'Precision@{focus_k}', '#2E86AB')
    
    # ========================================================================
    # Plot 5: Distribution of Recall@focus_k Across Queries
    # ========================================================================
    ax5 = plt.subplot(2, 3, 5)
    
    if focus_recall_col in df.columns:
        _plot_score_distribution(ax5, df[focus_recall_col].tolist(),
                                 f'Recall@{focus_k}', '#A23B72')
    
    # ========================================================================
    # Plot 6: Per-Query Performance Heatmap (hardest queries)
    # ========================================================================
    ax6 = plt.subplot(2, 3, 6)
    
    # Only columns that exist in the per-query data are plotted
    candidate_cols = [f'precision@{k}' for k in detail_ks] + ['mrr']
    metric_cols = [col for col in candidate_cols if col in df.columns]
    
    if focus_precision_col in df.columns and metric_cols:
        # Ten hardest queries = lowest precision at the focus K
        df_sorted = df.nsmallest(10, focus_precision_col)
        heatmap_data = df_sorted[metric_cols].values
        
        im = ax6.imshow(heatmap_data, cmap='RdYlGn', aspect='auto', vmin=0, vmax=1)
        
        # Rows are labelled by difficulty rank (Q1 = hardest), not by query id
        ax6.set_xticks(range(len(metric_cols)))
        ax6.set_xticklabels(metric_cols, rotation=45, ha='right')
        ax6.set_yticks(range(len(df_sorted)))
        ax6.set_yticklabels([f"Q{i}" for i in range(1, len(df_sorted) + 1)])
        
        ax6.set_title('Hardest 10 Queries: Metric Performance', fontsize=12, fontweight='bold')
        
        cbar = plt.colorbar(im, ax=ax6)
        cbar.set_label('Score', fontsize=10)
    
    plt.tight_layout()
    plt.show()
    
    # Print summary statistics
    print("\n" + "=" * 70)
    print("VISUALIZATION SUMMARY")
    print("=" * 70)
    
    if len(per_query) > 0:
        header = (f"{'Metric':<20} {'Mean':<10} {'Median':<10} "
                  f"{'Std Dev':<10} {'Min':<10} {'Max':<10}")
        
        print("\nPer-Query Statistics:")
        print(header)
        print("-" * len(header))
        
        # Every value is padded to the header's column widths so rows line up
        for family, title in (('precision', 'Precision'), ('recall', 'Recall')):
            for k in detail_ks:
                col = f'{family}@{k}'
                if col not in df.columns:
                    continue
                series = df[col]
                row_label = f'{title}@{k}'
                print(f"{row_label:<20} {series.mean():<10.4f} {series.median():<10.4f} "
                      f"{series.std():<10.4f} {series.min():<10.4f} {series.max():<10.4f}")
            
            if family == 'precision':
                print()


# ============================================================================
# STEP 4: Create visualizations
# ============================================================================

# Banner so the plotting step is easy to spot in the cell output
print("\n" + "=" * 70)
print("CREATING VISUALIZATIONS")
print("=" * 70)

# Plot with the same K values that were passed to evaluate_rag_results(),
# so every 'metric@K' key the plots look up exists in all_metrics
visualize_metrics(all_metrics, k_values=TOP_K_VALUES)

## Export Results

In [None]:
# ============================================================================
# PART 5: EXPORT AND STORAGE
# ============================================================================

def store_metrics_to_database(db_connection: psycopg2.extensions.connection,
                             experiment_id: int, 
                             metrics: Dict) -> Tuple[bool, str]:
    """
    Persist computed metrics as rows of the evaluation_results table.

    One row is inserted per entry of `metrics`:
    - 'per_query'   -> name 'per_query_details', value 0.0, details JSON
                       {'per_query_results': [...]}
    - 'num_queries' -> name 'num_queries_evaluated', value as float
    - anything else -> stored under its own name, value as float
    All rows are committed together; on any error the transaction is rolled
    back and nothing is stored.

    Args:
        db_connection: PostgreSQL connection
        experiment_id: ID from start_experiment()
        metrics: Dict of metrics from evaluate_rag_results()

    Returns:
        Tuple of (success: bool, message: str)
    """
    # Statement shared by every row; whitespace kept exactly as sent before
    insert_sql = '''
                        INSERT INTO evaluation_results 
                        (experiment_id, metric_name, metric_value, metric_details_json)
                        VALUES (%s, %s, %s, %s)
                    '''

    try:
        with db_connection.cursor() as cur:
            for metric_name, metric_value in metrics.items():
                # Work out (stored name, numeric value, details JSON) per entry
                if metric_name == 'per_query':
                    row = (
                        'per_query_details',
                        0.0,
                        json.dumps({'per_query_results': metric_value}),
                    )
                elif metric_name == 'num_queries':
                    row = ('num_queries_evaluated', float(metric_value), '{}')
                else:
                    row = (metric_name, float(metric_value), '{}')

                cur.execute(insert_sql, (experiment_id, *row))

        db_connection.commit()
    except Exception as e:
        db_connection.rollback()
        msg = f"Failed to store metrics to database: {str(e)}"
        print(f"[ERROR] {msg}")
        return False, msg

    msg = f"Successfully stored {len(metrics)} metrics to database for experiment #{experiment_id}"
    print(f"[OK] {msg}")
    return True, msg


def save_metrics_to_json(metrics: Dict, 
                        experiment_name: str,
                        export_dir: str = 'data/experiment_results') -> Tuple[bool, str]:
    """
    Export metrics to a timestamped JSON file for easy sharing and archival.

    The file name is the experiment name (lowercased, spaces replaced by
    underscores) followed by a ``YYYYmmdd_HHMMSS`` timestamp. The document
    holds the experiment name, that timestamp, an ISO-8601 ``computed_at``
    value, the aggregated metrics and the per-query results.

    Args:
        metrics: Dict of metrics from evaluate_rag_results(). 'per_query' is
            exported as-is, 'num_queries' is copied unchanged, and every other
            value is converted with float().
        experiment_name: Name of the experiment
        export_dir: Directory to save the JSON file (created if missing)

    Returns:
        Tuple of (success: bool, file_path: str). On any failure the error is
        printed and (False, "") is returned; no partial file is written when
        the metrics cannot be serialized.
    """
    import os

    try:
        os.makedirs(export_dir, exist_ok=True)

        # Read the clock once so the filename, 'timestamp' and 'computed_at'
        # always describe the same instant.
        now = datetime.now()
        timestamp = now.strftime('%Y%m%d_%H%M%S')
        filename = f"{experiment_name.lower().replace(' ', '_')}_{timestamp}.json"
        file_path = os.path.join(export_dir, filename)

        # Aggregated metrics: everything except the two specially handled keys
        aggregated = {
            key: float(value)
            for key, value in metrics.items()
            if key != 'per_query' and key != 'num_queries'
        }
        aggregated['num_queries'] = metrics.get('num_queries', 0)

        export_data = {
            'experiment_name': experiment_name,
            'timestamp': timestamp,
            'computed_at': now.isoformat(),
            'metrics': {
                'aggregated': aggregated,
                'per_query': metrics.get('per_query', [])
            }
        }

        # Serialize BEFORE opening the file: if a value is not
        # JSON-serializable we fail here instead of leaving a truncated
        # file on disk while reporting failure.
        payload = json.dumps(export_data, indent=2)

        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(payload)

        msg = f"Exported metrics to {file_path}"
        print(f"[OK] {msg}")
        return True, file_path

    except Exception as e:
        # Best-effort export: report the problem and let the caller continue
        msg = f"Failed to export metrics to JSON: {str(e)}"
        print(f"[ERROR] {msg}")
        return False, ""


def generate_metrics_summary_report(metrics: Dict, 
                                   experiment_name: str,
                                   techniques_applied: List[str],
                                   k_values: Optional[List[int]] = None) -> str:
    """
    Generate a human-readable summary report of the evaluation.

    Args:
        metrics: Dict of metrics from evaluate_rag_results(). Looked up by
            '<family>@<K>' keys (precision, recall, ndcg) plus 'mrr' and
            'num_queries'; a missing key is reported as 0.
        experiment_name: Name of the experiment
        techniques_applied: List of techniques used
        k_values: K cut-offs to list for each metric family. When None, the
            cut-offs are taken from the '<family>@<K>' keys present in
            `metrics` (union over the three families, ascending); if no such
            key exists the historical default 1, 3, 5, 10 is used.

    Returns:
        String containing the formatted report (lines joined with newlines)
    """
    # (metrics key prefix, label shown in the report, section heading)
    metric_families = (
        ("precision", "Precision", "PRECISION (What % of results shown are relevant?)"),
        ("recall", "Recall", "RECALL (What % of relevant results were found?)"),
        ("ndcg", "NDCG", "NDCG (How well-ranked are results?)"),
    )

    if k_values is None:
        # Report the cut-offs that were actually computed rather than a fixed
        # list, so a customised K configuration is neither dropped nor shown
        # as misleading zeros.
        known_prefixes = {prefix for prefix, _, _ in metric_families}
        discovered = set()
        for name in metrics:
            prefix, sep, suffix = str(name).partition('@')
            if sep and prefix in known_prefixes and suffix.isascii() and suffix.isdigit():
                discovered.add(int(suffix))
        k_values = sorted(discovered) or [1, 3, 5, 10]

    rule_heavy = "=" * 80
    rule_light = "-" * 80

    report = [
        rule_heavy,
        "EVALUATION METRICS SUMMARY REPORT",
        rule_heavy,
        "",
        f"Experiment Name: {experiment_name}",
        f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
        f"Techniques Applied: {', '.join(techniques_applied)}",
        f"Number of Queries Evaluated: {metrics.get('num_queries', 0)}",
        "",
        rule_light,
        "AGGREGATE METRICS",
        rule_light,
        "",
    ]

    # One section per metric family, one row per K. Labels are left-justified
    # to 12 characters so the '=' signs line up (e.g. "Precision@10").
    for prefix, label, heading in metric_families:
        report.append(heading)
        for k in k_values:
            row_label = f"{label}@{k}"
            row_value = metrics.get(f"{prefix}@{k}", 0)
            report.append(f"  {row_label:<12} = {row_value:.4f}")
        report.append("")

    # MRR has no K cut-off, so it gets a single row
    report.append("MRR (How quickly do we find first relevant result?)")
    report.append(f"  MRR          = {metrics.get('mrr', 0):.4f}")
    report.append("")

    report.extend([
        rule_light,
        "INTERPRETATION GUIDE",
        rule_light,
        "",
        "Precision@K (higher is better, aim for 0.6+)",
        "  Answers: Of the top K results shown, what fraction were relevant?",
        "  User perspective: How much noise in my search results?",
        "",
        "Recall@K (higher is better, aim for 0.5+)",
        "  Answers: Did I find all the relevant information?",
        "  System perspective: How much coverage do I have?",
        "",
        "NDCG@K (higher is better, aim for 0.8+)",
        "  Answers: Are the most relevant results ranked at the top?",
        "  Ranking quality: How well-ordered is my result list?",
        "",
        "MRR (higher is better, aim for 0.8+)",
        "  Answers: On average, which position is the first relevant result?",
        "  User satisfaction: How many items until user finds value?",
        "",
        rule_heavy,
    ])

    return "\n".join(report)


# ============================================================================
# STEP 5: Start experiment and store results
# ============================================================================

import hashlib

print("\n" + "=" * 70)
print("STORING RESULTS")
print("=" * 70)

# For tracking in database, we need to use the patterns from foundation/00
# This creates an experiment record to track the evaluation run

config = {
    'top_k_values': TOP_K_VALUES,
    'embedding_model': EMBEDDING_MODEL_ALIAS,
    'num_test_queries': len(ground_truth_questions),
    'compute_bleu_rouge': COMPUTE_BLEU_ROUGE,
    'use_llm_judge': USE_LLM_AS_JUDGE,
}

print("\nExperiment Configuration:")
for key, value in config.items():
    print(f"  {key}: {value}")

# Start experiment (using patterns from foundation/00)
print(f"\nStarting experiment: {EXPERIMENT_NAME}")

# Compute config hash for tracking: keys are sorted so the same settings
# always hash to the same 12-character fingerprint.
config_str = json.dumps(config, sort_keys=True)
config_hash = hashlib.sha256(config_str.encode()).hexdigest()[:12]

try:
    with db_connection.cursor() as cur:
        # Create experiment record and read back its generated id
        cur.execute('''
            INSERT INTO experiments (
                experiment_name, 
                notebook_path, 
                embedding_model_alias,
                config_hash,
                config_json,
                techniques_applied,
                notes,
                status
            )
            VALUES (%s, %s, %s, %s, %s, %s, %s, 'completed')
            RETURNING id
        ''', (
            EXPERIMENT_NAME,
            'evaluation-lab/02-evaluation-metrics-framework.ipynb',
            EMBEDDING_MODEL_ALIAS,
            config_hash,
            json.dumps(config),
            TECHNIQUES_APPLIED,
            f"Baseline metrics evaluation on {len(ground_truth_questions)} test queries"
        ))

        experiment_id = cur.fetchone()[0]

    db_connection.commit()
except Exception:
    # A failed statement leaves the connection in an aborted transaction;
    # roll back so later cells can still use it, then surface the error.
    db_connection.rollback()
    raise

print(f"Created experiment #{experiment_id}")

# Store metrics to database
print("\nStoring metrics to evaluation_results table...")
success_db, msg_db = store_metrics_to_database(db_connection, experiment_id, all_metrics)

# Export to JSON file
print("\nExporting metrics to JSON...")
success_json, json_path = save_metrics_to_json(all_metrics, EXPERIMENT_NAME)

# Generate and print summary report
print("\n" + "=" * 70)
summary_report = generate_metrics_summary_report(all_metrics, EXPERIMENT_NAME, TECHNIQUES_APPLIED)
print(summary_report)
print("=" * 70)

# Save report to text file
import os
os.makedirs('data/experiment_results', exist_ok=True)
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
report_path = f"data/experiment_results/{EXPERIMENT_NAME.lower().replace(' ', '_')}_{timestamp}.txt"
# Explicit UTF-8: the report embeds the configurable experiment and technique
# names, so the platform-default encoding must not decide whether this write
# succeeds or how the file reads on another machine.
with open(report_path, 'w', encoding='utf-8') as f:
    f.write(summary_report)
print(f"\nReport saved to: {report_path}")

# Final summary
print("\n" + "=" * 70)
print("EVALUATION COMPLETE")
print("=" * 70)

# Report what actually happened: both the database store and the JSON export
# return a success flag, and either can fail without raising.
if success_db and success_json:
    print("\nResults stored successfully:")
else:
    print("\nResults stored with errors:")

if success_db:
    print(f"  Database: experiment #{experiment_id}")
else:
    print(f"  Database: experiment #{experiment_id} created, but metrics were NOT stored ({msg_db})")

if success_json:
    print(f"  JSON file: {json_path}")
else:
    print("  JSON file: export failed (see error above)")

print(f"  Report: {report_path}")

print("\nKey metrics:")
print(f"  Precision@5: {all_metrics.get('precision@5', 0):.4f}")
print(f"  Recall@5: {all_metrics.get('recall@5', 0):.4f}")
print(f"  NDCG@5: {all_metrics.get('ndcg@5', 0):.4f}")
print(f"  MRR: {all_metrics.get('mrr', 0):.4f}")
print(f"  Queries evaluated: {all_metrics.get('num_queries', 0)}")

print("\nNext steps:")
print("  1. Review the visualizations above")
print("  2. Check the metrics report")
print("  3. Use metrics as baseline for advanced techniques")
print("  4. Run advanced-techniques notebooks to improve scores")