## Prerequisites

1. ✅ foundation/00-setup-postgres-schema.ipynb
2. ✅ foundation/02-rag-postgresql-persistent.ipynb
3. ✅ evaluation-lab/01-create-ground-truth-human-in-loop.ipynb

## Configuration

In [None]:
# Alias of the embedding model. In this notebook it is used as the registry
# lookup key, as the suffix of the embeddings table name, and as the model
# name passed to Ollama when embedding queries.
# NOTE(review): the alias contains hyphens, while the table name is derived by
# replacing only "." with "_" and is interpolated unquoted into SQL — confirm
# the resulting table name is valid and matches the table that was created.
EMBEDDING_MODEL_ALIAS = "all-minilm-l6-v2"
# Target maximum characters per chunk (passed as max_chunk_size to semantic_chunk_split).
CHUNK_SIZE = 256
# Characters carried over from the end of one chunk into the next.
CHUNK_OVERLAP = 50
SPLIT_STRATEGY = "semantic"  # vs. 'fixed' or 'paragraph'
# Number of results requested from each retrieval call.
TOP_K = 5

# Labels describing this run; presumably recorded by the experiment-tracking
# helpers later in the notebook — TODO confirm.
EXPERIMENT_NAME = "semantic-chunking-metadata"
TECHNIQUES_APPLIED = ["semantic_chunking", "metadata_extraction", "filtered_retrieval"]

## Load Embeddings from Registry

In [None]:
import psycopg2
import psycopg2.extras
import ollama
import json
import pandas as pd
import numpy as np
import hashlib
import re
from datetime import datetime
from typing import List, Dict, Tuple, Optional
import os

# Connection settings for the PostgreSQL instance used by this notebook
POSTGRES_CONFIG = dict(
    host='localhost',
    port=5432,
    database='rag_db',
    user='postgres',
    password='postgres',
)

# Open the shared connection; a connection failure is reported and re-raised
try:
    db_connection = psycopg2.connect(**POSTGRES_CONFIG)
except psycopg2.OperationalError as e:
    print(f"✗ Failed to connect to PostgreSQL: {e}")
    raise
else:
    print("✓ Connected to PostgreSQL")

# ============================================================================
# PART 1: REGISTRY DISCOVERY & LOAD-OR-GENERATE PATTERN
# ============================================================================

def list_available_embeddings(db_connection) -> pd.DataFrame:
    """List every model recorded in ``embedding_registry``, newest first.

    The query runs through a DB-API cursor instead of ``pd.read_sql``: pandas
    only supports SQLAlchemy connectables (or sqlite3) and emits a
    ``UserWarning`` when handed a raw psycopg2 connection.

    Args:
        db_connection: Open DB-API connection to the database holding
            ``embedding_registry``.

    Returns:
        DataFrame with columns: model_alias, model_name, dimension,
        embedding_count, chunk_source_dataset, chunk_size_config, created_at,
        last_accessed. Empty (but with those columns) when the registry has
        no rows.

    Raises:
        Exception: Whatever the driver raises when the query fails. The
            connection is rolled back first so it remains usable.
    """
    query = '''
        SELECT
            model_alias,
            model_name,
            dimension,
            embedding_count,
            chunk_source_dataset,
            chunk_size_config,
            created_at,
            last_accessed
        FROM embedding_registry
        ORDER BY created_at DESC
    '''
    try:
        with db_connection.cursor() as cur:
            cur.execute(query)
            # First element of each description entry is the column name
            columns = [col[0] for col in cur.description]
            rows = cur.fetchall()
    except Exception:
        # A failed statement leaves the transaction aborted until rolled back
        db_connection.rollback()
        raise

    # Same construction pd.read_sql applies to DB-API results
    return pd.DataFrame.from_records(rows, columns=columns, coerce_float=True)


def get_embedding_metadata(db_connection, model_alias: str) -> Optional[Dict]:
    """Look up the registry row for one model alias.

    Args:
        db_connection: PostgreSQL connection
        model_alias: Alias to match against ``embedding_registry.model_alias``

    Returns:
        None when no row matches; otherwise a dict with the keys dimension,
        embedding_count, chunk_source_dataset, chunk_size_config, created_at
        and metadata_json (an empty dict when the stored value is falsy).
    """
    field_names = (
        'dimension',
        'embedding_count',
        'chunk_source_dataset',
        'chunk_size_config',
        'created_at',
        'metadata_json',
    )

    with db_connection.cursor() as cur:
        cur.execute('''
            SELECT
                dimension,
                embedding_count,
                chunk_source_dataset,
                chunk_size_config,
                created_at,
                metadata_json
            FROM embedding_registry
            WHERE model_alias = %s
        ''', (model_alias,))
        row = cur.fetchone()

    if not row:
        return None

    # Pair each selected column with its value, in SELECT order
    info = dict(zip(field_names, row))
    info['metadata_json'] = info['metadata_json'] or {}
    return info


class PostgreSQLVectorDB:
    """Read-side wrapper around a single embeddings table in PostgreSQL."""

    def __init__(self, config, table_name, preserve_existing=True):
        # preserve_existing is accepted but not read anywhere in this class.
        self.config = config
        self.table_name = table_name
        connect_kwargs = {
            key: config[key]
            for key in ('host', 'port', 'database', 'user', 'password')
        }
        self.conn = psycopg2.connect(**connect_kwargs)
        print(f'✓ Connected to table: {table_name}')

    def get_chunk_count(self):
        """Return the number of rows in the embeddings table."""
        count_sql = f'SELECT COUNT(*) FROM {self.table_name}'
        with self.conn.cursor() as cursor:
            cursor.execute(count_sql)
            row = cursor.fetchone()
            return row[0]

    def similarity_search(self, query_embedding, top_n=5):
        """Return the rows closest to a query vector, nearest first.

        Ordering uses the ``<=>`` distance operator; the reported similarity
        is ``1 - distance``.

        Args:
            query_embedding: Query embedding vector
            top_n: Number of results to return

        Returns:
            List of tuples: (chunk_text, similarity_score, chunk_id, metadata),
            where metadata is an empty dict when the stored value is falsy.
        """
        with self.conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
            cur.execute(f'''
                SELECT id,
                       content as chunk_text,
                       metadata_json,
                       1 - (embedding <=> %s::vector) as similarity
                FROM {self.table_name}
                ORDER BY embedding <=> %s::vector
                LIMIT %s
            ''', (query_embedding, query_embedding, top_n))

            hits = []
            for record in cur.fetchall():
                hits.append((
                    record['chunk_text'],
                    record['similarity'],
                    record['id'],
                    record['metadata_json'] or {},
                ))
            return hits

    def close(self):
        """Close the underlying connection if one is held."""
        if not self.conn:
            return
        self.conn.close()


def load_or_generate(db_connection, embedding_model_alias, preserve_existing=True):
    """Load embeddings listed in the registry, or print instructions if absent.

    The registry is checked first so that existing embeddings can be reused
    instead of regenerated. This function never generates embeddings itself.

    Args:
        db_connection: PostgreSQL connection object used for the registry lookup
        embedding_model_alias: Model identifier to look up in embedding_registry
        preserve_existing: If True, connect to the existing embeddings table.
            If False, nothing is loaded and a note on how to regenerate is
            printed.

    Returns:
        PostgreSQLVectorDB instance ready for use, or None when the registry
        cannot be queried, has no entry for the alias, loading fails, or
        preserve_existing is False.
    """

    print(f"\n{'='*70}")
    print(f"Checking for embeddings: '{embedding_model_alias}'...")
    print(f"{'='*70}\n")

    try:
        with db_connection.cursor() as cur:
            cur.execute('''
                SELECT id, dimension, embedding_count, created_at, metadata_json
                FROM embedding_registry
                WHERE model_alias = %s
            ''', (embedding_model_alias,))
            registry_entry = cur.fetchone()
    except Exception as e:
        # A failed statement leaves the transaction aborted; roll back so the
        # shared connection can still be used by later cells.
        try:
            db_connection.rollback()
        except Exception as rollback_error:
            print(f"Could not roll back connection: {rollback_error}")
        print(f"Could not query registry: {e}")
        print("Make sure foundation/00-setup-postgres-schema.ipynb has been run.")
        return None

    # Case B: No embeddings found
    if not registry_entry:
        print(f"✗ NO EMBEDDINGS FOUND")
        print(f"  Model: {embedding_model_alias}")
        print(f"\nTo create embeddings, run:")
        print(f"  foundation/02-rag-postgresql-persistent.ipynb")
        print(f"\nThen come back and re-run this cell.\n")
        return None

    # Case A: Embeddings exist
    reg_id, dimension, embedding_count, created_at, metadata_json = registry_entry
    # The count column may be NULL; the ',' format spec would fail on None
    count_text = f"{embedding_count:,}" if embedding_count is not None else "unknown"

    print(f"✓ FOUND EXISTING EMBEDDINGS")
    print(f"  Model:      {embedding_model_alias}")
    print(f"  Count:      {count_text} embeddings")
    print(f"  Dimension:  {dimension}")
    print(f"  Created:    {created_at}")
    print(f"\n  TIME SAVINGS:")
    print(f"    Loading:       <1 second")
    print(f"    Regenerating:  ~50+ minutes")
    print(f"    ➜ You save 50+ minutes by loading!\n")

    if not preserve_existing:
        # Regeneration is not implemented here; say so instead of silently
        # falling through.
        print("preserve_existing=False: existing embeddings were not loaded.")
        print("To regenerate, run foundation/02-rag-postgresql-persistent.ipynb\n")
        return None

    # Auto-load (for scripts/notebooks)
    print("Loading existing embeddings...\n")

    db_instance = None
    try:
        table_name = f'embeddings_{embedding_model_alias.replace(".", "_")}'

        db_instance = PostgreSQLVectorDB(
            config=POSTGRES_CONFIG,
            table_name=table_name,
            preserve_existing=True
        )

        count = db_instance.get_chunk_count()
        print(f"✓ LOADED SUCCESSFULLY")
        print(f"  Embeddings: {count:,}")
        print(f"  Table: {table_name}")
        print(f"  Status: Ready for retrieval\n")

        return db_instance

    except Exception as e:
        # Do not leak the freshly opened connection when the table check fails
        if db_instance is not None:
            db_instance.close()
        print(f"\n✗ Error loading embeddings: {e}")
        print(f"\nTroubleshooting:")
        print(f"  1. Verify PostgreSQL is running")
        print(f"  2. Check POSTGRES_CONFIG settings")
        print(f"  3. Run foundation/02 to generate embeddings first")
        return None


# Step 1: show what the registry currently holds
print("Step 1: Discovering available embeddings...\n")
available = list_available_embeddings(db_connection)

if not available.empty:
    print("Available embeddings:")
    print(available.to_string(index=False))
    print()
else:
    print("⚠️  No embeddings found in registry yet.")
    print("Run foundation/02-rag-postgresql-persistent.ipynb first.\n")

# Step 2: try to attach to the embeddings table for the configured model
print("\nStep 2: Loading embeddings using load-or-generate pattern...\n")
embeddings_db = load_or_generate(
    db_connection=db_connection,
    embedding_model_alias=EMBEDDING_MODEL_ALIAS,
    preserve_existing=True,  # load existing embeddings when the registry has them
)

if not embeddings_db:
    print("⚠️  Could not load embeddings. See instructions above.")
    # Normalise any falsy result to None for the cells that follow
    embeddings_db = None
else:
    print("✓ Success! Embeddings loaded and ready for retrieval.")

## Implement Semantic Chunking

In [None]:
# ============================================================================
# PART 2: IMPLEMENT SEMANTIC CHUNKING WITH METADATA EXTRACTION
# ============================================================================

def semantic_chunk_split(text, max_chunk_size=512, overlap=50):
    """
    Split text into chunks at paragraph boundaries (blank lines).

    Paragraphs are accumulated until adding the next one would push the chunk
    past ``max_chunk_size``; the chunk is then emitted and the next chunk
    starts with the last ``overlap`` characters of the emitted one.

    Limitations: a single paragraph longer than ``max_chunk_size`` is kept
    whole, and the overlap prefix is not counted against the size limit when
    a chunk is started, so chunks can exceed ``max_chunk_size``.

    Args:
        text: Full text to chunk
        max_chunk_size: Target max characters per chunk
        overlap: Characters to overlap between chunks (values <= 0 disable
            overlap)

    Returns:
        List of dicts with 'text', 'start_pos', 'end_pos', 'boundary_type'.
        'start_pos'/'end_pos' are offsets into ``text`` such that
        ``text[start_pos:end_pos].strip() == chunk['text']``.
    """
    separator = '\n\n'
    overlap = max(overlap, 0)
    chunks = []

    def emit(chunk_text, start):
        chunks.append({
            'text': chunk_text.strip(),
            'start_pos': start,
            'end_pos': start + len(chunk_text),
            'boundary_type': 'paragraph'
        })

    current_chunk = ""
    current_start = 0
    para_start = 0  # offset of the current paragraph within `text`

    for para in text.split(separator):
        if not current_chunk:
            # Nothing accumulated yet: the chunk begins at this paragraph
            current_chunk = para
            current_start = para_start
        elif len(current_chunk) + len(para) > max_chunk_size:
            emit(current_chunk, current_start)
            chunk_end = current_start + len(current_chunk)

            # s[-0:] is the whole string, so zero overlap needs its own case
            tail = current_chunk[-overlap:] if overlap else ""
            if tail:
                # The tail is followed in `text` by the separator and `para`,
                # so the new chunk is still a contiguous slice of `text`
                current_chunk = tail + separator + para
                current_start = chunk_end - len(tail)
            else:
                current_chunk = para
                current_start = para_start
        else:
            current_chunk += separator + para

        para_start += len(para) + len(separator)

    # Add final chunk
    if current_chunk:
        emit(current_chunk, current_start)

    return chunks


def extract_metadata(chunk_text):
    """
    Derive simple regex/heuristic flags and counts from a chunk of text.

    Returns:
        dict: {
            'has_dates': bool      - contains a standalone 4-digit number,
            'has_numbers': bool    - contains any digit,
            'has_entities': bool   - contains at least one capitalized word run,
            'entity_count': int    - number of distinct capitalized word runs,
            'word_count': int      - whitespace-separated token count,
            'has_questions': bool  - contains a '?'
        }
    """
    # Runs of Capitalized words; a rough stand-in for named entities
    capitalized_runs = re.findall(r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b', chunk_text)

    return {
        'has_dates': re.search(r'\b\d{4}\b', chunk_text) is not None,
        'has_numbers': re.search(r'\d+', chunk_text) is not None,
        'has_entities': bool(capitalized_runs),
        'entity_count': len(set(capitalized_runs)),
        'word_count': len(chunk_text.split()),
        'has_questions': chunk_text.find('?') != -1,
    }


# Load documents from database for chunking demonstration
print("\n" + "="*70)
print("LOADING DOCUMENTS FOR SEMANTIC CHUNKING DEMONSTRATION")
print("="*70)

# Use a few stored chunk texts as stand-in "documents" for the demo
sample_docs = []
embeddings_table_name = f'embeddings_{EMBEDDING_MODEL_ALIAS.replace(".", "_")}'

if embeddings_db:
    with db_connection.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
        cur.execute(f'''
            SELECT DISTINCT content
            FROM {embeddings_table_name}
            LIMIT 5
        ''')
        sample_docs.extend(row['content'] for row in cur.fetchall())

print(f"\nLoaded {len(sample_docs)} sample documents for demonstration\n")

if sample_docs:
    # Demonstrate semantic chunking on first document
    test_doc = sample_docs[0]

    print(f"Sample document (first 200 chars):")
    print(f"  {test_doc[:200]}...\n")

    # Apply semantic chunking
    print(f"Applying semantic chunking (max_chunk_size={CHUNK_SIZE}, overlap={CHUNK_OVERLAP})...")
    semantic_chunks = semantic_chunk_split(test_doc, max_chunk_size=CHUNK_SIZE, overlap=CHUNK_OVERLAP)

    print(f"✓ Created {len(semantic_chunks)} semantic chunks\n")

    # Extract and display metadata for each chunk
    print("Chunks with extracted metadata:")
    print("-" * 70)

    chunks_with_metadata = []
    for i, chunk in enumerate(semantic_chunks):
        metadata = extract_metadata(chunk['text'])
        chunk['metadata'] = metadata
        chunks_with_metadata.append(chunk)

        print(f"\n[Chunk {i+1}]")
        print(f"  Length: {len(chunk['text'])} chars, {metadata['word_count']} words")
        print(f"  Has dates: {metadata['has_dates']}")
        print(f"  Has numbers: {metadata['has_numbers']}")
        print(f"  Has entities: {metadata['has_entities']} ({metadata['entity_count']} unique)")
        print(f"  Has questions: {metadata['has_questions']}")
        print(f"  Preview: {chunk['text'][:80].replace(chr(10), ' ')}...")

    print("\n" + "="*70)
    print("SEMANTIC CHUNKING SUMMARY")
    print("="*70)

    # Show statistics
    chunk_lengths = [len(c['text']) for c in chunks_with_metadata]
    total_chars = sum(chunk_lengths)
    avg_chunk_size = total_chars / len(chunk_lengths) if chunk_lengths else 0

    print(f"\nChunking Statistics:")
    print(f"  Total chunks: {len(chunks_with_metadata)}")
    print(f"  Average chunk size: {avg_chunk_size:.0f} chars")
    print(f"  Min chunk size: {min(chunk_lengths, default=0)} chars")
    print(f"  Max chunk size: {max(chunk_lengths, default=0)} chars")

    # Metadata statistics
    chunks_with_dates = sum(1 for c in chunks_with_metadata if c['metadata']['has_dates'])
    chunks_with_entities = sum(1 for c in chunks_with_metadata if c['metadata']['has_entities'])
    chunks_with_questions = sum(1 for c in chunks_with_metadata if c['metadata']['has_questions'])

    print(f"\nMetadata Statistics:")
    print(f"  Chunks with dates: {chunks_with_dates}/{len(chunks_with_metadata)}")
    print(f"  Chunks with entities: {chunks_with_entities}/{len(chunks_with_metadata)}")
    print(f"  Chunks with questions: {chunks_with_questions}/{len(chunks_with_metadata)}")

    # An empty document produces no chunks; np.mean([]) would yield nan and
    # a RuntimeWarning, so fall back to 0.0 like the statistics above.
    if chunks_with_metadata:
        avg_entities = np.mean([c['metadata']['entity_count'] for c in chunks_with_metadata])
        avg_words = np.mean([c['metadata']['word_count'] for c in chunks_with_metadata])
    else:
        avg_entities = 0.0
        avg_words = 0.0
    print(f"  Average entities per chunk: {avg_entities:.1f}")
    print(f"  Average words per chunk: {avg_words:.1f}")

else:
    print("⚠️  No embeddings available for demonstration")
    chunks_with_metadata = []

## Filtered Retrieval

In [None]:
# ============================================================================
# PART 3: IMPLEMENT METADATA-FILTERED RETRIEVAL
# ============================================================================

def filtered_retrieval(query, embeddings_db, embedding_model, filters=None, top_k=5):
    """
    Run a vector search and optionally keep only chunks whose metadata matches.

    The search over-retrieves (3x top_k) so that filtering still has a chance
    of leaving top_k results.

    Args:
        query: User question
        embeddings_db: PostgreSQLVectorDB instance
        embedding_model: Model name passed to ollama.embed for the query
        filters: Dict of metadata filters, e.g.
            {'has_dates': True, 'min_word_count': 50}.
            'min_<key>' requires metadata[<key>] >= value (missing -> 0);
            'has_<key>' requires metadata['has_<key>'] == value (missing -> False);
            keys with any other prefix are ignored.
        top_k: Number of results

    Returns:
        Up to top_k results, in search order: list of
        (chunk_text, score, chunk_id, metadata)
    """

    def satisfies(metadata):
        # True when the chunk's metadata meets every recognised filter
        for name, expected in filters.items():
            if name.startswith('min_'):
                if metadata.get(name[4:], 0) < expected:
                    return False
            elif name.startswith('has_'):
                if metadata.get(name, False) != expected:
                    return False
        return True

    # Step 1: broad vector recall
    query_vector = ollama.embed(model=embedding_model, input=query)['embeddings'][0]
    hits = embeddings_db.similarity_search(query_vector, top_n=top_k*3)

    # Step 2: metadata filtering (skipped when no filters are given)
    if filters:
        hits = [
            (chunk_text, score, chunk_id, metadata)
            for chunk_text, score, chunk_id, metadata in hits
            if satisfies(metadata)
        ]

    return hits[:top_k]


# Load ground truth test questions
print("\n" + "="*70)
print("LOADING GROUND TRUTH TEST QUESTIONS")
print("="*70)

ground_truth_questions = []

with db_connection.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
    cur.execute('''
        SELECT
            id,
            question,
            relevant_chunk_ids,
            quality_rating,
            source_type
        FROM evaluation_groundtruth
        WHERE quality_rating = 'good'
        ORDER BY id
        LIMIT 10
    ''')

    ground_truth_questions.extend(
        {
            'id': row['id'],
            'question': row['question'],
            'relevant_chunk_ids': row['relevant_chunk_ids'] or [],
            'quality_rating': row['quality_rating'],
            'source_type': row['source_type']
        }
        for row in cur.fetchall()
    )


def _run_retrieval_demo(question, header, filters, summary_suffix, extras):
    """Run one retrieval scenario, print its results and return them.

    extras is a sequence of (label, metadata_key) pairs appended to each
    result line after the score.
    """
    print(header)
    hits = filtered_retrieval(
        question,
        embeddings_db,
        EMBEDDING_MODEL_ALIAS,
        filters=filters,
        top_k=TOP_K
    )

    print(f"Retrieved {len(hits)} results{summary_suffix}:")
    for rank, (chunk_text, score, chunk_id, metadata) in enumerate(hits, 1):
        preview = chunk_text[:80].replace('\n', ' ')
        details = "".join(f", {label}: {metadata.get(key, 0)}" for label, key in extras)
        print(f"  [{rank}] (score: {score:.4f}{details}) {preview}...")
    return hits


print(f"\nLoaded {len(ground_truth_questions)} ground truth questions")

if ground_truth_questions:
    print(f"Sample question: {ground_truth_questions[0]['question'][:80]}...\n")

    # Demonstrate filtered retrieval
    print("="*70)
    print("DEMONSTRATING FILTERED RETRIEVAL")
    print("="*70)

    if not embeddings_db:
        print("⚠️  No embeddings available for filtered retrieval demonstration")
    else:
        test_question = ground_truth_questions[0]['question']
        print(f"\nQuery: {test_question}\n")

        # Baseline: no filters
        baseline_results = _run_retrieval_demo(
            test_question,
            "BASELINE (No Filters):",
            None,
            "",
            (),
        )

        # With entity filter
        entity_filtered = _run_retrieval_demo(
            test_question,
            "\nWITH ENTITY FILTER (has_entities=True):",
            {'has_entities': True},
            " with entities",
            (('entities', 'entity_count'),),
        )

        # With minimum word count filter
        word_filtered = _run_retrieval_demo(
            test_question,
            "\nWITH MINIMUM WORD COUNT FILTER (min_word_count=50):",
            {'min_word_count': 50},
            " with at least 50 words",
            (('words', 'word_count'),),
        )

        # With combined filters
        combined_filtered = _run_retrieval_demo(
            test_question,
            "\nWITH COMBINED FILTERS (has_entities=True AND min_word_count=30):",
            {'has_entities': True, 'min_word_count': 30},
            "",
            (('entities', 'entity_count'), ('words', 'word_count')),
        )

## Evaluate Impact

In [None]:
# ============================================================================
# PART 4: EVALUATE IMPACT - COHERENCE AND FILTERED RETRIEVAL
# ============================================================================

# Metric functions
def precision_at_k(retrieved_chunk_ids: List[int], relevant_chunk_ids: List[int], k: int = 5) -> float:
    """Precision@K: fraction of the K result slots filled by relevant chunks.

    The denominator is always k, even when fewer than k results were
    retrieved. Returns 0.0 when k is 0.
    """
    if k == 0:
        return 0.0

    relevant = set(relevant_chunk_ids)
    hits = [cid for cid in retrieved_chunk_ids[:k] if cid in relevant]
    return len(hits) / k


def recall_at_k(retrieved_chunk_ids: List[int], relevant_chunk_ids: List[int], k: int = 5) -> float:
    """Recall@K: share of the distinct relevant chunks that appear in the top K.

    Returns 0.0 when there are no relevant chunks.
    """
    if len(relevant_chunk_ids) < 1:
        return 0.0

    relevant = set(relevant_chunk_ids)
    found = [cid for cid in retrieved_chunk_ids[:k] if cid in relevant]
    return len(found) / len(relevant)


def mean_reciprocal_rank(retrieved_chunk_ids: List[int], relevant_chunk_ids: List[int]) -> float:
    """Reciprocal rank of the first relevant result (0.0 if none is relevant)."""
    relevant = set(relevant_chunk_ids)
    reciprocal_ranks = (
        1.0 / position
        for position, cid in enumerate(retrieved_chunk_ids, start=1)
        if cid in relevant
    )
    # Only the first (best-ranked) match counts
    return next(reciprocal_ranks, 0.0)


def ndcg_at_k(retrieved_chunk_ids: List[int], relevant_chunk_ids: List[int], k: int = 5) -> float:
    """NDCG@K with binary relevance: rewards relevant results near the top.

    The ideal DCG is computed from the ground truth (as many relevant results
    as fit in the top K), not from the retrieved list. Normalising against the
    retrieved list alone would give a perfect score to any ranking whose hits
    happen to come first, even when most relevant chunks were missed.

    Args:
        retrieved_chunk_ids: Chunk ids in ranked order.
        relevant_chunk_ids: Ground-truth relevant chunk ids.
        k: Cut-off rank.

    Returns:
        Score in [0, 1]; 0.0 when k <= 0 or there are no relevant chunks.
    """
    import math

    def dcg_score(relevance_scores: List[float]) -> float:
        """Compute DCG from relevance scores (rank 0 is the top result)."""
        return sum(
            (2**rel - 1) / math.log2(rank + 2)
            for rank, rel in enumerate(relevance_scores)
        )

    if k <= 0 or len(relevant_chunk_ids) == 0:
        return 0.0

    relevant_set = set(relevant_chunk_ids)

    # Binary relevance; a repeated id only earns gain on its first occurrence
    # so the score cannot exceed 1.
    seen = set()
    relevance = []
    for chunk_id in retrieved_chunk_ids[:k]:
        is_new_hit = chunk_id in relevant_set and chunk_id not in seen
        relevance.append(1 if is_new_hit else 0)
        seen.add(chunk_id)

    dcg = dcg_score(relevance)

    # Best achievable ranking: every top slot filled by a relevant chunk
    ideal_relevance = [1] * min(k, len(relevant_set))
    idcg = dcg_score(ideal_relevance)

    if idcg == 0:
        return 0.0

    return dcg / idcg


def evaluate_filtering_impact(test_questions: List[Dict],
                              embeddings_db,
                              embedding_model: str,
                              top_k: int = 5) -> Dict:
    """
    Score unfiltered retrieval against metadata-filtered retrieval.

    Each question with a non-empty 'relevant_chunk_ids' is retrieved twice:
    once without filters and once with
    {'has_entities': True, 'min_word_count': 20}. Both result lists are scored
    with precision@5, recall@5, MRR and NDCG@5 (the metric cut-off is fixed at
    5 regardless of top_k). Questions without relevant ids are skipped.

    Args:
        test_questions: List of question dicts with 'question' and 'relevant_chunk_ids'
        embeddings_db: PostgreSQLVectorDB instance
        embedding_model: Model name for embeddings
        top_k: Number of results requested from each retrieval

    Returns:
        Dict with keys 'unfiltered' and 'filtered' (mean metrics),
        'improvements_pct' (relative change in percent, 0.0 when the baseline
        is 0), 'per_query' (per-question details) and 'num_queries' (number of
        questions actually evaluated)
    """

    def retrieve_ids(question_text, filters):
        hits = filtered_retrieval(
            question_text,
            embeddings_db,
            embedding_model,
            filters=filters,
            top_k=top_k
        )
        return [chunk_id for _, _, chunk_id, _ in hits]

    def score(ids, relevant):
        return {
            'precision@5': precision_at_k(ids, relevant, k=5),
            'recall@5': recall_at_k(ids, relevant, k=5),
            'mrr': mean_reciprocal_rank(ids, relevant),
            'ndcg@5': ndcg_at_k(ids, relevant, k=5)
        }

    def mean_by_metric(rows):
        if not rows:
            return {'precision@5': 0, 'recall@5': 0, 'mrr': 0, 'ndcg@5': 0}
        return {
            name: np.mean([row[name] for row in rows])
            for name in rows[0].keys()
        }

    total = len(test_questions)
    progress_step = max(1, total // 5)

    baseline_rows = []
    narrowed_rows = []
    per_query_details = []

    print(f"\nEvaluating metadata filtering on {total} test questions...")
    print("-" * 70)

    for position, item in enumerate(test_questions, 1):
        question_text = item['question']
        relevant_ids = item['relevant_chunk_ids']

        # Nothing to score against
        if not relevant_ids:
            continue

        # Baseline first, then the entity/word-count filtered run
        baseline_ids = retrieve_ids(question_text, None)
        narrowed_ids = retrieve_ids(
            question_text,
            {'has_entities': True, 'min_word_count': 20},
        )

        baseline_metrics = score(baseline_ids, relevant_ids)
        narrowed_metrics = score(narrowed_ids, relevant_ids)

        baseline_rows.append(baseline_metrics)
        narrowed_rows.append(narrowed_metrics)
        per_query_details.append({
            'question': question_text,
            'unfiltered': baseline_metrics,
            'filtered': narrowed_metrics,
            'unfiltered_count': len(baseline_ids),
            'filtered_count': len(narrowed_ids)
        })

        if position % progress_step == 0:
            print(f"  Progress: {position}/{total} queries evaluated")

    unfiltered_agg = mean_by_metric(baseline_rows)
    filtered_agg = mean_by_metric(narrowed_rows)

    # Relative change versus the baseline, in percent
    improvements = {
        name: ((filtered_agg[name] - base) / base * 100) if base > 0 else 0.0
        for name, base in unfiltered_agg.items()
    }

    return {
        'unfiltered': unfiltered_agg,
        'filtered': filtered_agg,
        'improvements_pct': improvements,
        'per_query': per_query_details,
        'num_queries': len(baseline_rows)
    }


# Run evaluation
print("\n" + "="*70)
print("EVALUATION: UNFILTERED VS METADATA-FILTERED RETRIEVAL")
print("="*70)

if embeddings_db and ground_truth_questions:
    eval_results = evaluate_filtering_impact(
        ground_truth_questions,
        embeddings_db,
        EMBEDDING_MODEL_ALIAS,
        top_k=TOP_K
    )

    # Display results
    print("\n" + "="*70)
    print("RESULTS SUMMARY")
    print("="*70)

    num_evaluated = eval_results['num_queries']
    print(f"\nQueries evaluated: {num_evaluated}\n")

    print(f"{'Metric':<20} {'Unfiltered':<15} {'Filtered':<15} {'Change':<15}")
    print("-" * 65)

    for metric in ['precision@5', 'recall@5', 'ndcg@5', 'mrr']:
        unfiltered_val = eval_results['unfiltered'][metric]
        filtered_val = eval_results['filtered'][metric]
        improvement = eval_results['improvements_pct'][metric]

        improvement_str = f"+{improvement:.1f}%" if improvement >= 0 else f"{improvement:.1f}%"
        print(f"{metric:<20} {unfiltered_val:<15.4f} {filtered_val:<15.4f} {improvement_str:<15}")

    # Analysis of which filters matter most
    print("\n" + "="*70)
    print("ANALYSIS: FILTERING EFFECTIVENESS")
    print("="*70)

    if num_evaluated == 0:
        # Every question was skipped (no relevant chunk ids), so the ratios
        # below would divide by zero and the averages would be nan.
        print("\n⚠️  No questions with relevant chunk ids were evaluated; skipping per-query analysis.")
    else:
        # Count how many queries did at least as well with filtering
        beneficial_queries = sum(
            1 for q in eval_results['per_query']
            if q['filtered']['precision@5'] >= q['unfiltered']['precision@5']
        )

        print(f"\nQueries with improved/maintained Precision@5: {beneficial_queries}/{num_evaluated}")
        print(f"  ({beneficial_queries/num_evaluated*100:.1f}% of queries)")

        # Queries where filtering reduced result count
        reduced_queries = sum(
            1 for q in eval_results['per_query']
            if q['filtered_count'] < q['unfiltered_count']
        )

        print(f"\nQueries where filtering reduced result count: {reduced_queries}/{num_evaluated}")

        avg_reduction = np.mean([
            q['unfiltered_count'] - q['filtered_count']
            for q in eval_results['per_query']
        ])

        print(f"  Average reduction in results: {avg_reduction:.1f} chunks")

        # Show top improvement queries
        top_improved = sorted(
            eval_results['per_query'],
            key=lambda x: x['filtered']['precision@5'] - x['unfiltered']['precision@5'],
            reverse=True
        )

        print(f"\nTop 3 queries with largest Precision@5 improvement:")
        for i, q in enumerate(top_improved[:3], 1):
            unf_p5 = q['unfiltered']['precision@5']
            filt_p5 = q['filtered']['precision@5']
            improvement = (filt_p5 - unf_p5) * 100
            # '+' format flag prints the real sign, avoiding "+-20.0%" for regressions
            print(f"  [{i}] {improvement:+.1f}% (Precision@5: {unf_p5:.2f} → {filt_p5:.2f})")
            print(f"      Q: {q['question'][:70]}...")

else:
    print("⚠️  Cannot evaluate: embeddings or test questions not available")
    eval_results = None

## Track Experiment

In [None]:
# ============================================================================
# PART 5: EXPERIMENT TRACKING
# ============================================================================

def compute_config_hash(config_dict: Dict) -> str:
    """Return a short, deterministic fingerprint of a configuration dict.

    The dict is serialised to JSON with sorted keys, so key order does not
    affect the result; identical configurations therefore share a hash.

    Args:
        config_dict: Configuration parameters (must be JSON-serialisable)

    Returns:
        First 12 hex characters of the SHA256 digest
    """
    canonical = json.dumps(config_dict, sort_keys=True).encode()
    return hashlib.sha256(canonical).hexdigest()[:12]


def start_experiment(db_connection, experiment_name: str,
                     notebook_path: Optional[str] = None,
                     embedding_model_alias: Optional[str] = None,
                     config: Optional[Dict] = None,
                     techniques: Optional[List[str]] = None,
                     notes: Optional[str] = None) -> int:
    """Start a new experiment and return its ID for tracking.

    Inserts one row into the ``experiments`` table with status 'running',
    storing both the JSON-encoded configuration and its short hash, then
    commits the transaction.

    Args:
        db_connection: PostgreSQL connection
        experiment_name: Human-readable experiment name
        notebook_path: Path to the notebook running this experiment
        embedding_model_alias: Which embedding model is being used
        config: Dict of configuration parameters (defaults to an empty dict)
        techniques: List of techniques being applied (defaults to an empty list)
        notes: Optional notes about the experiment

    Returns:
        Experiment ID for use in save_metrics() and complete_experiment()

    Raises:
        Exception: Any error raised while inserting or committing is
            re-raised unchanged after the transaction has been rolled back.
    """
    if config is None:
        config = {}
    if techniques is None:
        techniques = []

    config_hash = compute_config_hash(config)

    try:
        with db_connection.cursor() as cur:
            cur.execute('''
                INSERT INTO experiments (
                    experiment_name, notebook_path, embedding_model_alias,
                    config_hash, config_json, techniques_applied, notes, status
                )
                VALUES (%s, %s, %s, %s, %s, %s, %s, 'running')
                RETURNING id
            ''', (
                experiment_name,
                notebook_path,
                embedding_model_alias,
                config_hash,
                json.dumps(config),
                techniques,
                notes
            ))
            exp_id = cur.fetchone()[0]
        db_connection.commit()
    except Exception:
        # A failed statement leaves the connection inside an aborted
        # transaction; roll back so the shared connection stays usable for
        # later calls (mirrors save_metrics / complete_experiment), then let
        # the caller see the original error.
        db_connection.rollback()
        raise

    print(f"✓ Started experiment #{exp_id}: {experiment_name}")
    return exp_id


def save_metrics(db_connection, experiment_id: int, metrics_dict: Dict,
                 export_to_file: bool = True,
                 export_dir: str = 'data/experiment_results') -> Tuple[bool, str]:
    """Save experiment metrics to database and optionally to JSON file.

    All metric values are converted to ``float`` before any row is written,
    then every row is inserted in a single transaction. The JSON export runs
    only after the database commit and has its own error handling: a failed
    export is reported in the returned message but does not turn a
    successful database save into a failure.

    Args:
        db_connection: PostgreSQL connection
        experiment_id: ID from start_experiment()
        metrics_dict: Dict of {metric_name: value, ...}. A value is either a
            number or a dict with optional 'value' (default 0.0) and
            'details' (default {}) keys.
        export_to_file: Whether to also save to filesystem JSON
        export_dir: Directory for JSON exports (created if missing)

    Returns:
        Tuple of (success: bool, message: str). ``success`` is False when the
        database write failed (the transaction is rolled back) and True once
        the metrics are committed, even if the optional JSON export failed.
    """
    # ---- Database write (all-or-nothing) ----
    try:
        rows = []
        for metric_name, metric_data in metrics_dict.items():
            # Handle both simple floats and nested dicts with details
            if isinstance(metric_data, dict):
                metric_value = metric_data.get('value', 0.0)
                metric_details = metric_data.get('details', {})
            else:
                metric_value = metric_data
                metric_details = {}

            try:
                numeric_value = float(metric_value)
            except (TypeError, ValueError) as err:
                # Name the offending metric; the bare float() error does not
                raise ValueError(
                    f"metric '{metric_name}' has non-numeric value {metric_value!r}"
                ) from err

            rows.append((
                experiment_id,
                metric_name,
                numeric_value,
                json.dumps(metric_details) if metric_details else '{}'
            ))

        with db_connection.cursor() as cur:
            for row in rows:
                cur.execute('''
                    INSERT INTO evaluation_results (
                        experiment_id, metric_name, metric_value, metric_details_json
                    )
                    VALUES (%s, %s, %s, %s)
                ''', row)
        db_connection.commit()
    except Exception as e:
        msg = f"✗ Failed to save metrics: {e}"
        print(msg)
        db_connection.rollback()
        return False, msg

    msg = f"✓ Saved {len(metrics_dict)} metrics for experiment #{experiment_id}"

    # ---- Optional JSON export (metrics are already committed) ----
    if export_to_file:
        try:
            timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
            # Serialise before opening the file so a non-serialisable value
            # cannot leave a truncated JSON file behind.
            payload = json.dumps({
                'experiment_id': experiment_id,
                'timestamp': timestamp,
                'metrics': metrics_dict
            }, indent=2)
            os.makedirs(export_dir, exist_ok=True)
            file_path = os.path.join(export_dir, f'experiment_{experiment_id}_{timestamp}.json')
            with open(file_path, 'w', encoding='utf-8') as f:
                f.write(payload)
        except (OSError, TypeError, ValueError) as e:
            msg += f" (⚠️  JSON export failed: {e})"
        else:
            msg += f" to {file_path}"

    print(msg)
    return True, msg


def complete_experiment(db_connection, experiment_id: int,
                       status: str = 'completed',
                       notes: str = None) -> bool:
    """Finalize an experiment row by recording its status and completion time.

    Args:
        db_connection: PostgreSQL connection
        experiment_id: ID returned from start_experiment()
        status: Final status to store, e.g. 'completed' or 'failed'
        notes: Replacement text for the notes column; a falsy value leaves
            the stored notes untouched

    Returns:
        True once the update is committed; False if any error occurred, in
        which case the transaction is rolled back
    """
    try:
        with db_connection.cursor() as cursor:
            if not notes:
                # Nothing to write into notes: update status and timestamp only
                cursor.execute('''
                    UPDATE experiments
                    SET status = %s, completed_at = CURRENT_TIMESTAMP
                    WHERE id = %s
                ''', (status, experiment_id))
            else:
                cursor.execute('''
                    UPDATE experiments
                    SET status = %s, notes = %s, completed_at = CURRENT_TIMESTAMP
                    WHERE id = %s
                ''', (status, notes, experiment_id))
        db_connection.commit()
        print(f"✓ Experiment #{experiment_id} marked as {status}")
        return True
    except Exception as error:
        print(f"✗ Failed to complete experiment: {error}")
        db_connection.rollback()
        return False


# ============================================================================
# RUN EXPERIMENT TRACKING
# ============================================================================
# Registers an experiment row, stores the baseline / filtered / improvement
# metrics, marks the experiment completed or failed, and finally closes the
# database connections. Skipped when eval_results is falsy (it is set to None
# when the evaluation could not run).

print("\n" + "="*70)
print("TRACKING EXPERIMENT")
print("="*70)

if eval_results:
    # Prepare configuration
    config_dict = {
        'embedding_model_alias': EMBEDDING_MODEL_ALIAS,
        'chunk_size': CHUNK_SIZE,
        'chunk_overlap': CHUNK_OVERLAP,
        'split_strategy': SPLIT_STRATEGY,
        'top_k': TOP_K,
        'filtering_criteria': {
            'has_entities': True,
            'min_word_count': 20
        },
        'num_test_queries': eval_results['num_queries'],
    }

    # Computed here only for display; start_experiment() derives the same
    # hash from config_dict and stores it on the experiment row.
    config_hash = compute_config_hash(config_dict)

    print("\nExperiment Configuration:")
    print(f"  Name: {EXPERIMENT_NAME}")
    print(f"  Embedding Model: {EMBEDDING_MODEL_ALIAS}")
    print(f"  Chunk Size: {CHUNK_SIZE}")
    print(f"  Chunk Overlap: {CHUNK_OVERLAP}")
    print(f"  Split Strategy: {SPLIT_STRATEGY}")
    print(f"  Config Hash: {config_hash}")
    print(f"  Test Queries: {eval_results['num_queries']}\n")

    # Start experiment tracking
    experiment_id = start_experiment(
        db_connection,
        experiment_name=EXPERIMENT_NAME,
        notebook_path='advanced-techniques/08-semantic-chunking-and-metadata.ipynb',
        embedding_model_alias=EMBEDDING_MODEL_ALIAS,
        config=config_dict,
        techniques=TECHNIQUES_APPLIED,
        notes=f'Semantic chunking with metadata filtering evaluation on {eval_results["num_queries"]} queries. '
              f'Techniques: {", ".join(TECHNIQUES_APPLIED)}'
    )

    # Prepare metrics for storage
    metrics_to_store = {}

    # Unfiltered baseline metrics
    for metric_name, metric_value in eval_results['unfiltered'].items():
        metrics_to_store[f'baseline_{metric_name}'] = float(metric_value)

    # Filtered metrics
    for metric_name, metric_value in eval_results['filtered'].items():
        metrics_to_store[f'filtered_{metric_name}'] = float(metric_value)

    # Improvement percentages
    for metric_name, improvement_pct in eval_results['improvements_pct'].items():
        metrics_to_store[f'improvement_pct_{metric_name}'] = float(improvement_pct)

    # Evaluation metadata.
    # NOTE: config_hash is deliberately NOT added to the metrics. It is a hex
    # string, and save_metrics() converts every value with float(), so
    # including it makes the whole save fail and the experiment get marked
    # 'failed'. The hash is already persisted by start_experiment().
    metrics_to_store['num_queries_evaluated'] = eval_results['num_queries']

    # Store semantic chunking statistics
    if chunks_with_metadata:
        avg_chunk_size = np.mean([len(c['text']) for c in chunks_with_metadata])
        avg_entities = np.mean([c['metadata']['entity_count'] for c in chunks_with_metadata])
        avg_words = np.mean([c['metadata']['word_count'] for c in chunks_with_metadata])

        metrics_to_store['semantic_chunking_avg_chunk_size'] = float(avg_chunk_size)
        metrics_to_store['semantic_chunking_avg_entities'] = float(avg_entities)
        metrics_to_store['semantic_chunking_avg_words'] = float(avg_words)
        metrics_to_store['semantic_chunking_num_chunks'] = len(chunks_with_metadata)

    # Save metrics
    print("\nSaving metrics to database...\n")
    success, message = save_metrics(db_connection, experiment_id, metrics_to_store, export_to_file=True)

    # Complete experiment
    if success:
        notes = f"Successfully evaluated semantic chunking with metadata filtering on {eval_results['num_queries']} queries. "
        notes += f"Precision@5 improvement: {eval_results['improvements_pct']['precision@5']:.2f}%. "
        notes += f"Techniques applied: {', '.join(TECHNIQUES_APPLIED)}"

        complete_experiment(db_connection, experiment_id, status='completed', notes=notes)

        # Display results summary
        print("\n" + "="*70)
        print("EXPERIMENT RESULTS SUMMARY")
        print("="*70)

        print(f"\nExperiment ID: {experiment_id}")
        print(f"Experiment Name: {EXPERIMENT_NAME}")
        print("Status: Completed")
        print(f"Config Hash: {config_hash}")

        print("\nTechniques Applied:")
        for technique in TECHNIQUES_APPLIED:
            print(f"  - {technique}")

        print("\nKey Improvements:")
        for metric_name, improvement_pct in eval_results['improvements_pct'].items():
            # Negative values already carry their own '-' sign
            sign = "+" if improvement_pct > 0 else ""
            baseline_val = eval_results['unfiltered'][metric_name]
            filtered_val = eval_results['filtered'][metric_name]
            print(f"  {metric_name}:")
            print(f"    Baseline (unfiltered): {baseline_val:.4f}")
            print(f"    With filtering:        {filtered_val:.4f}")
            print(f"    Improvement:           {sign}{improvement_pct:.2f}%")

        # Same guard as above, so the avg_* values are defined here
        if chunks_with_metadata:
            print("\nSemantic Chunking Statistics:")
            print(f"  Average chunk size: {avg_chunk_size:.0f} characters")
            print(f"  Average entities: {avg_entities:.1f} per chunk")
            print(f"  Average words: {avg_words:.1f} per chunk")
            print(f"  Total chunks created: {len(chunks_with_metadata)}")

        print("\nResults exported to:")
        print(f"  Database: evaluation_results table (experiment_id={experiment_id})")
        print(f"  JSON: data/experiment_results/experiment_{experiment_id}_*.json")

        print("\n" + "="*70)
        print("NEXT STEPS")
        print("="*70)
        print("\n1. Review semantic chunking coherence improvements")
        print("2. Compare metadata filter effectiveness:")
        print("   - Try different combinations (has_dates, has_questions, etc.)")
        print("   - Adjust min_word_count threshold")
        print("3. Combine with other techniques:")
        print("   - Query expansion (notebook 06)")
        print("   - Reranking (notebook 05)")
        print("   - Hybrid search (notebook 07)")
        print("4. Evaluate on different question types in evaluation-lab/01")
        print("5. Compare all advanced techniques using evaluation-lab/04")

    else:
        print("\n✗ Failed to track experiment")
        complete_experiment(db_connection, experiment_id, status='failed', notes='Failed to save metrics')

else:
    print("⚠️  Cannot track experiment: evaluation results not available")

# Close database connection
print("\n\nClosing database connection...")
if embeddings_db:
    embeddings_db.close()
db_connection.close()
print("✓ All connections closed")