## Prerequisites

1. ✅ foundation/00-setup-postgres-schema.ipynb
2. ✅ foundation/02-rag-postgresql-persistent.ipynb
3. ✅ evaluation-lab/01-create-ground-truth-human-in-loop.ipynb
4. ✅ Run the individual technique notebooks (05–09) to understand each technique on its own

## Configuration

In [None]:
# Feature flags: switch each technique on or off
ENABLE_SEMANTIC_CHUNKING = True
ENABLE_QUERY_EXPANSION = True
ENABLE_HYBRID_SEARCH = False  # Can be expensive
ENABLE_RERANKING = True
ENABLE_CITATION_TRACKING = True

EMBEDDING_MODEL_ALIAS = "all-minilm-l6-v2"
TOP_K = 5

# Each technique label paired with its switch, in the order the labels
# appear in the experiment name.
_TECHNIQUE_SWITCHES = (
    ("semantic_chunking", ENABLE_SEMANTIC_CHUNKING),
    ("query_expansion", ENABLE_QUERY_EXPANSION),
    ("hybrid_search", ENABLE_HYBRID_SEARCH),
    ("reranking", ENABLE_RERANKING),
    ("citation_tracking", ENABLE_CITATION_TRACKING),
)

# Labels of the techniques that are switched on
ENABLED_TECHNIQUES = [
    label for label, switched_on in _TECHNIQUE_SWITCHES if switched_on
]

# The experiment name is a fixed prefix plus the hyphen-joined labels
EXPERIMENT_NAME = "combined-advanced-rag-" + "-".join(ENABLED_TECHNIQUES)
TECHNIQUES_APPLIED = ENABLED_TECHNIQUES  # same list object, not a copy

## Load Embeddings from Registry

In [None]:
import psycopg2
import psycopg2.extras
import ollama
import json
import pandas as pd
import numpy as np
import hashlib
import math
from datetime import datetime
from typing import List, Dict, Tuple, Optional
import os

# PostgreSQL connection settings (keys match psycopg2.connect keyword names)
POSTGRES_CONFIG = dict(
    host='localhost',
    port=5432,
    database='rag_db',
    user='postgres',
    password='postgres',
)

# Open the connection shared by the rest of the notebook. A connection
# failure is reported and then re-raised so the cell stops.
try:
    db_connection = psycopg2.connect(**POSTGRES_CONFIG)
except psycopg2.OperationalError as e:
    print(f"✗ Failed to connect to PostgreSQL: {e}")
    raise
else:
    print("✓ Connected to PostgreSQL")

# ============================================================================
# REGISTRY DISCOVERY & LOAD-OR-GENERATE PATTERN
# ============================================================================

def list_available_embeddings(db_connection) -> pd.DataFrame:
    """Return every ``embedding_registry`` row as a DataFrame, newest first.

    Args:
        db_connection: Open DB-API connection handed to ``pandas.read_sql``.

    Returns:
        One row per registered model with the columns selected below,
        ordered by ``created_at`` descending.
    """
    registry_sql = """
        SELECT
            model_alias,
            model_name,
            dimension,
            embedding_count,
            chunk_source_dataset,
            chunk_size_config,
            created_at,
            last_accessed
        FROM embedding_registry
        ORDER BY created_at DESC
    """
    registry_frame = pd.read_sql(registry_sql, db_connection)
    return registry_frame


def get_embedding_metadata(db_connection, model_alias: str) -> Optional[Dict]:
    """Look up one model's registry row and return it as a dict.

    Args:
        db_connection: Open connection whose cursors work as context managers.
        model_alias: Value matched against ``embedding_registry.model_alias``.

    Returns:
        ``None`` when no row matches; otherwise a dict keyed by the selected
        column names. A missing ``metadata_json`` becomes an empty dict.
    """
    # Same order as the SELECT list below.
    field_names = (
        'dimension',
        'embedding_count',
        'chunk_source_dataset',
        'chunk_size_config',
        'created_at',
        'metadata_json',
    )

    with db_connection.cursor() as cur:
        cur.execute('''
            SELECT
                dimension,
                embedding_count,
                chunk_source_dataset,
                chunk_size_config,
                created_at,
                metadata_json
            FROM embedding_registry
            WHERE model_alias = %s
        ''', (model_alias,))
        row = cur.fetchone()

    if not row:
        return None

    metadata = dict(zip(field_names, row))
    metadata['metadata_json'] = metadata['metadata_json'] or {}
    return metadata


class PostgreSQLVectorDB:
    """Read-only handle on one pgvector embeddings table.

    Opens its own psycopg2 connection from ``config`` and exposes a row count
    and a similarity search over ``table_name``. Instances can be used as
    context managers; the connection is closed on exit.

    ``table_name`` is interpolated directly into the SQL text, so it must
    come from trusted code, never from user input.
    """

    def __init__(self, config, table_name, preserve_existing=True):
        """Connect to PostgreSQL.

        Args:
            config: Mapping with 'host', 'port', 'database', 'user' and
                'password' entries.
            table_name: Table holding ``id``, ``content`` and ``embedding``.
            preserve_existing: Recorded on the instance; no method here
                modifies the table, so it has no further effect.
        """
        self.config = config
        self.table_name = table_name
        self.preserve_existing = preserve_existing
        self.conn = psycopg2.connect(
            host=config['host'],
            port=config['port'],
            database=config['database'],
            user=config['user'],
            password=config['password']
        )
        print(f'✓ Connected to table: {table_name}')

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False  # never suppress the caller's exception

    def _rollback_after_error(self):
        """Best-effort rollback so a failed query does not poison the connection."""
        try:
            self.conn.rollback()
        except Exception as rollback_error:
            print(f'  ⚠️  Rollback failed: {rollback_error}')

    def get_chunk_count(self):
        """How many embeddings are stored?"""
        try:
            with self.conn.cursor() as cur:
                cur.execute(f'SELECT COUNT(*) FROM {self.table_name}')
                return cur.fetchone()[0]
        except Exception:
            # PostgreSQL rejects every further statement in a transaction
            # after one fails; roll back so the handle stays usable.
            self._rollback_after_error()
            raise

    def similarity_search(self, query_embedding, top_n=5):
        """Retrieve most similar chunks using pgvector.

        Args:
            query_embedding: Query vector, cast to ``vector`` in SQL.
            top_n: Maximum number of rows to return.

        Returns:
            List of ``(chunk_text, similarity, id)`` tuples ordered by the
            ``<=>`` distance ascending; ``similarity`` is ``1 - distance``.
        """
        try:
            with self.conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
                cur.execute(f'''
                    SELECT id,
                           content as chunk_text,
                           1 - (embedding <=> %s::vector) as similarity
                    FROM {self.table_name}
                    ORDER BY embedding <=> %s::vector
                    LIMIT %s
                ''', (query_embedding, query_embedding, top_n))

                results = cur.fetchall()
                return [(row['chunk_text'], row['similarity'], row['id']) for row in results]
        except Exception:
            self._rollback_after_error()
            raise

    def close(self):
        """Close the underlying connection if one is held."""
        if self.conn is not None:
            self.conn.close()


def load_or_generate(db_connection, embedding_model_alias, preserve_existing=True):
    """Open the embeddings table registered for ``embedding_model_alias``.

    Nothing is generated here: when the alias is missing from
    ``embedding_registry`` the function only prints which notebook to run.

    Args:
        db_connection: Open connection used to read ``embedding_registry``.
        embedding_model_alias: Registry alias; dots are replaced by
            underscores to build the ``embeddings_<alias>`` table name.
        preserve_existing: When false, a registered table is reported but
            not opened.

    Returns:
        A connected ``PostgreSQLVectorDB`` on success. ``None`` when the
        registry cannot be read, the alias is not registered,
        ``preserve_existing`` is false, or the table cannot be opened/counted.
    """

    print(f"\n{'='*70}")
    print(f"Checking for embeddings: '{embedding_model_alias}'...")
    print(f"{'='*70}\n")

    try:
        with db_connection.cursor() as cur:
            cur.execute('''
                SELECT id, dimension, embedding_count, created_at, metadata_json
                FROM embedding_registry
                WHERE model_alias = %s
            ''', (embedding_model_alias,))
            registry_entry = cur.fetchone()
    except Exception as e:
        print(f"Could not query registry: {e}")
        print("Make sure foundation/00-setup-postgres-schema.ipynb has been run.")
        # A failed statement leaves the transaction aborted; roll back so
        # later queries on this shared connection are not rejected.
        try:
            db_connection.rollback()
        except Exception as rollback_error:
            print(f"Rollback after registry failure also failed: {rollback_error}")
        return None

    if not registry_entry:
        print(f"✗ NO EMBEDDINGS FOUND")
        print(f"  Model: {embedding_model_alias}")
        print(f"\nTo create embeddings, run:")
        print(f"  foundation/02-rag-postgresql-persistent.ipynb\n")
        return None

    _, dimension, embedding_count, created_at, _ = registry_entry

    print(f"✓ FOUND EXISTING EMBEDDINGS")
    print(f"  Model:      {embedding_model_alias}")
    print(f"  Count:      {embedding_count:,} embeddings")
    print(f"  Dimension:  {dimension}")
    print(f"  Created:    {created_at}\n")

    if not preserve_existing:
        # Previously this path fell off the end and returned None silently.
        print("preserve_existing=False: existing embeddings were not loaded "
              "(this function does not regenerate them).\n")
        return None

    print("Loading existing embeddings...\n")

    db_instance = None
    try:
        table_name = f'embeddings_{embedding_model_alias.replace(".", "_")}'

        db_instance = PostgreSQLVectorDB(
            config=POSTGRES_CONFIG,
            table_name=table_name,
            preserve_existing=True
        )

        count = db_instance.get_chunk_count()
        print(f"✓ LOADED SUCCESSFULLY")
        print(f"  Embeddings: {count:,}")
        print(f"  Table: {table_name}")
        print(f"  Status: Ready for retrieval\n")

        return db_instance

    except Exception as e:
        print(f"\n✗ Error loading embeddings: {e}")
        # Do not leak the helper's connection when the table is unusable.
        if db_instance is not None:
            db_instance.close()
        return None


# Step 1: show what the registry currently holds
print("Step 1: Discovering available embeddings...\n")
available = list_available_embeddings(db_connection)

if not available.empty:
    print("Available embeddings:")
    print(available.to_string(index=False))
    print()
else:
    print("⚠️  No embeddings found in registry yet.")
    print("Run foundation/02-rag-postgresql-persistent.ipynb first.\n")

# Step 2: open the embeddings table for the configured model alias
print("\nStep 2: Loading embeddings using load-or-generate pattern...\n")
embeddings_db = load_or_generate(
    db_connection,
    EMBEDDING_MODEL_ALIAS,
    preserve_existing=True,
)

if not embeddings_db:
    print("⚠️  Could not load embeddings. See instructions above.")
    embeddings_db = None
else:
    print("✓ Success! Embeddings loaded and ready for retrieval.")

# Fetch the ground-truth questions rated 'good', ordered by id
print("\nLoading ground truth test questions...")
ground_truth_questions = []

with db_connection.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
    cur.execute('''
        SELECT
            id,
            question,
            relevant_chunk_ids,
            quality_rating,
            source_type
        FROM evaluation_groundtruth
        WHERE quality_rating = 'good'
        ORDER BY id
    ''')

    # One plain dict per row; a missing relevant_chunk_ids becomes [].
    ground_truth_questions.extend(
        {
            'id': record['id'],
            'question': record['question'],
            'relevant_chunk_ids': record['relevant_chunk_ids'] or [],
            'quality_rating': record['quality_rating'],
            'source_type': record['source_type'],
        }
        for record in cur.fetchall()
    )

print(f"✓ Loaded {len(ground_truth_questions)} ground truth questions\n")

if ground_truth_questions:
    first_question = ground_truth_questions[0]['question']
    print(f"Sample question: {first_question[:80]}...")

## Unified RAG Pipeline

In [None]:
# ============================================================================
# PART 2: COMBINED PIPELINE WITH FEATURE FLAGS
# ============================================================================

def _strip_list_marker(line: str) -> str:
    """Drop one leading list marker ("1.", "2)", "-", "*", "•") from ``line``."""
    text = line.strip()
    head, sep, rest = text.partition(' ')
    is_bullet = head in ('-', '*', '•')
    # "3." or "3)" counts as a marker; "3.5" does not.
    is_number = len(head) > 1 and head[:-1].isdigit() and head[-1] in '.)'
    if sep and (is_bullet or is_number):
        return rest.strip()
    return text


def expand_query_with_llm(query: str, num_expansions: int = 4, llm_model: str = 'llama3.2:1b') -> List[str]:
    """Generate semantically similar query reformulations using an LLM.

    The model's reply is parsed line by line. A leading list marker is
    stripped first, so "1. Q: ..." and "- ..." lines are kept as clean
    variants. Previously lines numbered 1-3 or starting with "-" were
    dropped and other numbered lines kept their marker. After that, a
    "Q:" prefix is removed; lines without it are accepted only if they
    contain a question mark.

    Args:
        query: Original question; always the first element of the result.
        num_expansions: Number of variants requested from the model and the
            maximum number returned in addition to ``query``.
        llm_model: Model name passed to ``ollama.chat``.

    Returns:
        ``[query, variant, ...]`` with at most ``num_expansions + 1`` unique
        entries. On any failure the result is ``[query]``.
    """
    prompt = f"""Generate {num_expansions} different ways to ask this question. Each variant should:
- Mean the same thing as the original
- Use different wording and phrasing
- Be a complete question that can stand alone

Original question: {query}

Generate exactly {num_expansions} variants. Format each on a new line starting with "Q:".
Example format:
Q: How is something done?
Q: What is the process of something?

Now generate {num_expansions} variants for the original question:"""

    try:
        response = ollama.chat(
            model=llm_model,
            messages=[{'role': 'user', 'content': prompt}]
        )

        content = response['message']['content']
        variants = [query]  # Start with original

        for raw_line in content.split('\n'):
            candidate = _strip_list_marker(raw_line)
            if candidate.startswith('Q:'):
                candidate = candidate[2:].strip()
            elif '?' not in candidate:
                # Not in the requested format and not a question: skip
                continue

            # Very short fragments and repeats are not useful variants
            if len(candidate) > 10 and candidate not in variants:
                variants.append(candidate)

        return variants[:num_expansions + 1]

    except Exception as e:
        # Deliberately best-effort: retrieval continues with the original query.
        print(f"  LLM expansion failed: {e}")
        return [query]


def bm25_search_postgresql(query: str, db_connection, table_name: str, top_k: int = 10):
    """Keyword-based retrieval using PostgreSQL full-text search.

    Rows are matched with ``plainto_tsquery('english', query)`` and ranked
    with ``ts_rank`` (PostgreSQL's built-in ranking, not a BM25
    implementation, despite the function name).

    Args:
        query: Free-text query.
        db_connection: Open connection; rolled back if the search fails.
        table_name: Table with ``id`` and ``content`` columns. Interpolated
            into the SQL text, so it must come from trusted code.
        top_k: Maximum number of rows returned.

    Returns:
        List of ``(content, relevance, id)`` tuples, best first. An empty
        list when nothing matches or the search fails.
    """
    with db_connection.cursor() as cur:
        try:
            cur.execute(f'''
                SELECT content,
                       ts_rank(to_tsvector('english', content),
                              plainto_tsquery('english', %s)) as relevance,
                       id
                FROM {table_name}
                WHERE to_tsvector('english', content) @@ plainto_tsquery('english', %s)
                ORDER BY relevance DESC
                LIMIT %s
            ''', (query, query, top_k))

            return [(chunk, float(score), chunk_id) for chunk, score, chunk_id in cur.fetchall()]
        except Exception as e:
            # Full-text search may not be configured. Stay best-effort, but
            # report the failure and clear the aborted transaction;
            # otherwise every later statement on this connection is rejected.
            print(f"  Keyword search failed, continuing without sparse results: {e}")
            try:
                db_connection.rollback()
            except Exception as rollback_error:
                print(f"  Rollback after keyword search failure also failed: {rollback_error}")
            return []


def reciprocal_rank_fusion(dense_results: List[Tuple],
                          sparse_results: List[Tuple],
                          rrf_k: int = 60,
                          top_k: int = 5) -> List[Tuple]:
    """Fuse rankings from multiple sources using Reciprocal Rank Fusion.

    Each chunk scores ``1 / (rrf_k + rank)`` per ranking it appears in
    (rank is 1-based) and the contributions are summed.

    Args:
        dense_results: ``(chunk_text, score, chunk_id)`` tuples, best first.
        sparse_results: Same shape, from the keyword ranking.
        rrf_k: Smoothing constant added to every rank.
        top_k: Maximum number of fused results returned.

    Returns:
        ``(chunk_text, rrf_score, chunk_id)`` tuples, highest score first.
        Equal scores keep first-seen order (dense ranking, then sparse), so
        the output is reproducible. If an id repeats inside one ranking, its
        first (best) position is the one that counts.
    """
    fused_scores = {}
    chunk_texts = {}

    for ranking in (dense_results, sparse_results):
        ranks = {}
        for position, (chunk_text, _, chunk_id) in enumerate(ranking, start=1):
            # setdefault keeps the first occurrence: the best rank and the
            # first text seen for the chunk.
            ranks.setdefault(chunk_id, position)
            chunk_texts.setdefault(chunk_id, chunk_text)

        for chunk_id, position in ranks.items():
            fused_scores[chunk_id] = fused_scores.get(chunk_id, 0.0) + 1.0 / (rrf_k + position)

    # Dicts preserve insertion order and sorted() is stable, so ties stay in
    # first-seen order.
    ordered = sorted(fused_scores.items(), key=lambda item: item[1], reverse=True)

    return [(chunk_texts[chunk_id], score, chunk_id) for chunk_id, score in ordered[:top_k]]


# Loaded cross-encoder models keyed by model name, so evaluation loops do not
# reload the same model for every query.
_CROSS_ENCODER_CACHE = {}


def _get_cross_encoder(reranker_model: str):
    """Return the CrossEncoder for ``reranker_model``, loading it on first use."""
    model = _CROSS_ENCODER_CACHE.get(reranker_model)
    if model is None:
        # Imported lazily: the dependency is only needed when a model is
        # actually loaded.
        from sentence_transformers import CrossEncoder

        print(f"Loading cross-encoder model: {reranker_model}")
        model = CrossEncoder(reranker_model)
        _CROSS_ENCODER_CACHE[reranker_model] = model
    return model


def rerank_with_crossencoder(query: str,
                             candidates: List[Tuple],
                             reranker_model: str = 'cross-encoder/ms-marco-MiniLM-L-6-v2',
                             top_k: int = 5) -> List[Tuple]:
    """Two-stage retrieval: dense retrieval -> cross-encoder reranking.

    Args:
        query: Question the candidates are scored against.
        candidates: ``(chunk_text, score, chunk_id)`` tuples; the incoming
            score is ignored.
        reranker_model: Model name passed to ``CrossEncoder``. Each model is
            loaded once and reused across calls.
        top_k: Maximum number of results returned.

    Returns:
        ``(chunk_text, cross_encoder_score, chunk_id)`` tuples, highest score
        first; an empty list when ``candidates`` is empty.
    """
    if not candidates:
        return []

    model = _get_cross_encoder(reranker_model)

    pairs = [[query, chunk_text] for chunk_text, _, _ in candidates]

    print(f"Scoring {len(pairs)} candidates with cross-encoder...")
    scores = model.predict(pairs)

    reranked = sorted(
        (
            (chunk_text, float(score), chunk_id)
            for (chunk_text, _, chunk_id), score in zip(candidates, scores)
        ),
        key=lambda item: item[1],
        reverse=True,
    )

    return reranked[:top_k]


def _hybrid_candidates(q_variant, embeddings_db, embedding_model, db_connection,
                       table_name, limit, rrf_k):
    """Dense + keyword retrieval for one query variant, fused with RRF."""
    q_emb = ollama.embed(model=embedding_model, input=q_variant)['embeddings'][0]

    # NOTE(review): this is the same query as
    # PostgreSQLVectorDB.similarity_search, but it targets the table derived
    # from embedding_model; kept inline to preserve behaviour.
    with embeddings_db.conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
        cur.execute(f'''
            SELECT id, content as chunk_text,
                   1 - (embedding <=> %s::vector) as similarity
            FROM {table_name}
            ORDER BY embedding <=> %s::vector
            LIMIT %s
        ''', (q_emb, q_emb, limit))
        dense_results = [(row['chunk_text'], float(row['similarity']), row['id'])
                         for row in cur.fetchall()]

    sparse_results = bm25_search_postgresql(q_variant, db_connection, table_name,
                                            top_k=limit)

    return reciprocal_rank_fusion(dense_results, sparse_results,
                                  rrf_k=rrf_k, top_k=limit)


def _merge_best_scores(results):
    """Collapse duplicate chunk ids to their highest score, best first."""
    best = {}
    for chunk, score, chunk_id in results:
        if chunk_id not in best or score > best[chunk_id][1]:
            best[chunk_id] = (chunk, score)

    merged = [(chunk, score, chunk_id) for chunk_id, (chunk, score) in best.items()]
    merged.sort(key=lambda item: item[1], reverse=True)
    return merged


def _build_citations(results):
    """Number the final results "[1]", "[2]", ... for citation output."""
    return [
        {
            'citation_id': f"[{i+1}]",
            'chunk_text': chunk,
            'score': score,
            'chunk_id': chunk_id
        }
        for i, (chunk, score, chunk_id) in enumerate(results)
    ]


def combined_retrieval_pipeline(query: str,
                               embeddings_db: PostgreSQLVectorDB,
                               embedding_model: str,
                               db_connection,
                               config: Dict) -> Dict:
    """
    Run the enabled retrieval techniques in a fixed order.

    Order of steps:
    1. Query expansion (optional) - LLM-generated variants of ``query``
    2. Retrieval per variant - hybrid (vector + keyword, fused with RRF) or
       vector-only - followed by de-duplication that keeps each chunk's
       best score
    3. Reranking (optional) - cross-encoder over the top candidates, scored
       against the original ``query``
    4. Citation tracking (optional) - numbered citation records

    Args:
        query: User question.
        embeddings_db: Vector store handle used for dense retrieval.
        embedding_model: Model name passed to ``ollama.embed``; also used to
            derive the ``embeddings_<model>`` table for hybrid search.
        db_connection: Connection used for the keyword search.
        config: Feature flags (``enable_query_expansion``,
            ``enable_hybrid_search``, ``enable_reranking``,
            ``enable_citation_tracking``) and parameters (``num_expansions``
            default 4, ``top_k_initial`` default 20, ``top_k_final``
            default 5, ``rrf_k`` default 60).

    Returns:
        dict with: results (list of ``(chunk_text, score, chunk_id)``),
        citations (list or None), query_variants (list, or None when no
        expansion happened), techniques_applied (list of labels)
    """
    # Read each tuning parameter once instead of at every use.
    top_k_initial = config.get('top_k_initial', 20)
    top_k_final = config.get('top_k_final', 5)

    techniques_applied = []
    query_variants = [query]

    # Step 1: Query Expansion (optional)
    if config.get('enable_query_expansion', False):
        query_variants = expand_query_with_llm(query, num_expansions=config.get('num_expansions', 4))
        techniques_applied.append('query_expansion')
        print(f"  Generated {len(query_variants)} query variants")

    # Step 2: Hybrid Search or Vector-only
    table_name = f'embeddings_{embedding_model.replace(".", "_")}'
    results = []

    if config.get('enable_hybrid_search', False):
        rrf_k = config.get('rrf_k', 60)
        for q_variant in query_variants:
            results.extend(_hybrid_candidates(q_variant, embeddings_db, embedding_model,
                                              db_connection, table_name,
                                              top_k_initial, rrf_k))

        techniques_applied.append('hybrid_search')
        print(f"  Retrieved with hybrid search (vector + BM25 + RRF)")

    else:
        for q_variant in query_variants:
            q_emb = ollama.embed(model=embedding_model, input=q_variant)['embeddings'][0]
            results.extend(embeddings_db.similarity_search(q_emb, top_n=top_k_initial))

        techniques_applied.append('vector_search')
        print(f"  Retrieved with vector-only search")

    # Variants can return the same chunk; keep one entry per chunk id.
    results = _merge_best_scores(results)

    # Step 3: Reranking (optional)
    if config.get('enable_reranking', False):
        results = rerank_with_crossencoder(query, results[:top_k_initial],
                                          top_k=top_k_final)
        techniques_applied.append('reranking')
        print(f"  Applied cross-encoder reranking")
    else:
        results = results[:top_k_final]

    # Step 4: Citation Tracking (optional)
    citations = None
    if config.get('enable_citation_tracking', False):
        citations = _build_citations(results)
        techniques_applied.append('citation_tracking')
        print(f"  Prepared citation tracking")

    return {
        'results': results,
        'citations': citations,
        'query_variants': query_variants if len(query_variants) > 1 else None,
        'techniques_applied': techniques_applied
    }


# Smoke test: run the pipeline once on the first ground-truth question
print("\nTesting combined pipeline with current configuration...")
if embeddings_db and ground_truth_questions:
    test_query = ground_truth_questions[0]['question']
    print(f"\nQuery: {test_query}\n")

    smoke_test_config = {
        'enable_query_expansion': ENABLE_QUERY_EXPANSION,
        'enable_hybrid_search': ENABLE_HYBRID_SEARCH,
        'enable_reranking': ENABLE_RERANKING,
        'enable_citation_tracking': ENABLE_CITATION_TRACKING,
        'num_expansions': 4,
        'top_k_initial': 20,
        'top_k_final': 5,
        'rrf_k': 60
    }
    pipeline_result = combined_retrieval_pipeline(
        test_query,
        embeddings_db,
        EMBEDDING_MODEL_ALIAS,
        db_connection,
        smoke_test_config
    )

    applied_labels = ', '.join(pipeline_result['techniques_applied'])
    print(f"\nTechniques applied: {applied_labels}")

    top_results = pipeline_result['results']
    print(f"\nTop {len(top_results)} results:")
    for position, (chunk, score, chunk_id) in enumerate(top_results, 1):
        # Single-line, 100-character preview of the chunk
        preview = chunk[:100].replace('\n', ' ') + '...'
        print(f"  [{position}] (score: {score:.4f}) {preview}")

## Evaluate System Performance

In [None]:
# Configuration parameters for combined advanced RAG
NUM_EXPANSIONS = 3  # Number of query variations to generate
TOP_K_INITIAL = 10  # Initial retrieval count before reranking
TOP_K_FINAL = 5  # Final results after reranking
# Reciprocal Rank Fusion constant. The evaluation configs read RRF_K but it
# was never defined; 60 matches the reciprocal_rank_fusion() default.
RRF_K = 60
ENABLE_RERANKING = True
ENABLE_CITATION_TRACKING = True


# ============================================================================
# PART 3: EVALUATE SYSTEM PERFORMANCE
# ============================================================================

# Metric computation functions

def precision_at_k(retrieved_chunk_ids: List[int], relevant_chunk_ids: List[int], k: int = 5) -> float:
    """Precision@K: What % of top-K results are relevant?

    The denominator is always ``k``, even when fewer than ``k`` chunks were
    retrieved. Returns 0.0 when ``k`` is 0.
    """
    if k == 0:
        return 0.0

    retrieved_k = retrieved_chunk_ids[:k]
    relevant_set = set(relevant_chunk_ids)

    num_relevant_in_k = sum(1 for chunk_id in retrieved_k if chunk_id in relevant_set)
    return num_relevant_in_k / k


def recall_at_k(retrieved_chunk_ids: List[int], relevant_chunk_ids: List[int], k: int = 5) -> float:
    """Recall@K: What % of all relevant chunks were found in top-K?

    Returns 0.0 when there are no relevant chunks.
    """
    if len(relevant_chunk_ids) == 0:
        return 0.0

    retrieved_k = retrieved_chunk_ids[:k]
    relevant_set = set(relevant_chunk_ids)

    num_relevant_found = sum(1 for chunk_id in retrieved_k if chunk_id in relevant_set)
    return num_relevant_found / len(relevant_set)


def mean_reciprocal_rank(retrieved_chunk_ids: List[int], relevant_chunk_ids: List[int]) -> float:
    """MRR: How quickly do we find the first relevant result?

    Returns ``1 / rank`` of the first relevant chunk (1-based), or 0.0 when
    none of the retrieved chunks is relevant.
    """
    relevant_set = set(relevant_chunk_ids)

    for rank, chunk_id in enumerate(retrieved_chunk_ids, start=1):
        if chunk_id in relevant_set:
            return 1.0 / rank

    return 0.0


def ndcg_at_k(retrieved_chunk_ids: List[int], relevant_chunk_ids: List[int], k: int = 5) -> float:
    """NDCG@K: Normalized Discounted Cumulative Gain (ranking quality)

    Relevance is binary. The ideal DCG is that of a ranking that places
    ``min(k, number of relevant chunks)`` relevant chunks first. The previous
    implementation sorted only the retrieved relevances, which scored 1.0
    whenever the hits that were found sat at the top, no matter how many
    relevant chunks were missed; values are therefore lower than before when
    relevant chunks are missing.
    """

    def dcg_score(relevance_scores: List[float]) -> float:
        return sum(
            (2**rel - 1) / math.log2(rank + 2)
            for rank, rel in enumerate(relevance_scores)
        )

    if k == 0 or len(relevant_chunk_ids) == 0:
        return 0.0

    retrieved_k = retrieved_chunk_ids[:k]
    relevant_set = set(relevant_chunk_ids)

    relevance = [1 if chunk_id in relevant_set else 0 for chunk_id in retrieved_k]

    dcg = dcg_score(relevance)
    ideal_hits = min(len(relevant_set), k)
    idcg = dcg_score([1] * ideal_hits)

    if idcg == 0:
        return 0.0

    return dcg / idcg


def evaluate_with_config(test_questions: List[Dict],
                        embeddings_db: PostgreSQLVectorDB,
                        embedding_model: str,
                        db_connection,
                        config: Dict) -> Dict:
    """Evaluate pipeline with given configuration.

    Questions without ``relevant_chunk_ids`` are skipped. For the others the
    pipeline is run and precision@5, recall@5, MRR and NDCG@5 are computed
    from the returned chunk ids.

    Returns:
        Mean of each metric over the evaluated questions plus
        ``num_queries`` (0 with all-zero metrics when nothing was evaluated).
    """
    results_list = []

    print(f"\nEvaluating configuration on {len(test_questions)} test questions...")
    print("-" * 70)

    # Report progress roughly every 10% of the questions.
    progress_step = max(1, len(test_questions) // 10)

    for i, q in enumerate(test_questions, 1):
        query = q['question']
        relevant_ids = q['relevant_chunk_ids']

        if not relevant_ids:
            continue

        # Run pipeline
        pipeline_result = combined_retrieval_pipeline(query, embeddings_db, embedding_model,
                                                     db_connection, config)
        retrieved_ids = [chunk_id for _, _, chunk_id in pipeline_result['results']]

        # Compute metrics
        metrics = {
            'precision@5': precision_at_k(retrieved_ids, relevant_ids, k=5),
            'recall@5': recall_at_k(retrieved_ids, relevant_ids, k=5),
            'mrr': mean_reciprocal_rank(retrieved_ids, relevant_ids),
            'ndcg@5': ndcg_at_k(retrieved_ids, relevant_ids, k=5)
        }

        results_list.append(metrics)

        if i % progress_step == 0:
            print(f"  Progress: {i}/{len(test_questions)} queries evaluated")

    print("-" * 70)

    # Aggregate metrics
    if not results_list:
        return {'precision@5': 0, 'recall@5': 0, 'mrr': 0, 'ndcg@5': 0, 'num_queries': 0}

    return {
        'precision@5': float(np.mean([r['precision@5'] for r in results_list])),
        'recall@5': float(np.mean([r['recall@5'] for r in results_list])),
        'mrr': float(np.mean([r['mrr'] for r in results_list])),
        'ndcg@5': float(np.mean([r['ndcg@5'] for r in results_list])),
        'num_queries': len(results_list)
    }


print("\n" + "=" * 70)
print("EVALUATING CURRENT CONFIGURATION")
print("=" * 70)

config = {
    'enable_query_expansion': ENABLE_QUERY_EXPANSION,
    'enable_hybrid_search': ENABLE_HYBRID_SEARCH,
    'enable_reranking': ENABLE_RERANKING,
    'enable_citation_tracking': ENABLE_CITATION_TRACKING,
    'num_expansions': NUM_EXPANSIONS,
    'top_k_initial': TOP_K_INITIAL,
    'top_k_final': TOP_K_FINAL,
    'rrf_k': RRF_K
}

if embeddings_db and ground_truth_questions:
    eval_metrics = evaluate_with_config(
        ground_truth_questions,
        embeddings_db,
        EMBEDDING_MODEL_ALIAS,
        db_connection,
        config
    )

    print("\n" + "=" * 70)
    print("CURRENT CONFIGURATION RESULTS")
    print("=" * 70)

    print(f"\nEnabled Techniques: {', '.join(ENABLED_TECHNIQUES)}")
    print(f"Queries Evaluated: {eval_metrics.get('num_queries', 0)}\n")

    print(f"{'Metric':<20} {'Score':<15}")
    print("-" * 35)

    for metric in ['precision@5', 'recall@5', 'mrr', 'ndcg@5']:
        value = eval_metrics.get(metric, 0)
        print(f"{metric:<20} {value:<15.4f}")

else:
    print("⚠️  Cannot evaluate: embeddings or test questions not available")
    eval_metrics = {}

## Compare All Technique Combinations

In [None]:
# ============================================================================
# PART 4: COMPARE ALL TECHNIQUE COMBINATIONS
# ============================================================================

def evaluate_all_combinations(test_questions: List[Dict],
                             embeddings_db: PostgreSQLVectorDB,
                             embedding_model: str,
                             db_connection) -> Dict:
    """
    Evaluate a fixed set of seven technique combinations.

    Combinations, in evaluation order:
    1. baseline (no techniques)
    2. expansion_only
    3. hybrid_only
    4. reranking_only
    5. expansion+reranking
    6. hybrid+reranking
    7. all_combined (also the only one with citation tracking on)

    The tuning parameters come from the module-level NUM_EXPANSIONS,
    TOP_K_INITIAL, TOP_K_FINAL and RRF_K, which must be defined before the
    call.

    Returns:
        dict mapping each combination name to the metrics dict returned by
        evaluate_with_config
    """
    # (name, query expansion, hybrid search, reranking, citation tracking)
    flag_table = [
        ('baseline',            False, False, False, False),
        ('expansion_only',      True,  False, False, False),
        ('hybrid_only',         False, True,  False, False),
        ('reranking_only',      False, False, True,  False),
        ('expansion+reranking', True,  False, True,  False),
        ('hybrid+reranking',    False, True,  True,  False),
        ('all_combined',        True,  True,  True,  True),
    ]

    results = {}

    print("\n" + "=" * 70)
    print("EVALUATING ALL TECHNIQUE COMBINATIONS")
    print("=" * 70)

    for combo_name, use_expansion, use_hybrid, use_reranking, use_citations in flag_table:
        print(f"\nTesting: {combo_name}")

        # Per-combination flags plus the shared tuning parameters
        config = {
            'enable_query_expansion': use_expansion,
            'enable_hybrid_search': use_hybrid,
            'enable_reranking': use_reranking,
            'enable_citation_tracking': use_citations,
            'num_expansions': NUM_EXPANSIONS,
            'top_k_initial': TOP_K_INITIAL,
            'top_k_final': TOP_K_FINAL,
            'rrf_k': RRF_K
        }

        results[combo_name] = evaluate_with_config(
            test_questions, embeddings_db, embedding_model, db_connection, config
        )

    return results


def _percent_change(variant_metrics, reference_metrics, metric):
    """Relative change of ``metric`` against the reference, in percent.

    The denominator is floored at 0.001 so a zero baseline does not divide
    by zero (the percentage is then very large rather than undefined).
    """
    reference = reference_metrics.get(metric, 1)
    return (variant_metrics.get(metric, 0) - reference) / max(reference, 0.001) * 100


# Run all combinations
print("\n" + "=" * 70)
print("SYSTEMATIC COMBINATION EVALUATION")
print("=" * 70)

if embeddings_db and ground_truth_questions:
    all_combinations_results = evaluate_all_combinations(
        ground_truth_questions,
        embeddings_db,
        EMBEDDING_MODEL_ALIAS,
        db_connection
    )

    # Create comparison DataFrame
    print("\n" + "=" * 70)
    print("COMBINATION COMPARISON RESULTS")
    print("=" * 70)

    comparison_data = [
        {
            'combination': combo_name,
            'precision@5': metrics.get('precision@5', 0),
            'recall@5': metrics.get('recall@5', 0),
            'mrr': metrics.get('mrr', 0),
            'ndcg@5': metrics.get('ndcg@5', 0)
        }
        for combo_name, metrics in all_combinations_results.items()
    ]

    comparison_df = pd.DataFrame(comparison_data)

    # Display results
    print("\n" + comparison_df.to_string(index=False))

    # Identify best combination. idxmax() returns an index label, so the row
    # is looked up with .loc rather than positional .iloc.
    best_ndcg_idx = comparison_df['ndcg@5'].idxmax()
    best_combo = comparison_df.loc[best_ndcg_idx]

    print("\n" + "=" * 70)
    print("BEST CONFIGURATION IDENTIFIED")
    print("=" * 70)

    print(f"\nBest Combination: {best_combo['combination']}")
    print(f"  Precision@5: {best_combo['precision@5']:.4f}")
    print(f"  Recall@5:    {best_combo['recall@5']:.4f}")
    print(f"  MRR:         {best_combo['mrr']:.4f}")
    print(f"  NDCG@5:      {best_combo['ndcg@5']:.4f}")

    # Show top 3 combinations
    print(f"\nTop 3 Best Combinations (by NDCG@5):")
    top3 = comparison_df.nlargest(3, 'ndcg@5')

    # Number by rank (1, 2, 3). iterrows() yields the original row labels,
    # which previously produced numbers such as "7." for the best entry.
    for rank, (_, row) in enumerate(top3.iterrows(), start=1):
        print(f"\n  {rank}. {row['combination']}")
        print(f"     P@5: {row['precision@5']:.4f}, R@5: {row['recall@5']:.4f}, NDCG@5: {row['ndcg@5']:.4f}")

    # Measure cumulative improvements
    print("\n" + "=" * 70)
    print("CUMULATIVE IMPROVEMENTS")
    print("=" * 70)

    baseline_metrics = all_combinations_results['baseline']

    improvements = []
    for combo_name in ['expansion_only', 'hybrid_only', 'reranking_only', 'all_combined']:
        if combo_name in all_combinations_results:
            variant = all_combinations_results[combo_name]

            improvements.append({
                'technique': combo_name,
                'precision_improvement_%': _percent_change(variant, baseline_metrics, 'precision@5'),
                'recall_improvement_%': _percent_change(variant, baseline_metrics, 'recall@5'),
                'ndcg_improvement_%': _percent_change(variant, baseline_metrics, 'ndcg@5')
            })

    improvements_df = pd.DataFrame(improvements)

    print("\n" + improvements_df.to_string(index=False))

    # Visualization
    print("\n" + "=" * 70)
    print("CREATING COMPARISON VISUALIZATIONS")
    print("=" * 70)

    import matplotlib.pyplot as plt

    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    fig.suptitle('Technique Combination Comparison', fontsize=14, fontweight='bold')

    metrics_to_plot = ['precision@5', 'recall@5', 'mrr', 'ndcg@5']

    # Baseline and the all-techniques run get their own colours; the bar
    # colours do not depend on the metric, so they are computed once.
    colors = ['#FF6B6B' if 'baseline' in x else '#4ECDC4' if 'all_combined' in x else '#95E1D3'
              for x in comparison_df['combination']]

    for idx, metric in enumerate(metrics_to_plot):
        ax = axes[idx // 2, idx % 2]

        ax.barh(comparison_df['combination'], comparison_df[metric], color=colors, alpha=0.7)
        ax.set_xlabel(metric, fontweight='bold')
        ax.set_title(f'{metric} by Combination')
        ax.grid(axis='x', alpha=0.3)

    plt.tight_layout()
    plt.show()

    print("\nVisualization displayed above")

else:
    print("⚠️  Cannot evaluate combinations: embeddings or test questions not available")
    all_combinations_results = {}

## Track Final Experiment

In [None]:
# ============================================================================
# PART 5: EXPERIMENT TRACKING
# ============================================================================

def compute_config_hash(config_dict: Dict) -> str:
    """Return a short, reproducible fingerprint of a configuration dict.

    The dict is serialized to JSON with its keys sorted, so two dicts with
    the same contents produce the same fingerprint regardless of insertion
    order. The result is the first 12 hex characters of the SHA-256 digest
    of that JSON text.
    """
    canonical_json = json.dumps(config_dict, sort_keys=True).encode()
    return hashlib.sha256(canonical_json).hexdigest()[:12]


def start_experiment(db_connection, experiment_name: str,
                     notebook_path: Optional[str] = None,
                     embedding_model_alias: Optional[str] = None,
                     config: Optional[Dict] = None,
                     techniques: Optional[List[str]] = None,
                     notes: Optional[str] = None) -> int:
    """Insert a new 'running' row into ``experiments`` and return its ID.

    Args:
        db_connection: Open database connection; must provide ``cursor()``,
            ``commit()`` and ``rollback()``.
        experiment_name: Name stored in the ``experiment_name`` column.
        notebook_path: Optional path of the notebook running the experiment.
        embedding_model_alias: Optional alias of the embedding model used.
        config: Configuration dict; stored as JSON and fingerprinted with
            ``compute_config_hash``. Defaults to an empty dict.
        techniques: Technique names stored in ``techniques_applied``.
            Defaults to an empty list.
        notes: Optional free-form notes.

    Returns:
        The ``id`` returned by the INSERT.

    Raises:
        Exception: Any error raised while inserting or committing is
            re-raised after the transaction has been rolled back.
    """
    config = {} if config is None else config
    techniques = [] if techniques is None else techniques

    config_hash = compute_config_hash(config)

    insert_sql = '''
            INSERT INTO experiments (
                experiment_name, notebook_path, embedding_model_alias,
                config_hash, config_json, techniques_applied, notes, status
            )
            VALUES (%s, %s, %s, %s, %s, %s, %s, 'running')
            RETURNING id
        '''
    insert_params = (
        experiment_name,
        notebook_path,
        embedding_model_alias,
        config_hash,
        json.dumps(config),
        techniques,
        notes,
    )

    try:
        with db_connection.cursor() as cur:
            cur.execute(insert_sql, insert_params)
            exp_id = cur.fetchone()[0]
        db_connection.commit()
    except Exception:
        # Roll back so the shared connection is not left inside a failed
        # transaction; the caller still sees the original error.
        db_connection.rollback()
        raise

    print(f"✓ Started experiment #{exp_id}: {experiment_name}")
    return exp_id


def save_metrics(db_connection, experiment_id: int, metrics_dict: Dict,
                 export_to_file: bool = True,
                 export_dir: str = 'data/experiment_results') -> Tuple[bool, str]:
    """Save experiment metrics to the database and optionally to a JSON file.

    Each entry of ``metrics_dict`` becomes one row in ``evaluation_results``.
    A value is either a plain number, or a dict with a ``'value'`` key
    (default 0.0) and a ``'details'`` key (default ``{}``). Values are
    converted with ``float()``, so a non-numeric value fails the whole call.

    The JSON export is written *before* the transaction is committed. If the
    export fails, the inserts are rolled back, so a ``False`` result always
    means that nothing was committed by this call.

    Args:
        db_connection: Open database connection; must provide ``cursor()``,
            ``commit()`` and ``rollback()``.
        experiment_id: ID written to every inserted row and to the export.
        metrics_dict: Mapping of metric name to number or value/details dict.
        export_to_file: When true, also write the metrics to a JSON file.
        export_dir: Directory for the JSON export; created if missing.

    Returns:
        ``(success, message)``. The message is also printed.
    """
    insert_sql = '''
                    INSERT INTO evaluation_results (
                        experiment_id, metric_name, metric_value, metric_details_json
                    )
                    VALUES (%s, %s, %s, %s)
                '''
    file_path = None
    try:
        with db_connection.cursor() as cur:
            for metric_name, metric_data in metrics_dict.items():
                if isinstance(metric_data, dict):
                    metric_value = metric_data.get('value', 0.0)
                    metric_details = metric_data.get('details', {})
                else:
                    metric_value = metric_data
                    metric_details = {}

                cur.execute(insert_sql, (
                    experiment_id,
                    metric_name,
                    float(metric_value),
                    json.dumps(metric_details) if metric_details else '{}'
                ))

        if export_to_file:
            os.makedirs(export_dir, exist_ok=True)
            timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
            # Serialize first so an unserializable metric cannot leave a
            # half-written file behind.
            payload = json.dumps({
                'experiment_id': experiment_id,
                'timestamp': timestamp,
                'metrics': metrics_dict
            }, indent=2)
            file_path = os.path.join(export_dir, f'experiment_{experiment_id}_{timestamp}.json')
            with open(file_path, 'w') as f:
                f.write(payload)

        # Commit last: once this succeeds, both the rows and the export exist.
        db_connection.commit()

        msg = f"✓ Saved {len(metrics_dict)} metrics for experiment #{experiment_id}"
        if file_path:
            msg += f" to {file_path}"
        print(msg)
        return True, msg
    except Exception as e:
        msg = f"✗ Failed to save metrics: {e}"
        print(msg)
        if file_path is not None:
            # Best-effort cleanup of an export whose rows were never committed.
            try:
                os.remove(file_path)
            except OSError:
                pass
        db_connection.rollback()
        return False, msg


def complete_experiment(db_connection, experiment_id: int,
                       status: str = 'completed',
                       notes: Optional[str] = None) -> bool:
    """Set the final status and completion time of an experiment.

    Args:
        db_connection: Open database connection; must provide ``cursor()``,
            ``commit()`` and ``rollback()``.
        experiment_id: ``id`` of the ``experiments`` row to update.
        status: Value written to the ``status`` column.
        notes: When non-empty, replaces the row's ``notes``; otherwise the
            existing notes are left untouched.

    Returns:
        True when the row was updated and committed. False when the update
        raised, or when no row with ``experiment_id`` exists; in both cases
        the transaction is rolled back and a message is printed.
    """
    try:
        with db_connection.cursor() as cur:
            if notes:
                cur.execute('''
                    UPDATE experiments
                    SET status = %s, notes = %s, completed_at = CURRENT_TIMESTAMP
                    WHERE id = %s
                ''', (status, notes, experiment_id))
            else:
                cur.execute('''
                    UPDATE experiments
                    SET status = %s, completed_at = CURRENT_TIMESTAMP
                    WHERE id = %s
                ''', (status, experiment_id))
            updated_rows = cur.rowcount

        if updated_rows == 0:
            # The UPDATE ran but matched nothing: do not report success for
            # an experiment id that does not exist.
            db_connection.rollback()
            print(f"✗ Failed to complete experiment: no experiment with id {experiment_id}")
            return False

        db_connection.commit()
        print(f"✓ Experiment #{experiment_id} marked as {status}")
        return True
    except Exception as e:
        print(f"✗ Failed to complete experiment: {e}")
        db_connection.rollback()
        return False


# ============================================================================
# TRACK FINAL EXPERIMENT
# ============================================================================
# Registers this notebook run in the experiments table, stores the metrics of
# the current configuration and of every evaluated combination, marks the
# experiment completed/failed, prints a summary, and closes the connections.

print("\n" + "=" * 70)
print("TRACKING FINAL EXPERIMENT")
print("=" * 70)

if eval_metrics or all_combinations_results:

    # This branch is also entered when only the combination results are
    # available, so never call .get() on a missing/empty eval_metrics.
    num_test_queries = eval_metrics.get('num_queries', 0) if eval_metrics else 0
    num_ground_truth = len(ground_truth_questions)
    techniques_label = ', '.join(ENABLED_TECHNIQUES) if ENABLED_TECHNIQUES else 'baseline'

    # Prepare configuration. Every feature flag is included so that runs which
    # differ in any flag get a different config hash.
    config_dict = {
        'embedding_model_alias': EMBEDDING_MODEL_ALIAS,
        'enable_semantic_chunking': ENABLE_SEMANTIC_CHUNKING,
        'enable_query_expansion': ENABLE_QUERY_EXPANSION,
        'enable_hybrid_search': ENABLE_HYBRID_SEARCH,
        'enable_reranking': ENABLE_RERANKING,
        'enable_citation_tracking': ENABLE_CITATION_TRACKING,
        'num_expansions': NUM_EXPANSIONS,
        'top_k_initial': TOP_K_INITIAL,
        'top_k_final': TOP_K_FINAL,
        'rrf_k': RRF_K,
        'num_test_queries': num_test_queries,
    }

    config_hash = compute_config_hash(config_dict)

    print("\nExperiment Configuration:")
    print(f"  Name: {EXPERIMENT_NAME}")
    print(f"  Enabled Techniques: {techniques_label}")
    print(f"  Embedding Model: {EMBEDDING_MODEL_ALIAS}")
    print(f"  Config Hash: {config_hash}")
    print(f"  Test Queries: {num_test_queries}\n")

    # Start experiment tracking
    experiment_id = start_experiment(
        db_connection,
        experiment_name=EXPERIMENT_NAME,
        notebook_path='advanced-techniques/10-combined-advanced-rag.ipynb',
        embedding_model_alias=EMBEDDING_MODEL_ALIAS,
        config=config_dict,
        techniques=ENABLED_TECHNIQUES,
        notes=f'Combined advanced RAG evaluation with feature flags: {techniques_label}'
    )

    # Prepare metrics for storage
    metrics_to_store = {}

    # Store current configuration metrics
    if eval_metrics:
        for metric_name, metric_value in eval_metrics.items():
            if metric_name != 'num_queries':
                metrics_to_store[f'config_{metric_name}'] = float(metric_value)

    # Store all combination results
    if all_combinations_results:
        for combo_name, combo_metrics in all_combinations_results.items():
            for metric_name, metric_value in combo_metrics.items():
                if metric_name != 'num_queries':
                    metrics_to_store[f'combo_{combo_name}_{metric_name}'] = float(metric_value)

    # Metadata. The config hash is deliberately NOT added here: save_metrics
    # converts every value with float(), so a hex-string hash would make the
    # whole save fail. start_experiment already stores the hash of this same
    # config in the experiments row.
    metrics_to_store['num_queries_evaluated'] = num_ground_truth

    # Save metrics
    print("\nSaving metrics to database...\n")
    success, message = save_metrics(db_connection, experiment_id, metrics_to_store, export_to_file=True)

    # Complete experiment
    if success:
        notes = f"Successfully evaluated combined advanced RAG on {num_ground_truth} queries. "

        if eval_metrics:
            notes += f"Precision@5: {eval_metrics.get('precision@5', 0):.4f}, "
            notes += f"NDCG@5: {eval_metrics.get('ndcg@5', 0):.4f}"

        complete_experiment(db_connection, experiment_id, status='completed', notes=notes)

        # Display results summary
        print("\n" + "=" * 70)
        print("FINAL EXPERIMENT RESULTS")
        print("=" * 70)

        print(f"\nExperiment ID: {experiment_id}")
        print(f"Experiment Name: {EXPERIMENT_NAME}")
        print("Status: Completed")
        print(f"Config Hash: {config_hash}")

        print("\nTechniques Applied:")
        for technique in ENABLED_TECHNIQUES:
            print(f"  - {technique}")

        if eval_metrics:
            print("\nMetrics (Current Configuration):")
            for metric in ['precision@5', 'recall@5', 'mrr', 'ndcg@5']:
                value = eval_metrics.get(metric, 0)
                print(f"  {metric:<20} {value:.4f}")

        if all_combinations_results:
            # Rank the evaluated combinations by NDCG@5 and report the winner.
            print("\nBest Combination Evaluated: ", end="")
            best_combo = max(all_combinations_results.items(),
                             key=lambda x: x[1].get('ndcg@5', 0))
            print(f"{best_combo[0]}")
            print(f"  NDCG@5: {best_combo[1].get('ndcg@5', 0):.4f}")

        print("\nResults exported to:")
        print(f"  Database: evaluation_results table (experiment_id={experiment_id})")
        print(f"  JSON: data/experiment_results/experiment_{experiment_id}_*.json")

        print("\n" + "=" * 70)
        print("RECOMMENDATIONS")
        print("=" * 70)

        print("\n1. Review the combination comparison results above")
        print("2. Select the best performing configuration for your use case")
        print("3. Consider the trade-off between:")
        print("   - Retrieval Quality (NDCG, Precision)")
        print("   - Computational Cost (query expansion, reranking)")
        print("   - Latency (important for real-time applications)")
        print("4. Deploy the recommended configuration")
        print("5. Monitor performance on production queries")

        print("\nNext Steps:")
        print("  - Review evaluation-lab/03-compare-experiments.ipynb to compare with previous techniques")
        print("  - Use evaluation-lab/04-plot-improvements.ipynb for detailed visualization")
        print("  - Run EVALUATION_GUIDE.md for production deployment checklist")

    else:
        print("\n✗ Failed to track experiment")
        complete_experiment(db_connection, experiment_id, status='failed', notes='Failed to save metrics')

else:
    print("⚠️  Cannot track experiment: evaluation results not available")

# Close database connection
print("\n\nClosing database connections...")
if embeddings_db:
    embeddings_db.close()
db_connection.close()
print("✓ All connections closed")

## Next Steps

Based on results, you can:

1. **Deploy your best configuration** - Use the combination with highest quality that meets your latency budget
2. **Further optimize** - Fine-tune parameters of winning techniques
3. **Ablation study** - Understand which techniques matter most for your domain
4. **Production evaluation** - Test on real user queries and monitor performance

See [EVALUATION_GUIDE.md](../EVALUATION_GUIDE.md) for the deployment checklist.