## Prerequisites

1. ✅ foundation/00-setup-postgres-schema.ipynb
2. ✅ evaluation-lab/01-create-ground-truth-human-in-loop.ipynb
3. ✅ evaluation-lab/02-evaluation-metrics-framework.ipynb

## Configuration

In [None]:
# Retrieval configurations evaluated by this notebook. The comparison cells
# treat the first entry as the baseline and every later entry as a variant.
# NOTE(review): the variants differ only in top_k, while the comparison and
# significance cells focus on @5 metrics; if retrieval order is deterministic
# those @5 values will presumably match the baseline - confirm this is intended.
CONFIGURATIONS = [
    # Reference point: vector search returning the 5 nearest chunks.
    {
        'name': 'baseline-vector-only',
        'description': 'Vector-only retrieval (simple baseline)',
        'embedding_model': 'all-minilm-l6-v2',
        'top_k': 5,
        'techniques': ['vector_retrieval'],
    },
    # Same retrieval, twice as many chunks returned.
    {
        'name': 'config-variant-1',
        'description': 'Vector retrieval with larger top_k (10 vs 5)',
        'embedding_model': 'all-minilm-l6-v2',
        'top_k': 10,
        'techniques': ['vector_retrieval'],
    },
    # Same retrieval, three times as many chunks returned.
    {
        'name': 'config-variant-2',
        'description': 'Vector-only but increased retrieval set',
        'embedding_model': 'all-minilm-l6-v2',
        'top_k': 15,
        'techniques': ['vector_retrieval'],
    },
]

# p-value below which a paired t-test result is reported as significant.
SIGNIFICANCE_THRESHOLD = 0.05

## Run Baseline

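Simple vector-only retrieval serves as the reference point. The cell below first defines the experiment-tracking and metric helper functions used by every run.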

In [None]:
# ============================================================================
# UTILITY FUNCTIONS: Copied from foundation/00-registry-and-tracking-utilities
# ============================================================================

def compute_config_hash(config_dict: Dict) -> str:
    """Return a short, deterministic fingerprint of a configuration.

    The dict is serialized to JSON with sorted keys, so two dicts with the
    same content produce the same hash regardless of key insertion order.

    Args:
        config_dict: JSON-serializable configuration mapping.

    Returns:
        The first 12 hex characters of the SHA-256 digest of the JSON text.

    Raises:
        TypeError: If ``config_dict`` contains a value ``json.dumps`` cannot
            serialize.
    """
    # Function-scope import keeps this copied helper self-contained without
    # resorting to a dynamic __import__ call.
    import hashlib

    canonical_json = json.dumps(config_dict, sort_keys=True)
    return hashlib.sha256(canonical_json.encode()).hexdigest()[:12]


def start_experiment(db_connection, experiment_name: str, 
                     notebook_path: str = None,
                     embedding_model_alias: str = None,
                     config: Dict = None,
                     techniques: List[str] = None,
                     notes: str = None) -> int:
    """Insert a new row into ``experiments`` with status 'running'.

    Args:
        db_connection: Open database connection exposing ``cursor()``,
            ``commit()`` and ``rollback()``.
        experiment_name: Name stored for the run.
        notebook_path: Optional path of the notebook that started the run.
        embedding_model_alias: Optional embedding model identifier.
        config: Optional configuration dict; stored as JSON and fingerprinted
            with ``compute_config_hash``. Defaults to an empty dict.
        techniques: Optional list of technique names. Defaults to an empty list.
        notes: Optional free-text notes.

    Returns:
        The ``id`` of the inserted experiment row.

    Raises:
        Exception: Whatever the driver raises if the insert or commit fails.
            The transaction is rolled back first so the connection stays
            usable for later statements.
    """
    config = {} if config is None else config
    techniques = [] if techniques is None else techniques

    config_hash = compute_config_hash(config)

    try:
        with db_connection.cursor() as cur:
            cur.execute('''
            INSERT INTO experiments (
                experiment_name, notebook_path, embedding_model_alias,
                config_hash, config_json, techniques_applied, notes, status
            )
            VALUES (%s, %s, %s, %s, %s, %s, %s, 'running')
            RETURNING id
        ''', (
                experiment_name,
                notebook_path,
                embedding_model_alias,
                config_hash,
                json.dumps(config),
                techniques,
                notes
            ))
            exp_id = cur.fetchone()[0]
        db_connection.commit()
    except Exception:
        # Match the sibling helpers: never leave the shared connection inside
        # a failed transaction. The error itself still reaches the caller.
        db_connection.rollback()
        raise
    return exp_id


def complete_experiment(db_connection, experiment_id: int, 
                       status: str = 'completed',
                       notes: str = None) -> bool:
    """Set the final status and completion timestamp of an experiment row.

    Args:
        db_connection: Open database connection exposing ``cursor()``,
            ``commit()`` and ``rollback()``.
        experiment_id: ``id`` of the row in ``experiments`` to update.
        status: Status value to store.
        notes: When truthy, overwrites the row's ``notes`` column; otherwise
            the existing notes are left untouched.

    Returns:
        True if the update was committed. On any exception the transaction
        is rolled back, the error is printed, and False is returned.
    """
    try:
        with db_connection.cursor() as cursor:
            if not notes:
                # No new notes: only touch status and timestamp.
                cursor.execute('''
                    UPDATE experiments
                    SET status = %s, completed_at = CURRENT_TIMESTAMP
                    WHERE id = %s
                ''', (status, experiment_id))
            else:
                cursor.execute('''
                    UPDATE experiments
                    SET status = %s, notes = %s, completed_at = CURRENT_TIMESTAMP
                    WHERE id = %s
                ''', (status, notes, experiment_id))
        db_connection.commit()
    except Exception as exc:
        db_connection.rollback()
        print(f"✗ Failed to complete experiment: {exc}")
        return False
    else:
        return True


def save_metrics(db_connection, experiment_id: int, metrics_dict: Dict) -> Tuple[bool, str]:
    """Insert one ``evaluation_results`` row per entry of ``metrics_dict``.

    Each value is either a plain number or a dict of the form
    ``{'value': <number>, 'details': <dict>}``. All rows are written in a
    single transaction: any value ``float()`` cannot convert (for example a
    list) raises inside the ``try``, so the whole batch is rolled back.

    Args:
        db_connection: Open database connection exposing ``cursor()``,
            ``commit()`` and ``rollback()``.
        experiment_id: ``id`` of the experiment the metrics belong to.
        metrics_dict: Mapping of metric name to value (see above).

    Returns:
        ``(True, message)`` after a successful commit, otherwise
        ``(False, message)`` with the error text; exceptions are not raised.
    """
    try:
        with db_connection.cursor() as cursor:
            for metric_name, raw_value in metrics_dict.items():
                # Structured entries carry the number and optional details.
                structured = isinstance(raw_value, dict)
                metric_val = raw_value.get('value', 0.0) if structured else raw_value
                metric_details = raw_value.get('details', {}) if structured else {}

                numeric_value = float(metric_val)
                details_json = json.dumps(metric_details) if metric_details else '{}'

                cursor.execute('''
                    INSERT INTO evaluation_results (
                        experiment_id, metric_name, metric_value, metric_details_json
                    )
                    VALUES (%s, %s, %s, %s)
                ''', (experiment_id, metric_name, numeric_value, details_json))
        db_connection.commit()
    except Exception as exc:
        db_connection.rollback()
        return False, f"✗ Failed to save metrics: {exc}"

    return True, f"✓ Saved {len(metrics_dict)} metrics for experiment #{experiment_id}"


# ============================================================================
# RETRIEVAL METRICS (from evaluation-lab/02)
# ============================================================================

def precision_at_k(retrieved_chunk_ids: List[int], 
                   relevant_chunk_ids: List[int], 
                   k: int = 5) -> float:
    """Precision@K: fraction of the K result slots filled by relevant chunks.

    The hit count is always divided by ``k``, so a result list shorter than
    ``k`` is scored as if the missing slots were misses. Returns 0.0 when
    ``k`` is 0.
    """
    if k == 0:
        return 0.0

    relevant_lookup = set(relevant_chunk_ids)
    hits = [cid for cid in retrieved_chunk_ids[:k] if cid in relevant_lookup]
    return len(hits) / k


def recall_at_k(retrieved_chunk_ids: List[int], 
                relevant_chunk_ids: List[int], 
                k: int = 5) -> float:
    """Recall@K: fraction of all relevant chunks present in the top-K results.

    Args:
        retrieved_chunk_ids: Retrieved chunk ids in rank order.
        relevant_chunk_ids: Ground-truth relevant chunk ids.
        k: Number of top results to consider.

    Returns:
        A value in [0, 1]; 0.0 when there are no relevant chunks.
    """
    relevant_set = set(relevant_chunk_ids)
    if not relevant_set:
        return 0.0

    # Intersect as sets so an id that appears twice in the results is counted
    # once; counting positions instead could push recall above 1.0.
    found = relevant_set.intersection(retrieved_chunk_ids[:k])
    return len(found) / len(relevant_set)


def mean_reciprocal_rank(retrieved_chunk_ids: List[int], 
                         relevant_chunk_ids: List[int]) -> float:
    """Reciprocal rank of the first relevant result for a single query.

    Ranks start at 1. Returns 0.0 when no retrieved chunk is relevant; the
    caller averages these per-query values to obtain MRR.
    """
    relevant_lookup = set(relevant_chunk_ids)
    first_hit_rank = next(
        (position for position, cid in enumerate(retrieved_chunk_ids, start=1)
         if cid in relevant_lookup),
        None,
    )
    return 0.0 if first_hit_rank is None else 1.0 / first_hit_rank


def ndcg_at_k(retrieved_chunk_ids: List[int], 
              relevant_chunk_ids: List[int], 
              k: int = 5) -> float:
    """NDCG@K with binary relevance.

    DCG is computed over the top-K retrieved chunks and normalized by the
    DCG of an ideal ranking, i.e. one that places ``min(k, number of relevant
    chunks)`` relevant chunks first. Normalizing by a re-sorted copy of the
    retrieved list instead would give a perfect score to any result whose
    only hit is at rank 1, no matter how many relevant chunks were missed.

    Args:
        retrieved_chunk_ids: Retrieved chunk ids in rank order.
        relevant_chunk_ids: Ground-truth relevant chunk ids.
        k: Number of top results to consider.

    Returns:
        A value in [0, 1]; 0.0 when ``k`` is not positive or there are no
        relevant chunks.
    """

    def dcg_score(relevance_scores: List[float]) -> float:
        """Compute DCG from relevance scores listed in rank order."""
        return sum(
            (2**rel - 1) / math.log2(rank + 2)
            for rank, rel in enumerate(relevance_scores)
        )

    relevant_set = set(relevant_chunk_ids)
    if k <= 0 or not relevant_set:
        return 0.0

    # Credit each relevant chunk only the first time it appears, so a
    # duplicated id cannot lift the score above the ideal.
    already_seen = set()
    relevance = []
    for chunk_id in retrieved_chunk_ids[:k]:
        is_new_hit = chunk_id in relevant_set and chunk_id not in already_seen
        relevance.append(1 if is_new_hit else 0)
        already_seen.add(chunk_id)

    ideal_hits = min(k, len(relevant_set))  # always >= 1 here, so IDCG > 0
    return dcg_score(relevance) / dcg_score([1] * ideal_hits)


def evaluate_rag_results(ground_truth_questions: List[Dict], 
                        rag_results: List[Dict], 
                        k_values: List[int] = None) -> Dict:
    """Score retrieval results against ground truth.

    Args:
        ground_truth_questions: Dicts with 'question' and
            'relevant_chunk_ids' keys. A ``None`` id list is treated as empty.
        rag_results: Dicts with 'question' and 'retrieved_chunk_ids' keys.
            Results whose question has no ground-truth entry are skipped.
        k_values: Cut-offs for the @K metrics. Defaults to ``[1, 3, 5, 10]``.

    Returns:
        Dict with the mean of each metric over the scored queries
        ('precision@K', 'recall@K', 'ndcg@K' for every K, plus 'mrr'; 0.0
        when nothing was scored), the list of per-query metric dicts under
        'per_query', and their count under 'num_queries'.
    """
    # None sentinel instead of a list default, which would be one object
    # shared across every call.
    if k_values is None:
        k_values = [1, 3, 5, 10]

    # Questions are matched by their exact text.
    gt_map = {}
    for gt_entry in ground_truth_questions:
        relevant_ids = gt_entry['relevant_chunk_ids']
        gt_map[gt_entry['question']] = [] if relevant_ids is None else relevant_ids

    metrics = {f'precision@{k}': [] for k in k_values}
    metrics.update({f'recall@{k}': [] for k in k_values})
    metrics.update({f'ndcg@{k}': [] for k in k_values})
    metrics['mrr'] = []

    per_query_metrics = []

    for result in rag_results:
        question = result['question']
        if question not in gt_map:
            continue

        retrieved = result['retrieved_chunk_ids']
        relevant = gt_map[question]

        query_metrics = {'question': question}
        for k in k_values:
            query_metrics[f'precision@{k}'] = precision_at_k(retrieved, relevant, k)
            query_metrics[f'recall@{k}'] = recall_at_k(retrieved, relevant, k)
            query_metrics[f'ndcg@{k}'] = ndcg_at_k(retrieved, relevant, k)
        query_metrics['mrr'] = mean_reciprocal_rank(retrieved, relevant)

        # Feed the aggregate accumulators from the per-query record.
        for key, values in metrics.items():
            values.append(query_metrics[key])

        per_query_metrics.append(query_metrics)

    aggregated = {
        key: (np.mean(values) if values else 0.0)
        for key, values in metrics.items()
    }
    aggregated['per_query'] = per_query_metrics
    aggregated['num_queries'] = len(per_query_metrics)

    return aggregated


# Confirm in the cell output that every helper above was defined without error.
print("✓ Utility functions loaded")

## Run Configuration Variations

Run all configured variants with full metrics computation.

In [None]:
# ============================================================================
# PHASE 1: LOAD GROUND TRUTH TEST SET
# ============================================================================

print("\n" + "="*70, "PHASE 1: LOADING GROUND TRUTH TEST SET", "="*70, sep="\n")

# Filled below; stays an empty list if the table has no 'good' rows.
ground_truth_questions = []

with db_connection.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
    # Only questions rated 'good' are used, in a stable id order.
    cur.execute('''
        SELECT 
            id,
            question,
            relevant_chunk_ids,
            quality_rating,
            source_type
        FROM evaluation_groundtruth
        WHERE quality_rating = 'good'
        ORDER BY id
    ''')

    # Copy each row into a plain dict holding exactly the selected columns.
    ground_truth_questions.extend(
        {
            column: record[column]
            for column in ('id', 'question', 'relevant_chunk_ids',
                           'quality_rating', 'source_type')
        }
        for record in cur.fetchall()
    )

print(f"\nLoaded {len(ground_truth_questions)} ground truth questions")
if not ground_truth_questions:
    print("WARNING: No ground truth questions found. Run evaluation-lab/01 first.")
else:
    # Show a short preview of the first question as a sanity check.
    print(f"Sample: {ground_truth_questions[0]['question'][:80]}...")
    print(f"Relevant chunks: {ground_truth_questions[0]['relevant_chunk_ids'][:3]}")


# ============================================================================
# PHASE 2: RETRIEVE RESULTS FOR EACH CONFIGURATION
# ============================================================================

def retrieve_results_for_questions(db_connection, questions: List[Dict], 
                                   embedding_model_alias: str = "all-minilm-l6-v2",
                                   top_k: int = 10) -> List[Dict]:
    """Retrieve the top-K chunk ids for each test question.

    All question texts are embedded in one batch, then one query per question
    orders the ``chunks`` rows for the given embedding model by the ``<->``
    distance to the question embedding.

    Args:
        db_connection: Open psycopg2 connection.
        questions: Dicts with a 'question' key holding the question text.
        embedding_model_alias: Name passed to ``SentenceTransformer`` and
            matched against ``chunks.embedding_model``.
        top_k: Maximum number of chunks returned per question.

    Returns:
        One dict per question, in input order, with 'question' and
        'retrieved_chunk_ids' (ids in ranked order). Empty list when
        ``questions`` is empty.
    """
    # Nothing to retrieve: skip the model load and the database round trips.
    if not questions:
        return []

    from sentence_transformers import SentenceTransformer

    model = SentenceTransformer(embedding_model_alias)
    question_texts = [q['question'] for q in questions]
    question_embeddings = model.encode(question_texts)

    rag_results = []
    with db_connection.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
        for question_dict, question_embedding in zip(questions, question_embeddings):
            # Only the id is used downstream, so the chunk text and stored
            # embedding are not fetched.
            # NOTE(review): presumably `<->` is pgvector's distance operator
            # and the connection can adapt the embedding array (e.g. via
            # pgvector's register_vector) - confirm against the setup notebook.
            cur.execute('''
                SELECT id
                FROM chunks
                WHERE embedding_model = %s
                ORDER BY embedding <-> %s
                LIMIT %s
            ''', (embedding_model_alias, question_embedding, top_k))

            rag_results.append({
                'question': question_dict['question'],
                'retrieved_chunk_ids': [chunk['id'] for chunk in cur.fetchall()]
            })

    return rag_results


print("\n" + "="*70)
print("PHASE 2: RUNNING CONFIGURATIONS")
print("="*70)

# Results of every configuration that ran successfully, keyed by config name.
# A configuration that raises is marked 'failed' and left out of this dict.
all_config_results = {}

for config_def in CONFIGURATIONS:
    config_name = config_def['name']
    
    print(f"\n[{config_name}] {config_def['description']}")
    print("-" * 70)
    
    # Register the run before doing any work so failures are tracked too.
    exp_id = start_experiment(
        db_connection,
        experiment_name=config_name,
        notebook_path='evaluation-lab/03-baseline-and-comparison.ipynb',
        embedding_model_alias=config_def['embedding_model'],
        config=config_def,
        techniques=config_def['techniques'],
        notes=config_def['description']
    )
    
    print(f"  Experiment ID: #{exp_id}")
    
    try:
        # Retrieve results
        top_k_retrieve = config_def['top_k']
        print(f"  Retrieving top {top_k_retrieve} results...")
        
        rag_results = retrieve_results_for_questions(
            db_connection,
            ground_truth_questions,
            embedding_model_alias=config_def['embedding_model'],
            top_k=top_k_retrieve
        )
        
        # Compute metrics
        print(f"  Computing metrics...")
        metrics = evaluate_rag_results(
            ground_truth_questions,
            rag_results,
            k_values=[1, 3, 5, 10]
        )
        
        # Store metrics. 'per_query' is a list of per-question dicts, which
        # save_metrics cannot convert with float(); passing it makes the whole
        # batch roll back, so only the aggregate values are persisted.
        storable_metrics = {
            metric_name: metric_value
            for metric_name, metric_value in metrics.items()
            if metric_name != 'per_query'
        }
        success, msg = save_metrics(db_connection, exp_id, storable_metrics)
        if success:
            print(f"  {msg}")
        else:
            print(f"  ERROR: {msg}")
        
        # Complete experiment
        complete_experiment(db_connection, exp_id, status='completed')
        
        # Keep the full metrics (including per-query detail) for the
        # comparison and significance cells.
        all_config_results[config_name] = {
            'experiment_id': exp_id,
            'config': config_def,
            'metrics': metrics
        }
        
        # Print summary
        print(f"  Precision@5: {metrics.get('precision@5', 0):.4f}")
        print(f"  Recall@5:    {metrics.get('recall@5', 0):.4f}")
        print(f"  NDCG@5:      {metrics.get('ndcg@5', 0):.4f}")
        print(f"  MRR:         {metrics.get('mrr', 0):.4f}")
        print(f"  Queries:     {metrics.get('num_queries', 0)}")
        
    except Exception as e:
        # Record the failure on the experiment row and move on to the next
        # configuration instead of aborting the whole run.
        print(f"  ERROR: Failed to run configuration: {e}")
        complete_experiment(db_connection, exp_id, status='failed', notes=str(e))

print("\n✓ All configurations completed")

## Compare Results

Side-by-side comparison of all configurations with improvement percentages.

In [None]:
# ============================================================================
# PHASE 3: COMPARE RESULTS SIDE-BY-SIDE
# ============================================================================

print("\n" + "="*70, "PHASE 3: COMPARING CONFIGURATIONS", "="*70, sep="\n")

# The first configured entry is the reference every other one is measured against.
baseline_name = CONFIGURATIONS[0]['name']
baseline_metrics = all_config_results[baseline_name]['metrics']

print(f"\nBaseline: {baseline_name}")
print(f"Variants: {', '.join(c['name'] for c in CONFIGURATIONS[1:])}")

# Long-format comparison table: one row per (configuration, metric).
comparisons = []

for config_name, data in all_config_results.items():
    config_metrics = data['metrics']
    is_baseline_config = config_name == baseline_name

    for metric in ['precision@5', 'recall@5', 'mrr', 'ndcg@5']:
        metric_value = config_metrics.get(metric, 0.0)

        if is_baseline_config:
            # The baseline is listed with a zero improvement by definition.
            comparisons.append({
                'configuration': config_name,
                'metric': metric,
                'value': metric_value,
                'improvement_%': 0.0,
                'is_baseline': True
            })
            continue

        # Relative change against the baseline; reported as 0 when the
        # baseline value is not positive.
        baseline_val = baseline_metrics.get(metric, 0.0)
        if baseline_val > 0:
            improvement = ((metric_value - baseline_val) / baseline_val) * 100
        else:
            improvement = 0.0

        comparisons.append({
            'configuration': config_name,
            'metric': metric,
            'value': metric_value,
            'baseline': baseline_val,
            'improvement_%': improvement,
            'is_baseline': False
        })

comparison_df = pd.DataFrame(comparisons)

# Print one small table per metric.
print("\nMetrics Comparison Table:")
print("-" * 100)

for metric in ['precision@5', 'recall@5', 'mrr', 'ndcg@5']:
    print(f"\n{metric.upper()}")
    print("-" * 100)
    print(f"{'Configuration':<30} {'Value':<15} {'Baseline':<15} {'Improvement':<15}")
    print("-" * 100)

    metric_data = comparison_df[comparison_df['metric'] == metric]

    for _, row in metric_data.iterrows():
        if row['is_baseline']:
            print(f"{row['configuration']:<30} {row['value']:<15.4f}")
            continue

        baseline = row.get('baseline', 0.0)
        improvement = row['improvement_%']
        if improvement > 0:
            marker = '↑'
        elif improvement < 0:
            marker = '↓'
        else:
            marker = '='
        print(f"{row['configuration']:<30} {row['value']:<15.4f} {baseline:<15.4f} {marker} {improvement:+.2f}%")

print("\n✓ Comparison complete")

## Statistical Significance

Test whether improvements are statistically significant using paired t-tests.

In [None]:
# ============================================================================
# PHASE 4: STATISTICAL SIGNIFICANCE TESTING
# ============================================================================

# Section banner for the notebook output.
print("\n" + "="*70, "PHASE 4: STATISTICAL SIGNIFICANCE TESTING", "="*70, sep="\n")

def paired_t_test(baseline_results, variant_results, metric_name='precision@5'):
    """
    Run a paired t-test on one metric measured over the same queries.

    Both inputs must list the queries in the same order, because values are
    paired by position.

    Args:
        baseline_results: List of per-query metric dicts for the baseline
        variant_results: List of per-query metric dicts for the variant
        metric_name: Key of the metric to test

    Returns:
        dict with t_statistic, p_value, significant
        (p_value < SIGNIFICANCE_THRESHOLD), effect_size (mean difference
        divided by the population standard deviation of the baseline values,
        0.0 when that deviation is zero), n_queries, baseline_mean and
        variant_mean. With fewer than two baseline queries, or when every
        paired value is identical, no test is run and the result is
        t_statistic=0.0, p_value=1.0, significant=False.

    Raises:
        ValueError: From scipy when the two result lists differ in length.
    """
    baseline_values = [r[metric_name] for r in baseline_results]
    variant_values = [r[metric_name] for r in variant_results]
    n_queries = len(baseline_values)

    # Guard the means so an empty input does not produce NaN plus a warning.
    baseline_mean = float(np.mean(baseline_values)) if baseline_values else 0.0
    variant_mean = float(np.mean(variant_values)) if variant_values else 0.0

    def build_result(t_statistic, p_value, significant, effect_size):
        """Assemble the result dict so every return path has the same keys."""
        return {
            't_statistic': t_statistic,
            'p_value': p_value,
            'significant': significant,
            'effect_size': effect_size,
            'n_queries': n_queries,
            'baseline_mean': baseline_mean,
            'variant_mean': variant_mean
        }

    # A t-test needs at least two pairs.
    if n_queries < 2:
        return build_result(0.0, 1.0, False, 0.0)

    baseline_std = np.std(baseline_values)
    if baseline_std > 0:
        effect_size = float((variant_mean - baseline_mean) / baseline_std)
    else:
        effect_size = 0.0

    # When every paired difference is zero the t statistic is 0/0 and scipy
    # reports NaN; return the "no evidence of a difference" result instead.
    same_length = len(variant_values) == n_queries
    if same_length and all(v == b for v, b in zip(variant_values, baseline_values)):
        return build_result(0.0, 1.0, False, effect_size)

    t_stat, p_value = stats.ttest_rel(variant_values, baseline_values)

    return build_result(
        t_stat,
        p_value,
        bool(p_value < SIGNIFICANCE_THRESHOLD),
        effect_size
    )


# Run statistical tests for each variant
print(f"\nSignificance threshold: p < {SIGNIFICANCE_THRESHOLD}")
print("\nStatistical Significance Tests (Paired t-test):")
print("-" * 110)

significance_results = []

baseline_per_query = baseline_metrics['per_query']

for config_name, data in all_config_results.items():
    # The baseline is the reference; it is not tested against itself.
    if config_name == baseline_name:
        continue
    
    config_metrics = data['metrics']
    variant_per_query = config_metrics['per_query']
    
    print(f"\n{config_name}:")
    print(f"  {'Metric':<15} {'t-stat':<12} {'p-value':<12} {'Significant':<12} {'Effect Size':<12} {'N Queries':<10}")
    print("  " + "-" * 100)
    
    for metric in ['precision@5', 'recall@5', 'mrr', 'ndcg@5']:
        result = paired_t_test(baseline_per_query, variant_per_query, metric)
        
        sig_marker = "YES *" if result['significant'] else "NO"
        print(f"  {metric:<15} {result['t_statistic']:<12.4f} {result['p_value']:<12.4f} {sig_marker:<12} {result['effect_size']:<12.4f} {result['n_queries']:<10}")
        
        significance_results.append({
            'configuration': config_name,
            'metric': metric,
            't_statistic': result['t_statistic'],
            'p_value': result['p_value'],
            'significant': result['significant'],
            'effect_size': result['effect_size'],
            # The means may be absent when the test was skipped for lack of
            # data; record NaN rather than failing the whole cell.
            'baseline_mean': result.get('baseline_mean', float('nan')),
            'variant_mean': result.get('variant_mean', float('nan')),
            'n_queries': result['n_queries']
        })

# Name the columns explicitly so the frame keeps its schema even when no
# variant ran and the result list is empty.
significance_df = pd.DataFrame(
    significance_results,
    columns=[
        'configuration', 'metric', 't_statistic', 'p_value', 'significant',
        'effect_size', 'baseline_mean', 'variant_mean', 'n_queries'
    ]
)

# Summary of significant results. The heading reports the configured
# threshold instead of a hard-coded 0.05.
print("\n" + "="*110)
print(f"SUMMARY: Statistically Significant Improvements (p < {SIGNIFICANCE_THRESHOLD})")
print("="*110)

# astype(bool) keeps the mask valid for an empty (object-dtype) column.
significant_improvements = significance_df[significance_df['significant'].astype(bool)]

if len(significant_improvements) > 0:
    for _, row in significant_improvements.iterrows():
        # The sign of the percentage shows whether the variant is better or worse.
        improvement = ((row['variant_mean'] - row['baseline_mean']) / row['baseline_mean'] * 100) if row['baseline_mean'] > 0 else 0
        print(f"\n{row['configuration']} - {row['metric']}:")
        print(f"  Baseline: {row['baseline_mean']:.4f}")
        print(f"  Variant:  {row['variant_mean']:.4f}")
        print(f"  Improvement: {improvement:+.2f}%")
        print(f"  t-statistic: {row['t_statistic']:.4f}")
        print(f"  p-value: {row['p_value']:.4f}")
        print(f"  Effect size (Cohen's d): {row['effect_size']:.4f}")
else:
    print(f"\nNo statistically significant improvements found (p < {SIGNIFICANCE_THRESHOLD})")
    print("Note: This doesn't mean configurations are identical - consider increasing sample size or lowering threshold.")

print("\n✓ Statistical significance testing complete")

## Visualize Comparison

Create visualizations showing metric improvements and distributions.

In [None]:
# ============================================================================
# PHASE 5: VISUALIZE COMPARISON
# ============================================================================

# Section banner for the notebook output.
print("\n" + "="*70, "PHASE 5: VISUALIZATION", "="*70, sep="\n")

def visualize_configuration_comparison(comparison_df, baseline_name):
    """Draw a 2x2 grid of bar charts, one per metric, of % change vs baseline.

    Args:
        comparison_df: Long-format frame with 'configuration', 'metric',
            'value' and 'improvement_%' columns.
        baseline_name: Configuration name treated as the reference; its value
            is shown in each subplot title and it gets no bar of its own.

    A subplot is left blank when its metric has no non-baseline rows.
    """
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    fig.suptitle('Configuration Comparison: Improvement vs Baseline', fontsize=16, fontweight='bold')

    def bar_color(change_pct):
        """Green for a gain, red for a loss, gray for no change."""
        if change_pct > 0:
            return 'green'
        if change_pct < 0:
            return 'red'
        return 'gray'

    # axes.flat walks the grid row by row: top-left, top-right, bottom-left, ...
    for ax, metric in zip(axes.flat, ['precision@5', 'recall@5', 'mrr', 'ndcg@5']):
        subset = comparison_df[comparison_df['metric'] == metric]
        is_baseline_row = subset['configuration'] == baseline_name

        baseline_values = subset.loc[is_baseline_row, 'value'].values
        baseline_val = baseline_values[0] if len(baseline_values) > 0 else 0

        variant_data = subset[~is_baseline_row]
        if len(variant_data) == 0:
            continue

        configs = variant_data['configuration'].values
        improvements = variant_data['improvement_%'].values
        values = variant_data['value'].values
        positions = range(len(configs))

        bars = ax.bar(positions, improvements,
                      color=[bar_color(imp) for imp in improvements],
                      alpha=0.7, edgecolor='black', linewidth=1.5)

        # Label each bar with its % change and the absolute metric value;
        # the label sits above positive bars and below the others.
        for bar, val, imp in zip(bars, values, improvements):
            height = bar.get_height()
            label_anchor = 'bottom' if height > 0 else 'top'
            ax.text(bar.get_x() + bar.get_width()/2., height,
                    f'{imp:+.1f}%\n({val:.3f})', ha='center', va=label_anchor, fontsize=9)

        ax.set_xticks(positions)
        ax.set_xticklabels(configs, rotation=45, ha='right')
        ax.set_ylabel('Improvement %', fontweight='bold')
        ax.set_title(f'{metric.upper()} (Baseline: {baseline_val:.4f})', fontweight='bold')
        ax.axhline(y=0, color='black', linestyle='-', linewidth=1)
        ax.grid(True, alpha=0.3, axis='y')

    plt.tight_layout()
    plt.show()
    print("✓ Comparison chart generated")


visualize_configuration_comparison(comparison_df, baseline_name)


# Distribution plots: one overlaid histogram per configuration, showing how
# the per-query values of each metric are spread.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
fig.suptitle('Per-Query Metric Distributions', fontsize=14, fontweight='bold')

for ax, metric_key, metric_label in zip(axes,
                                        ('precision@5', 'recall@5'),
                                        ('Precision@5', 'Recall@5')):
    for config_name, data in all_config_results.items():
        per_query_values = [q[metric_key] for q in data['metrics']['per_query']]
        ax.hist(per_query_values, alpha=0.5, label=config_name, bins=10, edgecolor='black')

    ax.set_xlabel(metric_label, fontweight='bold')
    ax.set_ylabel('Frequency', fontweight='bold')
    ax.set_title(f'Distribution of {metric_label} Across Queries', fontweight='bold')
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()
print("✓ Distribution charts generated")


# Metric comparison heatmap: one row per configuration, one column per metric.
fig, ax = plt.subplots(figsize=(10, 6))

metrics_list = ['precision@1', 'precision@3', 'precision@5', 'precision@10',
                'recall@1', 'recall@3', 'recall@5', 'recall@10',
                'ndcg@1', 'ndcg@3', 'ndcg@5', 'ndcg@10', 'mrr']

config_names = list(all_config_results)
# A metric missing from a configuration's results is shown as 0.0.
heatmap_data = [
    [data['metrics'].get(metric, 0.0) for metric in metrics_list]
    for data in all_config_results.values()
]
heatmap_array = np.array(heatmap_data)

# Fixed 0..1 colour scale so colours are comparable between runs.
im = ax.imshow(heatmap_array, cmap='RdYlGn', aspect='auto', vmin=0, vmax=1)

ax.set_xticks(range(len(metrics_list)))
ax.set_xticklabels(metrics_list, rotation=45, ha='right', fontsize=9)
ax.set_yticks(range(len(config_names)))
ax.set_yticklabels(config_names)

ax.set_title('Metrics Heatmap Across Configurations', fontweight='bold', pad=20)

# Write the numeric score into every cell.
for (row_idx, col_idx), score in np.ndenumerate(heatmap_array):
    ax.text(col_idx, row_idx, f'{score:.2f}',
            ha="center", va="center", color="black", fontsize=8)

cbar = plt.colorbar(im, ax=ax)
cbar.set_label('Score', fontweight='bold')

plt.tight_layout()
plt.show()
print("✓ Heatmap generated")

## Recommendation

Generate production recommendation based on statistical significance and trade-off analysis.

In [None]:
# ============================================================================
# PHASE 6: RECOMMENDATION
# ============================================================================

# Section banner for the notebook output.
print("\n" + "="*70, "PHASE 6: RECOMMENDATION", "="*70, sep="\n")

def generate_recommendation(comparison_df, significance_df, all_config_results, baseline_name):
    """
    Build a plain-text recommendation report from the comparison results.

    The report has six sections, in this order: header, best configuration
    per metric, production recommendation (the configuration that is best on
    the most metrics), trade-off analysis per variant, caveats, next steps.

    Args:
        comparison_df: Long-format frame with 'configuration', 'metric',
            'value' and 'improvement_%' columns.
        significance_df: Frame with 'configuration', 'metric' and
            'significant' columns (one row per tested variant/metric).
        all_config_results: Mapping of configuration name to a dict with
            'config' and 'metrics' entries.
        baseline_name: Name of the reference configuration.

    Returns:
        str: The report lines joined with newlines.
    """
    report = []
    report.append("\n" + "="*80)
    report.append("CONFIGURATION RECOMMENDATION REPORT")
    report.append("="*80)
    report.append(f"\nBaseline: {baseline_name}")
    report.append(f"Variants: {', '.join([c for c in all_config_results.keys() if c != baseline_name])}")
    report.append(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    report.append(f"Significance Threshold: p < {SIGNIFICANCE_THRESHOLD}")
    
    report.append("\n" + "-"*80)
    report.append("BEST CONFIGURATIONS BY METRIC")
    report.append("-"*80)
    
    # Find best configuration per metric
    best_configs = {}
    for metric in ['precision@5', 'recall@5', 'mrr', 'ndcg@5']:
        subset = comparison_df[comparison_df['metric'] == metric]
        
        # Find best absolute value. idxmax returns the first row holding the
        # maximum, so on a tie the earliest row in comparison_df wins.
        best_idx = subset['value'].idxmax()
        best_row = subset.loc[best_idx]
        
        # Look up the significance test for the winning configuration/metric
        sig_data = significance_df[
            (significance_df['configuration'] == best_row['configuration']) & 
            (significance_df['metric'] == metric)
        ]
        
        is_significant = False
        if len(sig_data) > 0:
            is_significant = sig_data['significant'].values[0]
        elif best_row['configuration'] == baseline_name:
            # NOTE(review): no test is run for the baseline against itself;
            # flagging it True makes baseline wins count as "statistically
            # significant wins" below - confirm this is intended.
            is_significant = True  # Baseline is inherently significant
        
        best_configs[metric] = {
            'config': best_row['configuration'],
            'value': best_row['value'],
            'improvement': best_row['improvement_%'],
            'significant': is_significant
        }
        
        # A trailing "*" marks a result flagged as significant
        sig_marker = "*" if is_significant else ""
        report.append(f"\n{metric}:")
        report.append(f"  Best: {best_row['configuration']} (value: {best_row['value']:.4f}){sig_marker}")
        if best_row['configuration'] != baseline_name:
            report.append(f"  Improvement vs baseline: {best_row['improvement_%']:+.2f}%")
    
    # Overall recommendation
    report.append("\n" + "-"*80)
    report.append("PRODUCTION RECOMMENDATION")
    report.append("-"*80)
    
    # Count wins per configuration (all wins, and wins flagged significant)
    config_wins = {}
    config_sig_wins = {}
    
    for metric, data in best_configs.items():
        config = data['config']
        config_wins[config] = config_wins.get(config, 0) + 1
        
        if data['significant']:
            config_sig_wins[config] = config_sig_wins.get(config, 0) + 1
    
    if len(config_wins) == 0:
        report.append("\nERROR: No configurations to recommend")
        return "\n".join(report)
    
    # The configuration that is best on the most metrics is recommended;
    # max() keeps the first key encountered when several are tied.
    winner = max(config_wins, key=config_wins.get)
    winner_wins = config_wins[winner]
    winner_sig_wins = config_sig_wins.get(winner, 0)
    
    report.append(f"\nRecommended Configuration: {winner}")
    report.append(f"  Wins best in {winner_wins} out of 4 metrics")
    report.append(f"  Statistically significant wins: {winner_sig_wins} out of {winner_wins}")
    
    if winner != baseline_name:
        # Compute average improvement over the four headline metrics
        improvements = []
        for metric in ['precision@5', 'recall@5', 'mrr', 'ndcg@5']:
            subset = comparison_df[
                (comparison_df['configuration'] == winner) & 
                (comparison_df['metric'] == metric)
            ]
            if len(subset) > 0:
                improvements.append(subset['improvement_%'].values[0])
        
        avg_improvement = np.mean(improvements) if improvements else 0
        report.append(f"  Average improvement across all metrics: {avg_improvement:+.2f}%")
    
    # Trade-offs analysis: one short summary per non-baseline configuration
    report.append("\n" + "-"*80)
    report.append("TRADE-OFF ANALYSIS")
    report.append("-"*80)
    
    for config_name in all_config_results.keys():
        if config_name == baseline_name:
            continue
        
        config_data = all_config_results[config_name]
        
        report.append(f"\n{config_name}:")
        
        # Compute average performance as the unweighted mean of the four
        # headline metrics, for the variant and for the baseline
        metrics_to_check = ['precision@5', 'recall@5', 'mrr', 'ndcg@5']
        avg_value = np.mean([config_data['metrics'].get(m, 0) for m in metrics_to_check])
        baseline_avg = np.mean([all_config_results[baseline_name]['metrics'].get(m, 0) for m in metrics_to_check])
        
        avg_improvement = ((avg_value - baseline_avg) / baseline_avg * 100) if baseline_avg > 0 else 0
        
        report.append(f"  Average metric value: {avg_value:.4f} ({avg_improvement:+.2f}% vs baseline)")
        report.append(f"  Configuration details: {config_data['config']['techniques']}")
    
    # Caveats and assumptions
    report.append("\n" + "-"*80)
    report.append("CAVEATS & ASSUMPTIONS")
    report.append("-"*80)
    
    # Sample-size warning tiers: below 20 queries, then below 50
    n_queries = all_config_results[baseline_name]['metrics'].get('num_queries', 0)
    report.append(f"\n1. Sample Size: {n_queries} test queries")
    if n_queries < 20:
        report.append("   WARNING: Small sample size. Consider collecting more ground truth for reliable results.")
    elif n_queries < 50:
        report.append("   Note: Sample size is moderate. Results may vary with larger datasets.")
    
    report.append(f"\n2. Metric Coverage: Evaluation based on")
    report.append("   - Precision@5, Recall@5 (retrieval quality)")
    report.append("   - NDCG@5 (ranking quality)")
    report.append("   - MRR (user satisfaction)")
    
    report.append(f"\n3. Statistical Test: Paired t-test at p < {SIGNIFICANCE_THRESHOLD}")
    report.append("   - Appropriate for comparing same set of queries")
    report.append("   - Assumes approximately normal distribution")
    
    # NOTE(review): the embedding model name below is a fixed string, not read
    # from the evaluated configurations - update it if the configs change.
    report.append(f"\n4. Configuration Context:")
    report.append(f"   - Embedding model: all-minilm-l6-v2")
    report.append(f"   - Ground truth created: evaluation-lab/01")
    report.append(f"   - Note: Results are specific to this test set and may differ on other domains")
    
    # Next steps
    report.append("\n" + "-"*80)
    report.append("NEXT STEPS")
    report.append("-"*80)
    
    # NOTE(review): this branch is chosen when ANY row of significance_df is
    # significant, whichever configuration, metric or direction it belongs
    # to - confirm that is the intended trigger for the deploy advice.
    if len(significance_df[significance_df['significant']]) > 0:
        report.append(f"\n1. VALIDATE: Run recommended configuration on holdout test set")
        report.append(f"2. DEPLOY: If results hold, deploy {winner} to production")
        report.append(f"3. MONITOR: Track metrics in production to detect drift")
    else:
        report.append(f"\n1. COLLECT MORE DATA: Current sample size may be too small")
        report.append(f"2. ADJUST THRESHOLD: Consider lowering significance threshold if practical difference exists")
        report.append(f"3. ANALYZE FAILURE CASES: Look at worst-performing queries for insights")
    
    report.append(f"\n4. EXPLORE ADVANCED TECHNIQUES:")
    report.append(f"   - Reranking with cross-encoders")
    report.append(f"   - Query expansion with LLM")
    report.append(f"   - Hybrid search combining multiple modalities")
    report.append(f"   See advanced-techniques/ notebooks for implementations")
    
    report.append("\n" + "="*80 + "\n")
    
    return "\n".join(report)


recommendation_report = generate_recommendation(comparison_df, significance_df, all_config_results, baseline_name)
print(recommendation_report)

# Save recommendation to a timestamped file so earlier reports are kept.
import os
os.makedirs('data/experiment_results', exist_ok=True)
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
report_path = f"data/experiment_results/baseline_comparison_{timestamp}.txt"
# Explicit encoding: the report embeds configuration names, and the platform
# default encoding is not guaranteed to be able to represent them.
with open(report_path, 'w', encoding='utf-8') as f:
    f.write(recommendation_report)
print(f"✓ Report saved to: {report_path}")