## Prerequisites

1. ✅ foundation/00-setup-postgres-schema.ipynb
2. ✅ evaluation-lab/01-03 (ground-truth, metrics, comparison)

## Configuration

In [None]:
# PostgreSQL connection configuration.
# Every value can be overridden through an environment variable so real
# credentials never have to be written into the notebook; the fallbacks
# reproduce the local demo defaults.
import os

POSTGRES_CONFIG = {
    'host': os.environ.get('POSTGRES_HOST', 'localhost'),
    # The environment only carries strings, so the port is converted explicitly.
    'port': int(os.environ.get('POSTGRES_PORT', '5432')),
    'database': os.environ.get('POSTGRES_DB', 'rag_db'),
    'user': os.environ.get('POSTGRES_USER', 'postgres'),
    'password': os.environ.get('POSTGRES_PASSWORD', 'postgres'),
}

In [None]:
def get_db_connection():
    """
    Establish connection to PostgreSQL database.

    Connection parameters are read from the module-level POSTGRES_CONFIG dict.

    Returns:
        psycopg2 connection object

    Raises:
        psycopg2.OperationalError: if the connection cannot be established.
            A diagnostic line is printed before the error is re-raised.
    """
    # Imported locally: this cell sits above the notebook's import cell, so on
    # a fresh top-to-bottom run the module-level name would not exist yet.
    import psycopg2

    try:
        return psycopg2.connect(
            host=POSTGRES_CONFIG['host'],
            port=POSTGRES_CONFIG['port'],
            database=POSTGRES_CONFIG['database'],
            user=POSTGRES_CONFIG['user'],
            password=POSTGRES_CONFIG['password'],
        )
    except psycopg2.OperationalError as e:
        print(f"✗ Failed to connect to PostgreSQL: {e}")
        raise

# Smoke-test the configuration: open a connection and close it straight away.
# Any failure is reported rather than raised so the notebook keeps running.
try:
    test_conn = get_db_connection()
    test_conn.close()
except Exception as e:
    print(f"✗ Database connection failed: {e}")
else:
    print("✓ Database connection successful")

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
import psycopg2
from datetime import datetime
import json
import os
from pathlib import Path

# Set style for better visualizations.
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8-*'.
# Fall back to the legacy name so the notebook also runs on older releases.
_preferred_style = 'seaborn-v0_8-darkgrid'
if _preferred_style not in plt.style.available:
    _preferred_style = 'seaborn-darkgrid'
plt.style.use(_preferred_style)
sns.set_palette("husl")

## Load All Experiments

In [None]:
def load_all_experiments(db_connection, filter_status='completed', limit=50):
    """
    Load experiments with their metrics from database.

    Selects the most recent `limit` experiments with the requested status,
    left-joins their rows from evaluation_results, and pivots the metrics
    from rows to columns so the result has one row per experiment.
    Experiments without any metric rows are kept (they simply have no metric
    values).

    Args:
        db_connection: PostgreSQL connection object
        filter_status: Filter by experiment status ('success', 'completed', 'failed', etc.)
        limit: Maximum number of experiments to load (most recent by started_at)

    Returns:
        DataFrame with columns: id, experiment_name, embedding_model_alias, techniques_applied,
                              started_at, completed_at, status, config_json, plus metric columns.
        Rows are sorted by started_at ascending. When nothing matches, the
        empty (unpivoted) query result is returned.
    """
    import pandas as pd

    metadata_cols = ['id', 'experiment_name', 'embedding_model_alias', 'techniques_applied',
                     'started_at', 'completed_at', 'status', 'config_json']

    # LIMIT lives in the sub-select, i.e. it is applied *before* the join, so
    # it bounds the number of experiments rather than the number of metric rows.
    query = '''
        SELECT
            e.id,
            e.experiment_name,
            e.embedding_model_alias,
            e.techniques_applied,
            e.started_at,
            e.completed_at,
            e.status,
            e.config_json,
            r.metric_name,
            r.metric_value
        FROM (
            SELECT id, experiment_name, embedding_model_alias, techniques_applied,
                   started_at, completed_at, status, config_json
            FROM experiments
            WHERE status = %s
            ORDER BY started_at DESC
            LIMIT %s
        ) e
        LEFT JOIN evaluation_results r ON e.id = r.experiment_id
        ORDER BY e.started_at DESC
    '''

    # Load data into DataFrame
    df = pd.read_sql(query, db_connection, params=(filter_status, limit))

    if df.empty:
        print(f"⚠ No experiments found with status '{filter_status}'")
        return df

    # One metadata row per experiment. De-duplicating on `id` alone avoids
    # hashing the array/JSON columns, whose values may be lists or dicts.
    metadata = df[metadata_cols].drop_duplicates(subset='id').reset_index(drop=True)

    # Pivot metrics from rows to columns, keyed only by experiment id, then
    # attach them to the metadata. The left merge keeps experiments that have
    # no metric rows at all.
    metric_rows = df.dropna(subset=['metric_name', 'metric_value'])
    if metric_rows.empty:
        pivot = metadata
    else:
        metrics = metric_rows.pivot_table(
            index='id',
            columns='metric_name',
            values='metric_value',
            aggfunc='first'  # In case of duplicates, take first value
        ).reset_index()
        metrics.columns.name = None
        pivot = metadata.merge(metrics, on='id', how='left')

    def _as_technique_list(value):
        """Normalise a techniques cell to a plain list (missing -> [])."""
        if isinstance(value, np.ndarray):
            return value.tolist()
        if isinstance(value, (list, tuple)):
            return list(value)
        if value is None or (isinstance(value, float) and np.isnan(value)):
            return []
        return value or []

    pivot['techniques_applied'] = pivot['techniques_applied'].apply(_as_technique_list)

    # Convert timestamps
    pivot['started_at'] = pd.to_datetime(pivot['started_at'])
    pivot['completed_at'] = pd.to_datetime(pivot['completed_at'])

    # Sort by started_at (oldest first for timeline)
    pivot = pivot.sort_values('started_at').reset_index(drop=True)

    metric_columns = [col for col in pivot.columns if col not in metadata_cols]
    print(f"✓ Loaded {len(pivot)} experiments with {len(pivot.columns)} columns")
    print(f"  Metrics found: {metric_columns}")

    return pivot

# Load experiments.
# NOTE(review): FILTER_STATUS and LIMIT_EXPERIMENTS are expected to be defined
# in an earlier configuration cell - confirm they exist before running.
conn = get_db_connection()
try:
    experiments_df = load_all_experiments(conn, filter_status=FILTER_STATUS, limit=LIMIT_EXPERIMENTS)
finally:
    # Release the connection even when the query fails.
    conn.close()
print(f"\nDataFrame shape: {experiments_df.shape}")
if not experiments_df.empty:
    print(f"Date range: {experiments_df['started_at'].min()} to {experiments_df['started_at'].max()}")

## Timeline View

In [None]:
def plot_metrics_timeline(experiments_df, metric_names=None, figsize=(14, 6)):
    """
    Plot multiple metrics over time as separate lines on same plot.

    Experiments are ordered by started_at and placed on the x-axis by their
    sequence number. Each metric's maximum value is marked with a larger,
    outlined point, and every experiment is labelled with (the first 15
    characters of) its name.

    Args:
        experiments_df: DataFrame with experiments and metrics
        metric_names: List of metrics to plot (default: SECONDARY_METRICS)
        figsize: Figure size tuple (width, height)

    Returns:
        None. The figure is shown via plt.show(); nothing is plotted when none
        of the requested metrics is a column of experiments_df.
    """
    if metric_names is None:
        metric_names = SECONDARY_METRICS

    # Filter to available metrics
    available_metrics = [m for m in metric_names if m in experiments_df.columns]
    if not available_metrics:
        print(f"⚠ No metrics found from {metric_names}")
        return

    # Sort by date (should already be sorted, but ensure); sort_values returns
    # a new frame, so adding 'seq' does not touch the caller's data.
    df = experiments_df.sort_values('started_at').reset_index(drop=True)
    df['seq'] = range(len(df))  # Add sequence number for x-axis

    fig, ax = plt.subplots(figsize=figsize)

    # Plot each metric
    colors = plt.cm.Set2(np.linspace(0, 1, len(available_metrics)))
    for metric, color in zip(available_metrics, colors):
        valid_data = df[df[metric].notna()]
        if valid_data.empty:
            # A column with no values has nothing to draw, and idxmax() would
            # raise on it.
            continue

        ax.plot(valid_data['seq'], valid_data[metric],
                marker='o', linestyle='-', linewidth=2, label=metric, color=color)

        # Highlight best (maximum) value for this metric
        best_row = valid_data.loc[valid_data[metric].idxmax()]
        ax.scatter(best_row['seq'], best_row[metric], s=200, zorder=5,
                   color=color, edgecolors='black', linewidths=2)

    # Add experiment name annotations. The y position is read once: neither
    # the guide lines nor the labels change the y-limits.
    label_y = ax.get_ylim()[1] * 0.95
    for seq, name in zip(df['seq'], df['experiment_name']):
        ax.axvline(x=seq, alpha=0.1, color='gray', linestyle='--')
        # Rotate to avoid overlap; str() guards against non-string names
        ax.text(seq, label_y, str(name)[:15],
                rotation=45, ha='right', fontsize=7, alpha=0.7)

    ax.set_xlabel('Experiment Sequence', fontsize=12)
    ax.set_ylabel('Metric Value', fontsize=12)
    ax.set_title('Metrics Timeline Across Experiments', fontsize=14, fontweight='bold')
    ax.grid(True, alpha=0.3)
    ax.legend(loc='best', fontsize=10)

    plt.tight_layout()
    plt.show()

    print(f"✓ Timeline visualization created for {len(available_metrics)} metrics")

# Create timeline.
# Skipped when no experiments were loaded; the default metric list
# (SECONDARY_METRICS) is used because no metric_names are passed.
if not experiments_df.empty:
    plot_metrics_timeline(experiments_df)

## Quality vs. Latency Trade-off

In [None]:
def find_pareto_frontier(df, quality_metric='precision_at_5', latency_metric='avg_latency_ms'):
    """
    Return the index labels of the non-dominated (Pareto-optimal) rows.

    A row is dominated when some other row is at least as good on both axes
    (quality >= and latency <=) and strictly better on at least one of them.
    Rows that are exact ties on both axes do not dominate each other.

    Args:
        df: DataFrame with experiments
        quality_metric: Column name for quality (higher is better)
        latency_metric: Column name for latency (lower is better)

    Returns:
        List of index labels of df, in row order, that are Pareto-optimal
    """
    def dominates(rival, target):
        # "No worse on either axis" and "strictly better on at least one".
        rival_q, rival_l = rival[quality_metric], rival[latency_metric]
        target_q, target_l = target[quality_metric], target[latency_metric]
        no_worse = rival_q >= target_q and rival_l <= target_l
        strictly_better = rival_q > target_q or rival_l < target_l
        return no_worse and strictly_better

    return [
        label
        for label, point in df.iterrows()
        if not any(dominates(rival, point) for _, rival in df.iterrows())
    ]


def plot_pareto_frontier(experiments_df, quality_metric=PRIMARY_METRIC,
                        latency_metric='avg_latency_ms', figsize=(12, 7)):
    """
    Scatter-plot quality against latency and mark the Pareto frontier.

    The plot has latency on the x-axis and the quality metric on the y-axis.
    Points are coloured by how many techniques each experiment applied and
    labelled with the first 20 characters of the experiment name. The
    Pareto-optimal points are ringed in red and joined by a red dashed line.

    If the latency column is missing, the row position (0..n-1) is used as a
    stand-in latency. Rows missing either metric are ignored.

    Args:
        experiments_df: DataFrame with experiments
        quality_metric: Metric name for quality (y-axis)
        latency_metric: Metric name for latency (x-axis)
        figsize: Figure size tuple

    Returns:
        DataFrame of the Pareto-optimal rows sorted by latency, or None when
        the quality metric is missing or no usable rows remain.
    """
    known_columns = experiments_df.columns

    # Without a quality column there is nothing to rank.
    if quality_metric not in known_columns:
        print(f"⚠ Quality metric '{quality_metric}' not found in data")
        return None

    # Work on a copy so the proxy column never leaks into the caller's frame.
    df = experiments_df.copy()
    if latency_metric not in known_columns:
        print(f"⚠ Latency metric '{latency_metric}' not found in data")
        df[latency_metric] = range(len(df))
        print("  Using row index as latency proxy")

    # Only rows that have both coordinates can be placed on the plot.
    df = df.dropna(subset=[quality_metric, latency_metric])
    if df.empty:
        print("⚠ No valid data for Pareto frontier visualization")
        return None

    fig, ax = plt.subplots(figsize=figsize)

    # Colour encodes the number of techniques applied.
    technique_counts = df['techniques_applied'].apply(len)
    points = ax.scatter(
        df[latency_metric],
        df[quality_metric],
        s=150,
        c=technique_counts,
        cmap='viridis',
        alpha=0.7,
        edgecolors='black',
        linewidth=1
    )
    plt.colorbar(points, ax=ax).set_label('Number of Techniques', fontsize=10)

    # Label every point with its (truncated) experiment name.
    for _, experiment in df.iterrows():
        ax.annotate(
            experiment['experiment_name'][:20],
            (experiment[latency_metric], experiment[quality_metric]),
            fontsize=8,
            alpha=0.8,
            xytext=(5, 5),
            textcoords='offset points'
        )

    # Overlay the frontier: dashed line through the optimal points plus rings.
    pareto_indices = find_pareto_frontier(df, quality_metric, latency_metric)
    if not pareto_indices:
        print("⚠ No Pareto-optimal points found")
        pareto_df = None
    else:
        pareto_df = df.loc[pareto_indices].sort_values(latency_metric)
        frontier_x = pareto_df[latency_metric]
        frontier_y = pareto_df[quality_metric]

        ax.plot(frontier_x, frontier_y,
                'r--', linewidth=2.5, label='Pareto Frontier', zorder=5)
        ax.scatter(frontier_x, frontier_y,
                   s=300, facecolors='none', edgecolors='red', linewidths=2.5,
                   label='Optimal Points', zorder=6)

        print(f"✓ Identified {len(pareto_indices)} Pareto-optimal points")

    ax.set_xlabel(f'{latency_metric} (lower is better)', fontsize=12)
    ax.set_ylabel(f'{quality_metric} (higher is better)', fontsize=12)
    ax.set_title('Quality vs Latency Trade-off (Pareto Frontier)', fontsize=14, fontweight='bold')
    ax.grid(True, alpha=0.3)
    ax.legend(loc='best', fontsize=10)

    plt.tight_layout()
    plt.show()

    return pareto_df


# Create Pareto frontier visualization.
# Skipped when no experiments were loaded; pareto_df is only assigned when the
# plot runs, and is None if the quality metric or usable rows are missing.
if not experiments_df.empty:
    pareto_df = plot_pareto_frontier(experiments_df)

## Leaderboard

In [None]:
def create_leaderboard(experiments_df, primary_metric=PRIMARY_METRIC,
                      secondary_metrics=None, top_n=10):
    """
    Create ranked leaderboard of experiments.

    Ranks experiments by primary metric (descending, rows without a value are
    dropped), adds a 1-based 'rank' column, a readable 'techniques' column
    ('baseline' when no techniques were applied) and a '<metric>_pct' column
    (value * 100, rounded to 2 decimals) for every metric column.

    NOTE(review): '<metric>_pct' is produced for every selected metric, so it
    is only meaningful for metrics expressed as fractions - confirm that
    SECONDARY_METRICS does not contain e.g. latency values.

    Args:
        experiments_df: DataFrame with experiments
        primary_metric: Metric to sort by (descending)
        secondary_metrics: Additional metrics to display (default: SECONDARY_METRICS)
        top_n: Show top N experiments

    Returns:
        DataFrame with the top_n rows sorted by primary metric with rank
        column (empty when no experiment has a primary metric value), or None
        when the primary metric column does not exist.
    """
    if secondary_metrics is None:
        secondary_metrics = SECONDARY_METRICS

    # Check primary metric exists
    if primary_metric not in experiments_df.columns:
        print(f"✗ Primary metric '{primary_metric}' not found")
        return None

    # Select columns: experiment info + metrics
    info_cols = ['experiment_name', 'embedding_model_alias', 'techniques_applied',
                 'started_at']
    metric_cols = [primary_metric]

    # Add secondary metrics if they exist. Skipping names already selected
    # prevents a duplicated column when the primary metric is also listed as
    # secondary (a duplicated label would break dropna/sort below).
    for metric in secondary_metrics:
        if metric in experiments_df.columns and metric not in metric_cols:
            metric_cols.append(metric)

    # Sort by primary metric (descending - higher is better)
    leaderboard = (
        experiments_df[info_cols + metric_cols]
        .dropna(subset=[primary_metric])
        .sort_values(primary_metric, ascending=False)
        .reset_index(drop=True)
    )

    # Add rank column at beginning
    leaderboard.insert(0, 'rank', range(1, len(leaderboard) + 1))

    def _format_techniques(techniques):
        """Render a techniques cell as text; empty/missing means 'baseline'."""
        if isinstance(techniques, str):
            return techniques or 'baseline'
        if isinstance(techniques, (list, tuple)) and techniques:
            return ', '.join(str(t) for t in techniques)
        return 'baseline'

    # Format techniques as string for readability
    leaderboard['techniques'] = leaderboard['techniques_applied'].apply(_format_techniques)

    # Create percentage columns for all metrics
    for metric in metric_cols:
        leaderboard[f'{metric}_pct'] = (leaderboard[metric] * 100).round(2)

    if leaderboard.empty:
        # No row has a primary-metric value, so there is no top performer.
        print(f"⚠ No experiments have a value for '{primary_metric}'")
        return leaderboard.head(top_n)

    top = leaderboard.iloc[0]
    print(f"✓ Created leaderboard with {len(leaderboard)} experiments")
    print(f"  Top performer: {top['experiment_name']} ({top[primary_metric]:.4f})")

    return leaderboard.head(top_n)


# Build the leaderboard (top 10) and echo it beneath a heading.
if not experiments_df.empty:
    leaderboard_df = create_leaderboard(experiments_df, top_n=10)
    print("\nTop 10 Experiments:", leaderboard_df, sep="\n")

## Metric Correlation

In [None]:
def plot_metric_correlations(experiments_df, metric_names=None, figsize=(10, 8)):
    """
    Heatmap showing correlations between different metrics.

    Correlations are computed over the experiments that have a value for
    every selected metric (rows with any missing metric are dropped). Pairs
    with |r| > 0.8 are additionally printed as "strong correlations".

    Args:
        experiments_df: DataFrame with experiments
        metric_names: List of metrics to correlate (default: uses primary + secondary)
        figsize: Figure size tuple

    Returns:
        Correlation matrix DataFrame, or None when fewer than two metrics are
        available or fewer than two complete rows remain.
    """
    from itertools import combinations

    if metric_names is None:
        metric_names = [PRIMARY_METRIC, *SECONDARY_METRICS]

    # Filter to available metrics. dict.fromkeys de-duplicates while keeping
    # order, so a metric listed twice does not yield a duplicated column.
    available_metrics = [m for m in dict.fromkeys(metric_names)
                         if m in experiments_df.columns]

    if not available_metrics:
        print(f"✗ No metrics found from {metric_names}")
        return None

    if len(available_metrics) < 2:
        print(f"⚠ Need at least 2 metrics for correlation analysis")
        return None

    # Keep only experiments with a value for every metric
    df_metrics = experiments_df[available_metrics].dropna()

    # A correlation needs at least two observations. With fewer, corr() yields
    # an all-NaN (but non-empty) matrix, so the row count is checked directly.
    if len(df_metrics) < 2:
        print("⚠ No valid data for correlation analysis")
        return None

    corr_matrix = df_metrics.corr()

    # Create heatmap
    fig, ax = plt.subplots(figsize=figsize)

    sns.heatmap(
        corr_matrix,
        annot=True,
        fmt='.2f',
        cmap='coolwarm',
        center=0,
        square=True,
        linewidths=1,
        cbar_kws={'label': 'Correlation Coefficient'},
        ax=ax,
        vmin=-1,
        vmax=1
    )

    ax.set_title('Metric Correlation Matrix', fontsize=14, fontweight='bold', pad=20)

    plt.tight_layout()
    plt.show()

    print(f"✓ Correlation matrix created for {len(available_metrics)} metrics")

    # Identify strong correlations (each unordered pair once; NaN never passes
    # the threshold comparison)
    strong_corr = [
        (metric1, metric2, corr_matrix.loc[metric1, metric2])
        for metric1, metric2 in combinations(corr_matrix.columns, 2)
        if abs(corr_matrix.loc[metric1, metric2]) > 0.8
    ]

    if strong_corr:
        print("\nStrong correlations (|r| > 0.8):")
        for m1, m2, val in strong_corr:
            print(f"  {m1} <-> {m2}: {val:.3f}")

    return corr_matrix


# Create correlation heatmap.
# Skipped when no experiments were loaded; corr_matrix is None when fewer than
# two of the configured metrics are present in the data.
if not experiments_df.empty:
    corr_matrix = plot_metric_correlations(experiments_df)

## Export Reports

In [None]:
def _dashboard_metric_columns(experiments_df):
    """Return the columns of experiments_df that are not experiment metadata."""
    metadata_cols = {'id', 'experiment_name', 'embedding_model_alias', 'techniques_applied',
                     'started_at', 'completed_at', 'status', 'config_json'}
    return [col for col in experiments_df.columns if col not in metadata_cols]


def _json_safe_records(df):
    """Return df as a list of dicts with NaN/NaT replaced by None (JSON null)."""
    def _clean(value):
        if value is None or value is pd.NaT:
            return None
        # NaN is the only float that differs from itself.
        if isinstance(value, float) and value != value:
            return None
        return value

    return [
        {key: _clean(value) for key, value in record.items()}
        for record in df.to_dict(orient='records')
    ]


def _build_dashboard_html(experiments_df, leaderboard_df, generated_at):
    """Render the HTML dashboard page (summary, leaderboard, experiments) as a string."""
    import html

    # Plain (non-f) string so the CSS braces need no escaping.
    css = """
        body {
            font-family: Arial, sans-serif;
            margin: 20px;
            background-color: #f5f5f5;
        }
        h1 {
            color: #333;
            border-bottom: 3px solid #007bff;
            padding-bottom: 10px;
        }
        h2 {
            color: #555;
            margin-top: 30px;
        }
        table {
            border-collapse: collapse;
            width: 100%;
            margin: 20px 0;
            background-color: white;
            box-shadow: 0 2px 4px rgba(0,0,0,0.1);
        }
        th {
            background-color: #007bff;
            color: white;
            padding: 12px;
            text-align: left;
        }
        td {
            padding: 10px;
            border-bottom: 1px solid #ddd;
        }
        tr:hover {
            background-color: #f9f9f9;
        }
        .metric {
            font-weight: bold;
            color: #007bff;
        }
        .summary {
            background-color: #e7f3ff;
            padding: 15px;
            border-left: 4px solid #007bff;
            margin: 20px 0;
        }
"""

    # Column names are interpolated into markup, so escape them; the tables
    # produced by DataFrame.to_html are escaped by pandas itself.
    metrics_tracked = ', '.join(
        html.escape(str(col)) for col in _dashboard_metric_columns(experiments_df)
    )

    if 'started_at' in experiments_df.columns:
        started = experiments_df['started_at']
        date_range = f"{started.min()} to {started.max()}"
    else:
        date_range = 'n/a'

    parts = [f"""
<!DOCTYPE html>
<html>
<head>
    <title>RAG Experiment Dashboard</title>
    <meta charset="UTF-8">
    <style>{css}    </style>
</head>
<body>
    <h1>RAG Experiment Dashboard</h1>
    <p>Generated: {generated_at.strftime('%Y-%m-%d %H:%M:%S')}</p>
    
    <div class="summary">
        <h3>Dashboard Summary</h3>
        <p><strong>Total Experiments:</strong> {len(experiments_df)}</p>
        <p><strong>Metrics Tracked:</strong> {metrics_tracked}</p>
        <p><strong>Date Range:</strong> {html.escape(date_range)}</p>
    </div>
"""]

    # Add leaderboard if provided; the heading reports the real row count.
    if leaderboard_df is not None and not leaderboard_df.empty:
        parts.append(f"""
    <h2>Leaderboard (Top {len(leaderboard_df)})</h2>
    {leaderboard_df.to_html(index=False)}
""")

    # Add all experiments table
    parts.append(f"""
    <h2>All Experiments ({len(experiments_df)} total)</h2>
    {experiments_df.to_html(index=False, max_rows=100)}
""")

    parts.append("""
    <footer style="margin-top: 40px; padding-top: 20px; border-top: 1px solid #ddd; color: #666; font-size: 12px;">
        <p>RAG Wiki Demo - Evaluation Dashboard</p>
        <p>For detailed analysis, see evaluation-lab notebooks</p>
    </footer>
</body>
</html>
""")

    return ''.join(parts)


def export_dashboard_report(experiments_df, leaderboard_df=None,
                           export_format='html', output_dir='data/dashboards'):
    """
    Export comprehensive report in specified format.

    Writes a timestamped file into output_dir (created if missing):
    - 'html': styled page with a summary, the leaderboard (if given) and the
      experiments table (at most 100 rows rendered)
    - 'csv': the experiments table; a non-empty leaderboard is written to a
      separate leaderboard_<timestamp>.csv
    - 'json': metadata plus experiment (and leaderboard) records; missing
      values are written as null so the file is valid JSON

    Text files are written as UTF-8, matching the charset declared in the HTML.

    Args:
        experiments_df: All experiments with metrics
        leaderboard_df: Ranked leaderboard (optional)
        export_format: 'html', 'csv', or 'json'
        output_dir: Directory to save reports

    Returns:
        str: Path to exported file, or None for an unknown export_format
    """
    import json
    import os
    from datetime import datetime

    # Create output directory
    os.makedirs(output_dir, exist_ok=True)

    # One clock reading for both the file name and the report contents.
    generated_at = datetime.now()
    timestamp = generated_at.strftime('%Y%m%d_%H%M%S')
    has_leaderboard = leaderboard_df is not None and not leaderboard_df.empty

    if export_format == 'html':
        filename = os.path.join(output_dir, f'dashboard_{timestamp}.html')

        html_content = _build_dashboard_html(experiments_df, leaderboard_df, generated_at)
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(html_content)

        print(f"✓ Exported HTML dashboard to {filename}")

    elif export_format == 'csv':
        filename = os.path.join(output_dir, f'dashboard_{timestamp}.csv')

        # Export experiments
        experiments_df.to_csv(filename, index=False)

        # Also export leaderboard if provided
        if has_leaderboard:
            leaderboard_filename = os.path.join(output_dir, f'leaderboard_{timestamp}.csv')
            leaderboard_df.to_csv(leaderboard_filename, index=False)
            print(f"✓ Exported leaderboard to {leaderboard_filename}")

        print(f"✓ Exported CSV dashboard to {filename}")

    elif export_format == 'json':
        filename = os.path.join(output_dir, f'dashboard_{timestamp}.json')

        # Convert to records format
        export_data = {
            'metadata': {
                'exported_at': generated_at.isoformat(),
                'total_experiments': len(experiments_df),
                # Same value as total_experiments; both keys are kept for
                # compatibility with existing consumers.
                'experiment_count': len(experiments_df),
            },
            'experiments': _json_safe_records(experiments_df)
        }

        if has_leaderboard:
            export_data['leaderboard'] = _json_safe_records(leaderboard_df)

        with open(filename, 'w', encoding='utf-8') as f:
            # default=str covers timestamps and other non-JSON types
            json.dump(export_data, f, indent=2, default=str)

        print(f"✓ Exported JSON dashboard to {filename}")

    else:
        print(f"✗ Unknown export format: {export_format}")
        return None

    return filename


# Export reports in all formats
if not experiments_df.empty:
    print("Exporting dashboard reports...\n")

    # Ensure leaderboard is available for export (this runs at module level,
    # where locals() is the notebook's global namespace)
    if 'leaderboard_df' not in locals():
        leaderboard_df = create_leaderboard(experiments_df)

    # The parameter is named `export_format`; passing `format=` raises
    # TypeError (unexpected keyword argument).
    html_file = export_dashboard_report(experiments_df, leaderboard_df, export_format='html')
    csv_file = export_dashboard_report(experiments_df, leaderboard_df, export_format='csv')
    json_file = export_dashboard_report(experiments_df, leaderboard_df, export_format='json')

## Key Insights

In [None]:
def generate_insights(experiments_df, leaderboard_df=None):
    """
    Analyze experiments and generate actionable insights.

    Sections of the report (each is skipped when its inputs are missing):
    1. Best configuration overall (highest PRIMARY_METRIC)
    2. Techniques ranked by the average PRIMARY_METRIC of the experiments
       that used them (techniques with no scored experiment are omitted)
    3. Pareto frontier for PRIMARY_METRIC vs avg_latency_ms
    4. Diminishing returns between the top leaderboard ranks
    5. Comparison of the best score against baseline experiments (empty
       technique list, or a technique named 'baseline')
    6. Production deployment recommendation (top leaderboard row)

    Args:
        experiments_df: DataFrame with all experiments
        leaderboard_df: Optional ranked leaderboard; built with
            create_leaderboard when omitted

    Returns:
        str: Formatted insights report
    """
    insights = []

    if experiments_df.empty:
        return "No experiments to analyze"

    def _techniques_of(value):
        """Return the techniques cell as a list; anything else counts as none."""
        return list(value) if isinstance(value, (list, tuple)) else []

    has_primary = PRIMARY_METRIC in experiments_df.columns

    # Create leaderboard if not provided
    if leaderboard_df is None:
        leaderboard_df = create_leaderboard(experiments_df, top_n=len(experiments_df))

    # create_leaderboard returns None when the primary metric is missing, and
    # a leaderboard may legitimately be empty; every ranking-based section
    # below is gated on this flag instead of indexing blindly.
    has_ranking = (
        leaderboard_df is not None
        and len(leaderboard_df) > 0
        and PRIMARY_METRIC in leaderboard_df.columns
    )

    # 1. Best configuration (only rows that actually have a score)
    scored = experiments_df.dropna(subset=[PRIMARY_METRIC]) if has_primary else None
    if scored is not None and not scored.empty:
        best = scored.loc[scored[PRIMARY_METRIC].idxmax()]
        insights.append(f"BEST CONFIGURATION:")
        insights.append(f"  {best['experiment_name']}")
        insights.append(f"  {PRIMARY_METRIC}: {best[PRIMARY_METRIC]:.4f}")
        best_techniques = _techniques_of(best['techniques_applied'])
        if best_techniques:
            insights.append(f"  Techniques: {', '.join(best_techniques)}")

    # 2. Most valuable technique
    insights.append(f"\nMOST VALUABLE TECHNIQUES:")
    technique_impact = {}

    for _, row in experiments_df.iterrows():
        score = row[PRIMARY_METRIC] if has_primary else None
        for technique in _techniques_of(row['techniques_applied']):
            scores = technique_impact.setdefault(technique, [])
            if score is not None and pd.notna(score):
                scores.append(score)

    # Average per technique; techniques without any scored experiment are
    # left out (averaging an empty list would divide by zero).
    avg_impact = {
        technique: sum(scores) / len(scores)
        for technique, scores in technique_impact.items()
        if scores
    }
    for tech, avg_score in sorted(avg_impact.items(), key=lambda item: item[1], reverse=True):
        count = len(technique_impact[tech])
        insights.append(f"  {tech}: avg {PRIMARY_METRIC} = {avg_score:.4f} ({count} experiments)")

    # 3. Pareto frontier analysis
    if has_primary and 'avg_latency_ms' in experiments_df.columns:
        insights.append(f"\nPARETO FRONTIER ANALYSIS:")
        candidates = experiments_df.dropna(subset=[PRIMARY_METRIC, 'avg_latency_ms'])
        pareto_indices = find_pareto_frontier(candidates, PRIMARY_METRIC, 'avg_latency_ms')
        insights.append(f"  {len(pareto_indices)} Pareto-optimal configurations identified")

        if pareto_indices:
            # Sorted by latency: first row is the fastest, last row is the
            # slowest frontier point.
            pareto_subset = candidates.loc[pareto_indices].sort_values('avg_latency_ms')
            fastest, slowest = pareto_subset.iloc[0], pareto_subset.iloc[-1]
            insights.append(f"  Fastest optimal: {fastest['experiment_name']} ({fastest['avg_latency_ms']:.1f}ms)")
            insights.append(f"  Best quality optimal: {slowest['experiment_name']} ({slowest[PRIMARY_METRIC]:.4f})")

    # 4. Diminishing returns analysis (needs at least three ranked rows)
    if has_ranking and len(leaderboard_df) >= 3:
        ranked_scores = leaderboard_df[PRIMARY_METRIC].tolist()
        improvements = [upper - lower for upper, lower in zip(ranked_scores, ranked_scores[1:])]

        insights.append(f"\nDIMINISHING RETURNS:")
        insights.append(f"  Rank 1->2 improvement: {improvements[0]:.4f}")
        insights.append(f"  Rank 2->3 improvement: {improvements[1]:.4f}")

        if improvements[0] > 0 and improvements[1] < improvements[0] * 0.5:
            insights.append(f"  WARNING: Diminishing returns after rank 2 (>50% drop)")

    # 5. Baseline comparison
    is_baseline = experiments_df['techniques_applied'].apply(
        lambda x: isinstance(x, list) and (len(x) == 0 or 'baseline' in x)
    )
    baseline_exps = experiments_df[is_baseline]

    if has_ranking and has_primary and not baseline_exps.empty:
        baseline_score = baseline_exps[PRIMARY_METRIC].mean()
        if pd.notna(baseline_score):
            best_score = leaderboard_df.iloc[0][PRIMARY_METRIC]
            improvement_pct = ((best_score - baseline_score) / baseline_score * 100) if baseline_score > 0 else 0

            insights.append(f"\nBASELINE COMPARISON:")
            insights.append(f"  Baseline avg {PRIMARY_METRIC}: {baseline_score:.4f}")
            insights.append(f"  Best {PRIMARY_METRIC}: {best_score:.4f}")
            # '+' format flag: explicit sign for gains, no "+-" for regressions
            insights.append(f"  Improvement: {improvement_pct:+.1f}%")

    # 6. Production recommendation
    insights.append(f"\nPRODUCTION RECOMMENDATION:")
    if has_ranking:
        best = leaderboard_df.iloc[0]
        insights.append(f"  Use: {best['experiment_name']}")
        insights.append(f"  Rationale: Best {PRIMARY_METRIC} performance ({best[PRIMARY_METRIC]:.4f})")

        # Consider latency if available
        if 'avg_latency_ms' in best.index and pd.notna(best['avg_latency_ms']):
            insights.append(f"  Latency: {best['avg_latency_ms']:.1f}ms")

        if 'techniques_applied' in best.index:
            recommended = _techniques_of(best['techniques_applied'])
            if recommended:
                insights.append(f"  Apply: {', '.join(recommended)}")

    insights.append(f"\n" + "="*60)

    return "\n".join(insights)


# Print the insights report under a banner. An existing leaderboard is reused
# when an earlier cell created one; otherwise generate_insights builds its own.
if not experiments_df.empty:
    banner_rule = "=" * 60
    print("\n" + banner_rule, "KEY INSIGHTS FROM EXPERIMENTS", banner_rule, sep="\n")
    insights_report = generate_insights(
        experiments_df,
        leaderboard_df if 'leaderboard_df' in locals() else None,
    )
    print(insights_report)

"""
SUMMARY: Complete Experiment Dashboard Implementation

This notebook provides comprehensive visualization and analysis of all RAG experiments:

IMPLEMENTED FEATURES:
---------------------

1. LOAD EXPERIMENTS (Part 1)
   - Queries experiments table with filters (status, embedding model, techniques)
   - Joins with evaluation_results for all metrics
   - Pivots metrics from rows to columns for analysis
   - Handles timestamps and array data types

2. TIMELINE VISUALIZATION (Part 2)
   - X-axis: experiment sequence or timestamp
   - Y-axis: multiple metrics plotted as lines
   - Highlights best performance for each metric
   - Annotates experiment names on timeline
   - Shows trends across experiments

3. PARETO FRONTIER (Part 3)
   - Identifies Pareto-optimal configurations (non-dominated solutions)
   - Quality vs Latency trade-off visualization
   - Points colored by number of techniques applied
   - Highlights optimal frontier with red dashed line
   - Helps select configurations balancing speed and quality

4. LEADERBOARD (Part 4)
   - Ranks experiments by PRIMARY_METRIC (precision_at_5)
   - Shows top 10 with secondary metrics
   - Formats metrics as percentages for readability
   - Includes experiment name, techniques applied, timestamp
   - Easy comparison of best configurations

5. CORRELATION ANALYSIS (Part 5)
   - Heatmap of metric correlations
   - Identifies metrics that move together
   - Detects trade-offs between metrics
   - Helps determine if single metric can proxy for others

6. EXPORT REPORTS (Part 6)
   - HTML format: styled dashboard with summary tables
   - CSV format: tabular data for spreadsheet analysis
   - JSON format: structured data for programmatic use
   - Includes timestamp for reproducibility
   - Leaderboard exported separately when available

7. INSIGHTS GENERATION (Part 7)
   - Identifies best configuration overall
   - Ranks techniques by average impact
   - Analyzes Pareto frontier for quality vs latency
   - Detects diminishing returns
   - Compares to baseline
   - Production recommendation with rationale

USAGE:
------
1. Configure PRIMARY_METRIC and SECONDARY_METRICS
2. Set FILTER_STATUS and LIMIT_EXPERIMENTS for data loading
3. Run all cells to load data and generate dashboard
4. Visualizations show automatically
5. Leaderboard and insights printed to console
6. Reports exported to data/dashboards/ directory

OUTPUTS:
--------
- Plots: Timeline, Pareto frontier, correlation heatmap
- Tables: Leaderboard, full experiment list
- Files: HTML, CSV, JSON reports in data/dashboards/
- Text: Structured insights and recommendations
"""

print("Experiment Dashboard Implementation Complete")
print("=" * 60)
print("\nVALIDATION CHECKLIST:")
print("- [x] Loads all experiments from database")
print("- [x] Timeline visualization shows trends")
print("- [x] Pareto frontier identifies optimal trade-offs")
print("- [x] Leaderboard ranks by primary metric")
print("- [x] Correlation heatmap shows metric relationships")
print("- [x] Reports exported in multiple formats")
print("- [x] Insights generated automatically")
print("\n" + "=" * 60)