In [1]:
"""
Compute confidence intervals for evaluation metrics using stratified bootstrapping.
"""

import numpy as np
import pandas as pd
from sklearn.metrics import average_precision_score, precision_recall_curve
from typing import Tuple, Dict


def precision_at_recall(y_true: np.ndarray, y_score: np.ndarray, target_recall: float = 0.9) -> float:
    """
    Best precision achievable while keeping recall at or above a target.

    Parameters
    ----------
    y_true : array-like
        Ground truth binary labels.
    y_score : array-like
        Predicted scores; higher means more likely positive.
    target_recall : float
        Minimum recall an operating point must reach to be considered
        (default 0.9).

    Returns
    -------
    float
        Highest precision among the precision-recall curve points whose
        recall is >= ``target_recall``, or 0.0 when no point qualifies.
    """
    precisions, recalls, _thresholds = precision_recall_curve(y_true, y_score)

    # Mark the operating points whose recall meets the target.
    meets_target = recalls >= target_recall

    # No qualifying operating point: report zero precision.
    if not meets_target.any():
        return 0.0

    # Several thresholds can satisfy the recall constraint; take the best one.
    return precisions[meets_target].max()


def stratified_bootstrap_sample(df: pd.DataFrame, random_state: np.random.RandomState) -> pd.DataFrame:
    """
    Draw one bootstrap replicate that keeps the per-class row counts fixed.

    Rows with ``GT == 0`` and rows with ``GT == 1`` are each resampled with
    replacement to their original size, then stacked (class 0 first).

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame with a 'GT' column; all other columns are carried along.
    random_state : np.random.RandomState
        Random state shared by both draws, so successive calls advance the
        same stream.

    Returns
    -------
    pd.DataFrame
        Resampled frame with a fresh 0..n-1 index and the same number of
        rows per class as ``df``.
    """
    resampled_parts = []

    # Class 0 is drawn before class 1 so the shared random stream is always
    # consumed in the same order.
    for label in (0, 1):
        members = df[df['GT'] == label]
        resampled_parts.append(
            members.sample(n=len(members), replace=True, random_state=random_state)
        )

    return pd.concat(resampled_parts, ignore_index=True)


def compute_metrics(y_true: np.ndarray, y_score: np.ndarray) -> Tuple[float, float]:
    """
    Evaluate both reported metrics on one set of labels and scores.

    Parameters
    ----------
    y_true : array-like
        Ground truth binary labels.
    y_score : array-like
        Predicted probability scores.

    Returns
    -------
    tuple
        ``(average_precision, precision_at_recall_0.9)``, in that order.
    """
    # The recall target is fixed at 0.9 to match the metric name used in the results.
    return (
        average_precision_score(y_true, y_score),
        precision_at_recall(y_true, y_score, target_recall=0.9),
    )


def compute_confidence_intervals(
    df: pd.DataFrame,
    n_bootstrap: int = 10000,
    confidence_level: float = 0.95,
    random_seed: int = 42
) -> Dict[str, Dict[str, float]]:
    """
    Compute confidence intervals for evaluation metrics using stratified bootstrapping.

    Each iteration resamples every class with replacement (class sizes stay
    fixed), recomputes the metrics on the replicate, and the interval is read
    off the empirical percentiles of the replicate values.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame with columns:
        - 'GT': Ground truth labels (0 or 1); at least one positive is required
        - 'score': Predicted probability scores
    n_bootstrap : int
        Number of bootstrap iterations (default 10000). Must be >= 1.
    confidence_level : float
        Confidence level for intervals as a fraction strictly between 0 and 1
        (default 0.95 for 95% CI).
    random_seed : int
        Random seed for reproducibility.

    Returns
    -------
    dict
        Dictionary containing mean, lower CI, and upper CI for each metric.
        'mean' is the mean of the bootstrap replicate values, not the metric
        on the original sample.
        {
            'average_precision': {'mean': float, 'lower_ci': float, 'upper_ci': float},
            'precision_at_recall_0.9': {'mean': float, 'lower_ci': float, 'upper_ci': float}
        }

    Raises
    ------
    ValueError
        If required columns are missing, 'GT' holds values other than 0/1,
        'GT' has no positive sample, ``n_bootstrap`` < 1, or
        ``confidence_level`` is not strictly between 0 and 1.
    """
    # Validate input
    if 'GT' not in df.columns or 'score' not in df.columns:
        raise ValueError("DataFrame must contain 'GT' and 'score' columns")

    if not set(df['GT'].unique()).issubset({0, 1}):
        raise ValueError("GT column must contain only 0 and 1 values")

    # Recall is undefined without positives, so both metrics would be
    # meaningless. This also rejects an empty frame.
    if not (df['GT'] == 1).any():
        raise ValueError("GT column must contain at least one positive (1) sample")

    # Zero iterations would otherwise yield NaN statistics without an error.
    if n_bootstrap < 1:
        raise ValueError("n_bootstrap must be a positive integer")

    # Values <= 0 would give a degenerate or inverted interval; values >= 1
    # (e.g. 95 passed as a percentage) would fail later with an opaque
    # percentile error. NaN is rejected too, since the comparison is False.
    if not 0 < confidence_level < 1:
        raise ValueError("confidence_level must be strictly between 0 and 1")

    # Initialize random state; one stream is shared across all iterations.
    rng = np.random.RandomState(random_seed)

    # Store bootstrap results
    ap_scores = np.zeros(n_bootstrap)
    p_at_r_scores = np.zeros(n_bootstrap)

    # Perform bootstrapping
    for i in range(n_bootstrap):
        # Create stratified bootstrap sample
        bootstrap_df = stratified_bootstrap_sample(df, rng)

        y_true = bootstrap_df['GT'].values
        y_score = bootstrap_df['score'].values

        # Compute metrics
        ap, p_at_r = compute_metrics(y_true, y_score)

        ap_scores[i] = ap
        p_at_r_scores[i] = p_at_r

    # Compute confidence intervals using percentile method: cut alpha/2 of the
    # replicate distribution from each tail.
    alpha = 1 - confidence_level
    lower_percentile = (alpha / 2) * 100
    upper_percentile = (1 - alpha / 2) * 100

    results = {
        'average_precision': {
            'mean': np.mean(ap_scores),
            'lower_ci': np.percentile(ap_scores, lower_percentile),
            'upper_ci': np.percentile(ap_scores, upper_percentile)
        },
        'precision_at_recall_0.9': {
            'mean': np.mean(p_at_r_scores),
            'lower_ci': np.percentile(p_at_r_scores, lower_percentile),
            'upper_ci': np.percentile(p_at_r_scores, upper_percentile)
        }
    }

    return results


def print_results(results: Dict[str, Dict[str, float]], confidence_level: float = 0.95) -> None:
    """
    Pretty print the confidence interval results.

    Parameters
    ----------
    results : dict
        Results dictionary from compute_confidence_intervals: maps each
        metric name to a dict with 'mean', 'lower_ci' and 'upper_ci'.
    confidence_level : float
        Confidence level the intervals were computed at, as a fraction
        (default 0.95). Only used for labelling; pass the same value that
        was given to compute_confidence_intervals so the printed percentage
        matches the intervals.
    """
    # ':g' drops trailing zeros and float noise: 0.95 -> "95%", 0.995 -> "99.5%".
    ci_label = f"{confidence_level * 100:g}%"

    print("=" * 60)
    print(f"Evaluation Metrics with {ci_label} Confidence Intervals")
    print("=" * 60)

    for metric_name, values in results.items():
        # Turn the dictionary key into a human-readable heading.
        print(f"\n{metric_name.replace('_', ' ').title()}:")
        print(f"  Mean:     {values['mean']:.4f}")
        print(f"  {ci_label} CI:   [{values['lower_ci']:.4f}, {values['upper_ci']:.4f}]")

    print("\n" + "=" * 60)





In [2]:

# Build a synthetic dataset to demonstrate the bootstrap CI workflow.
# The global NumPy seed makes the generated data reproducible.
np.random.seed(42)
n_samples = 1000

# Binary ground truth with roughly 30% positives.
gt = np.random.binomial(n=1, p=0.3, size=n_samples)

# Scores loosely track the label: positives are drawn from [0.4, 1.0) and
# negatives from [0.0, 0.6), so the classes overlap on [0.4, 0.6).
# The positive draw comes first; swapping the two draws would change the data.
scores = np.clip(
    gt * np.random.uniform(low=0.4, high=1.0, size=n_samples)
    + (1 - gt) * np.random.uniform(low=0.0, high=0.6, size=n_samples),
    0,
    1,
)

# Assemble the frame with the 'GT' / 'score' columns the CI routine requires.
df = pd.DataFrame({'GT': gt, 'score': scores})

print(f"Sample data shape: {df.shape}")
print(f"Class distribution: {df['GT'].value_counts().to_dict()}")
print()

# Run the stratified bootstrap, then report the intervals.
results = compute_confidence_intervals(df, n_bootstrap=10000, random_seed=42)
print_results(results)

Sample data shape: (1000, 2)
Class distribution: {0: 712, 1: 288}

Evaluation Metrics with 95% Confidence Intervals

Average Precision:
  Mean:     0.9107
  95% CI:   [0.8882, 0.9309]

Precision At Recall 0.9:
  Mean:     0.6302
  95% CI:   [0.5874, 0.6779]

