# RSNA 2025 Intracranial Aneurysm Detection - Result Analysis

This notebook performs comprehensive cross-fold analysis of misclassifications from the 5-fold CV training.

## Analysis Framework
- **Out-of-Fold (OOF) Predictions**: Uses 4-fold ensemble for each sample (no data leakage)
- **Per-Class Analysis**: Detailed misclassification analysis for each of the 14 classes
- **Hard Sample Identification**: Identifies different types of challenging cases
- **Fold Agreement Analysis**: Analyzes consistency between fold predictions

## Key Features
- Comprehensive misclassification analysis
- Per-class error breakdown
- Hard sample case studies
- Interactive visualizations
- Actionable insights for model improvement


In [None]:
# Import required libraries
import os
import sys
import json
import pickle
import warnings
from pathlib import Path
from typing import Dict, List, Tuple, Optional, Any

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import torch
import torch.nn as nn
from sklearn.metrics import (
    confusion_matrix, classification_report, roc_auc_score,
    roc_curve, precision_recall_curve, average_precision_score
)
from tqdm import tqdm
import cv2

# Add project root to path
sys.path.append('..')
from train import HybridAneurysmModel, Config
from utils import LABEL_COLS, ID_COL, load_cached_volume, take_window, valid_coords

warnings.filterwarnings('ignore')
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")


## Configuration and Setup


In [None]:
# Configuration
import yaml  # only needed in this cell, to read the saved experiment config

EXPERIMENT_DIR = "../models/2025-09-11-20-34-47"
NUM_FOLDS = 5
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the configuration that was saved next to the checkpoints.
# Read it as UTF-8 explicitly instead of relying on the platform default encoding.
with open(f"{EXPERIMENT_DIR}/used_config.yaml", 'r', encoding='utf-8') as f:
    config_dict = yaml.safe_load(f)

# Copy the settings the analysis needs onto a Config object
config = Config()
config.architecture = config_dict['model']['architecture']
config.img_size = config_dict['data']['img_size']
config.window_offsets = config_dict['data']['window_offsets']
config.roi_box_fraction = config_dict['data']['roi_box_fraction']
config.cache_dir = config_dict['paths']['cache_dir']
config.device = DEVICE

print(f"Experiment: {EXPERIMENT_DIR}")
print(f"Architecture: {config.architecture}")
print(f"Image size: {config.img_size}")
print(f"Device: {DEVICE}")


## Cross-Fold Analysis Framework


In [None]:
class CrossFoldAnalyzer:
    """Collect per-fold model predictions and keep them for error analysis.

    The analyzer loads the per-fold checkpoints of one experiment, scores cached
    volumes with every loaded model whose index differs from the sample's own
    fold, and stores the individual predictions so they can be ensembled and
    compared.

    NOTE(review): ``collect_oof_predictions`` treats the models of the *other*
    folds as out-of-fold for a sample. Under the usual K-fold convention (fold
    ``k`` is the validation split of model ``k``) those models were trained on
    the sample, which would be leakage. Confirm the convention in ``train.py``.
    """

    def __init__(self, experiment_dir: str, config: "Config"):
        """Store the experiment location and configuration; nothing is loaded yet.

        Args:
            experiment_dir: Directory that holds the per-fold checkpoints.
            config: Project configuration; ``device``, ``cache_dir``,
                ``img_size`` and ``window_offsets`` are read by this class.
        """
        self.experiment_dir = experiment_dir
        self.config = config
        self.device = config.device
        self.num_classes = len(LABEL_COLS)

        # Data storage
        self.fold_assignments = {}
        self.oof_predictions = {}  # {sample_id: [pred1, pred2, pred3, pred4]}
        self.true_labels = {}      # {sample_id: true_label_vector}
        self.fold_models = {}

        # Analysis results
        self.results = {}

    def load_fold_assignments(self, train_csv_path: str):
        """Load the fold index of every sample and, when present, its labels.

        Args:
            train_csv_path: CSV with an ``ID_COL`` column and a ``fold`` column.
                If it also contains every column of ``LABEL_COLS``, the label
                vectors are cached in ``self.true_labels``.
        """
        df = pd.read_csv(train_csv_path)
        self.fold_assignments = dict(zip(df[ID_COL], df['fold']))
        print(f"Loaded fold assignments for {len(self.fold_assignments)} samples")

        # Fill the label cache from the same table so that consumers of
        # ``self.true_labels`` (indexed by class position) have data to work on.
        label_cols = list(LABEL_COLS)
        if all(col in df.columns for col in label_cols):
            self.true_labels = dict(zip(df[ID_COL], df[label_cols].to_numpy()))

    def load_fold_models(self):
        """Load every available fold checkpoint into ``self.fold_models``.

        Missing checkpoints are reported and skipped, so the dict may hold
        fewer than ``NUM_FOLDS`` entries.
        """
        for fold in range(NUM_FOLDS):
            # NOTE(review): the file name hard-codes ``tf_efficientnet_b0`` and
            # ignores ``config.architecture`` - confirm this matches training.
            model_path = f"{self.experiment_dir}/tf_efficientnet_b0_fold{fold}_best.pth"
            if not os.path.exists(model_path):
                print(f"Warning: Model not found for fold {fold}")
                continue

            model = HybridAneurysmModel(self.config)
            state_dict = torch.load(model_path, map_location=self.device)
            model.load_state_dict(state_dict)
            model.to(self.device)
            model.eval()
            self.fold_models[fold] = model
            print(f"Loaded fold {fold} model")

    def predict_sample(self, model, sample_id: str) -> np.ndarray:
        """Score one cached volume with ``model`` and max-pool over its windows.

        One window is taken around every slice, resized to
        ``config.img_size`` and fed to the model in batches of 16. The ROI
        stream receives a copy of the full image and the coordinates are all
        zeros because no localisation is available here.

        Args:
            model: Callable taking ``(x_full, x_roi, coords)`` and returning logits.
            sample_id: Name of the cached ``.npz`` volume inside ``config.cache_dir``.

        Returns:
            Per-class sigmoid probabilities, the element-wise maximum over all
            windows. An all-zero vector of length ``self.num_classes`` is
            returned when the volume has no slices or when anything fails; the
            failure is printed rather than raised (best-effort behaviour).
        """
        try:
            volume_path = f"{self.config.cache_dir}/{sample_id}.npz"
            volume = load_cached_volume(volume_path)  # assumed (N, H, W)

            num_slices = volume.shape[0]
            target_size = (self.config.img_size, self.config.img_size)
            batch_size = 16
            all_predictions = []

            for start in range(0, num_slices, batch_size):
                stop = min(start + batch_size, num_slices)
                frames = []
                for center_idx in range(start, stop):
                    window = take_window(volume, center_idx, self.config.window_offsets)

                    # cv2 works on HWC arrays, the model on CHW tensors
                    img_hwc = np.transpose(window, (1, 2, 0)).astype(np.float32)
                    img_resized = cv2.resize(img_hwc, target_size)
                    frames.append(
                        torch.from_numpy(np.transpose(img_resized, (2, 0, 1))).float()
                    )

                x_full_batch = torch.stack(frames).to(self.device)
                x_roi_batch = x_full_batch.clone()  # same image for both streams
                coords_batch = torch.zeros(len(frames), 2, device=self.device)

                with torch.no_grad():
                    logits = model(x_full_batch, x_roi_batch, coords_batch)
                    all_predictions.append(torch.sigmoid(logits).cpu().numpy())

            if not all_predictions:
                return np.zeros(self.num_classes, dtype=np.float32)

            # Aggregate across windows (max aggregation)
            return np.vstack(all_predictions).max(axis=0)

        except Exception as e:
            print(f"Error predicting {sample_id}: {e}")
            return np.zeros(self.num_classes, dtype=np.float32)

    def collect_oof_predictions(self, sample_ids: List[str]):
        """Score each sample with every loaded model of a different fold.

        Samples without a fold assignment are skipped. Results are stored in
        ``self.oof_predictions`` as ``{sample_id: [prediction, ...]}``; samples
        for which no model was available get no entry.

        Args:
            sample_ids: IDs of the samples to score.
        """
        print(f"Collecting OOF predictions for {len(sample_ids)} samples...")

        for sample_id in tqdm(sample_ids):
            sample_fold = self.fold_assignments.get(sample_id, -1)
            if sample_fold == -1:
                continue

            # NOTE(review): see the class docstring - whether these models are
            # truly out-of-fold depends on the training fold convention.
            oof_predictions = [
                self.predict_sample(self.fold_models[fold], sample_id)
                for fold in range(NUM_FOLDS)
                if fold != sample_fold and fold in self.fold_models
            ]

            if oof_predictions:
                self.oof_predictions[sample_id] = oof_predictions

    def create_oof_ensemble(self, sample_id: str, method: str = "mean") -> np.ndarray:
        """Combine the stored predictions of one sample into a single vector.

        Args:
            sample_id: Key into ``self.oof_predictions``.
            method: ``"mean"``, ``"max"`` or ``"median"``; any other value
                falls back to the mean.

        Returns:
            Per-class vector reduced across the stored predictions.

        Raises:
            KeyError: If no predictions are stored for ``sample_id``.
        """
        oof_preds = self.oof_predictions[sample_id]
        reducers = {"mean": np.mean, "max": np.max, "median": np.median}
        return reducers.get(method, np.mean)(oof_preds, axis=0)


## Analysis Methods


In [None]:
def analyze_per_class_misclassifications(self, true_labels_df: pd.DataFrame):
    """Break the ensemble's errors down per class at a 0.5 decision threshold.

    Only samples that have stored predictions and a row in ``true_labels_df``
    are considered.

    Args:
        true_labels_df: Table with an ``ID_COL`` column and one column per
            entry of ``LABEL_COLS``; a label equal to 1 marks a positive. When
            an ID occurs more than once, its first row is used.

    Returns:
        Dict keyed by class name. Each value contains ``class_name``,
        ``total_samples``, ``positive_samples``, the error records in
        ``false_negatives`` / ``false_positives`` (and their union, in
        encounter order, in ``misclassified_samples``), the ensemble scores
        split into ``confidence_distribution['correct' | 'incorrect']``, and
        the per-sample std of the fold scores in ``fold_agreement``.
    """
    # Build the label lookup and the ensemble vectors once instead of scanning
    # the DataFrame (and re-averaging) for every sample of every class.
    label_cols = list(LABEL_COLS)
    unique_rows = true_labels_df.drop_duplicates(subset=ID_COL)
    label_rows = dict(zip(unique_rows[ID_COL], unique_rows[label_cols].to_numpy()))
    sample_ids = [sid for sid in self.oof_predictions if sid in label_rows]
    ensembles = {sid: self.create_oof_ensemble(sid) for sid in sample_ids}

    results = {}
    for class_idx, class_name in enumerate(label_cols):
        false_negatives = []
        false_positives = []
        misclassified = []
        confidence = {'correct': [], 'incorrect': []}
        fold_spread = []
        positive_count = 0

        for sid in sample_ids:
            true_label = label_rows[sid][class_idx]
            score = ensembles[sid][class_idx]
            fold_scores = [pred[class_idx] for pred in self.oof_predictions[sid]]
            spread = np.std(fold_scores)

            is_positive = bool(true_label == 1)
            predicted_positive = bool(score >= 0.5)
            positive_count += int(is_positive)

            if is_positive == predicted_positive:
                confidence['correct'].append(score)
            else:
                record = {
                    'sample_id': sid,
                    'true_label': true_label,
                    'prediction': score,
                    'fold_predictions': fold_scores,
                    'fold_agreement': spread,
                }
                (false_negatives if is_positive else false_positives).append(record)
                misclassified.append(record)
                confidence['incorrect'].append(score)

            fold_spread.append(spread)

        results[class_name] = {
            'class_name': class_name,
            'total_samples': len(sample_ids),
            'positive_samples': positive_count,
            'misclassified_samples': misclassified,
            'false_positives': false_positives,
            'false_negatives': false_negatives,
            'confidence_distribution': confidence,
            'fold_agreement': fold_spread,
        }

    return results


# The notebook calls this as ``analyzer.analyze_per_class_misclassifications(...)``,
# so attach it to the analyzer class when that class exists in this session.
_analyzer_cls = globals().get("CrossFoldAnalyzer")
if _analyzer_cls is not None:
    _analyzer_cls.analyze_per_class_misclassifications = analyze_per_class_misclassifications

def identify_hard_samples(
    self,
    true_labels_df: pd.DataFrame,
    high_conf_threshold: float = 0.8,
    low_conf_threshold: float = 0.6,
    disagreement_threshold: float = 0.3,
):
    """Sort every (sample, class) pair into at most one hard-case category.

    Confidence is ``max(p, 1 - p)`` and therefore always lies in [0.5, 1].
    Categories are tested in this order and the first match wins:

    1. ``high_confidence_wrong``: wrong at threshold 0.5 and confidence above
       ``high_conf_threshold``.
    2. ``low_confidence_correct``: correct and confidence below
       ``low_conf_threshold``.
    3. ``fold_disagreement``: std of this class's fold scores above
       ``disagreement_threshold``.
    4. ``ambiguous_boundary``: ensemble score within [0.4, 0.6].

    ``rare_class_misclassified`` is kept in the result for compatibility but is
    not filled by this function.

    Args:
        true_labels_df: Table with ``ID_COL`` and the ``LABEL_COLS`` columns;
            the first row is used when an ID is duplicated.
        high_conf_threshold: Minimum confidence of a "confidently wrong" case.
        low_conf_threshold: Maximum confidence of an "uncertain but correct"
            case. Must exceed 0.5 to ever match (a value such as 0.3 is
            unreachable because confidence cannot drop below 0.5).
        disagreement_threshold: Minimum std across fold scores that counts as
            disagreement.

    Returns:
        Dict mapping each category name to a list of records with
        ``sample_id``, ``class_name``, ``true_label``, ``prediction``,
        ``confidence``, ``fold_scores`` and ``fold_agreement`` (std).
    """
    hard_samples = {
        'high_confidence_wrong': [],      # Model very confident but wrong
        'low_confidence_correct': [],     # Model uncertain but correct
        'fold_disagreement': [],          # Folds disagree strongly
        'ambiguous_boundary': [],         # Near decision boundary
        'rare_class_misclassified': []    # Reserved; not populated here
    }

    # One hash lookup per sample instead of a DataFrame scan per sample
    label_cols = list(LABEL_COLS)
    unique_rows = true_labels_df.drop_duplicates(subset=ID_COL)
    label_rows = dict(zip(unique_rows[ID_COL], unique_rows[label_cols].to_numpy()))

    for sample_id, oof_preds in self.oof_predictions.items():
        if sample_id not in label_rows:
            continue

        true_labels = label_rows[sample_id]
        ensemble_pred = self.create_oof_ensemble(sample_id)

        for class_idx, class_name in enumerate(label_cols):
            true_label = true_labels[class_idx]
            pred_score = ensemble_pred[class_idx]
            fold_scores = [pred[class_idx] for pred in oof_preds]
            # Disagreement is the spread of the folds for THIS class, not the
            # spread of one fold's scores across classes.
            fold_std = np.std(fold_scores)

            is_correct = (true_label == 1) == (pred_score >= 0.5)
            confidence = max(pred_score, 1 - pred_score)

            sample_info = {
                'sample_id': sample_id,
                'class_name': class_name,
                'true_label': true_label,
                'prediction': pred_score,
                'confidence': confidence,
                'fold_scores': fold_scores,
                'fold_agreement': fold_std
            }

            if not is_correct and confidence > high_conf_threshold:
                hard_samples['high_confidence_wrong'].append(sample_info)
            elif is_correct and confidence < low_conf_threshold:
                hard_samples['low_confidence_correct'].append(sample_info)
            elif fold_std > disagreement_threshold:
                hard_samples['fold_disagreement'].append(sample_info)
            elif 0.4 <= pred_score <= 0.6:
                hard_samples['ambiguous_boundary'].append(sample_info)

    return hard_samples


# The notebook calls this as ``analyzer.identify_hard_samples(...)``, so attach
# it to the analyzer class when that class exists in this session.
_analyzer_cls = globals().get("CrossFoldAnalyzer")
if _analyzer_cls is not None:
    _analyzer_cls.identify_hard_samples = identify_hard_samples

def analyze_fold_agreement(self, min_agreement: float = 0.7):
    """Measure how consistently the fold models score each sample and class.

    Agreement is ``1 - std`` of the fold scores. For scores in [0, 1] the std
    cannot exceed 0.5, so agreement lies in [0.5, 1]; a cut-off of 0.5 or lower
    can therefore never flag anything.

    Args:
        min_agreement: (sample, class) pairs whose agreement is below this
            value are listed as disagreement cases. The default of 0.7
            corresponds to a std above 0.3.

    Returns:
        Dict with ``overall_agreement`` (mean agreement over classes, one value
        per sample), ``per_class_agreement`` (class name -> agreement list),
        ``agreement_vs_confidence`` (one record per sample and class, where
        ``confidence`` is the mean fold score) and ``disagreement_cases``.
    """
    agreement_stats = {
        'overall_agreement': [],
        'per_class_agreement': {class_name: [] for class_name in LABEL_COLS},
        'agreement_vs_confidence': [],
        'disagreement_cases': []
    }

    for sample_id, oof_preds in self.oof_predictions.items():
        sample_agreements = []

        for class_idx, class_name in enumerate(LABEL_COLS):
            fold_scores = [pred[class_idx] for pred in oof_preds]
            agreement = 1 - np.std(fold_scores)  # Higher = more agreement
            confidence = np.mean(fold_scores)
            sample_agreements.append(agreement)

            agreement_stats['per_class_agreement'][class_name].append(agreement)
            agreement_stats['agreement_vs_confidence'].append({
                'agreement': agreement,
                'confidence': confidence,
                'sample_id': sample_id,
                'class_name': class_name
            })

            if agreement < min_agreement:
                agreement_stats['disagreement_cases'].append({
                    'sample_id': sample_id,
                    'class_name': class_name,
                    'fold_scores': fold_scores,
                    'agreement': agreement
                })

        if sample_agreements:
            agreement_stats['overall_agreement'].append(np.mean(sample_agreements))

    return agreement_stats


# The notebook calls this as ``analyzer.analyze_fold_agreement()``, so attach it
# to the analyzer class when that class exists in this session.
_analyzer_cls = globals().get("CrossFoldAnalyzer")
if _analyzer_cls is not None:
    _analyzer_cls.analyze_fold_agreement = analyze_fold_agreement


## Visualization Methods


In [None]:
def create_visualizations(self, per_class_analysis, hard_samples, fold_agreement):
    """Render every analysis figure, one after the other.

    Args:
        per_class_analysis: Passed to the per-class error summary, the
            confidence-distribution plot and the ROC plot.
        hard_samples: Passed to the hard-sample plot.
        fold_agreement: Passed to the fold-agreement plot.
    """
    # 1. Per-class error summary
    self._plot_per_class_error_summary(per_class_analysis)

    # 2. Confidence distribution plots
    self._plot_confidence_distributions(per_class_analysis)

    # 3. Fold agreement analysis
    self._plot_fold_agreement_analysis(fold_agreement)

    # 4. Hard sample analysis
    self._plot_hard_sample_analysis(hard_samples)

    # 5. ROC curves per class
    self._plot_roc_curves_per_class(per_class_analysis)


# The notebook calls this as ``analyzer.create_visualizations(...)``, so attach
# it to the analyzer class when that class exists in this session.
_analyzer_cls = globals().get("CrossFoldAnalyzer")
if _analyzer_cls is not None:
    _analyzer_cls.create_visualizations = create_visualizations

def _plot_per_class_error_summary(self, per_class_analysis):
    """Draw a 2x2 grid of per-class bar charts.

    Panels: total misclassifications, false negatives next to false positives,
    error rate (errors / total samples, 0 when a class has no samples) and the
    number of positive samples.

    Args:
        per_class_analysis: Dict keyed by class name whose values provide
            ``false_negatives``, ``false_positives``, ``total_samples`` and
            ``positive_samples``.
    """
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))

    class_names = list(per_class_analysis.keys())
    positions = np.arange(len(class_names))
    stats = [per_class_analysis[cls] for cls in class_names]

    false_negatives = [len(entry['false_negatives']) for entry in stats]
    false_positives = [len(entry['false_positives']) for entry in stats]
    total_errors = [fn + fp for fn, fp in zip(false_negatives, false_positives)]
    error_rates = [
        errors / entry['total_samples'] if entry['total_samples'] > 0 else 0
        for errors, entry in zip(total_errors, stats)
    ]
    positive_counts = [entry['positive_samples'] for entry in stats]

    # Plot 1: Total errors per class
    axes[0, 0].bar(positions, total_errors)
    axes[0, 0].set_title('Total Misclassifications per Class')
    axes[0, 0].set_ylabel('Count')

    # Plot 2: False Negatives vs False Positives
    width = 0.35
    axes[0, 1].bar(positions - width/2, false_negatives, width, label='False Negatives', alpha=0.8)
    axes[0, 1].bar(positions + width/2, false_positives, width, label='False Positives', alpha=0.8)
    axes[0, 1].set_title('False Negatives vs False Positives')
    axes[0, 1].set_ylabel('Count')
    axes[0, 1].legend()

    # Plot 3: Error rate per class
    axes[1, 0].bar(positions, error_rates)
    axes[1, 0].set_title('Error Rate per Class')
    axes[1, 0].set_ylabel('Error Rate')

    # Plot 4: Positive sample distribution
    axes[1, 1].bar(positions, positive_counts)
    axes[1, 1].set_title('Positive Samples per Class')
    axes[1, 1].set_ylabel('Count')

    # Label the bars with class names; bare indices are unreadable here
    for ax in axes.flat:
        ax.set_xticks(positions)
        ax.set_xticklabels(class_names, rotation=45, ha='right')

    plt.tight_layout()
    plt.show()


# ``create_visualizations`` calls this as a method, so attach it to the analyzer
# class when that class exists in this session.
_analyzer_cls = globals().get("CrossFoldAnalyzer")
if _analyzer_cls is not None:
    _analyzer_cls._plot_per_class_error_summary = _plot_per_class_error_summary

def _plot_confidence_distributions(self, per_class_analysis):
    """Plot score histograms of correct vs. incorrect predictions.

    Only the first four classes of ``per_class_analysis`` are shown, one per
    panel of a 2x2 grid.

    Args:
        per_class_analysis: Dict keyed by class name whose values provide
            ``confidence_distribution`` with ``correct`` and ``incorrect``
            score lists.
    """
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    axes = axes.flatten()

    # Select 4 classes to plot
    selected_classes = list(per_class_analysis.keys())[:4]

    for ax, class_name in zip(axes, selected_classes):
        distribution = per_class_analysis[class_name]['confidence_distribution']

        # A density histogram of an empty list is undefined, so skip empty groups
        plotted = False
        for label, key in (('Correct', 'correct'), ('Incorrect', 'incorrect')):
            scores = distribution[key]
            if len(scores) > 0:
                ax.hist(scores, bins=20, alpha=0.7, label=label, density=True)
                plotted = True

        ax.set_title(f'{class_name}\nConfidence Distribution')
        ax.set_xlabel('Prediction Confidence')
        ax.set_ylabel('Density')
        if plotted:
            ax.legend()

    plt.tight_layout()
    plt.show()


# ``create_visualizations`` calls this as a method, so attach it to the analyzer
# class when that class exists in this session.
_analyzer_cls = globals().get("CrossFoldAnalyzer")
if _analyzer_cls is not None:
    _analyzer_cls._plot_confidence_distributions = _plot_confidence_distributions

def _plot_fold_agreement_analysis(self, fold_agreement):
    """Plot the agreement histogram and agreement against mean fold score.

    Args:
        fold_agreement: Dict with ``per_class_agreement`` (class name ->
            list of agreement scores) and ``agreement_vs_confidence`` (records
            with ``agreement`` and ``confidence``).
    """
    fig, axes = plt.subplots(1, 2, figsize=(15, 6))

    # Plot 1: Agreement distribution
    all_agreements = [
        score
        for class_name in LABEL_COLS
        for score in fold_agreement['per_class_agreement'][class_name]
    ]

    axes[0].hist(all_agreements, bins=30, alpha=0.7, edgecolor='black')
    axes[0].set_title('Fold Agreement Distribution')
    axes[0].set_xlabel('Agreement Score (1 - std)')
    axes[0].set_ylabel('Frequency')
    if all_agreements:  # the mean of nothing is undefined
        mean_agreement = np.mean(all_agreements)
        axes[0].axvline(mean_agreement, color='red', linestyle='--',
                        label=f'Mean: {mean_agreement:.3f}')
        axes[0].legend()

    # Plot 2: Agreement vs Confidence
    agreement_data = fold_agreement['agreement_vs_confidence']
    agreements = [item['agreement'] for item in agreement_data]
    confidences = [item['confidence'] for item in agreement_data]

    axes[1].scatter(confidences, agreements, alpha=0.6, s=20)
    axes[1].set_title('Fold Agreement vs Prediction Confidence')
    axes[1].set_xlabel('Prediction Confidence')
    axes[1].set_ylabel('Fold Agreement')

    # A correlation coefficient needs at least two points
    if len(agreements) > 1:
        corr = np.corrcoef(confidences, agreements)[0, 1]
        axes[1].text(0.05, 0.95, f'Correlation: {corr:.3f}',
                     transform=axes[1].transAxes, verticalalignment='top',
                     bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.8))

    plt.tight_layout()
    plt.show()


# ``create_visualizations`` calls this as a method, so attach it to the analyzer
# class when that class exists in this session.
_analyzer_cls = globals().get("CrossFoldAnalyzer")
if _analyzer_cls is not None:
    _analyzer_cls._plot_fold_agreement_analysis = _plot_fold_agreement_analysis

def _plot_hard_sample_analysis(self, hard_samples):
    """Summarise the hard samples in a 2x2 grid.

    Panels: counts per hard-sample type, counts per class, confidence
    histogram per type, and the histogram of fold-score std over all hard
    samples. Panels without data are left empty.

    Args:
        hard_samples: Dict mapping a type name to a list of records with
            ``class_name``, ``confidence`` and ``fold_agreement``. The keys
            ``high_confidence_wrong``, ``low_confidence_correct``,
            ``fold_disagreement`` and ``ambiguous_boundary`` must be present.
    """
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))

    # Plot 1: Hard sample counts by type
    hard_sample_counts = {
        'High Confidence Wrong': len(hard_samples['high_confidence_wrong']),
        'Low Confidence Correct': len(hard_samples['low_confidence_correct']),
        'Fold Disagreement': len(hard_samples['fold_disagreement']),
        'Ambiguous Boundary': len(hard_samples['ambiguous_boundary'])
    }
    axes[0, 0].bar(list(hard_sample_counts.keys()), list(hard_sample_counts.values()))
    axes[0, 0].set_title('Hard Sample Counts by Type')
    axes[0, 0].set_ylabel('Count')
    axes[0, 0].tick_params(axis='x', rotation=45)

    # Plot 2: Hard samples by class (first-seen order)
    class_hard_counts = {}
    for samples in hard_samples.values():
        for sample in samples:
            name = sample['class_name']
            class_hard_counts[name] = class_hard_counts.get(name, 0) + 1

    if class_hard_counts:
        axes[0, 1].bar(list(class_hard_counts.keys()), list(class_hard_counts.values()))
        axes[0, 1].set_title('Hard Samples by Class')
        axes[0, 1].set_ylabel('Count')
        axes[0, 1].tick_params(axis='x', rotation=45)

    # Plot 3: Confidence distribution per type. Iterating the dict (not a set
    # of names) keeps the legend order stable between runs.
    any_confidence = False
    for hard_type, samples in hard_samples.items():
        type_confidences = [sample['confidence'] for sample in samples]
        if type_confidences:
            axes[1, 0].hist(type_confidences, alpha=0.6, label=hard_type, bins=15)
            any_confidence = True

    if any_confidence:
        axes[1, 0].set_title('Confidence Distribution by Hard Sample Type')
        axes[1, 0].set_xlabel('Confidence')
        axes[1, 0].set_ylabel('Frequency')
        axes[1, 0].legend()

    # Plot 4: Fold agreement for hard samples
    fold_agreements = [
        sample['fold_agreement']
        for samples in hard_samples.values()
        for sample in samples
    ]

    if fold_agreements:
        axes[1, 1].hist(fold_agreements, bins=20, alpha=0.7, edgecolor='black')
        axes[1, 1].set_title('Fold Agreement Distribution for Hard Samples')
        axes[1, 1].set_xlabel('Fold Agreement (std)')
        axes[1, 1].set_ylabel('Frequency')

    plt.tight_layout()
    plt.show()


# ``create_visualizations`` calls this as a method, so attach it to the analyzer
# class when that class exists in this session.
_analyzer_cls = globals().get("CrossFoldAnalyzer")
if _analyzer_cls is not None:
    _analyzer_cls._plot_hard_sample_analysis = _plot_hard_sample_analysis

def _plot_roc_curves_per_class(self, per_class_analysis):
    """Plot one ROC curve per class of ``LABEL_COLS``, five panels per row.

    Labels come from ``self.true_labels`` when it is populated. Otherwise they
    are reconstructed from ``per_class_analysis``: a score listed as correct
    has label ``score >= 0.5`` and a score listed as incorrect has the
    opposite label, which is exactly how those lists were split.

    Args:
        per_class_analysis: Dict keyed by class name whose values provide
            ``confidence_distribution`` with ``correct`` and ``incorrect``
            ensemble scores. Used only when ``self.true_labels`` is empty.
    """
    class_names = list(LABEL_COLS)
    n_cols = 5
    n_rows = max(1, -(-len(class_names) // n_cols))  # ceil division; 14 classes -> 3 rows
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 4 * n_rows))
    axes = np.asarray(axes).flatten()

    true_labels = getattr(self, 'true_labels', None) or {}
    ensembles = {}
    if true_labels:
        # One ensemble vector per sample, reused for every class
        ensembles = {
            sample_id: self.create_oof_ensemble(sample_id)
            for sample_id in self.oof_predictions
            if sample_id in true_labels
        }

    for i, class_name in enumerate(class_names):
        y_true = []
        y_pred = []

        if true_labels:
            for sample_id, ensemble_pred in ensembles.items():
                y_true.append(true_labels[sample_id][i])
                y_pred.append(ensemble_pred[i])
        else:
            distribution = per_class_analysis.get(class_name, {}).get('confidence_distribution', {})
            for score in distribution.get('correct', []):
                y_true.append(int(score >= 0.5))
                y_pred.append(score)
            for score in distribution.get('incorrect', []):
                y_true.append(int(score < 0.5))
                y_pred.append(score)

        if len(set(y_true)) > 1:  # Only plot if we have both classes
            fpr, tpr, _ = roc_curve(y_true, y_pred)
            auc = roc_auc_score(y_true, y_pred)

            axes[i].plot(fpr, tpr, linewidth=2, label=f'AUC = {auc:.3f}')
            axes[i].plot([0, 1], [0, 1], 'k--', alpha=0.5)
            axes[i].set_title(f'{class_name}\nAUC: {auc:.3f}')
            axes[i].set_xlabel('False Positive Rate')
            axes[i].set_ylabel('True Positive Rate')
            axes[i].legend()
        else:
            axes[i].text(0.5, 0.5, 'Insufficient data',
                         ha='center', va='center', transform=axes[i].transAxes)
            axes[i].set_title(class_name)

    # Hide unused subplots
    for ax in axes[len(class_names):]:
        ax.set_visible(False)

    plt.tight_layout()
    plt.show()


# ``create_visualizations`` calls this as a method, so attach it to the analyzer
# class when that class exists in this session.
_analyzer_cls = globals().get("CrossFoldAnalyzer")
if _analyzer_cls is not None:
    _analyzer_cls._plot_roc_curves_per_class = _plot_roc_curves_per_class


## Main Execution


In [None]:
# Initialize analyzer
analyzer = CrossFoldAnalyzer(EXPERIMENT_DIR, config)

# Load fold assignments
train_csv_path = config_dict['paths']['train_csv']
analyzer.load_fold_assignments(train_csv_path)

# Load models
analyzer.load_fold_models()

# Get sample IDs for analysis
all_sample_ids = list(analyzer.fold_assignments.keys())
print(f"Total samples available: {len(all_sample_ids)}")

# Upper bound on the number of samples to score. Keep it small for a quick
# test run; set it to None to analyze every sample.
MAX_ANALYSIS_SAMPLES = 100
if MAX_ANALYSIS_SAMPLES is None:
    test_sample_ids = all_sample_ids
else:
    test_sample_ids = all_sample_ids[:MAX_ANALYSIS_SAMPLES]
print(f"Using {len(test_sample_ids)} samples for analysis")


In [None]:
# Collect OOF predictions
analyzer.collect_oof_predictions(test_sample_ids)

print(f"Collected OOF predictions for {len(analyzer.oof_predictions)} samples")
# Guard the average: nothing is collected when no model or fold assignment matched
prediction_counts = [len(preds) for preds in analyzer.oof_predictions.values()]
average_count = np.mean(prediction_counts) if prediction_counts else 0.0
print(f"Average predictions per sample: {average_count:.1f}")

# Load true labels for analysis (same CSV that provides the fold assignments)
true_labels_df = pd.read_csv(train_csv_path)
print(f"Loaded true labels for {len(true_labels_df)} samples")


In [None]:
# Run the three analysis passes in sequence
print("🔍 Starting Cross-Fold Analysis...")

# Pass 1: error breakdown for every class
print("📊 Analyzing per-class misclassifications...")
per_class_analysis = analyzer.analyze_per_class_misclassifications(
    true_labels_df=true_labels_df
)

# Pass 2: categorise the difficult (sample, class) pairs
print("🎯 Identifying hard samples...")
hard_samples = analyzer.identify_hard_samples(true_labels_df=true_labels_df)

# Pass 3: consistency of the fold models
print("🤝 Analyzing fold agreement...")
fold_agreement = analyzer.analyze_fold_agreement()

print("✅ Analysis complete!")


In [None]:
# Render all figures from the three analysis results
print("📈 Creating visualizations...")
analyzer.create_visualizations(
    per_class_analysis=per_class_analysis,
    hard_samples=hard_samples,
    fold_agreement=fold_agreement,
)


## Results Summary


In [None]:
# Print comprehensive results summary
print("=" * 80)
print("📋 CROSS-FOLD ANALYSIS RESULTS SUMMARY")
print("=" * 80)

# Overall statistics
total_samples = len(analyzer.oof_predictions)
prediction_counts = [len(preds) for preds in analyzer.oof_predictions.values()]
# Avoid averaging an empty list when nothing was collected
average_predictions = np.mean(prediction_counts) if prediction_counts else 0.0
print("\n📊 Overall Statistics:")
print(f"  • Total samples analyzed: {total_samples}")
print(f"  • Average OOF predictions per sample: {average_predictions:.1f}")

# Per-class summary
print("\n🎯 Per-Class Misclassification Summary:")
for class_name, analysis in per_class_analysis.items():
    fn_count = len(analysis['false_negatives'])
    fp_count = len(analysis['false_positives'])
    total_errors = fn_count + fp_count
    error_rate = total_errors / analysis['total_samples'] if analysis['total_samples'] > 0 else 0

    print(f"  • {class_name}:")
    print(f"    - Total samples: {analysis['total_samples']}")
    print(f"    - Positive samples: {analysis['positive_samples']}")
    print(f"    - False Negatives: {fn_count}")
    print(f"    - False Positives: {fp_count}")
    print(f"    - Error Rate: {error_rate:.3f}")

# Hard samples summary
print("\n🎯 Hard Sample Summary:")
for hard_type, samples in hard_samples.items():
    print(f"  • {hard_type}: {len(samples)} samples")

# Fold agreement summary
print("\n🤝 Fold Agreement Summary:")
all_agreements = []
for class_name in LABEL_COLS:
    all_agreements.extend(fold_agreement['per_class_agreement'][class_name])

if all_agreements:
    print(f"  • Mean agreement: {np.mean(all_agreements):.3f}")
    print(f"  • Std agreement: {np.std(all_agreements):.3f}")
    print(f"  • Min agreement: {np.min(all_agreements):.3f}")
    print(f"  • Max agreement: {np.max(all_agreements):.3f}")

print("\n" + "=" * 80)
print("✅ Analysis Complete! Check visualizations above for detailed insights.")
print("=" * 80)
