# Fase 5: Comparación Completa de Métodos de Incertidumbre y Calibración

**Objetivo**: Comparar 6 métodos lado a lado en detección, calibración y risk-coverage.

**Métodos evaluados**:
1. Baseline (sin incertidumbre, sin calibración)
2. Baseline + TS
3. MC-Dropout K=5
4. MC-Dropout K=5 + TS
5. Varianza entre capas (single-pass)
6. Varianza entre capas + TS

**Splits**:
- val_calib: ajustar temperaturas
- val_eval: evaluación final

**Métricas**:
- Detección: mAP@[0.5:0.95], AP50, AP75, por clase
- Calibración: NLL, Brier, ECE, Reliability Diagrams
- Risk-Coverage: curvas y AUC

## 1. Configuración e Imports

In [None]:
import os
import sys
import json
import yaml
import time
import torch
import numpy as np
import pandas as pd
from pathlib import Path
from PIL import Image
from tqdm import tqdm
from collections import defaultdict
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval
import torchvision
import warnings
warnings.filterwarnings('ignore')

# Paths: inputs are read from ../data, every artifact of this notebook is
# written under ./outputs/comparison (created on first run).
BASE_DIR = Path('..')
DATA_DIR = BASE_DIR / 'data'
OUTPUT_DIR = Path('./outputs/comparison')
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Experiment-wide settings shared by every cell below.
CONFIG = dict(
    seed=42,
    device='cuda' if torch.cuda.is_available() else 'cpu',
    categories=['person', 'rider', 'car', 'truck', 'bus', 'train', 'motorcycle', 'bicycle', 'traffic light', 'traffic sign'],
    iou_matching=0.5,      # IoU needed to count a prediction as a true positive
    conf_threshold=0.25,   # box threshold handed to GroundingDINO's predict()
    nms_threshold=0.65,    # IoU threshold used by apply_nms
    K_mc=5,                # number of MC-Dropout forward passes
    n_bins=10,             # number of confidence bins for ECE / reliability diagrams
)

# Seed every RNG the notebook touches so runs are repeatable.
torch.manual_seed(CONFIG['seed'])
np.random.seed(CONFIG['seed'])
if torch.cuda.is_available():
    torch.cuda.manual_seed(CONFIG['seed'])

# Persist the configuration next to the results it produced.
with (OUTPUT_DIR / 'config.yaml').open('w') as f:
    yaml.dump(CONFIG, f)

print(f"Device: {CONFIG['device']}")
print(f"Output: {OUTPUT_DIR}")
print("Config guardado")

## 2. Cargar Modelo y Preparar Funciones

In [None]:
from groundingdino.util.inference import load_model, load_image, predict
from groundingdino.util import box_ops

# Locations of the GroundingDINO (Swin-T) config and checkpoint inside the container.
model_config = '/opt/program/GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py'
model_weights = '/opt/program/GroundingDINO/weights/groundingdino_swint_ogc.pth'

model = load_model(model_config, model_weights)
model.to(CONFIG['device'])

# GroundingDINO is prompted with one caption: categories joined by ". " plus a final ".".
TEXT_PROMPT = '. '.join(CONFIG['categories']) + '.'

print(f"Modelo cargado en {CONFIG['device']}")
print(f"Prompt: {TEXT_PROMPT}")

# Keep references to the dropout layers that live in the prediction heads
# (modules whose qualified name mentions class_embed or bbox_embed); the
# inference functions toggle exactly these layers between train and eval.
# NOTE(review): if the count printed below is 0, inference_mc_dropout enables
# no stochastic layer and its variance will be identically zero -- confirm.
dropout_modules = [
    module
    for name, module in model.named_modules()
    if isinstance(module, torch.nn.Dropout)
    and ('class_embed' in name or 'bbox_embed' in name)
]

print(f"Módulos dropout en cabeza: {len(dropout_modules)}")

In [None]:
def normalize_label(label):
    """Map a free-text phrase returned by the detector to a canonical category.

    Resolution order: exact synonym match on the lower-cased, stripped phrase;
    then the first entry of CONFIG['categories'] that occurs as a substring of
    the phrase; otherwise the cleaned phrase itself is returned unchanged
    (callers filter out anything that is not a known category).
    """
    synonyms = {'bike': 'bicycle', 'motorbike': 'motorcycle', 'pedestrian': 'person',
                'stop sign': 'traffic sign', 'red light': 'traffic light'}
    cleaned = label.lower().strip()
    canonical = synonyms.get(cleaned)
    if canonical is not None:
        return canonical
    # Substring search follows the order of CONFIG['categories'].
    return next((cat for cat in CONFIG['categories'] if cat in cleaned), cleaned)

def compute_iou(box1, box2):
    """Return the intersection-over-union of two [x1, y1, x2, y2] boxes.

    Returns 0 when the union has no area (e.g. two degenerate boxes).
    """
    left, top = max(box1[0], box2[0]), max(box1[1], box2[1])
    right, bottom = min(box1[2], box2[2]), min(box1[3], box2[3])

    # Negative extents mean the boxes do not overlap along that axis.
    overlap_w = max(0, right - left)
    overlap_h = max(0, bottom - top)
    inter = overlap_w * overlap_h

    area_a = (box1[2] - box1[0]) * (box1[3] - box1[1])
    area_b = (box2[2] - box2[0]) * (box2[3] - box2[1])
    union = area_a + area_b - inter

    if union > 0:
        return inter / union
    return 0

def sigmoid(z):
    """Logistic function 1 / (1 + e^-z); works on scalars and NumPy arrays."""
    denominator = 1 + np.exp(-z)
    return 1 / denominator

def apply_nms(detections, iou_thresh=0.65):
    """Run non-maximum suppression over a list of detection dicts.

    Each detection must carry 'bbox' ([x1, y1, x2, y2]) and 'score'. All boxes
    are passed to torchvision's NMS together, regardless of their 'category'.
    Returns the surviving detection dicts in the order torchvision reports them.
    """
    if not len(detections):
        return []

    box_rows, score_rows = [], []
    for det in detections:
        box_rows.append(det['bbox'])
        score_rows.append(det['score'])

    kept = torchvision.ops.nms(
        torch.tensor(box_rows, dtype=torch.float32),
        torch.tensor(score_rows, dtype=torch.float32),
        iou_thresh,
    )
    return [detections[i] for i in kept.tolist()]

print("Funciones auxiliares definidas")

## 3. Métodos de Inferencia

In [None]:
def inference_baseline(model, image_path, text_prompt, conf_thresh, device):
    """Method 1: single deterministic forward pass, no uncertainty estimate.

    Returns a list of detection dicts (after NMS) with keys 'bbox' (absolute
    xyxy pixels), 'score' (clipped to [1e-7, 1 - 1e-7]), 'logit' (inverse
    sigmoid of the score), 'category' and 'uncertainty' (always 0.0).
    Phrases that do not normalise to a known category are dropped.
    """
    # Fully deterministic: model and head dropout layers in eval mode.
    model.eval()
    for drop in dropout_modules:
        drop.eval()

    image_source, image = load_image(str(image_path))
    # 0.25 is passed positionally as predict()'s threshold after the box threshold.
    boxes, scores, phrases = predict(model, image, text_prompt, conf_thresh, 0.25, device)
    if len(boxes) == 0:
        return []

    # Convert normalised cxcywh boxes to absolute xyxy pixel coordinates.
    height, width = image_source.shape[:2]
    pixel_scale = torch.tensor([width, height, width, height])
    boxes_px = (box_ops.box_cxcywh_to_xyxy(boxes) * pixel_scale).cpu().numpy()

    detections = []
    for box, score, phrase in zip(boxes_px, scores.cpu().numpy(), phrases):
        category = normalize_label(phrase)
        if category not in CONFIG['categories']:
            continue
        # Clip so the logit below stays finite.
        prob = np.clip(float(score), 1e-7, 1 - 1e-7)
        detections.append({
            'bbox': box.tolist(),
            'score': prob,
            'logit': np.log(prob / (1 - prob)),
            'category': category,
            'uncertainty': 0.0  # baseline carries no uncertainty
        })

    return apply_nms(detections, CONFIG['nms_threshold'])

print("Método 1: Baseline definido")

In [None]:
def inference_mc_dropout(model, image_path, text_prompt, conf_thresh, device, K=5):
    """Method 3: MC-Dropout -- K stochastic passes aggregated per detection.

    The model is put in eval mode and only the head dropout layers collected
    in `dropout_modules` are switched to train mode for the K passes. The
    detections of the first pass act as reference; for each of them, the best
    same-category match (IoU >= 0.5) from every other pass contributes one
    score. The mean of the aligned scores becomes the detection score and
    their variance the uncertainty (0.0 when only one score was aligned).

    The dropout layers are always returned to eval mode before this function
    exits, so a later forward pass is never stochastic by accident.

    Returns a list of detection dicts (after NMS) with keys 'bbox', 'score',
    'logit', 'category' and 'uncertainty'. Detections that do not appear in
    the first pass are not represented in the output.
    """
    image_source, image = load_image(str(image_path))
    h, w = image_source.shape[:2]
    pixel_scale = torch.tensor([w, h, w, h])

    model.eval()
    for module in dropout_modules:
        module.train()

    all_detections_k = []
    try:
        with torch.no_grad():
            for _ in range(K):
                boxes, scores, phrases = predict(model, image, text_prompt, conf_thresh, 0.25, device)

                if len(boxes) == 0:
                    all_detections_k.append([])
                    continue

                boxes_xyxy = box_ops.box_cxcywh_to_xyxy(boxes) * pixel_scale

                dets_k = []
                for box, score, phrase in zip(boxes_xyxy.cpu().numpy(), scores.cpu().numpy(), phrases):
                    cat = normalize_label(phrase)
                    if cat in CONFIG['categories']:
                        dets_k.append({
                            'bbox': box.tolist(),
                            'score': np.clip(float(score), 1e-7, 1 - 1e-7),
                            'category': cat
                        })
                all_detections_k.append(dets_k)
    finally:
        # Do not leak train-mode dropout to whatever runs after this call,
        # even if predict() raised.
        for module in dropout_modules:
            module.eval()

    # Nothing detected in any pass (or K == 0).
    if not any(all_detections_k):
        return []

    # The first pass is the reference every other pass is aligned against.
    ref_dets = all_detections_k[0]

    aggregated = []
    for ref_det in ref_dets:
        scores_aligned = [ref_det['score']]

        for dets_k in all_detections_k[1:]:
            best_iou = 0
            best_score = None
            for det_k in dets_k:
                if det_k['category'] != ref_det['category']:
                    continue
                iou = compute_iou(ref_det['bbox'], det_k['bbox'])
                if iou > best_iou:
                    best_iou = iou
                    best_score = det_k['score']

            # Only sufficiently overlapping matches count as the same object.
            if best_iou >= 0.5 and best_score is not None:
                scores_aligned.append(best_score)

        mean_score = np.mean(scores_aligned)
        variance = np.var(scores_aligned) if len(scores_aligned) > 1 else 0.0

        # Clip so the logit stays finite.
        mean_score_clipped = np.clip(mean_score, 1e-7, 1 - 1e-7)
        logit = np.log(mean_score_clipped / (1 - mean_score_clipped))

        aggregated.append({
            'bbox': ref_det['bbox'],
            'score': mean_score_clipped,
            'logit': logit,
            'category': ref_det['category'],
            'uncertainty': variance
        })

    return apply_nms(aggregated, CONFIG['nms_threshold'])

print("Método 3: MC-Dropout definido")

In [None]:
def inference_decoder_variance(model, image_path, text_prompt, conf_thresh, device):
    """Method 5: variance of the class score across decoder layers (single pass).

    The previous implementation never produced a non-zero uncertainty: it
    looked for module names ending in ')' (PyTorch module names are dotted
    attribute paths), expected hook outputs to expose `.pred_logits`, and
    indexed per-layer logits with the post-threshold detection index instead
    of the query index.

    This version hooks the per-layer classification heads and recovers, for
    every detection returned by predict(), the query it came from.

    Assumptions (TODO confirm for the installed GroundingDINO build):
      * `model.class_embed` holds the classification head applied to each
        decoder layer's output and returns a (batch, queries, tokens) tensor;
      * predict() keeps the queries whose max sigmoid logit exceeds the box
        threshold, in query order.
    If either assumption does not hold (no tensors captured, or the number of
    recovered queries differs from the number of boxes) the uncertainty falls
    back to 0.0, which is what the previous implementation always returned.

    Returns a list of detection dicts (after NMS) with keys 'bbox', 'score',
    'logit', 'category' and 'uncertainty'.
    """
    model.eval()
    for module in dropout_modules:
        module.eval()

    image_source, image = load_image(str(image_path))
    h, w = image_source.shape[:2]

    # One entry per invocation of a classification head, in call order.
    layer_logits = []

    def hook_fn(module, inputs, output):
        if torch.is_tensor(output) and output.dim() == 3:
            layer_logits.append(output.detach().float().cpu())

    heads = getattr(model, 'class_embed', None)
    if isinstance(heads, torch.nn.Module) and not isinstance(heads, torch.nn.ModuleList):
        heads = [heads]

    # The same head instance may be listed once per layer; hook it only once
    # so each forward call is captured exactly one time.
    hooks = []
    hooked_ids = set()
    for head in (heads if heads is not None else []):
        if id(head) in hooked_ids:
            continue
        hooked_ids.add(id(head))
        hooks.append(head.register_forward_hook(hook_fn))

    try:
        boxes, scores, phrases = predict(model, image, text_prompt, conf_thresh, 0.25, device)
    finally:
        # Always detach the hooks, even if inference failed.
        for hook in hooks:
            hook.remove()

    if len(boxes) == 0:
        return []

    boxes_xyxy = box_ops.box_cxcywh_to_xyxy(boxes) * torch.tensor([w, h, w, h])

    # per_det_layer_scores[l, d]: max-token sigmoid score of detection d at layer l.
    per_det_layer_scores = None
    if len(layer_logits) > 1 and len({t.shape for t in layer_logits}) == 1:
        per_layer = torch.stack([t[0].sigmoid().max(dim=1)[0] for t in layer_logits])
        # Reproduce predict()'s selection on the last layer to find the query
        # index behind each returned box.
        kept_queries = torch.nonzero(per_layer[-1] > conf_thresh, as_tuple=True)[0]
        if len(kept_queries) == len(boxes):
            per_det_layer_scores = per_layer[:, kept_queries].numpy()

    detections = []
    for idx, (box, score, phrase) in enumerate(zip(boxes_xyxy.cpu().numpy(), scores.cpu().numpy(), phrases)):
        cat = normalize_label(phrase)
        if cat not in CONFIG['categories']:
            continue

        # Clip so the logit stays finite.
        score_clipped = np.clip(float(score), 1e-7, 1 - 1e-7)
        logit = np.log(score_clipped / (1 - score_clipped))

        uncertainty = 0.0
        if per_det_layer_scores is not None:
            uncertainty = float(np.var(per_det_layer_scores[:, idx]))

        detections.append({
            'bbox': box.tolist(),
            'score': score_clipped,
            'logit': logit,
            'category': cat,
            'uncertainty': uncertainty
        })

    return apply_nms(detections, CONFIG['nms_threshold'])

print("Método 5: Decoder variance definido")

## 4. Inferencia en val_calib para Ajustar Temperaturas

In [None]:
val_calib_json = DATA_DIR / 'bdd100k_coco/val_calib.json'
val_eval_json = DATA_DIR / 'bdd100k_coco/val_eval.json'
image_dir = DATA_DIR / 'bdd100k/bdd100k/bdd100k/images/100k/val'

coco_calib = COCO(str(val_calib_json))
img_ids_calib = coco_calib.getImgIds()

# Temperatures are fitted on the first 500 calibration images only.
calib_img_ids = img_ids_calib[:500]
print(f"Procesando {len(calib_img_ids)} imágenes de val_calib para ajustar temperaturas...")

# Column layout of the per-method calibration CSVs. Passing it explicitly
# keeps the header present even when a method produced no detections.
CALIB_COLUMNS = ['logit', 'score', 'category', 'uncertainty', 'is_tp']


def _is_tp_by_category_name(pred, gt_anns):
    """Return 1 if `pred` overlaps a same-category GT box with IoU >= CONFIG['iou_matching'], else 0.

    GT category ids are translated to names through CONFIG['categories']
    (id 1 -> first entry); ids outside that range never match.
    NOTE(review): assumes the annotation file numbers categories 1..N in the
    same order as CONFIG['categories'] -- confirm against val_calib.json.
    Several predictions may match the same GT box; each one counts as a TP.
    """
    categories = CONFIG['categories']
    for gt in gt_anns:
        gt_cat_id = gt['category_id']
        gt_cat = categories[gt_cat_id - 1] if 1 <= gt_cat_id <= len(categories) else ''
        if gt_cat != pred['category']:
            continue
        gt_box = gt['bbox']  # COCO layout: [x, y, width, height]
        gt_box_xyxy = [gt_box[0], gt_box[1], gt_box[0] + gt_box[2], gt_box[1] + gt_box[3]]
        if compute_iou(pred['bbox'], gt_box_xyxy) >= CONFIG['iou_matching']:
            return 1
    return 0


# One entry per base method; every callable takes an image path and returns
# that method's detections. Dict order is the order they run per image.
calib_inference_fns = {
    'baseline': lambda path: inference_baseline(
        model, path, TEXT_PROMPT, CONFIG['conf_threshold'], CONFIG['device']),
    'mc_dropout': lambda path: inference_mc_dropout(
        model, path, TEXT_PROMPT, CONFIG['conf_threshold'], CONFIG['device'], CONFIG['K_mc']),
    'decoder_variance': lambda path: inference_decoder_variance(
        model, path, TEXT_PROMPT, CONFIG['conf_threshold'], CONFIG['device']),
}

methods_calib_data = {method_name: [] for method_name in calib_inference_fns}

for img_id in tqdm(calib_img_ids):
    img_info = coco_calib.loadImgs(img_id)[0]
    img_path = image_dir / img_info['file_name']

    # Images listed in the annotations but missing on disk are skipped.
    if not img_path.exists():
        continue

    gt_anns = coco_calib.loadAnns(coco_calib.getAnnIds(imgIds=img_id))

    for method_name, infer in calib_inference_fns.items():
        for pred in infer(img_path):
            methods_calib_data[method_name].append({
                'logit': pred['logit'],
                'score': pred['score'],
                'category': pred['category'],
                'uncertainty': pred['uncertainty'],
                'is_tp': _is_tp_by_category_name(pred, gt_anns)
            })

# Save calibration data, one CSV per method.
for method_name, data in methods_calib_data.items():
    df = pd.DataFrame(data, columns=CALIB_COLUMNS)
    df.to_csv(OUTPUT_DIR / f'calib_{method_name}.csv', index=False)
    print(f"{method_name}: {len(df)} detecciones, TP={df['is_tp'].sum()}")

print("Datos de calibración guardados")

## 5. Optimizar Temperaturas

In [None]:
from scipy.optimize import minimize

def nll_loss(T, logits, labels):
    """Mean binary negative log-likelihood of `labels` under sigmoid(logits / T).

    T is floored at 0.01 and probabilities are clipped to [1e-7, 1 - 1e-7] so
    the logarithms stay finite.
    """
    T = max(T, 0.01)
    p = np.clip(sigmoid(logits / T), 1e-7, 1 - 1e-7)
    log_likelihood = labels * np.log(p) + (1 - labels) * np.log(1 - p)
    return -np.mean(log_likelihood)

# Fit one scalar temperature per base method by minimising the NLL on the
# calibration detections saved by the previous cell.
temperatures = {}

for method_name in ['baseline', 'mc_dropout', 'decoder_variance']:
    df = pd.read_csv(OUTPUT_DIR / f'calib_{method_name}.csv')
    logits, labels = df['logit'].values, df['is_tp'].values

    # Reference NLL without scaling (T = 1).
    nll_before = nll_loss(1.0, logits, labels)

    # Bounded 1-D search for T in [0.01, 10], starting from T = 1.
    result = minimize(
        lambda T: nll_loss(T, logits, labels),
        x0=1.0,
        bounds=[(0.01, 10.0)],
        method='L-BFGS-B',
    )
    T_opt, nll_after = result.x[0], result.fun

    temperatures[method_name] = dict(T=T_opt, nll_before=nll_before, nll_after=nll_after)

    print(f"{method_name}: T={T_opt:.4f}, NLL: {nll_before:.4f} → {nll_after:.4f}")

with (OUTPUT_DIR / 'temperatures.json').open('w') as f:
    json.dump(temperatures, f, indent=2)

print(f"\nTemperaturas guardadas en: {OUTPUT_DIR / 'temperatures.json'}")

## 6. Evaluación en val_eval con COCO API

In [None]:
coco_eval = COCO(str(val_eval_json))
img_ids_eval = coco_eval.getImgIds()

print(f"Procesando {len(img_ids_eval)} imágenes de val_eval...")

# Raw and temperature-scaled ("_ts") results for every base method.
methods_results = {
    'baseline': [],
    'baseline_ts': [],
    'mc_dropout': [],
    'mc_dropout_ts': [],
    'decoder_variance': [],
    'decoder_variance_ts': []
}

# Column layout of the per-method evaluation CSVs. Passing it explicitly keeps
# the header present even when a method produced no detections, so the later
# pd.read_csv calls do not fail on an empty file.
EVAL_COLUMNS = ['image_id', 'category_id', 'bbox', 'score', 'logit', 'uncertainty', 'is_tp']

# Load the temperatures fitted on val_calib.
with open(OUTPUT_DIR / 'temperatures.json', 'r') as f:
    temps = json.load(f)


def _is_tp_by_category_id(pred_bbox, cat_id, gt_anns):
    """Return 1 if `pred_bbox` (xyxy) overlaps a GT box of `cat_id` with IoU >= CONFIG['iou_matching'], else 0.

    Several predictions may match the same GT box; each one counts as a TP.
    """
    for gt in gt_anns:
        if gt['category_id'] != cat_id:
            continue
        gt_box = gt['bbox']  # COCO layout: [x, y, width, height]
        gt_box_xyxy = [gt_box[0], gt_box[1], gt_box[0] + gt_box[2], gt_box[1] + gt_box[3]]
        if compute_iou(pred_bbox, gt_box_xyxy) >= CONFIG['iou_matching']:
            return 1
    return 0


def _eval_records(pred, img_id, gt_anns, temperature):
    """Build the (raw, temperature-scaled) COCO-style records for one detection.

    Both records share everything except 'score': the scaled one stores
    sigmoid(logit / temperature). 'logit' is kept unscaled in both so the
    calibration cell can apply the temperature itself. Category ids are the
    1-based position of the category in CONFIG['categories'].
    """
    cat_id = CONFIG['categories'].index(pred['category']) + 1
    box = pred['bbox']
    raw = {
        'image_id': img_id,
        'category_id': cat_id,
        # xyxy -> COCO xywh
        'bbox': [box[0], box[1], box[2] - box[0], box[3] - box[1]],
        'score': pred['score'],
        'logit': pred['logit'],
        'uncertainty': pred['uncertainty'],
        'is_tp': _is_tp_by_category_id(box, cat_id, gt_anns)
    }
    # dict(raw, ...) keeps the key order; the bbox list is copied so the two
    # records never share a mutable object.
    scaled = dict(raw, bbox=list(raw['bbox']), score=sigmoid(pred['logit'] / temperature))
    return raw, scaled


# One entry per base method; every callable takes an image path and returns
# that method's detections. Dict order is the order they run per image.
eval_inference_fns = {
    'baseline': lambda path: inference_baseline(
        model, path, TEXT_PROMPT, CONFIG['conf_threshold'], CONFIG['device']),
    'mc_dropout': lambda path: inference_mc_dropout(
        model, path, TEXT_PROMPT, CONFIG['conf_threshold'], CONFIG['device'], CONFIG['K_mc']),
    'decoder_variance': lambda path: inference_decoder_variance(
        model, path, TEXT_PROMPT, CONFIG['conf_threshold'], CONFIG['device']),
}

for img_id in tqdm(img_ids_eval):
    img_info = coco_eval.loadImgs(img_id)[0]
    img_path = image_dir / img_info['file_name']

    # Images listed in the annotations but missing on disk are skipped.
    if not img_path.exists():
        continue

    gt_anns = coco_eval.loadAnns(coco_eval.getAnnIds(imgIds=img_id))

    for method_name, infer in eval_inference_fns.items():
        temperature = temps[method_name]['T']
        for pred in infer(img_path):
            raw, scaled = _eval_records(pred, img_id, gt_anns, temperature)
            methods_results[method_name].append(raw)
            methods_results[f'{method_name}_ts'].append(scaled)

# Save results: a CSV with TP labels for calibration / uncertainty analysis
# and a COCO-format JSON (without 'is_tp') for mAP evaluation.
for method_name, results in methods_results.items():
    df = pd.DataFrame(results, columns=EVAL_COLUMNS)
    df.to_csv(OUTPUT_DIR / f'eval_{method_name}.csv', index=False)
    with open(OUTPUT_DIR / f'eval_{method_name}.json', 'w') as f:
        json.dump([{k: v for k, v in r.items() if k != 'is_tp'} for r in results], f)
    print(f"{method_name}: {len(df)} detecciones")

print("Evaluación completada y guardada")

## 7. Calcular Métricas de Detección (mAP)

In [None]:
detection_metrics = {}

for method_name in methods_results.keys():
    print(f"\nEvaluando {method_name}...")

    # Predictions in COCO result format, written by the evaluation cell.
    preds_file = OUTPUT_DIR / f'eval_{method_name}.json'

    # An empty result set is stored as "[]", so the file size is never 0;
    # look at the content instead (COCO.loadRes cannot handle an empty list).
    with open(preds_file, 'r') as f:
        has_predictions = len(json.load(f)) > 0

    if not has_predictions:
        detection_metrics[method_name] = {'mAP': 0.0, 'AP50': 0.0, 'AP75': 0.0}
        continue

    coco_dt = coco_eval.loadRes(str(preds_file))
    coco_eval_obj = COCOeval(coco_eval, coco_dt, 'bbox')
    coco_eval_obj.evaluate()
    coco_eval_obj.accumulate()
    coco_eval_obj.summarize()  # fills coco_eval_obj.stats

    stats = coco_eval_obj.stats
    detection_metrics[method_name] = {
        'mAP': float(stats[0]),
        'AP50': float(stats[1]),
        'AP75': float(stats[2]),
        'AP_small': float(stats[3]),
        'AP_medium': float(stats[4]),
        'AP_large': float(stats[5])
    }

    # Per-class AP@[0.5:0.95]. `stats` is only refreshed by summarize(), so
    # re-running evaluate()/accumulate() per category and reading stats[0]
    # would just repeat the overall mAP. Instead, slice the precision array of
    # the full evaluation: axes are [IoU thr, recall thr, category, area, maxDets];
    # area index 0 is "all" and the last maxDets entry is the one stats[0] uses.
    precision = coco_eval_obj.eval['precision']
    cat_position = {cid: k for k, cid in enumerate(coco_eval_obj.params.catIds)}

    per_class_ap = {}
    for cat_id, cat_name in enumerate(CONFIG['categories'], 1):
        ap = -1.0  # COCO convention for "no ground truth for this class"
        if cat_id in cat_position:
            class_precision = precision[:, :, cat_position[cat_id], 0, -1]
            class_precision = class_precision[class_precision > -1]
            if class_precision.size:
                ap = float(class_precision.mean())
        per_class_ap[cat_name] = ap

    detection_metrics[method_name]['per_class'] = per_class_ap

with open(OUTPUT_DIR / 'detection_metrics.json', 'w') as f:
    json.dump(detection_metrics, f, indent=2)

print("\nMétricas de detección guardadas")

## 8. Tabla Comparativa de Detección

In [None]:
with (OUTPUT_DIR / 'detection_metrics.json').open('r') as f:
    det_metrics = json.load(f)

# Build the comparison table: one row per method with the headline metrics
# plus AP for a handful of key classes (when per-class results exist).
rows = []
for method_name, metrics in det_metrics.items():
    row = {'Method': method_name}
    for headline in ('mAP', 'AP50', 'AP75'):
        row[headline] = metrics.get(headline, 0.0)

    if 'per_class' in metrics:
        per_class = metrics['per_class']
        for cat in ['person', 'car', 'truck', 'traffic_light', 'traffic_sign']:
            # Column names use underscores; the stored keys use spaces.
            row[f'AP_{cat}'] = per_class.get(cat.replace('_', ' '), 0.0)

    rows.append(row)

df_detection = pd.DataFrame(rows)
df_detection.to_csv(OUTPUT_DIR / 'detection_comparison.csv', index=False)

print("\n" + "="*80)
print("TABLA COMPARATIVA DE DETECCIÓN")
print("="*80)
print(df_detection.to_string(index=False))
print("="*80)

## 9. Calcular Métricas de Calibración

In [None]:
def compute_calibration_metrics(logits, labels, T=1.0, n_bins=10):
    """Compute NLL, Brier score and ECE for binary TP/FP labels.

    Args:
        logits: array-like of detection logits (list, NumPy array, Series, ...).
        labels: array-like of 0/1 labels, same length as `logits`.
        T: temperature; probabilities are sigmoid(logits / T).
        n_bins: number of equal-width confidence bins on [0, 1] used for ECE.

    Returns:
        Dict with 'NLL', 'Brier', 'ECE' (Python floats) and 'bin_data', a list
        with one dict per non-empty bin: 'bin' (index), 'confidence' (mean
        probability), 'accuracy' (mean label) and 'count'. All values are
        plain Python scalars, so the whole result is JSON-serializable.
        For empty input the three metrics are NaN and 'bin_data' is empty.
    """
    logits = np.asarray(logits, dtype=float)
    labels = np.asarray(labels, dtype=float)

    if logits.size == 0:
        # No detections: there is nothing to measure, so report NaN rather
        # than a misleading "perfect" value.
        nan = float('nan')
        return {'NLL': nan, 'Brier': nan, 'ECE': nan, 'bin_data': []}

    # Logistic function written inline so the metric code is self-contained.
    probs = 1.0 / (1.0 + np.exp(-(logits / T)))
    # Clip so the logarithms below stay finite (also keeps probs < 1, so no
    # value can land past the last bin edge).
    probs = np.clip(probs, 1e-7, 1 - 1e-7)

    nll = float(-np.mean(labels * np.log(probs) + (1 - labels) * np.log(1 - probs)))
    brier = float(np.mean((probs - labels) ** 2))

    # ECE: count-weighted mean |confidence - accuracy| over equal-width bins.
    edges = np.linspace(0, 1, n_bins + 1)
    bin_index = np.digitize(probs, edges) - 1

    ece = 0.0
    bin_data = []
    for i in range(n_bins):
        in_bin = bin_index == i
        count = int(in_bin.sum())
        if count == 0:
            continue
        conf = float(probs[in_bin].mean())
        acc = float(labels[in_bin].mean())
        ece += abs(conf - acc) * count / probs.size
        bin_data.append({
            'bin': i,
            'confidence': conf,
            'accuracy': acc,
            'count': count
        })

    return {'NLL': nll, 'Brier': brier, 'ECE': float(ece), 'bin_data': bin_data}

calibration_metrics = {}

for method_name in methods_results.keys():
    df = pd.read_csv(OUTPUT_DIR / f'eval_{method_name}.csv')
    logits = df['logit'].values
    labels = df['is_tp'].values

    if '_ts' in method_name:
        # Temperature-scaled variant: reuse the temperature fitted for its base method.
        base_method = method_name.replace('_ts', '')
        T = temps[base_method]['T']
        metrics = compute_calibration_metrics(logits, labels, T=T, n_bins=CONFIG['n_bins'])
    else:
        # Uncalibrated variant: T = 1 leaves the logits untouched.
        metrics = compute_calibration_metrics(logits, labels, T=1.0, n_bins=CONFIG['n_bins'])
    calibration_metrics[method_name] = metrics

    print(f"{method_name}: NLL={metrics['NLL']:.4f}, Brier={metrics['Brier']:.4f}, ECE={metrics['ECE']:.4f}")

# Save only the scalar metrics; the per-bin data stays in memory for plotting.
with open(OUTPUT_DIR / 'calibration_metrics.json', 'w') as f:
    cal_save = {
        k: {metric: v[metric] for metric in ('NLL', 'Brier', 'ECE')}
        for k, v in calibration_metrics.items()
    }
    json.dump(cal_save, f, indent=2)

print("\nMétricas de calibración guardadas")

## 10. Tabla Comparativa de Calibración

In [None]:
# One row per method with the three scalar calibration metrics.
rows_calib = [
    {
        'Method': method_name,
        'NLL': metrics['NLL'],
        'Brier': metrics['Brier'],
        'ECE': metrics['ECE']
    }
    for method_name, metrics in calibration_metrics.items()
]

df_calibration = pd.DataFrame(rows_calib)
df_calibration.to_csv(OUTPUT_DIR / 'calibration_comparison.csv', index=False)

for line in (
    "\n" + "="*80,
    "TABLA COMPARATIVA DE CALIBRACIÓN",
    "="*80,
    df_calibration.to_string(index=False),
    "="*80,
    "\nInterpretación:",
    "  ↓ Menor es mejor para NLL, Brier, ECE",
    "  Si método+TS < método: TS mejoró calibración",
):
    print(line)

## 11. Reliability Diagrams

In [None]:
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
axes = axes.flatten()

method_pairs = [
    ('baseline', 'baseline_ts'),
    ('mc_dropout', 'mc_dropout_ts'),
    ('decoder_variance', 'decoder_variance_ts')
]


def _draw_reliability(ax, method_name):
    """Draw the reliability diagram of `method_name` on `ax`.

    Bars show the accuracy of every non-empty confidence bin, placed at the
    bin's centre on a [0, 1] confidence axis, so the dashed y = x line really
    is the perfect-calibration reference. (Plotting against the position of
    the bin in the list of non-empty bins shifts everything whenever low
    bins are empty.) Nothing is drawn when the method has no bin data.
    """
    bin_data = calibration_metrics[method_name]['bin_data']
    if len(bin_data) == 0:
        return

    # The metrics were computed with CONFIG['n_bins'] equal-width bins.
    n_bins = CONFIG['n_bins']
    centers = [(b['bin'] + 0.5) / n_bins for b in bin_data]
    confidences = [b['confidence'] for b in bin_data]
    accuracies = [b['accuracy'] for b in bin_data]

    ax.bar(centers, accuracies, width=1.0 / n_bins, alpha=0.3, label='Accuracy', color='blue')
    ax.plot(centers, confidences, 'o-', label='Confidence', color='red', markersize=8)
    ax.plot([0, 1], [0, 1], 'k--', alpha=0.3, label='Perfect calibration')
    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1)
    ax.set_xlabel('Confidence bin')
    ax.set_ylabel('Proportion')
    ax.set_title(f'{method_name}\nECE={calibration_metrics[method_name]["ECE"]:.4f}')
    ax.legend()
    ax.grid(alpha=0.3)


# Each pair occupies two consecutive panels: without TS, then with TS.
for idx, (method_before, method_after) in enumerate(method_pairs):
    _draw_reliability(axes[idx * 2], method_before)
    _draw_reliability(axes[idx * 2 + 1], method_after)

plt.tight_layout()
plt.savefig(OUTPUT_DIR / 'reliability_diagrams.png', dpi=150, bbox_inches='tight')
print(f"Reliability diagrams guardados en: {OUTPUT_DIR / 'reliability_diagrams.png'}")
plt.close()

## 12. Risk-Coverage Analysis

## 13. Métricas de Incertidumbre: AUROC TP vs FP

In [None]:
from sklearn.metrics import roc_auc_score, roc_curve

uncertainty_auroc = {}

# Only methods that carry an uncertainty estimate (MC-Dropout and decoder variance).
uncertainty_methods = ['mc_dropout', 'mc_dropout_ts', 'decoder_variance', 'decoder_variance_ts']

print("="*80)
print("AUROC: ¿La incertidumbre detecta errores (FP)?")
print("="*80)
print("\nObjetivo: Usar incertidumbre para distinguir FP (errores) de TP (aciertos)")
print("Interpretación: AUROC > 0.5 (random), ideal ≥ 0.7")
print("-"*80)

for method_name in uncertainty_methods:
    df = pd.read_csv(OUTPUT_DIR / f'eval_{method_name}.csv')

    # Nothing to analyse for this method.
    if not (len(df) > 0 and 'uncertainty' in df.columns):
        continue

    uncertainties = df['uncertainty'].values
    is_tp = df['is_tp'].values

    # AUROC needs both classes (TP and FP) to be present.
    if not (len(np.unique(is_tp)) > 1 and len(uncertainties) > 0):
        print(f"\n{method_name}: Datos insuficientes para AUROC")
        continue

    # The positive class is "error": 1 = FP, 0 = TP, scored by uncertainty.
    is_fp = 1 - is_tp

    try:
        auroc = roc_auc_score(is_fp, uncertainties)

        # Summary statistics of the uncertainty for each group.
        unc_tp = uncertainties[is_tp == 1]
        unc_fp = uncertainties[is_tp == 0]
        mean_unc_tp = unc_tp.mean() if len(unc_tp) > 0 else 0.0
        mean_unc_fp = unc_fp.mean() if len(unc_fp) > 0 else 0.0

        n_tp = int(is_tp.sum())
        n_fp = int((1 - is_tp).sum())

        uncertainty_auroc[method_name] = {
            'auroc': auroc,
            'mean_unc_tp': mean_unc_tp,
            'mean_unc_fp': mean_unc_fp,
            'n_tp': n_tp,
            'n_fp': n_fp
        }

        ratio = mean_unc_fp / mean_unc_tp if mean_unc_tp > 0 else 0
        print(f"\n{method_name}:")
        print(f"  AUROC (FP detection): {auroc:.4f}")
        print(f"  Mean uncertainty TP:  {mean_unc_tp:.6f}")
        print(f"  Mean uncertainty FP:  {mean_unc_fp:.6f}")
        print(f"  Ratio (FP/TP):        {ratio:.2f}x")
        print(f"  Samples: {n_tp} TP, {n_fp} FP")

    except Exception as e:
        print(f"\n{method_name}: Error calculando AUROC - {e}")

# Save results.
with (OUTPUT_DIR / 'uncertainty_auroc.json').open('w') as f:
    json.dump(uncertainty_auroc, f, indent=2)

print("\n" + "="*80)
print(f"Resultados guardados en: {OUTPUT_DIR / 'uncertainty_auroc.json'}")


In [None]:
# Tabla comparativa AUROC
# AUROC comparison table: one row per method that produced an AUROC.
rows_auroc = []
for method_name, metrics in uncertainty_auroc.items():
    mean_tp, mean_fp = metrics['mean_unc_tp'], metrics['mean_unc_fp']
    rows_auroc.append({
        'Method': method_name,
        'AUROC (FP detection) ↑': metrics['auroc'],
        'Mean Unc. TP': mean_tp,
        'Mean Unc. FP': mean_fp,
        # Guard against division by zero when TPs have no uncertainty at all.
        'Ratio (FP/TP)': mean_fp / mean_tp if mean_tp > 0 else 0
    })

df_auroc = pd.DataFrame(rows_auroc)
df_auroc.to_csv(OUTPUT_DIR / 'uncertainty_auroc_comparison.csv', index=False)

for line in (
    "\n" + "="*80,
    "TABLA COMPARATIVA: AUROC TP vs FP",
    "="*80,
    df_auroc.to_string(index=False),
    "="*80,
    "\nInterpretación:",
    "  ↑ Mayor AUROC = mejor detección de errores",
    "  Ratio (FP/TP) > 1 = incertidumbre mayor en errores (deseable)",
    "  AUROC ≥ 0.7 = incertidumbre útil para rechazo selectivo",
):
    print(line)


In [None]:
# Visualisation: uncertainty distributions (top row) and ROC curves (bottom row),
# one column per method.
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

methods_to_plot = ['mc_dropout', 'decoder_variance']
colors_methods = {'mc_dropout': 'blue', 'decoder_variance': 'green'}

for idx, method_name in enumerate(methods_to_plot):
    ax_dist, ax_roc = axes[0, idx], axes[1, idx]
    pretty_name = method_name.replace("_", " ").title()

    df = pd.read_csv(OUTPUT_DIR / f'eval_{method_name}.csv')

    # Row 1: uncertainty histograms for TP vs FP, with their means marked.
    if len(df) > 0 and 'uncertainty' in df.columns:
        unc_tp = df.loc[df['is_tp'] == 1, 'uncertainty'].values
        unc_fp = df.loc[df['is_tp'] == 0, 'uncertainty'].values
        groups = (('TP', unc_tp, 'green'), ('FP', unc_fp, 'red'))

        for tag, values, color in groups:
            ax_dist.hist(values, bins=50, alpha=0.6, label=f'{tag} (n={len(values)})', color=color, density=True)
        for tag, values, color in groups:
            ax_dist.axvline(values.mean(), color=color, linestyle='--', linewidth=2, label=f'Mean {tag}: {values.mean():.4f}')

        ax_dist.set_xlabel('Uncertainty', fontsize=11)
        ax_dist.set_ylabel('Density', fontsize=11)
        ax_dist.set_title(f'{pretty_name}\nDistribución de Incertidumbre', fontsize=12, fontweight='bold')
        ax_dist.legend(fontsize=9)
        ax_dist.grid(alpha=0.3)

    # Row 2: ROC curve of "uncertainty predicts FP", only if an AUROC was computed.
    if method_name in uncertainty_auroc:
        is_fp = 1 - df['is_tp'].values
        fpr, tpr, thresholds = roc_curve(is_fp, df['uncertainty'].values)
        auroc = uncertainty_auroc[method_name]['auroc']

        ax_roc.plot(fpr, tpr, linewidth=2, label=f'AUROC = {auroc:.4f}', color=colors_methods[method_name])
        ax_roc.plot([0, 1], [0, 1], 'k--', alpha=0.3, label='Random (0.5)')
        ax_roc.set_xlabel('False Positive Rate', fontsize=11)
        ax_roc.set_ylabel('True Positive Rate', fontsize=11)
        ax_roc.set_title(f'{pretty_name}\nROC Curve (FP Detection)', fontsize=12, fontweight='bold')
        ax_roc.legend(fontsize=10)
        ax_roc.grid(alpha=0.3)
        ax_roc.set_xlim([0, 1])
        ax_roc.set_ylim([0, 1])

plt.tight_layout()
plt.savefig(OUTPUT_DIR / 'uncertainty_analysis.png', dpi=150, bbox_inches='tight')
print(f"\nVisualización de incertidumbre guardada en: {OUTPUT_DIR / 'uncertainty_analysis.png'}")
plt.close()


## 14. Risk-Coverage Analysis

In [None]:
def compute_risk_coverage(df, uncertainty_col='uncertainty'):
    """Compute the risk-coverage curve of a set of detections.

    Detections are ranked from most to least confident (lowest uncertainty
    first). For every prefix of that ranking, coverage is the fraction of
    detections kept and risk is the error rate (1 - mean of ``is_tp``) among
    the kept detections.

    Args:
        df: DataFrame with an ``is_tp`` column (1 for true positive, 0 for
            false positive) and the column named by ``uncertainty_col``.
        uncertainty_col: Name of the column holding the uncertainty score;
            higher values mean less confident detections.

    Returns:
        Tuple ``(coverages, risks, auc)`` where ``coverages`` and ``risks``
        are lists of floats with one entry per detection and ``auc`` is the
        trapezoidal area under the risk-coverage curve (lower is better).
        An empty ``df`` yields ``([], [], 0.0)``.
    """
    n_total = len(df)
    if n_total == 0:
        return [], [], 0.0

    # Keep the most confident detections first: a useful uncertainty score
    # then gives low risk at low coverage, so a smaller AUC is better.
    # A stable sort keeps tied scores in their original order.
    df_sorted = df.sort_values(uncertainty_col, ascending=True, kind='stable')
    is_tp = df_sorted['is_tp'].to_numpy(dtype=float)

    # Prefix means via a cumulative sum instead of re-averaging every prefix.
    n_kept = np.arange(1, n_total + 1)
    coverages = n_kept / n_total
    risks = 1.0 - np.cumsum(is_tp) / n_kept

    # np.trapz is deprecated since NumPy 2.0 in favour of np.trapezoid.
    trapezoid = getattr(np, 'trapezoid', None) or np.trapz
    auc = float(trapezoid(risks, coverages))

    return coverages.tolist(), risks.tolist(), auc

# Risk-coverage curves for the uncertainty-aware methods: one panel per base
# method, each showing the raw variant and its temperature-scaled (_ts) variant.
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

methods_with_uncertainty = ['mc_dropout', 'mc_dropout_ts', 'decoder_variance', 'decoder_variance_ts']
colors = ['blue', 'cyan', 'red', 'orange']

risk_coverage_results = {}

for ax, method_name in zip(axes, ['mc_dropout', 'decoder_variance']):
    variant_colors = {method_name: 'blue', f'{method_name}_ts': 'red'}

    for variant, color in variant_colors.items():
        df = pd.read_csv(OUTPUT_DIR / f'eval_{variant}.csv')

        # Nothing to plot without rows or without an uncertainty column.
        if len(df) == 0 or 'uncertainty' not in df.columns:
            continue

        coverages, risks, auc = compute_risk_coverage(df, 'uncertainty')
        risk_coverage_results[variant] = {
            'coverages': coverages,
            'risks': risks,
            'auc': auc
        }

        label = variant.replace('_', ' ').title()
        ax.plot(coverages, risks, label=f'{label} (AUC={auc:.3f})', color=color, linewidth=2)

    ax.set_xlabel('Coverage', fontsize=12)
    ax.set_ylabel('Risk (1 - Accuracy)', fontsize=12)
    ax.set_title(f'Risk-Coverage: {method_name.replace("_", " ").title()}', fontsize=14)
    ax.legend(fontsize=10)
    ax.grid(alpha=0.3)

plt.tight_layout()
curves_path = OUTPUT_DIR / 'risk_coverage_curves.png'
plt.savefig(curves_path, dpi=150, bbox_inches='tight')
print(f"Risk-coverage curves guardadas en: {curves_path}")
plt.close()

# Persist only the scalar AUC of each variant.
auc_summary = {}
for variant_name, curve in risk_coverage_results.items():
    auc_summary[variant_name] = curve['auc']

with open(OUTPUT_DIR / 'risk_coverage_auc.json', 'w') as f:
    json.dump(auc_summary, f, indent=2)

print("\nRisk-Coverage AUC:")
for method, auc in auc_summary.items():
    print(f"  {method}: {auc:.4f}")

## 14. Resumen Final y Reporte

In [None]:
print("\n" + "="*80)
print("RESUMEN FINAL - COMPARACIÓN DE MÉTODOS")
print("="*80)


def _load_json_artifact(filename):
    """Read a JSON artifact from OUTPUT_DIR, closing the file handle afterwards."""
    with open(OUTPUT_DIR / filename) as f:
        return json.load(f)


# Load every metric file written by the earlier sections.
det_metrics = _load_json_artifact('detection_metrics.json')
cal_metrics = _load_json_artifact('calibration_metrics.json')
temps = _load_json_artifact('temperatures.json')
auc_summary = _load_json_artifact('risk_coverage_auc.json')
uncertainty_auroc_data = _load_json_artifact('uncertainty_auroc.json')

# The six evaluated methods, in reporting order.
all_methods = ['baseline', 'baseline_ts', 'mc_dropout', 'mc_dropout_ts', 'decoder_variance', 'decoder_variance_ts']

print("\n1. MÉTRICAS DE DETECCIÓN (mAP@[0.5:0.95])")
print("-" * 80)
for method in all_methods:
    # Missing detection metrics are reported as 0.0 rather than raising.
    mAP = det_metrics[method].get('mAP', 0.0)
    AP50 = det_metrics[method].get('AP50', 0.0)
    AP75 = det_metrics[method].get('AP75', 0.0)
    print(f"{method:25s}  mAP={mAP:.4f}  AP50={AP50:.4f}  AP75={AP75:.4f}")

print("\n2. MÉTRICAS DE CALIBRACIÓN")
print("-" * 80)
print(f"{'Method':<25s} {'NLL ↓':>10s} {'Brier ↓':>10s} {'ECE ↓':>10s}")
print("-" * 80)
for method in all_methods:
    nll = cal_metrics[method]['NLL']
    brier = cal_metrics[method]['Brier']
    ece = cal_metrics[method]['ECE']
    print(f"{method:<25s} {nll:>10.4f} {brier:>10.4f} {ece:>10.4f}")

print("\n3. TEMPERATURAS OPTIMIZADAS")
print("-" * 80)
# Temperatures are stored under the uncalibrated base method names only.
for method in ['baseline', 'mc_dropout', 'decoder_variance']:
    T = temps[method]['T']
    nll_before = temps[method]['nll_before']
    nll_after = temps[method]['nll_after']
    improvement = nll_before - nll_after
    print(f"{method:20s}  T={T:.4f}  NLL: {nll_before:.4f} → {nll_after:.4f} (Δ={improvement:.4f})")

print("\n4. RISK-COVERAGE AUC (menor es mejor)")
print("-" * 80)
for method, auc in auc_summary.items():
    print(f"{method:25s}  AUC={auc:.4f}")

print("\n5. INCERTIDUMBRE: AUROC TP vs FP (mayor es mejor)")
print("-" * 80)
print(f"{'Method':<25s} {'AUROC ↑':>10s} {'Mean Unc TP':>15s} {'Mean Unc FP':>15s} {'Ratio':>10s}")
print("-" * 80)
for method, data in uncertainty_auroc_data.items():
    auroc = data['auroc']
    mean_tp = data['mean_unc_tp']
    mean_fp = data['mean_unc_fp']
    # Guard against division by zero when the mean TP uncertainty is not positive.
    ratio = mean_fp / mean_tp if mean_tp > 0 else 0
    print(f"{method:<25s} {auroc:>10.4f} {mean_tp:>15.6f} {mean_fp:>15.6f} {ratio:>10.2f}x")

print("\n" + "="*80)
print("CONCLUSIONES")
print("="*80)
print("✓ Baseline: rendimiento de referencia sin incertidumbre")
print("✓ Temperature Scaling: mejora calibración sin afectar mAP")
print("✓ MC-Dropout: proporciona incertidumbre epistémica (K pases)")
print("✓ Decoder variance: incertidumbre en single-pass (más eficiente)")
print("✓ Métodos+TS: mejor calibración manteniendo detección")
print("✓ AUROC TP vs FP: valida que incertidumbre detecta errores")
print("="*80)

# Bundle the configuration and every loaded metric into a single report file.
final_report = {
    'timestamp': datetime.now().isoformat(),
    'config': CONFIG,
    'detection_metrics': det_metrics,
    'calibration_metrics': cal_metrics,
    'temperatures': temps,
    'risk_coverage_auc': auc_summary,
    'uncertainty_auroc': uncertainty_auroc_data
}

with open(OUTPUT_DIR / 'final_report.json', 'w') as f:
    json.dump(final_report, f, indent=2)

print(f"\nReporte final guardado en: {OUTPUT_DIR / 'final_report.json'}")
print(f"Todos los artefactos en: {OUTPUT_DIR}")

## 15. Visualización Final Comparativa

In [None]:
def _multiline_labels(names):
    """Return tick labels with each underscore replaced by a line break."""
    return [name.replace('_', '\n') for name in names]


def _bar_panel(ax, names, values, bar_colors, ylabel, title, tick_fontsize,
               ylabel_fontsize=11, title_fontsize=12, hlines=(), legend_kwargs=None):
    """Draw one bar chart of `values` per entry of `names` on `ax` and return the bars.

    `hlines` is a sequence of keyword dicts forwarded to `ax.axhline`;
    a legend is drawn only when `legend_kwargs` is not None.
    """
    positions = range(len(names))
    drawn = ax.bar(positions, values, color=bar_colors, alpha=0.7)
    for line_kwargs in hlines:
        ax.axhline(**line_kwargs)
    ax.set_xticks(positions)
    ax.set_xticklabels(_multiline_labels(names), fontsize=tick_fontsize)
    ax.set_ylabel(ylabel, fontsize=ylabel_fontsize)
    ax.set_title(title, fontsize=title_fontsize, fontweight='bold')
    if legend_kwargs is not None:
        ax.legend(**legend_kwargs)
    ax.grid(axis='y', alpha=0.3)
    return drawn


def _annotate_bars(ax, drawn, values, template, fontsize):
    """Write the formatted value of every bar centered on its top edge."""
    for rect, value in zip(drawn, values):
        ax.text(rect.get_x() + rect.get_width()/2., rect.get_height(), template.format(value),
                ha='center', va='bottom', fontsize=fontsize)


fig = plt.figure(figsize=(20, 14))
gs = fig.add_gridspec(4, 3, hspace=0.3, wspace=0.3)

methods = ['baseline', 'baseline_ts', 'mc_dropout', 'mc_dropout_ts', 'decoder_variance', 'decoder_variance_ts']
colors_map = ['lightblue', 'blue', 'lightcoral', 'red', 'lightgreen', 'green']

# Row 0: mAP of every method.
ax1 = fig.add_subplot(gs[0, :])
mAPs = [det_metrics[m].get('mAP', 0.0) for m in methods]
bars = _bar_panel(ax1, methods, mAPs, colors_map, 'mAP@[0.5:0.95]',
                  'Comparación de mAP entre Métodos', tick_fontsize=10,
                  ylabel_fontsize=12, title_fontsize=14)
_annotate_bars(ax1, bars, mAPs, '{:.3f}', fontsize=9)

# Row 1: one panel per calibration metric (metric key, y-label, title).
calibration_panels = [
    ('NLL', 'NLL ↓', 'Negative Log-Likelihood'),
    ('Brier', 'Brier Score ↓', 'Brier Score'),
    ('ECE', 'ECE ↓', 'Expected Calibration Error'),
]
for column, (metric_key, metric_ylabel, metric_title) in enumerate(calibration_panels):
    ax_cal = fig.add_subplot(gs[1, column])
    metric_values = [cal_metrics[m][metric_key] for m in methods]
    _bar_panel(ax_cal, methods, metric_values, colors_map, metric_ylabel, metric_title, tick_fontsize=8)

# Row 2, left: fitted temperature of each base method against the T=1 reference.
ax5 = fig.add_subplot(gs[2, 0])
base_methods = ['baseline', 'mc_dropout', 'decoder_variance']
Ts = [temps[m]['T'] for m in base_methods]
_bar_panel(ax5, base_methods, Ts, ['blue', 'red', 'green'], 'Temperature T',
           'Temperaturas Óptimas', tick_fontsize=10,
           hlines=[dict(y=1.0, color='black', linestyle='--', alpha=0.5, label='T=1 (sin calibrar)')],
           legend_kwargs={})

# Row 2, middle: risk-coverage AUC (methods missing from the summary plot as 0.0).
ax6 = fig.add_subplot(gs[2, 1])
unc_methods = ['mc_dropout', 'mc_dropout_ts', 'decoder_variance', 'decoder_variance_ts']
aucs = [auc_summary.get(m, 0.0) for m in unc_methods]
colors_unc = ['lightcoral', 'red', 'lightgreen', 'green']
bars_auc = _bar_panel(ax6, unc_methods, aucs, colors_unc, 'AUC (Risk-Coverage) ↓',
                      'Risk-Coverage AUC', tick_fontsize=9)
_annotate_bars(ax6, bars_auc, aucs, '{:.3f}', fontsize=8)

# Row 2, right: AUROC of uncertainty as a TP-vs-FP detector.
ax7 = fig.add_subplot(gs[2, 2])
auroc_methods = list(uncertainty_auroc_data.keys())
aurocs = [uncertainty_auroc_data[m]['auroc'] for m in auroc_methods]
colors_auroc = ['lightcoral', 'red', 'lightgreen', 'green']
bars_auroc = _bar_panel(ax7, auroc_methods, aurocs, colors_auroc, 'AUROC (FP detection) ↑',
                        'AUROC: Detección de Errores', tick_fontsize=9,
                        hlines=[dict(y=0.5, color='black', linestyle='--', alpha=0.5, label='Random'),
                                dict(y=0.7, color='orange', linestyle='--', alpha=0.5, label='Good threshold')],
                        legend_kwargs={'fontsize': 8})
ax7.set_ylim([0, 1])
_annotate_bars(ax7, bars_auroc, aurocs, '{:.3f}', fontsize=8)

# Row 3: ratio of mean FP uncertainty to mean TP uncertainty per method.
ax8 = fig.add_subplot(gs[3, :])
ratios = []
for m in auroc_methods:
    unc_stats = uncertainty_auroc_data[m]
    # A non-positive TP mean would make the ratio undefined; report 0 instead.
    if unc_stats['mean_unc_tp'] > 0:
        ratios.append(unc_stats['mean_unc_fp'] / unc_stats['mean_unc_tp'])
    else:
        ratios.append(0)
bars_ratio = _bar_panel(ax8, auroc_methods, ratios, colors_auroc, 'Ratio Mean(Unc_FP) / Mean(Unc_TP)',
                        'Ratio de Incertidumbre: FP vs TP (>1 es deseable)', tick_fontsize=10,
                        hlines=[dict(y=1.0, color='black', linestyle='--', alpha=0.5,
                                     label='Ratio = 1 (sin diferencia)')],
                        legend_kwargs={})
_annotate_bars(ax8, bars_ratio, ratios, '{:.2f}x', fontsize=9)

plt.suptitle('Fase 5: Comparación Completa de Métodos de Incertidumbre y Calibración',
             fontsize=16, fontweight='bold', y=0.997)

summary_path = OUTPUT_DIR / 'final_comparison_summary.png'
plt.savefig(summary_path, dpi=150, bbox_inches='tight')
print(f"\nVisualización final guardada en: {summary_path}")
plt.close()

separator = "=" * 80
print("\n" + separator)
print("FASE 5 COMPLETADA")
print(separator)
print(f"Todos los resultados guardados en: {OUTPUT_DIR}")
print("\nArchivos generados:")
generated_files = (
    'config.yaml',
    'temperatures.json',
    'detection_metrics.json',
    'calibration_metrics.json',
    'risk_coverage_auc.json',
    'uncertainty_auroc.json',
    'uncertainty_auroc_comparison.csv',
    'final_report.json',
    'detection_comparison.csv',
    'calibration_comparison.csv',
    'reliability_diagrams.png',
    'risk_coverage_curves.png',
    'uncertainty_analysis.png',
    'final_comparison_summary.png',
)
for generated_name in generated_files:
    print(f"  - {generated_name}")
print(separator)