https://www.kaggle.com/datasets/awsaf49/bdd100k-dataset
https://www.kaggle.com/datasets/solesensei/solesensei_bdd100k?resource=download  (este use)

In [3]:
import os
HOME = os.getcwd()
print(HOME)  
BDD100K_DIR = os.path.join(HOME, "bdd100k")
os.makedirs(BDD100K_DIR, exist_ok=True)
print(BDD100K_DIR, "; exist:", os.path.exists(BDD100K_DIR))

c:\Users\SP1VEVW\Desktop\projects\OVD-Model-ADAS\data
c:\Users\SP1VEVW\Desktop\projects\OVD-Model-ADAS\data\bdd100k ; exist: True


In [None]:
#pip install pycocotools

Collecting pycocotools
  Using cached pycocotools-2.0.10-cp312-abi3-win_amd64.whl.metadata (1.3 kB)
Using cached pycocotools-2.0.10-cp312-abi3-win_amd64.whl (76 kB)
Installing collected packages: pycocotools
Successfully installed pycocotools-2.0.10
Note: you may need to restart the kernel to use updated packages.


In [4]:
import json
import os
from pathlib import Path
from datetime import datetime
import random
from pycocotools.coco import COCO

# ====== 1. VERIFICAR DATOS ======
print("="*50)
print("1. VERIFICANDO DATOS")
print("="*50)

# Rutas
VAL_IMAGES_DIR = os.path.join(BDD100K_DIR, "bdd100k/bdd100k/images/100k/val")
VAL_LABELS_FILE = os.path.join(BDD100K_DIR, "bdd100k_labels_release/bdd100k/labels/bdd100k_labels_images_val.json")

# Verificar existencia
print(f"\nImágenes Val: {os.path.exists(VAL_IMAGES_DIR)}")
print(f"Labels Val: {os.path.exists(VAL_LABELS_FILE)}")

# Contar imágenes
val_images = [f for f in os.listdir(VAL_IMAGES_DIR) if f.endswith('.jpg')]
print(f"Total imágenes Val: {len(val_images)}")

# Cargar labels
with open(VAL_LABELS_FILE, 'r') as f:
    val_labels = json.load(f)
print(f"Total anotaciones Val: {len(val_labels)}")

# Categorías en BDD100K
categories_bdd = {}
for item in val_labels[:500]:  # Verificar primeras 500
    for label in item.get('labels', []):
        cat = label.get('category')
        if cat:
            categories_bdd[cat] = categories_bdd.get(cat, 0) + 1

print(f"\nCategorías encontradas: {len(categories_bdd)}")
for cat, count in sorted(categories_bdd.items(), key=lambda x: x[1], reverse=True)[:10]:
    print(f"  - {cat}: {count}")

# ====== 2. CONVERTIR A FORMATO COCO ======
print("\n" + "="*50)
print("2. CONVIRTIENDO A FORMATO COCO")
print("="*50)

# Mapeo de categorías BDD100K a IDs
# Nota: BDD100K usa 'bike', no 'bicycle'
CATEGORY_MAP = {
    'person': 1, 'rider': 2, 'car': 3, 'truck': 4, 'bus': 5,
    'train': 6, 'motorcycle': 7, 'bicycle': 8, 'traffic light': 9,
    'traffic sign': 10
}

# Alias para categorías alternativas
CATEGORY_ALIASES = {
    'bike': 'bicycle'
}

def convert_to_coco(bdd_labels, images_dir, split_name):
    """Convierte BDD100K a formato COCO"""
    coco_format = {
        "info": {
            "description": f"BDD100K {split_name} Dataset - COCO Format",
            "version": "1.0",
            "year": 2024,
            "date_created": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        },
        "licenses": [],
        "images": [],
        "annotations": [],
        "categories": []
    }
    
    # Categorías
    for cat_name, cat_id in CATEGORY_MAP.items():
        coco_format["categories"].append({
            "id": cat_id,
            "name": cat_name,
            "supercategory": "object"
        })
    
    annotation_id = 0
    
    # Procesar cada imagen
    for img_idx, item in enumerate(bdd_labels):
        img_name = item['name']
        img_path = os.path.join(images_dir, img_name)
        
        # Verificar que la imagen existe
        if not os.path.exists(img_path):
            continue
        
        # Información de imagen
        image_info = {
            "id": img_idx,
            "file_name": img_name,
            "width": 1280,  # BDD100K estándar
            "height": 720
        }
        coco_format["images"].append(image_info)
        
        # Procesar anotaciones
        for label in item.get('labels', []):
            category = label.get('category')
            
            # Aplicar alias si existe (ej: bike -> bicycle)
            if category in CATEGORY_ALIASES:
                category = CATEGORY_ALIASES[category]
            
            if category not in CATEGORY_MAP:
                continue
            
            box2d = label.get('box2d')
            if not box2d:
                continue
            
            # Calcular bbox COCO (x, y, width, height)
            x1 = box2d['x1']
            y1 = box2d['y1']
            x2 = box2d['x2']
            y2 = box2d['y2']
            
            width = x2 - x1
            height = y2 - y1
            area = width * height
            
            # Validar bbox
            if width <= 0 or height <= 0:
                continue
            
            annotation = {
                "id": annotation_id,
                "image_id": img_idx,
                "category_id": CATEGORY_MAP[category],
                "bbox": [x1, y1, width, height],
                "area": area,
                "iscrowd": 0,
                "segmentation": []
            }
            coco_format["annotations"].append(annotation)
            annotation_id += 1
    
    return coco_format

# Convertir dataset completo
print("\nConvirtiendo dataset completo...")
coco_val_full = convert_to_coco(val_labels, VAL_IMAGES_DIR, "validation_full")

print(f"Total imágenes procesadas: {len(coco_val_full['images'])}")
print(f"Total anotaciones: {len(coco_val_full['annotations'])}")

# ====== 3. DIVIDIR EN 80% TRAIN / 20% VAL ======
print("\n" + "="*50)
print("3. DIVIDIENDO EN 80% TRAIN / 20% VAL")
print("="*50)

# Mezclar imágenes
all_images = coco_val_full['images'].copy()
random.seed(42)  # Para reproducibilidad
random.shuffle(all_images)

# Calcular split
total_imgs = len(all_images)
train_size = int(total_imgs * 0.8)
val_size = total_imgs - train_size

train_images = all_images[:train_size]
val_images = all_images[train_size:]

print(f"\nTotal: {total_imgs} imágenes")
print(f"Train: {train_size} imágenes (80%)")
print(f"Val: {val_size} imágenes (20%)")

# Crear diccionarios de IDs
train_img_ids = {img['id'] for img in train_images}
val_img_ids = {img['id'] for img in val_images}

# Función para crear split
def create_split(images, img_ids, split_name):
    split_data = {
        "info": coco_val_full["info"].copy(),
        "licenses": coco_val_full["licenses"],
        "images": images,
        "annotations": [],
        "categories": coco_val_full["categories"]
    }
    split_data["info"]["description"] = f"BDD100K {split_name} Dataset - COCO Format"
    
    # Filtrar anotaciones
    for ann in coco_val_full['annotations']:
        if ann['image_id'] in img_ids:
            split_data['annotations'].append(ann)
    
    return split_data

# Crear splits
train_coco = create_split(train_images, train_img_ids, "train")
val_coco = create_split(val_images, val_img_ids, "val")

print(f"\nTrain: {len(train_coco['images'])} imgs, {len(train_coco['annotations'])} anns")
print(f"Val: {len(val_coco['images'])} imgs, {len(val_coco['annotations'])} anns")

# ====== 4. GUARDAR ARCHIVOS COCO ======
print("\n" + "="*50)
print("4. GUARDANDO ARCHIVOS COCO")
print("="*50)

# Crear directorio de salida
COCO_OUTPUT_DIR = os.path.join(HOME, "bdd100k_coco")
os.makedirs(COCO_OUTPUT_DIR, exist_ok=True)

# Guardar archivos
train_file = os.path.join(COCO_OUTPUT_DIR, "val_calib.json")
val_file = os.path.join(COCO_OUTPUT_DIR, "val_eval.json")

with open(train_file, 'w') as f:
    json.dump(train_coco, f)
print(f"\n✓ Train guardado: {train_file}")

with open(val_file, 'w') as f:
    json.dump(val_coco, f)
print(f"✓ Val guardado: {val_file}")

# ====== 5. ESTADÍSTICAS FINALES ======
print("\n" + "="*50)
print("5. ESTADÍSTICAS FINALES")
print("="*50)

def print_stats(data, name):
    print(f"\n{name}:")
    print(f"  Imágenes: {len(data['images'])}")
    print(f"  Anotaciones: {len(data['annotations'])}")
    print(f"  Promedio ann/img: {len(data['annotations'])/len(data['images']):.2f}")
    
    # Distribución por categoría
    cat_dist = {}
    for ann in data['annotations']:
        cat_id = ann['category_id']
        cat_name = next(c['name'] for c in data['categories'] if c['id'] == cat_id)
        cat_dist[cat_name] = cat_dist.get(cat_name, 0) + 1
    
    print("  Top 5 categorías:")
    for cat, count in sorted(cat_dist.items(), key=lambda x: x[1], reverse=True)[:5]:
        print(f"    - {cat}: {count}")

print_stats(train_coco, "TRAIN")
print_stats(val_coco, "VAL")

print("\n" + "="*50)
print("✓ PROCESO COMPLETADO")
print("="*50)
print(f"\nArchivos generados:")
print(f"1. {train_file}")
print(f"2. {val_file}")
print(f"\nAhora puedes usar COCOeval para calcular mAP, AP@50, F1, etc.")

# ====== 6. VALIDAR CON PYCOCOTOOLS ======
print("\n" + "="*50)
print("6. VALIDANDO CON PYCOCOTOOLS")
print("="*50)

# Validar archivo de calibración (80%)
print("\n✓ Validando val_calib.json (80%)...")
coco_calib = COCO(train_file)
print(f"  - Cargado exitosamente")
print(f"  - Imágenes: {len(coco_calib.getImgIds())}")
print(f"  - Categorías: {len(coco_calib.getCatIds())}")
print(f"  - Anotaciones: {len(coco_calib.getAnnIds())}")
    
# Validar archivo de evaluación (20%)
print("\n✓ Validando val_eval.json (20%)...")
coco_eval = COCO(val_file)
print(f"  - Cargado exitosamente")
print(f"  - Imágenes: {len(coco_eval.getImgIds())}")
print(f"  - Categorías: {len(coco_eval.getCatIds())}")
print(f"  - Anotaciones: {len(coco_eval.getAnnIds())}")
    
# Mostrar categorías
print("\n✓ Categorías disponibles:")
for cat in coco_calib.loadCats(coco_calib.getCatIds()):
    print(f"  - ID {cat['id']}: {cat['name']}")
    
print("\n✓ Archivos COCO validados correctamente con pycocotools")
print("  Listos para usar con COCOeval para calcular métricas")
    

1. VERIFICANDO DATOS

Imágenes Val: True
Labels Val: True
Total imágenes Val: 10000
Total anotaciones Val: 10000

Categorías encontradas: 12
  - car: 5062
  - lane: 3808
  - traffic sign: 1754
  - traffic light: 1374
  - drivable area: 873
  - person: 746
  - truck: 212
  - bus: 91
  - bike: 40
  - rider: 35

2. CONVIRTIENDO A FORMATO COCO

Convirtiendo dataset completo...
Total anotaciones Val: 10000

Categorías encontradas: 12
  - car: 5062
  - lane: 3808
  - traffic sign: 1754
  - traffic light: 1374
  - drivable area: 873
  - person: 746
  - truck: 212
  - bus: 91
  - bike: 40
  - rider: 35

2. CONVIRTIENDO A FORMATO COCO

Convirtiendo dataset completo...
Total imágenes procesadas: 10000
Total anotaciones: 185074

3. DIVIDIENDO EN 80% TRAIN / 20% VAL

Total: 10000 imágenes
Train: 8000 imágenes (80%)
Val: 2000 imágenes (20%)

Train: 8000 imgs, 148515 anns
Val: 2000 imgs, 36559 anns

4. GUARDANDO ARCHIVOS COCO
Total imágenes procesadas: 10000
Total anotaciones: 185074

3. DIVIDIENDO 