In [6]:
import os
import torch
import torchvision
from torch.utils.data import DataLoader, Dataset
from torchvision.models.detection import fasterrcnn_resnet50_fpn
from torchvision.transforms import functional as F
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval
from PIL import Image
from tqdm import tqdm
import numpy as np
import json
import time


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Dataset personalizado
class CocoDataset(Dataset):
    def __init__(self, img_dir, ann_file, transforms=None):
        self.img_dir = img_dir
        self.coco = COCO(ann_file)
        self.ids = list(self.coco.imgs.keys())
        self.transforms = transforms

    def __getitem__(self, index):
        coco = self.coco
        img_id = self.ids[index]
        ann_ids = coco.getAnnIds(imgIds=img_id)
        anns = coco.loadAnns(ann_ids)

        # Si no hay anotaciones, pasar a la siguiente imagen
        if len(anns) == 0:
            return self.__getitem__((index + 1) % len(self.ids))

        path = coco.loadImgs(img_id)[0]['file_name']
        img = Image.open(os.path.join(self.img_dir, path)).convert("RGB")

        boxes, labels, areas, iscrowd = [], [], [], []

        for ann in anns:
            x, y, w, h = ann['bbox']
            if w <= 0 or h <= 0:
                continue  # omitir cajas inválidas
            boxes.append([x, y, x + w, y + h])
            labels.append(ann['category_id'])
            areas.append(ann['area'])
            iscrowd.append(ann.get('iscrowd', 0))

        if len(boxes) == 0:
            return self.__getitem__((index + 1) % len(self.ids))

        target = {
            'boxes': torch.tensor(boxes, dtype=torch.float32),
            'labels': torch.tensor(labels, dtype=torch.int64),
            'image_id': torch.tensor([img_id]),
            'area': torch.tensor(areas, dtype=torch.float32),
            'iscrowd': torch.tensor(iscrowd, dtype=torch.int64)
        }

        if self.transforms:
            img = self.transforms(img)

        return img, target

    def __len__(self):
        return len(self.ids)

# Transforms
transform = torchvision.transforms.Compose([
    torchvision.transforms.Resize((512, 512)),
    torchvision.transforms.ToTensor()
])


# Carga de datos
train_dataset = CocoDataset("modelo_yolov11_dataset_completo/train/images", "modelo_yolov11_dataset_completo/train/train_coco.json", transform)
val_dataset = CocoDataset("modelo_yolov11_dataset_completo/val/images", "modelo_yolov11_dataset_completo/val/val_coco.json", transform)

train_subset = torch.utils.data.Subset(train_dataset, list(range(0, len(train_dataset), 4)))  # 25% del dataset
train_loader = DataLoader(train_subset, batch_size=2, shuffle=True, collate_fn=lambda x: tuple(zip(*x)))
val_loader = DataLoader(val_dataset, batch_size=2, shuffle=False, collate_fn=lambda x: tuple(zip(*x)))

##train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=lambda x: tuple(zip(*x)))
##val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False, collate_fn=lambda x: tuple(zip(*x)))

# Modelo
model = fasterrcnn_resnet50_fpn(num_classes=len(train_dataset.coco.getCatIds()) + 1)
model.to(device)

# Optimizer
params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(params, lr=0.005, momentum=0.9, weight_decay=0.0005)

# Entrenamiento
def train_one_epoch(model, optimizer, data_loader):
    model.train()
    for images, targets in tqdm(data_loader):
        images = list(img.to(device) for img in images)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        loss_dict = model(images, targets)
        losses = sum(loss for loss in loss_dict.values())

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

# Validación
def evaluate(model, data_loader, coco_gt):
    model.eval()
    results = []
    for images, targets in tqdm(data_loader):
        images = list(img.to(device) for img in images)
        outputs = model(images)
        for target, output in zip(targets, outputs):
            boxes = output['boxes'].cpu().detach().numpy()
            scores = output['scores'].cpu().detach().numpy()
            labels = output['labels'].cpu().detach().numpy()
            image_id = int(target['image_id'])

            for box, score, label in zip(boxes, scores, labels):
                x_min, y_min, x_max, y_max = box
                results.append({
                    "image_id": image_id,
                    "category_id": int(label),
                    "bbox": [x_min, y_min, x_max - x_min, y_max - y_min],
                    "score": float(score)
                })

    # Guardar predicciones temporales
    with open("predictions.json", "w") as f:
        json.dump(results, f, indent=2)

    coco_dt = coco_gt.loadRes("predictions.json")
    coco_eval = COCOeval(coco_gt, coco_dt, iouType='bbox')
    coco_eval.evaluate()
    coco_eval.accumulate()
    coco_eval.summarize()

    # Métricas por clase
    cat_ids = coco_gt.getCatIds()
    for i, cat_id in enumerate(cat_ids):
        print(f"Clase: {coco_gt.loadCats(cat_id)[0]['name']}")
        precision = coco_eval.eval['precision'][:, i, :, 0, -1]
        recall = coco_eval.eval['recall'][:, i, 0, 0, -1]
        print(f"  Precision media: {np.mean(precision[precision > -1]):.4f}")
        print(f"  Recall media:    {np.mean(recall[recall > -1]):.4f}")

# Entrenamiento por epochs
for epoch in range(2):
    print(f"\nEpoch {epoch+1}")
    train_one_epoch(model, optimizer, train_loader)
    evaluate(model, val_loader, val_dataset.coco)
    print(f"⏱ Tiempo por epoch: {(time.time() - start)/60:.2f} minutos")


# Guardar modelo
torch.save(model.state_dict(), "fasterrcnn_model.pth")


loading annotations into memory...
Done (t=0.05s)
creating index...
index created!
loading annotations into memory...
Done (t=0.01s)
creating index...
index created!

Epoch 1


100%|██████████| 1030/1030 [32:47<00:00,  1.91s/it]
100%|██████████| 492/492 [12:06<00:00,  1.48s/it]


Loading and preparing results...


IndexError: list index out of range

In [7]:
for i in range(5):
    img, target = train_dataset[i]
    print(f"Imagen {i}")
    print("Cajas:", target["boxes"])
    print("Etiquetas:", target["labels"])


Imagen 0
Cajas: tensor([[1273.2336,  458.2143, 1462.0944,  639.3076]])
Etiquetas: tensor([0])
Imagen 1
Cajas: tensor([[1273.2336,  458.2143, 1462.0944,  639.3076]])
Etiquetas: tensor([0])
Imagen 2
Cajas: tensor([[1273.2336,  458.2143, 1462.0944,  639.3076]])
Etiquetas: tensor([0])
Imagen 3
Cajas: tensor([[1273.2336,  458.2143, 1462.0944,  639.3076]])
Etiquetas: tensor([0])
Imagen 4
Cajas: tensor([[1273.2336,  458.2143, 1462.0944,  639.3076]])
Etiquetas: tensor([0])
