Monitorear el entrenamiento con tensorboard (tensorboard --logdir=runs) en el directorio.

In [1]:
import os
import torch
from torchvision.models.detection import fasterrcnn_resnet50_fpn
from torchvision.datasets import CocoDetection
from torch.utils.data import DataLoader
from torch import optim
from torch import nn
from torch.utils.tensorboard import SummaryWriter
from pycocotools.coco import COCO
import zipfile
import urllib.request
import torchvision.transforms as transforms

os.environ['KMP_DUPLICATE_LIB_OK']='True'
torch.autograd.set_detect_anomaly(True)
torch.cuda.empty_cache()

In [2]:
class MyTransform:
    def __init__(self, transform):
        self.transform = transform

    def __call__(self, image, target):
        return self.transform(image), target


In [3]:
def collate_fn(batch):
    return tuple(zip(*batch))

In [4]:
def targets_od(images, targets_tuple, device):
    TARGETS = []
    IMAGES = []
    for img, t_list in zip(images, targets_tuple):
        bbox = []
        category_id = []
        d = {}
        for t_dict in t_list:
            if list(torch.tensor(t_dict.get('bbox', [])).to(device).size()) != [0]:  # Check the size of bbox
                for k, v in t_dict.items():
                    if k == 'bbox':
                        # Convert box from [x, y, width, height] to [x1, y1, x2, y2]
                        converted_box = [v[0], v[1], v[0] + v[2], v[1] + v[3]]
                        bbox.append(converted_box)
                    elif k == 'category_id':
                        category_id.append(v)
        if bbox:  # if bbox list is not empty
            d['boxes'] = torch.tensor(bbox).to(device)
            d['labels'] = torch.tensor(category_id).to(device)
            TARGETS.append(d)
            IMAGES.append(img.to(device))
        else:
            if len(TARGETS) > 0:  # if there is any previous image in the batch
                TARGETS.append(TARGETS[-1])  # repeat the last valid target
                IMAGES.append(IMAGES[-1])  # repeat the last valid image
    return IMAGES, TARGETS


In [5]:
def main():
    writer = SummaryWriter('runs/experiment_1')
    transform = MyTransform(transforms.Compose([
                transforms.Resize((800,800)),
                transforms.ToTensor(), 
                transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
                ]))


    # Load the COCO dataset
    train_dataset = CocoDetection(root='COCO/train2017',
                                  annFile='COCO/annotations/instances_train2017.json', transforms=transform)
    val_dataset = CocoDetection(root='COCO/val2017',
                                annFile='COCO/annotations/instances_val2017.json', transforms=transform)


    # Create data loaders
    train_loader = torch.utils.data.DataLoader(
    train_dataset, 
    batch_size=3, 
    shuffle=False, 
    num_workers=0,
    collate_fn=collate_fn
    )

    val_loader = torch.utils.data.DataLoader(
    val_dataset, 
    batch_size=1, 
    shuffle=False, 
    num_workers=0,
    collate_fn=collate_fn
    )

    # Define the model
    model = fasterrcnn_resnet50_fpn(pretrained=True)

    # Move model to gpu if available
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    model.to(device)

    # Set the optimizer and the loss function
    optimizer = optim.SGD(model.parameters(), lr=0.00001, momentum=0.9, weight_decay=0.0005)

    # Checkpoint saving path
    ckpt_path = 'checkpoints/model_ckpt.pt'
    start_epoch = 0
    # Load the checkpoint if it exists
    if os.path.exists(ckpt_path):
        checkpoint = torch.load(ckpt_path)
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        start_epoch = checkpoint['epoch']
        loss = checkpoint['loss']
        print(f'Loaded checkpoint from epoch {start_epoch}')
    else:
        print("No checkpoint found, starting from scratch.")

    # Entrenar la red y guardar las pérdidas para la visualización
    train_losses = []
    val_losses = []
    val_accuracies = []
    best_val_loss = float("inf")
    epochs_no_improve = 0
    n_epochs_stop = 15

    for epoch in range(start_epoch, 4):
        running_loss = 0.0
        for i, (images, targets) in enumerate(train_loader):
            images, targets = targets_od(images, targets, device)
            
            # Forward pass
            loss_dict = model(images, targets)
            total_loss = sum(loss for loss in loss_dict.values())

            # Backward pass and optimization
            total_loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            # print statistics
            running_loss += total_loss
            if i % 100 == 99 or i == len(train_loader) - 1:  # Asume que la indexación comienza en 0
                avg_loss = running_loss / (i+1)
                print('[%d, %5d] loss: %.3f' % (epoch + 1, i + 1, avg_loss))
                train_losses.append(avg_loss)
                writer.add_scalar('training loss', avg_loss, epoch * len(train_loader) + i)
                running_loss = 0.0

        # Validation loss
        val_loss = 0.0
        correct = 0
        total = 0
        model.eval()
        with torch.no_grad():
            for images, targets in val_loader:
                images, targets = targets_od(images, targets, device)

                loss_dict = model(images, targets)
                total_vloss = sum(loss for loss in loss_dict.values())
                val_loss += total_vloss.item()

                # Calculate validation accuracy
                # Use IOU or some other suitable metric for object detection
                # ...

        avg_val_loss = val_loss / len(val_loader)
        print(f'Validation loss: {avg_val_loss:.3f}')
        val_losses.append(avg_val_loss)

        writer.add_scalar('validation loss', avg_val_loss, epoch)

        if epoch % 10 == 0:
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': loss,
            }, f'checkpoints/checkpoint{epoch}.pth')
            print(f'Saved checkpoint at epoch {epoch}')

        # Save the last checkpoint
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': loss,
        }, 'checkpoints/latest_checkpoint.pth')

        # Early stopping
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            torch.save(model.state_dict(), 'best_model.pth')
            epochs_no_improve = 0
        else:
            epochs_no_improve += 1
            if epochs_no_improve == n_epochs_stop:
                print('Early stopping!')
                model.load_state_dict(torch.load('best_model.pth'))
                break

        model.train()

    print('Finished Training')

    # Plotting the training and validation loss
    plt.figure(figsize=(10, 5))
    plt.plot(train_losses, label='Training Loss')
    plt.plot(val_losses, label='Validation Loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.show()

    writer.close()
     
        
if __name__ == "__main__":
    main()


loading annotations into memory...
Done (t=10.42s)
creating index...
index created!
loading annotations into memory...
Done (t=0.36s)
creating index...
index created!


  f"The parameter '{pretrained_param}' is deprecated since 0.13 and may be removed in the future, "


No checkpoint found, starting from scratch.
[1,   100] loss: 1.687
[1,   200] loss: 0.634
[1,   300] loss: 0.396
[1,   400] loss: 0.260
[1,   500] loss: 0.208
[1,   600] loss: 0.162
[1,   700] loss: 0.152
[1,   800] loss: 0.123
[1,   900] loss: 0.108
[1,  1000] loss: 0.095
[1,  1100] loss: 0.090
[1,  1200] loss: 0.085
[1,  1300] loss: 0.079
[1,  1400] loss: 0.072
[1,  1500] loss: 0.070
[1,  1600] loss: 0.063
[1,  1700] loss: 0.060
[1,  1800] loss: 0.050
[1,  1900] loss: 0.050
[1,  2000] loss: 0.047
[1,  2100] loss: 0.044
[1,  2200] loss: 0.044
[1,  2300] loss: 0.043
[1,  2400] loss: 0.040
[1,  2500] loss: 0.040
[1,  2600] loss: 0.040
[1,  2700] loss: 0.037
[1,  2800] loss: 0.035
[1,  2900] loss: 0.035
[1,  3000] loss: 0.033
[1,  3100] loss: 0.030
[1,  3200] loss: 0.029
[1,  3300] loss: 0.030
[1,  3400] loss: 0.029
[1,  3500] loss: 0.028
[1,  3600] loss: 0.028
[1,  3700] loss: 0.028
[1,  3800] loss: 0.027
[1,  3900] loss: 0.025
[1,  4000] loss: 0.024
[1,  4100] loss: 0.024
[1,  4200] lo

AssertionError: All bounding boxes should have positive height and width. Found invalid box [296.6499938964844, 388.3299865722656, 297.67999267578125, 388.3299865722656] for target at index 1.