In [1]:
%matplotlib inline
import openpifpaf

# Display settings are assigned on the Canvas class itself, not on an instance.
# NOTE(review): presumably `show` makes canvases render their figure directly
# in the notebook — confirm against the openpifpaf.show.Canvas documentation.
openpifpaf.show.Canvas.show = True
# NOTE(review): presumably a lower bound on the DPI used for image figures — confirm.
openpifpaf.show.Canvas.image_min_dpi = 200

In [2]:
import numpy as np
import torch
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import torch
import cv2
import copy
import time

from openpifpaf import datasets, encoder, logger, network, optimize, plugin, show, visualizer,train

from openpifpaf.network import trainer

# Record the library versions this notebook was executed with.
print(f'OpenPifPaf version {openpifpaf.__version__}')
print(f'PyTorch version {torch.__version__}')

OpenPifPaf version 0.12.13+0.g6981019.dirty
PyTorch version 1.10.0+cu102


In [3]:
import sys
# Replace the process arguments with a single empty entry.
# NOTE(review): presumably done so that the argument parsing behind train.cli()
# in the next cell does not see the Jupyter kernel's own command-line
# arguments and falls back to its defaults — confirm.
sys.argv = ['']

In [4]:
# Obtain the training options from the openpifpaf command-line interface
# (sys.argv was reset to [''] in the previous cell).
args = train.cli()

# Build the data module for the dataset named in the options.
datamodule = datasets.factory(args.dataset)

# The network factory returns the model together with the epoch to start from;
# the heads are created from the data module's head metas.
net_cpu, start_epoch = network.Factory().factory(head_metas=datamodule.head_metas)
# The loss is built from the network's head nets.
loss = network.losses.Factory().factory(net_cpu.head_nets)

# Decide where the network and loss live. Every CUDA branch keeps an untouched
# deep copy of the original network in `checkpoint_shell` before the model is
# moved or wrapped; on the fallback path `checkpoint_shell` stays None.
checkpoint_shell = None
if not args.disable_cuda and torch.cuda.device_count() > 1 and not args.ddp:
    # Several GPUs, DDP not requested: wrap the model in DataParallel.
    checkpoint_shell = copy.deepcopy(net_cpu)
    net = torch.nn.DataParallel(net_cpu.to(device=args.device))
    loss = loss.to(device=args.device)
elif not args.disable_cuda and torch.cuda.device_count() == 1 and not args.ddp:
    # Exactly one GPU, DDP not requested: move the model, no wrapper.
    checkpoint_shell = copy.deepcopy(net_cpu)
    net = net_cpu.to(device=args.device)
    loss = loss.to(device=args.device)
elif not args.disable_cuda and torch.cuda.device_count() > 0:
    # Remaining CUDA case (DDP requested): one process per GPU.
    # The loss must not carry parameters of its own on this path.
    assert not list(loss.parameters())
    checkpoint_shell = copy.deepcopy(net_cpu)
    torch.cuda.set_device(args.local_rank)
    torch.distributed.init_process_group(backend='nccl', init_method='env://')
    if args.sync_batchnorm:
        net_cpu = torch.nn.SyncBatchNorm.convert_sync_batchnorm(net_cpu)
    # The DDP wrap must run whether or not batch norms were converted.
    # Previously it sat in the `else:` of the check above, so enabling
    # sync_batchnorm left `net` undefined and the optimizer setup failed.
    net = torch.nn.parallel.DistributedDataParallel(
        net_cpu.to(device=args.device),
        device_ids=[args.local_rank], output_device=args.local_rank,
        find_unused_parameters=isinstance(datamodule, datasets.MultiDataModule),
    )
    loss = loss.to(device=args.device)
else:
    # CUDA disabled or no GPU visible: use the network as created.
    net = net_cpu

# logger.train_configure(args)  # left disabled, as in the original cell

# Fetch the training and validation loaders from the data module.
train_loader, val_loader = datamodule.train_loader(), datamodule.val_loader()

# When a torch.distributed process group is active, pass each loader through
# the data module's distributed sampler.
if torch.distributed.is_initialized():
    train_loader, val_loader = (
        datamodule.distributed_sampler(train_loader),
        datamodule.distributed_sampler(val_loader),
    )

# The optimizer receives the parameters of the network followed by those of the loss.
optimizer = optimize.factory_optimizer(
    args,
    [*net.parameters(), *loss.parameters()],
)

# The scheduler factory is given the number of batches per epoch and the
# epoch training starts from.
lr_scheduler = optimize.factory_lrscheduler(
    args, optimizer, len(train_loader),
    last_epoch=start_epoch,
)

# NOTE: this rebinds the name `trainer`, which until here referred to the
# `openpifpaf.network.trainer` module imported near the top of the notebook.
trainer = network.Trainer(
    net, loss, optimizer, args.output,
    checkpoint_shell=checkpoint_shell,
    lr_scheduler=lr_scheduler,
    device=args.device,
    model_meta_data=dict(args=vars(args), plugin_versions=plugin.versions()),
)

INFO:openpifpaf.train:neural network device: cuda (CUDA available: True, count: 1)
INFO:openpifpaf.encoder.factory:Config Factory
INFO:openpifpaf.network.basenetworks:resnet50: stride = 16, output features = 2048
INFO:openpifpaf.network.heads:cif config: fields = 17, confidences = 1, vectors = 1, scales = 1 kernel = 1, padding = 0, dilation = 1
INFO:openpifpaf.network.heads:Out Features: 85 , conv: Conv2d(2048, 85, kernel_size=(1, 1), stride=(1, 1)) 
INFO:openpifpaf.network.heads:caf config: fields = 19, confidences = 1, vectors = 2, scales = 2 kernel = 1, padding = 0, dilation = 1
INFO:openpifpaf.network.heads:Out Features: 171 , conv: Conv2d(2048, 171, kernel_size=(1, 1), stride=(1, 1)) 
INFO:openpifpaf.network.losses.composite:cif: n_vectors = 1, n_scales = 1
INFO:openpifpaf.network.losses.composite:caf: n_vectors = 2, n_scales = 2
INFO:openpifpaf.network.losses.multi_head:multihead loss: ['cocokp.cif.c', 'cocokp.cif.vec1', 'cocokp.cif.scales1', 'cocokp.caf.c', 'cocokp.caf.vec1', 'c

In [5]:
# trainer.loop(train_loader, val_loader, start_epoch=start_epoch)

# trainer.train(train_loader, start_epoch)

# Step-by-step preparation for one training epoch (the trainer.loop /
# trainer.train calls above are commented out in favour of this manual version).
start_time = time.time()

# Put the model in training mode, then switch batch-norm layers back to eval
# mode when fix_batch_norm is True, or when it is an epoch threshold that
# start_epoch has already reached.
trainer.model.train()
if (
    trainer.fix_batch_norm is True
    or (trainer.fix_batch_norm is not False
        and trainer.fix_batch_norm <= start_epoch)
):
    for m in trainer.model.modules():
        if not isinstance(m, (torch.nn.BatchNorm1d, torch.nn.BatchNorm2d)):
            continue
        m.eval()

# Restore and then clear the EMA state; the batch loop re-creates it when
# trainer.ema_decay is set.
trainer.ema_restore()
trainer.ema = None

# Per-epoch accumulators; the per-head lists are created once the first batch
# reveals how many head losses there are.
epoch_loss = 0.0
head_epoch_losses = head_epoch_counts = None
last_batch_end = time.time()
trainer.optimizer.zero_grad()

# Manual batch loop of a training epoch. Because of the unconditional `break`
# at the bottom, this cell processes exactly one batch (debugging aid).
for batch_idx, (data, target, _) in enumerate(train_loader):
    print(data.shape)
    preprocess_time = time.time() - last_batch_end

    # Train on this batch; gradients are applied every `stride_apply` batches.
    batch_start = time.time()
    apply_gradients = batch_idx % trainer.stride_apply == 0
    # The scalar is deliberately called `batch_loss`: naming it `loss` would
    # overwrite the notebook-level loss module created in the setup cell and
    # break re-running that cell's optimizer construction.
    batch_loss, head_losses = trainer.train_batch(data, target, apply_gradients)

    # Update the epoch accumulators; None entries are skipped.
    if batch_loss is not None:
        epoch_loss += batch_loss
    if head_epoch_losses is None:
        head_epoch_losses = [0.0 for _ in head_losses]
        head_epoch_counts = [0 for _ in head_losses]
    for i, head_loss in enumerate(head_losses):
        if head_loss is None:
            continue
        head_epoch_losses[i] += head_loss
        head_epoch_counts[i] += 1

    batch_time = time.time() - batch_start

    if batch_idx % trainer.log_interval == 0:
        batch_info = {
            'type': 'train',
            'epoch': start_epoch, 'batch': batch_idx, 'n_batches': len(train_loader),
            'time': round(batch_time, 3),
            'data_time': round(preprocess_time, 3),
            'lr': round(trainer.lr(), 8),
            'loss': round(batch_loss, 3) if batch_loss is not None else None,
            'head_losses': [round(l, 3) if l is not None else None
                            for l in head_losses],
        }

        if hasattr(trainer.loss, 'batch_meta'):
            batch_info.update(trainer.loss.batch_meta())

        # Emit the summary; it was previously assembled and then thrown away.
        print(batch_info)

    # Initialize the EMA copy of the parameters on first use.
    if trainer.ema is None and trainer.ema_decay:
        trainer.ema = copy.deepcopy([p.data for p in trainer.model.parameters()])

    # Advance the learning-rate schedule once per batch.
    if trainer.lr_scheduler is not None:
        trainer.lr_scheduler.step()

    # Honour the trainer's own batch limit, if one is set.
    if trainer.n_train_batches and batch_idx + 1 >= trainer.n_train_batches:
        break

    last_batch_end = time.time()

    # Deliberate early exit: stop after the first batch.
    break

# End-of-epoch bookkeeping: apply the EMA and reset the trainer's
# gradient-clipping counters.
trainer.apply_ema()
trainer.n_clipped_grad = 0
trainer.max_norm = 0.0



INFO:openpifpaf.encoder.cif:Confidence Shape: torch.Size([17, 25, 25])
INFO:openpifpaf.encoder.cif:Confidence Fields: tensor([[[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        [[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        [[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        ...,

        [[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
