In [2]:
import time
import os
import torch
import pytorch_mask_rcnn as pmr
    
    
# ------------------ adjustable parameters ---------------------

use_cuda = True # request the GPU; the device selection below still falls back to CPU if CUDA is unavailable
epochs = 1 # number of epochs to run in this session (added on top of the epoch count stored in the checkpoint)
train_num_samples = 1500 # size of the random training subset; every epoch of this session iterates over it once
lr = 0.001 # learning rate handed to the SGD optimizer
dataset = 'coco' # dataset name passed to pmr.datasets: coco or voc
data_dir = 'E:/PyTorch/data/coco2017' # dataset root directory (machine-specific absolute path; adjust per environment)
num_classes = 91  # 91 for coco, 21 for voc
ckpt_path = '../checkpoint_coco.pth' # checkpoint file: loaded at start if it exists, overwritten after training

# ------------------ adjustable parameters ---------------------

# Use CUDA only when it is both available and requested via use_cuda; otherwise run on CPU.
device = torch.device('cuda' if torch.cuda.is_available() and use_cuda else 'cpu')
print('cuda: {}\nuse_cuda: {}\n{} GPU(s) available'.format(torch.cuda.is_available(), use_cuda, torch.cuda.device_count()))
print('\ndevice: {}'.format(device))

# Build the training split, then keep a random subset of at most train_num_samples items.
# The slice simply yields fewer indices if the dataset is smaller than train_num_samples.
trainset = pmr.datasets(dataset, data_dir, 'train', train=True, device=device)
indices = torch.randperm(len(trainset)).tolist()
trainset = torch.utils.data.Subset(trainset, indices[:train_num_samples])

# NOTE(review): the seed is set only after randperm above, so the subset selection is
# not governed by it and differs between kernel sessions -- confirm this is intended.
torch.manual_seed(3)
# NOTE(review): the positional True presumably requests pretrained weights -- confirm against pmr.
model = pmr.maskrcnn_resnet50(True, num_classes).to(device)
# Optimize only the parameters that are not frozen.
params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(params, lr=lr, momentum=0.9, weight_decay=0.0001)
# StepLR configured with step_size=7, gamma=0.1; it is stepped once per epoch by the training loop.
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)

cuda: True
use_cuda: True
1 GPU(s) available

device: cuda


In [3]:
# ------------------resume---------------------
# Resume from ckpt_path when it exists; otherwise start the counters from zero.
# Requires the setup above to have run: model, optimizer, lr_scheduler, trainset,
# device, epochs and ckpt_path are all read from the notebook namespace.

if os.path.exists(ckpt_path):
    checkpoint = torch.load(ckpt_path, map_location=device)
    model.load_state_dict(checkpoint['model'])
    # NOTE(review): restoring the optimizer state presumably restores the saved
    # learning rate as well, so a changed `lr` may not take effect on resume -- confirm.
    optimizer.load_state_dict(checkpoint['optimizer'])
    lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])

    # The loaded state dicts are no longer needed; keep only the counters.
    del checkpoint['model']
    del checkpoint['optimizer']
    del checkpoint['lr_scheduler']
    torch.cuda.empty_cache()
else:
    checkpoint = dict(epochs=0, num_batches=0)

start_epoch = checkpoint['epochs']
print('already trained: {} epochs, {} batches'.
      format(start_epoch, checkpoint['num_batches']))

since = time.time()

# ------------------train---------------------

model.train()
for epoch in range(start_epoch, start_epoch + epochs):
    print('epoch: {}'.format(epoch + 1))
    # The dataset is iterated directly (no DataLoader), so every optimizer step
    # uses a single dataset item.
    for i, data in enumerate(trainset):
        optimizer.zero_grad()
        losses = model(*data)  # mapping of individual loss terms
        loss = sum(losses.values())
        loss.backward()
        optimizer.step()

        # Log the individual loss terms every 100 iterations.
        if i % 100 == 0:
            print(i, ' '.join(str(round(value.item(), 3)) for value in losses.values()))

    lr_scheduler.step()

# ------------------train---------------------

print('total time of this train: {:.2f} s'.format(time.time() - since))

# ------------------save---------------------
# Store the updated states together with the accumulated counters.

checkpoint['model'] = model.state_dict()
checkpoint['optimizer'] = optimizer.state_dict()
checkpoint['lr_scheduler'] = lr_scheduler.state_dict()
checkpoint['epochs'] += epochs
checkpoint['num_batches'] += epochs * len(trainset)
torch.save(checkpoint, ckpt_path)

# Read the totals from the checkpoint rather than from the loop variable `epoch`,
# which is undefined when epochs == 0 (the loop body never runs).
trained_epochs = checkpoint['epochs']
num_batches = checkpoint['num_batches']
del checkpoint
torch.cuda.empty_cache()

print('already trained: {} epochs, {} batches'.format(trained_epochs, num_batches))

already trained: 0 epochs, 0 batches
epoch: 1
0 0.699 0.032 0.022 0.002 0.228
100 0.227 0.133 0.071 0.102 0.172
200 0.415 0.348 0.133 0.059 0.289
300 0.131 0.051 0.051 0.018 0.278
400 0.01 0.019 0.029 0.005 0.129
500 0.152 0.197 0.12 0.043 0.533
600 0.096 0.066 0.125 0.063 0.198
700 0.035 0.003 0.06 0.031 0.326
800 0.033 0.008 0.03 0.01 0.105
900 0.32 0.048 0.197 0.007 0.181
1000 0.057 0.007 0.05 0.031 0.385
1100 0.242 0.087 0.108 0.031 0.166
1200 0.112 0.237 0.031 0.016 0.2
1300 0.189 0.989 0.14 0.031 0.409
1400 0.027 0.008 0.013 0.009 0.768
total time of this train: 835.16 s
already trained: 1 epochs, 1500 batches
