In [1]:
import time
import os
import torch
import pytorch_mask_rcnn as pmr
    
    
# ------------------ adjustable parameters ---------------------

use_cuda = True  # run on the GPU when one is available
epochs = 5  # number of epochs to train in this session
train_num_samples = 1463  # samples used per epoch, between 1 and 1463
lr = 0.001  # base learning rate
ckpt_path = '../checkpoint.pth'  # location of the checkpoint file
data_dir = 'E:/PyTorch/data/VOC2012'  # dataset directory

# ------------------ adjustable parameters ---------------------

# CUDA is used only when it is both requested and actually present.
if use_cuda and torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

print(f'cuda: {torch.cuda.is_available()}')
print(f'use_cuda: {use_cuda}')
print(f'{torch.cuda.device_count()} GPU(s) available')
print(f'\ndevice: {device}')

# Shuffle the dataset indices and keep the first `train_num_samples` of them.
# NOTE: this permutation is drawn before the seed below is set, so the
# selected samples / their order are not fixed by that seed.
trainset = pmr.VOCDataset(data_dir, 'train', True, device=device)  # len=1463
indices = torch.randperm(len(trainset)).tolist()
trainset = torch.utils.data.Subset(trainset, indices[:train_num_samples])

# Seed torch's global RNG for everything that follows.
torch.manual_seed(3)
# NOTE(review): arguments are presumably (pretrained, num_classes) — confirm against pmr.
model = pmr.maskrcnn_resnet50(True, 21).to(device)
# Only parameters that require gradients are handed to the optimizer.
params = list(filter(lambda p: p.requires_grad, model.parameters()))
optimizer = torch.optim.SGD(params, lr=lr, momentum=0.9, weight_decay=0.0005)

cuda: True
use_cuda: True
1 GPU(s) available

device: cuda


In [None]:
# Resume from an existing checkpoint if there is one; otherwise start the
# epoch / batch counters at zero.
if os.path.exists(ckpt_path):
    # map_location remaps stored tensors onto the currently selected device, so
    # a checkpoint written during a CUDA run can still be loaded when this
    # session runs on the CPU (and vice versa).
    checkpoint = torch.load(ckpt_path, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    
    # The state dicts have been copied into model/optimizer; drop the
    # checkpoint's references and release cached GPU memory.
    del checkpoint['model_state_dict']
    del checkpoint['optimizer_state_dict']
    torch.cuda.empty_cache()
else:
    checkpoint = dict(epochs=0, num_batches=0)
    
# `epoch` counts epochs across all sessions, not just this one.
epoch = checkpoint['epochs']
print('already trained: {} epochs, {} batches'.format(epoch, checkpoint['num_batches']))

since = time.time()

# ------------------train---------------------

model.train()
for _ in range(epochs):
    print()
    # One sample per optimizer step; each item of `trainset` is unpacked
    # directly into the model's positional arguments.
    for i, data in enumerate(trainset):
        optimizer.zero_grad()
        losses = model(*data)  # in train mode the model returns a dict of losses
        loss = sum(losses.values())
        loss.backward()
        optimizer.step()
        
        # Log the individual loss terms every 100 steps.
        if i % 100 == 0:
            print(i, ' '.join(str(round(l.item(), 3)) for l in losses.values()))
    
    epoch += 1
    # Step decay: every 7 cumulative epochs the learning rate becomes
    # lr * 0.9 ** (epoch // 7).
    if epoch % 7 == 0:
        for pg in optimizer.param_groups:
            pg['lr'] = lr * 0.9 ** (epoch // 7)
            
# ------------------train---------------------

print('total time of this train: {:.2f} s'.format(time.time() - since))

# Persist model/optimizer state together with the cumulative counters.
checkpoint['model_state_dict'] = model.state_dict()
checkpoint['optimizer_state_dict']  = optimizer.state_dict()
checkpoint['epochs'] = epoch
checkpoint['num_batches'] += epochs * len(trainset)
torch.save(checkpoint, ckpt_path)

# Keep the counter for the final report, then drop the checkpoint dict and
# release cached GPU memory.
num_batches = checkpoint['num_batches']
del checkpoint
torch.cuda.empty_cache()

print('already trained: {} epochs, {} batches'.format(epoch, num_batches))

already trained: 1 epochs, 1463 batches

0 0.193 0.2 0.662 0.255 0.799
100 0.146 0.086 0.367 0.133 0.601
200 0.339 0.163 0.333 0.096 0.43
300 0.215 0.415 0.946 0.516 0.593
400 0.103 0.038 0.358 0.118 0.464
500 0.02 0.019 0.163 0.055 0.347
600 1.845 0.557 0.171 0.056 1.384
700 0.075 0.01 0.214 0.064 0.449
800 0.073 0.026 0.177 0.057 0.402
900 0.222 0.506 0.189 0.083 0.255
1000 0.124 0.282 0.181 0.059 0.225
1100 0.141 0.159 0.208 0.072 0.452
1200 0.054 0.029 0.128 0.051 0.267
1300 0.026 0.004 0.155 0.064 0.313
1400 0.07 0.022 0.018 0.0 0.475

0 0.078 0.266 0.277 0.125 0.445
100 0.097 0.089 0.309 0.17 0.569
200 0.236 0.14 0.257 0.057 0.421
300 0.164 0.433 0.354 0.117 0.513
400 0.073 0.033 0.319 0.105 0.457
500 0.022 0.024 0.13 0.065 0.464
600 0.109 0.362 0.199 0.074 0.233
700 0.038 0.01 0.177 0.075 0.191
800 0.06 0.021 0.13 0.036 0.259
900 0.301 0.444 0.13 0.065 0.199
1000 0.107 0.575 0.105 0.017 0.209
1100 0.097 0.133 0.152 0.073 0.26
1200 0.075 0.03 0.095 0.034 0.213
1300 0.018 0.003 0.