In [None]:
import os

# Device switches for this run.
use_multi_gpu = False  # when True (and CUDA is used) the model may be wrapped in nn.DataParallel
use_cuda = True        # request GPU training; the CPU is used if CUDA is unavailable

# Multi-GPU: restrict this process to GPUs 1 and 2.
# The first device listed is the default one.
os.environ["CUDA_VISIBLE_DEVICES"] = "1,2"

import itertools
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.optim.lr_scheduler import CosineAnnealingLR, MultiStepLR
os.chdir("utils")
from misc import store_labels, Timer
os.chdir("..")
from ssd.ssd import MatchPrior
from ssd.vgg_ssd import create_vgg_ssd
from ssd.data_preprocessing import TrainAugmentation, TestTransform
from datasets.COCO_dataset import COCODataset
from nn.multibox_loss import MultiboxLoss
from train_and_test import train, test

print('Single Shot MultiBox Detector Training With Pytorch')

# ---- Data locations ----
# Training and validation both read from the same COCO root; txt_folder is
# handed to COCODataset as its `txt` argument.
txt_folder = "/home/user/ssd1/taeholee/SSD/coco-ssd/data/"
datasets = "/home/user/hdd/coco/"
validation_dataset = "/home/user/hdd/coco/"

# ---- Model configuration ----
# config is the config object itself; config_name is the string passed to
# create_vgg_ssd.
from ssd.config import vgg_ssd_config_600
config_name = "vgg_ssd_config_600"
config = vgg_ssd_config_600

# ---- SGD hyper-parameters ----
lr = 0.001
momentum = 0.9
weight_decay = 0.0005
gamma = 0.1
base_net_lr = None       # None -> falls back to lr when the network is built
extra_layers_lr = None   # None -> falls back to lr when the network is built

# ---- Pretrained base net / checkpoints ----
# At most one weight source is used when the network is built; the order of
# precedence there is dataparallel_model, pretrained_ssd, resume, base_net.
checkpoint_folder = "/home/user/ssd1/taeholee/SSD/coco-ssd/models/"
base_net = os.path.join(checkpoint_folder, "vgg16_reducedfc.pth")
dataparallel_model = None
pretrained_ssd = None
resume = None

# ---- Learning-rate scheduler ----
scheduler = "multi-step"   # "multi-step" or "cosine"
milestones = "120,160"     # comma-separated epochs, used by the multi-step scheduler
t_max = 120.0              # used by the cosine annealing scheduler

# ---- Training loop ----
batch_size = 1
num_epochs = 2
num_workers = 1
validation_epochs = 1   # validate every N epochs
debug_steps = 100       # forwarded to train() as debug_steps

# Cuda
# Pick the compute device: the first visible GPU when CUDA is both requested
# and available, otherwise the CPU.
if not (use_cuda and torch.cuda.is_available()):
    DEVICE = torch.device("cpu")
    print("Use CPU")
else:
    # Enable the cuDNN benchmark mode and do not force deterministic kernels.
    torch.backends.cudnn.benchmark = True
    torch.backends.cudnn.deterministic = False
    DEVICE = torch.device("cuda:0")
    print("Use CUDA")

# Transform
# The image transforms for training and validation are built from the same
# size / mean / std values in config.
test_transform = TestTransform(
    config.image_size, config.image_mean, config.image_std
)
train_transform = TrainAugmentation(
    config.image_size, config.image_mean, config.image_std
)
# NOTE(review): the trailing 0.5 is presumably the IoU threshold used when
# matching ground-truth boxes to priors -- confirm against MatchPrior.
target_transform = MatchPrior(
    config.priors, config.center_variance, config.size_variance, 0.5
)

# Prepare datasets
# Training split: augmented images, targets produced by target_transform.
train_dataset = COCODataset(root=datasets, txt=txt_folder,
                            transform=train_transform,
                            target_transform=target_transform,
                            is_test=False, is_validate=False)
# Validation split: test-time image transform, same target transform so the
# loss can be computed on it.
val_dataset = COCODataset(root=validation_dataset, txt=txt_folder,
                          transform=test_transform,
                          target_transform=target_transform,
                          is_test=False, is_validate=True)

# Save the class names next to the checkpoints.
label_file = os.path.join(checkpoint_folder, "coco-model-labels.txt")
store_labels(label_file, train_dataset.class_names)  # save label file
num_classes = len(train_dataset.class_names)
print("Train dataset size: {}".format(len(train_dataset)))
print("validation dataset size: {}".format(len(val_dataset)))

# Shuffle the training samples each epoch so SGD does not see them in a fixed
# order. The validation loader is only passed to test(), so it is read in a
# fixed order. (The two shuffle flags were previously swapped.)
train_loader = DataLoader(train_dataset, batch_size,
                          num_workers=num_workers,
                          shuffle=True)
val_loader = DataLoader(val_dataset, batch_size,
                        num_workers=num_workers,
                        shuffle=False)

# Build network
net = create_vgg_ssd(num_classes, config_name, device=DEVICE, is_test=False)
last_epoch = -1        # training starts at last_epoch + 1
min_loss = -10000.0

# Per-group learning rates: an unset (None) rate falls back to the global lr.
if base_net_lr is None:
    base_net_lr = lr
if extra_layers_lr is None:
    extra_layers_lr = lr

# Optimizer parameter groups: base network, extra layers, prediction headers.
params = [
    {'params': net.base_net.parameters(), 'lr': base_net_lr},
    {
        'params': itertools.chain(net.source_layer_add_ons.parameters(),
                                  net.extras.parameters()),
        'lr': extra_layers_lr,
    },
    # No explicit 'lr' here: this group takes the optimizer's default rate.
    {
        'params': itertools.chain(net.regression_headers.parameters(),
                                  net.classification_headers.parameters()),
    },
]

# Weight initialisation: only the first configured source is used.
if dataparallel_model:
    print("Load the multi-GPU model:", dataparallel_model)
    net.load_dataparallel_model(dataparallel_model)
elif pretrained_ssd:
    print("Init from pretrained ssd:", pretrained_ssd)
    net.init_from_pretrained_ssd(pretrained_ssd)
elif resume:
    print("Resume from the model:", resume)
    net.load(resume)
elif base_net:
    print("Init from base net:", base_net)
    net.init_from_base_net(base_net)

net.to(DEVICE)

# Use multi-GPU
# Wrap the model in DataParallel only when CUDA and multi-GPU are both
# requested and more than one GPU is visible to this process.
if use_cuda and use_multi_gpu:
    if torch.cuda.device_count() > 1:
        print("Multi-GPU on")
        net = nn.DataParallel(net)
        # Open question: with nn.DataParallel a constant amount of memory is
        # always allocated on the default GPU -- why is that?

# Object
# Loss: the variances come from config so they always agree with the values
# MatchPrior used to encode the training targets.
criterion = MultiboxLoss(config.priors, iou_threshold=0.5, neg_pos_ratio=3,
                         center_variance=config.center_variance,
                         size_variance=config.size_variance, device=DEVICE)

# Optimizer: lr is the default rate for parameter groups without their own 'lr'.
optimizer = torch.optim.SGD(params, lr=lr, momentum=momentum,
                            weight_decay=weight_decay)

# LR scheduler: `scheduler` holds the scheduler name on entry and is rebound
# to the scheduler object; `milestones` is rebound from a comma-separated
# string to a list of ints.
if scheduler == 'multi-step':
    print("Use MultiStepLR scheduler")
    milestones = [int(v.strip()) for v in milestones.split(",")]
    # Use the configured decay factor rather than a hard-coded one.
    scheduler = MultiStepLR(optimizer, milestones=milestones, gamma=gamma,
                            last_epoch=last_epoch)
elif scheduler == 'cosine':
    print("Use CosineAnnealingLR scheduler")
    scheduler = CosineAnnealingLR(optimizer, t_max, last_epoch=last_epoch)
else:
    # Fail fast: continuing would crash later when the training loop calls
    # scheduler.step() on the name string.
    raise ValueError("Unsupported Scheduler: {!r}".format(scheduler))
print("Learning rate:", lr, "Base net learning rate:", base_net_lr,
      "Extra Layers learning rate:", extra_layers_lr)

# Train
# Run epochs last_epoch + 1 .. num_epochs - 1. Every `validation_epochs`
# epochs, and always after the final epoch, evaluate on the validation set
# and save a checkpoint named after the epoch and validation loss.
print("Start training from epoch {}\n".format(last_epoch + 1))
timer = Timer()
for epoch in range(last_epoch + 1, num_epochs):
    #timer.start()
    train(train_loader, net, criterion, optimizer,
          device=DEVICE, checkpoint_folder=checkpoint_folder,
          debug_steps=debug_steps, epoch=epoch)
    # Advance the LR schedule only after this epoch's optimizer updates.
    # On PyTorch >= 1.1, stepping the scheduler first skips the first value
    # of the schedule and triggers a warning.
    scheduler.step()
    #print("Time cost1: ", timer.end())
    if epoch % validation_epochs == 0 or epoch == num_epochs - 1:
        #timer.start()
        val_loss, val_regression_loss, val_classification_loss = test(
            val_loader, net, criterion, DEVICE)
        print("Epoch:", epoch, "\n",
              "Validation Loss:", val_loss, "\n",
              "Validation Regression Loss:", val_regression_loss, "\n",
              "Validation Classification Loss:", val_classification_loss
              )

        # The file name keeps only the first 7 characters of the loss.
        name = "vgg16-ssd-Epoch-{}-Loss-{}.pth".format(epoch, str(val_loss)[:7])
        model_path = os.path.join(checkpoint_folder, name)
        torch.save(net.state_dict(), model_path)
        print("Saved model:", model_path, "\n")
        #print("Time cost2: ", timer.end())
        