In [2]:
import os, time
import random
import shutil
import subprocess
from collections import defaultdict

import numpy as np
import torch
import torchvision.transforms as transforms
from torch.utils.data import DataLoader

from data.dataset import MonitorDetectionDataset
from models.yolov3quad import *

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# CONFIG
# Everything a reader may want to tune lives in this cell.

# -- reproducibility / data split
SEED = 123   # seed handed to the RNGs and to the dataset constructor
FOLD = 0     # fold index handed to the dataset constructor

# -- data locations (relative to the notebook's working directory)
CSV_PATH = '../data/labels.csv'
IMG_PATH = '../data/images/'

# -- model
MODEL_CFG = './cfgs/yolov3.cfg'
NUM_CLASSES = 1

# -- training run
EPOCHS = 100
RESUME = False   # True -> continue from weights/latest.pt instead of starting fresh

# -- compute device: first CUDA GPU when one is available, CPU otherwise
if torch.cuda.is_available():
    DEVICE = torch.device('cuda:0')
else:
    DEVICE = torch.device('cpu')

In [4]:
# Seed every RNG this notebook touches (Python, PyTorch, NumPy) with the same value.
for _seed_fn in (random.seed, torch.manual_seed, np.random.seed):
    _seed_fn(SEED)

_cuda_available = torch.cuda.is_available()
if _cuda_available:
    # Seed the current CUDA device, then all CUDA devices.
    torch.cuda.manual_seed(SEED)
    torch.cuda.manual_seed_all(SEED)
    # NOTE(review): cudnn.benchmark lets cuDNN auto-select kernels, which can
    # vary between runs; confirm that bit-exact reproducibility is not
    # required, otherwise this flag works against the seeding above.
    torch.backends.cudnn.benchmark = True

In [5]:
# LOAD DATA
# Training pipeline: tensor conversion + normalisation, random augmentation,
# then resize to the fixed 256x256 network input.
# NOTE(review): the flip/rotation below act on the image only as far as this
# cell shows; confirm MonitorDetectionDataset adjusts the box targets to match.
train_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(10),
    transforms.Resize((256, 256)),
    ])
# Evaluation pipeline: identical preprocessing but WITHOUT the random
# augmentations, so test-set results are deterministic and comparable
# between runs.
eval_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
    transforms.Resize((256, 256)),
    ])
# Backward-compatible alias: `transform` used to be the single shared pipeline.
transform = train_transform

train_dataset = MonitorDetectionDataset(IMG_PATH, CSV_PATH, train_transform,
    train=True, fold=FOLD, seed=SEED)
test_dataset = MonitorDetectionDataset(IMG_PATH, CSV_PATH, eval_transform,
    train=False, fold=FOLD, seed=SEED)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=4)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False, num_workers=4)

In [8]:
# LOAD MODEL & OPTIMIZER
# Builds the network, then either resumes from weights/latest.pt (RESUME=True)
# or starts from the pretrained darknet53 backbone weights. Either way this
# cell defines: model, optimizer, start_epoch, best_loss.
model = Darknet(MODEL_CFG)
start_epoch = 0
best_loss = float('inf')
os.makedirs('weights', exist_ok=True)
if RESUME:
    # Load on CPU first; the model is moved to DEVICE after its weights are restored.
    checkpoint = torch.load('weights/latest.pt', map_location='cpu')
    model.load_state_dict(checkpoint['model'])
    model.to(DEVICE).train()
    optimizer = torch.optim.SGD(filter(lambda p: p.requires_grad, model.parameters()),
                                lr=1e-3, momentum=.9, weight_decay=5e-4)
    start_epoch = checkpoint['epoch'] + 1
    if checkpoint['optimizer'] is not None:
        optimizer.load_state_dict(checkpoint['optimizer'])
        best_loss = checkpoint['best_loss']
    del checkpoint
else:
    backbone_url = 'https://pjreddie.com/media/files/darknet53.conv.74'
    backbone_path = 'weights/darknet53.conv.74'
    if not os.path.isfile(backbone_path):
        # Download to a temporary name and rename only on success, so an
        # interrupted download is never mistaken for a complete weights file.
        partial_path = backbone_path + '.part'
        subprocess.run(['wget', backbone_url, '-O', partial_path], check=True)
        os.replace(partial_path, backbone_path)
    # These steps must run whether or not the file had to be downloaded;
    # otherwise `optimizer` is undefined when the weights are already on disk.
    load_weights(model, backbone_path)
    model.to(DEVICE).train()
    optimizer = torch.optim.SGD(model.parameters(), lr=1e-3, momentum=.9, weight_decay=5e-4)

--2023-01-28 20:33:32--  https://pjreddie.com/media/files/darknet53.conv.74
Resolving pjreddie.com (pjreddie.com)... 128.208.4.108
Connecting to pjreddie.com (pjreddie.com)|128.208.4.108|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 162482580 (155M) [application/octet-stream]
Saving to: ‘weights/darknet53.conv.74’

     0K .......... .......... .......... .......... ..........  0% 81.6K 32m23s
    50K .......... .......... .......... .......... ..........  0% 81.3K 32m27s
   100K .......... .......... .......... .......... ..........  0%  902K 22m36s
   150K .......... .......... .......... .......... ..........  0% 89.5K 24m19s
   200K .......... .......... .......... .......... ..........  0%  935K 20m1s
   250K .......... .......... .......... .......... ..........  0%  413K 17m45s
   300K .......... .......... .......... .......... ..........  0% 89.0K 19m26s
   350K .......... .......... .......... .......... ..........  0%  314K 18m3s
   400K .........

In [9]:
def train_one_epoch(train_loader, model, optimizer, epoch, n_classes, accumulated_batches=1):
    """Train ``model`` for one full pass over ``train_loader``.

    Batches that carry no targets are skipped. One summary line is printed
    per processed batch. Reads the module-level ``DEVICE`` and ``EPOCHS``.

    Args:
        train_loader: Sized iterable yielding ``(imgs, targets)`` batches.
            ``targets`` is iterated and ``len()`` is taken of each element.
        model: Callable invoked as ``model(imgs, targets, requestPrecision=True)``.
            It must return a loss supporting ``.backward()`` and expose a
            ``losses`` dict after each call. Keys read here are ``'metrics'``
            (added to a ``3 x n_classes`` TP/FP/FN accumulator), ``'nT'``,
            ``'TP'``, ``'FP'`` and ``'FN'``; running means of ``'conf'``,
            ``'cls'`` and ``'loss'`` are reported.
        optimizer: Optimizer whose gradients are zeroed and stepped here.
        epoch: Zero-based epoch index. During epoch 0 the learning rate of
            every param group is ramped as ``1e-4 * (i / 1000) ** 4`` for
            batch indices ``i <= 1000`` (burn-in).
        n_classes: Number of classes; sizes the TP/FP/FN accumulator.
        accumulated_batches: Number of processed batches whose gradients are
            accumulated before each optimizer step (default 1: step on
            every batch).

    Returns:
        ``(summary, rloss)``: the summary line of the last processed batch
        (``''`` if no batch had targets) and a ``defaultdict(float)`` holding
        the running epoch mean of every entry of ``model.losses``.
    """
    rloss = defaultdict(float)  # running epoch means of model.losses
    metrics = torch.zeros(3, n_classes)  # accumulated rows: TP, FP, FN
    # Defaults so the summary line is well defined before any class has
    # predictions / targets.
    mean_precision, mean_recall = 0.0, 0.0
    summary = ''
    n_batches = len(train_loader)
    processed = 0  # batches that actually contributed to training
    pending = 0    # backward passes since the last optimizer step
    t1 = time.time()  # start of the interval reported in the 'time' column

    optimizer.zero_grad()
    for i, (imgs, targets) in enumerate(train_loader):
        if sum(len(x) for x in targets) < 1:  # no targets in this batch
            continue

        # SGD burn-in: ramp the learning rate up during the first epoch
        if epoch == 0 and i <= 1000:
            lr = 1e-4 * (i / 1000) ** 4
            for g in optimizer.param_groups:
                g['lr'] = lr

        # Compute loss, accumulate gradient, update parameters
        loss = model(imgs.to(DEVICE), targets, requestPrecision=True)
        loss.backward()
        pending += 1
        if pending >= accumulated_batches:
            optimizer.step()
            optimizer.zero_grad()
            pending = 0

        # Running epoch-means of the tracked losses
        metrics += model.losses['metrics']
        TP, FP, FN = metrics
        for key, val in model.losses.items():
            rloss[key] = (rloss[key] * processed + val) / (processed + 1)
        processed += 1

        # Mean precision over classes that have at least one prediction
        has_pred = (TP + FP) > 0
        if has_pred.sum() > 0:
            mean_precision = (TP / (TP + FP))[has_pred].mean().item()
        # Mean recall over classes that have at least one target
        has_target = (TP + FN) > 0
        if has_target.sum() > 0:
            mean_recall = (TP / (TP + FN))[has_target].mean().item()

        summary = ('%11s%11s' + '%11.3g' * 10) % (
            '%g/%g' % (epoch, EPOCHS - 1), '%g/%g' % (i, n_batches - 1), rloss['conf'], rloss['cls'],
            rloss['loss'], mean_precision, mean_recall, model.losses['nT'], model.losses['TP'],
            model.losses['FP'], model.losses['FN'], time.time() - t1)
        t1 = time.time()
        print(summary)

    # Apply gradients left over from an incomplete accumulation window.
    if pending:
        optimizer.step()
        optimizer.zero_grad()

    return summary, rloss

In [None]:
# TRAIN
# Runs epochs start_epoch .. EPOCHS-1. After every epoch the summary line is
# appended to results.txt and weights/latest.pt is rewritten; the checkpoint
# is additionally copied to weights/best.pt when the loss per target improves
# and to weights/backup<epoch>.pt every 20 epochs.
t0, t1 = time.time(), time.time()
mean_recall, mean_precision = 0, 0
print('%11s' * 12 % ('Epoch', 'Batch', 'conf', 'cls', 'loss', 'P', 'R', 'nTargets', 'TP', 'FP', 'FN', 'time'))

# Manual step schedule. Each key is an exclusive upper epoch bound:
# 1e-4 for epochs 0-29, 1e-5 for epochs 30-59, 1e-6 from epoch 60 onwards.
epoch_to_lr = {
    30: 1e-4,
    60: 1e-5,
    float('inf'): 1e-6
}

epochs_run = 0
for epoch in range(start_epoch, EPOCHS):
    # First bound above the current epoch wins (relies on dict insertion order;
    # the inf key guarantees a match).
    lr = next(rate for bound, rate in epoch_to_lr.items() if bound > epoch)
    for g in optimizer.param_groups:
        g['lr'] = lr

    summary, rloss = train_one_epoch(train_loader, model, optimizer, epoch, NUM_CLASSES)

    # Write epoch results
    with open('results.txt', 'a') as file:
        file.write(summary + '\n')

    # Update best loss; an epoch with no targets counts as "no improvement"
    # instead of dividing by zero.
    n_targets = rloss['nT']
    loss_per_target = rloss['loss'] / n_targets if n_targets else float('inf')
    is_best = loss_per_target < best_loss
    if is_best:
        best_loss = loss_per_target

    # Save latest checkpoint
    checkpoint = {
        'epoch': epoch,
        'best_loss': best_loss,
        'model': model.state_dict(),
        'optimizer': optimizer.state_dict()
    }
    torch.save(checkpoint, 'weights/latest.pt')

    # Save best checkpoint
    if is_best:
        shutil.copyfile('weights/latest.pt', 'weights/best.pt')

    # Save backup weights every 20 epochs
    if epoch > 0 and epoch % 20 == 0:
        shutil.copyfile('weights/latest.pt', 'weights/backup' + str(epoch) + '.pt')

    epochs_run += 1

# Report total training time (per-epoch average covers only epochs run in this session)
dt = time.time() - t0
if epochs_run:
    print('Finished %g epochs in %.2fs (%.2fs/epoch)' % (epochs_run, dt, dt / epochs_run))
else:
    print('No epochs to run: start_epoch=%g, EPOCHS=%g' % (start_epoch, EPOCHS))
