In [1]:
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import cv2
import glob
import xml.etree.ElementTree as ET

In [2]:
# Object category names; a category's integer id is its position in this list.
class_names = [
    'aeroplane', 'bicycle', 'bird', 'boat', 'bottle',
    'bus', 'car', 'cat', 'chair', 'cow',
    'diningtable', 'dog', 'horse', 'motorbike', 'person',
    'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor',
]
cell = 7        # the label grid is cell x cell
bbox = 2        # box slots stored per grid cell
class_num = 20  # number of class-probability entries per grid cell

In [3]:
class BoundingBox():
    """An annotated object box expressed as fractions of the image size.

    Attributes:
        center: (x, y) tuple, the box midpoint divided by image width / height.
        size: (w, h) tuple, the box extent divided by image width / height.
        class_type: the class identifier passed in by the caller, stored as-is.
    """

    def __init__(self, width, height, class_type, xmin, xmax, ymin, ymax):
        self.class_type = class_type

        # Midpoint of the box in pixel units, then scaled by the image dimensions.
        mid_x = (xmin + xmax) / 2
        mid_y = (ymin + ymax) / 2
        self.center = (mid_x / width, mid_y / height)

        # Box extent in pixel units, scaled the same way.
        span_x = xmax - xmin
        span_y = ymax - ymin
        self.size = (span_x / width, span_y / height)

In [4]:
def read_image(filename):
    """Load an image file with OpenCV and return it in RGB channel order.

    Args:
        filename: path of the image file to load.

    Returns:
        The decoded image after a BGR -> RGB colour conversion.

    Raises:
        OSError: if OpenCV could not load the file. cv2.imread signals failure
            by returning None rather than raising, which would otherwise only
            surface as an unrelated-looking error inside cvtColor.
    """
    img = cv2.imread(filename)
    if img is None:
        raise OSError(f'could not read image file: {filename}')
    return cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

def resize_image(image, size):
    """Return `image` resized by cv2.resize.

    `size` is forwarded unchanged as cv2.resize's second positional argument;
    every other cv2.resize option keeps its default.
    """
    resized = cv2.resize(image, size)
    return resized

def norm_image(image):
    """Scale pixel values by 1/255 and move the channel axis to the front.

    Args:
        image: a 3-axis array; axis 2 is moved to position 0 (for the images
            produced by read_image this turns H x W x C into C x H x W).

    Returns:
        A float32 array of the divided values with axes ordered (2, 0, 1).
        float32 is used on purpose: dividing an integer image by 255 directly
        yields float64, which doubles the memory of every stored image even
        though the tensors are cast to single precision before training.
    """
    scaled = np.asarray(image, dtype=np.float32) / 255
    return scaled.transpose(2, 0, 1)

In [5]:
def read_training_data(label_folder, image_folder, n=-1):
    """Load annotated images from a folder of XML labels and a folder of images.

    Every `*.xml` file in `label_folder` is parsed. The image named by its
    `filename` element is read from `image_folder`, resized to 448 x 448 and
    normalised, and each `object` element becomes a BoundingBox built from
    its `name` and `bndbox` (xmin / ymin / xmax / ymax) children together with
    the image `size` (width / height) recorded in the XML.

    Args:
        label_folder: directory holding the XML files; trailing '/' allowed.
        image_folder: directory holding the image files; trailing '/' allowed.
        n: stop after this many label files. The default -1 never matches the
            loop index, so every file is read.

    Returns:
        A list with one `(image, label_file_path, boxes)` tuple per label file,
        where `boxes` is the list of BoundingBox objects for that image.

    Raises:
        ValueError: if an object's name is not present in `class_names`.
    """
    # Drop trailing separators so the '/' joins below never double up.
    label_folder = label_folder.rstrip('/')
    image_folder = image_folder.rstrip('/')

    # glob returns files in arbitrary order; sorting makes the `n` cap pick the
    # same subset on every run.
    label_files = sorted(glob.glob(label_folder + '/*.xml'))

    raw_data = []
    for i, file in enumerate(label_files):
        if i == n:
            break
        root = ET.parse(file).getroot()
        image_filename = root.find('filename').text
        if i % 1000 == 0:
            # Sparse progress indicator.
            print(image_filename)

        image = read_image(image_folder + '/' + image_filename)
        image = resize_image(image, (448, 448))
        image = norm_image(image)

        # Box coordinates in the XML are relative to the original image size,
        # so the width/height recorded there (not 448) is used for scaling.
        size_node = root.find('size')
        image_width = float(size_node.find('width').text)
        image_height = float(size_node.find('height').text)

        boxes = []
        for obj in root.findall('object'):
            name = obj.find('name').text
            bndbox = obj.find('bndbox')
            xmin = float(bndbox.find('xmin').text)
            ymin = float(bndbox.find('ymin').text)
            xmax = float(bndbox.find('xmax').text)
            ymax = float(bndbox.find('ymax').text)
            class_type = class_names.index(name)
            boxes.append(BoundingBox(image_width, image_height, class_type, xmin, xmax, ymin, ymax))
        raw_data.append((image, file, boxes))
    return raw_data

# Read at most 2500 annotated samples from the local VOC2012 folders.
raw_data = read_training_data(
    './VOCdevkit/VOC2012/Annotations',
    './VOCdevkit/VOC2012/JPEGImages',
    n=2500,
)

2007_000027.jpg
2008_000492.jpg
2008_002508.jpg


In [6]:
#[:, cell, cell, B * 5 + C]
def create_data_label(raw_data, grid_size=None, boxes_per_cell=None, num_classes=None):
    """Turn loaded samples into an image array and a grid-shaped label array.

    Each label has shape (S, S, B * 5 + C). For box slot k the five entries at
    indices 5k .. 5k+4 are: objectness (1 if the slot is used), x and y offset
    of the box centre inside its grid cell, sqrt(width) and sqrt(height) of the
    box (both relative to the image). The last C entries are per-cell class
    targets: every class present in the cell gets 1 / (number of distinct
    classes in that cell).

    The first box landing in a cell uses slot 0; any later box in the same cell
    is written to slot 1 (overwriting an earlier occupant of slot 1).

    Args:
        raw_data: iterable of `(image, label_file, boxes)` tuples; each box
            exposes `center`, `size` and `class_type`.
        grid_size: S; defaults to the module-level `cell`.
        boxes_per_cell: B; defaults to the module-level `bbox`.
        num_classes: C; defaults to the module-level `class_num`.

    Returns:
        `(data, labels)`: `data` is `np.asarray` of the images, `labels` has
        shape (len(raw_data), S, S, B * 5 + C).
    """
    S = cell if grid_size is None else grid_size
    B = bbox if boxes_per_cell is None else boxes_per_cell
    C = class_num if num_classes is None else num_classes
    depth = B * 5 + C

    images = []
    # Collected in a list and stacked once at the end; appending to an ndarray
    # inside the loop re-copies the whole array for every sample.
    per_sample_labels = []
    for rd in raw_data:
        images.append(rd[0])
        class_probs = np.zeros((S, S, C))
        object_probs = np.zeros((S, S, B))
        cx = np.zeros((S, S, B))
        cy = np.zeros((S, S, B))
        sqrt_w = np.zeros((S, S, B))
        sqrt_h = np.zeros((S, S, B))

        for box in rd[2]:
            # Builtin int instead of the np.int alias (removed from NumPy).
            # Clamp so a centre on the far image edge maps to the last cell
            # rather than indexing outside the grid.
            icx = min(max(int(np.floor(box.center[0] * S)), 0), S - 1)
            icy = min(max(int(np.floor(box.center[1] * S)), 0), S - 1)
            slot = 1 if (B > 1 and object_probs[icy, icx, 0] == 1) else 0
            object_probs[icy, icx, slot] = 1
            # Centre offset measured in cell units from the cell's origin.
            cx[icy, icx, slot] = box.center[0] * S - icx
            cy[icy, icx, slot] = box.center[1] * S - icy
            sqrt_w[icy, icx, slot] = np.sqrt(box.size[0])
            sqrt_h[icy, icx, slot] = np.sqrt(box.size[1])
            class_probs[icy, icx, box.class_type] = 1

        # Entries are 0/1 here, so the per-cell sum is the number of distinct
        # classes; divide only where at least one class is present.
        counts = class_probs.sum(axis=-1, keepdims=True)
        np.divide(class_probs, counts, out=class_probs, where=counts > 0)

        label = np.empty((S, S, depth))
        label[:, :, 0:5*B:5] = object_probs
        label[:, :, 1:5*B:5] = cx
        label[:, :, 2:5*B:5] = cy
        label[:, :, 3:5*B:5] = sqrt_w
        label[:, :, 4:5*B:5] = sqrt_h
        label[:, :, 5*B:] = class_probs
        per_sample_labels.append(label)

    data = np.asarray(images)
    if per_sample_labels:
        labels = np.stack(per_sample_labels, axis=0)
    else:
        # Keep the documented rank even when there are no samples.
        labels = np.empty((0, S, S, depth))
    return data, labels

# Build the image array and the matching [N, cell, cell, B * 5 + C] label array.
data, labels = create_data_label(raw_data)

In [15]:
#[batch, cell, cell, B * 5 + C]
def customloss(outputs, targets, num_boxes=None):
    """Sum-of-squares detection loss over a grid of box and class predictions.

    Both tensors have shape [batch, rows, cols, B * 5 + C]. For box slot k the
    entries at 5k .. 5k+4 are confidence, x offset in cell, y offset in cell,
    sqrt(width), sqrt(height); the last C entries are class scores.

    Per sample the loss is the sum of:
      * 5   * obj   * squared error of absolute centre and sqrt width/height
      * 1   * obj   * (IoU * predicted confidence - 1)^2
      * 0.5 * noobj * (predicted confidence)^2
      * obj-per-cell * squared error of the class scores
    where `obj` is the target confidence entry of a slot, `noobj = 1 - obj`,
    and obj-per-cell is the target confidence of slot 0. The result is the
    mean over the batch.

    Args:
        outputs: network predictions.
        targets: labels with the same layout.
        num_boxes: B; defaults to the module-level `bbox`.

    Returns:
        A scalar tensor.
    """
    B = bbox if num_boxes is None else num_boxes
    rows, cols = outputs.shape[1], outputs.shape[2]

    #[batch, cell, cell, C]
    output_class_probs = outputs[..., 5*B:]
    target_class_probs = targets[..., 5*B:]

    #[batch, cell, cell, B]
    output_conf = outputs[..., 0:5*B:5]
    output_xc_normed = outputs[..., 1:5*B:5]
    output_yc_normed = outputs[..., 2:5*B:5]
    output_sqrt_w = outputs[..., 3:5*B:5]
    output_sqrt_h = outputs[..., 4:5*B:5]
    output_w = output_sqrt_w ** 2
    output_h = output_sqrt_h ** 2
    target_xc_normed = targets[..., 1:5*B:5]
    target_yc_normed = targets[..., 2:5*B:5]
    target_sqrt_w = targets[..., 3:5*B:5]
    target_sqrt_h = targets[..., 4:5*B:5]
    target_w = target_sqrt_w ** 2
    target_h = target_sqrt_h ** 2

    # Convert in-cell offsets to image-relative centres by adding the cell
    # index (x varies along dim 2, y along dim 1) and dividing by grid size.
    col_index = torch.arange(cols, dtype=outputs.dtype, device=outputs.device).view(1, 1, cols, 1)
    row_index = torch.arange(rows, dtype=outputs.dtype, device=outputs.device).view(1, rows, 1, 1)
    output_xc = (col_index + output_xc_normed) / cols
    output_yc = (row_index + output_yc_normed) / rows
    target_xc = (col_index + target_xc_normed) / cols
    target_yc = (row_index + target_yc_normed) / rows

    # Overlap along each axis; clamp each side separately so two negative
    # (non-overlapping) extents cannot multiply into a positive area.
    dx = (torch.minimum(output_xc + output_w / 2, target_xc + target_w / 2)
          - torch.maximum(output_xc - output_w / 2, target_xc - target_w / 2)).clamp(min=0)
    dy = (torch.minimum(output_yc + output_h / 2, target_yc + target_h / 2)
          - torch.maximum(output_yc - output_h / 2, target_yc - target_h / 2)).clamp(min=0)
    intersection = dx * dy
    union = output_w * output_h + target_w * target_h - intersection
    # Floor the denominator: empty slots have zero-area boxes and 0 / 0 would
    # otherwise poison the whole loss with NaN.
    IoU = intersection / union.clamp(min=1e-9)
    output_confs = IoU * output_conf

    indicator_obj_perbbox = targets[..., 0:5*B:5]
    indicator_noobj_perbbox = 1 - indicator_obj_perbbox
    indicator_obj_percell = targets[..., 0]
    lambda_coord = 5
    lambda_noobj = 0.5

    coord_loss = lambda_coord * indicator_obj_perbbox * (
        (output_xc - target_xc) ** 2 + (output_yc - target_yc) ** 2
        + (output_sqrt_w - target_sqrt_w) ** 2 + (output_sqrt_h - target_sqrt_h) ** 2)
    obj_conf_loss = indicator_obj_perbbox * (output_confs - 1) ** 2
    # Slots without an object are pushed towards zero confidence. The raw
    # confidence is used because IoU against an empty target is always 0.
    noobj_conf_loss = lambda_noobj * indicator_noobj_perbbox * output_conf ** 2
    box_loss = torch.sum(coord_loss + obj_conf_loss + noobj_conf_loss, dim=(1, 2, 3))

    class_loss = torch.sum(
        indicator_obj_percell * torch.sum((output_class_probs - target_class_probs) ** 2, dim=-1),
        dim=(1, 2))
    return torch.mean(box_loss + class_loss)

In [8]:
class Net(nn.Module):
    """Convolutional detector: 24 conv layers followed by two linear layers.

    Layers are registered as `conv1` .. `conv24`, `pool`, `fc1`, `fc2`, in that
    order. Every conv and `fc1` is followed by leaky ReLU (slope 0.1); `fc2` is
    followed by ReLU. 2x2 max pooling is applied after conv1, conv2, conv6 and
    conv16. The flatten step assumes the conv stack ends with a 1024 x 7 x 7
    feature map (which is what 448 x 448 inputs produce), and the output is
    reshaped to [batch, 7, 7, 30].
    """

    def __init__(self):
        super(Net, self).__init__()

        same = dict(padding=1)  # keeps spatial size for a 3x3 kernel
        # (in_channels, out_channels, kernel_size, extra Conv2d kwargs) in
        # execution order; entry k becomes attribute conv{k}.
        conv_specs = [
            (3, 64, 7, dict(stride=2, padding=3)),      # conv1
            (64, 192, 3, {}),                           # conv2 (unpadded)
            (192, 128, 1, {}),                          # conv3
            (128, 256, 3, same),                        # conv4
            (256, 256, 1, {}),                          # conv5
            (256, 512, 3, same),                        # conv6
            (512, 256, 1, {}),                          # conv7
            (256, 512, 3, same),                        # conv8
            (512, 256, 1, {}),                          # conv9
            (256, 512, 3, same),                        # conv10
            (512, 256, 1, {}),                          # conv11
            (256, 512, 3, same),                        # conv12
            (512, 256, 1, {}),                          # conv13
            (256, 512, 3, same),                        # conv14
            (512, 512, 1, {}),                          # conv15
            (512, 1024, 3, same),                       # conv16
            (1024, 512, 1, {}),                         # conv17
            (512, 1024, 3, same),                       # conv18
            (1024, 512, 1, {}),                         # conv19
            (512, 1024, 3, same),                       # conv20
            (1024, 1024, 3, same),                      # conv21
            (1024, 1024, 3, dict(stride=2, padding=1)), # conv22
            (1024, 1024, 3, same),                      # conv23
            (1024, 1024, 3, same),                      # conv24
        ]
        for number, (c_in, c_out, kernel, extra) in enumerate(conv_specs, start=1):
            setattr(self, 'conv%d' % number, nn.Conv2d(c_in, c_out, kernel, **extra))

        self.pool = nn.MaxPool2d(2, stride=2)
        self.fc1 = nn.Linear(7 * 7 * 1024, 4096)
        self.fc2 = nn.Linear(4096, 7 * 7 * 30)

    def forward(self, x):
        # Conv layers after which the shared 2x2 max pool is applied.
        pooled_after = (1, 2, 6, 16)
        for number in range(1, 25):
            conv = getattr(self, 'conv%d' % number)
            x = F.leaky_relu(conv(x), 0.1)
            if number in pooled_after:
                x = self.pool(x)

        flat = x.reshape(-1, 7 * 7 * 1024)
        hidden = F.leaky_relu(self.fc1(flat), 0.1)
        out = F.relu(self.fc2(hidden))
        return out.view(-1, 7, 7, 30)

In [9]:
class MyDataset(torch.utils.data.Dataset):
    """In-memory dataset over a pair of NumPy arrays.

    `data` is indexed as `data[0]` (samples) and `data[1]` (labels); both are
    converted to float tensors once, at construction. An optional `transform`
    is applied to the sample (not the label) on every access.
    """

    def __init__(self, data, transform=None):
        samples, targets = data[0], data[1]
        self.transform = transform
        self.data = torch.from_numpy(samples).type(torch.float)
        self.label = torch.from_numpy(targets).type(torch.float)

    def __len__(self):
        # Number of samples along the first axis.
        return len(self.data)

    def __getitem__(self, idx):
        sample = self.data[idx]
        target = self.label[idx]
        if self.transform:
            sample = self.transform(sample)
        return sample, target

In [10]:
# Wrap the arrays in a dataset and serve them in fixed-order batches of 64,
# loading in the main process (num_workers=0).
dataset = MyDataset((data, labels), transform=None)
trainloader = torch.utils.data.DataLoader(
    dataset,
    batch_size=64,
    shuffle=False,
    num_workers=0,
)

In [11]:
# Sanity check: shape of the label tensor in the first batch.
train_iter = iter(trainloader)
# Use the builtin next(); the iterator's `.next()` method is a legacy alias
# that is not available on current DataLoader iterators.
next(train_iter)[1].shape

torch.Size([64, 7, 7, 30])

In [17]:
# Train on a GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

net = Net().to(device)
optimizer = optim.Adam(net.parameters(), lr=0.001)
epochs = 100
for epoch in range(epochs):
    running_loss = 0.0
    batch_count = 0
    # Batch variables get their own names: reusing `data` / `labels` here would
    # overwrite the notebook-level arrays that the dataset cell depends on.
    for batch_inputs, batch_labels in trainloader:
        batch_inputs = batch_inputs.to(device)
        batch_labels = batch_labels.to(device)

        optimizer.zero_grad()
        outputs = net(batch_inputs)
        loss = customloss(outputs, batch_labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        batch_count += 1

    # One line per epoch: mean batch loss (guarded against an empty loader).
    print(f'epoch: {epoch} loss: {running_loss / max(batch_count, 1)}')

RuntimeError: [enforce fail at ..\c10\core\CPUAllocator.cpp:73] data. DefaultCPUAllocator: not enough memory: you tried to allocate 37748736 bytes. Buy new RAM!