In [None]:
import os
import pdb
import tarfile
import types
import xml.etree.ElementTree as ET

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torchvision
from numpy import random
from torch import nn
from torch.autograd import Function
from torch.utils import data
from torchvision import transforms

In [None]:
def point_form(boxes):
    """Turn center-size boxes into corner (point-form) boxes.

    Args:
        boxes: (tensor) boxes stored as (cx, cy, w, h), Shape: [N, 4].
    Return:
        (tensor) the same boxes as (xmin, ymin, xmax, ymax), Shape: [N, 4].
    """
    centers = boxes[:, :2]
    half_extent = boxes[:, 2:]/2
    top_left = centers - half_extent      # xmin, ymin
    bottom_right = centers + half_extent  # xmax, ymax
    return torch.cat((top_left, bottom_right), 1)


def center_size(boxes):
    """Convert point-form boxes to center-size form.

    This is the inverse of ``point_form``.

    Args:
        boxes: (tensor) boxes stored as (xmin, ymin, xmax, ymax), Shape: [N, 4].
    Return:
        (tensor) the same boxes as (cx, cy, w, h), Shape: [N, 4].
    """
    # torch.cat takes ONE sequence of tensors followed by the dim; the two
    # halves must therefore live inside the same tuple.
    return torch.cat(((boxes[:, 2:] + boxes[:, :2]) / 2,  # cx, cy
                      boxes[:, 2:] - boxes[:, :2]), 1)    # w, h


def intersect(box_a, box_b):
    """Pairwise intersection area between two sets of point-form boxes.

    Both corner tensors are viewed as [A,B,2] through unsqueeze + expand
    (no copy is made), then the overlapping rectangle of every (a, b) pair
    is measured.

    Args:
      box_a: (tensor) bounding boxes, Shape: [A,4].
      box_b: (tensor) bounding boxes, Shape: [B,4].
    Return:
      (tensor) intersection area, Shape: [A,B].
    """
    num_a, num_b = box_a.size(0), box_b.size(0)
    pair_shape = (num_a, num_b, 2)

    a_lower = box_a[:, :2].unsqueeze(1).expand(*pair_shape)
    b_lower = box_b[:, :2].unsqueeze(0).expand(*pair_shape)
    a_upper = box_a[:, 2:].unsqueeze(1).expand(*pair_shape)
    b_upper = box_b[:, 2:].unsqueeze(0).expand(*pair_shape)

    # overlap rectangle: smallest upper corner minus largest lower corner
    upper = torch.min(a_upper, b_upper)
    lower = torch.max(a_lower, b_lower)
    # disjoint pairs give a negative extent -> clamp to zero area
    extent = torch.clamp(upper - lower, min=0)
    return extent[:, :, 0] * extent[:, :, 1]


def jaccard(box_a, box_b):
    """Pairwise jaccard overlap (intersection over union) of two box sets.

    For every pair:
        IoU = A ∩ B / (area(A) + area(B) - A ∩ B)

    Args:
        box_a: (tensor) Ground truth bounding boxes, Shape: [num_objects,4]
        box_b: (tensor) Prior boxes from priorbox layers, Shape: [num_priors,4]
    Return:
        jaccard overlap: (tensor) Shape: [box_a.size(0), box_b.size(0)]
    """
    inter = intersect(box_a, box_b)

    width_a = box_a[:, 2]-box_a[:, 0]
    height_a = box_a[:, 3]-box_a[:, 1]
    width_b = box_b[:, 2]-box_b[:, 0]
    height_b = box_b[:, 3]-box_b[:, 1]

    # broadcast the per-box areas to the [A,B] pair grid
    area_a = (width_a * height_a).unsqueeze(1).expand_as(inter)
    area_b = (width_b * height_b).unsqueeze(0).expand_as(inter)

    return inter / (area_a + area_b - inter)  # [A,B]


def match(threshold, truths, priors, variances, labels, loc_t, conf_t, idx):
    """Match each prior box with the ground truth box of the highest jaccard
    overlap, encode the bounding boxes, and store the resulting targets in
    row `idx` of `loc_t` and `conf_t`.
    Args:
        threshold: (float) The overlap threshold used when matching boxes.
        truths: (tensor) Ground truth boxes in point form, Shape: [num_obj, 4].
        priors: (tensor) Prior boxes from priorbox layers, Shape: [n_priors,4].
            Passed through point_form(), i.e. expected in center-size form.
        variances: Indexable pair of variances, forwarded unchanged to encode(),
            which reads variances[0] and variances[1].
        labels: (tensor) All the class labels for the image, Shape: [num_obj].
        loc_t: (tensor) Tensor to be filled w/ encoded location targets.
        conf_t: (tensor) Tensor to be filled w/ matched indices for conf preds.
        idx: (int) current batch index
    Return:
        None. The results are written in place: loc_t[idx] receives the encoded
        offsets and conf_t[idx] the class label of every prior (0 = background).
    """
    # jaccard index, Shape: [num_objects, num_priors]
    overlaps = jaccard(
        truths,
        point_form(priors)
    )
    # (Bipartite Matching)
    # [num_objects,1] best prior for each ground truth
    best_prior_overlap, best_prior_idx = overlaps.max(1, keepdim=True)
    # [1,num_priors] best ground truth for each prior
    best_truth_overlap, best_truth_idx = overlaps.max(0, keepdim=True)
    # drop the kept singleton dims in place -> [num_priors] / [num_objects]
    best_truth_idx.squeeze_(0)
    best_truth_overlap.squeeze_(0)
    best_prior_idx.squeeze_(1)
    best_prior_overlap.squeeze_(1)
    # overlap 2 is larger than any real IoU, so the best prior of every ground
    # truth always survives the threshold test below
    best_truth_overlap.index_fill_(0, best_prior_idx, 2)  # ensure best prior
    # TODO refactor: index  best_prior_idx with long tensor
    # ensure every gt matches with its prior of max overlap
    for j in range(best_prior_idx.size(0)):
        best_truth_idx[best_prior_idx[j]] = j
    matches = truths[best_truth_idx]          # Shape: [num_priors,4]
    # labels are shifted by one so that 0 can stand for background
    conf = labels[best_truth_idx] + 1         # Shape: [num_priors]
    conf[best_truth_overlap < threshold] = 0  # label as background
    loc = encode(matches, priors, variances)
    loc_t[idx] = loc    # [num_priors,4] encoded offsets to learn
    conf_t[idx] = conf  # [num_priors] top class label for each prior


def encode(matched, priors, variances):
    """Express matched ground-truth boxes as variance-scaled offsets from
    their prior boxes (the regression targets for the location loss).

    Args:
        matched: (tensor) Coords of ground truth for each prior in point-form
            Shape: [num_priors, 4].
        priors: (tensor) Prior boxes in center-offset form
            Shape: [num_priors,4].
        variances: (list[float]) Variances of priorboxes
    Return:
        encoded boxes (tensor), Shape: [num_priors, 4]
    """
    prior_centers = priors[:, :2]
    prior_sizes = priors[:, 2:]

    # center displacement, measured in (variance-scaled) prior sizes
    gt_centers = (matched[:, :2] + matched[:, 2:])/2
    center_target = (gt_centers - prior_centers) / (variances[0] * prior_sizes)

    # log of the size ratio ground-truth / prior, scaled by the variance
    size_ratio = (matched[:, 2:] - matched[:, :2]) / prior_sizes
    size_target = torch.log(size_ratio) / variances[1]

    # target for smooth_l1_loss
    return torch.cat([center_target, size_target], 1)  # [num_priors,4]


# Adapted from https://github.com/Hakuyume/chainer-ssd
def decode(loc, priors, variances):
    """Invert the train-time offset encoding: turn predicted offsets plus
    priors back into point-form boxes.

    Args:
        loc (tensor): location predictions for loc layers,
            Shape: [num_priors,4]
        priors (tensor): Prior boxes in center-offset form.
            Shape: [num_priors,4].
        variances: (list[float]) Variances of priorboxes
    Return:
        decoded bounding box predictions
    """
    prior_centers = priors[:, :2]
    prior_sizes = priors[:, 2:]

    centers = prior_centers + loc[:, :2] * variances[0] * prior_sizes
    sizes = prior_sizes * torch.exp(loc[:, 2:] * variances[1])

    # center-size -> corners: min = center - size/2, max = min + size
    mins = centers - sizes / 2
    maxs = sizes + mins
    return torch.cat((mins, maxs), 1)


def log_sum_exp(x):
    """Row-wise log-sum-exp, used to compute the unaveraged confidence loss
    across all examples in a batch.

    The global maximum of `x` is subtracted before exponentiating and added
    back afterwards.

    Args:
        x (Variable(tensor)): conf_preds from conf layers
    Return:
        (tensor) log(sum(exp(x), dim=1)) with the summed dim kept (size 1).
    """
    peak = x.data.max()
    exp_sum = torch.exp(x - peak).sum(1, keepdim=True)
    return torch.log(exp_sum) + peak


# Original author: Francisco Massa:
# https://github.com/fmassa/object-detection.torch
# Ported to PyTorch by Max deGroot (02/01/2017)
def nms(boxes, scores, overlap=0.5, top_k=200):
    """Apply non-maximum suppression at test time to avoid detecting too many
    overlapping bounding boxes for a given object.
    Args:
        boxes: (tensor) The location preds for the img, Shape: [num_priors,4].
        scores: (tensor) The class predscores for the img, Shape:[num_priors].
        overlap: (float) The overlap thresh for suppressing unnecessary boxes.
        top_k: (int) The Maximum number of box preds to consider.
    Return:
        (keep, count): `keep` is a long tensor of length scores.size(0) whose
        first `count` entries are the indices (with respect to num_priors) of
        the kept boxes in descending score order; the remaining entries are 0.
        The same 2-tuple is returned for empty input (with count == 0).
    """
    keep = torch.zeros(scores.size(0), dtype=torch.long, device=scores.device)
    if boxes.numel() == 0:
        # keep the return shape identical to the normal path so callers can
        # always unpack `ids, count = nms(...)`
        return keep, 0

    x1 = boxes[:, 0]
    y1 = boxes[:, 1]
    x2 = boxes[:, 2]
    y2 = boxes[:, 3]
    area = torch.mul(x2 - x1, y2 - y1)

    _, idx = scores.sort(0)  # sort in ascending order
    idx = idx[-top_k:]       # indices of the top-k largest vals

    count = 0
    while idx.numel() > 0:
        i = idx[-1]  # index of current largest val
        keep[count] = i
        count += 1
        if idx.size(0) == 1:
            break
        idx = idx[:-1]  # remove kept element from view

        # corners of the intersection between box i and every remaining box
        # (fresh tensors each round; no `out=` buffers that must be resized)
        xx1 = torch.clamp(torch.index_select(x1, 0, idx), min=x1[i])
        yy1 = torch.clamp(torch.index_select(y1, 0, idx), min=y1[i])
        xx2 = torch.clamp(torch.index_select(x2, 0, idx), max=x2[i])
        yy2 = torch.clamp(torch.index_select(y2, 0, idx), max=y2[i])

        # disjoint boxes yield negative extents -> zero intersection
        w = torch.clamp(xx2 - xx1, min=0.0)
        h = torch.clamp(yy2 - yy1, min=0.0)
        inter = w*h

        # IoU = i / (area(a) + area(b) - i)
        rem_areas = torch.index_select(area, 0, idx)  # load remaining areas
        union = (rem_areas - inter) + area[i]
        IoU = inter/union

        # keep only elements with an IoU <= overlap
        idx = idx[IoU.le(overlap)]
    return keep, count

In [None]:


def intersect_numpy(box_a, box_b):
    """Intersection area between each box in `box_a` and the single `box_b`.

    Args:
        box_a: Multiple point-form boxes, Shape: [num_boxes,4]
        box_b: Single point-form box, Shape: [4]
    Return:
        intersection areas, Shape: [num_boxes]
    """
    lower = np.maximum(box_a[:, :2], box_b[:2])
    upper = np.minimum(box_a[:, 2:], box_b[2:])
    # negative extents mean "no overlap" -> clip them to zero
    extent = np.clip(upper - lower, a_min=0, a_max=np.inf)
    widths, heights = extent[:, 0], extent[:, 1]
    return widths * heights


def jaccard_numpy(box_a, box_b):
    """Jaccard overlap (intersection over union) of several boxes against
    one reference box.

        IoU = A ∩ B / (area(A) + area(B) - A ∩ B)

    Args:
        box_a: Multiple bounding boxes, Shape: [num_boxes,4]
        box_b: Single bounding box, Shape: [4]
    Return:
        jaccard overlap: Shape: [num_boxes]
    """
    inter = intersect_numpy(box_a, box_b)

    widths_a = box_a[:, 2]-box_a[:, 0]
    heights_a = box_a[:, 3]-box_a[:, 1]
    area_a = widths_a * heights_a                      # [num_boxes]
    area_b = (box_b[2]-box_b[0]) * (box_b[3]-box_b[1])  # scalar

    return inter / (area_a + area_b - inter)


class Compose(object):
    """Chain several augmentations into a single callable.

    Every transform is called as ``t(img, boxes, labels)`` and must return
    the (possibly modified) triple, which is fed to the next transform.

    Args:
        transforms (List[Transform]): list of transforms to compose.
    Example:
        >>> augmentations.Compose([
        >>>     transforms.CenterCrop(10),
        >>>     transforms.ToTensor(),
        >>> ])
    """

    def __init__(self, transforms):
        self.transforms = transforms

    def __call__(self, img, boxes=None, labels=None):
        sample = (img, boxes, labels)
        for step in self.transforms:
            # unpacking enforces that each step returns exactly three values
            img, boxes, labels = step(*sample)
            sample = (img, boxes, labels)
        return sample


class Lambda(object):
    """Applies a lambda as a transform.

    Args:
        lambd (function): called as ``lambd(img, boxes, labels)``; its return
            value is returned unchanged by ``__call__``.
    Raises:
        AssertionError: if `lambd` is not a plain Python function / lambda.
    """

    def __init__(self, lambd):
        # relies on the stdlib `types` module being imported at file level
        # (types.LambdaType is the type of both lambdas and def-functions)
        assert isinstance(lambd, types.LambdaType), \
            "lambd must be a function or lambda."
        self.lambd = lambd

    def __call__(self, img, boxes=None, labels=None):
        return self.lambd(img, boxes, labels)


class ConvertFromInts(object):
    """Cast the image to float32; boxes and labels pass through untouched."""

    def __call__(self, image, boxes=None, labels=None):
        as_float = image.astype(np.float32)
        return as_float, boxes, labels


class SubtractMeans(object):
    """Subtract a fixed per-channel mean from the image.

    Args:
        mean: per-channel mean values, stored as a float32 array.
    """

    def __init__(self, mean):
        self.mean = np.array(mean, dtype=np.float32)

    def __call__(self, image, boxes=None, labels=None):
        # astype() copies, so the caller's image is left untouched
        centered = image.astype(np.float32)
        np.subtract(centered, self.mean, out=centered)
        return centered.astype(np.float32), boxes, labels


class ToAbsoluteCoords(object):
    """Scale fractional box coordinates up to pixel coordinates.

    The `boxes` array is modified in place: x columns (0, 2) are multiplied
    by the image width, y columns (1, 3) by the image height.
    """

    def __call__(self, image, boxes=None, labels=None):
        height, width, _ = image.shape
        boxes[:, [0, 2]] *= width
        boxes[:, [1, 3]] *= height
        return image, boxes, labels


class ToPercentCoords(object):
    """Scale pixel box coordinates down to fractions of the image size.

    The `boxes` array is modified in place: x columns (0, 2) are divided by
    the image width, y columns (1, 3) by the image height.
    """

    def __call__(self, image, boxes=None, labels=None):
        height, width, _ = image.shape
        boxes[:, [0, 2]] /= width
        boxes[:, [1, 3]] /= height
        return image, boxes, labels


class Resize(object):
    """Resize the image to a square of side `size` with cv2.resize.

    Boxes and labels are returned unchanged.

    Args:
        size (int): side length of the output image. Default: 300.
    """

    def __init__(self, size=300):
        self.size = size

    def __call__(self, image, boxes=None, labels=None):
        target = (self.size, self.size)
        resized = cv2.resize(image, target)
        return resized, boxes, labels


class RandomSaturation(object):
    """Randomly scale channel 1 of the image (applied with probability 1/2).

    In PhotometricDistort this runs on an HSV image, where channel 1 is the
    saturation. The image is modified in place and must be a float array.

    Args:
        lower (float): smallest scale factor (must be >= 0).
        upper (float): largest scale factor (must be >= lower).
    Raises:
        AssertionError: if the bounds are inconsistent.
    """

    def __init__(self, lower=0.5, upper=1.5):
        self.lower = lower
        self.upper = upper
        assert self.upper >= self.lower, "saturation upper must be >= lower."
        assert self.lower >= 0, "saturation lower must be non-negative."

    def __call__(self, image, boxes=None, labels=None):
        if random.randint(2):
            image[:, :, 1] *= random.uniform(self.lower, self.upper)

        return image, boxes, labels


class RandomHue(object):
    """Randomly shift channel 0 of the image (applied with probability 1/2).

    The shift is drawn uniformly from [-delta, delta]; values that leave the
    [0, 360] range are wrapped around by 360. The image is modified in place.

    Args:
        delta (float): maximum absolute shift, between 0 and 360.
    """

    def __init__(self, delta=18.0):
        assert delta >= 0.0 and delta <= 360.0
        self.delta = delta

    def __call__(self, image, boxes=None, labels=None):
        if not random.randint(2):
            return image, boxes, labels

        hue = image[:, :, 0]  # view: edits below write into `image`
        hue += random.uniform(-self.delta, self.delta)
        hue[hue > 360.0] -= 360.0
        hue[hue < 0.0] += 360.0
        return image, boxes, labels


class RandomLightingNoise(object):
    """Randomly permute the image channels (applied with probability 1/2).

    One of the six possible orderings of three channels is picked uniformly
    and applied through SwapChannels.
    """

    def __init__(self):
        self.perms = ((0, 1, 2), (0, 2, 1),
                      (1, 0, 2), (1, 2, 0),
                      (2, 0, 1), (2, 1, 0))

    def __call__(self, image, boxes=None, labels=None):
        if random.randint(2):
            order = self.perms[random.randint(len(self.perms))]
            image = SwapChannels(order)(image)  # shuffle channels
        return image, boxes, labels


class ConvertColor(object):
    """Convert the image between the BGR and HSV color spaces via cv2.

    Args:
        current (str): color space of the incoming image ('BGR' or 'HSV').
        transform (str): color space to convert to ('HSV' or 'BGR').
    Raises:
        NotImplementedError: when called with any other combination than
            BGR -> HSV or HSV -> BGR.
    """

    def __init__(self, current='BGR', transform='HSV'):
        self.transform = transform
        self.current = current

    def __call__(self, image, boxes=None, labels=None):
        conversion = (self.current, self.transform)
        if conversion == ('BGR', 'HSV'):
            converted = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
        elif conversion == ('HSV', 'BGR'):
            converted = cv2.cvtColor(image, cv2.COLOR_HSV2BGR)
        else:
            raise NotImplementedError
        return converted, boxes, labels


class RandomContrast(object):
    """Randomly scale all pixel values (applied with probability 1/2).

    The factor is drawn uniformly from [lower, upper] and multiplied into the
    image in place, so the image is expected to be a float array.

    Args:
        lower (float): smallest scale factor (must be >= 0).
        upper (float): largest scale factor (must be >= lower).
    """

    def __init__(self, lower=0.5, upper=1.5):
        self.lower = lower
        self.upper = upper
        assert self.upper >= self.lower, "contrast upper must be >= lower."
        assert self.lower >= 0, "contrast lower must be non-negative."

    # expects float image
    def __call__(self, image, boxes=None, labels=None):
        if not random.randint(2):
            return image, boxes, labels
        image *= random.uniform(self.lower, self.upper)
        return image, boxes, labels


class RandomBrightness(object):
    """Randomly add a constant to all pixel values (probability 1/2).

    The offset is drawn uniformly from [-delta, delta] and added to the image
    in place.

    Args:
        delta (float): maximum absolute offset, between 0 and 255.
    """

    def __init__(self, delta=32):
        assert delta >= 0.0
        assert delta <= 255.0
        self.delta = delta

    def __call__(self, image, boxes=None, labels=None):
        if not random.randint(2):
            return image, boxes, labels
        offset = random.uniform(-self.delta, self.delta)
        image += offset
        return image, boxes, labels


class ToCV2Image(object):
    """Convert a channel-first tensor into a channel-last float32 array."""

    def __call__(self, tensor, boxes=None, labels=None):
        channel_first = tensor.cpu().numpy().astype(np.float32)
        channel_last = channel_first.transpose((1, 2, 0))  # CHW -> HWC
        return channel_last, boxes, labels


class ToTensor(object):
    """Convert a channel-last array into a channel-first float32 tensor."""

    def __call__(self, cvimage, boxes=None, labels=None):
        channel_last = torch.from_numpy(cvimage.astype(np.float32))
        channel_first = channel_last.permute(2, 0, 1)  # HWC -> CHW
        return channel_first, boxes, labels


class RandomSampleCrop(object):
    """Randomly crop the image under a jaccard-overlap constraint.

    A sampling mode is picked at random: keep the whole image, crop without
    constraint, or crop such that every ground-truth box overlaps the patch
    with at least the mode's minimum jaccard overlap. Only boxes whose center
    lies inside the patch are kept; they are clipped to the patch and shifted
    into its coordinate frame.

    Arguments:
        img (ndarray): the image being input during training, Shape: [H,W,C]
        boxes (ndarray): the original bounding boxes in pt form (pixels)
        labels (ndarray): the class labels for each bbox
        mode (float tuple): the min and max jaccard overlaps
    Return:
        (img, boxes, classes)
            img (ndarray): the cropped image
            boxes (ndarray): the adjusted bounding boxes in pt form
            labels (ndarray): the class labels for each bbox
    """
    def __init__(self):
        self.sample_options = (
            # using entire original input image
            None,
            # sample a patch s.t. MIN jaccard w/ obj in .1,.3,.7,.9
            (0.1, None),
            (0.3, None),
            (0.7, None),
            (0.9, None),
            # randomly sample a patch
            (None, None),
        )

    def __call__(self, image, boxes=None, labels=None):
        height, width, _ = image.shape
        while True:
            # randomly choose a mode; index explicitly because
            # np.random.choice cannot build an array from this ragged tuple
            mode = self.sample_options[random.randint(len(self.sample_options))]
            if mode is None:
                return image, boxes, labels

            min_iou, max_iou = mode
            if min_iou is None:
                min_iou = float('-inf')
            if max_iou is None:
                max_iou = float('inf')

            # max trials (50)
            for _ in range(50):
                current_image = image

                w = random.uniform(0.3 * width, width)
                h = random.uniform(0.3 * height, height)

                # aspect ratio constraint b/t .5 & 2
                if h / w < 0.5 or h / w > 2:
                    continue

                # np.random.uniform(x) would set `low=x, high=1.0`; give both
                # bounds so the patch can start anywhere in [0, size - patch]
                left = random.uniform(0, width - w)
                top = random.uniform(0, height - h)

                # convert to integer rect x1,y1,x2,y2
                rect = np.array([int(left), int(top), int(left+w), int(top+h)])

                # calculate IoU (jaccard overlap) b/t the cropped and gt boxes
                overlap = jaccard_numpy(boxes, rect)

                # reject the patch when EITHER bound is violated (with `and`
                # and max_iou == inf the test could never reject anything)
                if overlap.min() < min_iou or max_iou < overlap.max():
                    continue

                # cut the crop from the image
                current_image = current_image[rect[1]:rect[3], rect[0]:rect[2],
                                              :]

                # keep overlap with gt box IF center in sampled patch
                centers = (boxes[:, :2] + boxes[:, 2:]) / 2.0

                # centers to the right of / below the patch's top-left corner
                m1 = (rect[0] < centers[:, 0]) * (rect[1] < centers[:, 1])

                # centers to the left of / above the patch's bottom-right corner
                m2 = (rect[2] > centers[:, 0]) * (rect[3] > centers[:, 1])

                # center is inside the patch when both hold
                mask = m1 * m2

                # have any valid boxes? try again if not
                if not mask.any():
                    continue

                # take only matching gt boxes
                current_boxes = boxes[mask, :].copy()

                # take only matching gt labels
                current_labels = labels[mask]

                # clip the top-left corner to the patch ...
                current_boxes[:, :2] = np.maximum(current_boxes[:, :2],
                                                  rect[:2])
                # ... and move it into patch coordinates
                current_boxes[:, :2] -= rect[:2]

                # same for the bottom-right corner
                current_boxes[:, 2:] = np.minimum(current_boxes[:, 2:],
                                                  rect[2:])
                current_boxes[:, 2:] -= rect[:2]

                return current_image, current_boxes, current_labels


class Expand(object):
    """Randomly place the image on a larger, mean-filled canvas (zoom out).

    With probability 1/2 the sample is returned unchanged. Otherwise the
    canvas is `ratio` times larger (ratio drawn uniformly from [1, 4]), the
    image is pasted at a random offset and a shifted copy of the boxes is
    returned.

    Args:
        mean: fill value(s) for the canvas, broadcast over the channel axis.
    """

    def __init__(self, mean):
        self.mean = mean

    def __call__(self, image, boxes, labels):
        if random.randint(2):
            return image, boxes, labels

        height, width, depth = image.shape
        ratio = random.uniform(1, 4)
        left = random.uniform(0, width*ratio - width)
        top = random.uniform(0, height*ratio - height)
        x0, y0 = int(left), int(top)

        canvas = np.zeros((int(height*ratio), int(width*ratio), depth),
                          dtype=image.dtype)
        canvas[:, :, :] = self.mean
        canvas[y0:int(top + height), x0:int(left + width)] = image

        # work on a copy so the caller's boxes stay untouched
        shifted = boxes.copy()
        offset = (x0, y0)
        shifted[:, :2] += offset
        shifted[:, 2:] += offset

        return canvas, shifted, labels


class RandomMirror(object):
    """Flip the image horizontally with probability 1/2.

    When flipped, a copy of the boxes is returned whose x coordinates are
    mirrored (new xmin = width - old xmax, new xmax = width - old xmin).
    """

    def __call__(self, image, boxes, classes):
        _, width, _ = image.shape
        if not random.randint(2):
            return image, boxes, classes

        mirrored_image = image[:, ::-1]
        mirrored_boxes = boxes.copy()
        # columns (2, 0) -> (0, 2): swap xmin/xmax while mirroring
        mirrored_boxes[:, 0::2] = width - mirrored_boxes[:, 2::-2]
        return mirrored_image, mirrored_boxes, classes


class SwapChannels(object):
    """Reorder the channels (last axis) of an image.

    Args:
        swaps (int triple): final order of channels
            eg: (2, 1, 0)
    """

    def __init__(self, swaps):
        self.swaps = swaps

    def __call__(self, image):
        """
        Args:
            image: image indexed as [row, column, channel]
        Return:
            the image with its channels reordered according to `swaps`
        """
        reordered = image[:, :, self.swaps]
        return reordered


class PhotometricDistort(object):
    """Random brightness, contrast, saturation, hue and channel-order jitter.

    Works on a copy of the image. Brightness is applied first; contrast is
    then applied either before or after the HSV saturation/hue jitter (chosen
    at random); a random channel permutation comes last.
    """

    def __init__(self):
        self.pd = [
            RandomContrast(),
            ConvertColor(transform='HSV'),
            RandomSaturation(),
            RandomHue(),
            ConvertColor(current='HSV', transform='BGR'),
            RandomContrast()
        ]
        self.rand_brightness = RandomBrightness()
        self.rand_light_noise = RandomLightingNoise()

    def __call__(self, image, boxes, labels):
        working = image.copy()
        working, boxes, labels = self.rand_brightness(working, boxes, labels)
        # drop the trailing contrast (contrast first) or the leading one
        # (contrast last)
        stages = self.pd[:-1] if random.randint(2) else self.pd[1:]
        working, boxes, labels = Compose(stages)(working, boxes, labels)
        return self.rand_light_noise(working, boxes, labels)


class SSDAugmentation(object):
    """Full training-time augmentation pipeline.

    Args:
        size (int): side length of the final square image. Default: 300.
        mean (tuple): per-channel mean used for canvas filling and for the
            final mean subtraction. Default: (104, 117, 123).
    """

    def __init__(self, size=300, mean=(104, 117, 123)):
        self.mean = mean
        self.size = size
        pipeline = [
            ConvertFromInts(),
            ToAbsoluteCoords(),        # geometric steps work in pixels
            PhotometricDistort(),
            Expand(self.mean),
            RandomSampleCrop(),
            RandomMirror(),
            ToPercentCoords(),         # back to fractional coordinates
            Resize(self.size),
            SubtractMeans(self.mean),
        ]
        self.augment = Compose(pipeline)

    def __call__(self, img, boxes, labels):
        return self.augment(img, boxes, labels)

In [None]:
# Download the PASCAL VOC2012 train/val archive (about 2 GB) into the current
# working directory.
torchvision.datasets.utils.download_url('http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar',os.getcwd())


Downloading http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar to /content/VOCtrainval_11-May-2012.tar


  0%|          | 0/1999639040 [00:00<?, ?it/s]

In [None]:
# Open the archive where the download cell stored it (the current working
# directory) instead of a hard-coded Colab path, then unpack it in place.
# NOTE(review): extractall() trusts the member paths inside the archive; only
# use it on archives from a trusted source.
with tarfile.open(os.path.join(os.getcwd(), 'VOCtrainval_11-May-2012.tar')) as tar:
    tar.extractall(path = '.')

In [None]:
def make_datapath_list(rootpath):
    """Build the image / annotation path lists for the train and val splits.

    The image ids are read from ``<rootpath>/ImageSets/Main/train.txt`` and
    ``.../val.txt`` (one id per line, blank lines are ignored).

    Args:
        rootpath (str): dataset root containing the ``JPEGImages``,
            ``Annotations`` and ``ImageSets`` folders. A trailing path
            separator is optional.
    Return:
        (train_img_list, train_anno_list, val_img_list, val_anno_list):
        four lists of path strings; image and annotation lists of a split
        are aligned index by index.
    Raises:
        OSError: if a split file cannot be opened.
    """
    def _paths_for(split):
        # Return (image paths, annotation paths) for one split file.
        id_file = os.path.join(rootpath, 'ImageSets', 'Main', split + '.txt')
        img_list = []
        anno_list = []
        with open(id_file) as id_lines:
            for line in id_lines:
                file_id = line.strip()
                if not file_id:
                    continue
                img_list.append(
                    os.path.join(rootpath, 'JPEGImages', file_id + '.jpg'))
                anno_list.append(
                    os.path.join(rootpath, 'Annotations', file_id + '.xml'))
        return img_list, anno_list

    train_img_list, train_anno_list = _paths_for('train')
    val_img_list, val_anno_list = _paths_for('val')

    return train_img_list, train_anno_list, val_img_list, val_anno_list


In [None]:
class Anno_xml2list(object):
    """Convert a VOC-style XML annotation file into an array of boxes.

    Args:
        classes (list[str]): class names; the position of a name in this
            list is used as its label index.
    """

    def __init__(self,classes):
        self.classes = classes

    def __call__(self, xml_path, width, height):
        """Parse one annotation file.

        Args:
            xml_path (str): path of the XML annotation file.
            width (int): image width, used to normalize x coordinates.
            height (int): image height, used to normalize y coordinates.
        Return:
            ndarray of shape [num_objects, 5]; each row is
            [xmin, ymin, xmax, ymax, label_idx] with coordinates divided by
            the image size. Objects flagged as difficult are skipped. The
            shape is [0, 5] when no object remains.
        Raises:
            ValueError: if an object name is not in `classes`.
        """
        ret = []
        xml = ET.parse(xml_path).getroot()
        for obj in xml.iter('object'):
            difficult = int(obj.find('difficult').text)
            if difficult == 1:
                continue

            name = obj.find('name').text.lower().strip()
            bbox = obj.find('bndbox')

            bndbox = []
            for pt in ('xmin', 'ymin', 'xmax', 'ymax'):
                # shift by one: the file's pixel coordinates are 1-based
                cur_pixel = int(bbox.find(pt).text) - 1
                if pt in ('xmin', 'xmax'):
                    cur_pixel /= width
                else:
                    cur_pixel /= height
                bndbox.append(cur_pixel)

            # must happen once per object, i.e. inside the loop
            bndbox.append(self.classes.index(name))
            ret.append(bndbox)

        # reshape keeps a 2-D [N, 5] result even when N == 0
        return np.array(ret, dtype=np.float64).reshape(-1, 5)

In [None]:
class DataTransform():
    """Phase-dependent preprocessing: full augmentation for 'train', only
    float conversion, resizing and mean subtraction for 'val'.

    Args:
        input_size (int): side length of the square output image.
        color_mean: per-channel mean, used as canvas fill during expansion
            and subtracted from the final image.
    """

    def __init__(self, input_size, color_mean):
        train_steps = [
            ConvertFromInts(),
            ToAbsoluteCoords(),
            PhotometricDistort(),
            Expand(color_mean),
            RandomSampleCrop(),
            RandomMirror(),
            ToPercentCoords(),
            Resize(input_size),
            SubtractMeans(color_mean),
        ]
        val_steps = [
            ConvertFromInts(),
            Resize(input_size),
            SubtractMeans(color_mean),
        ]
        self.data_transform = {
            'train': Compose(train_steps),
            'val': Compose(val_steps),
        }

    def __call__(self, img, phase, boxes, labels):
        pipeline = self.data_transform[phase]
        return pipeline(img, boxes, labels)

In [None]:


# image_file_path = train_img_list[0]
# img = cv2.imread(image_file_path)
# h, w, c = img.shape
# transform_anno = Anno_xml2list(voc_classes)
# anno_list = transform_anno(train_anno_list[0],w,h)
# plt.imshow(cv2.cvtColor(img,cv2.COLOR_BGR2RGB))

# plt.show()


# transform = DataTransform(input_size, color_mean)

# phase = 'train'
# img_transformed, boxes, labels= transform(img, phase, anno_list[:,:4], anno_list[:,4])
# plt.imshow(cv2.cvtColor(img_transformed, cv2.COLOR_BGR2RGB))
# plt.show()

# phase = 'val'
# img_transformed, boxes, labels= transform(img, phase, anno_list[:,:4], anno_list[:,4])
# plt.imshow(cv2.cvtColor(img_transformed, cv2.COLOR_BGR2RGB))
# plt.show()



In [None]:
class VOCDataset(data.Dataset):
    """Dataset pairing images with their box annotations.

    Args:
        img_list (list[str]): image file paths.
        anno_list (list[str]): annotation file paths, aligned with img_list.
        phase (str): phase key handed to `transform` (e.g. 'train' / 'val').
        transform: callable ``(img, phase, boxes, labels)`` returning
            ``(img, boxes, labels)``.
        transform_anno: callable ``(xml_path, width, height)`` returning an
            array whose first four columns are the box and whose fifth
            column is the label.
    """

    def __init__(self,img_list, anno_list, phase, transform, transform_anno):
        self.img_list = img_list
        self.anno_list = anno_list
        self.phase = phase
        self.transform = transform
        self.transform_anno = transform_anno

    def __len__(self):
        return len(self.img_list)

    def __getitem__(self,index):
        """Return (image tensor, ground-truth array) for one sample."""
        image, target, _height, _width = self.pull_item(index)
        return image, target

    def pull_item(self, index):
        """Load, annotate and transform one sample.

        Return:
            (img, gt, h, w): channel-first image tensor with the channel
            order reversed, [num_objects, 5] array of boxes + label, and the
            height and width of the image as read from disk.
        """
        img = cv2.imread(self.img_list[index])
        h, w, c = img.shape

        annotations = self.transform_anno(self.anno_list[index], w, h)
        raw_boxes = annotations[:, :4]
        raw_labels = annotations[:, 4]

        img, boxes, labels = self.transform(img, self.phase, raw_boxes, raw_labels)

        # reverse the channel order, then HWC -> CHW
        reordered = img[:, :, (2, 1, 0)]
        img = torch.from_numpy(reordered).permute(2, 0, 1)

        label_column = np.expand_dims(labels, axis=1)
        gt = np.hstack((boxes, label_column))

        return img, gt, h, w


In [None]:


# val_dataset.__getitem__(1)

In [None]:
def od_collate_fn(batch):
    """Collate samples whose annotation count differs from image to image.

    Args:
        batch: iterable of (image tensor, annotations) samples.
    Return:
        (imgs, targets): `imgs` is the images stacked along a new first
        dimension; `targets` is a list with one FloatTensor of annotations
        per sample.
    """
    images, targets = [], []
    for image, annotations in ((sample[0], sample[1]) for sample in batch):
        images.append(image)
        targets.append(torch.FloatTensor(annotations))

    return torch.stack(images, dim=0), targets

In [None]:


# batch_iterator = iter(dataloaders_dict['val'])
# img, tar = next(batch_iterator)
# print(img.size(), len(tar), tar[1])

In [None]:
def make_vgg():
    """Build the VGG-style backbone as a flat nn.ModuleList.

    The configuration lists conv output channels; 'M' inserts a 2x2 max-pool
    and 'MC' the same pool with ceil_mode=True. Every conv is followed by an
    in-place ReLU. A 3x3 stride-1 pool, a dilated 3x3 conv (conv6) and a 1x1
    conv (conv7), each conv again with ReLU, are appended at the end.

    Return:
        nn.ModuleList with 35 modules.
    """
    cfg = [64,64,'M',128,128,'M',256,256,256,'MC',512,512,512,'M',512,512,512]

    modules = []
    channels_in = 3
    for entry in cfg:
        if entry == 'M':
            modules.append(nn.MaxPool2d(kernel_size=2, stride=2))
        elif entry == 'MC':
            modules.append(nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True))
        else:
            modules.append(nn.Conv2d(channels_in, entry, kernel_size=3, padding=1))
            modules.append(nn.ReLU(inplace=True))
            channels_in = entry

    pool5 = nn.MaxPool2d(kernel_size=3, stride=1, padding=1)
    conv6 = nn.Conv2d(512, 1024, kernel_size=3, padding=6, dilation=6)
    conv7 = nn.Conv2d(1024, 1024, kernel_size=1)
    modules.extend([pool5,
                    conv6, nn.ReLU(inplace=True),
                    conv7, nn.ReLU(inplace=True)])
    return nn.ModuleList(modules)

# Smoke test: build the backbone and list the names of its parameters.
vgg_test = make_vgg()
for n, p in vgg_test.state_dict().items():
    print(n) 
# print(vgg_test) 
# state_dict = vgg_test.state_dict()
# state_dict['31.weight']

0.weight
0.bias
2.weight
2.bias
5.weight
5.bias
7.weight
7.bias
10.weight
10.bias
12.weight
12.bias
14.weight
14.bias
17.weight
17.bias
19.weight
19.bias
21.weight
21.bias
24.weight
24.bias
26.weight
26.bias
28.weight
28.bias
31.weight
31.bias
33.weight
33.bias


In [None]:
def make_extras():
    """Build the extra feature layers that follow the backbone.

    Eight convolutions alternating 1x1 and 3x3 kernels; the first two 3x3
    convs use stride 2 with padding 1, the last two use no padding.

    Return:
        nn.ModuleList with 8 nn.Conv2d modules (no activations included).
    """
    # (out_channels, kernel_size, extra Conv2d keyword arguments)
    specs = [
        (256, 1, {}),
        (512, 3, {'stride': 2, 'padding': 1}),
        (128, 1, {}),
        (256, 3, {'stride': 2, 'padding': 1}),
        (128, 1, {}),
        (256, 3, {}),
        (128, 1, {}),
        (256, 3, {}),
    ]

    convs = []
    channels_in = 1024
    for channels_out, kernel, extra in specs:
        convs.append(nn.Conv2d(channels_in, channels_out,
                               kernel_size=kernel, **extra))
        channels_in = channels_out

    return nn.ModuleList(convs)

# Smoke test: build the extra layers, list their parameter names and print
# the module structure.
extras_test = make_extras()
for n, p in extras_test.state_dict().items():
    print(n)
print(extras_test)

0.weight
0.bias
1.weight
1.bias
2.weight
2.bias
3.weight
3.bias
4.weight
4.bias
5.weight
5.bias
6.weight
6.bias
7.weight
7.bias
ModuleList(
  (0): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1))
  (1): Conv2d(256, 512, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
  (2): Conv2d(512, 128, kernel_size=(1, 1), stride=(1, 1))
  (3): Conv2d(128, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
  (4): Conv2d(256, 128, kernel_size=(1, 1), stride=(1, 1))
  (5): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1))
  (6): Conv2d(256, 128, kernel_size=(1, 1), stride=(1, 1))
  (7): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1))
)


In [None]:
def make_loc_conf(num_classes = 21, bbox_aspect_num = [4,6,6,6,4,4]):
    """Build the per-source localisation and classification heads.

    One 3x3 convolution (padding 1) is created per source feature map for each
    head. For source ``i`` the localisation head outputs
    ``bbox_aspect_num[i] * 4`` channels and the confidence head outputs
    ``bbox_aspect_num[i] * num_classes`` channels.

    Args:
        num_classes: number of class scores predicted per box.
        bbox_aspect_num: number of boxes per feature-map cell, one entry per
            source (the first six entries are used).

    Returns:
        tuple: ``(loc_layers, conf_layers)``, each an ``nn.ModuleList`` of six
        ``nn.Conv2d`` layers.
    """
    # Input channel count of each of the six source feature maps.
    source_channels = (512, 1024, 512, 256, 256, 256)

    loc_layers = nn.ModuleList()
    conf_layers = nn.ModuleList()
    for i, channels in enumerate(source_channels):
        boxes_per_cell = bbox_aspect_num[i]
        loc_layers.append(
            nn.Conv2d(channels, boxes_per_cell * 4, kernel_size=3, padding=1))
        conf_layers.append(
            nn.Conv2d(channels, boxes_per_cell * num_classes, kernel_size=3, padding=1))

    return loc_layers, conf_layers
    
# Smoke check: show both prediction heads built with the default settings.
loc_test, conf_test = make_loc_conf()
print(loc_test)
print(conf_test)

ModuleList(
  (0): Conv2d(512, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (1): Conv2d(1024, 24, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (2): Conv2d(512, 24, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (3): Conv2d(256, 24, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (4): Conv2d(256, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (5): Conv2d(256, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
ModuleList(
  (0): Conv2d(512, 84, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (1): Conv2d(1024, 126, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (2): Conv2d(512, 126, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (3): Conv2d(256, 126, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (4): Conv2d(256, 84, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (5): Conv2d(256, 84, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)


In [None]:
class L2Norm(nn.Module):
    """Channel-wise L2 normalisation with a learnable per-channel scale.

    Every spatial position of the input is divided by the L2 norm taken over
    the channel dimension (dim 1), then multiplied by a per-channel weight.

    Args:
        input_channels: number of channels of the input feature map.
        scale: value every channel weight is initialised to.
    """

    def __init__(self, input_channels = 512, scale = 20):
        super().__init__()
        self.scale = scale
        self.eps = 1e-10  # added to the norm to avoid division by zero
        self.weight = nn.Parameter(torch.empty(input_channels))
        self.reset_parameters()

    def reset_parameters(self):
        """Set every channel weight to ``self.scale``."""
        nn.init.constant_(self.weight, self.scale)

    def forward(self, x):
        """Normalise ``x`` over dim 1 and apply the per-channel weight.

        ``x`` must be 4-dimensional with ``x.size(1)`` matching the weight
        length, since the weight is expanded to the shape of ``x``.
        """
        channel_norm = torch.sqrt(torch.pow(x, 2).sum(dim=1, keepdim=True)) + self.eps
        normalised = x / channel_norm
        # Reshape (C,) -> (1, C, 1, 1) and expand to match the input.
        per_channel = self.weight[None, :, None, None].expand_as(normalised)
        return per_channel * normalised

In [None]:
from itertools import product
from math import sqrt
class DBox():
    """Generator of default boxes in (cx, cy, w, h) form.

    Args:
        cfg: dict providing ``input_size``, ``feature_maps``, ``steps``,
            ``min_sizes``, ``max_sizes`` and ``aspect_ratios``; the list-valued
            entries are indexed per feature map.
    """

    def __init__(self, cfg):
        super(DBox, self).__init__()
        self.image_size = cfg['input_size']
        self.feature_maps = cfg['feature_maps']
        self.steps = cfg['steps']
        self.num_priors = len(cfg['feature_maps'])
        self.min_sizes = cfg['min_sizes']
        self.max_sizes = cfg['max_sizes']
        self.aspect_ratios = cfg['aspect_ratios']

    def make_dbox_list(self):
        """Return all default boxes as a tensor of shape [num_boxes, 4].

        For every cell of every feature map this emits, in order: a square box
        of the level's min size, a square box of the geometric mean of the min
        and max sizes, and two boxes per configured aspect ratio. All values
        are relative to the image size and clamped to [0, 1].
        """
        flat = []
        for level, fmap in enumerate(self.feature_maps):
            # Quantities that depend only on the feature-map level.
            cells_per_side = self.image_size / self.steps[level]
            small = self.min_sizes[level] / self.image_size
            large = sqrt(small * (self.max_sizes[level] / self.image_size))

            for row, col in product(range(fmap), repeat = 2):
                cx = (col + 0.5) / cells_per_side
                cy = (row + 0.5) / cells_per_side

                flat.extend([cx, cy, small, small])
                flat.extend([cx, cy, large, large])

                for ar in self.aspect_ratios[level]:
                    root = sqrt(ar)
                    flat.extend([cx, cy, small * root, small / root])
                    flat.extend([cx, cy, small / root, small * root])

        boxes = torch.Tensor(flat).view(-1, 4)
        boxes.clamp_(max = 1, min = 0)
        return boxes



# dbox = DBox(ssd_cfg)
# dbox_list = dbox.make_dbox_list()

# pd.DataFrame(dbox_list.numpy())


In [None]:
def decimate(tensor, m):
    """
    Decimate a tensor by a factor 'm', i.e. downsample by keeping every 'm'th value.
    This is used when we convert FC layers to equivalent Convolutional layers, BUT of a smaller size.
    :param tensor: tensor to be decimated
    :param m: list of decimation factors for each dimension of the tensor; None if not to be decimated along a dimension
    :return: decimated tensor (a new tensor whenever at least one dimension is decimated)
    :raises AssertionError: if len(m) does not match the number of tensor dimensions
    """
    assert tensor.dim() == len(m)
    for d in range(tensor.dim()):
        if m[d] is not None:
            # index_select requires the index to live on the same device as the
            # input, so build it there instead of always on the CPU.
            index = torch.arange(start=0, end=tensor.size(d), step=m[d],
                                 dtype=torch.long, device=tensor.device)
            tensor = tensor.index_select(dim=d, index=index)

    return tensor

In [None]:
class SSD(nn.Module):
    """Single-shot detector assembled from the notebook's building blocks.

    The network consists of the base network from ``make_vgg`` (initialised
    from an ImageNet-pretrained VGG-16), the extra layers from ``make_extras``,
    an ``L2Norm`` on the first source feature map, and one localisation and one
    confidence head per source feature map from ``make_loc_conf``.

    Args:
        phase: ``'inference'`` attaches a ``Detect`` post-processor and makes
            ``forward`` return its output; any other value makes ``forward``
            return the raw ``(loc, conf, dbox_list)`` tuple.
        cfg: dict with ``num_classes`` and ``bbox_aspect_num`` plus the keys
            read by ``DBox``.
    """

    def __init__(self, phase, cfg):
        super(SSD, self).__init__()
        self.phase = phase
        self.num_classes = cfg['num_classes']

        self.vgg = make_vgg()
        self.load_pretrained_layers()

        self.extras = make_extras()
        self.L2Norm = L2Norm()
        self.loc, self.conf = make_loc_conf(cfg['num_classes'], cfg['bbox_aspect_num'])
        dbox = DBox(cfg)
        # Registered as a buffer so the default boxes follow `net.to(device)`.
        # persistent=False keeps them out of state_dict(), so checkpoints keep
        # exactly the same keys as before.
        self.register_buffer('dbox_list', dbox.make_dbox_list(), persistent=False)

        if phase == 'inference':
            self.detect = Detect()

    def forward(self, x):
        """Run the detector on an image batch.

        Returns:
            In ``'inference'`` phase, the result of ``self.detect``. Otherwise
            the tuple ``(loc, conf, dbox_list)`` where ``loc`` has shape
            [batch, num_boxes, 4] and ``conf`` has shape
            [batch, num_boxes, num_classes].
        """
        sources = list()
        loc = list()
        conf = list()

        # First source: output after the first 23 base layers, L2-normalised.
        # NOTE(review): index 22 is presumably the ReLU after conv4_3 - confirm
        # against make_vgg.
        for k in range(23):
            x = self.vgg[k](x)
        source1 = self.L2Norm(x)
        sources.append(source1)

        # Second source: output of the remaining base layers.
        for k in range(23, len(self.vgg)):
            x = self.vgg[k](x)
        sources.append(x)

        # Extra layers: ReLU after every conv, keep the output of every second one.
        for k, v in enumerate(self.extras):
            x = nn.functional.relu(v(x), inplace=True)
            if k %2 == 1:
                sources.append(x)

        # Apply the heads; move channels last so each box's values are contiguous.
        for (x,l,c) in zip(sources, self.loc, self.conf):
            loc.append(l(x).permute(0,2,3,1).contiguous())
            conf.append(c(x).permute(0,2,3,1).contiguous())

        # Flatten each source's predictions and concatenate across sources.
        loc = torch.cat([o.view(o.size(0), -1) for o in loc] ,1)
        conf = torch.cat([o.view(o.size(0), -1) for o in conf] ,1)

        loc = loc.view(loc.size(0), -1,4)
        conf = conf.view(conf.size(0), -1, self.num_classes)

        output = (loc, conf, self.dbox_list)

        if self.phase == 'inference':
            return self.detect(output[0], output[1], output[2])
        else :
            return output

    def load_pretrained_layers(self):
        """
        As in the paper, we use a VGG-16 pretrained on the ImageNet task as the base network.
        There's one available in PyTorch, see https://pytorch.org/docs/stable/torchvision/models.html#torchvision.models.vgg16
        We copy these parameters into our base network. It's straightforward for conv1 to conv5.
        However, the original VGG-16 does not contain the conv6 and conv7 layers.
        Therefore, we convert fc6 and fc7 into convolutional layers, and subsample by decimation
        (see the 'decimate' function defined in this notebook).
        """
        # Work on the base network's own state dict so this method does not
        # depend on which other submodules have been registered when it runs.
        state_dict = self.vgg.state_dict()
        param_names = list(state_dict.keys())

        # Pretrained VGG base
        pretrained_state_dict = torchvision.models.vgg16(pretrained=True).state_dict()
        pretrained_param_names = list(pretrained_state_dict.keys())

        # Transfer conv. parameters from pretrained model to current model (matched by position)
        for i, param in enumerate(param_names[:-4]):  # excluding conv6 and conv7 parameters
            state_dict[param] = pretrained_state_dict[pretrained_param_names[i]]

        # Convert fc6, fc7 to convolutional layers, and subsample (by decimation) to sizes of conv6 and conv7
        # fc6
        conv_fc6_weight = pretrained_state_dict['classifier.0.weight'].view(4096, 512, 7, 7)  # (4096, 512, 7, 7)
        conv_fc6_bias = pretrained_state_dict['classifier.0.bias']  # (4096)
        state_dict['31.weight'] = decimate(conv_fc6_weight, m=[4, None, 3, 3])  # (1024, 512, 3, 3)
        state_dict['31.bias'] = decimate(conv_fc6_bias, m=[4])  # (1024)
        # fc7
        conv_fc7_weight = pretrained_state_dict['classifier.3.weight'].view(4096, 4096, 1, 1)  # (4096, 4096, 1, 1)
        conv_fc7_bias = pretrained_state_dict['classifier.3.bias']  # (4096)
        state_dict['33.weight'] = decimate(conv_fc7_weight, m=[4, 4, None, None])  # (1024, 1024, 1, 1)
        state_dict['33.bias'] = decimate(conv_fc7_bias, m=[4])  # (1024)

        # Note: an FC layer of size (K) operating on a flattened version (C*H*W) of a 2D image of size (C, H, W)...
        # ...is equivalent to a convolutional layer with kernel size (H, W), input channels C, output channels K...
        # ...operating on the 2D image of size (C, H, W) without padding

        self.vgg.load_state_dict(state_dict)

        print("\nLoaded base model.\n")
# ssd_test = SSD(phase = 'train', cfg = ssd_cfg)
# print(ssd_test)

In [None]:
def decode(loc, dbox_list):
    """Turn predicted offsets into corner-form boxes.

    Args:
        loc: offsets, one row of four values per default box.
        dbox_list: default boxes as (cx, cy, w, h), same number of rows as ``loc``.

    Returns:
        Tensor of (xmin, ymin, xmax, ymax) rows, one per default box.
    """
    # Centre offsets are scaled by 0.1 and the box size; size offsets are
    # log-scale factors scaled by 0.2.
    centers = dbox_list[:, :2] + loc[:, :2] * 0.1 * dbox_list[:, 2:]
    sizes = dbox_list[:, 2:] * torch.exp(loc[:, 2:] * 0.2)

    # Convert (cx, cy, w, h) to (xmin, ymin, xmax, ymax).
    top_left = centers - sizes / 2
    bottom_right = sizes + top_left
    return torch.cat((top_left, bottom_right), dim=1)

In [None]:

def nm_suppression(boxes, scores, overlap = .45, top_k = 200):
    """Greedy non-maximum suppression over one set of boxes.

    Boxes are visited from highest to lowest score. Each visited box is kept
    and every remaining box whose IoU with it exceeds ``overlap`` is discarded.

    Args:
        boxes: tensor of shape [N, 4] holding (xmin, ymin, xmax, ymax) rows.
        scores: tensor of shape [N] with one score per box.
        overlap: IoU threshold; boxes with IoU <= ``overlap`` survive.
        top_k: only the ``top_k`` highest-scoring boxes are considered.

    Returns:
        tuple: ``(keep, count)`` where ``keep`` is a long tensor of length N
        whose first ``count`` entries are the indices of the kept boxes in
        descending score order (remaining entries are zero), and ``count`` is
        an int.
    """
    count = 0
    keep = torch.zeros(scores.size(0), dtype=torch.long, device=scores.device)
    if boxes.numel() == 0:
        return keep, count

    x1 = boxes[:, 0]
    y1 = boxes[:, 1]
    x2 = boxes[:, 2]
    y2 = boxes[:, 3]
    area = torch.mul(x2 - x1, y2 - y1)

    # Ascending sort: the best remaining candidate is always the last index.
    _, idx = scores.sort(0)
    idx = idx[-top_k:]

    while idx.numel() > 0:
        i = idx[-1]
        keep[count] = i
        count += 1

        if idx.size(0) == 1:
            break

        idx = idx[:-1]

        # Intersection of every remaining box with the box just kept.
        inter_x1 = torch.max(x1[idx], x1[i])
        inter_y1 = torch.max(y1[idx], y1[i])
        inter_x2 = torch.min(x2[idx], x2[i])
        inter_y2 = torch.min(y2[idx], y2[i])

        # Non-overlapping boxes yield negative extents; clamp them to zero.
        inter_w = torch.clamp(inter_x2 - inter_x1, min=0.0)
        inter_h = torch.clamp(inter_y2 - inter_y1, min=0.0)
        inter = inter_w * inter_h

        union = (area[idx] - inter) + area[i]
        iou = inter / union

        # Keep only candidates that do not overlap the kept box too much.
        idx = idx[iou.le(overlap)]

    return keep, count

In [None]:
class Detect(nn.Module):
    """Post-processing step that turns raw SSD outputs into detections.

    Implemented as an ``nn.Module`` rather than an ``autograd.Function``:
    instances of a ``Function`` subclass with a non-static ``forward`` cannot
    be called in current PyTorch, whereas a module instance can.

    Args:
        conf_thresh: minimum softmax score for a box to be considered.
        top_k: maximum number of detections kept per class and image.
        nms_thresh: IoU threshold passed to ``nm_suppression``.
    """

    def __init__(self, conf_thresh = 0.01, top_k = 200, nms_thresh = 0.45):
        super(Detect, self).__init__()
        self.softmax = nn.Softmax(dim = -1)
        self.conf_thresh = conf_thresh
        self.top_k = top_k
        self.nms_thresh = nms_thresh

    @torch.no_grad()
    def forward(self, loc_data, conf_data, dbox_list):
        """Decode boxes, threshold scores and apply per-class NMS.

        Args:
            loc_data: predicted offsets, shape [batch, num_boxes, 4].
            conf_data: raw class scores, shape [batch, num_boxes, num_classes].
            dbox_list: default boxes, shape [num_boxes, 4].

        Returns:
            Tensor of shape [batch, num_classes, top_k, 5]. Each filled row is
            (score, xmin, ymin, xmax, ymax); unused rows are zero. Class index
            0 is skipped and therefore always left as zeros.
        """
        num_batch = loc_data.size(0)
        # The class count comes from the confidence tensor; the last dimension
        # of loc_data is the 4 box coordinates.
        num_classes = conf_data.size(2)

        conf_data = self.softmax(conf_data)
        output = torch.zeros(num_batch, num_classes, self.top_k, 5, device=loc_data.device)
        # [batch, num_boxes, num_classes] -> [batch, num_classes, num_boxes]
        conf_preds = conf_data.transpose(2,1)
        dbox_list = dbox_list.to(loc_data.device)

        for i in range(num_batch):
            decoded_boxes = decode(loc_data[i], dbox_list)
            conf_scores = conf_preds[i].clone()

            for cl in range(1, num_classes):
                c_mask = conf_scores[cl].gt(self.conf_thresh)
                scores = conf_scores[cl][c_mask]

                if scores.nelement() == 0:
                    continue
                l_mask = c_mask.unsqueeze(1).expand_as(decoded_boxes)
                boxes = decoded_boxes[l_mask].view(-1,4)

                ids, count = nm_suppression(boxes, scores, self.nms_thresh, self.top_k)
                kept = ids[:count]
                output[i, cl, :count] = torch.cat((scores[kept].unsqueeze(1), boxes[kept]), 1)
        return output

In [None]:
class MultiBoxLoss(nn.Module):
    """SSD training loss: localisation loss plus confidence loss.

    Args:
        jaccard_thresh: overlap threshold forwarded to ``match``.
        neg_pos: number of negatives mined per positive box.
        device: device on which targets are built and compared.
    """

    def __init__(self, jaccard_thresh = .5, neg_pos = 3,  device = 'cpu'):
        super(MultiBoxLoss, self).__init__()
        self.jaccard_thresh = jaccard_thresh
        self.negpos_ratio = neg_pos
        self.device = device

    def forward(self, predictions, targets):
        """
        predictions : (loc = torch.Size([num_batch, 8732,4]),
                       conf = torch.Size([num_batch, 8732, 21]),
                       dbox = torch.Size([8732, 4])

        targets : [xmin, ymin, xmax, ymax, label_ind]

        Returns (loss_l, loss_c): the summed smooth-L1 loss over positive boxes
        and the summed cross-entropy loss over positive and mined negative
        boxes, each divided by the number of positive boxes (at least 1).
        """
        loc_data , conf_data, dbox_list = predictions
        num_batch = loc_data.size(0)
        num_dbox = loc_data.size(1)
        num_classes = conf_data.size(2)

        # Target buffers filled in place by `match`. Zero-initialised so that
        # no uninitialised memory can ever be read as a label or an offset.
        conf_t_label = torch.zeros(num_batch, num_dbox, dtype=torch.long, device=self.device)
        loc_t = torch.zeros(num_batch, num_dbox, 4, device=self.device)

        dbox = dbox_list.to(self.device)
        variance = [0.1,0.2]
        for idx in range(num_batch):
            truths = targets[idx][:,:-1].to(self.device)
            labels = targets[idx][:,-1].to(self.device)

            match(self.jaccard_thresh, truths, dbox, variance, labels, loc_t, conf_t_label, idx)

        # ---- localisation loss: positive (non-background) boxes only ----
        pos_mask = conf_t_label >0
        pos_idx = pos_mask.unsqueeze(pos_mask.dim()).expand_as(loc_data)

        loc_p = loc_data[pos_idx].view(-1,4)
        loc_t = loc_t[pos_idx].view(-1,4)

        loss_l = nn.functional.smooth_l1_loss(loc_p, loc_t, reduction = 'sum')

        # ---- hard negative mining ----
        batch_conf = conf_data.view(-1, num_classes)

        # Per-box classification loss, used only to rank the negatives.
        loss_c = nn.functional.cross_entropy(batch_conf, conf_t_label.view(-1), reduction = 'none')
        num_pos = pos_mask.long().sum(1, keepdim= True)

        loss_c = loss_c.view(num_batch, -1)
        loss_c[pos_mask] = 0  # positives must never be selected as negatives

        # idx_rank[b, j] is the rank of box j when image b's boxes are sorted
        # by descending loss.
        _, loss_idx = loss_c.sort(1, descending = True)
        _, idx_rank = loss_idx.sort(1)

        num_neg = torch.clamp(num_pos * self.negpos_ratio, max = num_dbox)
        neg_mask = idx_rank < (num_neg).expand_as(idx_rank)

        # ---- confidence loss over positives and mined negatives ----
        selected = pos_mask | neg_mask
        selected_idx = selected.unsqueeze(2).expand_as(conf_data)

        conf_hnm = conf_data[selected_idx].view(-1, num_classes)
        conf_t_label_hnm = conf_t_label[selected]

        loss_c = nn.functional.cross_entropy(conf_hnm, conf_t_label_hnm, reduction = 'sum')

        # Normalise by the number of positives. Clamp so that a batch without
        # any positive box gives a loss of 0 instead of NaN (0 / 0).
        N = num_pos.sum().clamp(min=1)
        loss_l = loss_l / N
        loss_c = loss_c / N

        return loss_l, loss_c

In [None]:
# Dataset location. NOTE(review): absolute Colab-style path - adjust when
# running elsewhere.
rootpath = '/content/VOCdevkit/VOC2012/'
(train_img_list, train_anno_list,
 val_img_list, val_anno_list) = make_datapath_list(rootpath)

# Class names handed to the annotation parser.
voc_classes = [
    'aeroplane', 'bicycle', 'bird', 'boat',
    'bottle', 'bus', 'car', 'cat',
    'chair', 'cow', 'diningtable', 'dog',
    'horse', 'motorbike', 'person', 'pottedplant',
    'sheep', 'sofa', 'train', 'tvmonitor',
]

color_mean = (104, 117, 123)  # presumably the per-channel mean used by DataTransform - confirm
input_size = 300
batch_size = 4

train_dataset = VOCDataset(
    train_img_list,
    train_anno_list,
    phase='train',
    transform=DataTransform(input_size, color_mean),
    transform_anno=Anno_xml2list(voc_classes),
)
val_dataset = VOCDataset(
    val_img_list,
    val_anno_list,
    phase='val',
    transform=DataTransform(input_size, color_mean),
    transform_anno=Anno_xml2list(voc_classes),
)

# Only the training loader shuffles; both use the custom collate function.
train_dataloader = data.DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True, collate_fn=od_collate_fn)
val_dataloader = data.DataLoader(
    val_dataset, batch_size=batch_size, shuffle=False, collate_fn=od_collate_fn)

dataloaders_dict = {'train': train_dataloader, 'val': val_dataloader}

In [None]:
# Configuration consumed by SSD, make_loc_conf and DBox.
ssd_cfg = dict(
    num_classes=21,
    input_size=300,
    bbox_aspect_num=[4, 6, 6, 6, 4, 4],
    feature_maps=[38, 19, 10, 5, 3, 1],
    steps=[8, 16, 32, 64, 100, 300],
    min_sizes=[30, 60, 111, 162, 213, 264],
    max_sizes=[60, 111, 162, 213, 264, 315],
    # NOTE(review): the last entry is [3] while the fifth is [2]; reference
    # SSD300 configurations commonly use [2] for both - confirm this is intended.
    aspect_ratios=[[2], [2, 3], [2, 3], [2, 3], [2], [3]],
)

net = SSD(phase='train', cfg=ssd_cfg)

# vgg_weights = torch.load(torchvision.models.vgg16(pretrained=True).state_dict())
# net.vgg.load_state_dict(vgg_weights)

def weights_init(m):
    """Initialise a ``nn.Conv2d`` in place; leave any other module untouched.

    Weights get Kaiming-normal initialisation and the bias, if present, is set
    to zero. Intended to be used with ``Module.apply``.
    """
    if not isinstance(m, nn.Conv2d):
        return
    nn.init.kaiming_normal_(m.weight.data)
    if m.bias is None:
        return
    nn.init.constant_(m.bias, 0.0)
# Re-initialise the parts of the network that were not loaded from the
# pretrained base: extra layers first, then the two prediction heads.
for untrained_part in (net.extras, net.loc, net.conf):
    untrained_part.apply(weights_init)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

  f"The parameter '{pretrained_param}' is deprecated since 0.13 and will be removed in 0.15, "
Downloading: "https://download.pytorch.org/models/vgg16-397923af.pth" to /root/.cache/torch/hub/checkpoints/vgg16-397923af.pth


  0%|          | 0.00/528M [00:00<?, ?B/s]


Loaded base model.



In [None]:
# Loss and optimiser used by the training loop.
criterion = MultiBoxLoss(jaccard_thresh=0.5, neg_pos=3, device=device)
optimizer = torch.optim.AdamW(
    params=net.parameters(),
    lr=1e-3,
    weight_decay=5e-4,
)

In [None]:
import time
import tqdm
def train_model(net, dataloaders_dict, criterion, optimizer, num_epochs):
    """Train ``net`` and periodically validate and checkpoint it.

    Every epoch runs the ``'train'`` loader. Every 10th epoch additionally
    runs the ``'val'`` loader with gradients disabled and saves the model's
    ``state_dict`` to ``weight/ssd300_<epoch>.pth``. After each epoch the
    accumulated losses are appended to a log that is rewritten to
    ``log_output.csv``.

    Args:
        net: model whose output is passed to ``criterion``.
        dataloaders_dict: dict with ``'train'`` and ``'val'`` loaders yielding
            ``(images, targets)`` where ``targets`` is a sequence of tensors.
        criterion: callable returning ``(loss_l, loss_c)``.
        optimizer: optimizer over the parameters of ``net``.
        num_epochs: number of epochs to run.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f'{device}')

    net.to(device)
    torch.backends.cudnn.benchmark = True

    iteration = 1
    epoch_train_loss = 0.0
    epoch_val_loss = 0.0
    logs = []

    # torch.save does not create missing directories; make sure the
    # checkpoint folder exists before the first save.
    os.makedirs('weight', exist_ok=True)

    for epoch in range(num_epochs):
        t_epoch_start = time.time()
        t_iter_start = time.time()

        print('--------------------------------------')
        print(f'Epoch {epoch+1}/{num_epochs}')
        print('--------------------------------------')

        for phase in ['train', 'val']:
            if phase == 'train':
                net.train()
                print('train mode')
            else:
                # Validation is only run every 10th epoch.
                if((epoch +1) % 10 == 0):
                    net.eval()
                    print('val mode')
                else :
                    continue

            for images, targets in tqdm.tqdm(dataloaders_dict[phase]):
                images = images.to(device)
                targets = [ann.to(device) for ann in targets]
                optimizer.zero_grad()

                # Gradients are tracked only while training.
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = net(images)
                    loss_l, loss_c = criterion(outputs, targets)
                    loss = loss_l + loss_c

                    if phase == 'train':
                        loss.backward()
                        # Clip each gradient element to [-2, 2] before the update.
                        nn.utils.clip_grad_value_(net.parameters(), clip_value = 2.0)
                        optimizer.step()

                        if (iteration % 10 == 0 ):
                            t_iter_finish = time.time()
                            duration = t_iter_finish - t_iter_start
                            print(f'반복 {iteration} || Loss : {loss.item():4f} || 10iter : {duration :.4f}')

                            t_iter_start = time.time()

                        epoch_train_loss += loss.item()
                        iteration +=1

                    else:
                        epoch_val_loss += loss.item()

        t_epoch_finish = time.time()
        print('--------------------------------------')
        print(f'Epoch{epoch +1} || Epoch train loss : {epoch_train_loss} Epoch val loss {epoch_val_loss}')
        print('--------------------------------------')
        print(f'timer : {t_epoch_finish - t_epoch_start : .4f}')

        # The full log is rewritten every epoch so a partial run still leaves
        # a usable CSV behind.
        log_epoch = {'epoch' : epoch+1,
                     'train_loss' : epoch_train_loss,
                     'val_loss' : epoch_val_loss}
        logs.append(log_epoch)
        df = pd.DataFrame(logs)
        df.to_csv('log_output.csv')

        epoch_train_loss = 0.0
        epoch_val_loss = 0.0

        if ((epoch+1) %10 == 0):
            torch.save(net.state_dict(), 'weight/ssd300_' + str(epoch+1)+'.pth')

In [None]:
# torch.cuda.empty_cache()
# CUDA debugging/runtime settings.
# NOTE(review): these variables are typically read when CUDA is initialised;
# an earlier cell already calls torch.cuda.is_available(), so confirm they
# still take effect when set this late.
os.environ.update({
    'CUDA_LAUNCH_BLOCKING': "1",
    "CUDA_VISIBLE_DEVICES": "0",
})

In [None]:
# Launch the training run.
num_epochs = 50
train_model(
    net,
    dataloaders_dict,
    criterion,
    optimizer,
    num_epochs,
)

cuda
--------------------------------------
Epoch 1/50
--------------------------------------
--------------------------------------
Epoch1 || Epoch train loss : 0.0 Epoch val loss 0.0
--------------------------------------
timer :  0.0001
--------------------------------------
Epoch 2/50
--------------------------------------
--------------------------------------
Epoch2 || Epoch train loss : 0.0 Epoch val loss 0.0
--------------------------------------
timer :  0.0007
--------------------------------------
Epoch 3/50
--------------------------------------
--------------------------------------
Epoch3 || Epoch train loss : 0.0 Epoch val loss 0.0
--------------------------------------
timer :  0.0001
--------------------------------------
Epoch 4/50
--------------------------------------
--------------------------------------
Epoch4 || Epoch train loss : 0.0 Epoch val loss 0.0
--------------------------------------
timer :  0.0001
--------------------------------------
Epoch 5/50
-----

100%|██████████| 1456/1456 [02:17<00:00, 10.56it/s]

--------------------------------------
Epoch10 || Epoch train loss : 0.0 Epoch val loss 10764.336997032166
--------------------------------------
timer :  137.8881





FileNotFoundError: ignored