# Data Preparation

In [None]:
!unzip -o WIDER_train.zip -d ./FaceBoxes.PyTorch/data/WIDER_FACE/images/

1. Download the WIDER FACE dataset (WIDER_train.zip) and place the images under this directory:

$FaceBoxes.PyTorch/data/WIDER_FACE/images

2. Convert the WIDER FACE annotations to VOC format, or download the already converted annotations (annotations.rar), and place them under this directory:

$FaceBoxes.PyTorch/data/WIDER_FACE/annotations

3. Also place img_list.txt under this directory:

$FaceBoxes.PyTorch/data/WIDER_FACE/

In [4]:
from __future__ import print_function
import os
import torch
import torch.optim as optim
import torch.backends.cudnn as cudnn

# Utils

In [5]:
import cv2
import numpy as np
import random

def matrix_iou(a, b):
    """
    Pairwise intersection-over-union of two box sets (NumPy version used by
    the data augmentation code).

    Rows of ``a`` and ``b`` are read as ``[x1, y1, x2, y2]``. The result has
    one row per box in ``a`` and one column per box in ``b``.
    """
    top_left = np.maximum(a[:, np.newaxis, :2], b[:, :2])
    bottom_right = np.minimum(a[:, np.newaxis, 2:], b[:, 2:])

    # A pair only intersects when the clipped corner ordering is valid on
    # both axes; otherwise its intersection area is forced to zero.
    overlaps = (top_left < bottom_right).all(axis=2)
    inter = np.prod(bottom_right - top_left, axis=2) * overlaps

    area_a = np.prod(a[:, 2:] - a[:, :2], axis=1)
    area_b = np.prod(b[:, 2:] - b[:, :2], axis=1)
    union = area_a[:, np.newaxis] + area_b - inter
    return inter / union

def matrix_iof(a, b):
    """
    Pairwise intersection-over-foreground of two box sets (NumPy version used
    by the data augmentation code).

    Each entry is the intersection area of a box in ``a`` with a box in ``b``
    divided by the area of the box from ``a``. Rows are read as
    ``[x1, y1, x2, y2]``; the result has one row per box in ``a`` and one
    column per box in ``b``.
    """
    top_left = np.maximum(a[:, np.newaxis, :2], b[:, :2])
    bottom_right = np.minimum(a[:, np.newaxis, 2:], b[:, 2:])

    # Non-overlapping pairs get an intersection area of zero.
    overlaps = (top_left < bottom_right).all(axis=2)
    inter = np.prod(bottom_right - top_left, axis=2) * overlaps

    area_a = np.prod(a[:, 2:] - a[:, :2], axis=1)
    # The denominator is clamped to at least 1.
    denom = np.maximum(area_a[:, np.newaxis], 1)
    return inter / denom


def _crop(image, boxes, labels, img_dim):
    """
    Randomly crop a square patch that still contains usable faces.

    Up to 250 candidate squares are tried. A candidate is accepted when
    at least one box has an IoF of 1 or more with the crop, at least one box
    centre lies strictly inside it, and at least one of the kept boxes is
    larger than 16 pixels once the crop is rescaled to ``img_dim``.

    Returns ``(image, boxes, labels, pad_image_flag)``. On success the flag is
    False and the boxes are clipped to, and expressed relative to, the crop.
    If no candidate is accepted the inputs are returned unchanged with the
    flag set to True.
    """
    height, width, _ = image.shape
    short_side = min(width, height)

    for _ in range(250):
        # 20% of the time use the full short side, otherwise 30%-100% of it.
        scale = 1 if random.uniform(0, 1) <= 0.2 else random.uniform(0.3, 1.)
        side = int(scale * short_side)

        left = random.randrange(width - side) if width != side else 0
        top = random.randrange(height - side) if height != side else 0
        roi = np.array((left, top, left + side, top + side))

        # Need at least one box completely covered by the crop.
        if not (matrix_iof(boxes, roi[np.newaxis]) >= 1).any():
            continue

        # Keep only boxes whose centre falls strictly inside the crop.
        centers = (boxes[:, :2] + boxes[:, 2:]) / 2
        inside = np.logical_and(roi[:2] < centers, centers < roi[2:]).all(axis=1)
        kept_boxes = boxes[inside].copy()
        kept_labels = labels[inside].copy()
        if kept_boxes.shape[0] == 0:
            continue

        # Clip to the crop window, then shift into crop coordinates.
        kept_boxes[:, :2] = np.maximum(kept_boxes[:, :2], roi[:2])
        kept_boxes[:, :2] -= roi[:2]
        kept_boxes[:, 2:] = np.minimum(kept_boxes[:, 2:], roi[2:])
        kept_boxes[:, 2:] -= roi[:2]

        # Make sure that the cropped image contains at least one face
        # > 16 pixels at training image scale.
        scaled_w = (kept_boxes[:, 2] - kept_boxes[:, 0] + 1) / side * img_dim
        scaled_h = (kept_boxes[:, 3] - kept_boxes[:, 1] + 1) / side * img_dim
        large_enough = np.minimum(scaled_w, scaled_h) > 16.0
        kept_boxes = kept_boxes[large_enough]
        kept_labels = kept_labels[large_enough]
        if kept_boxes.shape[0] == 0:
            continue

        patch = image[roi[1]:roi[3], roi[0]:roi[2]]
        return patch, kept_boxes, kept_labels, False

    # No acceptable crop found: hand back the inputs and request padding.
    return image, boxes, labels, True


def _distort(image):
    """
    Return a randomly colour-distorted copy of a BGR image.

    Brightness, contrast, saturation and hue are each applied with a 50%
    chance. A coin flip decides whether contrast is applied before or after
    the saturation/hue step, which runs in HSV space. The input array is not
    modified.
    """

    def _scale_shift(img, alpha=1, beta=0):
        # Compute in float, clamp to [0, 255] and write back in place so that
        # views (such as a single HSV channel) update their parent array.
        adjusted = img.astype(float) * alpha + beta
        adjusted[adjusted < 0] = 0
        adjusted[adjusted > 255] = 255
        img[:] = adjusted

    def _maybe_brightness(img):
        if random.randrange(2):
            _scale_shift(img, beta=random.uniform(-32, 32))

    def _maybe_contrast(img):
        if random.randrange(2):
            _scale_shift(img, alpha=random.uniform(0.5, 1.5))

    def _maybe_saturation_hue(bgr):
        hsv = cv2.cvtColor(bgr, cv2.COLOR_BGR2HSV)

        # saturation distortion
        if random.randrange(2):
            _scale_shift(hsv[:, :, 1], alpha=random.uniform(0.5, 1.5))

        # hue distortion; the shifted hue is wrapped modulo 180
        if random.randrange(2):
            hue = hsv[:, :, 0].astype(int) + random.randint(-18, 18)
            hue %= 180
            hsv[:, :, 0] = hue

        return cv2.cvtColor(hsv, cv2.COLOR_HSV2BGR)

    image = image.copy()

    contrast_first = random.randrange(2)
    _maybe_brightness(image)
    if contrast_first:
        _maybe_contrast(image)
        image = _maybe_saturation_hue(image)
    else:
        image = _maybe_saturation_hue(image)
        _maybe_contrast(image)

    return image


def _expand(image, boxes, fill, p):
    """
    With 50% probability, paste the image at a random position on a larger
    canvas filled with ``fill``.

    The canvas is the image size multiplied by a factor drawn uniformly from
    ``[1, p]``. Returns ``(image, boxes)``; when the expansion is applied the
    boxes are a shifted copy, otherwise both inputs are returned as-is.
    """
    if random.randrange(2):
        return image, boxes

    height, width, depth = image.shape

    ratio = random.uniform(1, p)
    canvas_w = int(ratio * width)
    canvas_h = int(ratio * height)

    x_off = random.randint(0, canvas_w - width)
    y_off = random.randint(0, canvas_h - height)

    canvas = np.empty((canvas_h, canvas_w, depth), dtype=image.dtype)
    canvas[:, :] = fill
    canvas[y_off:y_off + height, x_off:x_off + width] = image

    # Move both box corners by the paste offset.
    shifted = boxes.copy()
    shifted[:, :2] += (x_off, y_off)
    shifted[:, 2:] += (x_off, y_off)

    return canvas, shifted


def _mirror(image, boxes):
    """
    Flip the image and its boxes horizontally with 50% probability.

    Returns ``(image, boxes)``. When flipped, the image is a reversed view
    along the width axis and the boxes are a new array with mirrored x
    coordinates; otherwise both inputs are returned untouched.
    """
    _, width, _ = image.shape
    if not random.randrange(2):
        return image, boxes

    flipped = boxes.copy()
    # New x1 = width - old x2 and new x2 = width - old x1.
    flipped[:, 0::2] = width - boxes[:, 2::-2]
    return image[:, ::-1], flipped


def _pad_to_square(image, rgb_mean, pad_image_flag):
    """
    Pad the image to a square with ``rgb_mean`` when ``pad_image_flag`` is set.

    The original pixels stay in the top-left corner, so box coordinates
    remain valid. When the flag is false the input is returned unchanged.
    """
    if pad_image_flag:
        height, width, _ = image.shape
        side = max(width, height)
        canvas = np.empty((side, side, 3), dtype=image.dtype)
        canvas[:, :] = rgb_mean
        canvas[:height, :width] = image
        return canvas
    return image


def _resize_subtract_mean(image, insize, rgb_mean):
    """
    Resize to ``insize`` x ``insize`` with a randomly chosen interpolation
    method, convert to float32, subtract ``rgb_mean`` and return the result
    with the channel axis moved to the front.
    """
    candidates = (
        cv2.INTER_LINEAR,
        cv2.INTER_CUBIC,
        cv2.INTER_AREA,
        cv2.INTER_NEAREST,
        cv2.INTER_LANCZOS4,
    )
    method = candidates[random.randrange(5)]
    resized = cv2.resize(image, (insize, insize), interpolation=method)
    resized = resized.astype(np.float32)
    # In-place subtraction keeps the array float32.
    resized -= rgb_mean
    return resized.transpose(2, 0, 1)


class preproc(object):
    """
    Training-time preprocessing: random crop, colour distortion, optional
    padding to square, random mirror, then resize and mean subtraction.

    Called with an image and a targets array whose last column is the label
    and whose other columns are box coordinates. Returns the processed image
    and a targets array with box coordinates divided by the pre-resize width
    and height.
    """

    def __init__(self, img_dim, rgb_means):
        self.img_dim = img_dim
        self.rgb_means = rgb_means

    def __call__(self, image, targets):
        assert targets.shape[0] > 0, "this image does not have gt"

        gt_boxes = targets[:, :-1].copy()
        gt_labels = targets[:, -1].copy()

        patch, patch_boxes, patch_labels, needs_pad = _crop(
            image, gt_boxes, gt_labels, self.img_dim)
        patch = _distort(patch)
        patch = _pad_to_square(patch, self.rgb_means, needs_pad)
        patch, patch_boxes = _mirror(patch, patch_boxes)

        # Record the size before resizing; it is used to normalise the boxes.
        height, width, _ = patch.shape
        patch = _resize_subtract_mean(patch, self.img_dim, self.rgb_means)
        patch_boxes[:, 0::2] /= width
        patch_boxes[:, 1::2] /= height

        label_column = np.expand_dims(patch_labels, 1)
        return patch, np.hstack((patch_boxes, label_column))

In [6]:
import os.path
import sys
import torch.utils.data as data
import cv2
import numpy as np
if sys.version_info[0] == 2:
    import xml.etree.cElementTree as ET
else:
    import xml.etree.ElementTree as ET


# Class names; a name's position in this tuple is the label index used by
# AnnotationTransform's default class_to_ind mapping.
WIDER_CLASSES = ( '__background__', 'face')


class AnnotationTransform(object):

    """Transforms a VOC-style annotation into an array of bbox coords and label index

    Arguments:
        class_to_ind (dict, optional): dictionary lookup of classnames -> indexes
            (default: the position of each name in WIDER_CLASSES)
        keep_difficult (bool, optional): keep difficult instances or not
            (default: True)
    """

    def __init__(self, class_to_ind=None, keep_difficult=True):
        self.class_to_ind = class_to_ind or dict(
            zip(WIDER_CLASSES, range(len(WIDER_CLASSES))))
        self.keep_difficult = keep_difficult

    def __call__(self, target):
        """
        Arguments:
            target (annotation) : the target annotation to be made usable
                will be an ET.Element
        Returns:
            a float ndarray of shape (num_objects, 5) whose rows are
            [xmin, ymin, xmax, ymax, label_ind]; shape (0, 5) when the
            annotation has no (kept) objects
        """
        rows = []
        for obj in target.iter('object'):
            # A missing or empty <difficult> tag is treated as "not difficult"
            # instead of crashing on a None lookup.
            difficult = int(obj.findtext('difficult') or '0') == 1
            if not self.keep_difficult and difficult:
                continue
            name = obj.find('name').text.lower().strip()
            bbox = obj.find('bndbox')

            row = [int(bbox.find(pt).text)
                   for pt in ('xmin', 'ymin', 'xmax', 'ymax')]
            row.append(self.class_to_ind[name])
            rows.append(row)  # [xmin, ymin, xmax, ymax, label_ind]

        # Build the array once rather than re-stacking it for every object.
        return np.array(rows, dtype=np.float64).reshape(-1, 5)


class VOCDetection(data.Dataset):

    """WIDER FACE detection dataset with VOC-style annotations

    input is image, target is annotation

    Arguments:
        root (string): filepath to WIDER folder; must contain img_list.txt
            plus the 'images' and 'annotations' sub-directories
        preproc (callable, optional): called as preproc(img, target) and
            expected to return the processed (img, target) pair
        target_transform (callable, optional): transformation to perform on the
            parsed annotation root element
    """

    def __init__(self, root, preproc=None, target_transform=None):
        self.root = root
        self.preproc = preproc
        self.target_transform = target_transform
        self._annopath = os.path.join(self.root, 'annotations', '%s')
        self._imgpath = os.path.join(self.root, 'images', '%s')
        with open(os.path.join(self.root, 'img_list.txt'), 'r') as f:
            # Each non-blank line is split on whitespace; field 0 is used as
            # the image file name and field 1 as the annotation file name.
            # Blank lines are skipped so they do not become unusable entries.
            self.ids = [tuple(line.split()) for line in f if line.strip()]

    def __getitem__(self, index):
        """Return (image tensor, target, image id) for the sample at index.

        Raises:
            OSError: if the image file cannot be read.
        """
        img_id = self.ids[index]
        target = ET.parse(self._annopath % img_id[1]).getroot()
        img_path = self._imgpath % img_id[0]
        img = cv2.imread(img_path, cv2.IMREAD_COLOR)
        if img is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise OSError('failed to read image: %s' % img_path)
        imgid = img_id[0]

        if self.target_transform is not None:
            target = self.target_transform(target)

        if self.preproc is not None:
            img, target = self.preproc(img, target)

        return torch.from_numpy(img), target, imgid

    def __len__(self):
        return len(self.ids)


def detection_collate(batch):
    """Custom collate fn for batches whose images carry a different number of
    object annotations (bounding boxes).

    Every element of every sample is inspected: tensors are collected as
    images, NumPy arrays are converted to float tensors and collected as
    annotations, and anything else is ignored.

    Arguments:
        batch: (iterable) samples, each an iterable of tensors / arrays

    Return:
        A tuple containing:
            1) (tensor) batch of images stacked on their 0 dim
            2) (list of tensors) one float annotation tensor per array found
    """
    images = []
    annotations = []
    for sample in batch:
        for item in sample:
            if torch.is_tensor(item):
                images.append(item)
            elif isinstance(item, np.ndarray):
                annotations.append(torch.from_numpy(item).float())

    return (torch.stack(images, 0), annotations)

In [7]:
# Configuration for the cells below.
# img_dim and rgb_mean are passed to preproc; training_dataset is the root
# directory passed to VOCDetection.
img_dim = 1024 # only 1024 is supported
rgb_mean = (104, 117, 123) # bgr order
# Matches the two entries of WIDER_CLASSES (background and face).
num_classes = 2
# NOTE(review): initial_lr, gamma, max_epoch, resume_net and resume_epoch are
# not used in the visible cells; presumably learning-rate schedule and
# checkpoint-resume settings for a later training loop -- TODO confirm.
initial_lr = 1e-3
gamma = 0.1
max_epoch = 5
training_dataset = './FaceBoxes.PyTorch/data/WIDER_FACE'
resume_net = None
resume_epoch = 0

In [9]:
# Build the training dataset: AnnotationTransform turns each parsed annotation
# into a box/label array and preproc augments every (image, target) pair.
dataset = VOCDetection(training_dataset, preproc(img_dim, rgb_mean), AnnotationTransform())

In [12]:
## Each run of the following command gives a different result (tensor values), but the image name stays the same.
dataset[0]

(tensor([[[ -8.,  -4.,   1.,  ...,  63.,  62.,  61.],
          [ -8.,  -5.,  -1.,  ...,  63.,  62.,  61.],
          [-11.,  -8.,  -6.,  ...,  63.,  62.,  61.],
          ...,
          [-26., -25., -25.,  ..., -72., -72., -72.],
          [-27., -26., -28.,  ..., -72., -72., -72.],
          [-27., -27., -29.,  ..., -72., -72., -72.]],
 
         [[-18., -13.,  -9.,  ...,  58.,  58.,  58.],
          [-19., -16., -12.,  ...,  58.,  58.,  58.],
          [-23., -20., -17.,  ...,  58.,  58.,  58.],
          ...,
          [ -5.,  -5.,  -5.,  ..., -85., -85., -85.],
          [ -6.,  -6.,  -9.,  ..., -85., -85., -85.],
          [ -6.,  -7., -10.,  ..., -85., -85., -85.]],
 
         [[-25., -20., -13.,  ...,  52.,  52.,  52.],
          [-26., -22., -16.,  ...,  52.,  52.,  52.],
          [-30., -26., -22.,  ...,  52.,  52.,  52.],
          ...,
          [-28., -28., -31.,  ..., -91., -91., -91.],
          [-29., -30., -34.,  ..., -91., -91., -91.],
          [-30., -31., -34.,  .

In [27]:
batch_size = 1
# Pass batch_size explicitly so that changing the value above takes effect;
# it was previously defined but never handed to the DataLoader.
# NOTE: with batch_size > 1 the per-image annotation arrays can differ in
# length, which is the case detection_collate is written for (pass it as
# collate_fn); note that it drops the image id from each sample.
batch_iterator = data.DataLoader(dataset, batch_size=batch_size, shuffle=False)

In [29]:
batch_iterator

<torch.utils.data.dataloader.DataLoader at 0x7fb41910e780>

In [36]:
# Print only the first batch produced by the loader, then stop iterating.
for r, rec in enumerate(batch_iterator):
    print (rec)
    break

[tensor([[[[  4.,   4.,   2.,  ...,   3.,   3.,   4.],
          [  4.,   4.,   2.,  ...,   3.,   3.,   4.],
          [  2.,   2.,   3.,  ...,   4.,   4.,   5.],
          ...,
          [-23., -23., -27.,  ..., -76., -76., -76.],
          [-23., -23., -27.,  ..., -76., -76., -76.],
          [-22., -22., -21.,  ..., -76., -76., -76.]],

         [[ 18.,  18.,  16.,  ...,  -9.,  -9.,  -8.],
          [ 18.,  18.,  16.,  ...,  -9.,  -9.,  -8.],
          [ 16.,  16.,  15.,  ...,  -8.,  -8.,  -7.],
          ...,
          [ 19.,  19.,  15.,  ..., -89., -89., -89.],
          [ 19.,  19.,  15.,  ..., -89., -89., -89.],
          [ 21.,  21.,  22.,  ..., -89., -89., -89.]],

         [[ 19.,  19.,  19.,  ...,  -6.,  -6.,  -5.],
          [ 19.,  19.,  19.,  ...,  -6.,  -6.,  -5.],
          [ 15.,  15.,  17.,  ...,  -5.,  -5.,  -4.],
          ...,
          [ -9.,  -9., -13.,  ..., -95., -95., -95.],
          [ -9.,  -9., -13.,  ..., -95., -95., -95.],
          [ -8.,  -8.,  -7.,  ..