In [16]:
import numpy as np


class Box:
    r'''
    Axis-aligned bounding box stored in corner form.

    Corner boxes are encoded as (xmin, ymin, xmax, ymax)
    Center boxes are encoded as (center_x, center_y, width, height)

    Coordinates are treated as inclusive pixel indices: a box with
    ``xmin == xmax`` is one pixel wide. Width, height, area, intersection
    and union all follow this same ``+ 1`` convention.

    Parameters
    ----------
    corner : sequence of 4 numbers
        The box as (xmin, ymin, xmax, ymax).
    '''

    def __init__(self, corner):
        self._corner = corner

    @property
    def corner(self):
        '''The box as (xmin, ymin, xmax, ymax).'''
        return self._corner

    @corner.setter
    def corner(self, new_corner):
        self._corner = new_corner

    @property
    def w(self):
        '''
        Width of the box (inclusive pixel count).
        '''
        return self.corner[2] - self.corner[0] + 1

    @property
    def h(self):
        '''
        Height of the box (inclusive pixel count).
        '''
        return self.corner[3] - self.corner[1] + 1

    @property
    def whctrs(self):
        '''
        Center of the box as [center_x, center_y].
        '''
        x_ctr = self.corner[0] + 0.5 * (self.w - 1)
        y_ctr = self.corner[1] + 0.5 * (self.h - 1)
        return [x_ctr, y_ctr]

    @property
    def corner2center(self):
        '''
        (xmin, ymin, xmax, ymax) to (center_x, center_y, width, height)
        '''
        return self.whctrs + [self.w, self.h]

    @property
    def size(self):
        '''Area of the box, ``w * h``.'''
        return self.w * self.h

    def __and__(self, other):
        '''
        Operator ``&``: area of the intersection of two boxes.

        Returns 0 when the boxes do not overlap.
        '''
        stacked = np.array([self.corner, other.corner])
        # Each unpacked name holds that coordinate for both boxes.
        xmin, ymin, xmax, ymax = stacked.T
        # "+ 1" matches the inclusive-pixel convention of ``w`` and ``h``;
        # clamping at 0 keeps disjoint boxes from producing a negative
        # extent (or a positive product of two negative extents).
        w = max(0, xmax.min() - xmin.max() + 1)
        h = max(0, ymax.min() - ymin.max() + 1)
        return w * h

    def __or__(self, other):
        '''
        Operator ``|``: area of the union of two boxes.
        '''
        I = self & other
        return self.size + other.size - I

    def IoU(self, other):
        '''
        Intersection over union of two boxes.
        '''
        I = self & other
        # Same value as ``self | other`` without recomputing the intersection.
        U = self.size + other.size - I
        return I / U


class Anchor(Box):
    r'''
    Reference anchor box together with the dense anchor grid built from it.

    The reference box is ``[0, 0, base_size - 1, base_size - 1]``.

    Parameters
    ----------
    stride : int
        Feature map stride with respect to original image.
        Neighbouring anchor positions are ``stride`` pixels apart.
    base_size : int
        The width (and height) of the reference anchor box. Must be truthy.
    scales : iterable of float
        Multipliers applied to the ratio-adjusted width and height.
        A single value is wrapped into a one-element list.
    ratios : iterable of float
        Aspect ratios (height / width) of anchor boxes.
        A single value is wrapped into a one-element list.
        For each ratio and scale the shape is computed as

        .. math::

            w_{ratio} = round(\sqrt{area_{base} / ratio})
            h_{ratio} = round(w_{ratio} \times ratio)
            width_{anchor} = w_{ratio} \times scale
            height_{anchor} = h_{ratio} \times scale

    alloc_size : tuple of int
        Size (H, W) of the grid of positions for which anchors are generated.

    Raises
    ------
    ValueError
        If ``base_size`` is falsy (e.g. 0 or None).
    '''

    def __init__(self,  stride, base_size, scales, ratios, alloc_size):
        if not base_size:
            raise ValueError("Invalid base_size: {}.".format(base_size))
        # Promote scalar arguments to one-element lists.
        ratios = ratios if isinstance(ratios, (tuple, list)) else [ratios]
        scales = scales if isinstance(scales, (tuple, list)) else [scales]
        super().__init__([0, 0, base_size - 1, base_size - 1])
        self.stride = stride
        self.alloc_size = alloc_size
        self.scales = scales
        self.ratios = ratios

    @property
    def base_sizes(self):
        '''
        Corner boxes for every (ratio, scale) pair, all centred on the
        reference box. Returns an array with one row per pair, ordered
        ratio-major.
        '''
        ctr_x, ctr_y = self.whctrs

        def corner_for(ratio, scale):
            # Ratio-adjusted side lengths, rounded to whole pixels.
            side_w = np.round(np.sqrt(self.size / ratio))
            side_h = np.round(side_w * ratio)
            # Half extents under the inclusive-pixel convention.
            half_w = 0.5 * (side_w * scale - 1)
            half_h = 0.5 * (side_h * scale - 1)
            return [ctr_x - half_w, ctr_y - half_h,
                    ctr_x + half_w, ctr_y + half_h]

        return np.array([corner_for(ratio, scale)
                         for ratio in self.ratios
                         for scale in self.scales])

    @property
    def anchors(self):
        '''
        All anchors for an ``alloc_size`` grid, as a float32 array of shape
        (1, 1, H, W, num_base_boxes * 4).
        '''
        rows, cols = self.alloc_size
        step = self.stride
        shift_x, shift_y = np.meshgrid(np.arange(0, cols * step, step),
                                       np.arange(0, rows * step, step))
        flat_x = shift_x.ravel()
        flat_y = shift_y.ravel()
        # One (dx, dy, dx, dy) row per grid position.
        shifts = np.stack((flat_x, flat_y, flat_x, flat_y), axis=1)
        # Broadcast (1, N, 4) base boxes against (M, 1, 4) shifts.
        grid = self.base_sizes.reshape((1, -1, 4)) + shifts.reshape((-1, 1, 4))
        return grid.reshape((1, 1, rows, cols, -1)).astype(np.float32)

In [17]:
# Anchor configuration
base_size = 16  # side length of the reference anchor box
stride = 16  # step between neighbouring anchor positions; usually the image / feature-map size ratio
scales = [8, 16, 32]  # scale multipliers applied to the ratio-adjusted box
ratios = [0.5, 1, 2]  # aspect ratios (height / width) of the anchors
alloc_size = (50, 50)  # default feature-map size (H, W); later calls just slice this grid
A = Anchor(stride=stride,
           base_size=base_size,
           scales=scales,
           ratios=ratios,
           alloc_size=alloc_size)

In [27]:
import mxnet as mx
from mxnet import autograd, gluon, nd
from mxnet.gluon import nn

In [22]:
class RPNAnchorGenerator(gluon.HybridBlock):
    r"""Serve pre-computed anchors sliced to the size of an input feature map.

    Parameters
    ----------
    anchors : Anchor
        Object exposing ``ratios``, ``scales`` and an ``anchors`` array.
        The array is stored as a constant parameter of this block.
    """

    def __init__(self, anchors, **kwargs):
        super().__init__(**kwargs)
        # Read from the argument rather than a notebook-global, so the block
        # reflects whatever Anchor instance the caller passed in.
        self._num_depth = len(anchors.ratios) * len(anchors.scales)
        self.anchors = self.params.get_constant('anchor_', anchors.anchors)

    @property
    def num_depth(self):
        """Number of anchors at each pixel."""
        return self._num_depth

    # pylint: disable=arguments-differ
    def hybrid_forward(self, F, x, anchors):
        """Slice anchors given the input image shape.

        Inputs:
            - **x**: input tensor with (1 x C x H x W) shape.
        Outputs:
            - **out**: output anchor with (1, N, 4) shape. N is the number of anchors.

        """
        # Crop the anchor map along axes 2 and 3 to match x's H and W.
        # NOTE(review): ``x * 0`` presumably serves only as a shape reference
        # for slice_like - confirm.
        a = F.slice_like(anchors, x * 0, axes=(2, 3))
        return a.reshape((1, -1, 4))

In [26]:
# Build the anchor generator from the Anchor instance A, then initialize
# its parameters so it can be called.
anchor_generator = RPNAnchorGenerator(A)
anchor_generator.initialize()

In [32]:
# Dummy input of shape (1, 3, 22, 22); the generator documents its input as (1 x C x H x W).
x = nd.ones((1, 3, 22, 22))

In [33]:
# Slice the pre-allocated anchors to x's spatial size and flatten them to (1, N, 4).
anchor_generator(x)


[[[ -84.  -40.   99.   55.]
  [-176.  -88.  191.  103.]
  [-360. -184.  375.  199.]
  ...
  [ 300.  256.  387.  431.]
  [ 256.  168.  431.  519.]
  [ 168.   -8.  519.  695.]]]
<NDArray 1x4356x4 @cpu(0)>

In [24]:
class RPN(gluon.HybridBlock):
    r"""Region Proposal Network.

    Parameters
    ----------
    channels : int
        Channel number used in convolutional layers.
    A : Anchor
        Anchor configuration handed to ``RPNAnchorGenerator``. It determines
        the number of anchors per position, and therefore the number of
        output channels of the score and box heads.
    clip : float
        Clip bounding box target to this value.
    nms_thresh : float
        IOU threshold for NMS. It is used to remove overlapping proposals.
    train_pre_nms : int
        Filter top proposals before NMS in training.
    train_post_nms : int
        Return top proposal results after NMS in training.
    test_pre_nms : int
        Filter top proposals before NMS in testing.
    test_post_nms : int
        Return top proposal results after NMS in testing.
    min_size : int
        Proposals whose size is smaller than ``min_size`` will be discarded.

    """

    def __init__(self, channels, A,
                 clip, nms_thresh, train_pre_nms, train_post_nms,
                 test_pre_nms, test_post_nms, min_size, **kwargs):
        super().__init__(**kwargs)
        weight_initializer = mx.init.Normal(0.01)
        with self.name_scope():
            self.anchor_generator = RPNAnchorGenerator(A)
            anchor_depth = self.anchor_generator.num_depth
            # NOTE(review): RPNProposal is neither defined nor imported in the
            # visible notebook cells; it must be in scope before this class
            # is instantiated.
            self.region_proposaler = RPNProposal(
                clip, nms_thresh, train_pre_nms, train_post_nms,
                test_pre_nms, test_post_nms, min_size, stds=(1., 1., 1., 1.))
            self.conv1 = nn.HybridSequential()
            self.conv1.add(nn.Conv2D(channels, 3, 1, 1,
                                     weight_initializer=weight_initializer))
            self.conv1.add(nn.Activation('relu'))
            # use sigmoid instead of softmax, reduce channel numbers:
            # one score channel per anchor, four box channels per anchor
            self.score = nn.Conv2D(anchor_depth, 1, 1, 0,
                                   weight_initializer=weight_initializer)
            self.loc = nn.Conv2D(anchor_depth * 4, 1, 1, 0,
                                 weight_initializer=weight_initializer)

    # pylint: disable=arguments-differ
    def hybrid_forward(self, F, x, img):
        """Forward RPN.

        The behavior during training and inference is different.

        Parameters
        ----------
        x : mxnet.nd.NDArray or mxnet.symbol
            Feature tensor.
        img : mxnet.nd.NDArray or mxnet.symbol
            The original input image.

        Returns
        -------
        (rpn_score, rpn_box)
            Returns predicted scores and regions which are candidates of objects.
            While autograd is in training mode, the tuple is extended to
            (rpn_score, rpn_box, raw_rpn_scores, rpn_box_pred, anchors).

        """
        anchors = self.anchor_generator(x)
        x = self.conv1(x)
        # The score head emits one channel per anchor, so each row holds a
        # single sigmoid score. In the reshape, 0 keeps the batch dimension
        # and -1 infers the anchor count, which then matches the rows of
        # rpn_box_pred and of anchors.
        raw_rpn_scores = self.score(x).transpose(
            axes=(0, 2, 3, 1)).reshape((0, -1, 1))
        rpn_scores = F.sigmoid(F.stop_gradient(raw_rpn_scores))
        rpn_box_pred = self.loc(x).transpose(
            axes=(0, 2, 3, 1)).reshape((0, -1, 4))
        # Proposals are built from gradient-stopped predictions.
        rpn_score, rpn_box = self.region_proposaler(
            anchors, rpn_scores, F.stop_gradient(rpn_box_pred), img)
        if autograd.is_training():
            # return raw predictions as well in training for bp
            return rpn_score, rpn_box, raw_rpn_scores, rpn_box_pred, anchors
        return rpn_score, rpn_box

RPNAnchorGenerator(

)