In [2]:
from keras.engine.topology import Layer
import keras.backend as K


class RoiPoolingConv(Layer):
    """ROI pooling layer for 2D inputs.
    See Spatial Pyramid Pooling in Deep Convolutional Networks for Visual Recognition,
    K. He, X. Zhang, S. Ren, J. Sun
    # Arguments
        pool_size: int
            Size of pooling region to use. pool_size = 7 will result in a 7x7 region.
        num_rois: number of regions of interest to be used
    # Input shape
        list of two 4D tensors [X_img,X_roi] with shape:
        X_img:
        `(1, channels, rows, cols)` if dim_ordering='th'
        or 4D tensor with shape:
        `(1, rows, cols, channels)` if dim_ordering='tf'.
        X_roi:
        `(1,num_rois,4)` list of rois, with ordering (x,y,w,h)
    # Output shape
        3D tensor with shape:
        `(1, num_rois, channels, pool_size, pool_size)`
    """

    def __init__(self, pool_size, num_rois, **kwargs):
        self.pool_size = pool_size
        self.num_rois = num_rois

        super(RoiPoolingConv, self).__init__(**kwargs)

    def build(self, input_shape):
        self.nb_channels = input_shape[0][3]

    def compute_output_shape(self, input_shape):
        return None, self.num_rois, self.pool_size, self.pool_size, self.nb_channels

    def call(self, x, mask=None):

        assert (len(x) == 2)

        img = x[0]
        rois = x[1]

        input_shape = K.shape(img)

        outputs = []

        for roi_idx in range(self.num_rois):

            x = rois[0, roi_idx, 0]
            y = rois[0, roi_idx, 1]
            w = rois[0, roi_idx, 2]
            h = rois[0, roi_idx, 3]

            row_length = w / float(self.pool_size)
            col_length = h / float(self.pool_size)

            num_pool_regions = self.pool_size

            for jy in range(num_pool_regions):
                for ix in range(num_pool_regions):
                    x1 = x + ix * row_length
                    x2 = x1 + row_length
                    y1 = y + jy * col_length
                    y2 = y1 + col_length

                    x1 = K.cast(x1, 'int32')
                    x2 = K.cast(x2, 'int32')
                    y1 = K.cast(y1, 'int32')
                    y2 = K.cast(y2, 'int32')

                    new_shape = [input_shape[0], y2 - y1,
                                 x2 - x1, input_shape[3]]
                    x_crop = img[:, y1:y2, x1:x2, :]
                    xm = K.reshape(x_crop, new_shape)
                    pooled_val = K.max(xm, axis=(1, 2))
                    outputs.append(pooled_val)

        final_output = K.concatenate(outputs, axis=0)
        final_output = K.reshape(final_output, (1, self.num_rois, self.pool_size, self.pool_size, self.nb_channels))

        final_output = K.permute_dimensions(final_output, (0, 1, 2, 3, 4))

        return final_output

In [3]:
import pdb

import keras.backend as K
import numpy as np
from keras.layers import Input
from keras.models import Model

pooling_regions = 2
num_rois = 4
num_channels = 3

in_img = Input(shape=(None, None, num_channels))

in_roi = Input(shape=(num_rois, 4))

out_roi_pool = RoiPoolingConv(pooling_regions, num_rois)([in_img, in_roi])

model = Model([in_img, in_roi], out_roi_pool)
model.summary()

model.compile(loss='mse', optimizer='sgd')

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, None, None, 3 0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 4, 4)         0                                            
__________________________________________________________________________________________________
roi_pooling_conv_1 (RoiPoolingC (None, 4, 2, 2, 3)   0           input_1[0][0]                    
                                                                 input_2[0][0]                    
Total params: 0
Trainable params: 0
Non-trainable params: 0
__________________________________________________________________________________________________


In [4]:
for img_size in [32]:
    X_img = np.random.rand(1, img_size, img_size, num_channels)
    row_length = [float(X_img.shape[1]) / pooling_regions]
    col_length = [float(X_img.shape[2]) / pooling_regions]

    X_roi = np.array([[0, 0, img_size / 2, img_size / 2],
                      [0, img_size / 2, img_size / 2, img_size / 2],
                      [img_size / 2, 0, img_size / 2, img_size / 2],
                      [img_size / 2, img_size / 2, img_size / 2, img_size / 2]])
    
    X_roi = np.reshape(X_roi, (1, num_rois, 4))

    Y = model.predict([X_img, X_roi])
    
    




In [5]:
X_img.shape

(1, 32, 32, 3)

In [7]:
for roi in range(num_rois):
    X_curr = X_img[0, int(X_roi[0, roi, 1]):int(X_roi[0, roi, 1] + X_roi[0, roi, 3]),
             int(X_roi[0, roi, 0]):int(X_roi[0, roi, 0] + X_roi[0, roi, 2]), :]
    row_length = float(X_curr.shape[0]) / pooling_regions
    col_length = float(X_curr.shape[1]) / pooling_regions
    
    idx = 0
    
    for ix in range(pooling_regions):
        for jy in range(pooling_regions):
            for cn in range(num_channels):

                x1 = int((ix * col_length))
                x2 = int((ix * col_length + col_length))
                y1 = int((jy * row_length))
                y2 = int((jy * row_length + row_length))
                dx = max(1, x2 - x1)
                dy = max(1, y2 - y1)
                x2 = x1 + dx
                y2 = y1 + dy

                m_val = np.max(X_curr[y1:y2, x1:x2, cn])

                if abs(m_val - Y[0, roi, jy, ix, cn]) < 0.01:
                    pdb.set_trace()
                np.testing.assert_almost_equal(
                    m_val, Y[0, roi, jy, ix, cn], decimal=6)
                idx += 1

> <ipython-input-7-7806c0f0b4be>(26)<module>()
-> np.testing.assert_almost_equal(
(Pdb) print
<built-in function print>
(Pdb) print
<built-in function print>
(Pdb) list
 21  	
 22  	                m_val = np.max(X_curr[y1:y2, x1:x2, cn])
 23  	
 24  	                if abs(m_val - Y[0, roi, jy, ix, cn]) < 0.01:
 25  	                    pdb.set_trace()
 26  ->	                np.testing.assert_almost_equal(
 27  	                    m_val, Y[0, roi, jy, ix, cn], decimal=6)
 28  	                idx += 1
[EOF]
(Pdb) where
  /usr/lib/python3.7/runpy.py(193)_run_module_as_main()
-> "__main__", mod_spec)
  /usr/lib/python3.7/runpy.py(85)_run_code()
-> exec(code, run_globals)
  /home/jhjung/tensorflow_1.14.0_env/lib/python3.7/site-packages/ipykernel_launcher.py(16)<module>()
-> app.launch_new_instance()
  /home/jhjung/tensorflow_1.14.0_env/lib/python3.7/site-packages/traitlets/config/application.py(664)launch_instance()
-> app.start()
  /home/jhjung/tensorflow_1.14.0_env/lib/python3.7/site

BdbQuit: 

In [None]:
row_length