In [2]:
import cv2
import torchvision
import numpy as np
from keras.preprocessing.image import load_img
import torchvision.transforms as T

# Faster R-CNN (ResNet-50 + FPN) with pretrained weights, put into inference mode.
# NOTE(review): newer torchvision releases deprecate `pretrained=True` in favour of
# `weights=...`; left unchanged so the notebook keeps working on the version it was
# written for -- confirm against the installed torchvision.
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)
model.eval()

# Load the sample image resized to 800x800 and turn it into a tensor.
img_path = '2007_000032.jpg'
transform = T.Compose([T.ToTensor()])  # Define the PyTorch transform
img = transform(load_img(img_path, target_size=(800, 800)))  # Apply it to the image

# Reproduce the first step of the detector's forward pass by hand:
# remember the per-image (H, W), then let the model's own transform build the batch.
# After this call `images` is no longer a list; later cells read its
# `.tensors` and `.image_sizes` attributes.
targets = None
images = [img]
original_image_sizes = [image.shape[-2:] for image in images]
images, targets = model.transform(images, targets)

提取特征

In [5]:
# Run the batched image tensor through the backbone to obtain the feature maps.
batch_tensor = images.tensors
features = model.backbone(batch_tensor)

提取proposals

In [6]:
# The region proposal network returns the proposals (one tensor per image)
# together with its losses; only the proposals are used below.
rpn_output = model.rpn(images, features, targets)
proposals, proposal_losses = rpn_output
proposals[0].shape

torch.Size([1000, 4])

第一张图片共得到 1000 个 proposals（每个为 [x1, y1, x2, y2] 四个坐标），我们任取其中一个（下标 76）进行计算

In [7]:
# Arbitrary proposal of the first image used for the worked example.
proposal_index = 76
proposals[0][proposal_index]

tensor([257.4335, 268.6282, 648.8815, 444.5958])

输入图片大小是 800x800，第三层特征图大小是 50x50，所以缩放比例是 800 / 50 = 16

In [9]:
# Image sizes recorded on the batch object returned by model.transform.
image_shapes = images.image_sizes
image_shapes  # bare last expression -> shown as the cell output

[torch.Size([800, 800])]

In [10]:
# Select the third feature level by position instead of by key.
# Older torchvision releases key the backbone's output dict with ints (0, 1, 2, ...),
# newer ones with strings ('0', '1', '2', ...), so `features[2]` raises KeyError on
# the latter. Taking the third value works with both.
third_level_feature = list(features.values())[2]
third_level_feature.size()

torch.Size([1, 256, 50, 50])

In [11]:
# Map the chosen proposal from image coordinates to feature-map coordinates.
feature_stride = 16  # 800-pixel image side / 50-cell feature-map side
proposal_box = [257.4335, 268.6282, 648.8815, 444.5958]  # x1, y1, x2, y2 in pixels
[coord / feature_stride for coord in proposal_box]

[16.08959375, 16.7892625, 40.55509375, 27.7872375]

换算到特征图坐标后，该 proposal 对应的区域如下（第一个元素 0 表示它属于 batch 中的第 0 张图片，其余四个是 x1, y1, x2, y2）

In [12]:
import torch
# ROI in the row layout passed to roi_pool below: (batch_index, x1, y1, x2, y2),
# with the box already divided by 16, i.e. expressed in feature-map cells.
roi_batch_index = 0
roi_box_in_feature = [16.0896, 16.7893, 40.5551, 27.7872]
rois_in_feature = torch.Tensor([[roi_batch_index, *roi_box_in_feature]])

In [13]:
# Spot-check a single activation: image 0, channel 0, row 16, column 16.
third_level_feature[0, 0, 16, 16]

tensor(0.0741, grad_fn=<SelectBackward>)

In [14]:
# Quantise the ROI to whole feature-map cells the way torchvision's roi_pool does:
# every coordinate is rounded to the nearest integer. The values are derived from
# `rois_in_feature` (columns 1..4 = x1, y1, x2, y2) rather than typed in by hand, so
# this cell stays consistent with the ROI defined earlier.
# floor(x + 0.5) equals round-to-nearest for the non-negative coordinates used here.
roi_start_w, roi_start_h, roi_end_w, roi_end_h = (
    torch.floor(rois_in_feature[0, 1:] + 0.5).int().tolist()
)  # -> 16, 17, 41, 28 for the ROI of this example

# Start and end cells are both inclusive, hence the +1. A degenerate ROI is forced
# to cover at least one cell so the bin sizes below never become zero or negative.
roi_width = max(roi_end_w - roi_start_w + 1, 1)
roi_height = max(roi_end_h - roi_start_h + 1, 1)

# Size of one output bin in feature-map cells (generally fractional);
# 7 is the side of the 7x7 pooled output grid.
bin_size_w = roi_width / 7
bin_size_h = roi_height / 7

In [16]:
import math
# Hand-rolled ROI max pooling over channel 0 of image 0.
# For output bin (ph, pw) the window is
#   rows [roi_start_h + floor(ph * bin_size_h), roi_start_h + ceil((ph + 1) * bin_size_h))
#   cols [roi_start_w + floor(pw * bin_size_w), roi_start_w + ceil((pw + 1) * bin_size_w))
# clamped to the feature map, and the bin value is the maximum inside that window.
POOLED_SIZE = 7  # side of the output grid; must match the divisor used for bin_size_h / bin_size_w
feat_h, feat_w = third_level_feature.shape[-2:]
channel0 = third_level_feature[0, 0]

self_result = []
for ph in range(POOLED_SIZE):
    # Use the ROI origin computed earlier instead of repeating its literal value.
    h_start = min(max(roi_start_h + math.floor(ph * bin_size_h), 0), feat_h)
    h_end = min(max(roi_start_h + math.ceil((ph + 1) * bin_size_h), 0), feat_h)
    sub_res = []
    for pw in range(POOLED_SIZE):
        w_start = min(max(roi_start_w + math.floor(pw * bin_size_w), 0), feat_w)
        w_end = min(max(roi_start_w + math.ceil((pw + 1) * bin_size_w), 0), feat_w)
        window = channel0[h_start:h_end, w_start:w_end]
        # .max() raises on an empty window; roi_pool emits 0 for such bins.
        sub_res.append(window.max() if window.numel() > 0 else window.new_zeros(()))
    self_result.append(sub_res)
self_result[0]

[tensor(0.4245, grad_fn=<MaxBackward1>),
 tensor(0.4636, grad_fn=<MaxBackward1>),
 tensor(0.3529, grad_fn=<MaxBackward1>),
 tensor(0.0751, grad_fn=<MaxBackward1>),
 tensor(0.3588, grad_fn=<MaxBackward1>),
 tensor(0.6445, grad_fn=<MaxBackward1>),
 tensor(0.0492, grad_fn=<MaxBackward1>)]

我们用 torchvision 的 roi_pool 检验上面的实现是否正确：下面结果的第一行应与上面 self_result[0] 的输出一致

In [17]:
from torchvision.ops import roi_pool
# Reference result from torchvision. The ROI is already expressed in feature-map
# coordinates, so spatial_scale is left at 1.0 (roi_pool applies no further scaling).
result = roi_pool(
    third_level_feature,
    rois_in_feature,
    output_size=(7, 7),
    spatial_scale=1.0,
)

# Compare the whole 7x7 grid numerically instead of checking one printed row by eye.
manual_channel0 = torch.stack([torch.stack(row) for row in self_result])
assert torch.allclose(manual_channel0, result[0][0]), (
    "manual ROI pooling differs from torchvision.ops.roi_pool"
)

result[0][0]

tensor([[ 0.4245,  0.4636,  0.3529,  0.0751,  0.3588,  0.6445,  0.0492],
        [ 1.1585,  0.3858,  0.3529,  0.0751,  0.3801,  0.9784,  0.6303],
        [ 0.8688,  0.2752,  0.2927, -0.1014,  0.2978,  0.9784,  0.6303],
        [-0.3445, -0.1203, -0.4138, -0.6199, -0.0419,  0.5004,  0.3759],
        [ 0.1199,  0.1690,  0.9712,  0.9712,  0.6120,  0.6155,  0.5975],
        [ 0.8110,  0.5443,  1.2172,  1.2172,  0.8078,  1.0437,  1.3142],
        [ 0.8775,  0.6454,  1.1695,  1.1637,  0.6355,  1.2764,  1.3474]],
       grad_fn=<SelectBackward>)