In [1]:
from mmdet.apis import init_detector, inference_detector
from mmdet.datasets import replace_ImageToTensor
from mmdet.datasets.pipelines import Compose
from mmcv.parallel import collate, scatter
import mmcv
import os
import csv
import shutil
import numpy as np
from torchvision.models import resnet50, ResNet50_Weights, resnet152, ResNet152_Weights
from torchvision.transforms.functional import to_pil_image
import torch
import matplotlib.pyplot as plt
import datetime as dt
import cv2

  warn(f"Failed to load image Python extension: {e}")


In [2]:
# object detection model
# Config and checkpoint paths are relative to the notebook's working directory.
config_file = 'configs/oln_box/faster_rcnn_r50_fpn_1x_coco.py'
checkpoint_file = 'checkpoints/faster_rcnn_r50_fpn_1x_coco_20200130-047c8118.pth'
# Build the detector from the config + checkpoint on the first CUDA device.
# `object_detector` is reused by every inference cell below.
object_detector = init_detector(config_file, checkpoint_file, device='cuda:0')

load checkpoint from local path: checkpoints/faster_rcnn_r50_fpn_1x_coco_20200130-047c8118.pth


In [3]:
# 

In [4]:
# Location of the downloaded videos; the 'debug' subfolder holds the small
# subset used in this notebook.
root = '/home/vsuciu/data/hdvila/videos/'
video_dir = os.path.join(root, 'debug')
# os.listdir() returns entries in arbitrary order, so sort the listing to make
# `video_fps[0]` (used below) refer to the same file on every run.
# Files ending in '.part' are skipped (presumably unfinished downloads).
video_fps = [
    os.path.join(video_dir, name)
    for name in sorted(os.listdir(video_dir))
    if not name.endswith('.part')
]

## Trying to extract the PyTorch module from `object_detector`

In [5]:
# Check which concrete class init_detector returned.
print(type(object_detector))

<class 'mmdet.models.detectors.faster_rcnn.FasterRCNN'>


In [6]:
# Inspect the raw checkpoint file to see what it contains.
# - Reuse `checkpoint_file` so this cell always inspects the checkpoint that
#   was actually loaded into `object_detector`.
# - map_location='cpu' keeps the tensors off the GPU; only the keys are needed.
# NOTE: torch.load unpickles the file - only use it on trusted checkpoints.
torch_detector = torch.load(checkpoint_file, map_location='cpu')
print(type(torch_detector))
print(list(torch_detector.keys()))

<class 'dict'>
['meta', 'state_dict']


## Original batch R-CNN attempt

In [7]:
# Class names of the detector; a class's position in this tuple is the
# integer label used in the per-class results printed further down.
print(object_detector.CLASSES)

('person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat', 'traffic_light', 'fire_hydrant', 'stop_sign', 'parking_meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports_ball', 'kite', 'baseball_bat', 'baseball_glove', 'skateboard', 'surfboard', 'tennis_racket', 'bottle', 'wine_glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot_dog', 'pizza', 'donut', 'cake', 'chair', 'couch', 'potted_plant', 'bed', 'dining_table', 'toilet', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell_phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy_bear', 'hair_drier', 'toothbrush')


In [8]:
# Open the first listed video and sample every 20th frame index below 100.
vid_reader = mmcv.VideoReader(video_fps[0])
frame_idx = list(range(0, 100, 20))
print(frame_idx)

[0, 20, 40, 60, 80]


In [9]:
# Read the sampled frames and stack them along a new leading axis so the
# batch is a single array (one entry per sampled frame).
frames = [vid_reader[frame_no] for frame_no in frame_idx]
batch_frames = np.stack(frames)  # axis=0 is the default
print(batch_frames.shape)

(5, 1080, 1920, 3)


In [10]:
def predict_bboxes(model, frame, score_thresh):
    """Run the detector on one frame and flatten its per-class output.

    Args:
        model: Detector, passed straight through to ``inference_detector``.
        frame: Image, passed straight through to ``inference_detector``
            (this notebook passes a single video frame as a numpy array).
        score_thresh: Rows whose score is not strictly greater than this value
            are dropped. A value <= 0 disables the filtering.

    Returns:
        tuple: ``(result, bboxes, labels, scores)`` where

        - ``result`` is the untouched ``inference_detector`` output, iterated
          here as one 2-D array per class index;
        - ``bboxes`` is all rows stacked and cast to ``np.int32`` (the cast
          also truncates the trailing score column);
        - ``labels`` is an ``np.int32`` array with the class index of each row;
        - ``scores`` is the last column of the stacked rows, taken before the
          integer cast.
    """
    result = inference_detector(model, frame)

    # One class id per detection row, in the same order np.vstack stacks them.
    label_chunks = []
    for class_id, class_dets in enumerate(result):
        label_chunks.append(np.full(class_dets.shape[0], class_id, dtype=np.int32))
    labels = np.concatenate(label_chunks)

    detections = np.vstack(result)
    scores = detections[:, -1]  # last column holds the score

    if score_thresh > 0:
        keep = scores > score_thresh
        detections, labels, scores = detections[keep, :], labels[keep], scores[keep]

    return result, detections.astype(np.int32), labels, scores

In [11]:
def reformat_data(img_list, model):
    """Run every image through the model's test pipeline, one sample at a time.

    The per-image steps mirror the data preparation of a single-image
    inference call: build the test pipeline from ``model.cfg``, apply it,
    collate into a batch of one, unwrap the containers and scatter to the
    model's device.

    Args:
        img_list: Iterable of images. ``np.ndarray`` items are fed directly
            (the loading step is switched to ``'LoadImageFromWebcam'``); any
            other item is treated as a filename.
        model: Detector exposing ``cfg`` and ``parameters()``.

    Returns:
        list[dict]: One dict per input image with the keys ``'img'`` and
        ``'img_metas'`` (each a list), in input order.
    """
    # The model device does not change between images, so look it up once.
    device = next(model.parameters()).device

    reformat_imgs = []
    for img in img_list:
        cfg = model.cfg

        # prepare data
        if isinstance(img, np.ndarray):
            # directly add img
            data = dict(img=img)
            cfg = cfg.copy()
            # set loading pipeline type
            cfg.data.test.pipeline[0].type = 'LoadImageFromWebcam'
        else:
            # add information into dict
            data = dict(img_info=dict(filename=img), img_prefix=None)

        # build the data pipeline
        cfg.data.test.pipeline = replace_ImageToTensor(cfg.data.test.pipeline)
        test_pipeline = Compose(cfg.data.test.pipeline)
        data = test_pipeline(data)
        data = collate([data], samples_per_gpu=1)

        # just get the actual data from DataContainer
        data['img_metas'] = [metas.data[0] for metas in data['img_metas']]
        data['img'] = [container.data[0] for container in data['img']]

        # scatter to specified GPU
        data = scatter(data, [device])[0]
        # Tensor.to() returns the moved tensor instead of modifying in place,
        # so the result has to be stored back or the call has no effect.
        data['img'][0] = data['img'][0].to(device)

        reformat_imgs.append(data)

    return reformat_imgs

# Test batch inference method

Manually call `object_detector.forward()`

When called with a batch of images (`List[np.ndarray]`), the call chain is the following:

* `BaseDetector.forward` from `mmdetection/mmdet/models/detectors/base.py` calls `self.aug_test`, which is bound to `TwoStageDetector.aug_test` from `mmdetection/mmdet/models/detectors/two_stage.py`

* `self.rpn_head.aug_test_rpn` from `mmdetection/mmdet/models/detectors/two_stage.py` bound to `(TODO, figure out binding)`

    * Find out where this leads

* `self.roi_head.aug_test` from `mmdetection/mmdet/models/detectors/two_stage.py` bound to `StandardRoIHead.aug_test` from `mmdetection/mmdet/models/roi_heads/standard_roi_head.py`

    * `BBoxTestMixin.aug_test_bboxes` from `mmdetection/mmdet/models/roi_heads/test_mixins.py`

In [12]:
reformat_imgs = reformat_data(batch_frames, object_detector)

# NOTE: each list entry below is a separate one-image tensor. As the output of
# this cell shows (a single result and the aug_test prints), the detector takes
# the aug_test path for this input instead of returning one result per frame.
with torch.no_grad():  # inference only - do not record an autograd graph
    infer = object_detector.forward(
        [item['img'][0] for item in reformat_imgs],
        # pair every image with its own metadata (not always entry 0)
        [item['img_metas'][0] for item in reformat_imgs],
        return_loss=False
    )

print(len(infer))
print(len(infer[0]))
# Show only the classes that produced at least one box.
for class_id, bbox in enumerate(infer[0]):
    if len(bbox) > 0:
        print(class_id, bbox)

base aug_test




two_stage aug_test - calling self.roi_head.aug_test
1
80
0 [[460.11642     62.754158   991.1607     743.7285       0.99744415]]
27 [[5.5363916e+02 3.7316034e+02 7.4738928e+02 7.4429749e+02 1.3386989e-01]
 [5.1356573e+02 3.2176093e+02 8.9700031e+02 7.4873615e+02 8.1769660e-02]]
45 [[1.2465894e+03 1.7366795e+02 1.3327893e+03 2.8916534e+02 8.0856077e-02]]
62 [[3.3159723e+02 2.2433337e+02 4.7603149e+02 4.5161020e+02 6.0054678e-02]]
67 [[3.3062119e+02 2.3768761e+02 4.7591162e+02 4.6843781e+02 9.6066013e-02]]
68 [[1.0962365e+03 1.4370242e+02 1.3020259e+03 2.9505151e+02 7.0088357e-01]
 [1.0572594e+03 1.1019763e+02 1.3205237e+03 3.5426678e+02 1.2704824e-01]
 [1.0354945e+03 3.7179543e+01 1.3273271e+03 5.0871283e+02 7.9586133e-02]]
69 [[1.0171064e+03 5.3764824e+01 1.3313906e+03 5.9398468e+02 8.7112255e-02]]
70 [[1.1080841e+03 1.4529343e+02 1.2940260e+03 2.8342053e+02 9.2800058e-02]]
72 [[5.6196468e+01 1.3900958e+01 6.4745679e+02 7.4908209e+02 5.4173738e-01]
 [2.5777100e+02 7.5362182e+00 8.415120

In [13]:
# Reference run: single-image inference on the first frame, for comparison
# with the manual forward() call above.
print(type(batch_frames[0]))
result, bboxes, labels, scores = predict_bboxes(
    model=object_detector, frame=batch_frames[0], score_thresh=0.1
)

print(labels)

<class 'numpy.ndarray'>
[ 0 41 45 45 62 68 68 68 70 72 72 72 72 73]
