# PoseCNN-Pytorch 코드 정리 본

### Running the demo
1. Download 3D models and our pre-trained checkpoints first.

2. Run the following script:
- ./experiments/scripts/demo.sh


### test_images.py 옵션 정리
__--gpu : 사용할 GPU ID 선택 (기본값: 0)__

\-

__--imgdir : 테스트 이미지 저장 경로 설정__
\-

__--meta : meta 데이터 파일 불러오기__

\-  INTRINSICS: [618.0172729492188, 0.0, 312.376953125, 0.0, 618.0033569335938, 232.37530517578125,0.0, 0.0, 1.0]

__--color : imgdir 내에 있는 객체들이 있는 이미지({number}-color.png) 경로 설정__

\-

__--network : 사용할 CNN 네트워크 선택 (PoseCNN-PyTorch/lib/networks/PoseCNN.py)__

\-  \_\_all\_\_ = ['posecnn',]


__--pretrained : 사전 훈련된 Checkpoint 경로 설정 (Encoder 전용 Checkpoint는 --pretrained_encoder 옵션 사용)__

\-
  
__--dataset : 사용할 데이터셋 이름 설정 (기본값: shapenet_scene_train)__


\-

__--cfg : Config 파일 경로 설정__

\-

In [None]:
def parse_args():
    """Parse the command-line options of the PoseCNN test script.

    The options are the ones passed in by ``demo.sh``. When the script is
    started without any argument, the help text is printed and the process
    exits with status 1.

    Returns:
        argparse.Namespace: the parsed options, stored under the ``dest``
        names listed in the option table below.
    """
    parser = argparse.ArgumentParser(description='Test a PoseCNN network')

    # One (flag, add_argument keyword arguments) entry per option, in the
    # order they should appear in the help output.
    option_table = [
        ('--gpu', dict(dest='gpu_id', help='GPU id to use',
                       default=0, type=int)),
        ('--pretrained', dict(dest='pretrained',
                              help='initialize with pretrained checkpoint',
                              default=None, type=str)),
        ('--pretrained_encoder', dict(dest='pretrained_encoder',
                                      help='initialize with pretrained encoder checkpoint',
                                      default=None, type=str)),
        ('--codebook', dict(dest='codebook', help='codebook',
                            default=None, type=str)),
        ('--cfg', dict(dest='cfg_file', help='optional config file',
                       default=None, type=str)),
        ('--meta', dict(dest='meta_file', help='optional metadata file',
                        default=None, type=str)),
        ('--dataset', dict(dest='dataset_name', help='dataset to train on',
                           default='shapenet_scene_train', type=str)),
        ('--depth', dict(dest='depth_name', help='depth image pattern',
                         default='*depth.png', type=str)),
        ('--color', dict(dest='color_name', help='color image pattern',
                         default='*color.png', type=str)),
        ('--imgdir', dict(dest='imgdir',
                          help='path of the directory with the test images',
                          default='data/Images', type=str)),
        ('--rand', dict(dest='randomize',
                        help='randomize (do not use a fixed seed)',
                        action='store_true')),
        ('--network', dict(dest='network_name', help='name of the network',
                           default=None, type=str)),
        ('--background', dict(dest='background_name',
                              help='name of the background file',
                              default=None, type=str)),
    ]
    for flag, settings in option_table:
        parser.add_argument(flag, **settings)

    # No arguments at all: show usage instead of silently running on defaults.
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)

    return parser.parse_args()

# ...



## PoseCNN.py 정리

![image.png](https://ar5iv.labs.arxiv.org/html/1711.00199/assets/x2.png)

In [None]:
# Copyright (c) 2020 NVIDIA Corporation. All rights reserved.
# This work is licensed under the NVIDIA Source Code License - Non-commercial. Full
# text can be found in LICENSE.md

# 라이브러리 불러오기
import torch
import torch.nn as nn
import torchvision.models as models
import math
import sys
import copy
from torch.nn.init import kaiming_normal_

from layers.hard_label import HardLabel
from layers.hough_voting import HoughVoting
from layers.roi_pooling import RoIPool
from layers.point_matching_loss import PMLoss
from layers.roi_target_layer import roi_target_layer
from layers.pose_target_layer import pose_target_layer
from fcn.config import cfg


In [None]:
# Names exported by this module (available network factories)
__all__ = [
    'posecnn',
]

# Module-level VGG16 instance created without pretrained weights;
# PoseCNN takes its feature and classifier layers from this object.
vgg16 = models.vgg16(pretrained=False)

# 고차원 Log-Softmax 함수 구현
def log_softmax_high_dimension(input):
    """Compute a numerically stable log-softmax over dimension 1.

    The maximum along dimension 1 is subtracted before exponentiating so
    that large scores do not overflow. Broadcasting is used instead of
    explicit ``repeat`` calls, so the function accepts any tensor with at
    least two dimensions (e.g. ``(N, C)`` or ``(N, C, H, W)``) and does not
    allocate expanded copies of the max / sum tensors.

    Args:
        input (torch.Tensor): scores with the class axis at dimension 1.

    Returns:
        torch.Tensor: log-probabilities with the same shape as ``input``.
    """
    # keepdim=True keeps a size-1 class axis so the subtraction broadcasts.
    shifted = input - torch.max(input, dim=1, keepdim=True)[0]
    log_norm = torch.log(torch.sum(torch.exp(shifted), dim=1, keepdim=True))
    return shifted - log_norm

# 고차원 Softmax 함수 구현
def softmax_high_dimension(input):
    """Compute a numerically stable softmax over dimension 1.

    The maximum along dimension 1 is subtracted before exponentiating so
    that large scores do not overflow. Broadcasting replaces the explicit
    ``repeat`` calls, so any tensor with at least two dimensions is
    accepted (e.g. ``(N, C)`` or ``(N, C, H, W)``).

    Args:
        input (torch.Tensor): scores with the class axis at dimension 1.

    Returns:
        torch.Tensor: probabilities with the same shape as ``input``; they
        sum to 1 along dimension 1.
    """
    # keepdim=True keeps a size-1 class axis so the operations broadcast.
    exp_shifted = torch.exp(input - torch.max(input, dim=1, keepdim=True)[0])
    return torch.div(exp_shifted, torch.sum(exp_shifted, dim=1, keepdim=True))


# Convolutional Layer 함수 구현
def conv(in_planes, out_planes, kernel_size=3, stride=1, relu=True):
    """Build a 2-D convolution, optionally followed by an in-place ReLU.

    The padding is ``(kernel_size - 1) // 2`` and the convolution always
    has a bias term.

    Args:
        in_planes: number of input channels.
        out_planes: number of output channels.
        kernel_size: convolution kernel size.
        stride: convolution stride.
        relu: when true, wrap the convolution and a ReLU in ``nn.Sequential``;
            otherwise return the bare ``nn.Conv2d``.
    """
    layer = nn.Conv2d(
        in_planes,
        out_planes,
        kernel_size=kernel_size,
        stride=stride,
        padding=(kernel_size - 1) // 2,
        bias=True,
    )
    if not relu:
        return layer
    return nn.Sequential(layer, nn.ReLU(inplace=True))

# Fully Connected Layer 함수 구현
def fc(in_planes, out_planes, relu=True):
    """Build a fully connected layer, optionally followed by LeakyReLU(0.1).

    Args:
        in_planes: number of input features.
        out_planes: number of output features.
        relu: when true, wrap the linear layer and an in-place
            ``nn.LeakyReLU(0.1)`` in ``nn.Sequential``; otherwise return the
            bare ``nn.Linear``.
    """
    linear = nn.Linear(in_planes, out_planes)
    if not relu:
        return linear
    return nn.Sequential(linear, nn.LeakyReLU(0.1, inplace=True))

# Upsampling 함수 구현
def upsample(scale_factor):
    """Return a bilinear ``nn.Upsample`` layer with the given scale factor.

    ``align_corners`` is set to ``False`` explicitly. That is the value
    PyTorch falls back to when the argument is omitted for bilinear mode,
    so the output is unchanged, but spelling it out avoids the
    "default upsampling behavior" warning emitted by older PyTorch
    releases and keeps the layer independent of the library default.

    Args:
        scale_factor: multiplier applied to the spatial size.
    """
    return nn.Upsample(scale_factor=scale_factor, mode='bilinear', align_corners=False)


In [None]:
# PoseCNN 클래스 구성
class PoseCNN(nn.Module):
    """PoseCNN network built on the module-level VGG16 feature layers.

    The network always has a semantic labeling branch. When
    ``cfg.TRAIN.VERTEX_REG`` is set it also has a center (vertex) regression
    branch, Hough voting, and a bounding-box classification/regression
    branch; when ``cfg.TRAIN.POSE_REG`` is additionally set it has a
    quaternion (rotation) regression branch trained with the point matching
    loss.
    """

    def __init__(self, num_classes, num_units):
        """Create the layers of the network.

        Args:
            num_classes: number of classes; sizes the score, box and
                quaternion outputs.
            num_units: number of channels of the embedding layers.
        """
        super(PoseCNN, self).__init__()
        self.num_classes = num_classes # number of classes to predict

        # conv features
        features = list(vgg16.features)[:30] # first 30 modules of vgg16.features
        
        # In RGBD mode, replace the first layer by a 6-input-channel convolution
        if cfg.INPUT == 'RGBD': 
            conv0 = conv(6, 64, kernel_size=3, relu=False) # new convolution with 6 input channels
            conv0.weight.data[:, :3, :, :] = features[0].weight.data # input channels 0-2: copy of the original first-layer weights
            conv0.weight.data[:, 3:, :, :] = features[0].weight.data # input channels 3-5: same weights again
            conv0.bias.data = features[0].bias.data # reuse the original bias
            features[0] = conv0 # install the modified layer
            # NOTE(review): the initialisation loop at the end of __init__ runs
            # kaiming_normal_ on every nn.Conv2d in self.modules(), which
            # includes this layer, so the copied values are overwritten.

        self.features = nn.ModuleList(features) # keep the feature layers in a ModuleList
        self.classifier = vgg16.classifier[:-1] # VGG16 classifier without its last layer
        if cfg.TRAIN.SLIM: # slim architecture: shrink the two linear layers of the classifier to 256 units
            dim_fc = 256
            self.classifier[0] = nn.Linear(512*7*7, 256)
            self.classifier[3] = nn.Linear(256, 256)
        else:
            dim_fc = 4096 # classifier output width when the slim variant is not used
            
        print(self.features)
        print(self.classifier)

        # Optionally freeze some of the feature layers
        if cfg.TRAIN.FREEZE_LAYERS:
            for i in [0, 2, 5, 7, 10, 12, 14]:
                self.features[i].weight.requires_grad = False # no weight updates
                self.features[i].bias.requires_grad = False # no bias updates

        # semantic labeling branch
        self.conv4_embed = conv(512, num_units, kernel_size=1) # 1x1 embedding of the conv4 features
        self.conv5_embed = conv(512, num_units, kernel_size=1) # 1x1 embedding of the conv5 features
        self.upsample_conv5_embed = upsample(2.0) # x2 upsampling of the conv5 embedding
        self.upsample_embed = upsample(8.0) # x8 upsampling of the summed embedding
        self.conv_score = conv(num_units, num_classes, kernel_size=1) # per-class score layer
        self.hard_label = HardLabel(threshold=cfg.TRAIN.HARD_LABEL_THRESHOLD, sample_percentage=cfg.TRAIN.HARD_LABEL_SAMPLING) # hard-label layer configured from cfg
        self.dropout = nn.Dropout() # dropout layer (default probability)

        # Layers that exist only when vertex regression is enabled
        if cfg.TRAIN.VERTEX_REG:
            # center regression branch
            self.conv4_vertex_embed = conv(512, 2*num_units, kernel_size=1, relu=False)
            self.conv5_vertex_embed = conv(512, 2*num_units, kernel_size=1, relu=False)
            self.upsample_conv5_vertex_embed = upsample(2.0)
            self.upsample_vertex_embed = upsample(8.0)
            self.conv_vertex_score = conv(2*num_units, 3*num_classes, kernel_size=1, relu=False)
            
            
            # hough voting; these settings are overwritten from cfg in forward()
            self.hough_voting = HoughVoting(is_train=0, skip_pixels=10, label_threshold=100, \
                                            inlier_threshold=0.9, voting_threshold=-1, per_threshold=0.01)

            self.roi_pool_conv4 = RoIPool(pool_height=7, pool_width=7, spatial_scale=1.0 / 8.0) # RoI pooling on the conv4 features
            self.roi_pool_conv5 = RoIPool(pool_height=7, pool_width=7, spatial_scale=1.0 / 16.0) # RoI pooling on the conv5 features
            self.fc8 = fc(dim_fc, num_classes) # bounding-box classification layer
            self.fc9 = fc(dim_fc, 4 * num_classes, relu=False) # bounding-box regression layer

            # Layers that exist only when pose regression is enabled
            if cfg.TRAIN.POSE_REG: 
                self.fc10 = fc(dim_fc, 4 * num_classes, relu=False)
                self.pml = PMLoss(hard_angle=cfg.TRAIN.HARD_ANGLE) # point matching loss

        # Weight initialisation
        for m in self.modules():
            if isinstance(m, nn.Conv2d) or isinstance(m, nn.ConvTranspose2d):
                kaiming_normal_(m.weight.data) # Kaiming-normal initialisation
                if m.bias is not None:
                    m.bias.data.zero_() # zero the bias
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1) # batch-norm weight set to 1
                m.bias.data.zero_() # batch-norm bias set to 0


    def forward(self, x, label_gt, meta_data, extents, gt_boxes, poses, points, symmetry):
        """Run the network.

        The returned tuple depends on ``self.training``,
        ``cfg.TRAIN.VERTEX_REG`` and ``cfg.TRAIN.POSE_REG``:

        * training: ``(out_logsoftmax, out_weight)``, extended with
          ``out_vertex, out_logsoftmax_box, bbox_label_weights, bbox_pred,
          bbox_targets, bbox_inside_weights`` when vertex regression is on,
          and with ``loss_pose, poses_weight`` when pose regression is on.
        * evaluation: ``out_label``, or ``(out_label, out_vertex, rois,
          out_pose)`` with vertex regression, plus ``out_quaternion`` with
          pose regression.

        Several intermediate tensors are moved with ``.cuda()``, so a CUDA
        device is required.
        """

        # conv features
        for i, model in enumerate(self.features):
            x = model(x) # apply the feature layers one after another
            if i == 22:
                out_conv4_3 = x # output after feature module 22
            if i == 29:
                out_conv5_3 = x # output after feature module 29 (the last one)

        # semantic labeling branch
        out_conv4_embed = self.conv4_embed(out_conv4_3) # conv4 embedding
        out_conv5_embed = self.conv5_embed(out_conv5_3) # conv5 embedding
        out_conv5_embed_up = self.upsample_conv5_embed(out_conv5_embed) # upsample the conv5 embedding
        out_embed = self.dropout(out_conv4_embed + out_conv5_embed_up) # sum both embeddings, then dropout
        out_embed_up = self.upsample_embed(out_embed) # final upsampling of the embedding
        out_score = self.conv_score(out_embed_up) # class scores
        out_logsoftmax = log_softmax_high_dimension(out_score) # log-softmax over the class dimension
        out_prob = softmax_high_dimension(out_score) # softmax over the class dimension
        out_label = torch.max(out_prob, dim=1)[1].type(torch.IntTensor).cuda() # predicted label = argmax over classes, cast to int and moved to CUDA
        out_weight = self.hard_label(out_prob, label_gt, torch.rand(out_prob.size()).cuda()) # hard-label weights, fed with a uniform random tensor

        
        # Branches that run only when vertex regression is enabled
        if cfg.TRAIN.VERTEX_REG:
            # center regression branch
            out_conv4_vertex_embed = self.conv4_vertex_embed(out_conv4_3)
            out_conv5_vertex_embed = self.conv5_vertex_embed(out_conv5_3)
            out_conv5_vertex_embed_up = self.upsample_conv5_vertex_embed(out_conv5_vertex_embed)
            out_vertex_embed = self.dropout(out_conv4_vertex_embed + out_conv5_vertex_embed_up)
            out_vertex_embed_up = self.upsample_vertex_embed(out_vertex_embed)
            out_vertex = self.conv_vertex_score(out_vertex_embed_up)

            # hough voting
            if self.training: # training mode: use the cfg.TRAIN settings
                self.hough_voting.is_train = 1
                self.hough_voting.label_threshold = cfg.TRAIN.HOUGH_LABEL_THRESHOLD
                self.hough_voting.voting_threshold = cfg.TRAIN.HOUGH_VOTING_THRESHOLD
                self.hough_voting.skip_pixels = cfg.TRAIN.HOUGH_SKIP_PIXELS
                self.hough_voting.inlier_threshold = cfg.TRAIN.HOUGH_INLIER_THRESHOLD
            else: # evaluation mode: use the cfg.TEST settings
                self.hough_voting.is_train = 0
                self.hough_voting.label_threshold = cfg.TEST.HOUGH_LABEL_THRESHOLD
                self.hough_voting.voting_threshold = cfg.TEST.HOUGH_VOTING_THRESHOLD
                self.hough_voting.skip_pixels = cfg.TEST.HOUGH_SKIP_PIXELS
                self.hough_voting.inlier_threshold = cfg.TEST.HOUGH_INLIER_THRESHOLD
            out_box, out_pose = self.hough_voting(out_label, out_vertex, meta_data, extents)

            # bounding box classification and regression branch
            bbox_labels, bbox_targets, bbox_inside_weights, bbox_outside_weights = roi_target_layer(out_box, gt_boxes) # labels, regression targets and weights for the voted boxes
            out_roi_conv4 = self.roi_pool_conv4(out_conv4_3, out_box) # RoI pooling on conv4
            out_roi_conv5 = self.roi_pool_conv5(out_conv5_3, out_box) # RoI pooling on conv5
            out_roi = out_roi_conv4 + out_roi_conv5 # sum of both pooled features
            
            out_roi_flatten = out_roi.view(out_roi.size(0), -1) # flatten per RoI
            out_fc7 = self.classifier(out_roi_flatten) # truncated VGG16 classifier
            out_fc8 = self.fc8(out_fc7) # box class scores
            out_logsoftmax_box = log_softmax_high_dimension(out_fc8) # log-softmax of the box class scores
            bbox_prob = softmax_high_dimension(out_fc8) # box class probabilities
            bbox_label_weights = self.hard_label(bbox_prob, bbox_labels, torch.rand(bbox_prob.size()).cuda()) # hard-label weights for the boxes
            bbox_pred = self.fc9(out_fc7) # bounding-box regression output

            # rotation regression branch
            rois, poses_target, poses_weight = pose_target_layer(out_box, bbox_prob, bbox_pred, gt_boxes, poses, self.training)
            
            # Quaternion regression, only when pose regression is enabled
            if cfg.TRAIN.POSE_REG:    
                out_qt_conv4 = self.roi_pool_conv4(out_conv4_3, rois)
                out_qt_conv5 = self.roi_pool_conv5(out_conv5_3, rois)
                out_qt = out_qt_conv4 + out_qt_conv5
                out_qt_flatten = out_qt.view(out_qt.size(0), -1)
                out_qt_fc7 = self.classifier(out_qt_flatten)
                out_quaternion = self.fc10(out_qt_fc7)
                # point matching loss
                poses_pred = nn.functional.normalize(torch.mul(out_quaternion, poses_weight)) # mask with the pose weights, then normalise
                if self.training:
                    loss_pose = self.pml(poses_pred, poses_target, poses_weight, points, symmetry)
        
        # Training mode: return the quantities needed to compute the losses
        if self.training:
            if cfg.TRAIN.VERTEX_REG:
                if cfg.TRAIN.POSE_REG:
                    return out_logsoftmax, out_weight, out_vertex, out_logsoftmax_box, bbox_label_weights, \
                           bbox_pred, bbox_targets, bbox_inside_weights, loss_pose, poses_weight
                else:
                    return out_logsoftmax, out_weight, out_vertex, out_logsoftmax_box, bbox_label_weights, \
                           bbox_pred, bbox_targets, bbox_inside_weights
            else:
                return out_logsoftmax, out_weight
            
        # Evaluation mode: return the predictions
        else:
            if cfg.TRAIN.VERTEX_REG:
                if cfg.TRAIN.POSE_REG:
                    return out_label, out_vertex, rois, out_pose, out_quaternion
                else:
                    return out_label, out_vertex, rois, out_pose
            else:
                return out_label

    # Return the weight parameters
    def weight_parameters(self):
        """Return all parameters whose name contains ``'weight'``."""
        return [param for name, param in self.named_parameters() if 'weight' in name]

    # Return the bias parameters
    def bias_parameters(self):
        """Return all parameters whose name contains ``'bias'``."""
        return [param for name, param in self.named_parameters() if 'bias' in name]




In [None]:
def posecnn(num_classes, num_units, data=None):
    """Create a PoseCNN model and optionally load pretrained tensors into it.

    Args:
        num_classes: forwarded to ``PoseCNN``.
        num_units: forwarded to ``PoseCNN``.
        data: optional mapping of parameter names to tensors. Only entries
            whose name exists in the model's state dict and whose size
            matches are loaded; all other entries are ignored.

    Returns:
        The constructed model.
    """
    separator = '================================================='

    def _print_key_section(title, mapping):
        # Print a titled, separator-framed list of the mapping's keys.
        print(title)
        print(separator)
        for name, _ in mapping.items():
            print(name)
        print(separator)

    net = PoseCNN(num_classes, num_units)

    # Nothing to load: return the freshly initialised model.
    if data is None:
        return net

    state = net.state_dict()
    _print_key_section('model keys', state)
    _print_key_section('data keys', data)

    # Keep only the entries the model can accept (same name and same size).
    matching = {}
    for name, tensor in data.items():
        if name in state and tensor.size() == state[name].size():
            matching[name] = tensor
    _print_key_section('load the following keys from the pretrained model', matching)

    # Overlay the matching tensors on the model state and load the result.
    state.update(matching)
    net.load_state_dict(state)

    return net


# PoseCNN Import Module

from layers.hard_label import HardLabel

from layers.hough_voting import HoughVoting

from layers.roi_pooling import RoIPool

from layers.point_matching_loss import PMLoss

from layers.roi_target_layer import roi_target_layer

from layers.pose_target_layer import pose_target_layer


In [None]:
'''from layers.hard_label import HardLabel'''

# Copyright (c) 2020 NVIDIA Corporation. All rights reserved.
# This work is licensed under the NVIDIA Source Code License - Non-commercial. Full
# text can be found in LICENSE.md

import math
from torch import nn
from torch.autograd import Function
import torch
import posecnn_cuda


class HardLabelFunction(Function):
    """Autograd wrapper around the ``posecnn_cuda`` hard-label kernels."""

    @staticmethod
    def forward(ctx, prob, label, rand, threshold, sample_percentage):
        """Call ``hard_label_forward`` and return its first output."""
        return posecnn_cuda.hard_label_forward(threshold, sample_percentage, prob, label, rand)[0]

    @staticmethod
    def backward(ctx, top_diff):
        """Return the two gradients from ``hard_label_backward``.

        They correspond to ``prob`` and ``label``; ``rand``, ``threshold``
        and ``sample_percentage`` receive no gradient.
        """
        d_prob, d_label = posecnn_cuda.hard_label_backward(top_diff)
        return d_prob, d_label, None, None, None


class HardLabel(nn.Module):
    """Module wrapper that stores the settings of ``HardLabelFunction``."""

    def __init__(self, threshold, sample_percentage):
        """Remember the threshold and sample percentage passed to the kernel."""
        super(HardLabel, self).__init__()
        self.sample_percentage = sample_percentage
        self.threshold = threshold

    def forward(self, prob, label, rand):
        """Apply ``HardLabelFunction`` with the stored settings."""
        settings = (self.threshold, self.sample_percentage)
        return HardLabelFunction.apply(prob, label, rand, *settings)


In [None]:
'''from layers.hough_voting import HoughVoting'''

# Copyright (c) 2020 NVIDIA Corporation. All rights reserved.
# This work is licensed under the NVIDIA Source Code License - Non-commercial. Full
# text can be found in LICENSE.md

import math
from torch import nn
from torch.autograd import Function
import torch
import posecnn_cuda


class HoughVotingFunction(Function):
    """Autograd wrapper around ``posecnn_cuda.hough_voting_forward``."""

    @staticmethod
    def forward(ctx, label, vertex, meta_data, extents, is_train, skip_pixels, \
            label_threshold, inlier_threshold, voting_threshold, per_threshold):
        """Run the voting kernel and return its first two outputs (box, pose)."""
        results = posecnn_cuda.hough_voting_forward(
            label, vertex, meta_data, extents, is_train, skip_pixels,
            label_threshold, inlier_threshold, voting_threshold, per_threshold)
        return results[0], results[1]

    @staticmethod
    def backward(ctx, top_diff_box, top_diff_pose):
        """Return no gradient: one ``None`` for each of the ten forward inputs."""
        return (None,) * 10


class HoughVoting(nn.Module):
    """Module wrapper that stores the settings of ``HoughVotingFunction``.

    The settings are plain attributes, so callers may change them between
    forward passes.
    """

    def __init__(self, is_train=0, skip_pixels=10, label_threshold=100, inlier_threshold=0.9, voting_threshold=-1, per_threshold=0.01):
        """Store the voting settings as attributes."""
        super(HoughVoting, self).__init__()
        self.is_train = is_train
        self.skip_pixels = skip_pixels
        self.label_threshold = label_threshold
        self.inlier_threshold = inlier_threshold
        self.voting_threshold = voting_threshold
        self.per_threshold = per_threshold

    def forward(self, label, vertex, meta_data, extents):
        """Apply ``HoughVotingFunction`` with the current settings."""
        settings = (
            self.is_train,
            self.skip_pixels,
            self.label_threshold,
            self.inlier_threshold,
            self.voting_threshold,
            self.per_threshold,
        )
        return HoughVotingFunction.apply(label, vertex, meta_data, extents, *settings)


In [None]:
'''from layers.roi_pooling import RoIPool'''

# Copyright (c) 2020 NVIDIA Corporation. All rights reserved.
# This work is licensed under the NVIDIA Source Code License - Non-commercial. Full
# text can be found in LICENSE.md

import math
from torch import nn
from torch.autograd import Function
import torch
import posecnn_cuda


class RoIPoolFunction(Function):
    """Autograd wrapper around the ``posecnn_cuda`` RoI pooling kernels."""

    @staticmethod
    def forward(ctx, features, rois, pool_height, pool_width, spatial_scale):
        """Pool ``features`` over ``rois`` and return the pooled tensor.

        The remaining kernel outputs and ``rois`` are saved for the backward
        pass, together with the feature size and the spatial scale.
        """
        outputs = posecnn_cuda.roi_pool_forward(pool_height, pool_width, spatial_scale, features, rois)
        top_data = outputs[0]
        # list(...) so this works whether the extension returns a list or a tuple
        saved = list(outputs[1:])
        saved.append(rois)
        ctx.feature_size = features.size()
        ctx.spatial_scale = spatial_scale
        ctx.save_for_backward(*saved)
        return top_data

    @staticmethod
    def backward(ctx, top_diff):
        """Return the gradient w.r.t. ``features``; other inputs get ``None``."""
        # ctx.saved_tensors replaces the deprecated ctx.saved_variables alias.
        saved = ctx.saved_tensors
        # presumably the kernel returns exactly one extra tensor (the argmax
        # indices), so rois sits at index 1 -- verify against posecnn_cuda
        argmax_data = saved[0]
        rois = saved[1]
        batch_size, num_channels, data_height, data_width = ctx.feature_size
        spatial_scale = ctx.spatial_scale
        outputs = posecnn_cuda.roi_pool_backward(batch_size, data_height, data_width, spatial_scale, top_diff, rois, argmax_data)
        d_features = outputs[0]
        return d_features, None, None, None, None

class RoIPool(nn.Module):
    """Module wrapper that stores the settings of ``RoIPoolFunction``."""

    def __init__(self, pool_height, pool_width, spatial_scale):
        """Store the pooled output size (as ints) and the spatial scale (as float)."""
        super(RoIPool, self).__init__()

        # Width is converted first, matching the original conversion order.
        self.pool_width, self.pool_height = int(pool_width), int(pool_height)
        self.spatial_scale = float(spatial_scale)

    def forward(self, features, rois):
        """Apply ``RoIPoolFunction`` with the stored settings."""
        settings = (self.pool_height, self.pool_width, self.spatial_scale)
        return RoIPoolFunction.apply(features, rois, *settings)


In [None]:
'''from layers.point_matching_loss import PMLoss'''

# Copyright (c) 2020 NVIDIA Corporation. All rights reserved.
# This work is licensed under the NVIDIA Source Code License - Non-commercial. Full
# text can be found in LICENSE.md

import math
from torch import nn
from torch.autograd import Function
import torch
import posecnn_cuda

class PMLossFunction(Function):
    """Autograd wrapper around the ``posecnn_cuda`` point matching loss kernels."""

    @staticmethod
    def forward(ctx, prediction, target, weight, points, symmetry, hard_angle):
        """Return the loss (first kernel output); save the rest for backward."""
        outputs = posecnn_cuda.pml_forward(prediction, target, weight, points, symmetry, hard_angle)
        loss = outputs[0]
        variables = outputs[1:]
        ctx.save_for_backward(*variables)

        return loss

    @staticmethod
    def backward(ctx, grad_loss):
        """Return the gradient w.r.t. ``prediction``; other inputs get ``None``."""
        # ctx.saved_tensors replaces the deprecated ctx.saved_variables alias.
        outputs = posecnn_cuda.pml_backward(grad_loss, *ctx.saved_tensors)
        d_rotation = outputs[0]

        return d_rotation, None, None, None, None, None


class PMLoss(nn.Module):
    """Module wrapper that stores the ``hard_angle`` setting of ``PMLossFunction``."""

    def __init__(self, hard_angle=15):
        """Remember the ``hard_angle`` value handed to the loss kernel."""
        super(PMLoss, self).__init__()
        self.hard_angle = hard_angle

    def forward(self, prediction, target, weight, points, symmetry):
        """Apply ``PMLossFunction`` with the stored ``hard_angle``."""
        inputs = (prediction, target, weight, points, symmetry)
        return PMLossFunction.apply(*inputs, self.hard_angle)


In [None]:
'''from layers.roi_target_layer import roi_target_layer'''

# Copyright (c) 2020 NVIDIA Corporation. All rights reserved.
# This work is licensed under the NVIDIA Source Code License - Non-commercial. Full
# text can be found in LICENSE.md

from __future__ import absolute_import
from __future__ import division

import torch
import numpy as np
import numpy.random as npr
from fcn.config import cfg
from utils.bbox_transform import bbox_transform
from utils.cython_bbox import bbox_overlaps

# rpn_rois: (batch_ids, cls, x1, y1, x2, y2, scores)
# gt_boxes: batch * num_classes * (x1, y1, x2, y2, cls)
def roi_target_layer(rpn_rois, gt_boxes):
    """
    Assign object detection proposals to ground-truth targets. Produces proposal
    classification labels and bounding-box regression targets.

    Args:
        rpn_rois: tensor of proposals laid out as
            (batch_ids, cls, x1, y1, x2, y2, scores).
        gt_boxes: tensor laid out as
            batch * num_classes * (x1, y1, x2, y2, cls); entries whose last
            value is not positive are ignored.

    Returns:
        Four CUDA tensors: one-hot labels (num_rois x num_classes),
        bbox regression targets, bbox inside weights and bbox outside
        weights (1.0 where the inside weight is positive).
    """

    rpn_rois = rpn_rois.detach().cpu().numpy()
    gt_boxes = gt_boxes.detach().cpu().numpy()
    num_classes = gt_boxes.shape[1]

    # convert boxes to (batch_ids, x1, y1, x2, y2, cls)
    roi_blob = rpn_rois[:, (0, 2, 3, 4, 5, 1)]

    # Gather all valid ground-truth boxes at once. np.nonzero walks the mask
    # in row-major order, i.e. the same (batch, slot) order as a nested loop,
    # without the quadratic cost of growing the blob by concatenation.
    batch_ids, slots = np.nonzero(gt_boxes[:, :, -1] > 0)
    gt_box_blob = np.zeros((batch_ids.shape[0], 6), dtype=np.float32)
    gt_box_blob[:, 0] = batch_ids
    gt_box_blob[:, 1:5] = gt_boxes[batch_ids, slots, :4]
    gt_box_blob[:, 5] = gt_boxes[batch_ids, slots, 4]

    # sample rois with classification labels and bounding box regression targets
    labels, bbox_targets, bbox_inside_weights = _sample_rois(roi_blob, gt_box_blob, num_classes)
    bbox_outside_weights = np.array(bbox_inside_weights > 0).astype(np.float32)

    # convert labels to one-hot rows
    num = labels.shape[0]
    label_blob = np.zeros((num, num_classes), dtype=np.float32)
    if np.any(roi_blob[:, -1] > 0):
        # _sample_rois returns labels as (num,) or, when there is no
        # ground-truth box, as (num, 1). Flatten first so no 1-element array
        # has to be converted to a Python scalar (deprecated in NumPy 1.25).
        class_ids = np.asarray(labels).reshape(-1).astype(np.int64)
        label_blob[np.arange(num), class_ids] = 1.0

    return torch.from_numpy(label_blob).cuda(), torch.from_numpy(bbox_targets).cuda(), \
        torch.from_numpy(bbox_inside_weights).cuda(), torch.from_numpy(bbox_outside_weights).cuda()


def _get_bbox_regression_labels(bbox_target_data, num_classes):
  """Bounding-box regression targets (bbox_target_data) are stored in a
  compact form N x (class, tx, ty, tw, th)

  This function expands those targets into the 4-of-4*K representation used
  by the network (i.e. only one class has non-zero targets).

  Returns:
      bbox_target (ndarray): N x 4K blob of regression targets
      bbox_inside_weights (ndarray): N x 4K blob of loss weights
  """

  clss = bbox_target_data[:, 0]
  bbox_targets = np.zeros((clss.size, 4 * num_classes), dtype=np.float32)
  bbox_inside_weights = np.zeros(bbox_targets.shape, dtype=np.float32)
  inds = np.where(clss > 0)[0]
  for ind in inds:
    cls = clss[ind]
    start = int(4 * cls)
    end = start + 4
    bbox_targets[ind, start:end] = bbox_target_data[ind, 1:]
    bbox_inside_weights[ind, start:end] = cfg.TRAIN.BBOX_INSIDE_WEIGHTS
  return bbox_targets, bbox_inside_weights


def _compute_targets(ex_rois, gt_rois, labels):
  """Compute bounding-box regression targets for an image."""

  assert ex_rois.shape[0] == gt_rois.shape[0]
  assert ex_rois.shape[1] == 4
  assert gt_rois.shape[1] == 4

  targets = bbox_transform(ex_rois, gt_rois)
  if cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED:
    # Optionally normalize targets by a precomputed mean and stdev
    targets = ((targets - np.array(cfg.TRAIN.BBOX_NORMALIZE_MEANS))
               / np.array(cfg.TRAIN.BBOX_NORMALIZE_STDS))
  return np.hstack(
    (labels[:, np.newaxis], targets)).astype(np.float32, copy=False)


def _sample_rois(all_rois, gt_boxes, num_classes):
  """Generate a random sample of RoIs comprising foreground and background
  examples.
  """
  # all_rois (batch_ids, x1, y1, x2, y2, cls)
  # gt_boxes (batch_ids, x1, y1, x2, y2, cls)
  # overlaps: (rois x gt_boxes)

  if gt_boxes.shape[0] == 0:
      num = all_rois.shape[0]
      labels = np.zeros((num, 1), dtype=np.float32)
      bbox_targets = np.zeros((num, 4 * num_classes), dtype=np.float32)
      bbox_inside_weights = np.zeros(bbox_targets.shape, dtype=np.float32)
  else:
      overlaps = bbox_overlaps(
        np.ascontiguousarray(all_rois[:, :5], dtype=np.float),
        np.ascontiguousarray(gt_boxes[:, :5], dtype=np.float))

      gt_assignment = overlaps.argmax(axis=1)
      max_overlaps = overlaps.max(axis=1)
      labels = gt_boxes[gt_assignment, 5]

      # Select foreground RoIs as those with >= FG_THRESH overlap
      # fg_inds = np.where(max_overlaps >= cfg.TRAIN.FG_THRESH)[0]
      bg_inds = np.where(max_overlaps < cfg.TRAIN.FG_THRESH)[0]
      labels[bg_inds] = 0

      # print '{:d} rois, {:d} fg, {:d} bg'.format(all_rois.shape[0], all_rois.shape[0]-len(bg_inds), len(bg_inds))
      # print all_rois

      bbox_target_data = _compute_targets(
        all_rois[:, 1:5], gt_boxes[gt_assignment, 1:5], labels)

      bbox_targets, bbox_inside_weights = \
        _get_bbox_regression_labels(bbox_target_data, num_classes)

  return labels, bbox_targets, bbox_inside_weights


In [None]:
'''from layers.pose_target_layer import pose_target_layer'''

# Copyright (c) 2020 NVIDIA Corporation. All rights reserved.
# This work is licensed under the NVIDIA Source Code License - Non-commercial. Full
# text can be found in LICENSE.md

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import torch
import numpy as np
from fcn.config import cfg
from utils.bbox_transform import bbox_transform_inv
from utils.cython_bbox import bbox_overlaps

# rpn_rois: (batch_ids, cls, x1, y1, x2, y2, scores)
# gt_boxes: batch * num_classes * (x1, y1, x2, y2, cls)
def pose_target_layer(rois, bbox_prob, bbox_pred, gt_boxes, poses, is_training):
    """Refine the RoIs with the predicted boxes and build quaternion targets.

    Args:
        rois: tensor laid out as (batch_ids, cls, x1, y1, x2, y2, scores).
        bbox_prob: tensor of per-class box probabilities (num_rois x K).
        bbox_pred: tensor of per-class box regression outputs (num_rois x 4K).
        gt_boxes: tensor laid out as
            batch * num_classes * (x1, y1, x2, y2, cls).
        poses: tensor with 9 values per (batch, class) entry. NOTE: it is
            modified in place -- element 0 of every valid entry is
            overwritten with the batch index.
        is_training: when true, only RoIs with a positive label are kept
            (if there is at least one).

    Returns:
        Three CUDA tensors: the refined RoIs, the pose targets and the pose
        weights (both num_rois x 4K).
    """

    rois = rois.detach().cpu().numpy()
    bbox_prob = bbox_prob.detach().cpu().numpy()
    bbox_pred = bbox_pred.detach().cpu().numpy()
    gt_boxes = gt_boxes.detach().cpu().numpy()
    num_classes = bbox_prob.shape[1]

    # process boxes: undo the target normalization on the predictions
    if cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED:
        stds = np.tile(np.array(cfg.TRAIN.BBOX_NORMALIZE_STDS), (num_classes))
        means = np.tile(np.array(cfg.TRAIN.BBOX_NORMALIZE_MEANS), (num_classes))
        bbox_pred *= stds
        bbox_pred += means

    boxes = rois[:, 2:6].copy()
    pred_boxes = bbox_transform_inv(boxes, bbox_pred)

    # assign boxes: each RoI takes the predicted box and probability of its own class
    for i in range(rois.shape[0]):
        cls = int(rois[i, 1])
        rois[i, 2:6] = pred_boxes[i, cls*4:cls*4+4]
        rois[i, 6] = bbox_prob[i, cls]

    # convert boxes to (batch_ids, x1, y1, x2, y2, cls)
    roi_blob = rois[:, (0, 2, 3, 4, 5, 1)]
    gt_box_blob = np.zeros((0, 6), dtype=np.float32)
    pose_blob = np.zeros((0, 9), dtype=np.float32)
    for i in range(gt_boxes.shape[0]):
        for j in range(gt_boxes.shape[1]):
            if gt_boxes[i, j, -1] > 0:
                gt_box = np.zeros((1, 6), dtype=np.float32)
                gt_box[0, 0] = i
                gt_box[0, 1:5] = gt_boxes[i, j, :4]
                gt_box[0, 5] = gt_boxes[i, j, 4]
                gt_box_blob = np.concatenate((gt_box_blob, gt_box), axis=0)
                # in-place write into the caller's tensor (see docstring)
                poses[i, j, 0] = i
                pose_blob = np.concatenate((pose_blob, poses[i, j, :].cpu().reshape(1, 9)), axis=0)

    if gt_box_blob.shape[0] == 0:
        # no ground truth: all-zero targets and weights
        num = rois.shape[0]
        poses_target = np.zeros((num, 4 * num_classes), dtype=np.float32)
        poses_weight = np.zeros((num, 4 * num_classes), dtype=np.float32)
    else:
        # overlaps: (rois x gt_boxes)
        # np.float64 replaces the np.float alias, which was removed in NumPy 1.24.
        overlaps = bbox_overlaps(
            np.ascontiguousarray(roi_blob[:, :5], dtype=np.float64),
            np.ascontiguousarray(gt_box_blob[:, :5], dtype=np.float64))

        gt_assignment = overlaps.argmax(axis=1)
        max_overlaps = overlaps.max(axis=1)
        labels = gt_box_blob[gt_assignment, 5]
        # columns 2..5 of the pose entry are used as the quaternion
        quaternions = pose_blob[gt_assignment, 2:6]

        # RoIs with < FG_THRESH_POSE overlap become background
        bg_inds = np.where(max_overlaps < cfg.TRAIN.FG_THRESH_POSE)[0]
        labels[bg_inds] = 0

        # RoIs whose own class differs from the matched ground-truth class
        # are also treated as background
        bg_inds = np.where(roi_blob[:, -1] != labels)[0]
        labels[bg_inds] = 0

        # in training, only use the positive boxes for pose regression
        if is_training:
            fg_inds = np.where(labels > 0)[0]
            if len(fg_inds) > 0:
                rois = rois[fg_inds, :]
                quaternions = quaternions[fg_inds, :]
                labels = labels[fg_inds]

        # pose regression targets and weights
        poses_target, poses_weight = _compute_pose_targets(quaternions, labels, num_classes)

    return torch.from_numpy(rois).cuda(), torch.from_numpy(poses_target).cuda(), torch.from_numpy(poses_weight).cuda()


def _compute_pose_targets(quaternions, labels, num_classes):
    """Compute pose regression targets for an image."""

    num = quaternions.shape[0]
    poses_target = np.zeros((num, 4 * num_classes), dtype=np.float32)
    poses_weight = np.zeros((num, 4 * num_classes), dtype=np.float32)

    for i in range(num):
        cls = labels[i]
        if cls > 0 and np.linalg.norm(quaternions[i, :]) > 0:
            start = int(4 * cls)
            end = start + 4
            poses_target[i, start:end] = quaternions[i, :]
            poses_weight[i, start:end] = 1.0

    return poses_target, poses_weight
