In [1]:
import fire
from onnxruntime.tools import pytorch_export_contrib_ops
import torch
import onnx
import onnxruntime as ort
import deform_conv2d_onnx_exporter
import numpy as np
deform_conv2d_onnx_exporter.register_deform_conv2d_onnx_op()
pytorch_export_contrib_ops.register()
import sys
import os

package_path = os.path.dirname(sys.path[0])  #two folders upwards
sys.path.insert(0, package_path)

from vision_base.utils.builder import build
from vision_base.utils.utils import cfg_from_file
from vision_base.networks.utils.utils import load_models

In [2]:
cfg = cfg_from_file('../configs/res34_monodepth_odaiba.py')
# checkpoint_path = "../odaiba_single/checkpoint/monodepth.networks.models.meta_archs.monodepth2_model.MonoDepthWPose_latest.pth"
checkpoint_path = "../workdirs/odaiba_single/checkpoint/monodepth.networks.models.meta_archs.monodepth2_model.MonoDepthWPose_latest.pth"
onnx_output = "merge_multicam_monodepth_smaller.onnx"
is_export_rgb = True
gpu = 0

In [3]:
 # Force GPU selection in command line
cfg.trainer.gpu = gpu
torch.cuda.set_device(cfg.trainer.gpu)

# Create the model
meta_arch = build(**cfg.meta_arch)
meta_arch = meta_arch.cuda()

load_models(checkpoint_path, meta_arch, map_location=f'cuda:{gpu}', strict=False)
meta_arch.eval()
print(f"Loaded model from {checkpoint_path}.")

Loaded model from ../workdirs/odaiba_single/checkpoint/monodepth.networks.models.meta_archs.monodepth2_model.MonoDepthWPose_latest.pth.


In [4]:
class MultiDepthExportModel(torch.nn.Module):
    def __init__(self, meta_arch, cfg, is_export_rgb=True):
        super().__init__()
        self.meta_arch = meta_arch
        self.is_export_rgb = is_export_rgb
        self.register_buffer("rgb_mean", torch.tensor([0.485, 0.456, 0.406]).view(1, 3, 1, 1).cuda())
        self.register_buffer("rgb_std", torch.tensor([0.229, 0.224, 0.225]).view(1, 3, 1, 1).cuda())

        w_range = np.arange(0, cfg.data.rgb_shape[1], dtype=np.float32)
        h_range = np.arange(0, cfg.data.rgb_shape[0], dtype=np.float32)
        w_grid, h_grid = np.meshgrid(w_range, h_range) #[H, W]
        base_depth_image = np.stack([w_grid, h_grid,
                                     np.ones_like(w_grid),
                                     np.ones_like(w_grid)], axis=2)[...,np.newaxis][None] # [1, H, W, 4, 1]
        self.register_buffer("base_depth_image", torch.tensor(base_depth_image).cuda())

        bottom_mask = np.zeros((cfg.data.rgb_shape[0], cfg.data.rgb_shape[1]), dtype=np.bool_)
        bottom_mask[-(cfg.data.rgb_shape[0]//4):, :] = True
        self.register_buffer("bottom_mask", torch.tensor(bottom_mask).cuda())


    def normalize_image(self, image):
        image = image / 256.0
        original_image = image.clone()
        image = image - self.rgb_mean
        image = image / self.rgb_std
        return image, original_image

    def depth_image_to_point_cloud_array(self, depth_image, K, T, rgb_image=None, mask=None):
        """  convert depth image into color pointclouds [xyzbgr]
        depth_image: [B, 1, H, W] -> fully normalized
        K: [B, 3, 4]
        T: [B, 4, 4] -> camera to base_link / world
        rgb_image: [B, 3, H, W] -> 0-1
        mask: [B, H, W]
        """
        P_expanded = torch.eye(4).cuda()
        P_expanded = P_expanded[None].repeat([depth_image.shape[0], 1, 1]) #[B, 4, 4]
        P_expanded[:, 0:3, :] = K
        P_inv = torch.inverse(P_expanded) # [B, 4 ,4]

        #[H, W, 4, 1]
        base_depth_image = self.base_depth_image.repeat([depth_image.shape[0], 1, 1, 1, 1]).clone()
        base_depth_image[:, :, :, 0:3, 0] = base_depth_image[:, :, :, 0:3, 0] * depth_image[:, 0, :, :, None]
        # B, H, W, 3 * B, H, W, 1

        # B, 1, 1, 4, 4 * B, H, W, 4, 1 -> B, H, W, 4, 1
        pc_3d = torch.matmul(P_inv[:, None, None, ...], base_depth_image) #[B, H, W, 4, 1]

        pc_3d = torch.matmul(T[:, None, None, ...], pc_3d)[..., 0:3, 0] # [B, H, W, 4, 1] -> [B, H, W, 3]

        if self.is_export_rgb:
            rgb_image = rgb_image.permute(0, 2, 3, 1).contiguous() #[B, 3, H, W] -> [B, H, W, 3]
            pc_3d = torch.cat([pc_3d, rgb_image], dim=3) #[B, H, W, 6]

        mask = torch.logical_and(mask > 0, depth_image[:, 0] < 60)
        # mask = mask * (1 - bottom_mask.float())  # [B, H, W] * depth_image[:, 0] > 0
        point_cloud = pc_3d[mask,:] # [N, 6]
        
        return point_cloud

    def forward(self, image, P, T, masks):
        image, original_image = self.normalize_image(image)
        depths = self.meta_arch.dummy_forward(image, P)['depth'] # [B, 1, H, W]
        point_cloud = self.depth_image_to_point_cloud_array(depths, P, T, original_image, masks)
        return point_cloud

In [5]:
multi_depth_model = MultiDepthExportModel(meta_arch, cfg, is_export_rgb=is_export_rgb)
multi_depth_model.eval()
multi_depth_model.cuda()

B = 2
dummy_image = torch.zeros([B, cfg.data.rgb_shape[2], cfg.data.rgb_shape[0], cfg.data.rgb_shape[1]]).cuda()
dummy_T = torch.eye(4).expand(B, -1, -1).cuda()
dummy_P = dummy_T[:, 0:3, :].clone()
dummy_masks = torch.ones([B, cfg.data.rgb_shape[0], cfg.data.rgb_shape[1]]).cuda()


In [15]:
with torch.no_grad():
    output = multi_depth_model(dummy_image, dummy_P, dummy_T, dummy_masks)
    print(output.shape)

torch.Size([720896, 6])


In [7]:
dummy_input = (dummy_image, dummy_P, dummy_T, dummy_masks)
torch.onnx.export(multi_depth_model, dummy_input, onnx_output,
                   input_names=['image', 'P2', 'T', 'masks'],
                   output_names=['point_cloud'], opset_version=11,
                    dynamic_axes={'image': {0: 'batch_size'}, 'P2': {0: 'batch_size'}, 'T': {0: 'batch_size'}, 'masks': {0: 'batch_size'}, 'point_cloud': {0: 'numbers'}})



In [9]:
ort_session = ort.InferenceSession(onnx_output, providers=[('CUDAExecutionProvider', {'device_id':gpu})])
outputs = ort_session.run(None, {'image': dummy_image.cpu().numpy(), 'P2': dummy_P.cpu().numpy(), 'T': dummy_T.cpu().numpy(), 'masks': dummy_masks.cpu().numpy()})

print(f"The actual output of onnxruntime session: outputs[0].shape={outputs[0].shape}")

2023-12-02 03:10:07.984826601 [W:onnxruntime:, session_state.cc:1169 VerifyEachNodeIsAssignedToAnEp] Some nodes were not assigned to the preferred execution providers which may or may not have an negative impact on performance. e.g. ORT explicitly assigns shape related ops to CPU to improve perf.
2023-12-02 03:10:07.984863999 [W:onnxruntime:, session_state.cc:1171 VerifyEachNodeIsAssignedToAnEp] Rerunning with verbose output on a non-minimal build will show node assignments.


The actual output of onnxruntime session: outputs[0].shape=(573440, 6)
