# Monocular Depth Estimation on Artificially Scaled Up Camera Images for Robot Applications

This notebook will estimate the performance of monocular depth sensing on zoomed in targets in a robotics application.

## Setup

### Imports

In [1]:
#std libs
from collections import OrderedDict
import math
import os
import typing
import time

#non std libs
import torch
from torchvision.transforms import ToTensor
import cv2 as cv
import numpy as np
from pyzed import sl

In [2]:
from monocular_depth_estimation.models.model import GLPDepth

from super_resolution.models.RRDBNet_arch import RRDBNet

  from .autonotebook import tqdm as notebook_tqdm


### Parameters

Name            | Type   | Value
----------------|--------|----------------------------------------------------------------------------------------------------------------------------------------------
`SVO_FILEPATH`  | `str`  | Path to the svo recording used for evaluation.
`USE_NYU`       | `bool` | `True`: Use the weights pretrained on "nyudepthv2"<br>`False`: Use the weights pretrained on the kitti eigen split
`CROP_TO_SIZE`  | `bool` | `True`: Crop the image to size (1216, 352). This loses parts of the image completely.<br>`False`: Resize the image. This distorts the image.
`OUTPUT_SUFFIX` | `str`  | Suffix of the output csv file for evaluation results. The final file name will consist of the svo filename, NYU/KITTI, CROP/RESIZE and this.

In [57]:
# Recording that is played back for the evaluation.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
SVO_FILEPATH = "/mnt/Daten2/Uni/HiWi_Husky/recordings/test_recording_rear.svo"

# True -> nyudepthv2 weights, False -> KITTI weights.
USE_NYU = False

# True -> crop frames to (1216, 352), False -> rescale them to that size.
CROP_TO_SIZE = True

# Last component of the generated csv file name.
OUTPUT_SUFFIX = "general"

### Constants

In [58]:
# Upper depth limit used for this evaluation, in meters.
MAX_DEPTH: float = 20.0

### Definition of utility functions

In [59]:
def PSNR(ground_truth: np.ndarray, estimation: np.ndarray, max_value: float = 255.0) -> typing.Tuple[float, float]:
    """Compute the peak signal-to-noise ratio between two equally shaped arrays.

    Elements whose squared difference is NaN (i.e. NaN in either input) are
    ignored when averaging.

    Args:
        ground_truth: Reference values.
        estimation: Values to compare against the reference.
        max_value: Peak signal value used in the PSNR formula. Defaults to
            255.0 (8 bit images), which is the value this function has always
            used; pass the data's actual maximum for other value ranges.

    Returns:
        Tuple ``(psnr, mse)`` with the PSNR in dB and the mean squared error.
        If the MSE is exactly 0 (identical inputs) the PSNR is undefined and
        -1.0 is returned as a sentinel. If no valid element exists both
        values are NaN.
    """
    mse = np.nanmean((ground_truth - estimation) ** 2)
    if mse == 0:
        # no noise at all -> log10(0) is undefined, report the sentinel instead
        return -1.0, mse

    # 10 * log10(max_value^2 / mse), split into two logarithms
    psnr = 20 * math.log10(max_value) - 10 * math.log10(mse)
    return psnr, mse

In [60]:
def crop_image(img: np.ndarray, target_height: int = 352, target_width: int = 1216) -> np.ndarray:
    """Cut a window of the target size out of an image.

    The window is aligned with the bottom edge of the image and centered
    horizontally, so rows at the top and columns at both sides are dropped.

    Args:
        img: Image whose first two axes are (height, width).
        target_height: Height of the returned window, defaults to 352.
        target_width: Width of the returned window, defaults to 1216.

    Returns:
        A slice of ``img`` of shape ``(target_height, target_width, ...)``.

    Raises:
        ValueError: If the image is smaller than the requested window. Without
            this check negative margins would silently yield a wrong-sized crop.
    """
    h_im, w_im = img.shape[:2]

    if h_im < target_height or w_im < target_width:
        raise ValueError(
            f"Cannot crop image of size ({w_im}, {h_im}) to ({target_width}, {target_height})"
        )

    margin_top = h_im - target_height
    margin_left = (w_im - target_width) // 2

    return img[margin_top:  margin_top  + target_height,
               margin_left: margin_left + target_width]

# Choose the frame sizing strategy once: either crop to, or rescale to, 1216x352.
resize_image = crop_image if CROP_TO_SIZE else (lambda img: cv.resize(img, (1216, 352)))

### Initialization of camera object for svo playback

In [61]:
# Playback configuration for the recorded svo file.
camera_init_parameters = sl.InitParameters()

# report measurements in meters
camera_init_parameters.coordinate_units = sl.UNIT.METER
# give the SDK up to 30 s to open the recording
camera_init_parameters.open_timeout_sec = 30
# real-time playback mode is switched off
camera_init_parameters.svo_real_time_mode = False

# use the recording as input source
camera_init_parameters.set_from_svo_file(SVO_FILEPATH);

In [62]:
# Open the svo recording with the playback parameters.
camera = sl.Camera()
error_code = camera.open(camera_init_parameters)
if error_code != sl.ERROR_CODE.SUCCESS:
    # Abort instead of only printing: every following cell needs an opened camera.
    raise SystemExit(f"Failed to open Camera object: {error_code}")

In [63]:
# number of frames stored in the recording
nr_frames = camera.get_svo_number_of_frames()

# resolution object (width / height) taken from the camera configuration
resolution = camera.get_camera_information().camera_configuration.camera_resolution

In [64]:
# CPU-side buffers that receive the color frame and the depth map of each grabbed frame.
color_image = sl.Mat(
    resolution.width, resolution.height,
    sl.MAT_TYPE.U8_C3, sl.MEM.CPU,
)
depth_image = sl.Mat(
    resolution.width, resolution.height,
    sl.MAT_TYPE.F32_C1, sl.MEM.CPU,
)

### Setup of pytorch

In [65]:
# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type == "cuda":
    device_prop = torch.cuda.get_device_properties(device)
    print(f"Using GPU: {device_prop.name} {round(device_prop.total_memory / 1024**3, 2)}GiB (CC: {device_prop.major}.{device_prop.minor})")
else:
    print("Using CPU.")

# newly created tensors are placed on the selected device by default
torch.set_default_device(device)

Using GPU: NVIDIA GeForce RTX 2070 7.79GiB (CC: 7.5)


### Setup of monocular depth estimation model

In [66]:
class Storage:
    """Minimal attribute container standing in for an argparse-style namespace.

    Every keyword argument passed to the constructor becomes an instance
    attribute of the same name.
    """

    def __init__(self, **kwargs):
        for name, value in kwargs.items():
            setattr(self, name, value)

#### KITTI parameters

```bash
python3 test.py \
--dataset kitti \
--kitti_crop garg_crop \
--data_path ../data/ \
--max_depth 80.0 \
--max_depth_eval 80.0 \
--backbone swin_large_v2 \
--depths 2 2 18 2 \
--num_filters 32 32 32 \
--deconv_kernels 2 2 2 \
--window_size 22 22 22 11 \
--pretrain_window_size 12 12 12 6 \
--use_shift True True False False \
--flip_test \
--shift_window_test \
--shift_size 16 \
--do_evaluate \
--ckpt_dir ckpt/kitti_swin_large.ckpt
```

In [67]:
# Constructor arguments for the KITTI-pretrained model.
# max_depth is intentionally not set here (test.py call uses 80.0); it is assigned in a later cell.
kitti_args = Storage(
    # values taken from the KITTI test.py call
    backbone="swin_large_v2",
    depths=[2, 2, 18, 2],
    window_size=[22, 22, 22, 11],
    pretrain_window_size=[12, 12, 12, 6],
    # not part of the test.py call
    drop_path_rate=0.3,
    use_checkpoint=False,
    # values taken from the KITTI test.py call
    use_shift=[True, True, False, False],
    # not part of the test.py call
    pretrained="",
    num_deconv=3,
    # values taken from the KITTI test.py call
    num_filters=[32, 32, 32],
    deconv_kernels=[2, 2, 2],
)

#### nyudepth parameters

```bash
python3 test.py \
--dataset nyudepthv2 \
--data_path ../data/ \
--max_depth 10.0 \
--max_depth_eval 10.0  \
--backbone swin_large_v2 \
--depths 2 2 18 2 \
--num_filters 32 32 32 \
--deconv_kernels 2 2 2 \
--window_size 30 30 30 15 \
--pretrain_window_size 12 12 12 6 \
--use_shift True True False False \
--flip_test \
--shift_window_test \
--shift_size 2 \
--do_evaluate \
--ckpt_dir ckpt/nyudepthv2_swin_large.ckpt
```

In [68]:
# Constructor arguments for the nyudepthv2-pretrained model.
# max_depth is intentionally not set here (test.py call uses 10.0); it is assigned in a later cell.
nyudepthv2_args = Storage(
    # values taken from the nyudepthv2 test.py call
    backbone="swin_large_v2",
    depths=[2, 2, 18, 2],
    window_size=[30, 30, 30, 15],
    pretrain_window_size=[12, 12, 12, 6],
    # not part of the test.py call
    drop_path_rate=0.3,
    use_checkpoint=False,
    # values taken from the nyudepthv2 test.py call
    use_shift=[True, True, False, False],
    # not part of the test.py call
    pretrained="",
    num_deconv=3,
    # values taken from the nyudepthv2 test.py call
    num_filters=[32, 32, 32],
    deconv_kernels=[2, 2, 2],
)

In [69]:
# Both argument sets use the maximum depth specified for the svo recording.
# Taken from the MAX_DEPTH constant so the value is defined in exactly one place.
nyudepthv2_args.max_depth = kitti_args.max_depth = MAX_DEPTH

In [70]:
# Select constructor arguments and checkpoint for the requested pretraining dataset.
if USE_NYU:
    print("Using weights pretrained on nyudepthv2.")
    mde_args = nyudepthv2_args
    mde_checkpoint_path = "monocular_depth_estimation/checkpoints/nyudepthv2_swin_large.ckpt"
else:
    print("Using weights pretrained on kitti.")
    mde_args = kitti_args
    mde_checkpoint_path = "monocular_depth_estimation/checkpoints/kitti_swin_large.ckpt"

mde_model = GLPDepth(args=mde_args).to(device=device)

# NOTE: torch.load unpickles the file - only load checkpoints from trusted sources.
mde_model_weights: dict = torch.load(mde_checkpoint_path, map_location=device)

# Checkpoints may carry a "module." prefix on every key. Strip it so the keys
# match the bare model. The stripped dict has to replace `mde_model_weights`,
# otherwise the unstripped keys would be loaded below.
module_prefix = "module."
if any(key.startswith(module_prefix) for key in mde_model_weights):
    mde_model_weights = OrderedDict(
        (key[len(module_prefix):] if key.startswith(module_prefix) else key, value)
        for key, value in mde_model_weights.items()
    )

mde_model.load_state_dict(mde_model_weights)
# trailing semicolon keeps the notebook from printing the whole model repr
mde_model.eval();

Using weights pretrained on kitti.
norm8_log_bylayer: [(22, 22)] ==> [12]
norm8_log_bylayer: [(22, 22)] ==> [12]
norm8_log_bylayer: [(22, 22)] ==> [12]
norm8_log_bylayer: [(22, 22)] ==> [12]
norm8_log_bylayer: [(22, 22)] ==> [12]
norm8_log_bylayer: [(22, 22)] ==> [12]
norm8_log_bylayer: [(22, 22)] ==> [12]
norm8_log_bylayer: [(22, 22)] ==> [12]
norm8_log_bylayer: [(22, 22)] ==> [12]
norm8_log_bylayer: [(22, 22)] ==> [12]
norm8_log_bylayer: [(22, 22)] ==> [12]
norm8_log_bylayer: [(22, 22)] ==> [12]
norm8_log_bylayer: [(22, 22)] ==> [12]
norm8_log_bylayer: [(22, 22)] ==> [12]
norm8_log_bylayer: [(22, 22)] ==> [12]
norm8_log_bylayer: [(22, 22)] ==> [12]
norm8_log_bylayer: [(22, 22)] ==> [12]
norm8_log_bylayer: [(22, 22)] ==> [12]
norm8_log_bylayer: [(22, 22)] ==> [12]
norm8_log_bylayer: [(22, 22)] ==> [12]
norm8_log_bylayer: [(22, 22)] ==> [12]
norm8_log_bylayer: [(22, 22)] ==> [12]
norm8_log_bylayer: [(11, 11)] ==> [6]
norm8_log_bylayer: [(11, 11)] ==> [6]


### Setup of super resolution model

In [71]:
# Build the super resolution network on the selected device and load its weights.
sr_model = RRDBNet(3, 3, 64, 23, gc=32).to(device=device)

sr_model_weights: dict = torch.load(
    "super_resolution/models/RRDB_ESRGAN_x4.pth",
    map_location=device,
)
sr_model.load_state_dict(sr_model_weights)

# inference only; trailing semicolon keeps the notebook from printing the model repr
sr_model.eval();

In [72]:
# One row per frame: [timestamp in ms, processing time in s, PSNR, MSE].
# Pre-filled with NaN instead of uninitialized memory, so rows that are never
# written cannot leak arbitrary values into the results.
output = np.full((nr_frames, 4), np.nan, dtype=np.float64)

# loop-invariant image -> tensor converter
to_tensor = ToTensor()

camera.set_svo_position(0)
while True:
    # get/go to the current frame
    error_code = camera.grab()
    if error_code == sl.ERROR_CODE.END_OF_SVOFILE_REACHED:
        print("Done" + ' ' * 60)
        break
    if error_code != sl.ERROR_CODE.SUCCESS:
        raise SystemExit(f"Failed to grab frame: {error_code}")

    # retrieve current camera frame
    error_code = camera.retrieve_image(color_image, sl.VIEW.LEFT, sl.MEM.CPU)
    if error_code != sl.ERROR_CODE.SUCCESS:
        raise SystemExit(f"Failed to retrieve color image: {error_code}")

    # retrieve current depth map (used as ground truth)
    error_code = camera.retrieve_measure(depth_image, sl.MEASURE.DEPTH, sl.MEM.CPU)
    if error_code != sl.ERROR_CODE.SUCCESS:
        raise SystemExit(f"Failed to retrieve depth image: {error_code}")

    # get processing start time (monotonic clock, only the difference is used)
    start_time = time.perf_counter()

    # bring the frame to model input size and convert BGRA -> RGB
    sized_image: np.ndarray = resize_image(color_image.get_data())
    sized_image = cv.cvtColor(sized_image, cv.COLOR_BGRA2RGB)

    # (304, 88) is exactly 1/4 of the model input size (1216, 352)
    downsized_image = cv.resize(sized_image, (304, 88))

    # bring the ground truth depth map to the same size for comparison
    sized_depth: np.ndarray = resize_image(depth_image.get_data())

    # prepare downsized image for super resolution model (add batch dimension)
    downsized_img_tensor = to_tensor(downsized_image).unsqueeze(0).to(device=device)

    with torch.no_grad():
        # let the super resolution model scale up the image
        upscaled_tensor = sr_model(downsized_img_tensor)

        # prepare upsized image for depth estimation model
        img_tensor = upscaled_tensor.to(dtype=torch.float32).clamp_(0, 1)

        # let the depth estimation model create a depth map from the frame
        pred_tensor: torch.Tensor = mde_model(img_tensor)["pred_d"]

    # convert tensor back to numpy array
    depth_prediction: np.ndarray = pred_tensor.squeeze().to(device=torch.device("cpu")).numpy()

    # get processing end time
    end_time = time.perf_counter()

    # cleanup data -> treat inf as nan so PSNR() ignores those pixels
    depth_prediction[np.isinf(depth_prediction)] = np.nan
    sized_depth[np.isinf(sized_depth)] = np.nan

    # calculate PSNR
    # NOTE(review): PSNR() uses a peak value of 255 although both maps hold
    # depth values - confirm that this is intended.
    psnr, mse = PSNR(sized_depth, depth_prediction)
    current_frame = camera.get_svo_position()

    output[current_frame] = [color_image.timestamp.get_milliseconds(), end_time - start_time, psnr, mse]

    print(f"[{current_frame + 1}/{nr_frames}]", f"PSNR = {round(psnr, 4)} dB", f"MSE = {round(mse, 4)}", sep='\t', end='\r')

Done                                                            


In [73]:
# Shift the timestamp column so the first frame is at 0 and convert ms -> s.
# NOTE: this modifies `output` in place; running the cell a second time
# divides the column by 1000 again.
output[:, 0] -= output[0, 0]
output[:, 0] /= 1000

In [74]:
# File name: <svo name without extension>-<NYU|KITTI>-<CROPPED|RESIZED>-<suffix>.csv
# splitext removes only the real extension (also e.g. ".svo2") and never
# touches a ".svo" that occurs in the middle of the name.
output_file_name = '-'.join([
    os.path.splitext(os.path.basename(SVO_FILEPATH))[0],
    ("NYU" if USE_NYU else "KITTI"),
    ("CROPPED" if CROP_TO_SIZE else "RESIZED"),
    OUTPUT_SUFFIX
]) + ".csv"
print("Writing output to:", output_file_name)

with open(output_file_name, 'w') as file:
    file.write("Frame;Timestamp [s];Processingtime [s];PSNR [dB];MSE\n")

    # one line per frame
    for i, (timestamp, timedelta, psnr, mse) in enumerate(output):
        file.write(f"{i};{timestamp};{timedelta};{psnr};{mse}\n")

    # Summary rows. NaN-aware statistics are used because a frame without any
    # valid pixel yields NaN for PSNR/MSE and would otherwise turn the whole
    # summary into NaN.
    file.write(f"Mean:;;{np.nanmean(output[:, 1])};{np.nanmean(output[:, 2])};{np.nanmean(output[:, 3])}\n")
    file.write(f"Median:;;{np.nanmedian(output[:, 1])};{np.nanmedian(output[:, 2])};{np.nanmedian(output[:, 3])}\n")

Writing output to: test_recording_rear-KITTI-CROPPED-output.csv
