In [11]:
import os
import random
import numpy as np
import copy
from PIL import Image  

import PIL.Image as pil
from tqdm import tqdm
import torch
import torch.utils.data as data
from torchvision import transforms
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
import skimage.transform
from collections import Counter
import time

In [12]:
import json


## Some helper functions are used from the original implementation
#from utils import *
#from kitti_utils import *
#from layers import *


#import datasets
#import networks
from IPython import embed


# Record whether PyTorch can see a CUDA device; the bare name echoes the flag as the cell output.
cuda = torch.cuda.is_available()
cuda

True

In [13]:
def pil_loader(path):
    """Read the image stored at ``path`` and return it converted to RGB mode.

    The file is opened in binary mode and handed to PIL; both the file handle
    and the PIL image are closed when the function returns, so the returned
    object is the converted copy produced by ``convert('RGB')``.
    """
    with open(path, 'rb') as handle, Image.open(handle) as image:
        rgb_image = image.convert('RGB')
        return rgb_image

In [59]:
class MainDataset(data.Dataset):
    """KITTI raw-data dataset yielding multi-scale colour tensors, intrinsics and optional LiDAR depth.

    Every entry of ``filenames`` is a whitespace-separated line such as
    ``"2011_09_26/2011_09_26_drive_0022_sync 473 r"`` (folder, frame index, side).

    ``__getitem__`` returns a dict with the keys
        ("color", <frame_id>, <scale>)      image tensor at the given scale
        ("color_aug", <frame_id>, <scale>)  colour-augmented image tensor
        ("K", <scale>) / ("inv_K", <scale>) intrinsics and their pseudo-inverse
        "depth_gt"                          only when velodyne data was found
        "stereo_T"                          only when "s" is in ``frame_idxs``
    """

    def __init__(self,data_path,filenames,height,width,frame_idxs,num_scales,is_train=False,img_ext='.jpg'):
        """Store the configuration and pre-build one resize transform per scale.

        Args:
            data_path: root directory that contains the KITTI folders.
            filenames: list of split lines ("<folder> <frame_index> <side>").
            height, width: size of the scale-0 images.
            frame_idxs: frame offsets to load; the string "s" requests the
                opposite stereo view.
            num_scales: number of scales; scale ``i`` is ``height // 2**i`` by
                ``width // 2**i``.
            is_train: enables the random flip / colour augmentation.
            img_ext: file extension of the colour images.
        """
        super(MainDataset, self).__init__()

        self.data_path = data_path
        self.filenames = filenames
        self.height = height
        self.width = width
        self.num_scales = num_scales
        self.frame_idxs = frame_idxs
        self.is_train = is_train
        self.img_ext = img_ext

        # Interpolation Method during Scaling of the Images.
        # Image.ANTIALIAS was only an alias of Image.LANCZOS and no longer
        # exists in recent Pillow releases, so the canonical name is used.
        self.interp = Image.LANCZOS

        self.loader = pil_loader
        self.to_tensor = transforms.ToTensor()

        # Randomly Change the Brightness, Contrast, Saturation and Hue of the Image
        # Tuple: (min,max)
        self.brig = (0.8,1.2)
        self.con = (0.8,1.2)
        self.sat = (0.8,1.2)
        self.hue = (-0.1, 0.1)

        # Create a Dictionary (self.resize) of size of all the images after scaling
        # Scale = -1:(1242,375) - Native Resolution as loaded
        # Scale = 0:(192,640)  # Scale = 1:(96,320) # Scale = 2:(48,160) # Scale = 3:(24,80)
        # Interpolation while scaling = PIL.Image.LANCZOS
        self.resize = {}
        for i in range(self.num_scales):
            s = 2 ** i
            self.resize[i] = transforms.Resize((self.height // s, self.width // s),interpolation=self.interp)

        self.load_depth = self.check_depth()

        # Intrinsic Camera Matrix (first row is multiplied by the image width and
        # the second row by the image height in __getitem__)
        self.K = np.array([[0.58, 0, 0.5, 0],[0, 1.92, 0.5, 0],[0, 0, 1, 0],[0, 0, 0, 1]], dtype=np.float32)

        # Dimensions of the Original Image, stored as (width, height)
        self.full_res_shape = (1242, 375)

        # Maps the side token of a split line to the KITTI camera number
        self.side_map = {"2": 2, "3": 3, "l": 2, "r": 3}

    def __len__(self):
        """Return the number of split lines."""
        return len(self.filenames)

    # Checks if the Loaded Filenames contains Velodyne Points a.k.a. Checks Depth if it exists
    def check_depth(self):
        """Return True when the velodyne scan of the FIRST split line exists on disk."""
        line = self.filenames[0].split() # ['2011_09_26/2011_09_26_drive_0022_sync', '473', 'r']
        scene_name = line[0] # 2011_09_26/2011_09_26_drive_0022_sync
        frame_index = int(line[1]) # 473

        velo_filename = os.path.join(self.data_path,scene_name,"velodyne_points/data/{:010d}.bin".format(int(frame_index)))
        # /home/ubuntu/monodepth2/kitti_data/2011_09_26/2011_09_26_drive_0022_sync/velodyne_points/data/0000000473.bin

        return os.path.isfile(velo_filename)

    def get_image_path(self, folder, frame_index, side):
        """Build the path of the colour image for ``folder`` / ``frame_index`` / ``side``.

        Raises:
            KeyError: if ``side`` is not a key of ``self.side_map``.
        """
        f_str = "{:010d}{}".format(frame_index, self.img_ext)
        image_path = os.path.join(self.data_path, folder, "image_0{}/data".format(self.side_map[side]), f_str)
        # Ex: /home/ubuntu/monodepth2/kitti_data/2011_09_26/2011_09_26_drive_0022_sync/image_03/data/0000000473.jpg

        return image_path

    def get_depth(self, folder, frame_index, side):
        """Return the sparse depth map of one frame, resized to ``full_res_shape[::-1]``.

        The calibration directory is the first path component of ``folder``
        (the recording date). ``order=0`` is passed to the resize.
        """
        # Data Path Ex: /home/ubuntu/monodepth2/kitti_data
        # Folder Ex: 2011_09_26/2011_09_26_drive_0022_sync

        calib_path = os.path.join(self.data_path, folder.split("/")[0])
        # Calib_Path Ex: /home/ubuntu/monodepth2/kitti_data/2011_09_26

        velo_filename = os.path.join(self.data_path,folder,"velodyne_points/data/{:010d}.bin".format(int(frame_index)))
        # Velo_Filename Ex:
        # /home/ubuntu/monodepth2/kitti_data/2011_09_26/2011_09_26_drive_0022_sync/velodyne_points/data/0000000473.bin

        depth_gt = self.generate_depth_map(calib_path, velo_filename, self.side_map[side])
        # Shape of depth_gt: (375, 1242)
        depth_gt = skimage.transform.resize(depth_gt, self.full_res_shape[::-1], order=0, preserve_range=True, mode='constant')
        # Shape of depth_gt: (375, 1242)

        return depth_gt

    def load_velodyne_points(self,filename):
        """Load a velodyne ``.bin`` file as an (N, 4) float32 array.

        The fourth column read from the file is overwritten with 1.0 so every
        row is a homogeneous point.
        """
        points = np.fromfile(filename, dtype=np.float32).reshape(-1, 4)
        points[:, 3] = 1.0  # homogeneous
        return points

    def read_calib_file(self,path):
        """Parse a ``key: value`` calibration file into a dict.

        Values made only of digits, '.', 'e', '+', '-' and spaces are converted
        to float arrays; every other value is kept as the stripped string.
        Blank lines are skipped.
        """
        float_chars = set("0123456789.e+- ")
        calib = {}
        with open(path, 'r') as f:
            for line in f:
                if not line.strip():
                    # tolerate empty lines (e.g. a trailing newline at the end of the file)
                    continue
                key, value = line.split(':', 1)
                value = value.strip()
                calib[key] = value
                if float_chars.issuperset(value):
                    # try to cast to float array
                    try:
                        calib[key] = np.array(list(map(float, value.split(' '))))
                    except ValueError:
                        # casting error: calib[key] already eq. value, so pass
                        pass

        return calib

    def sub2ind(self,matrixSize, rowSub, colSub):
        """Return ``rowSub * (n - 1) + colSub - 1`` where ``n`` is ``matrixSize[1]``.

        Used only to group points that fall on the same pixel.
        NOTE(review): this is not the row-major index ``row * n + col``; e.g.
        (r, n - 1) and (r + 1, 0) map to the same value. Left unchanged so the
        generated depth maps stay identical - confirm whether that is intended.
        """
        m, n = matrixSize
        return rowSub * (n-1) + colSub - 1

    def generate_depth_map(self,calib_dir, velo_filename, cam=2, vel_depth=False):
        """Project a velodyne scan into the image plane of camera ``cam``.

        Args:
            calib_dir: directory holding ``calib_cam_to_cam.txt`` and
                ``calib_velo_to_cam.txt``.
            velo_filename: path of the velodyne ``.bin`` scan.
            cam: camera number, selects ``P_rect_0<cam>``.
            vel_depth: when True the stored depth is the velodyne forward
                coordinate instead of the projected z value.

        Returns:
            2-D array whose shape is ``S_rect_02`` reversed; pixels hit by no
            point are 0, pixels hit by several points keep the smallest depth.
        """
        # load calibration files
        cam2cam = self.read_calib_file(os.path.join(calib_dir, 'calib_cam_to_cam.txt'))
        velo2cam = self.read_calib_file(os.path.join(calib_dir, 'calib_velo_to_cam.txt'))
        velo2cam = np.hstack((velo2cam['R'].reshape(3, 3), velo2cam['T'][..., np.newaxis]))
        velo2cam = np.vstack((velo2cam, np.array([0, 0, 0, 1.0])))

        # get image shape (the S_rect_02 entry is used whatever the value of cam)
        im_shape = cam2cam["S_rect_02"][::-1].astype(np.int32)

        # compute projection matrix velodyne->image plane
        R_cam2rect = np.eye(4)
        R_cam2rect[:3, :3] = cam2cam['R_rect_00'].reshape(3, 3)
        P_rect = cam2cam['P_rect_0'+str(cam)].reshape(3, 4)
        P_velo2im = np.dot(np.dot(P_rect, R_cam2rect), velo2cam)

        # load velodyne points and remove all behind image plane (approximation)
        # each row of the velodyne data is forward, left, up, reflectance
        velo = self.load_velodyne_points(velo_filename)
        velo = velo[velo[:, 0] >= 0, :]

        # project the points to the camera
        velo_pts_im = np.dot(P_velo2im, velo.T).T
        velo_pts_im[:, :2] = velo_pts_im[:, :2] / velo_pts_im[:, 2][..., np.newaxis]

        if vel_depth:
            velo_pts_im[:, 2] = velo[:, 0]

        # check if in bounds
        # use minus 1 to get the exact same value as KITTI matlab code
        velo_pts_im[:, 0] = np.round(velo_pts_im[:, 0]) - 1
        velo_pts_im[:, 1] = np.round(velo_pts_im[:, 1]) - 1
        val_inds = (velo_pts_im[:, 0] >= 0) & (velo_pts_im[:, 1] >= 0)
        val_inds = val_inds & (velo_pts_im[:, 0] < im_shape[1]) & (velo_pts_im[:, 1] < im_shape[0])
        velo_pts_im = velo_pts_im[val_inds, :]

        # project to image
        # (the builtin int replaces the np.int alias, which newer NumPy no longer provides)
        depth = np.zeros((im_shape[:2]))
        rows = velo_pts_im[:, 1].astype(int)
        cols = velo_pts_im[:, 0].astype(int)
        depth[rows, cols] = velo_pts_im[:, 2]

        # find the duplicate points and choose the closest depth
        inds = self.sub2ind(depth.shape, velo_pts_im[:, 1], velo_pts_im[:, 0])
        dupe_inds = [item for item, count in Counter(inds).items() if count > 1]
        for dd in dupe_inds:
            pts = np.where(inds == dd)[0]
            x_loc = int(velo_pts_im[pts[0], 0])
            y_loc = int(velo_pts_im[pts[0], 1])
            depth[y_loc, x_loc] = velo_pts_im[pts, 2].min()
        depth[depth < 0] = 0

        return depth

    def _make_color_aug(self):
        """Sample one colour jitter and return it as a callable ``image -> image``.

        ``ColorJitter.get_params`` returns a ready-made transform in older
        torchvision releases and a tuple ``(order, brightness, contrast,
        saturation, hue)`` in newer ones; both cases are handled so the same
        sampled jitter is applied to every image of an item.
        """
        params = transforms.ColorJitter.get_params(
            self.brig, self.con, self.sat, self.hue)
        if callable(params):
            return params

        fn_idx, brightness, contrast, saturation, hue = params
        tvf = transforms.functional

        def apply_jitter(img):
            # same dispatch torchvision uses: 0=brightness, 1=contrast, 2=saturation, 3=hue
            for fn_id in fn_idx:
                if fn_id == 0 and brightness is not None:
                    img = tvf.adjust_brightness(img, brightness)
                elif fn_id == 1 and contrast is not None:
                    img = tvf.adjust_contrast(img, contrast)
                elif fn_id == 2 and saturation is not None:
                    img = tvf.adjust_saturation(img, saturation)
                elif fn_id == 3 and hue is not None:
                    img = tvf.adjust_hue(img, hue)
            return img

        return apply_jitter

    def _stereo_transform(self, side, do_flip):
        """Return the 4x4 float32 stereo transform as a tensor.

        Only element [0, 3] differs from identity: it is +/-0.1, negated for
        side "l" and negated again when the images were flipped.
        """
        stereo_T = np.eye(4, dtype=np.float32)
        baseline_sign = -1 if do_flip else 1
        side_sign = -1 if side == "l" else 1
        stereo_T[0, 3] = side_sign * baseline_sign * 0.1
        return torch.from_numpy(stereo_T)

    def __getitem__(self,index):
        """Load, resize, augment and tensorise every requested frame of item ``index``.

        With ``is_train`` set, a horizontal flip and a colour jitter are each
        applied with probability 0.5. Returns the dict described in the class
        docstring.
        """
        # The dictionary to be returned as a single training item
        inputs = {}

        do_color_aug = self.is_train and random.random() > 0.5
        do_flip = self.is_train and random.random() > 0.5

        line = self.filenames[index].split() # Ex: ['2011_09_26/2011_09_26_drive_0022_sync', '473', 'r']
        folder = line[0] # Ex: 2011_09_26/2011_09_26_drive_0022_sync

        if len(line) == 3:
            frame_index = int(line[1])
            side = line[2]
        else:
            frame_index = 0
            side = None

        # Append the images for each frame in the dictionary
        # Flip if do_flip is True
        for frame_id in self.frame_idxs:

            if frame_id != "s": # For Non Stereo
                color = self.loader(self.get_image_path(folder, frame_index + frame_id, side))
            else: # For Stereo: same frame seen from the other camera
                other_side = {"r": "l", "l": "r"}[side]
                color = self.loader(self.get_image_path(folder, frame_index, other_side))

            if do_flip:
                color = color.transpose(pil.FLIP_LEFT_RIGHT)

            inputs[("color", frame_id, -1)] = color

        # Construct the intrinsics for each scale and Append in the dictionary
        for scale in range(self.num_scales):

            K_scale = self.K.copy()
            K_scale[0,:] = K_scale[0,:] * (self.width // (2 ** scale))
            K_scale[1,:] = K_scale[1,:] * (self.height // (2 ** scale))

            # Compute the (Moore-Penrose) pseudo-inverse of a matrix.
            inverse_K_scale = np.linalg.pinv(K_scale)

            # Append K and its inverse for the particular 'scale'
            inputs[("K", scale)] = torch.from_numpy(K_scale)
            inputs[("inv_K", scale)] = torch.from_numpy(inverse_K_scale)

        # If Color Augmentation is True
        # Define color_aug to transform the image by changing its brightness, contrast, saturation and hue
        if do_color_aug:
            color_aug = self._make_color_aug()
        else:
            color_aug = (lambda x: x)

        # Till now, inputs contains images for all frames for only scale = -1
        # Append "color" images for all frames for other scales = 0,1,2,3;
        # each scale is resized from the previous one (scale 0 from the native image)
        for key in list(inputs):
            if "color" in key:
                name, frame_id, _ = key
                for scale in range(self.num_scales):
                    inputs[(name, frame_id, scale)] = self.resize[scale](inputs[(name, frame_id, scale - 1)])

        # Convert every image to a tensor and add its "color_aug" counterpart
        for key in list(inputs):
            if "color" in key:
                name, frame_id, scale = key
                image = inputs[key]
                inputs[(name, frame_id, scale)] = self.to_tensor(image)
                inputs[(name + "_aug", frame_id, scale)] = self.to_tensor(color_aug(image))

        # Delete the scale of -1 as it indicates the default image resolution not required for training purposes
        for frame_id in self.frame_idxs:
            del inputs[("color", frame_id, -1)]
            del inputs[("color_aug", frame_id, -1)]

        # Load Depth for the Images and Append to the dictionary of inputs
        if self.load_depth:
            depth_gt = self.get_depth(folder, frame_index, side)
            # Shape of depth_gt: (375, 1242)
            inputs["depth_gt"] = np.expand_dims(depth_gt, 0)
            inputs["depth_gt"] = torch.from_numpy(inputs["depth_gt"].astype(np.float32))

        # For Stereo
        if "s" in self.frame_idxs:
            inputs["stereo_T"] = self._stereo_transform(side, do_flip)

        return inputs
        
        

In [60]:
dataset = MainDataset # Alias of the MainDataset class itself (it is instantiated in a later cell)
data_path = "/home/ubuntu/monodepth2/kitti_data" # Default Data Path

# Read Train Files Needed to be Loaded based on the split of Eigen_Zhou
# (the with-block closes the file once the lines are read)
fpath_train = "/home/ubuntu/monodepth2/splits/eigen_zhou/train_files.txt"
with open(fpath_train) as f_train:
    train_filenames = f_train.readlines()

# Read Validation Files Needed to be Loaded
fpath_val = "/home/ubuntu/monodepth2/splits/eigen_zhou/val_files.txt"
with open(fpath_val) as f_val:
    val_filenames = f_val.readlines()

height = 192 #Height of the input image
width = 640 #Width of the input image

# Extension of the Image
img_ext = ".jpg"

# Scales to be used in the Loss Calculation
## output scale = input scale/2**(n) for n in scales
scales = np.array([0,1,2,3])
num_scales = len(scales)

# Frames to be Loaded
# Current -> 0
# Previous -> -1
# Forward -> +1
# Stereo counterpart -> 's'
frame_ids = [0,'s']#[0,-1,1]

# Batch Size
batch_size = 4

# Number of Workers
num_workers = 3

In [61]:
# Build the training dataset from the configured class and wrap it in a shuffling loader.
train_dataset = dataset(
    data_path,
    train_filenames,
    height,
    width,
    frame_ids,
    num_scales,
    is_train=True,
    img_ext=img_ext,
)
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers,
    pin_memory=True,
    drop_last=True,
)

In [62]:
# Fetch item 1 through __getitem__ and echo the returned dict as the cell output.
train_dataset[1]

{('K', 0): tensor([[371.2000,   0.0000, 320.0000,   0.0000],
         [  0.0000, 368.6400,  96.0000,   0.0000],
         [  0.0000,   0.0000,   1.0000,   0.0000],
         [  0.0000,   0.0000,   0.0000,   1.0000]]),
 ('inv_K', 0): tensor([[ 2.6940e-03, -5.3033e-11, -8.6207e-01,  0.0000e+00],
         [-3.9697e-11,  2.7127e-03, -2.6042e-01,  0.0000e+00],
         [-8.8676e-11,  5.3253e-11,  1.0000e+00,  0.0000e+00],
         [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  1.0000e+00]]),
 ('K', 1): tensor([[185.6000,   0.0000, 160.0000,   0.0000],
         [  0.0000, 184.3200,  48.0000,   0.0000],
         [  0.0000,   0.0000,   1.0000,   0.0000],
         [  0.0000,   0.0000,   0.0000,   1.0000]]),
 ('inv_K', 1): tensor([[ 5.3879e-03, -1.3609e-10, -8.6207e-01,  0.0000e+00],
         [ 2.7678e-11,  5.4253e-03, -2.6042e-01,  0.0000e+00],
         [ 3.1640e-10,  7.0044e-11,  1.0000e+00,  0.0000e+00],
         [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  1.0000e+00]]),
 ('K', 2): tensor([[92.8000,  0.

In [63]:
from kitti_utils import *
#from layers import *


import datasets

In [64]:
# Point `dataset` at the KITTIRAWDataset class of the imported `datasets` module
# (presumably to compare its items with MainDataset - confirm).
dataset = datasets.KITTIRAWDataset

In [65]:
# Rebuild the dataset and loader with whichever class `dataset` currently refers to.
train_dataset = dataset(
    data_path,
    train_filenames,
    height,
    width,
    frame_ids,
    num_scales,
    is_train=True,
    img_ext=img_ext,
)
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers,
    pin_memory=True,
    drop_last=True,
)

In [66]:
# Fetch item 1 through __getitem__ and echo the returned dict as the cell output.
train_dataset[1]

{('K', 0): tensor([[371.2000,   0.0000, 320.0000,   0.0000],
         [  0.0000, 368.6400,  96.0000,   0.0000],
         [  0.0000,   0.0000,   1.0000,   0.0000],
         [  0.0000,   0.0000,   0.0000,   1.0000]]),
 ('inv_K', 0): tensor([[ 2.6940e-03, -5.3033e-11, -8.6207e-01,  0.0000e+00],
         [-3.9697e-11,  2.7127e-03, -2.6042e-01,  0.0000e+00],
         [-8.8676e-11,  5.3253e-11,  1.0000e+00,  0.0000e+00],
         [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  1.0000e+00]]),
 ('K', 1): tensor([[185.6000,   0.0000, 160.0000,   0.0000],
         [  0.0000, 184.3200,  48.0000,   0.0000],
         [  0.0000,   0.0000,   1.0000,   0.0000],
         [  0.0000,   0.0000,   0.0000,   1.0000]]),
 ('inv_K', 1): tensor([[ 5.3879e-03, -1.3609e-10, -8.6207e-01,  0.0000e+00],
         [ 2.7678e-11,  5.4253e-03, -2.6042e-01,  0.0000e+00],
         [ 3.1640e-10,  7.0044e-11,  1.0000e+00,  0.0000e+00],
         [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  1.0000e+00]]),
 ('K', 2): tensor([[92.8000,  0.

In [35]:
# Pull a single batch from the loader and list the keys of the batch dict.
for batch_idx, inputs in enumerate(train_loader):
    print(batch_idx)
    print(inputs.keys())
    break  # only the first batch is inspected

0
dict_keys([('K', 0), ('inv_K', 0), ('K', 1), ('inv_K', 1), ('K', 2), ('inv_K', 2), ('K', 3), ('inv_K', 3), ('color', 0, 0), ('color', 0, 1), ('color', 0, 2), ('color', 0, 3), ('color', 's', 0), ('color', 's', 1), ('color', 's', 2), ('color', 's', 3), ('color_aug', 0, 0), ('color_aug', 0, 1), ('color_aug', 0, 2), ('color_aug', 0, 3), ('color_aug', 's', 0), ('color_aug', 's', 1), ('color_aug', 's', 2), ('color_aug', 's', 3), 'depth_gt', 'stereo_T'])


In [4]:
class MainDataset(data.Dataset):
    """KITTI raw-data dataset yielding multi-scale colour tensors, intrinsics and optional LiDAR depth.

    Every entry of ``filenames`` is a whitespace-separated line such as
    ``"2011_09_26/2011_09_26_drive_0022_sync 473 r"`` (folder, frame index, side).

    ``__getitem__`` returns a dict with the keys
        ("color", <frame_id>, <scale>)      image tensor at the given scale
        ("color_aug", <frame_id>, <scale>)  colour-augmented image tensor
        ("K", <scale>) / ("inv_K", <scale>) intrinsics and their pseudo-inverse
        "depth_gt"                          only when velodyne data was found
        "stereo_T"                          only when "s" is in ``frame_idxs``

    Depth maps come from a ``generate_depth_map`` function that is looked up in
    module scope when ``get_depth`` is called.
    """

    def __init__(self,data_path,filenames,height,width,frame_idxs,num_scales,is_train=False,img_ext='.jpg'):
        """Store the configuration and pre-build one resize transform per scale.

        Args:
            data_path: root directory that contains the KITTI folders.
            filenames: list of split lines ("<folder> <frame_index> <side>").
            height, width: size of the scale-0 images.
            frame_idxs: frame offsets to load; the string "s" requests the
                opposite stereo view.
            num_scales: number of scales; scale ``i`` is ``height // 2**i`` by
                ``width // 2**i``.
            is_train: enables the random flip / colour augmentation.
            img_ext: file extension of the colour images.
        """
        super(MainDataset, self).__init__()

        self.data_path = data_path
        self.filenames = filenames
        self.height = height
        self.width = width
        self.num_scales = num_scales
        self.frame_idxs = frame_idxs
        self.is_train = is_train
        self.img_ext = img_ext

        # Interpolation Method during Scaling of the Images.
        # Image.ANTIALIAS was only an alias of Image.LANCZOS and no longer
        # exists in recent Pillow releases, so the canonical name is used.
        self.interp = Image.LANCZOS

        self.loader = pil_loader
        self.to_tensor = transforms.ToTensor()

        # Randomly Change the Brightness, Contrast, Saturation and Hue of the Image
        # Tuple: (min,max)
        self.brig = (0.8,1.2)
        self.con = (0.8,1.2)
        self.sat = (0.8,1.2)
        self.hue = (-0.1, 0.1)

        # Create a Dictionary (self.resize) of size of all the images after scaling
        # Scale = -1:(1242,375) - Native Resolution as loaded
        # Scale = 0:(192,640)  # Scale = 1:(96,320) # Scale = 2:(48,160) # Scale = 3:(24,80)
        # Interpolation while scaling = PIL.Image.LANCZOS
        self.resize = {}
        for i in range(self.num_scales):
            s = 2 ** i
            self.resize[i] = transforms.Resize((self.height // s, self.width // s),interpolation=self.interp)

        self.load_depth = self.check_depth()

        # Intrinsic Camera Matrix (first row is multiplied by the image width and
        # the second row by the image height in __getitem__)
        self.K = np.array([[0.58, 0, 0.5, 0],[0, 1.92, 0.5, 0],[0, 0, 1, 0],[0, 0, 0, 1]], dtype=np.float32)

        # Dimensions of the Original Image, stored as (width, height)
        self.full_res_shape = (1242, 375)

        # Maps the side token of a split line to the KITTI camera number
        self.side_map = {"2": 2, "3": 3, "l": 2, "r": 3}

    def __len__(self):
        """Return the number of split lines."""
        return len(self.filenames)

    # Checks if the Loaded Filenames contains Velodyne Points a.k.a. Checks Depth if it exists
    def check_depth(self):
        """Return True when the velodyne scan of the FIRST split line exists on disk."""
        line = self.filenames[0].split() # ['2011_09_26/2011_09_26_drive_0022_sync', '473', 'r']
        scene_name = line[0] # 2011_09_26/2011_09_26_drive_0022_sync
        frame_index = int(line[1]) # 473

        velo_filename = os.path.join(self.data_path,scene_name,"velodyne_points/data/{:010d}.bin".format(int(frame_index)))
        # /home/ubuntu/monodepth2/kitti_data/2011_09_26/2011_09_26_drive_0022_sync/velodyne_points/data/0000000473.bin

        return os.path.isfile(velo_filename)

    def get_image_path(self, folder, frame_index, side):
        """Build the path of the colour image for ``folder`` / ``frame_index`` / ``side``.

        Raises:
            KeyError: if ``side`` is not a key of ``self.side_map``.
        """
        f_str = "{:010d}{}".format(frame_index, self.img_ext)
        image_path = os.path.join(self.data_path, folder, "image_0{}/data".format(self.side_map[side]), f_str)
        # Ex: /home/ubuntu/monodepth2/kitti_data/2011_09_26/2011_09_26_drive_0022_sync/image_03/data/0000000473.jpg

        return image_path

    def get_depth(self, folder, frame_index, side):
        """Return the sparse depth map of one frame, resized to ``full_res_shape[::-1]``.

        The calibration directory is the first path component of ``folder``
        (the recording date). ``order=0`` is passed to the resize.
        """
        # Data Path Ex: /home/ubuntu/monodepth2/kitti_data
        # Folder Ex: 2011_09_26/2011_09_26_drive_0022_sync

        calib_path = os.path.join(self.data_path, folder.split("/")[0])
        # Calib_Path Ex: /home/ubuntu/monodepth2/kitti_data/2011_09_26

        velo_filename = os.path.join(self.data_path,folder,"velodyne_points/data/{:010d}.bin".format(int(frame_index)))
        # Velo_Filename Ex:
        # /home/ubuntu/monodepth2/kitti_data/2011_09_26/2011_09_26_drive_0022_sync/velodyne_points/data/0000000473.bin

        # generate_depth_map is a module-level name resolved at call time
        depth_gt = generate_depth_map(calib_path, velo_filename, self.side_map[side])
        # Shape of depth_gt: (375, 1242)
        depth_gt = skimage.transform.resize(depth_gt, self.full_res_shape[::-1], order=0, preserve_range=True, mode='constant')
        # Shape of depth_gt: (375, 1242)

        return depth_gt

    def _make_color_aug(self):
        """Sample one colour jitter and return it as a callable ``image -> image``.

        ``ColorJitter.get_params`` returns a ready-made transform in older
        torchvision releases and a tuple ``(order, brightness, contrast,
        saturation, hue)`` in newer ones; both cases are handled so the same
        sampled jitter is applied to every image of an item.
        """
        params = transforms.ColorJitter.get_params(
            self.brig, self.con, self.sat, self.hue)
        if callable(params):
            return params

        fn_idx, brightness, contrast, saturation, hue = params
        tvf = transforms.functional

        def apply_jitter(img):
            # same dispatch torchvision uses: 0=brightness, 1=contrast, 2=saturation, 3=hue
            for fn_id in fn_idx:
                if fn_id == 0 and brightness is not None:
                    img = tvf.adjust_brightness(img, brightness)
                elif fn_id == 1 and contrast is not None:
                    img = tvf.adjust_contrast(img, contrast)
                elif fn_id == 2 and saturation is not None:
                    img = tvf.adjust_saturation(img, saturation)
                elif fn_id == 3 and hue is not None:
                    img = tvf.adjust_hue(img, hue)
            return img

        return apply_jitter

    def _stereo_transform(self, side, do_flip):
        """Return the 4x4 float32 stereo transform as a tensor.

        Only element [0, 3] differs from identity: it is +/-0.1, negated for
        side "l" and negated again when the images were flipped. A horizontal
        flip mirrors both views, so the baseline direction has to follow it.
        """
        stereo_T = np.eye(4, dtype=np.float32)
        baseline_sign = -1 if do_flip else 1
        side_sign = -1 if side == "l" else 1
        stereo_T[0, 3] = side_sign * baseline_sign * 0.1
        return torch.from_numpy(stereo_T)

    def __getitem__(self,index):
        """Load, resize, augment and tensorise every requested frame of item ``index``.

        With ``is_train`` set, a horizontal flip and a colour jitter are each
        applied with probability 0.5. Returns the dict described in the class
        docstring.
        """
        # The dictionary to be returned as a single training item
        inputs = {}

        do_color_aug = self.is_train and random.random() > 0.5
        do_flip = self.is_train and random.random() > 0.5

        line = self.filenames[index].split() # Ex: ['2011_09_26/2011_09_26_drive_0022_sync', '473', 'r']
        folder = line[0] # Ex: 2011_09_26/2011_09_26_drive_0022_sync

        if len(line) == 3:
            frame_index = int(line[1])
            side = line[2]
        else:
            frame_index = 0
            side = None

        # Append the images for each frame in the dictionary
        # Flip if do_flip is True
        for frame_id in self.frame_idxs:

            if frame_id != "s": # For Non Stereo
                color = self.loader(self.get_image_path(folder, frame_index + frame_id, side))
            else: # For Stereo: same frame seen from the other camera
                other_side = {"r": "l", "l": "r"}[side]
                color = self.loader(self.get_image_path(folder, frame_index, other_side))

            if do_flip:
                color = color.transpose(pil.FLIP_LEFT_RIGHT)

            inputs[("color", frame_id, -1)] = color

        # Construct the intrinsics for each scale and Append in the dictionary
        for scale in range(self.num_scales):

            K_scale = self.K.copy()
            K_scale[0,:] = K_scale[0,:] * (self.width // (2 ** scale))
            K_scale[1,:] = K_scale[1,:] * (self.height // (2 ** scale))

            # Compute the (Moore-Penrose) pseudo-inverse of a matrix.
            inverse_K_scale = np.linalg.pinv(K_scale)

            # Append K and its inverse for the particular 'scale'
            inputs[("K", scale)] = torch.from_numpy(K_scale)
            inputs[("inv_K", scale)] = torch.from_numpy(inverse_K_scale)

        # If Color Augmentation is True
        # Define color_aug to transform the image by changing its brightness, contrast, saturation and hue
        if do_color_aug:
            color_aug = self._make_color_aug()
        else:
            color_aug = (lambda x: x)

        # Till now, inputs contains images for all frames for only scale = -1
        # Append "color" images for all frames for other scales = 0,1,2,3;
        # each scale is resized from the previous one (scale 0 from the native image)
        for key in list(inputs):
            if "color" in key:
                name, frame_id, _ = key
                for scale in range(self.num_scales):
                    inputs[(name, frame_id, scale)] = self.resize[scale](inputs[(name, frame_id, scale - 1)])

        # Convert every image to a tensor and add its "color_aug" counterpart
        for key in list(inputs):
            if "color" in key:
                name, frame_id, scale = key
                image = inputs[key]
                inputs[(name, frame_id, scale)] = self.to_tensor(image)
                inputs[(name + "_aug", frame_id, scale)] = self.to_tensor(color_aug(image))

        # Delete the scale of -1 as it indicates the default image resolution not required for training purposes
        for frame_id in self.frame_idxs:
            del inputs[("color", frame_id, -1)]
            del inputs[("color_aug", frame_id, -1)]

        # Load Depth for the Images and Append to the dictionary of inputs
        if self.load_depth:
            depth_gt = self.get_depth(folder, frame_index, side)
            # Shape of depth_gt: (375, 1242)
            inputs["depth_gt"] = np.expand_dims(depth_gt, 0)
            inputs["depth_gt"] = torch.from_numpy(inputs["depth_gt"].astype(np.float32))

        # For Stereo
        if "s" in self.frame_idxs:
            inputs["stereo_T"] = self._stereo_transform(side, do_flip)

        return inputs
        
        

In [5]:
def load_velodyne_points(filename):
    """Load a velodyne ``.bin`` file as an (N, 4) float32 array of homogeneous points.

    The file is read as a flat float32 buffer and viewed four values per row;
    the fourth column read from the file is overwritten with 1.0.
    """
    flat = np.fromfile(filename, dtype=np.float32)
    points = flat.reshape((-1, 4))
    points[:, 3] = 1.0  # homogeneous coordinate
    return points


def read_calib_file(path):
    """Parse a ``key: value`` calibration file into a dict.

    Each line is split at its first ':'. Values made only of digits, '.', 'e',
    '+', '-' and spaces are converted to float arrays; every other value (or a
    value whose conversion fails) is kept as the stripped string. Blank lines
    are skipped instead of aborting the parse.

    Raises:
        ValueError: if a non-blank line contains no ':'.
    """
    float_chars = set("0123456789.e+- ")
    calib = {}
    with open(path, 'r') as f:
        for line in f:
            if not line.strip():
                # tolerate empty lines (e.g. a trailing newline at the end of the file)
                continue
            key, value = line.split(':', 1)
            value = value.strip()
            calib[key] = value
            if float_chars.issuperset(value):
                # try to cast to float array
                try:
                    calib[key] = np.array(list(map(float, value.split(' '))))
                except ValueError:
                    # casting error: calib[key] already eq. value, so pass
                    pass

    return calib


def sub2ind(matrixSize, rowSub, colSub):
    """Return ``rowSub * (n - 1) + colSub - 1`` where ``n`` is ``matrixSize[1]``.

    Works element-wise when the subscripts are arrays.
    NOTE(review): this is not the row-major index ``row * n + col``; e.g.
    (r, n - 1) and (r + 1, 0) map to the same value - confirm it is intended.
    """
    _, n_cols = matrixSize
    row_stride = n_cols - 1
    return rowSub * row_stride + colSub - 1


def generate_depth_map(calib_dir, velo_filename, cam=2, vel_depth=False):
    """Project a velodyne scan into the image plane of camera ``cam``.

    Args:
        calib_dir: directory holding ``calib_cam_to_cam.txt`` and
            ``calib_velo_to_cam.txt``.
        velo_filename: path of the velodyne ``.bin`` scan.
        cam: camera number, selects ``P_rect_0<cam>``.
        vel_depth: when True the stored depth is the velodyne forward
            coordinate instead of the projected z value.

    Returns:
        2-D array whose shape is ``S_rect_02`` reversed; pixels hit by no point
        are 0, pixels hit by several points keep the smallest depth.
    """
    # load calibration files
    cam2cam = read_calib_file(os.path.join(calib_dir, 'calib_cam_to_cam.txt'))
    velo2cam = read_calib_file(os.path.join(calib_dir, 'calib_velo_to_cam.txt'))
    velo2cam = np.hstack((velo2cam['R'].reshape(3, 3), velo2cam['T'][..., np.newaxis]))
    velo2cam = np.vstack((velo2cam, np.array([0, 0, 0, 1.0])))

    # get image shape (the S_rect_02 entry is used whatever the value of cam)
    im_shape = cam2cam["S_rect_02"][::-1].astype(np.int32)

    # compute projection matrix velodyne->image plane
    R_cam2rect = np.eye(4)
    R_cam2rect[:3, :3] = cam2cam['R_rect_00'].reshape(3, 3)
    P_rect = cam2cam['P_rect_0'+str(cam)].reshape(3, 4)
    P_velo2im = np.dot(np.dot(P_rect, R_cam2rect), velo2cam)

    # load velodyne points and remove all behind image plane (approximation)
    # each row of the velodyne data is forward, left, up, reflectance
    velo = load_velodyne_points(velo_filename)
    velo = velo[velo[:, 0] >= 0, :]

    # project the points to the camera
    velo_pts_im = np.dot(P_velo2im, velo.T).T
    velo_pts_im[:, :2] = velo_pts_im[:, :2] / velo_pts_im[:, 2][..., np.newaxis]

    if vel_depth:
        velo_pts_im[:, 2] = velo[:, 0]

    # check if in bounds
    # use minus 1 to get the exact same value as KITTI matlab code
    velo_pts_im[:, 0] = np.round(velo_pts_im[:, 0]) - 1
    velo_pts_im[:, 1] = np.round(velo_pts_im[:, 1]) - 1
    val_inds = (velo_pts_im[:, 0] >= 0) & (velo_pts_im[:, 1] >= 0)
    val_inds = val_inds & (velo_pts_im[:, 0] < im_shape[1]) & (velo_pts_im[:, 1] < im_shape[0])
    velo_pts_im = velo_pts_im[val_inds, :]

    # project to image
    # (the builtin int replaces the np.int alias, which newer NumPy no longer provides)
    depth = np.zeros((im_shape[:2]))
    rows = velo_pts_im[:, 1].astype(int)
    cols = velo_pts_im[:, 0].astype(int)
    depth[rows, cols] = velo_pts_im[:, 2]

    # find the duplicate points and choose the closest depth
    inds = sub2ind(depth.shape, velo_pts_im[:, 1], velo_pts_im[:, 0])
    dupe_inds = [item for item, count in Counter(inds).items() if count > 1]
    for dd in dupe_inds:
        pts = np.where(inds == dd)[0]
        x_loc = int(velo_pts_im[pts[0], 0])
        y_loc = int(velo_pts_im[pts[0], 1])
        depth[y_loc, x_loc] = velo_pts_im[pts, 2].min()
    depth[depth < 0] = 0

    return depth

In [6]:
dataset = MainDataset # Alias of the MainDataset class itself (not an instance)

In [10]:
# `dataset` is an alias of the MainDataset class; the instance is created right below.
dataset = MainDataset
train_dataset = dataset(
    data_path,
    train_filenames,
    height,
    width,
    frame_ids,
    num_scales,
    is_train=True,
    img_ext=img_ext,
)
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers,
    pin_memory=True,
    drop_last=True,
)

In [11]:
# Fetch item 0 through __getitem__ and echo the returned dict as the cell output.
train_dataset[0]

{('K', 0): tensor([[371.2000,   0.0000, 320.0000,   0.0000],
         [  0.0000, 368.6400,  96.0000,   0.0000],
         [  0.0000,   0.0000,   1.0000,   0.0000],
         [  0.0000,   0.0000,   0.0000,   1.0000]]),
 ('inv_K', 0): tensor([[ 2.6940e-03, -2.9104e-11, -8.6207e-01,  0.0000e+00],
         [-2.9104e-11,  2.7127e-03, -2.6042e-01,  0.0000e+00],
         [-1.1642e-10,  5.8208e-11,  1.0000e+00,  0.0000e+00],
         [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  1.0000e+00]]),
 ('K', 1): tensor([[185.6000,   0.0000, 160.0000,   0.0000],
         [  0.0000, 184.3200,  48.0000,   0.0000],
         [  0.0000,   0.0000,   1.0000,   0.0000],
         [  0.0000,   0.0000,   0.0000,   1.0000]]),
 ('inv_K', 1): tensor([[ 5.3879e-03, -5.8208e-11, -8.6207e-01,  0.0000e+00],
         [ 0.0000e+00,  5.4253e-03, -2.6042e-01,  0.0000e+00],
         [ 2.3283e-10,  5.8208e-11,  1.0000e+00,  0.0000e+00],
         [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  1.0000e+00]]),
 ('K', 2): tensor([[92.8000,  0.

In [102]:
# Echo `folder`. NOTE(review): no visible cell assigns `folder` at module level, so this relies on kernel state from a cell not shown here.
folder

'2011_09_26/2011_09_26_drive_0022_sync'

In [103]:
# Echo `frame_index`. NOTE(review): no visible cell assigns `frame_index` at module level, so this relies on kernel state from a cell not shown here.
frame_index

473

In [104]:
# Echo `side`. NOTE(review): no visible cell assigns `side` at module level, so this relies on kernel state from a cell not shown here.
side

'r'

In [117]:
# Full-resolution image size; the module-level get_depth passes it reversed to skimage.transform.resize as the output shape.
full_res_shape = (1242, 375)

In [118]:
def load_velodyne_points(filename):
    """Load a velodyne ``.bin`` file as an (N, 4) float32 array of homogeneous points.

    The file is read as a flat float32 buffer and viewed four values per row;
    the fourth column read from the file is overwritten with 1.0.
    """
    flat = np.fromfile(filename, dtype=np.float32)
    points = flat.reshape((-1, 4))
    points[:, 3] = 1.0  # homogeneous coordinate
    return points


def read_calib_file(path):
    """Parse a ``key: value`` calibration file into a dict.

    Each line is split at its first ':'. Values made only of digits, '.', 'e',
    '+', '-' and spaces are converted to float arrays; every other value (or a
    value whose conversion fails) is kept as the stripped string. Blank lines
    are skipped instead of aborting the parse.

    Raises:
        ValueError: if a non-blank line contains no ':'.
    """
    float_chars = set("0123456789.e+- ")
    calib = {}
    with open(path, 'r') as f:
        for line in f:
            if not line.strip():
                # tolerate empty lines (e.g. a trailing newline at the end of the file)
                continue
            key, value = line.split(':', 1)
            value = value.strip()
            calib[key] = value
            if float_chars.issuperset(value):
                # try to cast to float array
                try:
                    calib[key] = np.array(list(map(float, value.split(' '))))
                except ValueError:
                    # casting error: calib[key] already eq. value, so pass
                    pass

    return calib


def sub2ind(matrixSize, rowSub, colSub):
    """Return ``rowSub * (n - 1) + colSub - 1`` where ``n`` is ``matrixSize[1]``.

    Works element-wise when the subscripts are arrays.
    NOTE(review): this is not the row-major index ``row * n + col``; e.g.
    (r, n - 1) and (r + 1, 0) map to the same value - confirm it is intended.
    """
    _, n_cols = matrixSize
    row_stride = n_cols - 1
    return rowSub * row_stride + colSub - 1


def generate_depth_map(calib_dir, velo_filename, cam=2, vel_depth=False):
    """Project a velodyne scan into the image plane of camera ``cam``.

    Args:
        calib_dir: directory holding ``calib_cam_to_cam.txt`` and
            ``calib_velo_to_cam.txt``.
        velo_filename: path of the velodyne ``.bin`` scan.
        cam: camera number, selects ``P_rect_0<cam>``.
        vel_depth: when True the stored depth is the velodyne forward
            coordinate instead of the projected z value.

    Returns:
        2-D array whose shape is ``S_rect_02`` reversed; pixels hit by no point
        are 0, pixels hit by several points keep the smallest depth.
    """
    # load calibration files
    cam2cam = read_calib_file(os.path.join(calib_dir, 'calib_cam_to_cam.txt'))
    velo2cam = read_calib_file(os.path.join(calib_dir, 'calib_velo_to_cam.txt'))
    velo2cam = np.hstack((velo2cam['R'].reshape(3, 3), velo2cam['T'][..., np.newaxis]))
    velo2cam = np.vstack((velo2cam, np.array([0, 0, 0, 1.0])))

    # get image shape (the S_rect_02 entry is used whatever the value of cam)
    im_shape = cam2cam["S_rect_02"][::-1].astype(np.int32)

    # compute projection matrix velodyne->image plane
    R_cam2rect = np.eye(4)
    R_cam2rect[:3, :3] = cam2cam['R_rect_00'].reshape(3, 3)
    P_rect = cam2cam['P_rect_0'+str(cam)].reshape(3, 4)
    P_velo2im = np.dot(np.dot(P_rect, R_cam2rect), velo2cam)

    # load velodyne points and remove all behind image plane (approximation)
    # each row of the velodyne data is forward, left, up, reflectance
    velo = load_velodyne_points(velo_filename)
    velo = velo[velo[:, 0] >= 0, :]

    # project the points to the camera
    velo_pts_im = np.dot(P_velo2im, velo.T).T
    velo_pts_im[:, :2] = velo_pts_im[:, :2] / velo_pts_im[:, 2][..., np.newaxis]

    if vel_depth:
        velo_pts_im[:, 2] = velo[:, 0]

    # check if in bounds
    # use minus 1 to get the exact same value as KITTI matlab code
    velo_pts_im[:, 0] = np.round(velo_pts_im[:, 0]) - 1
    velo_pts_im[:, 1] = np.round(velo_pts_im[:, 1]) - 1
    val_inds = (velo_pts_im[:, 0] >= 0) & (velo_pts_im[:, 1] >= 0)
    val_inds = val_inds & (velo_pts_im[:, 0] < im_shape[1]) & (velo_pts_im[:, 1] < im_shape[0])
    velo_pts_im = velo_pts_im[val_inds, :]

    # project to image
    # (the builtin int replaces the np.int alias, which newer NumPy no longer provides)
    depth = np.zeros((im_shape[:2]))
    rows = velo_pts_im[:, 1].astype(int)
    cols = velo_pts_im[:, 0].astype(int)
    depth[rows, cols] = velo_pts_im[:, 2]

    # find the duplicate points and choose the closest depth
    inds = sub2ind(depth.shape, velo_pts_im[:, 1], velo_pts_im[:, 0])
    dupe_inds = [item for item, count in Counter(inds).items() if count > 1]
    for dd in dupe_inds:
        pts = np.where(inds == dd)[0]
        x_loc = int(velo_pts_im[pts[0], 0])
        y_loc = int(velo_pts_im[pts[0], 1])
        depth[y_loc, x_loc] = velo_pts_im[pts, 2].min()
    depth[depth < 0] = 0

    return depth

In [127]:
def get_depth(folder, frame_index, side, verbose=True):
    """Build the ground-truth depth map for one frame from its velodyne scan.

    Relies on the notebook-level globals ``data_path``, ``side_map`` and
    ``full_res_shape`` as well as ``generate_depth_map``.

    Args:
        folder: Sequence folder relative to ``data_path``. Its first
            ``/``-separated component is used as the calibration directory.
        frame_index: Frame number (anything ``int()`` accepts); it is
            zero-padded to 10 digits to name the ``.bin`` velodyne file.
        side: Key into ``side_map`` selecting the camera handed to
            ``generate_depth_map``.
        verbose: When True (the default, which keeps the original output),
            print the resolved paths and the depth-map shapes.

    Returns:
        The depth map resized to ``full_res_shape[::-1]`` using
        nearest-neighbour interpolation (``order=0``) with the original
        value range preserved.
    """
    # Calibration files live in the first path component of `folder`.
    calib_path = os.path.join(data_path, folder.split("/")[0])
    velo_filename = os.path.join(
        data_path,
        folder,
        "velodyne_points/data/{:010d}.bin".format(int(frame_index)),
    )

    if verbose:
        print(data_path)
        print(folder)
        print(calib_path)
        print(velo_filename)

    depth_gt = generate_depth_map(calib_path, velo_filename, side_map[side])
    if verbose:
        print(depth_gt.shape)

    # order=0 avoids blending valid depths with the zero-filled gaps of the
    # sparse map; preserve_range keeps the depth values unscaled.
    depth_gt = skimage.transform.resize(
        depth_gt, full_res_shape[::-1], order=0, preserve_range=True, mode='constant')
    if verbose:
        print(depth_gt.shape)

    return depth_gt

In [128]:
# Ground-truth depth map for the sample (folder, frame_index, side) triple.
d_gt = get_depth(folder=folder, frame_index=frame_index, side=side)

/home/ubuntu/monodepth2/kitti_data
2011_09_26/2011_09_26_drive_0022_sync
/home/ubuntu/monodepth2/kitti_data/2011_09_26
/home/ubuntu/monodepth2/kitti_data/2011_09_26/2011_09_26_drive_0022_sync/velodyne_points/data/0000000473.bin
(375, 1242)
(375, 1242)


In [95]:
dataset = MainDataset # Alias for the MainDataset class itself, not an instance of it

In [96]:
# Training-split dataset (is_train=True) and a shuffling loader over it.
train_dataset = dataset(
    data_path,
    train_filenames,
    height,
    width,
    frame_ids,
    num_scales,
    is_train=True,
    img_ext=img_ext,
)
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers,
    pin_memory=True,
    drop_last=True,
)

In [97]:
# First training sample: `i` is the inputs dict (colour frames plus K / inv_K
# tensors) and `c` is the callable later passed to preprocess() as color_aug.
i,c = train_dataset[0]

In [98]:
i

{('color',
  0,
  -1): <PIL.Image.Image image mode=RGB size=1242x375 at 0x7F1D4E8F4A58>,
 ('color',
  -1,
  -1): <PIL.Image.Image image mode=RGB size=1242x375 at 0x7F1D4E8F40F0>,
 ('color',
  1,
  -1): <PIL.Image.Image image mode=RGB size=1242x375 at 0x7F1D4E8F4320>,
 ('K', 0): tensor([[371.2000,   0.0000, 320.0000,   0.0000],
         [  0.0000, 368.6400,  96.0000,   0.0000],
         [  0.0000,   0.0000,   1.0000,   0.0000],
         [  0.0000,   0.0000,   0.0000,   1.0000]]),
 ('inv_K', 0): tensor([[ 2.6940e-03, -2.9104e-11, -8.6207e-01,  0.0000e+00],
         [-2.9104e-11,  2.7127e-03, -2.6042e-01,  0.0000e+00],
         [-1.1642e-10,  5.8208e-11,  1.0000e+00,  0.0000e+00],
         [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  1.0000e+00]]),
 ('K', 1): tensor([[185.6000,   0.0000, 160.0000,   0.0000],
         [  0.0000, 184.3200,  48.0000,   0.0000],
         [  0.0000,   0.0000,   1.0000,   0.0000],
         [  0.0000,   0.0000,   0.0000,   1.0000]]),
 ('inv_K', 1): tensor([[ 5.3879e

In [99]:
c

<function __main__.MainDataset.__getitem__.<locals>.<lambda>(x)>

In [100]:
to_tensor = transforms.ToTensor()


def preprocess(inputs, color_aug, verbose=True):
    """Resize colour images to the required scales and augment if required.

    The ``color_aug`` callable is created in advance and applied to every
    colour image of this item, so that all images fed to the pose network
    receive the same augmentation.

    ``inputs`` is modified in place and nothing is returned. Relies on the
    notebook-level globals ``num_scales``, ``resize`` and ``to_tensor``.

    Args:
        inputs: Dict keyed by tuples. Every key containing ``"color"`` is
            unpacked as ``(name, frame_id, scale)``; other keys are left
            untouched.
        color_aug: Callable applied to each colour image before it is
            converted with ``to_tensor``; results are stored under
            ``(name + "_aug", frame_id, scale)``.
        verbose: When True (the default, which keeps the original output),
            print the dict and every visited key/value at each stage.
    """
    if verbose:
        print('ini', inputs)
        print('list', list(inputs))

    # Iterate over a snapshot of the keys: the loop body adds new entries.
    for k in list(inputs):
        if verbose:
            print('k', k)
            print('f', inputs[k])
        if "color" in k:
            n, im, _ = k
            # Each scale is produced from the previous one, starting at -1.
            for scale in range(num_scales):
                inputs[(n, im, scale)] = resize[scale](inputs[(n, im, scale - 1)])
    if verbose:
        print('aft', inputs)

    # Second pass over a fresh snapshot, which now includes the resized images.
    for k in list(inputs):
        if "color" in k:
            f = inputs[k]
            n, im, i = k
            inputs[(n, im, i)] = to_tensor(f)
            inputs[(n + "_aug", im, i)] = to_tensor(color_aug(f))

    if verbose:
        print('aft_aug', inputs)

In [101]:
# Resize, tensorise and augment the sample dict in place.
preprocess(inputs=i, color_aug=c)

ini {('color', 0, -1): <PIL.Image.Image image mode=RGB size=1242x375 at 0x7F1D4E8F4A58>, ('color', -1, -1): <PIL.Image.Image image mode=RGB size=1242x375 at 0x7F1D4E8F40F0>, ('color', 1, -1): <PIL.Image.Image image mode=RGB size=1242x375 at 0x7F1D4E8F4320>, ('K', 0): tensor([[371.2000,   0.0000, 320.0000,   0.0000],
        [  0.0000, 368.6400,  96.0000,   0.0000],
        [  0.0000,   0.0000,   1.0000,   0.0000],
        [  0.0000,   0.0000,   0.0000,   1.0000]]), ('inv_K', 0): tensor([[ 2.6940e-03, -2.9104e-11, -8.6207e-01,  0.0000e+00],
        [-2.9104e-11,  2.7127e-03, -2.6042e-01,  0.0000e+00],
        [-1.1642e-10,  5.8208e-11,  1.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  1.0000e+00]]), ('K', 1): tensor([[185.6000,   0.0000, 160.0000,   0.0000],
        [  0.0000, 184.3200,  48.0000,   0.0000],
        [  0.0000,   0.0000,   1.0000,   0.0000],
        [  0.0000,   0.0000,   0.0000,   1.0000]]), ('inv_K', 1): tensor([[ 5.3879e-03, -5.8208e-11, -8.62

In [35]:
# Look up the opposite camera side for "r" in the left/right swap table.
other_side = dict(r="l", l="r")["r"]

In [36]:
other_side

'l'

In [13]:
# Image.ANTIALIAS is a deprecated alias of Image.LANCZOS and was removed in
# Pillow 10; LANCZOS is the same filter and has existed since Pillow 2.7.
interp = Image.LANCZOS

# One Resize transform per scale: scale k shrinks (height, width) by 2**k.
# A comprehension is used so no loop variable leaks into the notebook
# namespace (the name `i` is also used for the sample dict elsewhere).
resize = {
    scale: transforms.Resize(
        (height // (2 ** scale), width // (2 ** scale)),
        interpolation=interp,
    )
    for scale in range(4)
}

In [48]:
# (min, max) ranges for the brightness, contrast and saturation factors.
brig, con, sat = (0.8, 1.2), (0.8, 1.2), (0.8, 1.2)
# (min, max) range for the hue shift.
hue = (-0.1, 0.1)

# Build the colour-jitter transform from those ranges.
color_aug = transforms.ColorJitter.get_params(
    brightness=brig,
    contrast=con,
    saturation=sat,
    hue=hue,
)

In [49]:
color_aug

Compose(
    Lambda()
    Lambda()
    Lambda()
    Lambda()
)

In [14]:
resize

{0: Resize(size=(192, 640), interpolation=PIL.Image.LANCZOS),
 1: Resize(size=(96, 320), interpolation=PIL.Image.LANCZOS),
 2: Resize(size=(48, 160), interpolation=PIL.Image.LANCZOS),
 3: Resize(size=(24, 80), interpolation=PIL.Image.LANCZOS)}

In [34]:
len(filenames)

39810

In [41]:
# Sample frame used to exercise the path helpers.
folder, frame_index, side = (
    "2011_09_26/2011_09_26_drive_0022_sync",
    473,
    "r",
)

# Maps a side code ("2"/"3" or "l"/"r") to the camera number used in
# the image_0{N} directory name.
side_map = {
    "2": 2,
    "3": 3,
    "l": 2,
    "r": 3,
}

# File extension of the colour images on disk.
img_ext = ".jpg"

def get_image_path(folder, frame_index, side):
    """Return the path of the colour image for one frame.

    Relies on the notebook-level globals ``data_path``, ``side_map`` and
    ``img_ext``.

    Args:
        folder: Sequence folder relative to ``data_path``.
        frame_index: Frame number as an int or a numeric string (split-file
            entries are strings); it is zero-padded to 10 digits.
        side: Key into ``side_map`` selecting the ``image_0{N}`` directory.

    Returns:
        ``<data_path>/<folder>/image_0<N>/data/<frame:010d><img_ext>``.
    """
    # Coerce to int so string frame indices work with the `d` format code,
    # matching what get_depth() does for the velodyne file name.
    f_str = "{:010d}{}".format(int(frame_index), img_ext)
    image_path = os.path.join(
        data_path, folder, "image_0{}/data".format(side_map[side]), f_str)

    return image_path

In [42]:
# Show the resolved image path for the sample frame.
get_image_path(folder=folder, frame_index=frame_index, side=side)

'/home/ubuntu/monodepth2/kitti_data/2011_09_26/2011_09_26_drive_0022_sync/image_03/data/0000000473.jpg'

In [32]:
# Use the training split as the working file list.
# NOTE(review): check_depth() in the next cell takes no arguments, so it
# presumably reads this global — confirm against its definition.
filenames = train_filenames

In [33]:
# Run the depth check (check_depth is defined outside this section). It takes
# no arguments, so it presumably works from notebook globals — confirm.
check_depth()

line ['2011_09_26/2011_09_26_drive_0022_sync', '473', 'r']
scene 2011_09_26/2011_09_26_drive_0022_sync
frame 473
velo /home/ubuntu/monodepth2/kitti_data/2011_09_26/2011_09_26_drive_0022_sync/velodyne_points/data/0000000473.bin


True