<a href="https://colab.research.google.com/github/RugvedKatole/Learning-Single-Camera-Depth-Estimation-using-Dual-Pixels/blob/main/Dual_Pixel_Net.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Dual Pixel Net implementation
Link to Paper: [Learning Single Camera Depth Estimation using Dual Pixels](https://arxiv.org/abs/1904.05822)


Import libraries 

In [20]:
import keras
import os
import copy
import json
import numpy as np
from matplotlib import pyplot as plt
import tensorflow as tf
from scipy.interpolate import interp2d
import numpy.random as random
from tensorflow.keras.layers import Input, Conv2D ,Conv2DTranspose, MaxPooling2D, concatenate, Add, Dense, Dropout, Activation, Flatten, BatchNormalization, SeparableConv2D, LeakyReLU
from tensorflow.keras.optimizers import Adam

The paper uses a U-Net architecture with residual blocks.
A U-Net consists of an encoder-decoder network: the encoder downsamples the input images, while the decoder upsamples the downsampled feature maps.

In [44]:
import os
import sys

import tensorflow as tf
from tensorflow.keras import layers

import pandas as pd
import numpy as np
import cv2
import matplotlib.pyplot as plt

# Root folders holding the input images and the matching depth maps.
path = "../input/google-dual-pixel-test/test/scaled_images"
path1 = "../input/google-dual-pixel-test/test/merged_depth"

# Recursively collect every file below each root, sorted by full path.
filelist = sorted(
    os.path.join(root, file)
    for root, dirs, files in os.walk(path)
    for file in files
)
filelist1 = sorted(
    os.path.join(root, file)
    for root, dirs, files in os.walk(path1)
    for file in files
)

# The i-th .jpg is paired with the i-th .png purely by sorted position, so
# both directory trees are expected to sort in the same order.
data = {
    "image": [x for x in filelist if x.endswith(".jpg")],
    "depth": [x for x in filelist1 if x.endswith(".png")],
}

# Shuffle the rows once with a fixed seed so later splits are reproducible.
df = pd.DataFrame(data).sample(frac=1, random_state=42)



class DataGenerator(tf.keras.utils.Sequence):
    """Keras ``Sequence`` that loads (image, depth map) batches from disk.

    ``data`` is a DataFrame with an "image" and a "depth" column containing
    file paths. Every batch is a pair ``(x, y)`` where ``x`` has shape
    ``(n, 1008, 756, n_channels)`` and ``y`` has shape
    ``(n, 504, 378, n_channels)``. ``n`` equals ``batch_size`` except for the
    final batch, which may be shorter.
    """

    def __init__(self, data, batch_size=2, dim=[1008, 756], n_channels=3, shuffle=True):
        """
        Initialization

        Arguments:
            data -- DataFrame with "image" and "depth" path columns
            batch_size -- Maximum number of samples per batch
            dim -- Stored on the instance; NOTE(review): not consumed by the
                   loading code, whose buffer sizes are hard-coded
            n_channels -- Channel count of both the image and depth buffers
            shuffle -- Reshuffle the sample order at the end of every epoch
        """
        super().__init__()
        self.data = data
        # Index labels of `data`; `self.index` (see on_epoch_end) holds
        # positions into this list.
        self.indices = self.data.index.tolist()
        self.dim = dim
        self.n_channels = n_channels
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.min_depth = 0.1
        self.on_epoch_end()

    def __len__(self):
        """Number of batches per epoch (the last one may be partial)."""
        return int(np.ceil(len(self.data) / self.batch_size))

    def __getitem__(self, index):
        """
        Generate one batch of data

        Arguments:
            index -- Batch number in [0, len(self))
        Returns:
            x, y -- Image batch and depth batch
        Raises:
            IndexError -- If ``index`` is outside [0, len(self))
        """
        if not 0 <= index < len(self):
            raise IndexError(
                "batch index %d out of range for %d batches" % (index, len(self))
            )

        # Clamp the slice instead of overwriting self.batch_size: the final
        # batch may be short, but the configured batch size must survive so
        # __len__ and subsequent epochs stay correct.
        start = index * self.batch_size
        stop = min(start + self.batch_size, len(self.indices))

        # Go through the (possibly shuffled) permutation so that
        # shuffle=True actually changes which samples share a batch.
        positions = self.index[start:stop]
        batch = [self.indices[k] for k in positions]

        return self.data_generation(batch)

    def on_epoch_end(self):
        """
        Updates indexes after each epoch
        """
        self.index = np.arange(len(self.indices))
        if self.shuffle:
            np.random.shuffle(self.index)

    def load(self, image_path, depth_map):
        """
        Load input and target image.

        Arguments:
            image_path -- Path of the input image
            depth_map -- Path of the depth map image
        Returns:
            image_, depth_img -- float32 tensors; the depth map is resized
                                 to 504 rows x 378 columns
        Raises:
            FileNotFoundError -- If OpenCV cannot read either file
        """
        image_ = cv2.imread(image_path)
        if image_ is None:
            # cv2.imread signals failure by returning None rather than raising.
            raise FileNotFoundError("Could not read image: %s" % image_path)
        image_ = cv2.cvtColor(image_, cv2.COLOR_BGR2RGB)
        # No resize is applied here; assumes images on disk are already
        # 1008x756 to fit the batch buffer -- TODO confirm.
        image_ = tf.image.convert_image_dtype(image_, tf.float32)

        depth_img = cv2.imread(depth_map)
        if depth_img is None:
            raise FileNotFoundError("Could not read depth map: %s" % depth_map)
        depth_img = cv2.cvtColor(depth_img, cv2.COLOR_BGR2RGB)
        # cv2.resize takes (width, height): result is 504 rows x 378 columns.
        depth_img = cv2.resize(depth_img, (378, 504))
        depth_img = tf.image.convert_image_dtype(depth_img, tf.float32)

        return image_, depth_img

    def data_generation(self, batch):
        """
        Load every sample of a batch into dense arrays.

        Arguments:
            batch -- Index labels of the rows of ``self.data`` to load
        Returns:
            x, y -- Arrays of shape (len(batch), 1008, 756, n_channels) and
                    (len(batch), 504, 378, n_channels)
        """
        # Size the buffers by the real batch length so a short final batch
        # carries no uninitialised rows.
        x = np.empty((len(batch), 1008, 756, self.n_channels))
        y = np.empty((len(batch), 504, 378, self.n_channels))

        for i, batch_id in enumerate(batch):
            x[i,], y[i,] = self.load(
                self.data["image"][batch_id],
                self.data["depth"][batch_id])

        return x, y

    
# The first 2736 shuffled rows are used for training; the rest for validation.
# `drop` must be the boolean True: the previous string "true" only worked
# because any non-empty string is truthy.
train_loader = DataGenerator(data=df[:2736].reset_index(drop=True))
validation_loader = DataGenerator(data=df[2736:].reset_index(drop=True))

In [45]:
# Encoder block A
def EncoderA(inputs=None, i_filters=32, o=32, s=2, max_pooling=True):
    """
    Residual encoder block E_a(i, o, s)

    Main path:     BN -> 3x3 conv (i_filters, stride s) -> LeakyReLU
                   BN -> 3x3 separable conv (i_filters) -> LeakyReLU
                   BN -> 1x1 conv (o)                   -> LeakyReLU
    Shortcut path: BN -> 3x3 separable conv (o) -> LeakyReLU -> max pool (s)
    The two paths are summed to form the block output.

    Arguments:
        inputs -- Input tensor
        i_filters -- Filters of the first two convolutions (i in E(i,o,s))
        o -- Filters of the 1x1 convolution and of the shortcut (o in E(i,o,s))
        s -- Stride of the first convolution and of the shortcut pooling
        max_pooling -- Accepted but not read by this block
    Returns:
        next_layer, skip_connection -- Sum of both paths, and the main-path
                                       output before the sum
    """
    # --- main path ---------------------------------------------------------
    main = BatchNormalization()(inputs)
    main = Conv2D(filters=i_filters,
                  kernel_size=(3, 3),
                  strides=(s, s),
                  padding='same')(main)
    main = LeakyReLU(alpha=0.05)(main)

    main = BatchNormalization()(main)
    main = SeparableConv2D(filters=i_filters,
                           kernel_size=(3, 3),
                           padding='same')(main)
    main = LeakyReLU(alpha=0.05)(main)

    main = BatchNormalization()(main)
    main = Conv2D(filters=o, kernel_size=(1, 1), padding='same')(main)
    main = LeakyReLU(alpha=0.05)(main)

    # --- shortcut path: match channels (o) and spatial size (pool by s) ----
    shortcut = BatchNormalization()(inputs)
    shortcut = SeparableConv2D(filters=o,
                               kernel_size=(3, 3),
                               padding='same')(shortcut)
    shortcut = LeakyReLU(alpha=0.05)(shortcut)
    shortcut = MaxPooling2D(pool_size=(s, s),
                            strides=(s, s),
                            padding='same')(shortcut)

    # Residual sum; the un-summed main path doubles as the skip connection.
    merged = Add()([main, shortcut])

    return merged, main

In [46]:
# Encoder Block B
def EncoderB(inputs=None, o=32, s=2, max_pooling=True):
    """
    Stem encoder block E_b(o, s)

    Applies BN -> 7x7 conv (o filters, stride s) -> LeakyReLU and
    concatenates the result, along the channel axis, with the max-pooled
    input.

    Arguments:
        inputs -- Input tensor
        o -- Number of filters of the 7x7 convolution (o in E_b(o,s))
        s -- Stride of the convolution and of the input pooling
        max_pooling -- Accepted but not read by this block
    Returns:
        next_layer, skip_connection -- Concatenation of the conv output with
                                       the pooled input, and the conv output
                                       alone
    """
    # first Layer of Encoder Block
    conv = BatchNormalization()(inputs)
    conv = Conv2D(o, # Number of filters i.e o in paper (E_b(o,s))
                  (7,7),   # 7x7 Kernel size
                  padding='same',
                  kernel_initializer='he_normal',
                  strides=(s,s))(conv)    # s from E_b(o,s)
    conv = LeakyReLU(alpha=0.05)(conv)

    # The conv output is concatenated with the max-pooled input. The pooling
    # uses 'same' padding like the strided convolution so both branches
    # produce ceil(size / s) outputs; with the default 'valid' padding the
    # concatenate fails whenever a spatial size is not divisible by s.
    Pooled_input = MaxPooling2D(pool_size=(s,s), strides=(s,s), padding='same')(inputs)
    next_layer = concatenate([conv,Pooled_input],axis = 3)
    skip_connection = conv

    return next_layer, skip_connection

Now we create a Decoder block for our Network

In [47]:
# Decoder Block
def Decoder(expansive_input, contractive_input, i_filters = 32, o = 32):
    """
    Decoder block D(i, o)

    Expansive path: BN -> 4x4 transpose conv (stride 2) -> LeakyReLU
                    BN -> 3x3 separable conv            -> LeakyReLU
                    BN -> 1x1 conv                      -> LeakyReLU
                    BN -> 3x3 separable conv (stride 2) -> LeakyReLU
                    BN
    The skip tensor goes through a 3x3 separable conv, is added to the
    expansive path, and the sum is passed through
    LeakyReLU -> BN -> 1x1 conv (o filters) -> LeakyReLU.

    NOTE(review): the stride-2 transpose convolution is followed by a
    stride-2 separable convolution (both 'same' padded), so the output
    presumably keeps the spatial size of `expansive_input` rather than
    doubling it -- confirm this is intended.

    Arguments:
        expansive_input -- Input tensor from previous layer
        contractive_input -- Input tensor from previous skip layer
        i_filters -- Number of filters of every layer except the last
        o -- Number of filters of the final 1x1 convolution
    Returns:
        next_layer -- Tensor output
    """
    # 1) transpose convolution on the incoming feature map
    x = BatchNormalization()(expansive_input)
    x = Conv2DTranspose(filters=i_filters,
                        kernel_size=(4, 4),
                        strides=(2, 2),
                        padding='same')(x)
    x = LeakyReLU(alpha=0.05)(x)

    # 2) 3x3 depthwise-separable convolution
    x = BatchNormalization()(x)
    x = SeparableConv2D(filters=i_filters,
                        kernel_size=(3, 3),
                        padding='same')(x)
    x = LeakyReLU(alpha=0.05)(x)

    # 3) 1x1 convolution
    x = BatchNormalization()(x)
    x = Conv2D(filters=i_filters, kernel_size=(1, 1), padding='same')(x)
    x = LeakyReLU(alpha=0.05)(x)

    # 4) strided 3x3 depthwise-separable convolution
    x = BatchNormalization()(x)
    x = SeparableConv2D(filters=i_filters,
                        kernel_size=(3, 3),
                        strides=(2, 2),
                        padding='same')(x)
    x = LeakyReLU(alpha=0.05)(x)

    # 5) normalise, then bring the skip tensor to i_filters channels
    x = BatchNormalization()(x)
    skip = SeparableConv2D(filters=i_filters,
                           kernel_size=(3, 3),
                           padding='same')(contractive_input)

    # 6) merge both paths
    merged = Add()([x, skip])
    merged = LeakyReLU(alpha=0.05)(merged)

    # 7) final 1x1 projection to o channels
    merged = BatchNormalization()(merged)
    merged = Conv2D(filters=o, kernel_size=(1, 1), padding='same')(merged)
    merged = LeakyReLU(alpha=0.05)(merged)

    return merged

Now that we have completed the required encoder and decoder blocks, we can create our model architecture.

In [48]:
def Unet_model(input_size=(1024,1024,1)):
  """
    Unet model

    Builds the encoder (one EncoderB stem followed by EncoderA stages) and
    four Decoder blocks, and wraps them in a Keras model.

    NOTE(review): every BlockNE_a name is assigned three times and each call
    reads the *previous stage's* output, not the result of the call above
    it. Only the last (stride 1) call of each stage therefore reaches the
    model; the earlier calls build layers that are discarded. Block2E_a also
    reads Block1E_b[0], so Block1E_a contributes only its skip connection.
    Chaining the calls would change every feature-map size and requires
    matching changes to Decoder, so the wiring is left as it was -- confirm
    the intended topology.

    Arguments:
        input_size -- Input shape
    Returns:
        model -- tf.keras.Model
    """
    #Encoding
  inputs = Input(input_size)
  Block1E_b = EncoderB(inputs,8,2)
  Block1E_a = EncoderA(Block1E_b[0],11,11,1)  # E^1_a

  Block2E_a = EncoderA(Block1E_b[0],16,32,2)
  Block2E_a = EncoderA(Block1E_b[0],16,32,1)
  Block2E_a = EncoderA(Block1E_b[0],16,32,1) # E^2_a

  Block3E_a = EncoderA(Block2E_a[0],16,64,2)
  Block3E_a = EncoderA(Block2E_a[0],16,64,1)
  Block3E_a = EncoderA(Block2E_a[0],16,64,1) #E^3_a

  Block4E_a = EncoderA(Block3E_a[0],32,128,2)
  Block4E_a = EncoderA(Block3E_a[0],32,128,1)
  Block4E_a = EncoderA(Block3E_a[0],32,128,1) #E^4_a

  Block5E_a = EncoderA(Block4E_a[0],32,128,2)
  # Was `32.128,1`: a float typo that passed i_filters=32.128, o=1 and the
  # default stride instead of i_filters=32, o=128, s=1.
  Block5E_a = EncoderA(Block4E_a[0],32,128,1)
  Block5E_a = EncoderA(Block4E_a[0],32,128,1) #E^5_a

  #Decoding

  Block4D = Decoder(Block5E_a[0],Block4E_a[1],32,128) #D^4

  Block3D = Decoder(Block4D,Block3E_a[1],16,64) #D^3

  Block2D = Decoder(Block3D,Block2E_a[1],16,32) #D^2

  Block1D = Decoder(Block2D,Block1E_a[1],8,3) #D^1

  #Creating model
  model = tf.keras.Model(inputs=inputs, outputs=Block1D)

  return model





In [49]:
# Instantiate the network for 1008x756 three-channel inputs and train it with
# a mean-squared-error objective (also reported as a metric).
model = Unet_model((1008, 756, 3))
model.compile(
    optimizer=Adam(beta_2=0.9),
    loss='mean_squared_error',
    metrics=['mse'],
)
model.summary()

Model: "model_6"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_7 (InputLayer)            [(None, 1008, 756, 3 0                                            
__________________________________________________________________________________________________
batch_normalization_462 (BatchN (None, 1008, 756, 3) 12          input_7[0][0]                    
__________________________________________________________________________________________________
conv2d_210 (Conv2D)             (None, 504, 378, 8)  1184        batch_normalization_462[0][0]    
__________________________________________________________________________________________________
leaky_re_lu_462 (LeakyReLU)     (None, 504, 378, 8)  0           conv2d_210[0][0]                 
____________________________________________________________________________________________

In [50]:
# Run the training loop on the first GPU, validating after every epoch.
with tf.device('/device:GPU:0'):
    model.fit(
        train_loader,
        epochs=10,
        validation_data=validation_loader,
        verbose='auto',
    )

(504, 378, 3)
(504, 378, 3)
Epoch 1/10
(504, 378, 3)
(504, 378, 3)
(504, 378, 3)
(504, 378, 3)
   1/1368 [..............................] - ETA: 3:02:34 - loss: 0.7517 - mse: 0.7517(504, 378, 3)
   2/1368 [..............................] - ETA: 44:08 - loss: 0.7326 - mse: 0.7326  (504, 378, 3)
(504, 378, 3)
   3/1368 [..............................] - ETA: 45:55 - loss: 0.6963 - mse: 0.6963(504, 378, 3)
(504, 378, 3)
   4/1368 [..............................] - ETA: 46:06 - loss: 0.6754 - mse: 0.6754(504, 378, 3)
(504, 378, 3)
   5/1368 [..............................] - ETA: 46:15 - loss: 0.6668 - mse: 0.6668(504, 378, 3)
(504, 378, 3)
   6/1368 [..............................] - ETA: 46:24 - loss: 0.6491 - mse: 0.6491(504, 378, 3)
(504, 378, 3)
   7/1368 [..............................] - ETA: 46:10 - loss: 0.6369 - mse: 0.6369(504, 378, 3)


KeyboardInterrupt: 

(504, 378, 3)
(504, 378, 3)
