# Install necessary packages

If you run this notebook on Google Colab, you'll have to install timm.

In [1]:
pip install timm

Collecting timm
  Downloading timm-0.9.7-py3-none-any.whl (2.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m21.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub (from timm)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m33.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors (from timm)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m65.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: safetensors, huggingface-hub, timm
Successfully installed huggingface-hub-0.18.0 safetensors-0.4.0 timm-0.9.7


# Import libraries

In [3]:
import cv2
import torch
import time
import os
import numpy as np
from IPython.display import Video

# Load MiDaS model.

We'll load MiDaS through PyTorch Hub. Feel free to try a different version of the model!

In [4]:
# MiDaS variant used for depth estimation; DPT_Hybrid is a compromise
# between accuracy and inference speed.
model_type = "DPT_Hybrid"

# Prefer the GPU when PyTorch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device_name = "cuda"
else:
    device_name = "cpu"
device = torch.device(device_name)

# Fetch the model through torch hub, then place its weights on the chosen device.
midas = torch.hub.load("intel-isl/MiDaS", model_type)
midas.to(device)
print('MiDaS successfully loaded.')


Downloading: "https://github.com/intel-isl/MiDaS/zipball/master" to /root/.cache/torch/hub/master.zip
  model = create_fn(
Downloading: "https://github.com/isl-org/MiDaS/releases/download/v3/dpt_hybrid_384.pt" to /root/.cache/torch/hub/checkpoints/dpt_hybrid_384.pt
100%|██████████| 470M/470M [00:28<00:00, 17.6MB/s]


MiDaS successfully loaded.


## Load data and set image preprocessor class:

We need to load, resize and normalize our images so that MiDaS can process them correctly. Fortunately, MiDaS provides a ``transforms`` module that does the preprocessing for us. Below you can also see the video whose depth we're going to estimate.

In [5]:
# Fetch the preprocessing pipelines that ship with the MiDaS hub entry.
transforms = torch.hub.load("intel-isl/MiDaS", "transforms")

# Pick the pipeline that matches the model chosen in `model_type`, so that
# switching to the small model does not require editing this cell by hand.
# Every other model type keeps using the DPT pipeline, as before.
if model_type == "MiDaS_small":
    transform = transforms.small_transform
else:
    transform = transforms.dpt_transform

# Load video
VIDEO_PATH = "./data/interior_design.mp4"

# Last expression of the cell: renders the source video inline in the notebook.
Video(VIDEO_PATH, embed = True)

Using cache found in /root/.cache/torch/hub/intel-isl_MiDaS_master


Now we are going to extract all of the images from the video and place them in a folder.

In [6]:

# Output directory for JPEG images
output_directory = "./data/video_into_images"

# Ensure that the output directory exists
os.makedirs(output_directory, exist_ok=True)

# Defined before the video is opened so that later cells still find the name
# (with zero frames) when the video cannot be read.
frame_count = 0

# Open the video file
cap = cv2.VideoCapture(VIDEO_PATH)

try:
    # Get the resolution of the video (named property ids instead of the raw 3 / 4)
    frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

    # Check if the video was opened successfully
    if not cap.isOpened():
        print("Unable to open the video. Check the file path.")
    else:
        while True:
            ret, frame = cap.read()

            # A failed read marks the end of the stream
            if not ret:
                break

            # Save the frame as a JPEG image; the zero-padded index keeps files ordered
            frame_filename = os.path.join(output_directory, f'frame_{frame_count:04d}.jpg')
            if not cv2.imwrite(frame_filename, frame):
                # imwrite reports failure through its return value only; surface it here
                # rather than letting a later cell fail on a missing image.
                raise IOError(f"Could not write frame image: {frame_filename}")
            frame_count += 1

        print(f'{frame_count} images have been extracted and saved in {output_directory}')
finally:
    # Release the video file on every path, including errors
    cap.release()

297 images have been extracted and saved in ./data/video_into_images


# Run the model

Now that we have decomposed the video into images, we can run the model for each image.

In [7]:
# Inference mode
midas.eval()

total_time = []   # per-frame inference + post-processing time, in seconds
depth_video = []  # colour-mapped depth frames, in frame order

for i in range(frame_count):

    # Read the frame back from the directory the extraction cell wrote to
    FRAME_PATH = os.path.join(output_directory, f"frame_{i:04d}.jpg")
    img = cv2.imread(FRAME_PATH, cv2.IMREAD_COLOR)
    if img is None:
        # cv2.imread signals a missing/unreadable file by returning None
        raise FileNotFoundError(f"Could not read frame image: {FRAME_PATH}")

    # OpenCV decodes images as BGR, whereas the MiDaS transforms expect RGB input
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    # Apply transforms:
    input_img = transform(img).to(device)

    # Prediction and preprocess output:
    with torch.no_grad():
        start = time.perf_counter()
        pred = midas(input_img)

        # Resize the prediction back to the resolution of the source frame
        pred = torch.nn.functional.interpolate(pred.unsqueeze(1),
                                               size = img.shape[:2],
                                               mode = "bicubic",
                                               align_corners = False,
                                               ).squeeze()
        # Copying to host memory also waits for pending GPU work, so the
        # measured interval covers the whole forward pass.
        depth_img = pred.cpu().numpy()

        # Rescale to [0, 255] per frame and colour the depth map
        depth_img = cv2.normalize(depth_img, None, 0, 1, norm_type = cv2.NORM_MINMAX,
                                  dtype = cv2.CV_64F)
        depth_img = (depth_img*255).astype(np.uint8)
        depth_img = cv2.applyColorMap(depth_img, cv2.COLORMAP_MAGMA)

        end = time.perf_counter()

    total_time.append(end - start)
    depth_video.append(depth_img)

print(f"Total time: {np.sum(total_time):.2f}s")
print(f"Average time per image: {np.mean(total_time):.2f}s")

Total time: 45.55s
Average time per image: 0.15s


As we can see, the ``DPT_Hybrid`` model needs an average of 0.15s per frame (inference + post-processing), which is approximately 7 frames per second in a real-time depth estimation setting.

# Results

Now let's see the video!

In [8]:
if not depth_video:
    raise ValueError("depth_video is empty: there are no depth frames to write.")

# The writer must be created with the exact size of the frames it receives,
# so take (width, height) from the frames themselves instead of hard-coding it.
out_height, out_width = depth_video[0].shape[:2]
frame_size = (out_width, out_height)

# Reuse the frame rate of the source video so the depth video plays at the same
# speed; fall back to 30 fps when the container does not report one.
source = cv2.VideoCapture(VIDEO_PATH)
source_fps = source.get(cv2.CAP_PROP_FPS)
source.release()
frame_rate = source_fps if source_fps > 0 else 30  # Frames per second

codec = cv2.VideoWriter_fourcc(*'mp4v')  # Codec for MP4 format

output_file = 'output_video.mp4'
out = cv2.VideoWriter(output_file, codec, frame_rate, frame_size)
if not out.isOpened():
    raise RuntimeError(f"Could not open {output_file} for writing.")

try:
    # depth_video holds the colour-mapped depth frames produced by cv2.applyColorMap
    # (BGR channel order, which is what cv2.VideoWriter expects).
    for frame in depth_video:
        out.write(frame)
finally:
    # Finalise the file even if writing a frame fails
    out.release()

In [10]:
# Show the depth video written to output_file; with embed=False the player
# references the file by path instead of inlining the video data in the notebook.
Video(output_file, embed = False)

As we can see, the results on the video are quite detailed for a monocular camera depth estimation!