# Depth-Anything-V2-Mini Baseline

## Test out Depth-Anything-V2 with Test Image
- [Reference: Hugging Face](https://huggingface.co/docs/transformers/main/model_doc/depth_anything_v2)

In [1]:
from transformers import AutoImageProcessor, AutoModelForDepthEstimation
import torch
from PIL import Image

# Load the image processor and model
image_processor = AutoImageProcessor.from_pretrained("depth-anything/Depth-Anything-V2-Small-hf")
model = AutoModelForDepthEstimation.from_pretrained("depth-anything/Depth-Anything-V2-Small-hf")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
image = Image.open("test.jpg")
inputs = image_processor(images=image, return_tensors="pt")

tensor_shape = inputs['pixel_values'].shape
print(f"Shape of the tensor: {tensor_shape}")

Shape of the tensor: torch.Size([1, 3, 518, 672])


In [3]:
with torch.no_grad():
    outputs = model(**inputs)

depth_map = image_processor.post_process_depth_estimation(outputs, target_sizes=[image.size[::-1]])[0]

## Setting up the data loader

In [1]:
from datasets import load_dataset
import numpy as np
import matplotlib.pyplot as plt
import util.image as u

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from Dataset import NYUDepthV2Dataset

In [3]:
from torch.utils.data import DataLoader
from torchvision import transforms
import cv2


transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

dataset = NYUDepthV2Dataset('nyu_depth_v2_labeled.mat', transform=transform)
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)

In [4]:
for images, depths in dataloader:
    print("Images shape:", images.shape)
    print("Depths shape:", depths.shape)
    break

Images shape: torch.Size([16, 3, 240, 320])
Depths shape: torch.Size([16, 240, 320])


## Viewing the NYU v2 Dataset
- Importing Dataset from Hugging Face: [Hugging Face: NYU Depth V2](https://huggingface.co/datasets/sayakpaul/nyu_depth_v2)
- Downloading Dataset: [NYU Depth V2](https://cs.nyu.edu/~fergus/datasets/nyu_depth_v2.html)

In [None]:
images = np.array(dataset.file['images'])
depths = np.array(dataset.file['depths'])

In [None]:
random_indices = np.random.choice(len(images), 9).tolist()

plt.figure(figsize=(15, 9))

for i, idx in enumerate(random_indices):
    ax = plt.subplot(3, 3, i + 1)
    image_viz = u.merge_into_row(images[idx], depths[idx])
    plt.tight_layout()
    plt.imshow(image_viz.astype("uint8"))
    plt.axis("off")

## Using the Model 
- Importing Depth-Anything-V2 Small from Transformer: [Hugging Face: Depth Anything V2](https://huggingface.co/docs/transformers/main/en/model_doc/depth_anything_v2)

In [5]:
import Baseline as b
import torch
from tqdm import tqdm, trange
from torchinfo import summary

In [6]:
model, criterion, optimizer = b.create_baseline()
summary(model, input_size=(16, 3, 240, 320))

Layer (type:depth-idx)                                                 Output Shape              Param #
DepthAnythingBaseline                                                  [16, 240, 320]            --
├─DepthAnythingForDepthEstimation: 1-1                                 [16, 238, 308]            --
│    └─Dinov2Backbone: 2-1                                             [16, 375, 384]            --
│    │    └─Dinov2Embeddings: 3-1                                      [16, 375, 384]            753,024
│    │    └─Dinov2Encoder: 3-2                                         [16, 375, 384]            21,302,784
│    │    └─LayerNorm: 3-3                                             [16, 375, 384]            768
│    │    └─LayerNorm: 3-4                                             [16, 375, 384]            (recursive)
│    │    └─LayerNorm: 3-5                                             [16, 375, 384]            (recursive)
│    │    └─LayerNorm: 3-6                                     

## Baseline Training

In [7]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

num_epochs = 1

In [8]:
import util.eval as e

In [11]:
import torch

loss_values = []
delta_accuracy_values_50 = []
delta_accuracy_values_25 = []
mae_values = []

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    total_delta_accuracy_50 = 0
    total_delta_accuracy_25 = 0
    total_mae = 0
    
    for images, depths in tqdm(dataloader, desc=f"Epoch {epoch+1}/{num_epochs}", leave=False):
        images, depths = images.to(device), depths.to(device)
                
        outputs = model(images)
        loss = criterion(outputs, depths)
        
        optimizer.zero_grad()
        optimizer.step()
        
        total_loss += loss.item()
        
        # Calculate metrics
        with torch.no_grad():
            delta_accuracy_50 = e.compute_delta_accuracy(outputs, depths, 1.5)
            delta_accuracy_25 = e.compute_delta_accuracy(outputs, depths, 1.25)
            mae = torch.abs(outputs - depths).mean().item()
            
            total_delta_accuracy_50 += delta_accuracy_50
            total_delta_accuracy_25 += delta_accuracy_25
            total_mae += mae
    
    avg_loss = total_loss / len(dataloader)
    avg_delta_accuracy_50 = total_delta_accuracy_50 / len(dataloader)
    avg_delta_accuracy_25 = total_delta_accuracy_25 / len(dataloader)
    avg_mae = total_mae / len(dataloader)
    
    loss_values.append(avg_loss)
    delta_accuracy_values_50.append(avg_delta_accuracy_50)
    delta_accuracy_values_25.append(avg_delta_accuracy_25)
    mae_values.append(avg_mae)
    
    tqdm.write(f'Epoch [{epoch+1}/{num_epochs}], Avg Loss: {avg_loss:.4f}, '
               f'δ1 Accuracy with 1.25 threshold: {avg_delta_accuracy_25:.2f}%, '
               f'δ1 Accuracy with 1.50 threshold: {avg_delta_accuracy_50:.2f}%, '
               f'MAE: {avg_mae:.4f}')

print("Training completed!")

                                                          

Epoch [1/1], Avg Loss: 1.9267, δ1 Accuracy with 1.25 threshold: 15.48%, δ1 Accuracy with 1.50 threshold: 27.30%, MAE: 1.9267
Training completed!


