In [4]:
import torch
import torchvision.transforms as transforms
from torchvision.transforms import Compose, ToTensor, Normalize
import cv2
import numpy as np
from PIL import Image
import os

# Custom resize transform to maintain aspect ratio
class ResizeAspectRatio:
    def __init__(self, size, interpolation=Image.BICUBIC):
        self.size = size
        self.interpolation = interpolation

    def __call__(self, img):
        w, h = img.size
        if h > w:
            new_h = self.size
            new_w = int(w * self.size / h)
        else:
            new_w = self.size
            new_h = int(h * self.size / w)
        return img.resize((new_w, new_h), self.interpolation)

# Load the pre-trained MiDaS model
model_type = "DPT_Large"  # MiDaS v3 - Large
midas = torch.hub.load("intel-isl/MiDaS", model_type)

# Load the model weights
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
midas.to(device).eval()

# Load transforms to resize and normalize the image
transform = Compose([
    ToTensor(),
    Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Load and preprocess the image
input_image_path = "assets/buck.webp"
img = cv2.imread(input_image_path)
img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
img_pil = Image.fromarray(img_rgb)  # Convert to PIL image
input_batch = transform(img_pil).unsqueeze(0).to(device)

# Predict depth
with torch.no_grad():
    prediction = midas(input_batch)

    # Resize the output to the original image size
    prediction = torch.nn.functional.interpolate(
        prediction.unsqueeze(1),
        size=img.shape[:2],
        mode="bicubic",
        align_corners=False,
    ).squeeze()

# Convert to numpy array
depth_map = prediction.cpu().numpy()

# Normalize depth map for visualization
depth_min = depth_map.min()
depth_max = depth_map.max()
depth_map = (depth_map - depth_min) / (depth_max - depth_min)
depth_map = (depth_map * 255).astype(np.uint8)

# Create output directory if it doesn't exist
output_dir = "assets/predicted/"
os.makedirs(output_dir, exist_ok=True)

# Save the depth map as an image
output_image_path = os.path.join(output_dir, os.path.basename(input_image_path))
cv2.imwrite(output_image_path, depth_map)

# Display the depth map
cv2.imshow("Depth Map", depth_map)
cv2.waitKey(0)
cv2.destroyAllWindows()

print(f"Predicted depth map saved at: {output_image_path}")


Using cache found in C:\Users\Tomas Gula/.cache\torch\hub\intel-isl_MiDaS_master


Predicted depth map saved at: assets/predicted/buck.webp


In [3]:
import torch
import torchvision.transforms as transforms
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch.optim as optim
from PIL import Image
import os
import cv2
import numpy as np

# Custom dataset class
class DepthDataset(Dataset):
    def __init__(self, image_paths, depth_paths, transform=None, depth_transform=None):
        self.image_paths = image_paths
        self.depth_paths = depth_paths
        self.transform = transform
        self.depth_transform = depth_transform

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        image = Image.open(self.image_paths[idx]).convert("RGB")
        depth = Image.open(self.depth_paths[idx]).convert("L")
        if self.transform:
            image = self.transform(image)
        if self.depth_transform:
            depth = self.depth_transform(depth)
        return image, depth

# Custom resize transform to maintain aspect ratio
class ResizeAspectRatio:
    def __init__(self, size, interpolation=Image.BICUBIC):
        self.size = size
        self.interpolation = interpolation

    def __call__(self, img):
        w, h = img.size
        if h > w:
            new_h = self.size
            new_w = int(w * self.size / h)
        else:
            new_w = self.size
            new_h = int(h * self.size / w)
        return img.resize((new_w, new_h), self.interpolation)

# Paths to your dataset
image_dir = 'assets'
depth_dir = 'assets/predicted'
image_paths = [os.path.join(image_dir, img) for img in os.listdir(image_dir) if os.path.isfile(os.path.join(image_dir, img))][-1:]
depth_paths = [os.path.join(depth_dir, img) for img in os.listdir(depth_dir) if os.path.isfile(os.path.join(depth_dir, img))][-1:]

# Match depth paths to images based on filenames
depth_paths_dict = {os.path.basename(p): p for p in depth_paths}
depth_paths = [depth_paths_dict.get(os.path.basename(img), None) for img in image_paths]
depth_paths = [p for p in depth_paths if p is not None]

# Transform for input images and depth maps
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])
depth_transform = transforms.Compose([
    transforms.ToTensor(),
])

# Create dataset and dataloaders
dataset = DepthDataset(image_paths, depth_paths, transform, depth_transform)
dataloader = DataLoader(dataset, batch_size=4, shuffle=True)

# Load the pre-trained MiDaS model
model_type = "DPT_Large"
midas = torch.hub.load("intel-isl/MiDaS", model_type)

# Load the model weights
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
midas.to(device)

# Print model architecture
#print(midas)

# Fine-tuning: Make all layers trainable
for param in midas.parameters():
    param.requires_grad = True

# Define loss and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(midas.parameters(), lr=1e-4)

# Fine-tuning loop
num_epochs = 10
for epoch in range(num_epochs):
    midas.train()
    running_loss = 0.0
    for images, depths in dataloader:
        images = images.to(device)
        depths = depths.to(device)

        optimizer.zero_grad()
        outputs = midas(images)
        loss = criterion(outputs, depths)
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {running_loss / len(dataloader)}")

print("Finished fine-tuning")

# Save the fine-tuned model
torch.save(midas.state_dict(), 'midas_finetuned.pth')

# Load and preprocess a new image
img = cv2.imread("assets/test1.jpg")
img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
img_pil = Image.fromarray(img_rgb)  # Convert to PIL image

# Resize the input image to match the depth map dimensions
original_size = img.shape[1], img.shape[0]
resize_transform = ResizeAspectRatio(max(original_size))
img_resized = resize_transform(img_pil)
input_batch = transform(img_resized).unsqueeze(0).to(device)

# Predict depth with the fine-tuned model
with torch.no_grad():
    prediction = midas(input_batch)

    # Resize the output to the original image size
    prediction = torch.nn.functional.interpolate(
        prediction.unsqueeze(1),
        size=original_size,
        mode="bicubic",
        align_corners=False,
    ).squeeze()

# Convert to numpy array
depth_map = prediction.cpu().numpy()

# Normalize depth map for visualization
depth_min = depth_map.min()
depth_max = depth_map.max()
depth_map = (depth_map - depth_min) / (depth_max - depth_min)
depth_map = (depth_map * 255).astype(np.uint8)

# Create output directory if it doesn't exist
output_dir = "assets/predicted/"
os.makedirs(output_dir, exist_ok=True)

# Save the depth map as an image
output_image_path = os.path.join(output_dir, os.path.basename("assets/test1.jpg"))
cv2.imwrite(output_image_path, depth_map)

# Display the depth map
cv2.imshow("Depth Map", depth_map)
cv2.waitKey(0)
cv2.destroyAllWindows()

print(f"Predicted depth map saved at: {output_image_path}")



Using cache found in C:\Users\Tomas Gula/.cache\torch\hub\intel-isl_MiDaS_master


RuntimeError: The size of tensor a (560) must match the size of tensor b (568) at non-singleton dimension 3