In [1]:
import torch
import torch.nn.functional as F
import cv2
import numpy as np
import matplotlib
import os
import time
import glob
from depth_anything_v2.dpt import DepthAnythingV2

xFormers not available
xFormers not available


In [21]:
class OptimizedDepthAnythingV2:
    """Inference wrapper around ``DepthAnythingV2`` with device selection and timing.

    The wrapper picks a device, loads the checkpoint for the requested encoder,
    switches the network to FP16 when running on CUDA, and offers helpers for
    single-image inference, visualisation and benchmarking.
    """

    # Side length (pixels) of one ViT patch. The DINOv2 backbones used by the
    # upstream Depth Anything V2 code use 14x14 patches and reject inputs whose
    # sides are not a multiple of it -- NOTE(review): confirm if the backbone
    # has been modified locally.
    PATCH_SIZE = 14

    def __init__(self, encoder='vits', device=None, checkpoint_dir='checkpoints'):
        """
        Initialize the optimized Depth Anything V2 model

        Args:
            encoder: Model size ('vits', 'vitb', 'vitl', 'vitg')
            device: Device to run on (auto-detected if None)
            checkpoint_dir: Directory containing
                ``depth_anything_v2_<encoder>.pth`` (defaults to ``checkpoints``)

        Raises:
            ValueError: If ``encoder`` is not one of the configured model sizes.
            FileNotFoundError: If the checkpoint file does not exist.
        """
        # Model configurations
        self.model_configs = {
            'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]},
            'vitb': {'encoder': 'vitb', 'features': 128, 'out_channels': [96, 192, 384, 768]},
            'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]},
            'vitg': {'encoder': 'vitg', 'features': 384, 'out_channels': [1536, 1536, 1536, 1536]}
        }

        # Fail fast with a readable message instead of a bare KeyError later on.
        if encoder not in self.model_configs:
            raise ValueError(
                f"Unknown encoder '{encoder}'; expected one of {sorted(self.model_configs)}"
            )

        # Auto-detect best available device unless the caller chose one
        if device is None:
            self.device = self._auto_select_device()
        else:
            self.device = torch.device(device)

        self.encoder = encoder
        self.checkpoint_dir = checkpoint_dir
        self.model = self._load_model()

    @staticmethod
    def _auto_select_device():
        """Return the preferred available device: CUDA, then MPS, then CPU."""
        if torch.cuda.is_available():
            print(f"Using CUDA: {torch.cuda.get_device_name()}")
            return torch.device('cuda')

        # Older torch builds have no ``torch.backends.mps`` attribute at all.
        mps_backend = getattr(torch.backends, 'mps', None)
        if mps_backend is not None and mps_backend.is_available():
            print("Using Apple Metal Performance Shaders (MPS)")
            return torch.device('mps')

        print("Using CPU")
        return torch.device('cpu')

    @classmethod
    def _snap_to_patch(cls, size):
        """Round ``size`` to the nearest positive multiple of ``PATCH_SIZE``."""
        patches = max(1, int(round(size / cls.PATCH_SIZE)))
        return patches * cls.PATCH_SIZE

    @staticmethod
    def _normalize_depth(depth):
        """Scale a depth map to uint8 in [0, 255]; a constant map becomes all zeros."""
        depth = np.asarray(depth, dtype=np.float32)
        lowest = depth.min()
        span = depth.max() - lowest
        # A flat (or non-finite) map has no usable range; avoid dividing by zero.
        if not np.isfinite(span) or span <= 0:
            return np.zeros(depth.shape, dtype=np.uint8)
        return ((depth - lowest) / span * 255.0).astype(np.uint8)

    def _load_model(self):
        """Load the checkpoint for ``self.encoder`` and prepare the model for inference.

        Returns:
            The model on ``self.device`` in eval mode (FP16 when on CUDA).

        Raises:
            FileNotFoundError: If the checkpoint file does not exist.
        """
        print(f"Loading {self.encoder} model...")

        # Check for the weights first so a missing file does not cost a model build.
        checkpoint_path = os.path.join(
            self.checkpoint_dir, f'depth_anything_v2_{self.encoder}.pth'
        )
        if not os.path.exists(checkpoint_path):
            raise FileNotFoundError(f"Checkpoint not found: {checkpoint_path}")

        # Initialize model and load weights (on CPU first, then move)
        model = DepthAnythingV2(**self.model_configs[self.encoder])
        model.load_state_dict(torch.load(checkpoint_path, map_location='cpu', weights_only=True))

        # Move to device and set to eval mode
        model = model.to(self.device).eval()

        # Enable optimizations for inference
        if self.device.type == 'cuda':
            model = model.half()  # Use FP16 for faster inference on modern GPUs
            torch.backends.cudnn.benchmark = True  # Optimize CUDNN for consistent input sizes

        print(f"Model loaded successfully on {self.device}")
        return model

    def warmup(self, input_size=518, num_warmup=3):
        """Run a few forward passes on random input so later timings are stable.

        Args:
            input_size: Side of the square dummy input; snapped to a multiple
                of ``PATCH_SIZE`` exactly as ``infer_image`` does.
            num_warmup: Number of forward passes to run.
        """
        print("Warming up model...")
        side = self._snap_to_patch(input_size)
        dummy_input = torch.randn(1, 3, side, side, device=self.device)
        if self.device.type == 'cuda':
            dummy_input = dummy_input.half()

        with torch.no_grad():
            for _ in range(num_warmup):
                self.model(dummy_input)

        if self.device.type == 'cuda':
            torch.cuda.synchronize()  # Ensure all operations complete
        print("Warmup complete")

    def infer_image(self, raw_image, input_size=384):
        """
        Run depth inference on one image and time the forward pass

        The image is resized to a square of ``input_size`` (aspect ratio is not
        preserved), normalized with ImageNet statistics, passed through the
        model, and the prediction is resized back to the original resolution.

        Args:
            raw_image: Input image as a numpy array in BGR channel order (as
                returned by cv2.imread); BGRA and single-channel images are
                also accepted
            input_size: Model input size; snapped to the nearest multiple of
                ``PATCH_SIZE``

        Returns:
            depth: Depth map as a float32 numpy array of shape (height, width)
            inference_time: Time taken for the forward pass in seconds

        Raises:
            ValueError: If ``raw_image`` is None.
        """
        if raw_image is None:
            raise ValueError("Input image is None")

        original_height, original_width = raw_image.shape[:2]
        side = self._snap_to_patch(input_size)

        # Preprocessing - convert BGR(A) to RGB; replicate a single channel to three
        if raw_image.ndim == 2 or raw_image.shape[2] == 1:
            gray = raw_image.reshape(original_height, original_width, 1)
            rgb = np.repeat(gray, 3, axis=2)
        else:
            rgb = raw_image[..., 2::-1]  # channels 2,1,0 -> R,G,B (drops alpha if present)
        # ascontiguousarray also removes the negative stride from the reversal,
        # which torch.from_numpy cannot handle.
        image = np.ascontiguousarray(rgb, dtype=np.float32) / 255.0

        # Convert to tensor and resize
        image_tensor = torch.from_numpy(image).permute(2, 0, 1).unsqueeze(0)
        image_tensor = F.interpolate(
            image_tensor,
            (side, side),
            mode='bilinear',
            align_corners=False
        )

        # Normalize with ImageNet stats
        mean = torch.tensor([0.485, 0.456, 0.406]).view(1, 3, 1, 1)
        std = torch.tensor([0.229, 0.224, 0.225]).view(1, 3, 1, 1)
        image_tensor = (image_tensor - mean) / std

        # Move to device and convert to appropriate dtype
        on_cuda = self.device.type == 'cuda'
        image_tensor = image_tensor.to(self.device)
        if on_cuda:
            image_tensor = image_tensor.half()
            # Drain queued preprocessing work so it is not billed to the model.
            torch.cuda.synchronize()

        # Inference with timing (perf_counter is monotonic and high resolution)
        start_time = time.perf_counter()

        with torch.no_grad():
            depth = self.model(image_tensor)

        if on_cuda:
            torch.cuda.synchronize()  # Ensure GPU operations complete

        inference_time = time.perf_counter() - start_time

        # Bilinear interpolate needs (N, C, H, W); upstream forward() returns
        # (N, H, W), so add the missing axes when necessary.
        if depth.dim() == 2:
            depth = depth[None, None]
        elif depth.dim() == 3:
            depth = depth.unsqueeze(1)

        # Post-processing - resize back to original dimensions, in float32 so
        # the result has the same dtype on every device.
        depth = F.interpolate(
            depth.float(),
            (original_height, original_width),
            mode='bilinear',
            align_corners=False
        )

        # Index instead of squeeze() so a 1-pixel-high/wide image keeps both axes
        depth = depth[0, 0].cpu().numpy()

        return depth, inference_time

    def process_single_image(self, image_path, output_dir='./vis_depth',
                           input_size=518, grayscale=False, pred_only=True):
        """Read an image, predict depth, and save a visualisation as PNG.

        Args:
            image_path: Path of the image to read with cv2.imread
            output_dir: Directory for the output file (created if missing)
            input_size: Model input size passed to ``infer_image``
            grayscale: Save a gray visualisation instead of the 'Spectral_r' colormap
            pred_only: Save only the depth visualisation; otherwise save the
                input and the visualisation side by side

        Returns:
            depth: Depth map as returned by ``infer_image``
            inference_time: Forward-pass time in seconds

        Raises:
            ValueError: If the image cannot be read.
            OSError: If the output image cannot be written.
        """

        # Read image
        raw_image = cv2.imread(image_path)
        if raw_image is None:
            raise ValueError(f"Could not read image: {image_path}")

        # Run inference
        depth, inference_time = self.infer_image(raw_image, input_size)

        # Normalize depth for visualization (safe for constant depth maps)
        depth_normalized = self._normalize_depth(depth)

        # Apply colormap or grayscale
        if grayscale:
            depth_vis = np.repeat(depth_normalized[..., np.newaxis], 3, axis=-1)
        else:
            cmap = matplotlib.colormaps.get_cmap('Spectral_r')
            # Drop alpha, scale to 0-255 and flip RGB -> BGR for cv2.imwrite
            depth_vis = (cmap(depth_normalized)[:, :, :3] * 255)[:, :, ::-1].astype(np.uint8)

        # Save output
        os.makedirs(output_dir, exist_ok=True)
        filename = os.path.splitext(os.path.basename(image_path))[0]

        if pred_only:
            output_path = os.path.join(output_dir, f'{filename}_depth.png')
            output_image = depth_vis
        else:
            # Create side-by-side comparison with a 50px white separator
            separator = np.ones((raw_image.shape[0], 50, 3), dtype=np.uint8) * 255
            output_image = cv2.hconcat([raw_image, separator, depth_vis])
            output_path = os.path.join(output_dir, f'{filename}_combined.png')

        # cv2.imwrite signals failure through its return value, not an exception
        if not cv2.imwrite(output_path, output_image):
            raise OSError(f"Failed to write image: {output_path}")

        print(f"Processed {os.path.basename(image_path)} in {inference_time*1000:.2f}ms")
        print(f"Saved to: {output_path}")

        return depth, inference_time

    def benchmark(self, image_path, num_runs=10, input_size=518):
        """Time repeated inference on one image and print summary statistics.

        Args:
            image_path: Path of the image to read with cv2.imread
            num_runs: Number of timed runs (must be at least 1)
            input_size: Model input size used for both warmup and timed runs

        Returns:
            Numpy array of per-run inference times in seconds.

        Raises:
            ValueError: If ``num_runs`` is less than 1 or the image cannot be read.
        """
        if num_runs < 1:
            raise ValueError("num_runs must be at least 1")

        raw_image = cv2.imread(image_path)
        if raw_image is None:
            raise ValueError(f"Could not read image: {image_path}")

        print(f"Benchmarking with {num_runs} runs...")

        # Warm up at the size that is about to be measured
        self.warmup(input_size)

        # Benchmark runs
        times = []
        for i in range(num_runs):
            _, inference_time = self.infer_image(raw_image, input_size)
            times.append(inference_time)
            print(f"Run {i+1}/{num_runs}: {inference_time*1000:.2f}ms")

        # Statistics
        times = np.array(times)
        print(f"\nBenchmark Results:")
        print(f"Mean: {times.mean()*1000:.2f}ms")
        print(f"Std:  {times.std()*1000:.2f}ms")
        print(f"Min:  {times.min()*1000:.2f}ms")
        print(f"Max:  {times.max()*1000:.2f}ms")

        return times

In [22]:
# Build the inference wrapper; other encoder keys are 'vitb', 'vitl' and 'vitg'
model = OptimizedDepthAnythingV2('vits')

Using CUDA: NVIDIA GeForce RTX 3050 Laptop GPU
Loading vits model...
Model loaded successfully on cuda


In [23]:

# Example single image processing.
# Set the DEPTH_IMAGE_PATH environment variable to use your own image without
# editing the notebook; the fallback is a machine-specific path, so update it
# if the variable is not set.
image_path = os.environ.get(
    "DEPTH_IMAGE_PATH",
    r"C:\Codes\Python\obstacle_detection\Depth-Anything-V2\batch_input\right10.jpg",
)

In [27]:
def main():
    """Run the single-image example with the notebook-level ``model`` and ``image_path``.

    Warms the model up, processes ``image_path`` into ``./cuda_depth_output``
    and prints the depth-map shape and inference time. Failures are reported
    instead of raised so the notebook keeps running, but the traceback is
    printed as well so the exception type and origin are not lost.
    """
    import traceback  # local import: only used on the error path

    try:
        # Process single image
        model.warmup()
        depth, inference_time = model.process_single_image(
            image_path=image_path,
            output_dir='./cuda_depth_output',
            pred_only=True,
            grayscale=False
        )

        print(f"Depth shape: {depth.shape}")
        print(f"Inference time: {inference_time*1000:.2f}ms")

    except Exception as e:
        print(f"Error: {e}")
        # str(e) alone hides the exception type and where it was raised
        traceback.print_exc()


if __name__ == "__main__":
    main()

Warming up model...
Warmup complete
Processed right10.jpg in 44.83ms
Saved to: ./cuda_depth_output\right10_depth.png
Depth shape: (3072, 4096)
Inference time: 44.83ms


In [25]:
# Optional: time 10 repeated inference runs on the sample image
model.benchmark(image_path=image_path, num_runs=10)

Benchmarking with 10 runs...
Warming up model...
Warmup complete
Run 1/10: 40.10ms
Run 2/10: 44.18ms
Run 3/10: 44.54ms
Run 4/10: 44.72ms
Run 5/10: 44.81ms
Run 6/10: 47.62ms
Run 7/10: 42.89ms
Run 8/10: 45.02ms
Run 9/10: 44.52ms
Run 10/10: 45.17ms

Benchmark Results:
Mean: 44.36ms
Std:  1.80ms
Min:  40.10ms
Max:  47.62ms


array([0.04010439, 0.04417849, 0.04454017, 0.04471803, 0.04480529,
       0.047616  , 0.04288912, 0.04502106, 0.04451895, 0.04517365])