<a href="https://colab.research.google.com/github/Varsha17112005/sample/blob/main/final_catathon.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Full Google Colab Code for RT-MDE and MiDaS Depth Estimation with Disparity Overlay and Upload/Download Support

# STEP 1: Install Dependencies
!pip install torch torchvision timm opencv-python ultralytics matplotlib

# STEP 2: Upload Video File
from google.colab import files
import os

uploaded = files.upload()
# Iterating the returned mapping yields the uploaded file names.
uploaded_names = list(uploaded)
if not uploaded_names:
    # Fail here instead of letting the video reader fail later on a missing file.
    raise RuntimeError("No file was uploaded; upload a video to continue.")
if len(uploaded_names) > 1:
    print(f"Multiple files uploaded; using only {uploaded_names[-1]!r} as the input video.")
# Renaming every upload onto the same path overwrote all but the last one, so
# move just that file and leave any others untouched.
os.rename(uploaded_names[-1], "input_video.mp4")
# STEP 3: RT-MDE Based Processing with Disparity Overlay
import cv2
import numpy as np
import torch
import torch.nn as nn
import torchvision.transforms as transforms
import timm
from PIL import Image
from tqdm import tqdm
from ultralytics import YOLO

class RTMDE(nn.Module):
    """Single-channel map regressor: a timm MobileNetV3 encoder plus a conv decoder.

    Only the deepest encoder feature map is decoded. The decoder contains two
    2x upsampling stages, so its output is 4x the spatial size of that feature
    map and has one channel.

    NOTE(review): the encoder is created with ``pretrained=True``, but nothing
    in this file loads weights for the decoder - confirm whether a checkpoint
    is expected before trusting the predictions.
    """

    def __init__(self):
        super().__init__()
        encoder = timm.create_model(
            'mobilenetv3_small_075', pretrained=True, features_only=True
        )
        self.backbone = encoder

        # The first decoder conv must accept the channel count of the last
        # feature map reported by the encoder.
        deepest_channels = encoder.feature_info[-1]['num_chs']
        head_layers = [
            nn.Conv2d(deepest_channels, 64, 3, padding=1),
            nn.ReLU(),
            nn.Upsample(scale_factor=2),
            nn.Conv2d(64, 32, 3, padding=1),
            nn.ReLU(),
            nn.Upsample(scale_factor=2),
            nn.Conv2d(32, 1, 1),
        ]
        self.decoder = nn.Sequential(*head_layers)

    def forward(self, x):
        """Decode the last feature map produced by the backbone for ``x``."""
        feature_maps = self.backbone(x)
        deepest = feature_maps[-1]
        return self.decoder(deepest)

# Load Models
# Use the GPU when one is available; the depth model is put in eval mode
# because it is only used for inference below.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
depth_model = RTMDE().to(device).eval()
yolo_model = YOLO("yolov8n.pt")

# Preprocessing for the depth model: resize, convert to a tensor, then map each
# channel from [0, 1] to [-1, 1].
# torchvision's Resize takes (height, width), so a 640-wide by 480-high input
# is written (480, 640); (640, 480) produced a portrait-shaped tensor.
# NOTE(review): assumes landscape footage - confirm for portrait videos.
transform = transforms.Compose([
    transforms.Resize((480, 640)),
    transforms.ToTensor(),
    transforms.Normalize([0.5]*3, [0.5]*3)
])

def disparity_to_distance(disp, scale=6.0, eps=1e-6):
    """Convert a disparity value to a distance using an inverse relationship.

    Computes ``scale / (disp + eps)`` and rounds the result to two decimals.

    Args:
        disp: Disparity value (the callers in this file pass a value
            normalised to the 0..1 range).
        scale: Numerator of the inverse mapping. Defaults to 6.0, the constant
            previously hard-coded here.
        eps: Small offset added to ``disp`` so a zero disparity does not
            divide by zero. Defaults to 1e-6.

    Returns:
        The distance rounded to two decimal places.
    """
    return round(scale / (disp + eps), 2)

def add_disparity_overlay(frame, disparity_map, overlay_height=100):
    """Paint a colour-mapped strip of ``disparity_map`` over the bottom of ``frame``.

    The map is min-max normalised, coloured with OpenCV's JET colormap, resized
    to the frame width by the strip height, and written into the last rows of
    ``frame``.

    Args:
        frame: Image array indexed as (row, column, channel); modified in place.
        disparity_map: Numeric array to visualise.
        overlay_height: Requested strip height in rows. It is clamped to the
            frame height; a value <= 0 leaves the frame untouched.

    Returns:
        The same ``frame`` object that was passed in.
    """
    # A strip taller than the frame cannot be assigned into it, and a zero
    # height would make ``frame[-0:]`` address the whole frame.
    strip_height = min(int(overlay_height), frame.shape[0])
    if strip_height <= 0 or np.size(disparity_map) == 0:
        return frame

    lowest = np.min(disparity_map)
    value_range = np.max(disparity_map) - lowest + 1e-6  # avoids 0/0 on a flat map
    disp_norm = (disparity_map - lowest) / value_range

    disp_colored = cv2.applyColorMap((disp_norm * 255).astype(np.uint8), cv2.COLORMAP_JET)
    # cv2.resize takes the target size as (width, height).
    disp_resized = cv2.resize(disp_colored, (frame.shape[1], strip_height))
    frame[-strip_height:, :, :] = disp_resized
    return frame

# Open the uploaded clip and read the properties needed to configure the writer.
cap = cv2.VideoCapture("input_video.mp4")
if not cap.isOpened():
    raise RuntimeError("Could not open input_video.mp4 for reading.")

width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
# Keep the fractional rate (e.g. 29.97): truncating it to an int changes the
# playback speed of the output. Fall back to 30 when no rate is reported.
fps = cap.get(cv2.CAP_PROP_FPS) or 30.0
# The reported frame count is only used to size the progress bar; the loop
# below reads until the capture runs out of frames.
frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

out = cv2.VideoWriter("output1.mp4", cv2.VideoWriter_fourcc(*'mp4v'), fps, (width, height))
progress = tqdm(total=frame_count if frame_count > 0 else None, desc="RT-MDE Processing")

try:
    while True:
        ret, frame = cap.read()
        if not ret:
            break

        # The depth model's preprocessing works on a PIL image built from RGB data.
        rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        pil_img = Image.fromarray(rgb)
        input_tensor = transform(pil_img).unsqueeze(0).to(device)

        with torch.no_grad():
            disp_map = depth_model(input_tensor).squeeze().cpu().numpy()
        # Bring the prediction back to the frame size; cv2.resize takes (width, height).
        disp_map = cv2.resize(disp_map, (width, height))
        norm_disp = (disp_map - disp_map.min()) / (disp_map.max() - disp_map.min() + 1e-6)

        # Ultralytics interprets NumPy input as BGR (OpenCV order), so the
        # detector gets the original frame rather than the RGB copy.
        results = yolo_model(frame)
        for r in results:
            for box in r.boxes:
                x1, y1, x2, y2 = map(int, box.xyxy[0])
                label = r.names[int(box.cls[0])]
                # Clamp the box centre into the frame so the sampled patch is never empty.
                cx = min(max((x1 + x2) // 2, 0), width - 1)
                cy = min(max((y1 + y2) // 2, 0), height - 1)
                # Median over a small window around the centre is less noisy than one pixel.
                patch = norm_disp[max(0, cy-4):cy+4, max(0, cx-4):cx+4]
                disp = np.median(patch)
                distance = disparity_to_distance(disp)
                cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
                # Keep the label baseline inside the image for boxes touching the top edge.
                cv2.putText(frame, f"{label}: {distance}m", (x1, max(y1-10, 15)),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 2)

        frame = add_disparity_overlay(frame, disp_map)
        out.write(frame)
        progress.update(1)
finally:
    # Release the reader and finalise the output file even if a frame fails.
    progress.close()
    cap.release()
    out.release()

# STEP 4: MiDaS Based Processing with Disparity Overlay
midas = torch.hub.load("intel-isl/MiDaS", "MiDaS_small")
midas.eval().to(device)

# The hub repository ships the preprocessing that matches each model variant;
# small_transform is the one paired with MiDaS_small.
midas_transforms = torch.hub.load("intel-isl/MiDaS", "transforms")
transform_midas = midas_transforms.small_transform

cap = cv2.VideoCapture("input_video.mp4")
if not cap.isOpened():
    raise RuntimeError("Could not open input_video.mp4 for reading.")

# Read the stream properties from this capture so the step does not depend on
# values computed by an earlier step.
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
# Keep the fractional rate (e.g. 29.97); fall back to 30 when none is reported.
fps = cap.get(cv2.CAP_PROP_FPS) or 30.0
# Only used to size the progress bar; the loop reads until frames run out.
frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

out = cv2.VideoWriter("output2.mp4", cv2.VideoWriter_fourcc(*'mp4v'), fps, (width, height))
progress = tqdm(total=frame_count if frame_count > 0 else None, desc="MiDaS Processing")

try:
    while True:
        ret, frame = cap.read()
        if not ret:
            break

        rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        # Apply the model's own transform instead of feeding a raw /255 frame:
        # the transform was loaded above but never used.
        input_batch = transform_midas(rgb).to(device)

        with torch.no_grad():
            prediction = midas(input_batch)
            # Resample the prediction back to the original frame resolution.
            prediction = torch.nn.functional.interpolate(
                prediction.unsqueeze(1), size=(height, width), mode="bicubic", align_corners=False
            ).squeeze()
            depth_map = prediction.cpu().numpy()

        norm_disp = (depth_map - depth_map.min()) / (depth_map.max() - depth_map.min() + 1e-6)

        # Ultralytics interprets NumPy input as BGR (OpenCV order), so the
        # detector gets the original frame rather than the RGB copy.
        results = yolo_model(frame)
        for r in results:
            for box in r.boxes:
                x1, y1, x2, y2 = map(int, box.xyxy[0])
                label = r.names[int(box.cls[0])]
                # Clamp the box centre into the frame so the sampled patch is never empty.
                cx = min(max((x1 + x2) // 2, 0), width - 1)
                cy = min(max((y1 + y2) // 2, 0), height - 1)
                # Median over a small window around the centre is less noisy than one pixel.
                patch = norm_disp[max(0, cy-4):cy+4, max(0, cx-4):cx+4]
                disp = np.median(patch)
                distance = disparity_to_distance(disp)
                cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
                # Keep the label baseline inside the image for boxes touching the top edge.
                cv2.putText(frame, f"{label}: {distance}m", (x1, max(y1-10, 15)),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 2)

        frame = add_disparity_overlay(frame, depth_map)
        out.write(frame)
        progress.update(1)
finally:
    # Release the reader and finalise the output file even if a frame fails.
    progress.close()
    cap.release()
    out.release()

# STEP 5: Download Outputs
# output1.mp4 holds the RT-MDE result and output2.mp4 the MiDaS result;
# they are offered for download in that order.
for output_name in ("output1.mp4", "output2.mp4"):
    files.download(output_name)


Collecting ultralytics
  Downloading ultralytics-8.3.118-py3-none-any.whl.metadata (37 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Col