In [1]:
import cv2
import torch
from ultralytics import YOLO
import numpy as np
from torchvision.transforms import Compose, Resize, ToTensor
from PIL import Image

In [2]:
yolo_model = YOLO('yolov8n.pt')

In [5]:
pip install timm

Collecting timm
  Obtaining dependency information for timm from https://files.pythonhosted.org/packages/6b/02/0d8925809296bed4cf841446e1291c3f381fde6d777a1ab2a25a3829b4a4/timm-1.0.12-py3-none-any.whl.metadata
  Downloading timm-1.0.12-py3-none-any.whl.metadata (51 kB)
     ---------------------------------------- 0.0/51.6 kB ? eta -:--:--
     ------------------------------ ------- 41.0/51.6 kB 991.0 kB/s eta 0:00:01
     -------------------------------------- 51.6/51.6 kB 656.3 kB/s eta 0:00:00
Collecting huggingface_hub (from timm)
  Obtaining dependency information for huggingface_hub from https://files.pythonhosted.org/packages/95/9b/3068fb3ae0b498eb66960ca5f4d92a81c91458cacd4dc17bfa6d40ce90fb/huggingface_hub-0.26.3-py3-none-any.whl.metadata
  Downloading huggingface_hub-0.26.3-py3-none-any.whl.metadata (13 kB)
Collecting safetensors (from timm)
  Obtaining dependency information for safetensors from https://files.pythonhosted.org/packages/f1/5f/ab6b6cec85b40789801f35b7d2fb579ae24


[notice] A new release of pip is available: 23.2.1 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
from torchvision import transforms


In [7]:
# midas_model = torch.hub.load("intel-isl/MiDaS", "MiDaS")
# midas_model.eval()
# midas_transforms = torch.hub.load("intel-isl/MiDaS", "transforms").default_transform
midas_model = torch.hub.load("intel-isl/MiDaS", "MiDaS_small", trust_repo=True)
midas_transforms = torch.hub.load("intel-isl/MiDaS", "transforms", trust_repo=True).default_transform


Using cache found in C:\Users\hites/.cache\torch\hub\intel-isl_MiDaS_master


Loading weights:  None


Downloading: "https://github.com/rwightman/gen-efficientnet-pytorch/zipball/master" to C:\Users\hites/.cache\torch\hub\master.zip
Downloading: "https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_lite3-b733e338.pth" to C:\Users\hites/.cache\torch\hub\checkpoints\tf_efficientnet_lite3-b733e338.pth
Downloading: "https://github.com/isl-org/MiDaS/releases/download/v2_1/midas_v21_small_256.pt" to C:\Users\hites/.cache\torch\hub\checkpoints\midas_v21_small_256.pt
100%|██████████| 81.8M/81.8M [00:48<00:00, 1.75MB/s]
Using cache found in C:\Users\hites/.cache\torch\hub\intel-isl_MiDaS_master


In [None]:
midas_transforms = transforms.Compose([
    transforms.Resize((384, 384)),  # Resize to the expected input size for MiDaS
    transforms.ToTensor(),  # Convert PIL image to a tensor
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # Normalize
])

In [22]:
# Define MiDaS transformation (for MiDaS Small)
midas_transforms = transforms.Compose([
    transforms.Resize((384, 384)),  # Resize to the expected input size for MiDaS Small
    transforms.ToTensor(),  # Convert PIL image to a tensor
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # Normalize
])

# Load MiDaS model
midas_model = torch.hub.load("intel-isl/MiDaS", "MiDaS_small", trust_repo=True)
midas_model.eval()

# Webcam setup
cap = cv2.VideoCapture(0)  # Change index if you have multiple cameras

while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break
    
    # Resize frame for faster processing
    frame = cv2.resize(frame, (640, 480))

    # Convert frame to RGB for YOLOv8 processing
    input_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

    # YOLOv8 object detection
    results = yolo_model(input_frame, stream=True)
    detections = []
    for result in results:
        boxes = result.boxes.xyxy  # Bounding box coordinates
        confidences = result.boxes.conf
        classes = result.boxes.cls

        for i, box in enumerate(boxes):
            x1, y1, x2, y2 = map(int, box)
            detections.append((x1, y1, x2, y2, int(classes[i]), confidences[i]))

    # MiDaS depth estimation
    pil_frame = Image.fromarray(input_frame)  # Convert numpy array to PIL image
    transformed_frame = midas_transforms(pil_frame).unsqueeze(0)  # Apply MiDaS transformation

    with torch.no_grad():
        depth_map = midas_model(transformed_frame)  # Forward pass to get depth map
        
        # Ensure depth_map is in the correct format (1, 1, H, W)
        depth_map = depth_map.unsqueeze(1)  # Adding a channel dimension if missing
        
        # Resize depth map to match the input frame size (height, width)
        depth_map = torch.nn.functional.interpolate(
            depth_map, size=(frame.shape[0], frame.shape[1]), mode="bicubic", align_corners=False
        ).squeeze().cpu().numpy()

    # Normalize depth for visualization
    depth_normalized = cv2.normalize(depth_map, None, 0, 255, cv2.NORM_MINMAX).astype(np.uint8)

    # Process YOLO detections
    for (x1, y1, x2, y2, cls, conf) in detections:
        # Calculate average depth inside bounding box
        obj_depth = np.mean(depth_map[y1:y2, x1:x2])
        label = f"Class: {cls}, Depth: {obj_depth:.2f}m"
        
        # Draw bounding box and label on the frame
        cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
        cv2.putText(frame, label, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

    # Display depth map and frame
    cv2.imshow("Depth Map", depth_normalized)
    cv2.imshow("YOLO + Depth Estimation", frame)

    # Break loop on 'q' key press
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()


Using cache found in C:\Users\hites/.cache\torch\hub\intel-isl_MiDaS_master


Loading weights:  None


Using cache found in C:\Users\hites/.cache\torch\hub\rwightman_gen-efficientnet-pytorch_master



0: 480x640 2 persons, 1 cell phone, 256.6ms
Speed: 0.0ms preprocess, 256.6ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 2 persons, 98.6ms
Speed: 0.0ms preprocess, 98.6ms inference, 10.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 2 persons, 1 cell phone, 84.3ms
Speed: 0.0ms preprocess, 84.3ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 1 cell phone, 80.8ms
Speed: 0.0ms preprocess, 80.8ms inference, 15.2ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 1 cell phone, 99.8ms
Speed: 4.9ms preprocess, 99.8ms inference, 0.3ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 1 cell phone, 114.6ms
Speed: 5.8ms preprocess, 114.6ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 1 cell phone, 100.1ms
Speed: 0.0ms preprocess, 100.1ms inference, 15.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 

In [11]:
# Define MiDaS transformation (for MiDaS Small)
midas_transforms = transforms.Compose([
    transforms.Resize((384, 384)),  # Resize to the expected input size for MiDaS Small
    transforms.ToTensor(),  # Convert PIL image to a tensor
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # Normalize
])

# Load MiDaS model
midas_model = torch.hub.load("intel-isl/MiDaS", "MiDaS_small", trust_repo=True)
midas_model.eval()

# Known object dimensions in meters (e.g., a bottle with height = 0.25m)
KNOWN_OBJECT_HEIGHT = 0.30  # Replace with the actual height of the reference object in meters

# Load YOLOv8 model
yolo_model = YOLO("yolov8n.pt")  # Load YOLOv8 model

# Target class index for "bottle" (COCO dataset class index)
TARGET_CLASS = 39

# Webcam setup
cap = cv2.VideoCapture(0)  # Change index if you have multiple cameras

while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break

    # Resize frame for faster processing
    frame = cv2.resize(frame, (640, 480))

    # Convert frame to RGB for YOLOv8 processing
    input_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

    # YOLOv8 object detection
    results = yolo_model(input_frame, stream=True)
    detections = []
    for result in results:
        boxes = result.boxes.xyxy  # Bounding box coordinates
        confidences = result.boxes.conf
        classes = result.boxes.cls

        for i, box in enumerate(boxes):
            # Filter for "bottle" class
            if int(classes[i]) == TARGET_CLASS:
                x1, y1, x2, y2 = map(int, box)
                detections.append((x1, y1, x2, y2, int(classes[i]), confidences[i]))

    # MiDaS depth estimation
    pil_frame = Image.fromarray(input_frame)  # Convert numpy array to PIL image
    transformed_frame = midas_transforms(pil_frame).unsqueeze(0)  # Apply MiDaS transformation

    with torch.no_grad():
        depth_map = midas_model(transformed_frame)  # Forward pass to get depth map

        # Ensure depth_map is in the correct format (1, 1, H, W)
        depth_map = depth_map.unsqueeze(1)  # Adding a channel dimension if missing

        # Resize depth map to match the input frame size (height, width)
        depth_map = torch.nn.functional.interpolate(
            depth_map, size=(frame.shape[0], frame.shape[1]), mode="bicubic", align_corners=False
        ).squeeze().cpu().numpy()

    # Normalize depth for visualization
    depth_normalized = cv2.normalize(depth_map, None, 0, 255, cv2.NORM_MINMAX).astype(np.uint8)

    # Process YOLO detections for "bottle" class
    for (x1, y1, x2, y2, cls, conf) in detections:
        # Calculate average depth inside bounding box
        obj_depth = np.mean(depth_map[y1:y2, x1:x2])

        # Calculate real-world distance using known object size
        apparent_height = y2 - y1  # Apparent height of the object in pixels
        if apparent_height > 0:  # Avoid division by zero
            scaling_factor = KNOWN_OBJECT_HEIGHT / apparent_height
            real_distance = obj_depth * scaling_factor
            label = f"Bottle: Distance: {real_distance:.2f}m"
        else:
            label = f"Bottle: Depth: {obj_depth:.2f} (Relative)"

        # Draw bounding box and label on the frame
        cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
        cv2.putText(frame, label, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

    # Display depth map and frame
    cv2.imshow("Depth Map", depth_normalized)
    cv2.imshow("YOLO + Depth Estimation", frame)

    # Break loop on 'q' key press
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()


Using cache found in C:\Users\hites/.cache\torch\hub\intel-isl_MiDaS_master


Loading weights:  None


Using cache found in C:\Users\hites/.cache\torch\hub\rwightman_gen-efficientnet-pytorch_master



0: 480x640 1 person, 1 cat, 1 chair, 375.0ms
Speed: 0.0ms preprocess, 375.0ms inference, 15.4ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 1 cat, 1 chair, 237.3ms
Speed: 0.1ms preprocess, 237.3ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 1 cat, 1 chair, 1 dining table, 162.8ms
Speed: 0.0ms preprocess, 162.8ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 1 cat, 1 chair, 118.6ms
Speed: 0.0ms preprocess, 118.6ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 1 cat, 136.8ms
Speed: 0.0ms preprocess, 136.8ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 1 cat, 126.7ms
Speed: 0.0ms preprocess, 126.7ms inference, 2.1ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 116.0ms
Speed: 0.0ms preprocess, 116.0ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x6

In [None]:
# 1.
# cap = cv2.VideoCapture(0)  
# while cap.isOpened():
#     ret, frame = cap.read()
#     if not ret:
#         break
    
#     frame = cv2.resize(frame, (640, 480))

#     # Resize frame for faster processing
#     input_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    
#     # YOLOv8 object detection
#     results = yolo_model(input_frame, stream=True)
#     detections = []
#     for result in results:
#         boxes = result.boxes.xyxy  # Bounding box coordinates
#         confidences = result.boxes.conf
#         classes = result.boxes.cls

#         for i, box in enumerate(boxes):
#             x1, y1, x2, y2 = map(int, box)
#             detections.append((x1, y1, x2, y2, int(classes[i]), confidences[i]))

#     # MiDaS depth estimation
#     pil_frame = Image.fromarray(input_frame)
#     transformed_frame = midas_transforms(pil_frame).unsqueeze(0)

#     with torch.no_grad():
#         depth_map = midas_model(transformed_frame)
#         depth_map = torch.nn.functional.interpolate(
#             depth_map, size=frame.shape[:2], mode="bicubic", align_corners=False
#         ).squeeze().cpu().numpy()

#     # Normalize depth for visualization
#     depth_normalized = cv2.normalize(depth_map, None, 0, 255, cv2.NORM_MINMAX).astype(np.uint8)

#     # Process YOLO detections
#     for (x1, y1, x2, y2, cls, conf) in detections:
#         # Calculate average depth inside bounding box
#         obj_depth = np.mean(depth_map[y1:y2, x1:x2])
#         label = f"Class: {cls}, Depth: {obj_depth:.2f}m"
        
#         # Draw bounding box and label on the frame
#         cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
#         cv2.putText(frame, label, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

#     # Display depth map and frame
#     cv2.imshow("Depth Map", depth_normalized)
#     cv2.imshow("YOLO + Depth Estimation", frame)

#     if cv2.waitKey(1) & 0xFF == ord('q'):
#         break

# cap.release()
# cv2.destroyAllWindows()


# cap = cv2.VideoCapture(0)  # Change index if you have multiple cameras

# while cap.isOpened():
#     ret, frame = cap.read()
#     if not ret:
#         break

#     # Convert frame to RGB for YOLOv8 processing
#     input_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

#     # YOLOv8 object detection
#     results = yolo_model(input_frame, stream=True)
#     detections = []
#     for result in results:
#         boxes = result.boxes.xyxy  # Bounding box coordinates
#         confidences = result.boxes.conf
#         classes = result.boxes.cls

#         for i, box in enumerate(boxes):
#             x1, y1, x2, y2 = map(int, box)
#             detections.append((x1, y1, x2, y2, int(classes[i]), confidences[i]))

#     # MiDaS depth estimation
#     pil_frame = Image.fromarray(input_frame)  # Convert numpy array to PIL image
#     transformed_frame = midas_transforms(pil_frame).unsqueeze(0)  # Apply MiDaS transformation

#     with torch.no_grad():
#         depth_map = midas_model(transformed_frame)  # Forward pass to get depth map
#         depth_map = torch.nn.functional.interpolate(
#             depth_map, size=frame.shape[:2], mode="bicubic", align_corners=False
#         ).squeeze().cpu().numpy()

#     # Normalize depth for visualization
#     depth_normalized = cv2.normalize(depth_map, None, 0, 255, cv2.NORM_MINMAX).astype(np.uint8)

#     # Process YOLO detections
#     for (x1, y1, x2, y2, cls, conf) in detections:
#         # Calculate average depth inside bounding box
#         obj_depth = np.mean(depth_map[y1:y2, x1:x2])
#         label = f"Class: {cls}, Depth: {obj_depth:.2f}m"
        
#         # Draw bounding box and label on the frame
#         cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
#         cv2.putText(frame, label, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

#     # Display depth map and frame
#     cv2.imshow("Depth Map", depth_normalized)
#     cv2.imshow("YOLO + Depth Estimation", frame)

#     # Break loop on 'q' key press
#     if cv2.waitKey(1) & 0xFF == ord('q'):
#         break

# cap.release()
# cv2.destroyAllWindows()

# 3.
# cap = cv2.VideoCapture(0)  # Change index if you have multiple cameras

# while cap.isOpened():
#     ret, frame = cap.read()
#     if not ret:
#         break
    
#     # Resize frame for faster processing
#     frame = cv2.resize(frame, (640, 480))

#     # Convert frame to RGB for YOLOv8 processing
#     input_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

#     # YOLOv8 object detection
#     results = yolo_model(input_frame, stream=True)
#     detections = []
#     for result in results:
#         boxes = result.boxes.xyxy  # Bounding box coordinates
#         confidences = result.boxes.conf
#         classes = result.boxes.cls

#         for i, box in enumerate(boxes):
#             x1, y1, x2, y2 = map(int, box)
#             detections.append((x1, y1, x2, y2, int(classes[i]), confidences[i]))

#     # MiDaS depth estimation
#     pil_frame = Image.fromarray(input_frame)  # Convert numpy array to PIL image
#     transformed_frame = midas_transforms(pil_frame)['image'].unsqueeze(0)  # Apply MiDaS transformation, handle 'image' key

#     with torch.no_grad():
#         depth_map = midas_model(transformed_frame)  # Forward pass to get depth map
#         depth_map = torch.nn.functional.interpolate(
#             depth_map, size=frame.shape[:2], mode="bicubic", align_corners=False
#         ).squeeze().cpu().numpy()

#     # Normalize depth for visualization
#     depth_normalized = cv2.normalize(depth_map, None, 0, 255, cv2.NORM_MINMAX).astype(np.uint8)

#     # Process YOLO detections
#     for (x1, y1, x2, y2, cls, conf) in detections:
#         # Calculate average depth inside bounding box
#         obj_depth = np.mean(depth_map[y1:y2, x1:x2])
#         label = f"Class: {cls}, Depth: {obj_depth:.2f}m"
        
#         # Draw bounding box and label on the frame
#         cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
#         cv2.putText(frame, label, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)
a
#     # Display depth map and frame
#     cv2.imshow("Depth Map", depth_normalized)
#     cv2.imshow("YOLO + Depth Estimation", frame)

#     # Break loop on 'q' key press
#     if cv2.waitKey(1) & 0xFF == ord('q'):
#         break

# cap.release()
# cv2.destroyAllWindows()



0: 480x640 1 person, 89.9ms
Speed: 0.0ms preprocess, 89.9ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)


ValueError: Input and output must have the same number of spatial dimensions, but got input with spatial dimensions of [384] and output size of (480, 640). Please provide input tensor in (N, C, d1, d2, ...,dK) format and output size in (o1, o2, ...,oK) format.

In [None]:
# # Define MiDaS transformation (for MiDaS Small)
# midas_transforms = transforms.Compose([
#     transforms.Resize((384, 384)),  # Resize to the expected input size for MiDaS Small
#     transforms.ToTensor(),  # Convert PIL image to a tensor
#     transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # Normalize
# ])

# # Load MiDaS model
# midas_model = torch.hub.load("intel-isl/MiDaS", "MiDaS_small", trust_repo=True)
# midas_model.eval()

# # Known object dimensions in meters (e.g., a bottle with height = 0.30m)
# KNOWN_OBJECT_HEIGHT = 0.30  # Replace with the actual height of the reference object in meters

# # Load YOLOv8 model
# yolo_model = YOLO("yolov8n.pt")  # Load YOLOv8 model

# # Target class index for "bottle" (COCO dataset class index)
# TARGET_CLASS = 39

# # Initialize distance buffer for smoothing
# distance_buffer = []
# buffer_size = 10  # Number of frames for temporal smoothing

# # Webcam setup
# cap = cv2.VideoCapture(0)  # Change index if you have multiple cameras

# while cap.isOpened():
#     ret, frame = cap.read()
#     if not ret:
#         break

#     # Resize frame for faster processing
#     frame = cv2.resize(frame, (640, 480))

#     # Convert frame to RGB for YOLOv8 processing
#     input_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

#     # YOLOv8 object detection
#     results = yolo_model(input_frame, stream=True)
#     detections = []
#     for result in results:
#         boxes = result.boxes.xyxy  # Bounding box coordinates
#         confidences = result.boxes.conf
#         classes = result.boxes.cls

#         for i, box in enumerate(boxes):
#             # Filter for "bottle" class
#             if int(classes[i]) == TARGET_CLASS:
#                 x1, y1, x2, y2 = map(int, box)
#                 detections.append((x1, y1, x2, y2, int(classes[i]), confidences[i]))

#     # MiDaS depth estimation
#     pil_frame = Image.fromarray(input_frame)  # Convert numpy array to PIL image
#     transformed_frame = midas_transforms(pil_frame).unsqueeze(0)  # Apply MiDaS transformation

#     with torch.no_grad():
#         depth_map = midas_model(transformed_frame)  # Forward pass to get depth map

#         # Resize depth map to match the input frame size (height, width)
#         depth_map = torch.nn.functional.interpolate(
#             depth_map.unsqueeze(1),  # Add channel dimension if missing
#             size=(frame.shape[0], frame.shape[1]),
#             mode="bicubic",
#             align_corners=False
#         ).squeeze().cpu().numpy()

#     # Normalize depth for visualization
#     depth_normalized = cv2.normalize(depth_map, None, 0, 255, cv2.NORM_MINMAX).astype(np.uint8)

#     # Process YOLO detections for "bottle" class
#     for (x1, y1, x2, y2, cls, conf) in detections:
#         # Crop the depth map to the bounding box
#         depth_crop = depth_map[y1:y2, x1:x2]

#         # Filter invalid depth values
#         valid_depths = depth_crop[(depth_crop > 0) & (depth_crop < np.percentile(depth_crop, 99))]
#         if len(valid_depths) == 0:
#             continue  # Skip if no valid depths

#         # Use the median depth for stability
#         obj_depth = np.median(valid_depths)

#         # Calculate real-world distance using known object size
#         apparent_height = y2 - y1  # Apparent height of the object in pixels
#         if apparent_height > 0:  # Avoid division by zero
#             scaling_factor = KNOWN_OBJECT_HEIGHT / apparent_height
#             real_distance = obj_depth * scaling_factor
#         else:
#             real_distance = obj_depth

#         # Add the distance to the buffer for smoothing
#         distance_buffer.append(real_distance)
#         if len(distance_buffer) > buffer_size:
#             distance_buffer.pop(0)

#         # Calculate the smoothed distance
#         stable_distance = np.mean(distance_buffer)

#         # Label the object with the stabilized distance
#         label = f"Bottle: Distance: {stable_distance:.2f}m"
#         cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
#         cv2.putText(frame, label, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

#     # Display depth map and frame
#     cv2.imshow("Depth Map", depth_normalized)
#     cv2.imshow("YOLO + Depth Estimation", frame)

#     # Break loop on 'q' key press
#     if cv2.waitKey(1) & 0xFF == ord('q'):
#         break

# cap.release()
# cv2.destroyAllWindows()


Using cache found in C:\Users\hites/.cache\torch\hub\intel-isl_MiDaS_master


Loading weights:  None


Using cache found in C:\Users\hites/.cache\torch\hub\rwightman_gen-efficientnet-pytorch_master



0: 480x640 1 person, 1 laptop, 88.6ms
Speed: 15.6ms preprocess, 88.6ms inference, 6.5ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 1 laptop, 147.5ms
Speed: 4.1ms preprocess, 147.5ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 107.5ms
Speed: 0.0ms preprocess, 107.5ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 87.0ms
Speed: 4.6ms preprocess, 87.0ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 2 persons, 91.1ms
Speed: 0.0ms preprocess, 91.1ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 2 persons, 1 bottle, 87.5ms
Speed: 0.0ms preprocess, 87.5ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 2 persons, 1 toothbrush, 109.2ms
Speed: 0.0ms preprocess, 109.2ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 2 persons, 1 toothbrush, 100.0ms
Speed: 4.6ms prepro

In [1]:
import cv2
import torch
import numpy as np
from torchvision import transforms
from PIL import Image
from ultralytics import YOLO

# Define MiDaS transformation (for MiDaS Small)
midas_transforms = transforms.Compose([
    transforms.Resize((384, 384)),  # Resize to the expected input size for MiDaS Small
    transforms.ToTensor(),  # Convert PIL image to a tensor
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # Normalize
])

# Load MiDaS model
midas_model = torch.hub.load("intel-isl/MiDaS", "MiDaS_small", trust_repo=True)
midas_model.eval()

# Known object dimensions in meters (e.g., a bottle with height = 0.30m)
KNOWN_OBJECT_HEIGHT = 0.30  # Real-world height of the reference object in meters

# Camera parameters
FOCAL_LENGTH = 600  # Approximate focal length of the webcam in pixels
# Note: Focal length can be calibrated more precisely using a checkerboard calibration process.

# Load YOLOv8 model
yolo_model = YOLO("yolov8n.pt")  # Load YOLOv8 model

# Target class index for "bottle" (COCO dataset class index)
TARGET_CLASS = 39

# Initialize distance buffer for smoothing
distance_buffer = []
buffer_size = 10  # Number of frames for temporal smoothing

# Webcam setup
cap = cv2.VideoCapture(0)  # Change index if you have multiple cameras

while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break

    # Resize frame for faster processing
    frame = cv2.resize(frame, (640, 480))

    # Convert frame to RGB for YOLOv8 processing
    input_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

    # YOLOv8 object detection
    results = yolo_model(input_frame, stream=True)
    detections = []
    for result in results:
        boxes = result.boxes.xyxy  # Bounding box coordinates
        confidences = result.boxes.conf
        classes = result.boxes.cls

        for i, box in enumerate(boxes):
            # Filter for "bottle" class
            if int(classes[i]) == TARGET_CLASS:
                x1, y1, x2, y2 = map(int, box)
                detections.append((x1, y1, x2, y2, int(classes[i]), confidences[i]))

    # MiDaS depth estimation
    pil_frame = Image.fromarray(input_frame)  # Convert numpy array to PIL image
    transformed_frame = midas_transforms(pil_frame).unsqueeze(0)  # Apply MiDaS transformation

    with torch.no_grad():
        depth_map = midas_model(transformed_frame)  # Forward pass to get depth map

        # Resize depth map to match the input frame size (height, width)
        depth_map = torch.nn.functional.interpolate(
            depth_map.unsqueeze(1),  # Add channel dimension if missing
            size=(frame.shape[0], frame.shape[1]),
            mode="bicubic",
            align_corners=False
        ).squeeze().cpu().numpy()

    # Normalize depth for visualization
    depth_normalized = cv2.normalize(depth_map, None, 0, 255, cv2.NORM_MINMAX).astype(np.uint8)

    # Process YOLO detections for "bottle" class
    for (x1, y1, x2, y2, cls, conf) in detections:
        # Crop the depth map to the bounding box
        depth_crop = depth_map[y1:y2, x1:x2]

        # Filter invalid depth values
        valid_depths = depth_crop[(depth_crop > 0) & (depth_crop < np.percentile(depth_crop, 99))]
        if len(valid_depths) == 0:
            continue  # Skip if no valid depths

        # Use the median depth for stability
        obj_depth = np.median(valid_depths)

        # Calculate apparent object height in pixels
        apparent_height = y2 - y1  # Bounding box height in pixels

        if apparent_height > 0:  # Avoid division by zero
            # Distance using focal length formula
            distance_focal = (KNOWN_OBJECT_HEIGHT * FOCAL_LENGTH) / apparent_height

            # Distance using depth map scaling factor
            scaling_factor = KNOWN_OBJECT_HEIGHT / apparent_height
            distance_depth = obj_depth * scaling_factor

            # Use focal length-based distance for better accuracy
            real_distance = distance_focal

            # Add the distance to the buffer for smoothing
            distance_buffer.append(real_distance)
            if len(distance_buffer) > buffer_size:
                distance_buffer.pop(0)

            # Calculate the smoothed distance
            stable_distance = np.mean(distance_buffer)

            # Label the object with the stabilized distance
            label = f"Bottle: Distance: {stable_distance:.2f}m"
            cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
            cv2.putText(frame, label, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

    # Display depth map and frame
    cv2.imshow("Depth Map", depth_normalized)
    cv2.imshow("YOLO + Depth Estimation", frame)

    # Break loop on 'q' key press
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()


Using cache found in C:\Users\hites/.cache\torch\hub\intel-isl_MiDaS_master
  from .autonotebook import tqdm as notebook_tqdm


Loading weights:  None


Using cache found in C:\Users\hites/.cache\torch\hub\rwightman_gen-efficientnet-pytorch_master



0: 480x640 1 person, 1 bottle, 326.6ms
Speed: 0.0ms preprocess, 326.6ms inference, 26.8ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 1 bottle, 230.1ms
Speed: 5.0ms preprocess, 230.1ms inference, 2.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 2 persons, 1 bottle, 201.7ms
Speed: 0.0ms preprocess, 201.7ms inference, 0.4ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 2 persons, 1 bottle, 1 toothbrush, 188.9ms
Speed: 0.0ms preprocess, 188.9ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 1 bottle, 1 toothbrush, 114.4ms
Speed: 0.0ms preprocess, 114.4ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 2 persons, 1 bottle, 1 cell phone, 99.2ms
Speed: 0.0ms preprocess, 99.2ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 1 bottle, 1 toothbrush, 90.2ms
Speed: 3.1ms preprocess, 90.2ms inference, 0.0ms postprocess per image at s