In [32]:
!pip install -r requirements.txt


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [1]:
import os

os.environ["YOLO_VERBOSE"] = "False"

import numpy as np
import torch
from PIL import Image
import cv2
from ultralytics import YOLO
from ultralytics import YOLO
from ultralytics.utils.plotting import Annotator, colors
from tqdm.notebook import tqdm

In [2]:
# Force a fresh download of the MiDaS hub repository before ZoeDepth is loaded;
# the ZoeDepth load below then reports using the cached MiDaS checkout.
# NOTE(review): presumably a workaround for a stale or incompatible cached MiDaS
# checkout — confirm it is still needed, since it re-downloads on every run.
torch.hub.help("intel-isl/MiDaS", "DPT_BEiT_L_384", force_reload=True)

# ZoeD_N depth model with pretrained weights, fetched through torch.hub.
model_zoe_n = torch.hub.load("isl-org/ZoeDepth", "ZoeD_N", pretrained=True)

# Run the depth model on the GPU when one is available, otherwise on the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
zoe = model_zoe_n.to(DEVICE)

# YOLO segmentation model loaded from a local weights file; it is used further
# down for tracking (`yolo.track`) and for per-object segmentation masks.
yolo = YOLO("weights/yolo11n-seg.pt")

Downloading: "https://github.com/intel-isl/MiDaS/zipball/master" to /home/ok/.cache/torch/hub/master.zip
Using cache found in /home/ok/.cache/torch/hub/isl-org_ZoeDepth_main


img_size [384, 512]


Using cache found in /home/ok/.cache/torch/hub/intel-isl_MiDaS_master
  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


Params passed to Resize transform:
	width:  512
	height:  384
	resize_target:  True
	keep_aspect_ratio:  True
	ensure_multiple_of:  32
	resize_method:  minimal
Using pretrained resource url::https://github.com/isl-org/ZoeDepth/releases/download/v1.0/ZoeD_M12_N.pt
Loaded successfully


100%|██████████| 5.90M/5.90M [00:01<00:00, 5.48MB/s]


In [3]:
from collections import defaultdict


class AvgBuffer:
    """Per-track sliding window of recent samples with an averaging accessor.

    Every track id owns its own list of samples. Once a list holds ``size``
    samples, storing a new one drops the oldest, so (for ``size >= 1``) a list
    never grows beyond ``size`` entries.
    """

    def __init__(self, size: int):
        self.size = size
        # track id -> list of the most recently stored samples (oldest first)
        self.objects = defaultdict(list)

    def put(self, track_id, position):
        """Store ``position`` for ``track_id``, discarding the oldest sample if the window is full."""
        window = self.objects[track_id]
        if len(window) >= self.size:
            # Rebind to a shortened copy rather than mutating the old list.
            window = window[1:]
            self.objects[track_id] = window
        window.append(position)

    def get(self, track_id):
        """Return the mean along axis 0 of the samples stored for ``track_id``."""
        samples = self.objects[track_id]
        return np.mean(samples, axis=0)

    def ready(self, track_id):
        """Return True once ``track_id`` has accumulated at least ``size`` samples."""
        return not len(self.objects[track_id]) < self.size


def fmt_number(number, digits=3):
    """Format ``number`` as a string truncated (not rounded) to ``digits`` decimals.

    Args:
        number: Real scalar to format (Python or NumPy number).
        digits: Number of decimal places to keep; further digits are cut off
            towards zero.

    Returns:
        The truncated value as a string, e.g. ``"1.234"``. NaN and infinities
        cannot be truncated through ``int`` and are returned as ``"nan"``,
        ``"inf"`` or ``"-inf"`` instead of raising.
    """
    if not np.isfinite(number):
        # int(nan) raises ValueError and int(inf) raises OverflowError, so
        # non-finite values are passed through as plain text.
        return str(float(number))
    scale = 10**digits
    return str(int(number * scale) / scale)

In [7]:
# Input video to process.
source_path = "./videos/video2.avi"

# Calibration anchor: the first tracked object of class `anchor_class` whose
# segmentation mask covers the pixel `anchor_coords` becomes the reference object.
# anchor_coords is (x, y): the processing cell indexes the mask as mask[y][x],
# so the coordinates are expressed in the mask's resolution, not the source video's.
anchor_coords = (220, 135)
anchor_class = "car"
# Known distance to the anchor object; the predicted depth map is rescaled by
# anchor_true_dist / (mean predicted depth over the anchor's mask).
# NOTE(review): the unit is presumably metres — confirm.
anchor_true_dist = 5

# Per-frame depth maps. The processing cell fills this list when it is empty and
# reuses it otherwise; re-running this cell clears the cache.
depth_cache = []

In [8]:
cap = cv2.VideoCapture(source_path)
if not cap.isOpened():
    # Fail early with a clear message instead of carrying on with a capture
    # that cannot deliver any frame.
    raise OSError(f"Could not open video source: {source_path!r}")

# Integer stream properties: frame size and the frame count reported by the container.
source_width, source_height, frames = (
    int(cap.get(prop))
    for prop in (
        cv2.CAP_PROP_FRAME_WIDTH,
        cv2.CAP_PROP_FRAME_HEIGHT,
        cv2.CAP_PROP_FRAME_COUNT,
    )
)
# Keep the frame rate as a float: truncating a fractional rate (e.g. 29.97 -> 29)
# would make the written videos play back at the wrong speed.
fps = cap.get(cv2.CAP_PROP_FPS)

# Resolution at which frames are passed to the depth model and to YOLO.
internal_width = 640
internal_height = 384

# The annotated result video keeps the resolution of the source video.
output_width = source_width
output_height = source_height

os.makedirs("./output", exist_ok=True)

fourcc = cv2.VideoWriter_fourcc(*"MJPG")

# Depth map after rescaling by the anchor object.
depth_view = cv2.VideoWriter(
    "./output/depth.avi",
    fourcc,
    fps,
    (internal_width, internal_height),
)

# Depth map exactly as stored in the depth cache.
depth_view_raw = cv2.VideoWriter(
    "./output/depth_raw.avi",
    fourcc,
    fps,
    (internal_width, internal_height),
)

# Source frames with the per-object annotations drawn on top.
out = cv2.VideoWriter(
    "./output/result.avi",
    fourcc,
    fps,
    (output_width, output_height),
)

# Sliding windows (10 samples each) of per-track centroid positions and of
# per-track frame-to-frame centroid displacements.
pix_positions_buffer = AvgBuffer(10)
pix_speed_buffer = AvgBuffer(10)
# Last seen centroid of every track id.
prev_positions = {}


# True when no depth maps are cached yet, i.e. they must be computed in this run.
depth_cache_reset = len(depth_cache) == 0

# Track id of the calibration anchor; stays None until the anchor object is found.
anchor_track_id = None


def mono_to_grayscale(image):
    """Turn a single-channel image into a 3-channel ``uint8`` image.

    The input is replicated into three identical channels along a new last
    axis. Values are saturated to the ``uint8`` range before the cast, so
    anything above 255 becomes 255 and anything below 0 becomes 0 instead of
    wrapping around; NaN maps to 0. In-range values are truncated towards zero.

    Args:
        image: Array-like of real numbers, e.g. a ``(height, width)`` depth map.

    Returns:
        ``uint8`` array of shape ``image.shape + (3,)``.
    """
    mono = np.nan_to_num(
        np.asarray(image, dtype=np.float64), nan=0.0, posinf=255.0, neginf=0.0
    )
    mono = np.clip(mono, 0, 255)
    return np.repeat(mono[..., np.newaxis], 3, axis=-1).astype("uint8")


def _mean_masked_depth(mask, depth_map, erosion_kernel):
    """Average ``depth_map`` over the eroded ``mask``.

    Returns ``None`` when erosion leaves no pixels, so callers can skip the
    object instead of dividing by zero.
    """
    eroded_mask = cv2.erode(np.array(mask), erosion_kernel)
    occupied_pixels = np.count_nonzero(eroded_mask)
    if occupied_pixels == 0:
        return None
    return np.sum(np.where(eroded_mask, depth_map, 0)) / occupied_pixels


# Masks are shrunk before sampling depth so that pixels on the object border
# do not contribute to the average.
erosion_kernel = np.ones((6, 6), np.uint8)

try:
    for i in tqdm(range(frames)):
        ret, source = cap.read()
        if not ret:
            print(
                "Video frame is empty or video processing has been successfully completed."
            )
            break

        scaled_image = Image.fromarray(source).resize((internal_width, internal_height))
        # Same channel order as the OpenCV frame (BGR); this is what YOLO receives.
        im0 = np.array(scaled_image)

        # Compute the depth map only for frames missing from the cache, so that
        # a cache left incomplete by an interrupted run is topped up rather
        # than indexed out of range.
        if i >= len(depth_cache):
            # infer_pil takes a PIL image, so the BGR frame is converted to RGB first.
            rgb_image = Image.fromarray(cv2.cvtColor(im0, cv2.COLOR_BGR2RGB))
            depth_cache.append(zoe.infer_pil(rgb_image))

        depth_raw = depth = depth_cache[i]

        annotator = Annotator(source, line_width=2)

        results = yolo.track(im0, persist=True)
        detections = results[0]

        if detections.boxes.id is not None and detections.masks is not None:
            contours = detections.masks.xy
            mask_tensors = detections.masks.data
            track_ids = detections.boxes.id.int().cpu().tolist()

            # Pass 1: update the per-track centroid and displacement histories.
            for contour, track_id in zip(contours, track_ids):
                if len(contour) == 0:
                    # No outline points: the centroid would be NaN and would
                    # poison the averaging buffers.
                    continue

                centroid = np.mean(contour, axis=0)
                pix_positions_buffer.put(track_id, centroid)

                if track_id in prev_positions:
                    # NOTE(review): a per-frame displacement divided by fps is
                    # not pixels/second (that would be displacement * fps).
                    # Kept as-is because the displayed value is uncalibrated;
                    # confirm the intended unit.
                    displacement = (centroid - prev_positions[track_id]) / fps
                    pix_speed_buffer.put(track_id, displacement)
                prev_positions[track_id] = centroid

            # Pass 2: derive the depth scale from the anchor object, if it is
            # present in this frame and has a usable depth sample.
            anchor_norm = 1

            if anchor_track_id is not None:
                for mask, track_id in zip(mask_tensors, track_ids):
                    if track_id != anchor_track_id:
                        continue

                    anchor_depth = _mean_masked_depth(mask.cpu(), depth, erosion_kernel)
                    if anchor_depth is not None and anchor_depth > 0:
                        anchor_norm = anchor_true_dist / anchor_depth

            depth = depth * anchor_norm

            # Pass 3: pick the anchor if still unknown, then annotate every
            # track with a full displacement window.
            for contour, mask, track_id, kls in zip(
                contours, mask_tensors, track_ids, detections.boxes.cls
            ):
                mask = mask.cpu()

                if (
                    anchor_track_id is None
                    and mask[anchor_coords[1]][anchor_coords[0]]
                    and yolo.names[int(kls)] == anchor_class
                ):
                    anchor_track_id = track_id
                    print(f"found anchor ({track_id})")

                if len(contour) == 0 or not pix_speed_buffer.ready(track_id):
                    continue

                avg_depth = _mean_masked_depth(mask, depth, erosion_kernel)
                if avg_depth is None or not np.isfinite(avg_depth):
                    # The mask vanished under erosion or the depth is invalid:
                    # there is nothing meaningful to label.
                    continue

                speed_str = fmt_number(
                    np.linalg.norm(pix_speed_buffer.get(track_id)) * avg_depth
                )

                color = colors(track_id, True)
                txt_color = annotator.get_txt_color(color)

                # Outline points are in internal resolution; rescale them to
                # the resolution of the frame being annotated.
                ann_mask = contour.copy()
                ann_mask[:, 0] *= output_width / internal_width
                ann_mask[:, 1] *= output_height / internal_height

                annotator.seg_bbox(
                    mask=ann_mask,
                    mask_color=color,
                    label=speed_str,
                    txt_color=txt_color,
                )

        # Relies on the annotator having drawn onto `source` itself.
        out.write(source)

        depth_view_raw.write(mono_to_grayscale(depth_raw * 10))
        depth_view.write(mono_to_grayscale(depth * 10))

        if cv2.waitKey(1) & 0xFF == ord("q"):
            break
finally:
    # Release every writer (not only `out`) so all three videos are finalised,
    # even when the loop is interrupted by an exception.
    out.release()
    depth_view.release()
    depth_view_raw.release()
    cap.release()
    cv2.destroyAllWindows()

  0%|          | 0/127 [00:00<?, ?it/s]

found anchor (2)
