In [4]:
!pip install ultralytics opencv-python -q


In [5]:
!pip install gTTS -q
!apt-get install ffmpeg -q


Reading package lists...
Building dependency tree...
Reading state information...
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 72 not upgraded.


## loading model and video

In [11]:
import cv2
import torch
from torchvision.transforms import Compose, Resize, ToTensor, Normalize

# Load MiDaS model
midas = torch.hub.load("intel-isl/MiDaS", "MiDaS_small")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
midas.to(device)
midas.eval()

# Transformation for MiDaS
midas_transform = Compose([
    Resize((256, 256)),
    ToTensor(),
    Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Load YOLO model
from ultralytics import YOLO
model_yolo = YOLO("yolo11n.pt")

# Open video and prepare for resizing
video_path = "/kaggle/input/test01/test3.mp4"
cap = cv2.VideoCapture(video_path)
if not cap.isOpened():
    raise Exception("Error opening video file")

# Output video properties
frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH) * 0.15)
frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT) * 0.15)
fps = int(cap.get(cv2.CAP_PROP_FPS))
fourcc = cv2.VideoWriter_fourcc(*'XVID')
output_path = "resized_video.avi"
out = cv2.VideoWriter(output_path, fourcc, fps, (frame_width, frame_height))

# Process video: Resize to 15%
while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break

    # Resize the frame
    resized_frame = cv2.resize(frame, (frame_width, frame_height))

    # Write resized frame to output
    out.write(resized_frame)

cap.release()
out.release()
print(f"Resized video saved to {output_path}")


Using cache found in /root/.cache/torch/hub/intel-isl_MiDaS_master


Loading weights:  None


Using cache found in /root/.cache/torch/hub/rwightman_gen-efficientnet-pytorch_master


Resized video saved to resized_video.avi


## functions for audio generation 

In [9]:
import os
from gtts import gTTS

# Initialize variables to track issued instructions
issued_instructions = {}  # Format: {track_id: "last_instruction"}
last_frame_instruction = None  # To track overall frame-level instructions

# Function to play audio instruction
def play_audio(instruction, audio_count):
    tts = gTTS(text=instruction, lang="en")
    audio_file = f"instruction_{audio_count}.mp3"
    tts.save(audio_file)
    os.system(f"mpg123 {audio_file} > /dev/null 2>&1")
    os.remove(audio_file)

# Logic for audio instructions
def generate_audio_instruction(detections, frame_id):
    global last_frame_instruction
    global issued_instructions

    current_frame_instruction = None
    audio_count = frame_id

    # If no detections, say "Path is clear"
    if not detections:
        if last_frame_instruction != "Path is clear":
            play_audio("Path is clear", audio_count)
            last_frame_instruction = "Path is clear"
        return

    # Process detections
    for detection in detections:
        track_id, x1, y1, x2, y2, distance = detection
        centroid_x = (x1 + x2) // 2

        # Determine the relative position of the object
        if centroid_x < frame_width // 3:
            position = "left"
        elif centroid_x > 2 * frame_width // 3:
            position = "right"
        else:
            position = "front"

        # Generate instruction based on distance and position
        if distance < 2:
            instruction = f"Obstacle ahead, stop"
        elif position == "front":
            instruction = f"Person ahead, move {('right' if centroid_x < frame_width // 2 else 'left')}"
        else:
            instruction = f"Person on the {position}, continue walking"

        # Avoid repeating instructions for the same object
        if issued_instructions.get(track_id) != instruction:
            play_audio(instruction, audio_count)
            issued_instructions[track_id] = instruction
            current_frame_instruction = instruction

    # Clear instructions for objects no longer detected
    issued_instructions = {k: v for k, v in issued_instructions.items() if any(d[0] == k for d in detections)}

    # Prevent repeating the same frame-level instruction
    if current_frame_instruction and current_frame_instruction != last_frame_instruction:
        last_frame_instruction = current_frame_instruction


In [13]:
import cv2
import numpy as np

# Define the footpath zone
def get_footpath_zone(width, height):
    """Define a trapezoidal walking path."""
    bottom_left = (width // 10, height)
    bottom_right = (9 * width // 10, height)
    top_left = (2 * width // 10, height // 2)
    top_right = (8 * width // 10, height // 2)
    return np.array([bottom_left, bottom_right, top_right, top_left], np.int32)

# Draw the footpath zone on the frame for visualization
def visualize_footpath_zone(frame, zone_points):
    """Overlay the trapezoidal walking path on the frame."""
    annotated_frame = frame.copy()
    cv2.polylines(annotated_frame, [zone_points], isClosed=True, color=(0, 255, 0), thickness=3)
    return annotated_frame

# Load and process the video
video_path = "/kaggle/input/test01/test3.mp4"
cap = cv2.VideoCapture(video_path)
if not cap.isOpened():
    raise Exception("Error opening video file")

# Resize video parameters
frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH) * 0.15)
frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT) * 0.15)
fps = int(cap.get(cv2.CAP_PROP_FPS))
fourcc = cv2.VideoWriter_fourcc(*'XVID')
output_path = "footpath_zone_video.avi"
out = cv2.VideoWriter(output_path, fourcc, fps, (frame_width, frame_height))

# Define the footpath zone
footpath_zone = get_footpath_zone(frame_width, frame_height)

frame_count = 0  # Initialize frame counter

while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break

    # Resize the frame
    resized_frame = cv2.resize(frame, (frame_width, frame_height))

    # Visualize the footpath zone
    annotated_frame = visualize_footpath_zone(resized_frame, footpath_zone)

    # Save the first frame for inspection
    if frame_count == 0:
        cv2.imwrite("sample_footpath_zone_frame.jpg", annotated_frame)

    # Write the annotated frame to the output
    out.write(annotated_frame)
    frame_count += 1

cap.release()
out.release()
print(f"Footpath zone video saved to {output_path}")


Footpath zone video saved to footpath_zone_video.avi


## checke dthe output video the zone is well built

In [None]:
import math
import cv2
from ultralytics import YOLO
from ultralytics.utils.plotting import Annotator

# Load YOLOv11 model
model = YOLO("yolo11n.pt")  

# Load test video
video_path = "/kaggle/input/test01/test3.mp4"  # Update this with your video file path
cap = cv2.VideoCapture(video_path)

# Get original video properties and resize
original_w, original_h, fps = (int(cap.get(x)) for x in (cv2.CAP_PROP_FRAME_WIDTH, cv2.CAP_PROP_FRAME_HEIGHT, cv2.CAP_PROP_FPS))
w, h = int(original_w * 0.15), int(original_h * 0.15)  # Resized dimensions

# Define output video
output_path = "/kaggle/working/visioneye-distance-calculation-resized.avi"
out = cv2.VideoWriter(output_path, cv2.VideoWriter_fourcc(*"MJPG"), fps, (w, h))

# Parameters
center_point = (0, h)
pixel_per_meter = 10
txt_color, txt_background, bbox_clr = ((0, 0, 0), (255, 255, 255), (255, 0, 255))

while True:
    ret, im0 = cap.read()
    if not ret:
        print("Video frame is empty or video processing has been successfully completed.")
        break

    # Resize the frame to 15% of its original size
    im0 = cv2.resize(im0, (w, h))

    annotator = Annotator(im0, line_width=2)

    # Run YOLO detection and tracking
    results = model.track(im0, persist=True)
    boxes = results[0].boxes.xyxy.cpu()

    if results[0].boxes.id is not None:
        track_ids = results[0].boxes.id.int().cpu().tolist()

        for box, track_id in zip(boxes, track_ids):
            annotator.box_label(box, label=str(track_id), color=bbox_clr)

            # Calculate bounding box centroid
            x1, y1 = int((box[0] + box[2]) // 2), int((box[1] + box[3]) // 2)

            # Calculate distance
            distance = (math.sqrt((x1 - center_point[0]) ** 2 + (y1 - center_point[1]) ** 2)) / pixel_per_meter

            # Annotate distance on the frame
            text_size, _ = cv2.getTextSize(f"Distance: {distance:.2f} m", cv2.FONT_HERSHEY_SIMPLEX, 1.2, 3)
            cv2.rectangle(im0, (x1, y1 - text_size[1] - 10), (x1 + text_size[0] + 10, y1), txt_background, -1)
            cv2.putText(im0, f"Distance: {distance:.2f} m", (x1, y1 - 5), cv2.FONT_HERSHEY_SIMPLEX, 1.2, txt_color, 3)

    # Write the annotated frame to output
    out.write(im0)

# Release resources
out.release()
cap.release()

print(f"Processed video saved to {output_path}")


[31m[1mrequirements:[0m Ultralytics requirement ['lap>=0.5.12'] not found, attempting AutoUpdate...
Collecting lap>=0.5.12
  Downloading lap-0.5.12-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.2 kB)
Downloading lap-0.5.12-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m41.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: lap
Successfully installed lap-0.5.12

[31m[1mrequirements:[0m AutoUpdate success ✅ 7.5s, installed 1 package: ['lap>=0.5.12']
[31m[1mrequirements:[0m ⚠️ [1mRestart runtime or rerun command for updates to take effect[0m


0: 640x384 5 persons, 89.6ms
Speed: 9.0ms preprocess, 89.6ms inference, 314.0ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 5 persons, 8.9ms
Speed: 2.2ms preprocess, 8.9ms inference, 1.

### Updated process_yolo_detections with Track Zone

In [19]:
def process_yolo_detections(frame, model, center_point, pixel_per_meter, txt_color, txt_background, bbox_clr, track_zone):
    """
    Process a frame using the YOLO model, annotate detections, and calculate distances within a track zone.

    Args:
        frame (ndarray): Input video frame.
        model (YOLO): YOLO model instance.
        center_point (tuple): Reference point for distance calculations.
        pixel_per_meter (float): Conversion factor for pixels to meters.
        txt_color (tuple): Text color (B, G, R).
        txt_background (tuple): Background color for text (B, G, R).
        bbox_clr (tuple): Bounding box color (B, G, R).
        track_zone (np.array): Trapezoidal track zone defined by its vertices (x, y).

    Returns:
        ndarray: Annotated frame.
        list: List of detections with distances [(track_id, x1, y1, x2, y2, distance)].
    """
    annotator = Annotator(frame, line_width=2)
    results = model.track(frame, persist=True)
    boxes = results[0].boxes.xyxy.cpu() if results[0].boxes else []
    detections = []

    if results[0].boxes.id is not None:
        track_ids = results[0].boxes.id.int().cpu().tolist()

        for box, track_id in zip(boxes, track_ids):
            x1, y1, x2, y2 = map(int, box)

            # Calculate centroid of the bounding box
            centroid_x, centroid_y = (x1 + x2) // 2, (y1 + y2) // 2

            # Check if the centroid is within the trapezoidal track zone
            if cv2.pointPolygonTest(track_zone, (centroid_x, centroid_y), False) >= 0:
                annotator.box_label(box, label=str(track_id), color=bbox_clr)

                # Calculate distance from the reference point
                distance = (math.sqrt((centroid_x - center_point[0]) ** 2 + (centroid_y - center_point[1]) ** 2)) / pixel_per_meter

                # Annotate distance on the frame
                text_size, _ = cv2.getTextSize(f"Distance: {distance:.2f} m", cv2.FONT_HERSHEY_SIMPLEX, 1.2, 3)
                cv2.rectangle(frame, 
                              (centroid_x, centroid_y - text_size[1] - 10), 
                              (centroid_x + text_size[0] + 10, centroid_y), 
                              txt_background, -1)
                cv2.putText(frame, f"Distance: {distance:.2f} m", 
                            (centroid_x, centroid_y - 5), 
                            cv2.FONT_HERSHEY_SIMPLEX, 1.2, txt_color, 3)

                # Store detection details
                detections.append((track_id, x1, y1, x2, y2, distance))

    # Return the annotated frame and the detections
    return annotator.result(), detections


### Modifications for Displaying Depth Values in the Track Zone


In [None]:
import cv2
import torch
from torchvision.transforms import Compose, Resize, ToTensor, Normalize
from PIL import Image
import math
import numpy as np

# Load and set up the MiDaS model
midas = torch.hub.load("intel-isl/MiDaS", "MiDaS_small")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
midas.to(device)
midas.eval()

# Prepare the transformation for MiDaS
midas_transform = Compose([
    Resize((256, 256)),
    ToTensor(),
    Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Load YOLO model
from ultralytics import YOLO
model_yolo = YOLO("yolo11n.pt")

# Video processing setup
cap = cv2.VideoCapture("/kaggle/input/test01/test3.mp4")
if not cap.isOpened():
    raise Exception("Error opening video file or stream")

# Resize parameters
resize_scale = 0.15
frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH) * resize_scale)
frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT) * resize_scale)
fps = int(cap.get(cv2.CAP_PROP_FPS))
fourcc = cv2.VideoWriter_fourcc(*'XVID')
output_path = "output_with_depth_and_detections_resized.avi"
out = cv2.VideoWriter(output_path, fourcc, fps, (frame_width, frame_height))

# Define the track zone based on resized dimensions
track_zone = (frame_width // 4, frame_height // 2, 3 * frame_width // 4, frame_height)

# Reference point for distance calculation
reference_point = (frame_width // 2, frame_height)
frame_count = 0

# Main loop
while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break

    # Resize the frame
    frame = cv2.resize(frame, (frame_width, frame_height))

    # Extract the track zone region
    x1, y1, x2, y2 = track_zone
    track_frame = frame[y1:y2, x1:x2]

    # Convert track frame to PIL for MiDaS processing
    rgb_track_frame = cv2.cvtColor(track_frame, cv2.COLOR_BGR2RGB)
    pil_track_image = Image.fromarray(rgb_track_frame)
    input_tensor = midas_transform(pil_track_image).unsqueeze(0).to(device)

    with torch.no_grad():
        depth_map = midas(input_tensor).squeeze().cpu().numpy()

    # Resize depth map to match track zone size
    depth_map_resized = cv2.resize(depth_map, (x2 - x1, y2 - y1))
    depth_map_height, depth_map_width = depth_map_resized.shape

    # Convert depth map to a colormap for visualization
    depth_colormap = cv2.applyColorMap(cv2.convertScaleAbs(depth_map_resized, alpha=0.03), cv2.COLORMAP_JET)

    # Overlay depth map onto the track zone region
    combined_track_frame = cv2.addWeighted(track_frame, 0.6, depth_colormap, 0.4, 0)
    frame[y1:y2, x1:x2] = combined_track_frame  # Place the combined track frame back into the original frame

    # Run YOLO on the resized frame
    results = model_yolo.track(frame, persist=True)
    boxes = results[0].boxes.xyxy.cpu() if results[0].boxes else []
    track_ids = results[0].boxes.id.int().cpu().tolist() if results[0].boxes.id is not None else []

    detections = []  # List to store processed detections

    if len(boxes) == 0:
        print("No detections in this frame.")
        out.write(frame)
        continue  # Skip this frame if no detections

    # Process each detected object
    for box, track_id in zip(boxes, track_ids):
        x1_obj, y1_obj, x2_obj, y2_obj = map(int, box)
        centroid_x, centroid_y = (x1_obj + x2_obj) // 2, (y1_obj + y2_obj) // 2

        # Check if object centroid is within the track zone
        if x1 <= centroid_x <= x2 and y1 <= centroid_y <= y2:
            depth_x = centroid_x - x1  # Adjust x for track zone relative position
            depth_y = centroid_y - y1  # Adjust y for track zone relative position

            # Ensure depth coordinates are within bounds
            if 0 <= depth_x < depth_map_width and 0 <= depth_y < depth_map_height:
                depth_value = depth_map_resized[depth_y, depth_x]  # Depth at the centroid
                # Calculate distance from the reference point
                distance = math.sqrt((centroid_x - reference_point[0]) ** 2 + (centroid_y - reference_point[1]) ** 2)

                # Annotate bounding box, depth, and distance
                cv2.rectangle(frame, (x1_obj, y1_obj), (x2_obj, y2_obj), (255, 0, 255), 2)
                cv2.putText(frame,
                            f"ID: {track_id}, Depth: {depth_value:.2f}m, Dist: {distance:.2f}",
                            (x1_obj, y1_obj - 10),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 255), 2)

                # Store detection details
                detections.append((track_id, x1_obj, y1_obj, x2_obj, y2_obj, distance))
            else:
                print(f"Depth coordinates out of bounds: ({depth_x}, {depth_y})")

    # Generate and play audio instructions
    generate_audio_instruction(detections, frame_count)

    # Draw the track zone for visualization
    cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)  # Green rectangle for the track zone

    # Write the annotated frame to the output video
    out.write(frame)
    frame_count += 1

cap.release()
out.release()
print(f"Processed video saved to {output_path}")


Using cache found in /root/.cache/torch/hub/intel-isl_MiDaS_master


Loading weights:  None


Using cache found in /root/.cache/torch/hub/rwightman_gen-efficientnet-pytorch_master



0: 640x384 5 persons, 10.8ms
Speed: 1.8ms preprocess, 10.8ms inference, 1.3ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 5 persons, 11.8ms
Speed: 3.6ms preprocess, 11.8ms inference, 1.6ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 5 persons, 11.4ms
Speed: 2.3ms preprocess, 11.4ms inference, 1.5ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 5 persons, 10.0ms
Speed: 1.9ms preprocess, 10.0ms inference, 1.3ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 5 persons, 9.9ms
Speed: 1.8ms preprocess, 9.9ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 4 persons, 10.2ms
Speed: 1.9ms preprocess, 10.2ms inference, 1.3ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 4 persons, 11.4ms
Speed: 1.8ms preprocess, 11.4ms inference, 1.3ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 5 persons, 10.0ms
Speed: 1.8ms preprocess, 10.0ms inference, 1.3ms postprocess per image at shape (1,

In [23]:
import cv2
import numpy as np
import torch
from torchvision.transforms import Compose, Resize, ToTensor, Normalize
from PIL import Image
import math

# Load MiDaS model
midas = torch.hub.load("intel-isl/MiDaS", "MiDaS_small")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
midas.to(device)
midas.eval()

# Transformation for MiDaS
midas_transform = Compose([
    Resize((256, 256)),
    ToTensor(),
    Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Load YOLO model
from ultralytics import YOLO
model_yolo = YOLO("yolo11n.pt")

# Video input and output
cap = cv2.VideoCapture("/kaggle/input/test01/test3.mp4")
if not cap.isOpened():
    raise Exception("Error opening video file")

# Resize parameters
resize_scale = 0.15
original_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
original_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
frame_width = int(original_width * resize_scale)
frame_height = int(original_height * resize_scale)
fps = int(cap.get(cv2.CAP_PROP_FPS))

# Output video writer
output_path = "output_with_resized_trapezoid_debug.avi"
fourcc = cv2.VideoWriter_fourcc(*'XVID')
out = cv2.VideoWriter(output_path, fourcc, fps, (frame_width, frame_height))

# Define the trapezoid ROI points based on resized dimensions
def define_trapezoid(frame_width, frame_height):
    top_width = int(frame_width * 0.5)
    trapezoid_points = np.array([
        [frame_width // 2 - top_width // 2, 0],  # Top-left
        [frame_width // 2 + top_width // 2, 0],  # Top-right
        [frame_width, frame_height],  # Bottom-right
        [0, frame_height]  # Bottom-left
    ], dtype=np.int32)
    return trapezoid_points

trapezoid_points = define_trapezoid(frame_width, frame_height)

# Process video
while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break

    # Resize frame to 15% of original size
    frame = cv2.resize(frame, (frame_width, frame_height))

    # Process depth estimation with MiDaS
    rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    pil_image = Image.fromarray(rgb_frame)
    input_tensor = midas_transform(pil_image).unsqueeze(0).to(device)

    with torch.no_grad():
        depth_map = midas(input_tensor).squeeze().cpu().numpy()

    # Resize depth map to match frame size
    depth_map_resized = cv2.resize(depth_map, (frame_width, frame_height))

    # Convert depth map to colormap
    depth_colormap = cv2.applyColorMap(cv2.convertScaleAbs(depth_map_resized, alpha=0.03), cv2.COLORMAP_JET)

    # Overlay depth map onto the frame
    combined_frame = cv2.addWeighted(frame, 0.6, depth_colormap, 0.4, 0)

    # Run YOLO on the resized frame
    results = model_yolo.track(frame, persist=True)
    boxes = results[0].boxes.xyxy.cpu() if results[0].boxes else []
    track_ids = results[0].boxes.id.int().cpu().tolist() if results[0].boxes.id is not None else []

    # Annotate detections within the trapezoid ROI
    for box, track_id in zip(boxes, track_ids):
        x1, y1, x2, y2 = map(int, box)
        centroid_x, centroid_y = (x1 + x2) // 2, (y1 + y2) // 2

        # Check if the centroid is inside the trapezoid
        if cv2.pointPolygonTest(trapezoid_points, (centroid_x, centroid_y), False) >= 0:
            distance = math.sqrt((centroid_x - frame_width // 2) ** 2 + (centroid_y - frame_height) ** 2)
            depth_value = depth_map_resized[centroid_y, centroid_x]

            # Annotate bounding box, depth, and distance
            cv2.rectangle(combined_frame, (x1, y1), (x2, y2), (255, 0, 255), 2)
            cv2.putText(combined_frame,
                        f"ID: {track_id}, Depth: {depth_value:.2f}m, Dist: {distance:.2f}",
                        (x1, y1 - 10),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 255), 2)

    # Visualize the trapezoid ROI
    cv2.polylines(combined_frame, [trapezoid_points], isClosed=True, color=(0, 255, 0), thickness=2)

    # Write the annotated frame to the output video
    out.write(combined_frame)

cap.release()
out.release()

print(f"Processed video saved to {output_path}")


Using cache found in /root/.cache/torch/hub/intel-isl_MiDaS_master
Using cache found in /root/.cache/torch/hub/rwightman_gen-efficientnet-pytorch_master


Loading weights:  None

0: 640x384 5 persons, 11.4ms
Speed: 1.9ms preprocess, 11.4ms inference, 1.3ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 5 persons, 9.9ms
Speed: 2.0ms preprocess, 9.9ms inference, 1.4ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 5 persons, 8.2ms
Speed: 2.1ms preprocess, 8.2ms inference, 1.1ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 5 persons, 10.4ms
Speed: 2.0ms preprocess, 10.4ms inference, 1.5ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 5 persons, 8.3ms
Speed: 2.1ms preprocess, 8.3ms inference, 1.1ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 4 persons, 13.0ms
Speed: 2.3ms preprocess, 13.0ms inference, 1.8ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 4 persons, 13.1ms
Speed: 1.8ms preprocess, 13.1ms inference, 1.6ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 6 persons, 12.5ms
Speed: 2.3ms preprocess, 12.5ms inference, 1.7ms postprocess per

## inference on another video

In [None]:
import cv2
import numpy as np
import torch
from torchvision.transforms import Compose, Resize, ToTensor, Normalize
from PIL import Image
import math

# Load MiDaS model
midas = torch.hub.load("intel-isl/MiDaS", "MiDaS_small")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
midas.to(device)
midas.eval()

# Transformation for MiDaS
midas_transform = Compose([
    Resize((256, 256)),
    ToTensor(),
    Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Load YOLO model
from ultralytics import YOLO
model_yolo = YOLO("yolo11n.pt")

# Define the new video path (update this with your new video file path)
video_path = "/kaggle/input/test04/test04.mp4"  # Update this path
cap = cv2.VideoCapture(video_path)
if not cap.isOpened():
    raise Exception("Error opening new video file")

# Resize parameters
resize_scale = 0.15
original_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
original_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
frame_width = int(original_width * resize_scale)
frame_height = int(original_height * resize_scale)
fps = int(cap.get(cv2.CAP_PROP_FPS))

# Output video writer
output_path = "processed_new_video2.avi"
fourcc = cv2.VideoWriter_fourcc(*'XVID')
out = cv2.VideoWriter(output_path, fourcc, fps, (frame_width, frame_height))

# Define the trapezoid ROI points based on resized dimensions
def define_trapezoid(frame_width, frame_height):
    top_width = int(frame_width * 0.5)
    trapezoid_points = np.array([
        [frame_width // 2 - top_width // 2, 0],  # Top-left
        [frame_width // 2 + top_width // 2, 0],  # Top-right
        [frame_width, frame_height],  # Bottom-right
        [0, frame_height]  # Bottom-left
    ], dtype=np.int32)
    return trapezoid_points

trapezoid_points = define_trapezoid(frame_width, frame_height)

# Process video
while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break

    # Resize frame to 15% of original size
    frame = cv2.resize(frame, (frame_width, frame_height))

    # Process depth estimation with MiDaS
    rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    pil_image = Image.fromarray(rgb_frame)
    input_tensor = midas_transform(pil_image).unsqueeze(0).to(device)

    with torch.no_grad():
        depth_map = midas(input_tensor).squeeze().cpu().numpy()

    # Resize depth map to match frame size
    depth_map_resized = cv2.resize(depth_map, (frame_width, frame_height))

    # Convert depth map to colormap
    depth_colormap = cv2.applyColorMap(cv2.convertScaleAbs(depth_map_resized, alpha=0.03), cv2.COLORMAP_JET)

    # Overlay depth map onto the frame
    combined_frame = cv2.addWeighted(frame, 0.6, depth_colormap, 0.4, 0)

    # Run YOLO on the resized frame
    results = model_yolo.track(frame, persist=True)
    boxes = results[0].boxes.xyxy.cpu() if results[0].boxes else []
    track_ids = results[0].boxes.id.int().cpu().tolist() if results[0].boxes.id is not None else []

    # Annotate detections within the trapezoid ROI
    for box, track_id in zip(boxes, track_ids):
        x1, y1, x2, y2 = map(int, box)
        centroid_x, centroid_y = (x1 + x2) // 2, (y1 + y2) // 2

        # Check if the centroid is inside the trapezoid
        if cv2.pointPolygonTest(trapezoid_points, (centroid_x, centroid_y), False) >= 0:
            distance = math.sqrt((centroid_x - frame_width // 2) ** 2 + (centroid_y - frame_height) ** 2)
            depth_value = depth_map_resized[centroid_y, centroid_x]

            # Annotate bounding box, depth, and distance
            cv2.rectangle(combined_frame, (x1, y1), (x2, y2), (255, 0, 255), 2)
            cv2.putText(combined_frame,
                        f"ID: {track_id}, Depth: {depth_value:.2f}m, Dist: {distance:.2f}",
                        (x1, y1 - 10),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 255), 2)

    # Visualize the trapezoid ROI
    cv2.polylines(combined_frame, [trapezoid_points], isClosed=True, color=(0, 255, 0), thickness=2)

    # Write the annotated frame to the output video
    out.write(combined_frame)

cap.release()
out.release()

print(f"Processed video saved to {output_path}")


Using cache found in /root/.cache/torch/hub/intel-isl_MiDaS_master


Loading weights:  None


Using cache found in /root/.cache/torch/hub/rwightman_gen-efficientnet-pytorch_master



0: 384x640 12 persons, 46.7ms
Speed: 2.5ms preprocess, 46.7ms inference, 1.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 12 persons, 8.0ms
Speed: 1.4ms preprocess, 8.0ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 12 persons, 8.3ms
Speed: 1.5ms preprocess, 8.3ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 12 persons, 8.2ms
Speed: 1.3ms preprocess, 8.2ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 12 persons, 7.9ms
Speed: 1.4ms preprocess, 7.9ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 11 persons, 8.1ms
Speed: 1.4ms preprocess, 8.1ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 11 persons, 8.3ms
Speed: 1.4ms preprocess, 8.3ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 11 persons, 7.9ms
Speed: 1.4ms preprocess, 7.9ms inference, 1.2ms postprocess per image at shape (1, 3, 

In [None]:
import cv2
import numpy as np
import torch
from torchvision.transforms import Compose, Resize, ToTensor, Normalize
from PIL import Image
import math

# Load MiDaS model
midas = torch.hub.load("intel-isl/MiDaS", "MiDaS_small")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
midas.to(device)
midas.eval()

# Transformation for MiDaS
midas_transform = Compose([
    Resize((256, 256)),
    ToTensor(),
    Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Load YOLO model
from ultralytics import YOLO
model_yolo = YOLO("yolo11n.pt")

# Video input and output
video_path = "/kaggle/input/test04/test04.mp4"  # Update this path
cap = cv2.VideoCapture(video_path)
if not cap.isOpened():
    raise Exception("Error opening new video file")

# Resize parameters
resize_scale = 0.35
original_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
original_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
frame_width = int(original_width * resize_scale)
frame_height = int(original_height * resize_scale)
fps = int(cap.get(cv2.CAP_PROP_FPS))

# Output video writer
output_path = "processed_new_video_with_fixed_trapezoid.avi"
fourcc = cv2.VideoWriter_fourcc(*'XVID')
out = cv2.VideoWriter(output_path, fourcc, fps, (frame_width, frame_height))

# Define the trapezoid ROI points based on resized dimensions
def define_trapezoid(frame_width, frame_height):
    """
    Define trapezoid to focus on the walking area, up to the middle of the frame.
    """
    top_width = int(frame_width * 0.5)
    bottom_width = frame_width
    middle_height = frame_height // 2  # Limit trapezoid to the middle of the frame

    trapezoid_points = np.array([
        [frame_width // 2 - top_width // 2, middle_height],  # Top-left
        [frame_width // 2 + top_width // 2, middle_height],  # Top-right
        [bottom_width, frame_height],  # Bottom-right
        [0, frame_height]  # Bottom-left
    ], dtype=np.int32)
    return trapezoid_points

trapezoid_points = define_trapezoid(frame_width, frame_height)

# Process video
while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break

    # Resize frame to 15% of original size
    frame = cv2.resize(frame, (frame_width, frame_height))

    # Process depth estimation with MiDaS
    rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    pil_image = Image.fromarray(rgb_frame)
    input_tensor = midas_transform(pil_image).unsqueeze(0).to(device)

    with torch.no_grad():
        depth_map = midas(input_tensor).squeeze().cpu().numpy()

    # Resize depth map to match frame size
    depth_map_resized = cv2.resize(depth_map, (frame_width, frame_height))

    # Convert depth map to colormap
    depth_colormap = cv2.applyColorMap(cv2.convertScaleAbs(depth_map_resized, alpha=0.03), cv2.COLORMAP_JET)

    # Overlay depth map onto the frame
    combined_frame = cv2.addWeighted(frame, 0.6, depth_colormap, 0.4, 0)

    # Run YOLO on the resized frame
    results = model_yolo.track(frame, persist=True)
    boxes = results[0].boxes.xyxy.cpu() if results[0].boxes else []
    track_ids = results[0].boxes.id.int().cpu().tolist() if results[0].boxes.id is not None else []

    # Annotate detections within the trapezoid ROI
    for box, track_id in zip(boxes, track_ids):
        x1, y1, x2, y2 = map(int, box)
        centroid_x, centroid_y = (x1 + x2) // 2, (y1 + y2) // 2

        # Check if the centroid is inside the trapezoid
        if cv2.pointPolygonTest(trapezoid_points, (centroid_x, centroid_y), False) >= 0:
            distance = math.sqrt((centroid_x - frame_width // 2) ** 2 + (centroid_y - frame_height) ** 2)
            depth_value = depth_map_resized[centroid_y, centroid_x]

            # Annotate bounding box, depth, and distance
            cv2.rectangle(combined_frame, (x1, y1), (x2, y2), (255, 0, 255), 2)
            cv2.putText(combined_frame,
                        f"ID: {track_id}, Depth: {depth_value:.2f}m, Dist: {distance:.2f}",
                        (x1, y1 - 10),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 255), 2)

    # Visualize the trapezoid ROI
    cv2.polylines(combined_frame, [trapezoid_points], isClosed=True, color=(0, 255, 0), thickness=2)

    # Write the annotated frame to the output video
    out.write(combined_frame)

cap.release()
out.release()

print(f"Processed video saved to {output_path}")


Using cache found in /root/.cache/torch/hub/intel-isl_MiDaS_master


Loading weights:  None


Using cache found in /root/.cache/torch/hub/rwightman_gen-efficientnet-pytorch_master



0: 384x640 12 persons, 1 dog, 1 chair, 10.4ms
Speed: 1.8ms preprocess, 10.4ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 12 persons, 1 dog, 1 chair, 8.3ms
Speed: 2.0ms preprocess, 8.3ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 12 persons, 1 handbag, 1 chair, 8.4ms
Speed: 2.1ms preprocess, 8.4ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 12 persons, 1 handbag, 1 chair, 8.4ms
Speed: 2.3ms preprocess, 8.4ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 12 persons, 1 handbag, 1 chair, 9.0ms
Speed: 2.1ms preprocess, 9.0ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 11 persons, 1 skateboard, 1 chair, 8.7ms
Speed: 2.1ms preprocess, 8.7ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 11 persons, 1 skateboard, 1 chair, 8.4ms
Speed: 1.8ms preprocess, 8.4ms inference, 1.2ms postprocess per image at s

In [None]:
import cv2
import torch
import numpy as np
from gtts import gTTS
from ultralytics import YOLO
import math
from PIL import Image
from torchvision.transforms import Compose, Resize, ToTensor, Normalize

# Load YOLO model
model = YOLO("yolo11n.pt")

# Load MiDaS depth estimation model
midas = torch.hub.load("intel-isl/MiDaS", "MiDaS_small")
midas.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
midas.to(device)

# MiDaS transformations
midas_transform = Compose([
    Resize((256, 256)),
    ToTensor(),
    Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Video settings
video_path = "/kaggle/input/test01/test3.mp4"  
cap = cv2.VideoCapture(video_path)
scale = 0.35  # Resize factor for computational efficiency
original_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
original_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
frame_width = int(original_width * scale)
frame_height = int(original_height * scale)
fps = int(cap.get(cv2.CAP_PROP_FPS))

# Define the codec and create VideoWriter object
output_filename = "final_processed_video1.mp4"
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter(output_filename, fourcc, fps, (frame_width, frame_height))

# Define trapezoid ROI
def define_trapezoid(frame_width, frame_height):
    top_width = int(frame_width * 0.5)
    trapezoid_points = np.array([
        [frame_width // 2 - top_width // 2, frame_height // 2],  # Top-left
        [frame_width // 2 + top_width // 2, frame_height // 2],  # Top-right
        [frame_width, frame_height],  # Bottom-right
        [0, frame_height]  # Bottom-left
    ], dtype=np.int32)
    return trapezoid_points

trapezoid_points = define_trapezoid(frame_width, frame_height)

# Function to save and play audio instruction
def save_speech(text, filename):
    tts = gTTS(text, lang='en')
    tts.save(filename)
    print(f"Instruction: {text}")

# Process video
last_instruction = ""
while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break

    # Resize frame for processing
    frame = cv2.resize(frame, (frame_width, frame_height))

    # Prepare frame for MiDaS
    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    pil_image = Image.fromarray(frame_rgb)
    input_tensor = midas_transform(pil_image).unsqueeze(0).to(device)

    # Perform depth estimation
    with torch.no_grad():
        depth_map = midas(input_tensor).squeeze().cpu().numpy()

    # Normalize depth map for visualization
    depth_map_resized = cv2.resize(depth_map, (frame_width, frame_height))
    depth_map_colored = cv2.applyColorMap(cv2.convertScaleAbs(depth_map_resized, alpha=0.03), cv2.COLORMAP_JET)

    # Overlay depth map onto the frame
    combined_frame = cv2.addWeighted(frame, 0.6, depth_map_colored, 0.4, 0)

    # Run YOLO detection
    results = model.track(frame, persist=True)
    boxes = results[0].boxes.xyxy.cpu().numpy().astype(int) if results[0].boxes else []
    track_ids = results[0].boxes.id.int().cpu().tolist() if results[0].boxes.id is not None else []

    current_instruction = "Path is clear."
    for box, track_id in zip(boxes, track_ids):
        x1, y1, x2, y2 = box
        centroid_x, centroid_y = (x1 + x2) // 2, (y1 + y2) // 2

        # Ensure the centroid is passed as a tuple of floats
        centroid = (float(centroid_x), float(centroid_y))

        # Check if the object is within the trapezoid ROI
        if cv2.pointPolygonTest(trapezoid_points, centroid, False) >= 0:
            depth_value = depth_map_resized[min(centroid_y, frame_height - 1), min(centroid_x, frame_width - 1)]
            distance = math.sqrt((centroid_x - frame_width // 2) ** 2 + (centroid_y - frame_height) ** 2)

            # Annotate bounding box and depth
            cv2.rectangle(combined_frame, (x1, y1), (x2, y2), (255, 0, 255), 2)
            cv2.putText(combined_frame, f"ID: {track_id}, Depth: {depth_value:.2f}m", (x1, y1 - 10),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 255), 2)

            # Update instructions based on depth
            if depth_value < 0.5:
                current_instruction = "Immediate obstacle nearby, please stop!"
            elif depth_value < 1.5:
                current_instruction = "Obstacle ahead within one meter, please slow down."
            else:
                current_instruction = "Obstacle detected, proceed with caution."

    # Play and update instructions only if it changes
    if current_instruction != last_instruction:
        save_speech(current_instruction, "instruction.mp3")
        last_instruction = current_instruction

    # Visualize trapezoid ROI
    cv2.polylines(combined_frame, [trapezoid_points], isClosed=True, color=(0, 255, 0), thickness=2)

    # Write the annotated frame to the output video
    out.write(combined_frame)

cap.release()
out.release()

print(f"Processed video saved to {output_filename}")


Using cache found in /root/.cache/torch/hub/intel-isl_MiDaS_master


Loading weights:  None


Using cache found in /root/.cache/torch/hub/rwightman_gen-efficientnet-pytorch_master



0: 640x384 4 persons, 11.0ms
Speed: 2.2ms preprocess, 11.0ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 384)
Instruction: Obstacle detected, proceed with caution.

0: 640x384 4 persons, 9.2ms
Speed: 2.8ms preprocess, 9.2ms inference, 1.3ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 4 persons, 8.6ms
Speed: 2.7ms preprocess, 8.6ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 4 persons, 8.5ms
Speed: 2.7ms preprocess, 8.5ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 5 persons, 10.4ms
Speed: 3.0ms preprocess, 10.4ms inference, 1.4ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 5 persons, 10.0ms
Speed: 2.7ms preprocess, 10.0ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 5 persons, 9.2ms
Speed: 2.9ms preprocess, 9.2ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 5 persons, 8.5ms
Speed: 2.8ms preprocess, 8.5ms infer

In [None]:
import cv2
import torch
import numpy as np
from gtts import gTTS
from ultralytics import YOLO
import math
from PIL import Image
from torchvision.transforms import Compose, Resize, ToTensor, Normalize

# Load YOLO model
model = YOLO("yolo11n.pt")

# Load MiDaS depth estimation model
midas = torch.hub.load("intel-isl/MiDaS", "MiDaS_small")
midas.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
midas.to(device)

# MiDaS transformations
midas_transform = Compose([
    Resize((256, 256)),
    ToTensor(),
    Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Video settings
video_path = "/kaggle/input/test03/test03.mp4"  
cap = cv2.VideoCapture(video_path)
scale = 0.35  # Resize factor for computational efficiency
original_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
original_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
frame_width = int(original_width * scale)
frame_height = int(original_height * scale)
fps = int(cap.get(cv2.CAP_PROP_FPS))

# Define the codec and create VideoWriter object
output_filename = "final_processed_video2.mp4"
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter(output_filename, fourcc, fps, (frame_width, frame_height))

# Define trapezoid ROI
def define_trapezoid(frame_width, frame_height):
    top_width = int(frame_width * 0.5)
    trapezoid_points = np.array([
        [frame_width // 2 - top_width // 2, frame_height // 2],  # Top-left
        [frame_width // 2 + top_width // 2, frame_height // 2],  # Top-right
        [frame_width, frame_height],  # Bottom-right
        [0, frame_height]  # Bottom-left
    ], dtype=np.int32)
    return trapezoid_points

trapezoid_points = define_trapezoid(frame_width, frame_height)

# Function to save and play audio instruction
def save_speech(text, filename):
    tts = gTTS(text, lang='en')
    tts.save(filename)
    print(f"Instruction: {text}")

# Process video
last_instruction = ""
while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break

    # Resize frame for processing
    frame = cv2.resize(frame, (frame_width, frame_height))

    # Prepare frame for MiDaS
    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    pil_image = Image.fromarray(frame_rgb)
    input_tensor = midas_transform(pil_image).unsqueeze(0).to(device)

    # Perform depth estimation
    with torch.no_grad():
        depth_map = midas(input_tensor).squeeze().cpu().numpy()

    # Normalize depth map for visualization
    depth_map_resized = cv2.resize(depth_map, (frame_width, frame_height))
    depth_map_colored = cv2.applyColorMap(cv2.convertScaleAbs(depth_map_resized, alpha=0.03), cv2.COLORMAP_JET)

    # Overlay depth map onto the frame
    combined_frame = cv2.addWeighted(frame, 0.6, depth_map_colored, 0.4, 0)

    # Run YOLO detection
    results = model.track(frame, persist=True)
    boxes = results[0].boxes.xyxy.cpu().numpy().astype(int) if results[0].boxes else []
    track_ids = results[0].boxes.id.int().cpu().tolist() if results[0].boxes.id is not None else []

    current_instruction = "Path is clear."
    for box, track_id in zip(boxes, track_ids):
        x1, y1, x2, y2 = box
        centroid_x, centroid_y = (x1 + x2) // 2, (y1 + y2) // 2

        # Ensure the centroid is passed as a tuple of floats
        centroid = (float(centroid_x), float(centroid_y))

        # Check if the object is within the trapezoid ROI
        if cv2.pointPolygonTest(trapezoid_points, centroid, False) >= 0:
            depth_value = depth_map_resized[min(centroid_y, frame_height - 1), min(centroid_x, frame_width - 1)]
            distance = math.sqrt((centroid_x - frame_width // 2) ** 2 + (centroid_y - frame_height) ** 2)

            # Annotate bounding box and depth
            cv2.rectangle(combined_frame, (x1, y1), (x2, y2), (255, 0, 255), 2)
            cv2.putText(combined_frame, f"ID: {track_id}, Depth: {depth_value:.2f}m", (x1, y1 - 10),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 255), 2)

            # Update instructions based on depth
            if depth_value < 0.5:
                current_instruction = "Immediate obstacle nearby, please stop!"
            elif depth_value < 1.5:
                current_instruction = "Obstacle ahead within one meter, please slow down."
            else:
                current_instruction = "Obstacle detected, proceed with caution."

    # Play and update instructions only if it changes
    if current_instruction != last_instruction:
        save_speech(current_instruction, "instruction.mp3")
        last_instruction = current_instruction

    # Visualize trapezoid ROI
    cv2.polylines(combined_frame, [trapezoid_points], isClosed=True, color=(0, 255, 0), thickness=2)

    # Write the annotated frame to the output video
    out.write(combined_frame)

cap.release()
out.release()

print(f"Processed video saved to {output_filename}")


Using cache found in /root/.cache/torch/hub/intel-isl_MiDaS_master


Loading weights:  None


Using cache found in /root/.cache/torch/hub/rwightman_gen-efficientnet-pytorch_master



0: 384x640 12 persons, 1 dog, 1 chair, 11.6ms
Speed: 1.8ms preprocess, 11.6ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)
Instruction: Obstacle detected, proceed with caution.

0: 384x640 12 persons, 1 dog, 1 chair, 10.4ms
Speed: 2.1ms preprocess, 10.4ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 12 persons, 1 handbag, 1 chair, 8.5ms
Speed: 1.7ms preprocess, 8.5ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 12 persons, 1 handbag, 1 chair, 8.6ms
Speed: 1.7ms preprocess, 8.6ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 12 persons, 1 handbag, 1 chair, 8.0ms
Speed: 2.1ms preprocess, 8.0ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 11 persons, 1 skateboard, 1 chair, 8.3ms
Speed: 1.6ms preprocess, 8.3ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 11 persons, 1 skateboard, 1 chair, 8.1ms
Speed: 1.6ms prepr

In [33]:
import cv2
import torch
import numpy as np
from gtts import gTTS
from ultralytics import YOLO
import math
from PIL import Image
from torchvision.transforms import Compose, Resize, ToTensor, Normalize

# Load YOLO model
model = YOLO("yolo11n.pt")

# Load MiDaS depth estimation model
midas = torch.hub.load("intel-isl/MiDaS", "MiDaS_small")
midas.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
midas.to(device)

# MiDaS transformations
midas_transform = Compose([
    Resize((256, 256)),
    ToTensor(),
    Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Video settings
video_path = "/kaggle/input/test05/test02.mp4"  
cap = cv2.VideoCapture(video_path)
scale = 0.35  # Resize factor for computational efficiency
original_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
original_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
frame_width = int(original_width * scale)
frame_height = int(original_height * scale)
fps = int(cap.get(cv2.CAP_PROP_FPS))

# Define the codec and create VideoWriter object
output_filename = "final_processed_video3.mp4"
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter(output_filename, fourcc, fps, (frame_width, frame_height))

# Define trapezoid ROI
def define_trapezoid(frame_width, frame_height):
    top_width = int(frame_width * 0.5)
    trapezoid_points = np.array([
        [frame_width // 2 - top_width // 2, frame_height // 2],  # Top-left
        [frame_width // 2 + top_width // 2, frame_height // 2],  # Top-right
        [frame_width, frame_height],  # Bottom-right
        [0, frame_height]  # Bottom-left
    ], dtype=np.int32)
    return trapezoid_points

trapezoid_points = define_trapezoid(frame_width, frame_height)

# Function to save and play audio instruction
def save_speech(text, filename):
    tts = gTTS(text, lang='en')
    tts.save(filename)
    print(f"Instruction: {text}")

# Process video
last_instruction = ""
while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break

    # Resize frame for processing
    frame = cv2.resize(frame, (frame_width, frame_height))

    # Prepare frame for MiDaS
    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    pil_image = Image.fromarray(frame_rgb)
    input_tensor = midas_transform(pil_image).unsqueeze(0).to(device)

    # Perform depth estimation
    with torch.no_grad():
        depth_map = midas(input_tensor).squeeze().cpu().numpy()

    # Normalize depth map for visualization
    depth_map_resized = cv2.resize(depth_map, (frame_width, frame_height))
    depth_map_colored = cv2.applyColorMap(cv2.convertScaleAbs(depth_map_resized, alpha=0.03), cv2.COLORMAP_JET)

    # Overlay depth map onto the frame
    combined_frame = cv2.addWeighted(frame, 0.6, depth_map_colored, 0.4, 0)

    # Run YOLO detection
    results = model.track(frame, persist=True)
    boxes = results[0].boxes.xyxy.cpu().numpy().astype(int) if results[0].boxes else []
    track_ids = results[0].boxes.id.int().cpu().tolist() if results[0].boxes.id is not None else []

    current_instruction = "Path is clear."
    for box, track_id in zip(boxes, track_ids):
        x1, y1, x2, y2 = box
        centroid_x, centroid_y = (x1 + x2) // 2, (y1 + y2) // 2

        # Ensure the centroid is passed as a tuple of floats
        centroid = (float(centroid_x), float(centroid_y))

        # Check if the object is within the trapezoid ROI
        if cv2.pointPolygonTest(trapezoid_points, centroid, False) >= 0:
            depth_value = depth_map_resized[min(centroid_y, frame_height - 1), min(centroid_x, frame_width - 1)]
            distance = math.sqrt((centroid_x - frame_width // 2) ** 2 + (centroid_y - frame_height) ** 2)

            # Annotate bounding box and depth
            cv2.rectangle(combined_frame, (x1, y1), (x2, y2), (255, 0, 255), 2)
            cv2.putText(combined_frame, f"ID: {track_id}, Depth: {depth_value:.2f}m", (x1, y1 - 10),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 255), 2)

            # Update instructions based on depth
            if depth_value < 0.5:
                current_instruction = "Immediate obstacle nearby, please stop!"
            elif depth_value < 1.5:
                current_instruction = "Obstacle ahead within one meter, please slow down."
            else:
                current_instruction = "Obstacle detected, proceed with caution."

    # Play and update instructions only if it changes
    if current_instruction != last_instruction:
        save_speech(current_instruction, "instruction.mp3")
        last_instruction = current_instruction

    # Visualize trapezoid ROI
    cv2.polylines(combined_frame, [trapezoid_points], isClosed=True, color=(0, 255, 0), thickness=2)

    # Write the annotated frame to the output video
    out.write(combined_frame)

cap.release()
out.release()

print(f"Processed video saved to {output_filename}")


Using cache found in /root/.cache/torch/hub/intel-isl_MiDaS_master
Using cache found in /root/.cache/torch/hub/rwightman_gen-efficientnet-pytorch_master


Loading weights:  None

0: 384x640 11 persons, 1 handbag, 11.8ms
Speed: 1.7ms preprocess, 11.8ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)
Instruction: Obstacle detected, proceed with caution.

0: 384x640 11 persons, 1 handbag, 8.4ms
Speed: 2.1ms preprocess, 8.4ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 10 persons, 1 handbag, 8.3ms
Speed: 1.6ms preprocess, 8.3ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 11 persons, 1 handbag, 8.2ms
Speed: 1.6ms preprocess, 8.2ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 12 persons, 1 handbag, 8.1ms
Speed: 1.5ms preprocess, 8.1ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 11 persons, 1 handbag, 8.4ms
Speed: 1.7ms preprocess, 8.4ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 11 persons, 1 handbag, 8.1ms
Speed: 1.7ms preprocess, 8.1ms inference, 1.2ms postproces

### Wrapping up

This project combines YOLOv11 for object detection and MiDaS for depth estimation to assist pedestrians with real-time audio navigation and obstacle detection.

### Key Features:
- Trapezoid ROI:

Focuses processing on the walking area for efficiency and relevance.

- Object Detection & Depth Estimation:

Detects and tracks objects within the walking zone.
Estimates depth to calculate distances and identify potential obstacles.

- Interactive Audio Instructions:

Provides real-time feedback, e.g., "Path is clear," "Move left," or "Stop."
Plans to include an interactive assistant for user queries.

= Future Enhancements:

Google Maps Navigation: Integration for dynamic path guidance.
Heygen Integration: Exploring its potential to enhance interactivity.
Output:

= Annotated video with bounding boxes, depth, and walking area overlays.
= Audio instructions saved as MP3 files for playback.
This system offers a robust foundation for pedestrian assistance and is set to become a more interactive, AI-driven tool with the planned additions of audio interactivity and navigation features. 🚀