In [54]:
import sys
from typing import Any

import cv2 as cv
import numpy as np

In [55]:

def merge_overlapping_detections(detections: list, overlap_threshold: float = 0.3) -> list:
    """
    Reduces a set of bounding boxes by greedy suppression on Intersection over Union (IoU).

    Boxes are visited from the largest to the smallest area. Each visited box is
    kept, and every not-yet-visited box whose IoU with it is greater than or equal
    to ``overlap_threshold`` is discarded. Kept boxes are returned unchanged; their
    coordinates are not fused with the boxes they suppressed.

    Parameters:
        detections: List of rows whose first four entries are x1, y1, x2, y2.
            Any further entries (e.g. a trailing area) are carried through untouched.
        overlap_threshold: A remaining box survives only if its IoU with the
            current box is strictly below this value.

    Returns:
        list: The kept rows (NumPy arrays), ordered from largest to smallest area.
              An empty input yields an empty list.
    """
    if not detections:
        return []

    boxes = np.array(detections)
    left, top = boxes[:, 0], boxes[:, 1]
    right, bottom = boxes[:, 2], boxes[:, 3]

    # Areas are recomputed from the corners with an inclusive (+1) convention,
    # e.g. the row [1842, 647, 1926, 771, ...] gives 85 * 125 = 10625.
    areas = (right - left + 1) * (bottom - top + 1)

    # Indices of the boxes still to visit, largest area first
    pending = areas.argsort()[::-1]
    kept = []

    while len(pending) > 0:
        current, rest = pending[0], pending[1:]
        kept.append(boxes[current])

        # Corners of the intersection between the current box and each remaining box
        inter_left = np.maximum(left[current], left[rest])
        inter_top = np.maximum(top[current], top[rest])
        inter_right = np.minimum(right[current], right[rest])
        inter_bottom = np.minimum(bottom[current], bottom[rest])

        # Disjoint boxes produce a negative extent, which is clamped to zero
        inter_w = np.maximum(0, inter_right - inter_left + 1)
        inter_h = np.maximum(0, inter_bottom - inter_top + 1)
        inter_area = inter_w * inter_h

        iou = inter_area / (areas[current] + areas[rest] - inter_area)

        # Only boxes that overlap the current one by less than the threshold stay queued
        pending = rest[iou < overlap_threshold]

    return kept

In [56]:
def detect_moving_objects(frame: np.ndarray, background_subtractor: cv.BackgroundSubtractor,
                          area_threshold: int = 100) -> tuple[Any, Any]:
    """
    Finds moving regions in a frame with a background subtractor.

    The subtractor is applied to the frame, contours are extracted from the
    resulting foreground mask, and the bounding rectangle of each contour is
    kept when its area exceeds ``area_threshold``.

    Parameters:
        frame: Current video frame.
        background_subtractor: Background subtractor used for motion detection.
            Calling this function feeds the frame to it via ``apply``.
        area_threshold: A bounding rectangle is kept only if width * height is
            strictly greater than this value.

    Returns:
        A tuple ``(detections, fg_mask)``:
            detections: list of ``[x1, y1, x2, y2, area]`` rows, where
                ``x2 = x1 + width``, ``y2 = y1 + height`` and
                ``area = width * height`` of the bounding rectangle.
            fg_mask: the foreground mask returned by the subtractor.

    Notes:
        - https://medium.com/analytics-vidhya/opencv-findcontours-detailed-guide-692ee19eeb18
    """
    fg_mask = background_subtractor.apply(frame)
    contours, _ = cv.findContours(fg_mask, cv.RETR_EXTERNAL, cv.CHAIN_APPROX_SIMPLE)

    # One (x, y, width, height) rectangle per contour
    rects = (cv.boundingRect(contour) for contour in contours)
    detections = [
        [left, top, left + width, top + height, width * height]
        for left, top, width, height in rects
        if width * height > area_threshold
    ]

    return detections, fg_mask

In [57]:
# utility function
def mog2_movement_detection(frame: np.ndarray, *, background_subtractor: cv.BackgroundSubtractor, area_threshold: int = 100, overlap_threshold = 0.0, draw=False) -> tuple[Any, list, np.ndarray]:
    """
    Detects moving objects in a frame and reduces overlapping boxes.

    Parameters:
        frame: Current video frame. It is drawn on in place when ``draw`` is true.
        background_subtractor: Background subtractor passed to detect_moving_objects.
        area_threshold: Minimum bounding-box area passed to detect_moving_objects.
        overlap_threshold: IoU threshold passed to merge_overlapping_detections.
            NOTE: that function keeps a box only when its IoU is strictly below
            the threshold, so with the default of 0.0 only the largest box is kept.
        draw: When true, a green rectangle (thickness 2) is drawn for every kept box.

    Returns:
        A tuple ``(detections, merged_detections, frame)``: the raw detections,
        the boxes kept after overlap reduction, and the same frame object that
        was passed in.
    """
    raw_boxes, _ = detect_moving_objects(frame, background_subtractor, area_threshold=area_threshold)
    kept_boxes = merge_overlapping_detections(raw_boxes, overlap_threshold)

    if draw:
        # Each row is [x1, y1, x2, y2, area]; the area is not needed for drawing
        for left, top, right, bottom, _ in kept_boxes:
            cv.rectangle(frame, (left, top), (right, bottom), (0, 255, 0), 2)

    return raw_boxes, kept_boxes, frame

In [58]:
# utility crop on BBoxes
def crop_bbox(frame, bbox: tuple[int, int, int, int]) -> np.ndarray:
    """
    Crops the frame to the region described by a bounding box.

    :param frame: frame to crop, indexed as frame[rows, columns].
    :param bbox: corner coordinates (x1, y1, x2, y2). Only the first four
        entries are read, so a detection row that carries a trailing area,
        e.g. [1947, 475, 1954, 698, 1561], is accepted as well.
    :return: the region frame[y1:y2, x1:x2].
    :raises ValueError: if bbox has fewer than four entries.
    """
    # Slicing to four entries lets callers pass [x1, y1, x2, y2, area] rows directly
    x1, y1, x2, y2 = bbox[:4]
    return frame[y1:y2, x1:x2]

def crop_bboxes(frame, bboxes: list[tuple[int, int, int, int]]) -> list[np.ndarray]:
    """
    Crops the frame once per bounding box.

    :param frame: frame to crop.
    :param bboxes: bounding boxes, each passed to crop_bbox.
    :return: the cropped regions, in the same order as bboxes.
    """
    return [crop_bbox(frame, box) for box in bboxes]


In [59]:
def view(frame, *, scale=0.5):
    """
    Shows a resized copy of the frame in the 'Frame' window and polls the keyboard.

    :param frame: frame to display
    :param scale: factor applied to both axes before the frame is shown
    :return: False when Esc, 'q' or 'Q' was pressed (the caller should stop), True otherwise
    """
    resized = cv.resize(frame, None, fx=scale, fy=scale, interpolation=cv.INTER_LINEAR)
    cv.imshow('Frame', resized)

    quit_keys = (27, ord('q'), ord('Q'))  # 27 is the Esc key code
    return cv.waitKey(1) not in quit_keys

In [66]:
def process_video(video_path: str, scale: float = 0.5, overlap_threshold: float = 0.3, area_threshold: int = 100):
    """
    Runs MOG2 motion detection over a video and displays the result frame by frame.

    Each frame is resized by ``scale``, passed through mog2_movement_detection
    (with drawing enabled) and the kept boxes are printed. When at least one box
    was kept, only the crop of the first kept box is displayed; otherwise the
    whole frame is. Playback stops at the end of the video or when view()
    reports a quit key.

    Parameters:
        video_path: Path to the video file.
        scale: Scaling factor for resizing frames. Note that view() applies the
            same factor again to whatever is displayed.
        overlap_threshold: IoU threshold passed on for reducing overlapping detections.
        area_threshold: Minimum area for detected bounding boxes.

    The capture and all OpenCV windows are released even if processing a frame
    raises, so a failure does not leave a window or the video handle open.
    """
    capture = cv.VideoCapture(video_path)

    try:
        back_sub = cv.createBackgroundSubtractorMOG2(detectShadows=False)

        while True:
            ret, frame = capture.read()
            if not ret:
                # End of the video, or the source could not be read
                break

            frame = cv.resize(frame, None, fx=scale, fy=scale, interpolation=cv.INTER_LINEAR)

            _, merged_detections, frame = mog2_movement_detection(
                frame,
                background_subtractor=back_sub,
                area_threshold=area_threshold,
                overlap_threshold=overlap_threshold,
                draw=True,
            )
            print(merged_detections)

            if len(merged_detections) > 0:
                # Rows are [x1, y1, x2, y2, area]; drop the area before cropping
                frame = crop_bbox(frame, bbox=merged_detections[0][:-1].tolist())

            # view() returns False once the user asks to quit
            keep_running = view(frame, scale=scale)
            if not keep_running:
                break
    finally:
        capture.release()
        cv.destroyAllWindows()

        # On macOS the window is only torn down after the event loop is pumped a few times
        if sys.platform == 'darwin':
            for _ in range(4):
                cv.waitKey(1)

        cv.destroyAllWindows()



## Main

In [67]:
# Run motion detection on the sample clip: frames are halved before detection,
# bounding rectangles whose area is not above 700 are ignored, and a smaller box
# is dropped as soon as its IoU with a kept box reaches 0.0005.
video_path = 'SamsungGear360.mp4'
process_video(video_path, scale=0.5, overlap_threshold=0.0005, area_threshold=700)

[array([      0,       0,    2048,    1024, 2097152])]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[array([1947,  475, 1954,  698, 1561])]
[array([1947,  474, 1955,  698, 1792])]
[array([1941,  474, 1956,  699, 3375])]
[array([1940,  474, 1956,  698, 3584])]
[array([1924,  473, 1958,  698, 7650])]
[array([1922,  473, 1957,  698, 7875])]
[array([1921,  472, 1959,  699, 8626])]
[array([1919,  472, 1959,  699, 9080])]
[array([1916,  472, 1959,  700, 9804])]
[array([1918,  471, 1958,  700, 9160])]
[array([1916,  472, 1958,  700, 9576])]
[array([ 1907,   471,  1952,   700, 10305]), array([1963,  635, 1996,  688, 1749])]
[array([ 1903,   471,  1952,   700, 11221]), array([1963,  581, 1996,  688, 3531])]
[array([ 1899,   470,  1952,   700, 12190]), array([1963,  588, 1996,  689, 3333])]
[array([ 1895,   470,  1952,   700, 13110]), array([1963,  585, 1996,  689, 3432])]
[array([ 1893,   470,  1952,   700

# People Detection


Define the base functions for people detection. <br/>
It takes as input an array of frames containing the motion detected by MOG2 above.

In [49]:
# imports
from ultralytics import YOLO
from PIL import Image

In [50]:
# Load the YOLO weights once at module level; test_yolo11 below reads this global.
model = YOLO("./yolo11x.pt")  # load an official model, or use local path

In [51]:
def test_yolo11(video_path: str, scale: float = 0.5):
    """
    Runs the module-level YOLO ``model`` on every frame of a video and shows the annotated frames.

    Parameters:
        video_path: Path to the video file.
        scale: Factor applied to the annotated frame before it is displayed.
            Inference itself runs on the unscaled frame.

    Playback stops at the end of the video or when 'q' is pressed in the window.
    The capture and the display window are released even if inference or
    display raises, so a failure does not leave them open.
    """
    cap = cv.VideoCapture(video_path)

    try:
        while cap.isOpened():
            success, frame = cap.read()
            if not success:
                # End of the video (or a failed read)
                break

            # Run YOLO inference on the frame and render the detections onto it
            results = model(frame)
            annotated_frame = results[0].plot()
            annotated_frame = cv.resize(annotated_frame, None, fx=scale, fy=scale, interpolation=cv.INTER_LINEAR)

            cv.imshow("YOLO Inference", annotated_frame)

            # Mask to the low byte so the comparison with ord("q") is reliable
            if cv.waitKey(1) & 0xFF == ord("q"):
                break
    finally:
        # Release the video capture object and close the display window
        cap.release()
        cv.destroyAllWindows()

## Main test YOLO11

In [52]:
# Preview the YOLO detections on the sample clip; annotated frames are shown at quarter size.
test_yolo11("SamsungGear360.mp4", scale=0.25)


0: 320x640 1 umbrella, 3 chairs, 1 couch, 2 potted plants, 2 tvs, 20.8ms
Speed: 1.2ms preprocess, 20.8ms inference, 0.9ms postprocess per image at shape (1, 3, 320, 640)

0: 320x640 1 umbrella, 3 chairs, 1 couch, 2 potted plants, 2 tvs, 28.4ms
Speed: 1.4ms preprocess, 28.4ms inference, 1.0ms postprocess per image at shape (1, 3, 320, 640)

0: 320x640 1 umbrella, 3 chairs, 1 couch, 2 potted plants, 2 tvs, 20.9ms
Speed: 1.4ms preprocess, 20.9ms inference, 1.5ms postprocess per image at shape (1, 3, 320, 640)

0: 320x640 1 umbrella, 3 chairs, 1 couch, 2 potted plants, 2 tvs, 19.9ms
Speed: 1.3ms preprocess, 19.9ms inference, 0.9ms postprocess per image at shape (1, 3, 320, 640)

0: 320x640 1 umbrella, 3 chairs, 1 couch, 2 potted plants, 2 tvs, 19.9ms
Speed: 1.5ms preprocess, 19.9ms inference, 1.0ms postprocess per image at shape (1, 3, 320, 640)

0: 320x640 1 umbrella, 3 chairs, 1 couch, 2 potted plants, 2 tvs, 19.5ms
Speed: 1.3ms preprocess, 19.5ms inference, 1.2ms postprocess per image 