In [1]:
import sys

import cv2 as cv
import numpy as np

from motion_detector.motion_detector import mog2_movement_detection
from people_detector.people_detector import PeopleDetector
from utils.bbox_utils import crop_bboxes
from utils.view import view

## PROCESS VIDEO
*Here happens the magick*


In [2]:
def process_video(video_path: str, people_detector: PeopleDetector, scale: float = 0.5, overlap_threshold: float = 0.3, area_threshold: int = 100, early_stop=None, starts_from=0):
    """
    Processes the input video.
    Parameters:
        video_path: Path to the video file.
        scale: Scaling factor for resizing frames.
        overlap_threshold: Threshold for merging overlapping detections using IoU.
        area_threshold: Minimum area for detected bounding boxes.
        people_detector: YOLO model to detect people.
        early_stop: stop after n frames
    """
    capture = cv.VideoCapture(video_path)
    back_sub = cv.createBackgroundSubtractorMOG2(detectShadows=False)
    iterations = 0
    while early_stop is None or iterations < early_stop:
        iterations += 1
        ret, frame = capture.read()
        if iterations <= starts_from:
            print("skipping frame {}".format(iterations))
            continue
        if not ret:
            break

        # frame = cv.resize(frame, None, fx=scale, fy=scale, interpolation=cv.INTER_LINEAR)
        frame = cv.resize(frame, None, fx=scale, fy=scale, interpolation=cv.INTER_AREA)

        detections, merged_detections, frame = mog2_movement_detection(frame, background_subtractor=back_sub, area_threshold=area_threshold,
                                overlap_threshold=overlap_threshold, draw=False)

        if not merged_detections:
            continue

        # drop area column
        # cropped_frames = crop_bboxes(frame, bboxes=[md[:-1] for md in merged_detections])
        merged_detections = np.array(merged_detections)
        cropped_frames = crop_bboxes(frame, bboxes=merged_detections[:, :-1]) # a-la numpy

        # detect people on cropped frames

        # probs, bboxes, result = people_detector.detect(frame)
        # annotated_frame = result.plot()
        # view(annotated_frame)

        detections = people_detector.detect_on_frames(cropped_frames)
        for detection in detections:
            probs, _, result = detection
            print(probs)
            if len(probs) == 0:
                continue
            annotated_frame = result.plot()
            # Image.fromarray(annotated_frame).save('./runs/detect/{}.png'.format(iterations))

            view(annotated_frame)

        # for frame in cropped_frames:
        #     stop = view(frame, scale=scale)
        #     if not stop:
        #         break
    else:
        print(f"Stopped after {iterations} frames due early stopping condition.")

    capture.release()
    cv.destroyAllWindows()

    if sys.platform == 'darwin':
        for _ in range(4):
            cv.waitKey(1)

    cv.destroyAllWindows()



## Main

In [3]:
yolosize = 'x'
yolo11 = PeopleDetector(f"yolo11{yolosize}.pt", verbose=False,)
yolo11.to('cpu')

# probs, bboxes, _ = yolo11.detect("The-Office-HD-Background.jpg")
# print('confidence scores', probs)
# print('bboxes', bboxes)

PeopleDetector(
  (model): DetectionModel(
    (model): Sequential(
      (0): Conv(
        (conv): Conv2d(3, 96, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(96, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
        (act): SiLU(inplace=True)
      )
      (1): Conv(
        (conv): Conv2d(96, 192, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(192, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
        (act): SiLU(inplace=True)
      )
      (2): C3k2(
        (cv1): Conv(
          (conv): Conv2d(192, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn): BatchNorm2d(192, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
          (act): SiLU(inplace=True)
        )
        (cv2): Conv(
          (conv): Conv2d(384, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn): BatchNorm2d(384, eps=0.001, momentum=0.03, affine=Tr

In [4]:
video_path = 'SamsungGear360.mp4'
process_video(video_path, yolo11, scale=0.5, overlap_threshold=0.0005, area_threshold=700) # , starts_from=3000, early_stop=3500)


0: 320x640 (no detections), 440.8ms
Speed: 2.2ms preprocess, 440.8ms inference, 1.6ms postprocess per image at shape (1, 3, 320, 640)
[]

0: 640x32 (no detections), 99.8ms
Speed: 0.5ms preprocess, 99.8ms inference, 0.3ms postprocess per image at shape (1, 3, 640, 32)
[]

0: 640x32 (no detections), 91.1ms
Speed: 0.6ms preprocess, 91.1ms inference, 0.2ms postprocess per image at shape (1, 3, 640, 32)
[]

0: 640x64 (no detections), 130.0ms
Speed: 0.4ms preprocess, 130.0ms inference, 0.3ms postprocess per image at shape (1, 3, 640, 64)
[]

0: 640x64 (no detections), 115.0ms
Speed: 0.4ms preprocess, 115.0ms inference, 0.3ms postprocess per image at shape (1, 3, 640, 64)
[]

0: 640x96 (no detections), 138.2ms
Speed: 0.4ms preprocess, 138.2ms inference, 0.3ms postprocess per image at shape (1, 3, 640, 96)
[]

0: 640x128 (no detections), 175.9ms
Speed: 0.5ms preprocess, 175.9ms inference, 0.4ms postprocess per image at shape (1, 3, 640, 128)
[]

0: 640x128 (no detections), 178.4ms
Speed: 0.6m

2025-01-23 16:43:43.255 Python[60740:1175035] +[IMKClient subclass]: chose IMKClient_Modern
2025-01-23 16:43:43.255 Python[60740:1175035] +[IMKInputSession subclass]: chose IMKInputSession_Modern


0: 640x256 1 person, 353.2ms
Speed: 0.7ms preprocess, 353.2ms inference, 0.9ms postprocess per image at shape (1, 3, 640, 256)
[    0.88575]

0: 640x256 1 person, 354.4ms
Speed: 1.5ms preprocess, 354.4ms inference, 0.8ms postprocess per image at shape (1, 3, 640, 256)

0: 640x352 (no detections), 430.6ms
Speed: 1.0ms preprocess, 430.6ms inference, 0.8ms postprocess per image at shape (1, 3, 640, 352)
[    0.88251]
[]

0: 640x256 1 person, 324.7ms
Speed: 0.7ms preprocess, 324.7ms inference, 0.7ms postprocess per image at shape (1, 3, 640, 256)

0: 256x640 (no detections), 306.7ms
Speed: 0.8ms preprocess, 306.7ms inference, 0.6ms postprocess per image at shape (1, 3, 256, 640)

0: 640x416 (no detections), 493.4ms
Speed: 0.9ms preprocess, 493.4ms inference, 0.9ms postprocess per image at shape (1, 3, 640, 416)

0: 640x288 (no detections), 362.3ms
Speed: 0.8ms preprocess, 362.3ms inference, 0.8ms postprocess per image at shape (1, 3, 640, 288)
[    0.88949]
[]
[]
[]

0: 640x256 1 person, 3

KeyboardInterrupt: 

# People Detection TEST


Define the base functions for people detection. <br/>
It Takes as input an array of frames containing the motion detected by MOG2 above

In [None]:
# imports

In [None]:
# Load a model
model = PeopleDetector("./yolo11x.pt")  # load an official model, or use local path

In [None]:
def test_yolo11(video_path: str, scale: float = 0.5):
    # Open the video file
    cap = cv.VideoCapture(video_path)

    # Loop through the video frames
    while cap.isOpened():
        # Read a frame from the video
        success, frame = cap.read()

        if success:
            # Run YOLO inference on the frame
            # frame = cv.resize(frame, None, fx=0.5, fy=0.5, interpolation=cv.INTER_LINEAR)
            results = model(frame)

            # Visualize the results on the frame
            annotated_frame = results[0].plot()
            annotated_frame = cv.resize(annotated_frame, None, fx=scale, fy=scale, interpolation=cv.INTER_LINEAR)
            # Display the annotated frame
            cv.imshow("YOLO Inference", annotated_frame)

            # Break the loop if 'q' is pressed
            if cv.waitKey(1) & 0xFF == ord("q"):
                break
        else:
            # Break the loop if the end of the video is reached
            break

    # Release the video capture object and close the display window
    cap.release()
    cv.destroyAllWindows()

## Main test YOLO11

In [None]:
test_yolo11("SamsungGear360.mp4", scale=0.25)