In [2]:
# ! pip install opencv-python

In [4]:
# test webcam
import cv2

# Open the default camera (device index 0) and request a capture size.
# The request uses the named property ids instead of the bare numbers 3 and 4.
cap = cv2.VideoCapture(0)
cap.set(cv2.CAP_PROP_FRAME_WIDTH, 1920)
cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 1024)

try:
    if not cap.isOpened():
        raise RuntimeError("Could not open webcam (device index 0).")

    while True:
        ret, img = cap.read()
        # When a grab fails, `ret` is False and `img` is not a valid frame;
        # stop instead of handing it to imshow, which would raise.
        if not ret:
            print("No frame received from the webcam; stopping.")
            break

        cv2.imshow('Webcam', img)

        # Only the low byte of the key code is compared, so 'q' quits the preview.
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Always free the camera and close the preview window, even if the loop
    # raised or the cell was interrupted.
    cap.release()
    cv2.destroyAllWindows()

In [None]:
# ! pip install ultralytics

In [22]:
from ultralytics import YOLO
# Load the detection weights and expose the id -> class-name mapping.
model = YOLO("yolo-Weights/yolov8n.pt")
models_dict = model.names

# Class names as a list, positioned by class id 0 .. len(models_dict) - 1.
model_classes = list(map(models_dict.__getitem__, range(len(models_dict))))

# Trailing expression: the notebook displays the name stored under id 28.
models_dict.get(28)

'suitcase'

In [None]:
# test recognition
from ultralytics import YOLO
import cv2
import math 

# Detect persons in a video and draw a labelled box around each confident hit.

# Open the video file
cap = cv2.VideoCapture("video_store_cam_5min.mp4")

# # start webcam
# cap = cv2.VideoCapture(0)
# cap.set(3, 1920)
# cap.set(4, 1024)

# model
model = YOLO("yolo-Weights/yolov8n.pt")

# Detection filter: keep only class id 0 (the "person" class this cell looks
# for) with a confidence of at least 50 %.
PERSON_CLASS_ID = 0
MIN_CONFIDENCE_PCT = 50

# Drawing settings, defined once instead of once per detected box.
BOX_COLOR = (255, 0, 255)
BOX_THICKNESS = 3
LABEL_FONT = cv2.FONT_HERSHEY_SIMPLEX
LABEL_SCALE = 1
LABEL_COLOR = (255, 0, 0)
LABEL_THICKNESS = 2

try:
    if not cap.isOpened():
        raise RuntimeError("Could not open video source 'video_store_cam_5min.mp4'.")

    while True:
        success, img = cap.read()
        # End of the video (or a failed read): stop before running the model
        # on a missing frame.
        if not success:
            print("Video frame is empty or video processing has been successfully completed.")
            break

        results = model(img, stream=True)

        for r in results:
            for box in r.boxes:
                # class id of this detection
                cls = int(box.cls[0])

                # find only persons
                if cls != PERSON_CLASS_ID:
                    continue

                # confidence as a whole percentage, rounded up
                confidence = math.ceil(box.conf[0] * 100)

                # find only confident person (>= 50%)
                if confidence < MIN_CONFIDENCE_PCT:
                    continue

                # bounding box corners converted to int pixel coordinates
                x1, y1, x2, y2 = (int(v) for v in box.xyxy[0])

                # put box in cam
                cv2.rectangle(img, (x1, y1), (x2, y2), BOX_COLOR, BOX_THICKNESS)

                # Label text comes from this cell's own model, so the cell does
                # not depend on `model_classes` having been built elsewhere.
                cv2.putText(
                    img,
                    f"{model.names[cls]} ({confidence}%)",
                    (x1, y1),
                    LABEL_FONT,
                    LABEL_SCALE,
                    LABEL_COLOR,
                    LABEL_THICKNESS,
                )

        cv2.imshow('Webcam', img)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Release the capture and close the window even on error or interrupt.
    cap.release()
    cv2.destroyAllWindows()

In [24]:
# instance segmentation and tracking 
# from https://github.com/ultralytics/ultralytics/blob/main/examples/object_tracking.ipynb

from collections import defaultdict

import cv2
from ultralytics import YOLO
from ultralytics.utils.plotting import Annotator, colors

# Track persons with a segmentation model, outline each one with its track id,
# show the annotated frames and save them to an .avi file.

# Dictionary to store tracking history with default empty lists
# (created here but not read or written anywhere in this cell).
track_history = defaultdict(lambda: [])

# Load the YOLO model with segmentation capabilities
model = YOLO("yolov8n-seg.pt")
# model = YOLO("yolov8m.pt")

# Open the video file
cap = cv2.VideoCapture("video_store_cam_5min.mp4")

# start webcam
# cap = cv2.VideoCapture(0)
# cap.set(3, 1920)
# cap.set(4, 1024)

if not cap.isOpened():
    raise RuntimeError("Could not open video source 'video_store_cam_5min.mp4'.")

# Retrieve video properties: width, height, and frames per second
w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
# Keep the frame rate as a float: truncating to int would turn e.g. 29.97 into
# 29, and a source that reports no rate into 0.
fps = cap.get(cv2.CAP_PROP_FPS)
if fps <= 0:
    fps = 30.0  # fallback when the source does not report a usable frame rate

# Only class id 0 (persons) is annotated, always in the same colour.
PERSON_CLASS_ID = 0
PERSON_COLOR = (255, 0, 0)

# Initialize video writer to save the output video with the specified properties
out = cv2.VideoWriter("instance-segmentation-object-tracking.avi", cv2.VideoWriter_fourcc(*"MJPG"), fps, (w, h))

try:
    if not out.isOpened():
        raise RuntimeError("Could not open 'instance-segmentation-object-tracking.avi' for writing.")

    while True:
        # Read a frame from the video
        ret, im0 = cap.read()
        if not ret:
            print("Video frame is empty or video processing has been successfully completed.")
            break

        # Create an annotator object to draw on the frame
        annotator = Annotator(im0, line_width=2)

        # Perform object tracking on the current frame; persist=True is passed
        # on every call so the tracker state carries across frames.
        results = model.track(im0, persist=True)
        result = results[0]

        # Check if tracking IDs and masks are present in the results
        if result.boxes.id is not None and result.masks is not None:
            # Extract masks, tracking IDs and class ids (one entry per detection)
            masks = result.masks.xy
            track_ids = result.boxes.id.int().cpu().tolist()
            class_ids = result.boxes.cls.int().cpu().tolist()

            # Outline each person mask and label it with its tracking ID
            for mask, track_id, cls in zip(masks, track_ids, class_ids):
                # find only persons
                if cls != PERSON_CLASS_ID:
                    continue
                annotator.seg_bbox(mask=mask, mask_color=PERSON_COLOR, track_label=str(track_id))

        # Write the annotated frame to the output video
        out.write(im0)
        # Display the annotated frame
        cv2.imshow("instance-segmentation-object-tracking", im0)

        # Exit the loop if 'q' is pressed
        if cv2.waitKey(1) & 0xFF == ord("q"):
            break
finally:
    # Release the video writer and capture objects, and close all OpenCV
    # windows, even if the loop raised or the cell was interrupted.
    out.release()
    cap.release()
    cv2.destroyAllWindows()


0: 384x640 1 person, 12 suitcases, 2 tvs, 5.6ms
Speed: 1.8ms preprocess, 5.6ms inference, 1.8ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 13 suitcases, 2 tvs, 5.5ms
Speed: 2.0ms preprocess, 5.5ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 14 suitcases, 2 tvs, 5.7ms
Speed: 1.4ms preprocess, 5.7ms inference, 1.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 13 suitcases, 2 tvs, 5.6ms
Speed: 1.1ms preprocess, 5.6ms inference, 1.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 12 suitcases, 2 tvs, 5.7ms
Speed: 1.0ms preprocess, 5.7ms inference, 1.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 12 suitcases, 2 tvs, 5.5ms
Speed: 1.0ms preprocess, 5.5ms inference, 2.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 12 suitcases, 2 tvs, 5.7ms
Speed: 1.6ms preprocess, 5.7ms inference, 1.4ms postprocess per image at shape (1