In [1]:
from ultralytics import YOLO
import cv2
import json
import os

def crop_and_save(image, bbox, save_path):
    """Crop ``image`` to ``bbox`` and write the crop to ``save_path``.

    Args:
        image: Image array indexed as ``image[row, col]`` whose ``shape``
            starts with ``(height, width)``.
        bbox: ``(x1, y1, x2, y2)`` pixel coordinates; ``x2``/``y2`` are
            exclusive slice bounds.
        save_path: Destination file path. Missing parent directories are
            created.

    Returns:
        ``True`` if the crop was written, ``False`` if the bounding box was
        rejected or the write failed.
    """
    x1, y1, x2, y2 = bbox
    height, width = image.shape[:2]

    out_of_bounds = x1 < 0 or y1 < 0 or x2 > width or y2 > height
    # A zero-area or inverted box slices to an empty array, which cannot be
    # encoded as an image, so reject it up front.
    degenerate = x2 <= x1 or y2 <= y1
    if out_of_bounds or degenerate:
        print(f"Invalid bbox: {bbox}, skipping")
        return False

    parent_dir = os.path.dirname(save_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    cropped = image[y1:y2, x1:x2]
    written = bool(cv2.imwrite(save_path, cropped))
    if not written:
        # The result of cv2.imwrite was previously ignored; surface failures.
        print(f"Failed to write crop to {save_path}")
    return written

def detect_objects_and_subobjects(video_path, output_json_path, subobject_classes,
                                  output_dir="../results/"):
    """Detect objects and sub-objects in a video and save outputs in JSON format.

    Every frame is run through the model. Each detection whose label is NOT in
    ``subobject_classes`` is treated as a main object; its region is cropped
    and run through the same model again, and detections in that crop whose
    label IS in ``subobject_classes`` are recorded as its sub-objects.

    Args:
        video_path: Path of the input video.
        output_json_path: Path of the JSON file to write. The file holds a flat
            list (across all frames) of ``{"object", "id", "bbox",
            "subobject"}`` entries.
        subobject_classes: Iterable of labels to treat as sub-objects.
        output_dir: Directory receiving ``annotated_video.mp4`` and the
            sub-object crops. Created if missing.

    Returns:
        None. Prints an error and returns early if the video cannot be opened.

    Notes:
        * Sub-object ``bbox`` values are the coordinates reported for the
          cropped region, i.e. relative to the parent crop rather than to the
          full frame.
        * Crop file names include the parent object's id so that two parents
          with the same label in one frame do not overwrite each other's crops.
    """
    # Load the YOLOv8 model
    model = YOLO("yolov8n.pt")  # Lightweight YOLO model

    # Open the video
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print("Error: Cannot open video.")
        return

    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    json_dir = os.path.dirname(output_json_path)
    if json_dir:
        os.makedirs(json_dir, exist_ok=True)

    # Set membership is O(1); also accepts any iterable of labels.
    subobject_labels = set(subobject_classes)

    # Keep the source frame rate so the annotated video plays at the original
    # speed; fall back to 30 when the container does not report a usable value.
    fps = cap.get(cv2.CAP_PROP_FPS)
    if not fps > 0:
        fps = 30

    # Prepare the output JSON
    hierarchical_output = []
    frame_count = 0

    # The writer is created lazily, once the frame size is known.
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = None

    try:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            frame_count += 1
            print(f"Processing frame {frame_count}...")

            # Detection and crops use an untouched copy; annotations are drawn
            # on `frame` only, so drawn boxes/text never feed back into the
            # model or end up in the saved crops.
            clean_frame = frame.copy()

            # Run object detection
            results = model(clean_frame)

            # Parse object detections
            for result in results:
                for box in result.boxes:
                    x1, y1, x2, y2 = map(int, box.xyxy[0].tolist())
                    label = model.names[int(box.cls[0])]  # Object label
                    conf = float(box.conf[0])  # Confidence score

                    # Only main objects (labels outside the sub-object set)
                    # get an entry of their own.
                    if label in subobject_labels:
                        continue

                    object_entry = {
                        "object": label,
                        "id": len(hierarchical_output) + 1,
                        "bbox": [x1, y1, x2, y2],
                        "subobject": []
                    }

                    # Draw bounding box for the main object
                    cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
                    cv2.putText(frame, f"{label} ({conf:.2f})", (x1, y1 - 10),
                                cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

                    # Negative indices would wrap around when slicing, so clamp
                    # the ROI origin to the frame.
                    rx1, ry1 = max(x1, 0), max(y1, 0)
                    clean_roi = clean_frame[ry1:y2, rx1:x2]

                    # A zero-area ROI has nothing to detect; skip the second pass.
                    if clean_roi.size:
                        # View into the annotated frame: drawing here shows up
                        # in the output video, clipped to the parent box.
                        annotated_roi = frame[ry1:y2, rx1:x2]

                        # Run sub-object detection on the cropped region
                        for sub_result in model(clean_roi):
                            for sub_box in sub_result.boxes:
                                sx1, sy1, sx2, sy2 = map(int, sub_box.xyxy[0].tolist())
                                sub_label = model.names[int(sub_box.cls[0])]
                                sub_conf = float(sub_box.conf[0])

                                # Only include sub-objects of interest
                                if sub_label not in subobject_labels:
                                    continue

                                sub_object_entry = {
                                    "object": sub_label,
                                    "id": len(object_entry["subobject"]) + 1,
                                    "bbox": [sx1, sy1, sx2, sy2]
                                }
                                object_entry["subobject"].append(sub_object_entry)

                                # Draw bounding box for the sub-object
                                cv2.rectangle(annotated_roi, (sx1, sy1), (sx2, sy2), (255, 0, 0), 2)
                                cv2.putText(annotated_roi, f"{sub_label} ({sub_conf:.2f})",
                                            (sx1, sy1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5,
                                            (255, 0, 0), 2)

                                # Save the sub-object crop from the clean pixels.
                                crop_name = (
                                    f"frame{frame_count}_{label}_{object_entry['id']}"
                                    f"_{sub_label}_{sub_object_entry['id']}.jpg"
                                )
                                crop_and_save(clean_roi, [sx1, sy1, sx2, sy2],
                                              os.path.join(output_dir, crop_name))

                    # Append the object entry to the hierarchical output
                    hierarchical_output.append(object_entry)

            # Write the processed frame to the output video
            if out is None:
                height, width = frame.shape[:2]
                out = cv2.VideoWriter(os.path.join(output_dir, "annotated_video.mp4"),
                                      fourcc, fps, (width, height))
            out.write(frame)
    finally:
        # Release the capture and writer even if detection raises mid-video.
        cap.release()
        if out is not None:
            out.release()

    # Save the hierarchical output to a JSON file
    with open(output_json_path, "w") as f:
        json.dump(hierarchical_output, f, indent=4)

    print(f"Detection complete. Results saved to {output_json_path} and annotated video saved.")

if __name__ == "__main__":
    # Labels that should be nested under a parent object instead of getting
    # a top-level entry of their own.
    # NOTE(review): the captured run only reports labels such as "person",
    # "cup" and "laptop"; confirm that "helmet", "tire" and "door" exist in
    # the loaded model's label set, otherwise no sub-object will ever match.
    subobject_classes = ["helmet", "tire", "door"]

    # Where the hierarchical detections are written.
    output_json_path = "../results/detections.json"

    # Video to analyse; point this at your own file.
    video_path = "../data/sample_video.mp4"

    detect_objects_and_subobjects(
        video_path=video_path,
        output_json_path=output_json_path,
        subobject_classes=subobject_classes,
    )


Processing frame 1...

0: 384x640 9 persons, 2 cups, 3 laptops, 1 cell phone, 1 book, 71.2ms
Speed: 7.7ms preprocess, 71.2ms inference, 6.9ms postprocess per image at shape (1, 3, 384, 640)

0: 640x544 2 persons, 1 laptop, 99.4ms
Speed: 2.1ms preprocess, 99.4ms inference, 0.6ms postprocess per image at shape (1, 3, 640, 544)

0: 448x640 1 person, 1 cup, 1 laptop, 77.8ms
Speed: 5.6ms preprocess, 77.8ms inference, 1.0ms postprocess per image at shape (1, 3, 448, 640)

0: 640x576 1 person, 1 cup, 1 book, 300.6ms
Speed: 5.5ms preprocess, 300.6ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 576)

0: 512x640 2 persons, 1 laptop, 154.2ms
Speed: 5.9ms preprocess, 154.2ms inference, 1.0ms postprocess per image at shape (1, 3, 512, 640)

0: 512x640 1 person, 274.4ms
Speed: 4.9ms preprocess, 274.4ms inference, 0.8ms postprocess per image at shape (1, 3, 512, 640)

0: 448x640 1 cup, 1 laptop, 1 keyboard, 387.1ms
Speed: 3.1ms preprocess, 387.1ms inference, 17.5ms postprocess per imag