In [1]:
import cv2
from PIL import Image
from skimage.feature import hog
from skimage.color import rgb2gray
from skimage.filters import threshold_otsu
from torchvision.transforms import functional as F
from torchvision.transforms import ToTensor
from tqdm import tqdm
from ultralytics import YOLO

# Load the video
video_path = "videoplayback.mp4"  # Update with your video file path
cap = cv2.VideoCapture(video_path)

# Load the YOLOv8 model
model = YOLO("best.pt")  # Update with your YOLOv8 model weights file path

# Specify the target class (siren light)
target_class = "Siren-Light"

# Resolve the index of the target class from the model's own class table
# instead of hard-coding it. `model.names` maps class index -> class name;
# a plain sequence of names is normalised to the same mapping first.
_class_names = model.names
if isinstance(_class_names, (list, tuple)):
    _class_names = dict(enumerate(_class_names))

# Fall back to index 0 (the previous hard-coded value) when the name is not
# present in the weights, so single-class models keep working unchanged.
target_class_index = next(
    (index for index, name in _class_names.items() if name == target_class),
    0,
)

# Define the HOG parameters
orientations = 9
pixels_per_cell = (8, 8)
cells_per_block = (2, 2)

# Run detection frame by frame until the video ends or "q" is pressed.
# The try/finally guarantees that the capture handle and the preview window
# are released even if the loop is interrupted (e.g. by KeyboardInterrupt).
try:
    while cap.isOpened():
        ret, frame = cap.read()

        if not ret:
            break

        frame_height, frame_width = frame.shape[:2]

        # Convert the BGR OpenCV frame to an RGB PIL Image for the model
        frame_pil = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

        # Perform object detection on the frame using the YOLOv8 model
        results = model.predict(frame_pil)

        # A single image was passed in, so its result is at index 0
        bboxes = results[0].boxes

        for bbox in bboxes:
            # Keep only detections that belong to the target class
            if int(bbox.cls[0]) != target_class_index:
                continue

            confidence = float(bbox.conf[0])

            # Convert the box corners to ints once and clamp them to the
            # frame: a negative index would make the slice below wrap around
            # and an out-of-range box would yield an empty crop.
            x1, y1, x2, y2 = (int(value) for value in bbox.xyxy[0])
            x1, x2 = max(0, x1), min(frame_width, x2)
            y1, y2 = max(0, y1), min(frame_height, y2)
            if x2 <= x1 or y2 <= y1:
                # Degenerate box: nothing to crop, and cvtColor would fail
                continue

            # Extract the siren light region from the frame
            siren_light = frame[y1:y2, x1:x2]

            # Convert the siren light region to grayscale
            siren_light_gray = cv2.cvtColor(siren_light, cv2.COLOR_BGR2GRAY)

            # Decide the state from pixel brightness. Both sides of the
            # comparison must be on the same 0-255 intensity scale, so the
            # mean gray level of the crop is compared with its Otsu threshold.
            # NOTE(review): "mean brighter than Otsu threshold => On" is a
            # rough heuristic; confirm against labelled footage.
            if siren_light_gray.min() == siren_light_gray.max():
                # A flat crop has no bright/dark split to threshold on
                siren_state = "Off"
            else:
                threshold = threshold_otsu(siren_light_gray)
                intensity = siren_light_gray.mean()
                siren_state = "On" if intensity > threshold else "Off"

            # Display the results on the frame
            cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
            cv2.putText(
                frame,
                f"{target_class}: {confidence:.2f}",
                (x1, y1 - 10),
                cv2.FONT_HERSHEY_SIMPLEX,
                0.9,
                (0, 255, 0),
                2,
            )
            cv2.putText(
                frame,
                f"Siren State: {siren_state}",
                (x1, y1 - 40),
                cv2.FONT_HERSHEY_SIMPLEX,
                0.9,
                (0, 255, 0),
                2,
            )

        # Display the frame
        cv2.imshow("Frame", frame)
        if cv2.waitKey(1) & 0xFF == ord("q"):
            break
finally:
    cap.release()
    cv2.destroyAllWindows()


0: 256x416 1 Siren-Light, 71.3ms
Speed: 2.9ms preprocess, 71.3ms inference, 8.5ms postprocess per image at shape (1, 3, 416, 416)

0: 256x416 1 Siren-Light, 63.8ms
Speed: 0.8ms preprocess, 63.8ms inference, 0.5ms postprocess per image at shape (1, 3, 416, 416)

0: 256x416 1 Siren-Light, 47.7ms
Speed: 1.0ms preprocess, 47.7ms inference, 0.4ms postprocess per image at shape (1, 3, 416, 416)

0: 256x416 1 Siren-Light, 45.9ms
Speed: 0.9ms preprocess, 45.9ms inference, 0.5ms postprocess per image at shape (1, 3, 416, 416)

0: 256x416 1 Siren-Light, 46.9ms
Speed: 0.9ms preprocess, 46.9ms inference, 0.5ms postprocess per image at shape (1, 3, 416, 416)

0: 256x416 1 Siren-Light, 50.1ms
Speed: 0.8ms preprocess, 50.1ms inference, 0.5ms postprocess per image at shape (1, 3, 416, 416)

0: 256x416 (no detections), 47.2ms
Speed: 1.0ms preprocess, 47.2ms inference, 0.2ms postprocess per image at shape (1, 3, 416, 416)

0: 256x416 (no detections), 49.3ms
Speed: 0.9ms preprocess, 49.3ms inference, 0.


0: 256x416 1 Siren-Light, 45.2ms
Speed: 0.8ms preprocess, 45.2ms inference, 0.6ms postprocess per image at shape (1, 3, 416, 416)

0: 256x416 (no detections), 46.7ms
Speed: 0.8ms preprocess, 46.7ms inference, 0.2ms postprocess per image at shape (1, 3, 416, 416)

0: 256x416 1 Siren-Light, 47.6ms
Speed: 0.8ms preprocess, 47.6ms inference, 0.4ms postprocess per image at shape (1, 3, 416, 416)

0: 256x416 (no detections), 45.9ms
Speed: 0.7ms preprocess, 45.9ms inference, 0.2ms postprocess per image at shape (1, 3, 416, 416)

0: 256x416 (no detections), 46.2ms
Speed: 0.9ms preprocess, 46.2ms inference, 0.2ms postprocess per image at shape (1, 3, 416, 416)

0: 256x416 2 Siren-Lights, 46.2ms
Speed: 0.7ms preprocess, 46.2ms inference, 0.5ms postprocess per image at shape (1, 3, 416, 416)

0: 256x416 1 Siren-Light, 45.9ms
Speed: 1.0ms preprocess, 45.9ms inference, 0.5ms postprocess per image at shape (1, 3, 416, 416)

0: 256x416 4 Siren-Lights, 45.5ms
Speed: 0.9ms preprocess, 45.5ms inference


0: 256x416 1 Siren-Light, 45.0ms
Speed: 0.9ms preprocess, 45.0ms inference, 0.5ms postprocess per image at shape (1, 3, 416, 416)

0: 256x416 1 Siren-Light, 48.2ms
Speed: 0.8ms preprocess, 48.2ms inference, 0.5ms postprocess per image at shape (1, 3, 416, 416)

0: 256x416 1 Siren-Light, 46.4ms
Speed: 0.7ms preprocess, 46.4ms inference, 0.5ms postprocess per image at shape (1, 3, 416, 416)

0: 256x416 1 Siren-Light, 45.8ms
Speed: 0.9ms preprocess, 45.8ms inference, 0.5ms postprocess per image at shape (1, 3, 416, 416)

0: 256x416 1 Siren-Light, 46.3ms
Speed: 0.8ms preprocess, 46.3ms inference, 0.4ms postprocess per image at shape (1, 3, 416, 416)

0: 256x416 1 Siren-Light, 46.1ms
Speed: 0.9ms preprocess, 46.1ms inference, 0.5ms postprocess per image at shape (1, 3, 416, 416)

0: 256x416 1 Siren-Light, 46.7ms
Speed: 0.8ms preprocess, 46.7ms inference, 0.4ms postprocess per image at shape (1, 3, 416, 416)

0: 256x416 1 Siren-Light, 46.5ms
Speed: 0.8ms preprocess, 46.5ms inference, 0.5ms 

Speed: 1.0ms preprocess, 46.5ms inference, 0.2ms postprocess per image at shape (1, 3, 416, 416)

0: 256x416 (no detections), 47.1ms
Speed: 1.1ms preprocess, 47.1ms inference, 0.2ms postprocess per image at shape (1, 3, 416, 416)

0: 256x416 (no detections), 46.2ms
Speed: 0.9ms preprocess, 46.2ms inference, 0.2ms postprocess per image at shape (1, 3, 416, 416)

0: 256x416 (no detections), 48.2ms
Speed: 0.9ms preprocess, 48.2ms inference, 0.2ms postprocess per image at shape (1, 3, 416, 416)

0: 256x416 (no detections), 45.7ms
Speed: 0.8ms preprocess, 45.7ms inference, 0.2ms postprocess per image at shape (1, 3, 416, 416)

0: 256x416 (no detections), 47.5ms
Speed: 0.8ms preprocess, 47.5ms inference, 0.2ms postprocess per image at shape (1, 3, 416, 416)

0: 256x416 (no detections), 45.3ms
Speed: 0.8ms preprocess, 45.3ms inference, 0.2ms postprocess per image at shape (1, 3, 416, 416)

0: 256x416 1 Siren-Light, 48.8ms
Speed: 0.8ms preprocess, 48.8ms inference, 0.5ms postprocess per image 

KeyboardInterrupt: 