# In [1]:
import numpy as np
import cv2
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from akida_models.detection.processing import preprocess_image, decode_output
from akida_models import yolo_voc_pretrained

# --- Model -------------------------------------------------------------------
# The pre-trained YOLO loader returns both the model and the anchors that go
# with it, so no hand-written anchor table is needed.
model_akida, anchors = yolo_voc_pretrained()

# Hard-coded anchors kept for reference only (the ones returned above are used):
# anchors = np.array([[1.08, 1.19], [3.42, 4.41], [6.63, 11.38],
#                     [9.42, 5.11], [16.62, 10.52]])

# --- Class names -------------------------------------------------------------
# PASCAL VOC class names; the position in this list is the class index used to
# look up the name of a detected box.
labels = [
    'aeroplane', 'bicycle', 'bird', 'boat', 'bottle',
    'bus', 'car', 'cat', 'chair', 'cow',
    'diningtable', 'dog', 'horse', 'motorbike', 'person',
    'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor',
]

# --- Video source ------------------------------------------------------------
# Device index 0 is the built-in / default camera.
camera_index = 0
video_capture = cv2.VideoCapture(camera_index)

# Real-time detection loop: grab a frame, run the YOLO model on it, draw the
# decoded boxes on the frame and show it, until the stream ends or the user
# presses 'q'.

# Loop invariants, computed once rather than on every frame.
# NOTE(review): assumes elements 1-2 of the model input shape are the
# (height, width) expected by preprocess_image — confirm against the model.
input_shape = model_akida.input_shape[1:3]
num_anchors = len(anchors)
num_classes = len(labels)
box_color = (0, 255, 0)  # green in OpenCV's BGR channel order

try:
    while True:
        ret, frame = video_capture.read()
        if not ret:
            # No frame could be read (camera unavailable or stream ended).
            break

        # Keep the original image size for later bounding boxes rescaling
        raw_height, raw_width, _ = frame.shape

        # OpenCV captures frames in BGR order, whereas the reference Akida
        # detection pipeline feeds RGB-decoded images to the model. Convert a
        # copy for inference; the BGR frame is kept for drawing and display.
        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        # Pre-process the image and add the leading batch dimension
        image = preprocess_image(rgb_frame, input_shape)
        input_image = image[np.newaxis, :].astype(np.uint8)

        # Run inference; [0] selects the output for our single-image batch
        pots = model_akida.predict(input_image)[0]

        # Reshape the potentials to prepare for decoding: per grid cell and
        # anchor there are 4 box values, 1 objectness value and one score per
        # class.
        h, w, c = pots.shape
        pots = pots.reshape((h, w, num_anchors, 4 + 1 + num_classes))

        # Decode potentials into bounding boxes
        raw_boxes = decode_output(pots, anchors, num_classes)

        # Rescale boxes to the original image size. Each row is
        # [x1, y1, x2, y2, label index, score]; the array is empty when
        # nothing was detected, in which case the drawing loop is skipped.
        pred_boxes = np.array([[
            box.x1 * raw_width, box.y1 * raw_height, box.x2 * raw_width,
            box.y2 * raw_height,
            box.get_label(),
            box.get_score()
        ] for box in raw_boxes])

        # Draw bounding boxes and their "<label> - <score>" caption
        for x1, y1, x2, y2, label_id, score in pred_boxes:
            # Convert to pixel coordinates once per box.
            left, top = int(x1), int(y1)
            right, bottom = int(x2), int(y2)
            cv2.rectangle(frame, (left, top), (right, bottom), box_color, 2)
            label = f"{labels[int(label_id)]} - {score:.2f}"
            cv2.putText(frame, label, (left, top - 5),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, box_color, 2)

        # Display the frame
        cv2.imshow('YOLO Video Detection', frame)

        # Poll the keyboard for 1 ms; quit on 'q'
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Always release the camera and close the window, even if preprocessing,
    # inference or drawing raised (or the user hit Ctrl-C), so the device is
    # not left locked.
    video_capture.release()
    cv2.destroyAllWindows()

