In [1]:
import json
import torch
from transformers import DetrForObjectDetection, DetrImageProcessor
import cv2
import numpy as np
from PIL import Image
import ipywidgets as widgets
from IPython.display import display, clear_output


# Run inference on the GPU when torch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# The detector and its image pre-processor are loaded from the same checkpoint.
model = DetrForObjectDetection.from_pretrained('facebook/detr-resnet-50')
model = model.to(device)
processor = DetrImageProcessor.from_pretrained('facebook/detr-resnet-50')
# Inference only: switch the model to evaluation mode.
model.eval()


# Open the surveillance clip and read the timing metadata that every later
# seconds <-> frame-index conversion relies on.
video_path = 'Parkng_Lot_Surveillance_Video.mp4'
cap = cv2.VideoCapture(video_path)
if not cap.isOpened():
    # Fail here with a clear message instead of a ZeroDivisionError further
    # down (an unopened capture reports fps == 0).
    raise ValueError(f"Cannot open video file: {video_path}")

fps = cap.get(cv2.CAP_PROP_FPS)
if fps <= 0:
    raise ValueError(f"Video reports an invalid frame rate ({fps}): {video_path}")
total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
total_seconds = int(total_frames / fps)


# Read one frame at the first timestamp exposed by the slider; it is only used
# to obtain the image height for the coordinate flip of the parking boxes.
initial_time = 280
initial_frame_number = int(initial_time * fps)
cap.set(cv2.CAP_PROP_POS_FRAMES, initial_frame_number)
ret, frame = cap.read()
if not ret:
    # Report the timestamp that was actually requested.
    raise ValueError(f"Cannot read the initial frame at {initial_time} seconds.")

image_height = frame.shape[0]


# Parking-slot rectangles: a list of dicts carrying x1/y1/x2/y2 plus the
# row/column of the slot (those are the keys read later in this cell).
with open('flipped_car_coordinates_with_parking_positions.json', 'r') as file:
    parking_boxes = json.load(file)

# Mirror both y values about the image height, in place.
# NOTE(review): presumably the file stores y with the origin at the bottom of
# the image - confirm against whatever produced the JSON.
for box in parking_boxes:
    box.update(y1=image_height - box["y1"], y2=image_height - box["y2"])


def calculate_iou(boxA, boxB):
    """Return the intersection-over-union of two axis-aligned boxes.

    Args:
        boxA: Indexable of four numbers ``(x1, y1, x2, y2)`` giving two
            opposite corners of the first box.
        boxB: Same layout as ``boxA`` for the second box.

    Returns:
        The IoU as a float in ``[0, 1]``. ``0.0`` is returned when the boxes
        do not overlap or when the union has no area (both boxes degenerate),
        instead of raising ``ZeroDivisionError``.
    """
    # Order the corners so that (left, top) <= (right, bottom). A box whose
    # y values were mirrored (y -> height - y) arrives with y1 > y2 and would
    # otherwise produce a negative area.
    a_left, a_right = min(boxA[0], boxA[2]), max(boxA[0], boxA[2])
    a_top, a_bottom = min(boxA[1], boxA[3]), max(boxA[1], boxA[3])
    b_left, b_right = min(boxB[0], boxB[2]), max(boxB[0], boxB[2])
    b_top, b_bottom = min(boxB[1], boxB[3]), max(boxB[1], boxB[3])

    # Overlap rectangle; clamped to 0 when the boxes are disjoint on an axis.
    inter_width = max(0, min(a_right, b_right) - max(a_left, b_left))
    inter_height = max(0, min(a_bottom, b_bottom) - max(a_top, b_top))
    inter_area = inter_width * inter_height

    area_a = (a_right - a_left) * (a_bottom - a_top)
    area_b = (b_right - b_left) * (b_bottom - b_top)
    union_area = area_a + area_b - inter_area

    if union_area <= 0:
        # Both boxes are points/lines: nothing to overlap with.
        return 0.0

    return inter_area / float(union_area)

# Widget that shows the annotated frame as an encoded JPEG.
image_widget = widgets.Image(format='jpeg')

# Text area that receives the printed per-slot parking status.
parking_status_output = widgets.Output()

def update_frame(change):
    """Redraw the frame at the slider's time and report parking-slot occupancy.

    Seeks ``cap`` to the frame for ``time_slider.value`` seconds, runs the DETR
    model on that frame, draws the configured parking boxes (blue) and the
    detected cars (green) onto it, prints the per-slot ``parked`` flags into
    ``parking_status_output`` and pushes the annotated JPEG into
    ``image_widget``.

    The status grid is fixed at rows 1-2 and columns 1-13; a parking box whose
    row/column falls outside that grid is drawn but never reported.

    Args:
        change: Change notification supplied by ``time_slider.observe``. It is
            not inspected (the slider is read directly), so ``None`` is fine
            for the initial manual call.
    """
    time_in_seconds = time_slider.value
    frame_number = int(fps * time_in_seconds)

    cap.set(cv2.CAP_PROP_POS_FRAMES, frame_number)
    ret, frame = cap.read()

    if not ret or frame is None:
        # Print inside the Output widget so the message is part of the
        # displayed UI even when this runs as a slider callback.
        with parking_status_output:
            clear_output(wait=True)
            print(f"Error: Cannot read frame at {time_in_seconds} seconds.")
        return

    # OpenCV frames are BGR; the processor is fed an RGB PIL image.
    image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    inputs = processor(images=image, return_tensors="pt").to(device)

    with torch.no_grad():
        outputs = model(**inputs)

    logits = outputs.logits[0].cpu()
    boxes = outputs.pred_boxes[0].cpu()

    # Keep only predictions whose best class probability exceeds 0.5.
    probas = logits.softmax(-1)
    keep = probas.max(-1).values > 0.5

    # Draw every configured parking slot with its row/column label.
    for parking_box in parking_boxes:
        x1 = int(parking_box["x1"])
        y1 = int(parking_box["y1"])
        x2 = int(parking_box["x2"])
        y2 = int(parking_box["y2"])
        cv2.rectangle(frame, (x1, y1), (x2, y2), color=(255, 0, 0), thickness=2)
        cv2.putText(frame, f'Row: {parking_box["row"]}, Col: {parking_box["column"]}', (x1, y1 - 10),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 1)

    frame_height, frame_width = frame.shape[0], frame.shape[1]

    detected_boxes = []
    for box, cls in zip(boxes[keep], probas[keep]):
        category_id = cls.argmax().item()
        if category_id == 3:  # 3 is car
            # Predicted boxes are (cx, cy, w, h) as fractions of the frame
            # size; convert to pixel corner coordinates.
            x_center, y_center, width, height = box.numpy()
            x1 = int((x_center - width / 2) * frame_width)
            y1 = int((y_center - height / 2) * frame_height)
            x2 = int((x_center + width / 2) * frame_width)
            y2 = int((y_center + height / 2) * frame_height)
            detected_boxes.append((x1, y1, x2, y2))

            cv2.rectangle(frame, (x1, y1), (x2, y2), color=(0, 255, 0), thickness=2)
            cv2.putText(frame, 'Car', (x1, y1 - 10),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1)

    parking_status = [
        {"row": row, "column": col, "parked": False}
        for row in range(1, 3)
        for col in range(1, 14)
    ]
    # Index the status entries by slot so a match is a single lookup instead
    # of a scan over the whole list.
    status_by_slot = {(entry["row"], entry["column"]): entry for entry in parking_status}

    # A slot counts as occupied when any detected car overlaps it with IoU > 0.5.
    for detected_box in detected_boxes:
        for parking_box in parking_boxes:
            parking_box_coords = [parking_box["x1"], parking_box["y1"], parking_box["x2"], parking_box["y2"]]
            if calculate_iou(detected_box, parking_box_coords) > 0.5:
                entry = status_by_slot.get((parking_box["row"], parking_box["column"]))
                if entry is not None:
                    entry["parked"] = True

    with parking_status_output:
        clear_output(wait=True)
        print(parking_status)

    encoded_ok, buffer = cv2.imencode('.jpg', frame)
    if encoded_ok:
        image_widget.value = buffer.tobytes()

# Whole-second slider from the initial timestamp to the end of the clip.
# continuous_update=False is passed so the callback is not driven by every
# intermediate value while dragging (each callback runs a full model inference).
time_slider = widgets.IntSlider(
    value=initial_time,
    min=initial_time,
    max=total_seconds,
    step=1,
    description="Time (s)",
    continuous_update=False,
)
time_slider.observe(update_frame, names='value')

# Show the UI first, then render the initial frame into it.
display(widgets.VBox([time_slider, image_widget, parking_status_output]))
update_frame(None)


Using device: cuda


Some weights of the model checkpoint at facebook/detr-resnet-50 were not used when initializing DetrForObjectDetection: ['model.backbone.conv_encoder.model.layer1.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing DetrForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DetrForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


VBox(children=(IntSlider(value=280, continuous_update=False, description='Time (s)', max=316, min=280), Image(…