In [None]:
import cv2
from transformers import DetrImageProcessor, DetrForObjectDetection
import torch
from PIL import Image
import numpy as np

# Select the compute device: use the GPU when CUDA is usable, else the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

# Fetch the image processor and the detection model from the same pretrained
# checkpoint, then place the model on the selected device.
checkpoint = "facebook/detr-resnet-50"
processor = DetrImageProcessor.from_pretrained(checkpoint)
model = DetrForObjectDetection.from_pretrained(checkpoint)
model = model.to(device)

# On CUDA, cast the model weights to FP16 (half precision).
if device == "cuda":
    model = model.half()

# Webcam person-detection loop: read frames from camera index 0, run the
# detection model on each frame, draw a box and score for every detection
# whose label is the person class, and display the result until 'q' is
# pressed. The camera and the preview window are always released on exit,
# even when an error or Ctrl-C interrupts the loop.
FRAME_SIZE = (640, 480)   # (width, height) every captured frame is resized to
SCORE_THRESHOLD = 0.7     # minimum confidence kept by post-processing
PERSON_LABEL_ID = 1       # treated as "person"; NOTE(review): confirm against model.config.id2label
BOX_COLOR = (0, 255, 0)   # green, in OpenCV's BGR channel order
LABEL_MARGIN = 10         # pixels between the top of a box and its caption
MIN_LABEL_Y = 15          # keeps the caption inside the frame for boxes at the top edge

# OpenCV to capture video from webcam
cap = cv2.VideoCapture(0)

if not cap.isOpened():
    print("Error: Could not open camera.")
    cap.release()
    # Signal failure through the exit status instead of relying on the
    # interactive-only exit() helper.
    raise SystemExit(1)

print("Press 'q' to quit the application.")

# Post-processing rescales boxes to (height, width). Every frame is resized to
# FRAME_SIZE, so this tensor is loop-invariant and is built only once.
target_sizes = torch.tensor([FRAME_SIZE[::-1]], device=device)

try:
    while True:
        ret, frame = cap.read()
        if not ret:
            print("Error: Failed to capture frame.")
            break

        frame = cv2.resize(frame, FRAME_SIZE)

        # Convert the BGR frame to an RGB PIL image for the processor
        pil_image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

        # Preprocess image and move the tensors to the model's device
        inputs = processor(images=pil_image, return_tensors="pt").to(device)

        # The model runs in half precision on CUDA, so its input must match
        if device == "cuda":
            inputs["pixel_values"] = inputs["pixel_values"].half()

        # Perform object detection without tracking gradients
        with torch.no_grad():
            outputs = model(**inputs)

        # Convert raw outputs to scored boxes in frame pixel coordinates, then
        # move the results back to the CPU for drawing
        results = processor.post_process_object_detection(
            outputs, target_sizes=target_sizes, threshold=SCORE_THRESHOLD
        )[0]
        results = {k: v.cpu() for k, v in results.items()}

        # Draw bounding boxes for detected persons
        detections = zip(results["scores"], results["labels"], results["boxes"])
        for score, label, box in detections:
            if label.item() != PERSON_LABEL_ID:
                continue
            x1, y1, x2, y2 = (int(v) for v in box.tolist())
            cv2.rectangle(frame, (x1, y1), (x2, y2), BOX_COLOR, 2)
            cv2.putText(frame, f"Person: {round(score.item(), 3)}",
                        (x1, max(y1 - LABEL_MARGIN, MIN_LABEL_Y)),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, BOX_COLOR, 2)

        # Show frame
        cv2.imshow('Person Detection', frame)

        # Exit if 'q' is pressed
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Release resources no matter how the loop ended
    cap.release()
    cv2.destroyAllWindows()


2.5.1+cu121
12.1
True
1
NVIDIA GeForce RTX 2050
