In [2]:
import torch
import clip
import cv2
import numpy as np
from PIL import Image

In [3]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the ViT-B/32 CLIP checkpoint onto that device. `clip.load` returns the
# model together with the image transform that must be applied to its inputs.
model, preprocess = clip.load("ViT-B/32", device=device)

In [4]:
# Candidate class names. Every image region is scored against all of these,
# and the index of a label here is the index of its score in the model output.
object_labels = [
    "cat",
    "dog",
    "person",
    "car",
    "bottle",
    "chair",
    "laptop",
    "phone",
    "tree",
    "book",
    "table",
    "toothbrush",
    "toothpaste",
]

In [5]:
# Convert the label strings to CLIP token ids once, up front, then place the
# resulting tensor on the same device as the model.
label_tokens = clip.tokenize(object_labels)
text_inputs = label_tokens.to(device)


In [6]:
# Open the camera at device index 0 as the live video source.
webcam_index = 0
cap = cv2.VideoCapture(webcam_index)

In [7]:
# Fail fast when the camera could not be opened.
# Raise instead of calling exit(): inside a Jupyter/IPython kernel, exit()
# shuts the kernel down, which throws away the loaded CLIP model and every
# other variable. An exception stops "Run All" at this cell and leaves the
# kernel alive so the problem can be fixed and the cell re-run.
if not cap.isOpened():
    raise RuntimeError("Error: Could not open webcam.")

In [None]:
# Live zero-shot classification loop.
#
# For every webcam frame: cut the middle band of the frame into `num_regions`
# side-by-side strips, score each strip against `object_labels` with CLIP, and
# draw a box plus caption over every strip whose best label reaches
# `confidence_threshold`. Press 'q' in the preview window to stop.

num_regions = 3             # Number of side-by-side strips analysed per frame
confidence_threshold = 0.6  # Minimum softmax probability for a label to be drawn
box_color = (0, 0, 255)     # Red (OpenCV frames are BGR)
text_color = (255, 255, 255)

# The label prompts never change, so encode them once here rather than once
# per region per frame. CLIP scores are cosine similarities multiplied by the
# model's learned logit scale, so the features are L2-normalised first;
# without both steps the softmax output is not a meaningful confidence.
with torch.no_grad():
    text_features = model.encode_text(text_inputs)
    text_features = text_features / text_features.norm(dim=-1, keepdim=True)
    logit_scale = model.logit_scale.exp()

try:
    while True:
        ret, frame = cap.read()
        if not ret:
            print("Error: Could not read frame.")
            break

        height, width, _ = frame.shape
        region_width = width // num_regions

        # Every strip spans the same vertical band: the middle 60% of the frame.
        y1, y2 = int(height * 0.2), int(height * 0.8)

        detected_objects = []  # (label, confidence, x1, y1, x2, y2) per hit

        for i in range(num_regions):
            x1 = i * region_width
            # The last strip absorbs the remainder columns so the right edge
            # of the frame is not ignored when width % num_regions != 0.
            x2 = width if i == num_regions - 1 else (i + 1) * region_width

            # OpenCV delivers BGR; CLIP's preprocess expects an RGB PIL image.
            sub_frame = frame[y1:y2, x1:x2]
            image = Image.fromarray(cv2.cvtColor(sub_frame, cv2.COLOR_BGR2RGB))
            image_input = preprocess(image).unsqueeze(0).to(device)

            with torch.no_grad():
                image_features = model.encode_image(image_input)
                image_features = image_features / image_features.norm(dim=-1, keepdim=True)
                logits = logit_scale * image_features @ text_features.T
                # Softmax in float32 so half-precision GPU weights do not
                # cost precision in the reported probabilities.
                similarity = logits.float().softmax(dim=-1)

            # Single image in the batch, so row 0 holds the label probabilities.
            best_match_idx = similarity[0].argmax().item()
            best_match_label = object_labels[best_match_idx]
            confidence = similarity[0, best_match_idx].item()

            if confidence > confidence_threshold:
                detected_objects.append((best_match_label, confidence, x1, y1, x2, y2))

        # Overlay a box and a caption for every strip that produced a hit.
        for label, conf, x1, y1, x2, y2 in detected_objects:
            cv2.rectangle(frame, (x1, y1), (x2, y2), box_color, 3)

            # Caption sits just below the box on a filled background.
            label_text = f"{label} ({conf:.2f})"
            label_x, label_y = x1, y2 + 30
            cv2.rectangle(frame, (label_x, label_y - 25), (label_x + 250, label_y), box_color, -1)
            cv2.putText(frame, label_text, (label_x + 5, label_y - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.7, text_color, 2)

        cv2.imshow("CLIP Multi-Object Detection", frame)

        # Press 'q' to exit
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Runs on normal exit, on errors, and when the kernel is interrupted, so
    # the webcam is always handed back and the preview window is closed.
    # Re-run the cell that creates `cap` before starting this loop again.
    cap.release()
    cv2.destroyAllWindows()

  attn_output = scaled_dot_product_attention(q, k, v, attn_mask, dropout_p, is_causal)


: 

In [None]:
# Tear down the capture session: close every OpenCV preview window and
# release the video capture handle.
cv2.destroyAllWindows()
cap.release()