In [6]:
import requests
from PIL import Image
import torch

from transformers import OwlViTProcessor, OwlViTForObjectDetection

processor = OwlViTProcessor.from_pretrained("google/owlvit-base-patch16", device="cuda")
model = OwlViTForObjectDetection.from_pretrained("google/owlvit-base-patch16")

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
texts = [["a photo of a cat", "a photo of a dog"]]
inputs = processor(text=texts, images=image, return_tensors="pt")
outputs = model(**inputs)

# Target image sizes (height, width) to rescale box predictions [batch_size, 2]
target_sizes = torch.Tensor([image.size[::-1]])
# Convert outputs (bounding boxes and class logits) to COCO API
results = processor.post_process_object_detection(outputs=outputs, threshold=0.1, target_sizes=target_sizes)

i = 0  # Retrieve predictions for the first image for the corresponding text queries
text = texts[i]
boxes, scores, labels = results[i]["boxes"], results[i]["scores"], results[i]["labels"]

# Print detected objects and rescaled box coordinates
for box, score, label in zip(boxes, scores, labels):
    box = [round(i, 2) for i in box.tolist()]
    print(f"Detected {text[label]} with confidence {round(score.item(), 3)} at location {box}")


Detected a photo of a cat with confidence 0.276 at location [10.94, 50.4, 315.8, 471.39]
Detected a photo of a cat with confidence 0.333 at location [334.84, 25.33, 636.16, 374.71]


In [30]:
processor = OwlViTProcessor.from_pretrained("google/owlvit-base-patch16", device="cuda")
model = OwlViTForObjectDetection.from_pretrained("google/owlvit-base-patch16").to("cuda")

In [33]:
import cv2
model.to("cuda")
image = cv2.imread("./gray_crab/1713595937.5568116.jpg")
                   
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

image = Image.fromarray(image)

texts = [["a photo of a crab"], "a photo of a box"]



inputs = processor(text=texts, images=image, return_tensors="pt").to("cuda")

outputs = model(**inputs)

# Target image sizes (height, width) to rescale box predictions [batch_size, 2]

target_sizes = torch.Tensor([image.size[::-1]]).to("cuda")

# Convert outputs (bounding boxes and class logits) to COCO API

results = processor.post_process_object_detection(outputs=outputs, threshold=0.1, target_sizes=target_sizes)

i = 0  # Retrieve predictions for the first image for the corresponding text queries

text = texts[i]

boxes, scores, labels = results[i]["boxes"], results[i]["scores"], results[i]["labels"]

# Print detected objects and rescaled box coordinates

for box, score, label in zip(boxes, scores, labels):
  
      box = [round(i, 2) for i in box.tolist()]
  
      print(f"Detected {text[label]} with confidence {round(score.item(), 3)} at location {box}")

      

In [34]:
boxes, scores, labels

(tensor([], device='cuda:0', size=(0, 4), grad_fn=<IndexBackward0>),
 tensor([], device='cuda:0', grad_fn=<IndexBackward0>),
 tensor([], device='cuda:0', dtype=torch.int64))

In [18]:
# draw boxes on the image
from PIL import ImageDraw
draw = ImageDraw.Draw(image)
for box, score, label in zip(boxes, scores, labels):
    box = [round(i, 2) for i in box.tolist()]
    draw.rectangle(box, outline="red", width=3)

image.show()

In [27]:
# convert image to numpy array
import matplotlib.pyplot as plt
import numpy as np
image = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)

# draw boxes on the image
for box in boxes:
    box = [round(i, 2) for i in box.tolist()]
    image = cv2.rectangle(image, (box[0], box[1]), (box[2], box[3]), (255, 0, 0), 2)

plt.imshow(image)

KeyboardInterrupt: 