In [19]:
import requests
from PIL import Image
import torch
from transformers import OwlViTProcessor, OwlViTForObjectDetection

# Load processor and model
processor = OwlViTProcessor.from_pretrained("google/owlvit-base-patch32")
model = OwlViTForObjectDetection.from_pretrained("google/owlvit-base-patch32")

# Load image from URL
url = "https://as1.ftcdn.net/v2/jpg/05/02/59/18/1000_F_502591834_Td7o5AUVdIui2Q5YCwaSaPeKTFQMCu9B.jpg"
image = Image.open(requests.get(url, stream=True).raw)

# Define text queries
texts = [["Smartwatch","scratch on screen", "a crack", "broken defects"]]
# Process inputs
inputs = processor(text=texts, images=image, return_tensors="pt")
outputs = model(**inputs)

# Target image sizes (height, width) to rescale box predictions [batch_size, 2]
target_sizes = torch.Tensor([image.size[::-1]])

# Convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax)
results = processor.post_process_object_detection(outputs=outputs, target_sizes=target_sizes, threshold=0.1)

# Check for defects
i = 0  # Retrieve predictions for the first image for the corresponding text queries
text = texts[i]
boxes, scores, labels = results[i]["boxes"], results[i]["scores"], results[i]["labels"]

defect_found = False
for box, score, label in zip(boxes, scores, labels):
    if score >= 0.1:
        defect_found = True
        box = [round(i, 2) for i in box.tolist()]
        print(f"Detected {text[label]} with confidence {round(score.item(), 3)} at location {box}")

if defect_found:
    print("Defect(s) found in the image.")
else:
    print("No defect found in the image.")


Detected Smartwatch with confidence 0.164 at location [329.64, 220.11, 639.73, 439.86]
Defect(s) found in the image.
