In [12]:
import requests
import torch
from transformers import AutoProcessor, AutoModelForImageTextToText, AutoModelForZeroShotObjectDetection
from PIL import Image

In [13]:
# Each checkpoint id is named once so a processor and its model cannot drift apart.
BLIP_CHECKPOINT = "Salesforce/blip-image-captioning-base"
DINO_CHECKPOINT = "IDEA-Research/grounding-dino-tiny"

# Use the GPU when one is available and fall back to CPU otherwise, so this
# cell no longer fails on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"

# BLIP captioning model (image -> text). It is not moved to `device` here.
input_processor = AutoProcessor.from_pretrained(BLIP_CHECKPOINT)
model = AutoModelForImageTextToText.from_pretrained(BLIP_CHECKPOINT)

# Grounding DINO zero-shot detector (image + text labels -> boxes).
dino_processor = AutoProcessor.from_pretrained(DINO_CHECKPOINT)
dino_model = AutoModelForZeroShotObjectDetection.from_pretrained(DINO_CHECKPOINT).to(device)

In [17]:
# Local image file that is opened and passed to the captioning model.
input_path = "../input-data/cat.jpg"
# Remote image that is downloaded and passed to the object detector.
# NOTE(review): this is a different picture from `input_path` - confirm that
# captioning one image and detecting on another is intended.
image_url = "http://images.cocodataset.org/val2017/000000039769.jpg"

In [20]:
# --- Step 1: caption the local image with BLIP -------------------------------
# The context manager closes the file handle. convert("RGB") forces the pixel
# data to be read before the file is closed and yields a three-channel image.
with Image.open(input_path) as source_image:
    caption_image = source_image.convert("RGB")

caption_inputs = input_processor(images=caption_image, return_tensors="pt")
output = model.generate(**caption_inputs)

generated_caption = input_processor.decode(output[0], skip_special_tokens=True)
print("Caption:", generated_caption)

# --- Step 2: locate the captioned object with Grounding DINO -----------------
# NOTE(review): the caption above was produced from `input_path`, but detection
# runs on the image at `image_url`. Confirm this is intended; if the goal is to
# ground the caption in the captioned picture, pass `caption_image` instead.
# The timeout keeps the kernel from hanging on a stalled connection (30 s is an
# arbitrary but generous bound), and raise_for_status() surfaces HTTP errors
# directly instead of letting PIL fail on an error page.
with requests.get(image_url, stream=True, timeout=30) as response:
    response.raise_for_status()
    image = Image.open(response.raw).convert("RGB")
text_labels = [[generated_caption, "null"]]

# Send the inputs to whichever device the detector lives on rather than
# assuming CUDA.
inputs = dino_processor(images=image, text=text_labels, return_tensors="pt").to(dino_model.device)
with torch.no_grad():
    outputs = dino_model(**inputs)
results = dino_processor.post_process_grounded_object_detection(
    outputs,
    inputs.input_ids,
    box_threshold=0.4,
    text_threshold=0.3,
    # PIL reports (width, height); the reversed tuple is what is passed here.
    target_sizes=[image.size[::-1]]
)
# Retrieve the first image result
result = results[0]
# Newer transformers releases expose the label strings under "text_labels" and
# warn when "labels" is read; use the new key when present, else fall back.
label_key = "text_labels" if "text_labels" in result else "labels"
for box, score, label in zip(result["boxes"], result["scores"], result[label_key]):
    box = [round(x, 2) for x in box.tolist()]
    print(f"Detected {label} with confidence {round(score.item(), 3)} at location {box}")



Caption: a small orange kitten sitting on a white background
Detected a small orange kitten with confidence 0.453 at location [345.44, 23.65, 637.8, 373.5]
