In [1]:
import json
import requests
import torch
from transformers import AutoProcessor, AutoModelForImageTextToText, AutoModelForZeroShotObjectDetection
from tqdm import tqdm
from PIL import Image

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Pick the compute backend in order of preference: CUDA GPU, then Apple MPS,
# falling back to the CPU when neither accelerator is available.
device = (
    "cuda" if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available()
    else "cpu"
)

In [3]:
# Hugging Face Hub checkpoint identifiers for the two models used below.
BLIP_CHECKPOINT = "Salesforce/blip-image-captioning-base"
GROUNDING_DINO_CHECKPOINT = "IDEA-Research/grounding-dino-tiny"

# Captioning model: turns the reference ("input") image into a text caption.
input_processor = AutoProcessor.from_pretrained(BLIP_CHECKPOINT)
input_model = AutoModelForImageTextToText.from_pretrained(BLIP_CHECKPOINT)
input_model = input_model.to(device)

# Zero-shot detector: localises whatever the text prompt describes.
dino_processor = AutoProcessor.from_pretrained(GROUNDING_DINO_CHECKPOINT)
dino_model = AutoModelForZeroShotObjectDetection.from_pretrained(GROUNDING_DINO_CHECKPOINT)
dino_model = dino_model.to(device)

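## Try the models on a single example

Caption a local reference image with BLIP, then use that caption as the Grounding DINO text prompt on one COCO image.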

In [4]:
# Local reference image that is captioned; the caption is later used as the detection prompt.
input_path = "../input-data/cat.jpg"
# Remote COCO val2017 image on which the detector is run with that caption.
image_url = "http://images.cocodataset.org/val2017/000000039769.jpg"

In [5]:
# --- Step 1: caption the local reference image with BLIP -------------------
# `with` closes the file handle; convert("RGB") forces the pixels to load
# before the file is closed and normalises grayscale/palette/RGBA inputs.
with Image.open(input_path) as reference_file:
    image = reference_file.convert("RGB")

inputs = input_processor(images=image, return_tensors="pt").to(device)
output = input_model.generate(**inputs)

generated_caption = input_processor.decode(output[0], skip_special_tokens=True)
print("Caption:", generated_caption)

# --- Step 2: detect the captioned object in the remote target image --------
# A timeout prevents the cell from hanging forever on a stalled connection and
# raise_for_status() surfaces HTTP errors instead of handing an error page to PIL.
with requests.get(image_url, stream=True, timeout=30) as response:
    response.raise_for_status()
    with Image.open(response.raw) as target_file:
        image = target_file.convert("RGB")

# One image -> one inner list of prompts: the caption plus a "null" label.
text_labels = [[generated_caption, "null"]]

inputs = dino_processor(images=image, text=text_labels, return_tensors="pt").to(device)
with torch.no_grad():
    outputs = dino_model(**inputs)

# NOTE(review): the recorded run emitted a warning pointing at this call --
# presumably a deprecation of one of its keyword arguments; confirm against the
# installed transformers version before renaming anything.
results = dino_processor.post_process_grounded_object_detection(
    outputs,
    inputs.input_ids,
    box_threshold=0.4,
    text_threshold=0.3,
    # PIL reports (width, height); the post-processor is given (height, width).
    target_sizes=[image.size[::-1]],
)

# Retrieve the first (and only) image result
result = results[0]
for box, score, labels in zip(result["boxes"], result["scores"], result["labels"]):
    box = [round(coord, 2) for coord in box.tolist()]
    print(f"Detected {labels} with confidence {round(score.item(), 3)} at location {box}")

Caption: a small orange kitten sitting on a white background
Detected a small orange kitten with confidence 0.499 at location [345.36, 23.42, 637.64, 374.18]


  results = dino_processor.post_process_grounded_object_detection(


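## COCO Evaluation

Pair every COCO val2017 annotation with its image URL and with the local reference image for its category, then run the caption-then-detect pipeline over those pairs and collect the detections.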

In [6]:
# Load the COCO val2017 instance annotations; `with` closes the file once parsed.
with open("../data/instances_val2017.json", encoding="utf-8") as coco_dataset:
    coco_json = json.load(coco_dataset)
coco_images = coco_json["images"]
coco_annotations = coco_json["annotations"]
coco_categories = coco_json["categories"]

# Index images and categories by id once, so each annotation is resolved with
# a constant-time lookup instead of a linear scan over every image/category.
# setdefault keeps the first record for an id, matching a first-match scan.
image_by_id = {}
for image_info in coco_images:
    image_by_id.setdefault(image_info["id"], image_info)
category_by_id = {}
for category_info in coco_categories:
    category_by_id.setdefault(category_info["id"], category_info)

# Parallel lists, one entry per annotation:
#   image_url[i] - URL of the COCO image annotation i belongs to
#   input_url[i] - local reference image named after annotation i's category
input_url = []
image_url = []
for anno in coco_annotations:
    # Direct indexing raises KeyError on an unknown id rather than silently
    # skipping the append, which would leave the two lists misaligned.
    image_url.append(image_by_id[anno["image_id"]]["coco_url"])
    input_url.append("../input-data/" + category_by_id[anno["category_id"]]["name"] + ".jpg")

In [7]:
# Sanity check: both lists should hold one entry per annotation.
# Printed one count per line, reference-image paths first, then image URLs.
print(len(input_url), len(image_url), sep="\n")

36781
36781


In [8]:
# Detection thresholds and network timeout applied to every evaluated pair.
BOX_THRESHOLD = 0.4
TEXT_THRESHOLD = 0.3
REQUEST_TIMEOUT_S = 30


def caption_reference_image(path):
    """Caption the image stored at `path` with the BLIP captioning model.

    Args:
        path: Filesystem path of the reference image.

    Returns:
        The decoded caption string with special tokens stripped.
    """
    # convert("RGB") loads the pixels before `with` closes the file handle.
    with Image.open(path) as reference_file:
        reference_image = reference_file.convert("RGB")
    blip_inputs = input_processor(images=reference_image, return_tensors="pt").to(device)
    caption_ids = input_model.generate(**blip_inputs)
    return input_processor.decode(caption_ids[0], skip_special_tokens=True)


def fetch_image(session, url):
    """Download `url` through `session` and return it as a loaded RGB image.

    Args:
        session: A `requests.Session` used to issue the GET request.
        url: Address of the image to download.

    Returns:
        A PIL image in RGB mode, fully read from the response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within the timeout.
    """
    with session.get(url, stream=True, timeout=REQUEST_TIMEOUT_S) as response:
        response.raise_for_status()
        with Image.open(response.raw) as target_file:
            return target_file.convert("RGB")


def detect_with_caption(target_image, caption):
    """Run Grounding DINO on `target_image`, prompted with `caption`.

    Args:
        target_image: PIL image to search.
        caption: Text prompt describing the object to find.

    Returns:
        The post-processed result for this single image; the evaluation loop
        reads its "boxes" and "scores" entries.
    """
    # One image -> one inner list of prompts: the caption plus a "null" label.
    dino_inputs = dino_processor(
        images=target_image, text=[[caption, "null"]], return_tensors="pt"
    ).to(device)
    with torch.no_grad():
        dino_outputs = dino_model(**dino_inputs)
    per_image_results = dino_processor.post_process_grounded_object_detection(
        dino_outputs,
        dino_inputs.input_ids,
        box_threshold=BOX_THRESHOLD,
        text_threshold=TEXT_THRESHOLD,
        # PIL reports (width, height); the post-processor is given (height, width).
        target_sizes=[target_image.size[::-1]],
    )
    return per_image_results[0]


caption_by_path = {}     # reference image path -> caption (each path is captioned once)
processed_pairs = set()  # (image_id, category_id) pairs already evaluated
detections = []          # one record per detected box

with requests.Session() as session:
    for reference_path, target_url, anno in tqdm(
        zip(input_url, image_url, coco_annotations), total=len(input_url)
    ):
        # Several annotations can share the same image and category. The model
        # inputs are then identical, so re-running would only add duplicates.
        pair = (anno["image_id"], anno["category_id"])
        if pair in processed_pairs:
            continue
        processed_pairs.add(pair)

        # Caption the reference image for this category, not a stale tensor
        # left over from an earlier cell; reuse the caption for later pairs.
        if reference_path not in caption_by_path:
            caption_by_path[reference_path] = caption_reference_image(reference_path)
        generated_caption = caption_by_path[reference_path]

        # Uses generated caption to detect
        result = detect_with_caption(fetch_image(session, target_url), generated_caption)

        # Record every detection. A pair with no detections contributes
        # nothing, rather than reusing a box/score from a previous iteration.
        for box, score in zip(result["boxes"], result["scores"]):
            x_min, y_min, x_max, y_max = box.tolist()
            detections.append({
                "image_id": anno["image_id"],
                "category_id": anno["category_id"],
                # The detector yields corner coordinates; COCO result files
                # store boxes as [x, y, width, height].
                "bbox": [
                    round(x_min, 2),
                    round(y_min, 2),
                    round(x_max - x_min, 2),
                    round(y_max - y_min, 2),
                ],
                "score": round(score.item(), 3),
            })

# Serialise once at the end; `full_results` is the JSON text of all detections.
full_results = json.dumps(detections)

  0%|          | 0/36781 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
  results = dino_processor.post_process_grounded_object_detection(
  0%|          | 9/36781 [00:34<38:43:58,  3.79s/it]


KeyboardInterrupt: 