In [11]:
import json
import requests
import torch
from transformers import AutoProcessor, AutoModelForImageTextToText, AutoModelForZeroShotObjectDetection
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from PIL import Image
from qwen_vl_utils import process_vision_info

In [5]:
# Pick the compute backend in priority order: CUDA first, then Apple MPS,
# and fall back to the CPU when neither accelerator is available.
device = (
    "cuda" if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available()
    else "cpu"
)

In [6]:
# Checkpoint identifiers, named once so the processor and the model of each
# pair can never drift apart.
# Captioner: the Qwen2-VL 2B instruction-tuned vision-language model.
QWEN_CHECKPOINT = "Qwen/Qwen2-VL-2B-Instruct"
# Detector: the "tiny" Grounding DINO zero-shot object detector.
DINO_CHECKPOINT = "IDEA-Research/grounding-dino-tiny"

# Load the captioning model and its processor, then move the model to `device`.
input_processor = AutoProcessor.from_pretrained(QWEN_CHECKPOINT)
input_model = AutoModelForImageTextToText.from_pretrained(QWEN_CHECKPOINT).to(device)

# Load the detection model and its processor, then move the model to `device`.
dino_processor = AutoProcessor.from_pretrained(DINO_CHECKPOINT)
dino_model = AutoModelForZeroShotObjectDetection.from_pretrained(DINO_CHECKPOINT).to(device)

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Loading checkpoint shards: 100%|██████████| 2/2 [00:10<00:00,  5.28s/it]


## Try Model

In [7]:
# Local exemplar image that the vision-language model is asked to caption.
input_path = "../input-data/cat.jpg"
# Remote COCO val2017 image in which the captioned object is then detected.
image_url = "http://images.cocodataset.org/val2017/000000039769.jpg"

In [9]:
# --- Step 1: caption the local exemplar image with the vision-language model ---
query_image = Image.open(input_path)

prompt = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": query_image},
            {"type": "text", "text": "What is in the image? Only give your answer."},
        ],
    }
]

text = input_processor.apply_chat_template(prompt, tokenize=False, add_generation_prompt=True)
image_inputs, video_inputs = process_vision_info(prompt)

caption_inputs = input_processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
).to(device)

generated_ids = input_model.generate(**caption_inputs, max_new_tokens=64)

# The generated sequence starts with the prompt tokens. Slice them off and
# decode only the newly generated answer instead of splitting the decoded
# transcript on newlines (which drops all but the last line of a multi-line
# answer and yields an empty caption when the answer ends with a newline).
prompt_length = caption_inputs["input_ids"].shape[1]
generated_caption = input_processor.decode(
    generated_ids[0][prompt_length:], skip_special_tokens=True
).strip()
print("Caption:", generated_caption)

# --- Step 2: look for the captioned object in the remote target image ---
# Fail loudly on HTTP errors and never wait forever on a stalled connection.
with requests.get(image_url, stream=True, timeout=30) as response:
    response.raise_for_status()
    response.raw.decode_content = True  # transparently undo any Content-Encoding
    # convert() reads the whole image, so the response can be closed afterwards.
    target_image = Image.open(response.raw).convert("RGB")

# Grounding DINO text queries should be lowercase and end with a dot.
text_labels = [generated_caption.lower().rstrip(". ") + "."]

detector_inputs = dino_processor(images=target_image, text=text_labels, return_tensors="pt").to(device)
with torch.no_grad():
    outputs = dino_model(**detector_inputs)
results = dino_processor.post_process_grounded_object_detection(
    outputs,
    detector_inputs.input_ids,
    box_threshold=0.4,
    text_threshold=0.3,
    # PIL reports (width, height); the post-processor is given (height, width).
    target_sizes=[target_image.size[::-1]],
)
# Only one image was submitted, so the single result is at index 0.
result = results[0]
for box, score, labels in zip(result["boxes"], result["scores"], result["labels"]):
    box = [round(coordinate, 2) for coordinate in box.tolist()]
    print(f"Detected {labels} with confidence {round(score.item(), 3)} at location {box}")

Caption: cat
Detected cat with confidence 0.786 at location [346.0, 23.85, 639.41, 372.65]
Detected cat with confidence 0.829 at location [9.57, 54.0, 316.34, 474.77]


  results = dino_processor.post_process_grounded_object_detection(


## COCO Evaluation

In [12]:
def requests_retry_session(
    retries=100,
    backoff_factor=0.3,
    status_forcelist=(500, 502, 504),
    session=None,
):
    """Return a `requests` session whose HTTP(S) adapters retry automatically.

    Args:
        retries: Value used for the `total`, `read` and `connect` limits of
            the `Retry` policy.
        backoff_factor: Passed to `Retry` as its `backoff_factor`.
        status_forcelist: Passed to `Retry` as its `status_forcelist`.
        session: Existing session to configure; a new `requests.Session` is
            created when this is not provided.

    Returns:
        The session, with one retrying `HTTPAdapter` mounted for both the
        `http://` and `https://` prefixes.
    """
    if not session:
        session = requests.Session()

    retrying_adapter = HTTPAdapter(
        max_retries=Retry(
            total=retries,
            read=retries,
            connect=retries,
            backoff_factor=backoff_factor,
            status_forcelist=status_forcelist,
        )
    )
    # The same adapter instance serves both URL schemes.
    for prefix in ('http://', 'https://'):
        session.mount(prefix, retrying_adapter)
    return session

In [17]:
# Parse the COCO annotation file. The context manager closes the handle as
# soon as the JSON has been read (it was previously left open).
with open("../instances_val2017.json", encoding="utf-8") as coco_dataset:
    coco_json = json.load(coco_dataset)
coco_images = coco_json["images"]
coco_annotations = coco_json["annotations"]
coco_categories = coco_json["categories"]

# category id -> category name. `setdefault` keeps the first entry if an id
# were ever repeated, matching a first-match linear scan.
category_names = {}
for category in coco_categories:
    category_names.setdefault(category["id"], category["name"])

# image id -> category ids annotated on that image. A dict is used as an
# insertion-ordered set, so ids stay in order of first appearance and each
# appears once. Annotations whose category id is unknown are ignored.
category_ids_by_image = {}
for anno in coco_annotations:
    category_id = anno["category_id"]
    if category_id in category_names:
        category_ids_by_image.setdefault(anno["image_id"], {})[category_id] = None

# Parallel lists, one entry per COCO image, in file order:
#   inputs_url[i] - exemplar descriptors ({"input": path, "category_id": id})
#                   for every distinct category annotated on image i
#   image_url[i]  - the COCO image record itself (a dict, not a URL string)
inputs_url = []
image_url = []
for images in coco_images:
    image_inputs = [
        {
            "input": "../input-data/" + category_names[category_id] + ".jpg",
            "category_id": category_id,
        }
        for category_id in category_ids_by_image.get(images["id"], ())
    ]
    inputs_url.append(image_inputs)
    image_url.append(images)

In [18]:
# Both lists should report the same length: one entry per COCO image.
print(len(inputs_url), len(image_url), sep="\n")

5000
5000


In [19]:
# Tunables for the evaluation run.
CAPTION_PROMPT = "What is in the image? Only give your answer."
MAX_CAPTION_TOKENS = 64
BOX_THRESHOLD = 0.4
TEXT_THRESHOLD = 0.3
REQUEST_TIMEOUT = 30  # seconds to wait on the image server before giving up


def _caption_exemplar(image_path):
    """Return the vision-language model's answer for the image at `image_path`."""
    exemplar_image = Image.open(image_path)
    prompt = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": exemplar_image},
                {"type": "text", "text": CAPTION_PROMPT},
            ],
        }
    ]
    chat_text = input_processor.apply_chat_template(
        prompt, tokenize=False, add_generation_prompt=True
    )
    vision_images, vision_videos = process_vision_info(prompt)
    caption_inputs = input_processor(
        text=[chat_text],
        images=vision_images,
        videos=vision_videos,
        padding=True,
        return_tensors="pt",
    ).to(device)
    generated_ids = input_model.generate(**caption_inputs, max_new_tokens=MAX_CAPTION_TOKENS)
    # The generated sequence starts with the prompt tokens; decode only the
    # answer rather than splitting the decoded transcript on newlines.
    prompt_length = caption_inputs["input_ids"].shape[1]
    return input_processor.decode(
        generated_ids[0][prompt_length:], skip_special_tokens=True
    ).strip()


def _fetch_rgb_image(http_session, url):
    """Download `url` through `http_session` and return it as an RGB PIL image."""
    with http_session.get(url, stream=True, timeout=REQUEST_TIMEOUT) as response:
        response.raise_for_status()  # do not hand an HTTP error page to PIL
        response.raw.decode_content = True  # transparently undo any Content-Encoding
        # convert() reads the whole image, so the response can be closed
        # afterwards. It also turns non-RGB (e.g. grayscale) images into
        # 3-channel input for the detector.
        return Image.open(response.raw).convert("RGB")


def _detect(target_image, text_query):
    """Run Grounding DINO on `target_image` and return the post-processed results."""
    detector_inputs = dino_processor(
        images=target_image, text=[text_query], return_tensors="pt"
    ).to(device)
    with torch.no_grad():
        detector_outputs = dino_model(**detector_inputs)
    return dino_processor.post_process_grounded_object_detection(
        detector_outputs,
        detector_inputs.input_ids,
        box_threshold=BOX_THRESHOLD,
        text_threshold=TEXT_THRESHOLD,
        # PIL reports (width, height); the post-processor is given (height, width).
        target_sizes=[target_image.size[::-1]],
    )


# A caption depends only on the exemplar file, so compute it once per path
# instead of once per (COCO image, category) pair.
# NOTE(review): assumes generation is effectively deterministic - confirm
# against the model's generation config.
caption_cache = {}
detections = []

# One retrying session for the whole run, closed when the loop finishes.
with requests_retry_session() as http_session:
    for position, (image_info, exemplars) in enumerate(zip(image_url, inputs_url), start=1):
        if exemplars:
            # Download the COCO image once, not once per category.
            target_image = _fetch_rgb_image(http_session, image_info["coco_url"])

        for exemplar in exemplars:
            exemplar_path = exemplar["input"]
            if exemplar_path not in caption_cache:
                caption_cache[exemplar_path] = _caption_exemplar(exemplar_path)

            # Grounding DINO text queries should be lowercase and end with a dot.
            caption_core = caption_cache[exemplar_path].lower().rstrip(". ")
            if not caption_core:
                # Empty caption: there is nothing to search for.
                continue

            for result in _detect(target_image, caption_core + "."):
                for box, score in zip(result["boxes"], result["scores"]):
                    # The detector reports corner boxes (x_min, y_min, x_max, y_max),
                    # whereas COCO detection results use [x, y, width, height].
                    x_min, y_min, x_max, y_max = box.tolist()
                    detections.append(
                        {
                            "image_id": image_info["id"],
                            "category_id": exemplar["category_id"],
                            "bbox": [
                                round(x_min, 2),
                                round(y_min, 2),
                                round(x_max - x_min, 2),
                                round(y_max - y_min, 2),
                            ],
                            "score": round(score.item(), 3),
                        }
                    )
        print(f"{position} out of {len(image_url)}")

# Serialized detections as a JSON string.
full_results = json.dumps(detections)

  results = dino_processor.post_process_grounded_object_detection(


1 out of 5000
