Make sure the `transformers` library version is up to date.

In [1]:
import requests

import torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection, AutoModelForCausalLM


  from .autonotebook import tqdm as notebook_tqdm


In [8]:
# Choose the compute backend in priority order: CUDA GPU, then Apple MPS,
# and finally plain CPU when no accelerator is reported as available.
device = (
    "cuda" if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available()
    else "cpu"
)

## For testing 

In [12]:
# Single-image smoke test: feed one caption to Grounding DINO and print the
# detections it returns for a sample COCO image.

# Seconds to wait on the image server before giving up; without a timeout a
# stalled connection would block this cell indefinitely.
REQUEST_TIMEOUT_S = 30

# Load the caption from the txt file.
# NOTE(review): the whole file is used as ONE caption here, whereas the COCO
# evaluation cell parses the same filename as " || "-separated records --
# confirm which layout the file has before running this cell.
with open("OFA_output.txt", "r") as f:
    caption = f.read().strip()


# DINO
# The `.to(device)` moves are commented out, so the model and its inputs stay
# wherever `from_pretrained` / the processor place them by default.
dino_processor = AutoProcessor.from_pretrained("IDEA-Research/grounding-dino-tiny")
dino_model = AutoModelForZeroShotObjectDetection.from_pretrained("IDEA-Research/grounding-dino-tiny") # .to(device)

image_path = "http://images.cocodataset.org/val2017/000000039769.jpg"
# Close the streamed response once the pixels are loaded, and fail loudly on an
# HTTP error instead of handing an error page to PIL.
with requests.get(image_path, stream=True, timeout=REQUEST_TIMEOUT_S) as response:
    response.raise_for_status()
    # convert() forces the image to be fully decoded while the stream is still
    # open, and matches the RGB conversion done in the COCO evaluation cell.
    image = Image.open(response.raw).convert("RGB")

# The caption is passed together with a second label, "null".
text_labels = [[caption, "null"]]

# Use OFA caption as input to DINO
dino_inputs = dino_processor(images=image, text=text_labels, return_tensors="pt") #.to(device)
with torch.no_grad():
    dino_outputs = dino_model(**dino_inputs)

# Post-process and display results
results = dino_processor.post_process_grounded_object_detection(
    dino_outputs,
    dino_inputs.input_ids,
    box_threshold=0.4,
    text_threshold=0.3,
    # PIL reports (width, height); reversed here to (height, width).
    target_sizes=[image.size[::-1]]
)
# Only one image was submitted, so only the first result entry is inspected.
result = results[0]
for box, score, label in zip(result["boxes"], result["scores"], result["text_labels"]):
    box = [round(x, 2) for x in box.tolist()]
    print(f"Detected {label} with confidence {round(score.item(), 3)} at location {box}")

Detected orange cat with confidence 0.536 at location [345.46, 23.5, 637.62, 373.86]


## COCO Evaluation

In [2]:
import json

# Run Grounding DINO over every record in OFA_output.txt and collect the
# detections as a list of {"image_id", "category_id", "bbox", "score"} dicts,
# which is finally serialised to a JSON string in `full_results`.

# Seconds to wait on the image server before giving up; without a timeout a
# single stalled download blocks the whole loop.
REQUEST_TIMEOUT_S = 30

# DINO
# The `.to(device)` moves are commented out, so the model and its inputs stay
# wherever `from_pretrained` / the processor place them by default.
dino_processor = AutoProcessor.from_pretrained("IDEA-Research/grounding-dino-tiny")
dino_model = AutoModelForZeroShotObjectDetection.from_pretrained("IDEA-Research/grounding-dino-tiny") # .to(device)

# Each non-blank line is parsed as:
#   <image url> || <caption> || <image_id> || <category_id>
with open("OFA_output.txt", "r") as f:
    lines = f.readlines()

full_results = []
for idx, line in enumerate(lines):
    record = line.strip()
    if not record:
        # Blank lines (e.g. a trailing newline) hold no record; skip them
        # instead of reporting a spurious parsing error.
        continue
    try:
        # Split line into its " || "-separated fields.
        parts = record.split(" || ")
        if len(parts) < 4:
            raise ValueError(
                f"expected 4 ' || '-separated fields, got {len(parts)}"
            )
        image_path, caption = parts[0], parts[1]
        image_id = int(parts[2])
        category_id = int(parts[3])

        # Close the streamed response once the pixels are loaded, and fail on
        # an HTTP error instead of handing an error page to PIL.
        with requests.get(image_path, stream=True, timeout=REQUEST_TIMEOUT_S) as response:
            response.raise_for_status()
            # convert() fully decodes the image while the stream is still open.
            image = Image.open(response.raw).convert('RGB')

        # The caption is passed together with a second label, "null".
        text_labels = [[caption, "null"]]

        # Process with DINO
        dino_inputs = dino_processor(images=image, text=text_labels, return_tensors="pt") # .to(device)
        with torch.no_grad():
            dino_outputs = dino_model(**dino_inputs)

        # Post-process
        results = dino_processor.post_process_grounded_object_detection(
            dino_outputs,
            dino_inputs.input_ids,
            box_threshold=0.4,
            text_threshold=0.3,
            # PIL reports (width, height); reversed here to (height, width).
            target_sizes=[image.size[::-1]]
        )

        # Record every detection returned for this image. The category id comes
        # from the input line, not from the detected text label.
        for result in results:
            for box, score, _label in zip(result["boxes"], result["scores"], result["text_labels"]):
                box = [round(x, 2) for x in box.tolist()]
                formatted_results = {
                    "image_id": image_id,
                    "category_id": category_id,
                    "bbox": box,
                    "score": round(score.item(), 3)
                }
                full_results.append(formatted_results)

    except Exception as e:
        # Best-effort loop: report the failing record and continue with the rest.
        print(f"[{idx}] Error processing line: {record}")
        print(e)

# Serialise once at the end; the following cell writes this string to disk.
full_results = json.dumps(full_results)

KeyboardInterrupt: 

In [None]:
# Persist the JSON string built by the evaluation cell to disk.
with open("OFA_results.json", "w") as results_file:
    results_file.write(full_results)

## Processing the result

In [None]:
# Rewrite each saved bounding box from corner form [x_min, y_min, x_max, y_max]
# to [x_min, y_min, width, height] and save the result to a new file.
# NOTE(review): the width/height layout is presumably what the COCO evaluation
# tooling expects for result files -- confirm against the evaluator in use.

# Read through a context manager so the input file handle is always closed.
with open("OFA_results.json") as f:
    detections = json.load(f)

for detection in detections:
    x_min, y_min, x_max, y_max = detection["bbox"]
    detection["bbox"] = [
        x_min,
        y_min,
        round(x_max - x_min, 2),  # width
        round(y_max - y_min, 2),  # height
    ]

# `results` ends up holding the serialised JSON string that is written out.
results = json.dumps(detections)
with open("OFA_processed_results.json", "w") as f:
    f.write(results)