In [1]:
import requests

from PIL import Image
import torch

from transformers import Owlv2Processor, Owlv2ForObjectDetection

processor = Owlv2Processor.from_pretrained("google/owlv2-base-patch16-ensemble")

model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-base-patch16-ensemble")

url = "http://images.cocodataset.org/val2017/000000039769.jpg"

image = Image.open(requests.get(url, stream=True).raw)

texts = [["a photo of a cat", "a photo of a dog"]]

inputs = processor(text=texts, images=image, return_tensors="pt")

for k,v in inputs.items():
  print(k,v.shape)

outputs = model(**inputs)

# Target image sizes (height, width) to rescale box predictions [batch_size, 2]

target_sizes = torch.Tensor([image.size[::-1]])

# Convert outputs (bounding boxes and class logits) to Pascal VOC Format (xmin, ymin, xmax, ymax)

results = processor.post_process_object_detection(outputs=outputs, target_sizes=target_sizes, threshold=0.1)

i = 0  # Retrieve predictions for the first image for the corresponding text queries

text = texts[i]

boxes, scores, labels = results[i]["boxes"], results[i]["scores"], results[i]["labels"]

for box, score, label in zip(boxes, scores, labels):
    box = [round(i, 2) for i in box.tolist()]
    print(f"Detected {text[label]} with confidence {round(score.item(), 3)} at location {box}")

  from .autonotebook import tqdm as notebook_tqdm


input_ids torch.Size([2, 16])
attention_mask torch.Size([2, 16])
pixel_values torch.Size([1, 3, 960, 960])
Detected a photo of a cat with confidence 0.614 at location [341.67, 17.54, 642.32, 278.51]
Detected a photo of a cat with confidence 0.665 at location [6.75, 38.97, 326.62, 354.85]


In [2]:
for name, param in model.named_parameters():
    print(f"Layer: {name} | Size: {param.size()}")


Layer: owlv2.logit_scale | Size: torch.Size([])
Layer: owlv2.text_model.embeddings.token_embedding.weight | Size: torch.Size([49408, 512])
Layer: owlv2.text_model.embeddings.position_embedding.weight | Size: torch.Size([16, 512])
Layer: owlv2.text_model.encoder.layers.0.self_attn.k_proj.weight | Size: torch.Size([512, 512])
Layer: owlv2.text_model.encoder.layers.0.self_attn.k_proj.bias | Size: torch.Size([512])
Layer: owlv2.text_model.encoder.layers.0.self_attn.v_proj.weight | Size: torch.Size([512, 512])
Layer: owlv2.text_model.encoder.layers.0.self_attn.v_proj.bias | Size: torch.Size([512])
Layer: owlv2.text_model.encoder.layers.0.self_attn.q_proj.weight | Size: torch.Size([512, 512])
Layer: owlv2.text_model.encoder.layers.0.self_attn.q_proj.bias | Size: torch.Size([512])
Layer: owlv2.text_model.encoder.layers.0.self_attn.out_proj.weight | Size: torch.Size([512, 512])
Layer: owlv2.text_model.encoder.layers.0.self_attn.out_proj.bias | Size: torch.Size([512])
Layer: owlv2.text_model.en

In [3]:
%pip install torchinfo

Collecting torchinfo
  Downloading torchinfo-1.8.0-py3-none-any.whl.metadata (21 kB)
Downloading torchinfo-1.8.0-py3-none-any.whl (23 kB)
Installing collected packages: torchinfo
Successfully installed torchinfo-1.8.0
Note: you may need to restart the kernel to use updated packages.


In [20]:
from torchinfo import summary


print(type(inputs))

inputsDict = {k: v for k,v in inputs.items()}
# Print model summary
summary(model, input_data=inputsDict)

<class 'transformers.tokenization_utils_base.BatchEncoding'>


Layer (type:depth-idx)                                            Output Shape              Param #
Owlv2ForObjectDetection                                           [1, 3601, 768]            --
├─Owlv2Model: 1-1                                                 [1, 3601, 768]            1
│    └─Owlv2VisionTransformer: 2-1                                [1, 768]                  --
│    │    └─Owlv2VisionEmbeddings: 3-1                            [1, 3601, 768]            3,356,160
│    │    └─LayerNorm: 3-2                                        [1, 3601, 768]            1,536
│    │    └─Owlv2Encoder: 3-3                                     [1, 3601, 768]            85,054,464
│    │    └─LayerNorm: 3-4                                        [1, 768]                  1,536
│    └─Owlv2TextTransformer: 2-2                                  [2, 512]                  --
│    │    └─Owlv2TextEmbeddings: 3-5                              [2, 16, 512]              25,305,088
│    │    └─Owlv2