In [6]:
import torch
from transformers import AutoProcessor, GroundingDinoForObjectDetection
import cv2
import time
import torch.cuda as cuda
from PIL import Image
model_id = "IDEA-Research/grounding-dino-tiny"
device = "cuda"

In [7]:
processor = AutoProcessor.from_pretrained(model_id)
model = GroundingDinoForObjectDetection.from_pretrained(model_id).to("cuda")

RuntimeError: No CUDA GPUs are available

In [29]:
model.eval()

GroundingDinoForObjectDetection(
  (model): GroundingDinoModel(
    (backbone): GroundingDinoConvModel(
      (conv_encoder): GroundingDinoConvEncoder(
        (model): SwinBackbone(
          (embeddings): SwinEmbeddings(
            (patch_embeddings): SwinPatchEmbeddings(
              (projection): Conv2d(3, 96, kernel_size=(4, 4), stride=(4, 4))
            )
            (norm): LayerNorm((96,), eps=1e-05, elementwise_affine=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
          (encoder): SwinEncoder(
            (layers): ModuleList(
              (0): SwinStage(
                (blocks): ModuleList(
                  (0): SwinLayer(
                    (layernorm_before): LayerNorm((96,), eps=1e-05, elementwise_affine=True)
                    (attention): SwinAttention(
                      (self): SwinSelfAttention(
                        (query): Linear(in_features=96, out_features=96, bias=True)
                        (key): Linear(in_features=9

In [33]:

image = Image.open("test.jpeg")
inputs = processor(images=image, text="a dog", return_tensors="pt").to("cuda")
times = {"non_specific": [], "specific": []}
# starter, ender = cuda.Event(enable_timing=True), cuda.Event(enable_timing=True)

In [30]:
class TimerHook:
    def __init__(self, name):
        self.name = name
        self.start_time = 0
        self.total_time = 0
        
    def start(self, module, input):
        self.start_time = time.perf_counter()
        
    def end(self, module, input, output):
        self.total_time += time.perf_counter() - self.start_time

In [38]:
backbone_timer = TimerHook("backbone")
text_backbone_timer = TimerHook("text_backbone")
# Register hooks
backbone_forward_hook = model.model.backbone.register_forward_pre_hook(backbone_timer.start)
backbone_backward_hook = model.model.backbone.register_forward_hook(backbone_timer.end)

text_backbone_forward_hook = model.model.text_backbone.register_forward_pre_hook(text_backbone_timer.start)
text_backbone_backward_hook = model.model.text_backbone.register_forward_hook(text_backbone_timer.end)

start_total = time.perf_counter()
output = model(**inputs)
total_time = time.perf_counter() - start_total

# Remove hooks
backbone_forward_hook.remove()
backbone_backward_hook.remove()
text_backbone_forward_hook.remove()
text_backbone_backward_hook.remove()

In [32]:
def run_video(model, video_path, query):
    backbone_timer = TimerHook("backbone")
    text_backbone_timer = TimerHook("text_backbone")
    backbone_forward_hook = model.model.backbone.register_forward_pre_hook(backbone_timer.start)
    backbone_backward_hook = model.model.backbone.register_forward_hook(backbone_timer.end)
    text_backbone_forward_hook = model.model.text_backbone.register_forward_pre_hook(text_backbone_timer.start)
    text_backbone_backward_hook = model.model.text_backbone.register_forward_hook(text_backbone_timer.end)
    cap = cv2.VideoCapture(video_path)
    frame_count = 0
    total_time = 0
    with torch.no_grad():
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frame = torch.from_numpy(frame).float().to("cuda")
            inputs = processor(images=frame, text=query, return_tensors="pt").to("cuda")
            start_time = time.perf_counter()
            _ = model(**inputs)
            end_time = time.perf_counter() - start_time
            total_time += end_time
            frame_count += 1
    cap.release()
    backbone_forward_hook.remove()
    backbone_backward_hook.remove()
    text_backbone_forward_hook.remove()
    text_backbone_backward_hook.remove()
    return {
        "model": model_id,
        "video": video_path,
        "query": query,
        "frame_count": frame_count,
        "non-query specific part (s)": text_backbone_timer.total_time + backbone_timer.total_time,
        "query specific part (s)": total_time - (text_backbone_timer.total_time + backbone_timer.total_time),
        "non-query specific part (%)": (text_backbone_timer.total_time + backbone_timer.total_time) / total_time * 100,
        "query specific part (%)": (total_time - (text_backbone_timer.total_time + backbone_timer.total_time)) / total_time * 100
    }

In [33]:
data = run_video(model, './hong_kong_airport_demo_data.mp4', 'pink suitcase')
print(data)

{'model': 'IDEA-Research/grounding-dino-tiny', 'video': './hong_kong_airport_demo_data.mp4', 'query': 'pink suitcase', 'frame_count': 2701, 'non-query specific part (s)': 42.16182706599557, 'query specific part (s)': 803.8611774740166, 'non-query specific part (%)': 4.983531989052616, 'query specific part (%)': 95.01646801094739}


In [2]:
import numpy as np
import torch
from mmengine.config import Config
from mmengine.dataset import Compose
from mmengine.runner import Runner
from mmengine.runner.amp import autocast
from mmyolo.registry import RUNNERS
from torchvision.ops import nms

In [3]:
cfg = Config.fromfile(
        "YOLO-World/configs/pretrain/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py"
    )
cfg.work_dir = "."
cfg.load_from = "yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain_1280ft-14996a36.pth"
runner = Runner.from_cfg(cfg)
runner.call_hook("before_run")
runner.load_or_resume()
pipeline = cfg.test_dataloader.dataset.pipeline
runner.pipeline = Compose(pipeline)
runner.model.eval()

12/24 00:11:01 - mmengine - [4m[97mINFO[0m - 
------------------------------------------------------------
System environment:
    sys.platform: linux
    Python: 3.10.0 (default, Mar  3 2022, 09:58:08) [GCC 7.5.0]
    CUDA available: False
    MUSA available: False
    numpy_random_seed: 791419896
    GCC: gcc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0
    PyTorch: 1.11.0
    PyTorch compiling details: PyTorch built with:
  - GCC 7.3
  - C++ Version: 201402
  - Intel(R) oneAPI Math Kernel Library Version 2021.4-Product Build 20210904 for Intel(R) 64 architecture applications
  - Intel(R) MKL-DNN v2.5.2 (Git Hash a9302535553c73243c632ad3c4c80beec3d19a1e)
  - OpenMP 201511 (a.k.a. OpenMP 4.5)
  - LAPACK is enabled (usually provided by MKL)
  - NNPACK is enabled
  - CPU capability usage: AVX2
  - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, CUDA_VERSION=11.3, CUDNN_VERSION=8.2.0, CXX_COMPILER=/opt/rh/devtoolset-7/root/usr/bin/c++, CXX_FLAGS= -Wno-deprecated -fvisibility-inlines-hidde

YOLOWorldDetector(
  (data_preprocessor): YOLOWDetDataPreprocessor()
  (backbone): MultiModalYOLOBackbone(
    (image_model): YOLOv8CSPDarknet(
      (stem): ConvModule(
        (conv): Conv2d(3, 80, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(80, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
        (activate): SiLU(inplace=True)
      )
      (stage1): Sequential(
        (0): ConvModule(
          (conv): Conv2d(80, 160, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
          (bn): BatchNorm2d(160, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
          (activate): SiLU(inplace=True)
        )
        (1): CSPLayerWithTwoConv(
          (main_conv): ConvModule(
            (conv): Conv2d(160, 160, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (bn): BatchNorm2d(160, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
            (activate): SiLU(inplace=True)


In [4]:
def run_yoloworld(video_path, text_query):
    frame_count = 0
    cap = cv2.VideoCapture(video_path)
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        frame = torch.from_numpy(frame).float().to("cuda")
        frame_count += 1
    cap.release()
    return frame_count

In [5]:
run_yoloworld('hong_kong_airport_demo_data.mp4', 'McDonalds Logo')

RuntimeError: No CUDA GPUs are available