<a href="https://colab.research.google.com/github/Vashi050197/Detect_Anything/blob/main/grounding_dino_example_transformers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install transformers from the `adding-grounding-dino` branch of the EduardoPach fork,
# plus the supervision package; -qq keeps pip's output to a minimum.
# NOTE(review): neither install is pinned to a commit/version, so re-runs may pick up different code.
!pip install -qq git+https://github.com/EduardoPach/transformers.git@adding-grounding-dino
!pip install -qq supervision

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m311.2/311.2 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m27.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m21.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.2/72.2 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import cv2
import torch
import requests
import numpy as np
from PIL import Image
from tqdm import tqdm
import supervision as sv
import matplotlib.pyplot as plt
from transformers import AutoModelForObjectDetection, AutoProcessor

In [3]:
# Checkpoint to load, and the device to run it on (GPU when one is available).
model_id = "EduardoPacheco/grounding-dino-tiny"
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the processor and the detection model from the same checkpoint,
# then move the model onto the selected device.
processor = AutoProcessor.from_pretrained(model_id)
model = AutoModelForObjectDetection.from_pretrained(model_id)
model = model.to(device)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.66k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/692M [00:00<?, ?B/s]

Downloading (…)rocessor_config.json:   0%|          | 0.00/457 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.27k [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/82.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
def predict(images, text, threshold=0.35):
    """
    Run the detection model on a batch of images and draw the detected boxes.

    Parameters:
    - images: A list of PIL images (each must expose `.size`).
    - text: The text prompt passed to the processor, e.g. "bottle.".
    - threshold: Score threshold forwarded to post_process_object_detection.
      Defaults to 0.35, the value that used to be hard-coded.

    Returns:
    A list with one annotated NumPy array per input image, in input order.
    """
    inputs = processor(images=images, text=text, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model(**inputs)

    # Needs to be on cpu for post_process_object_detection
    outputs.logits = outputs.logits.cpu()
    outputs.pred_boxes = outputs.pred_boxes.cpu()

    # PIL's `.size` is (width, height); reverse it to get (height, width) target sizes
    target_sizes = [image.size[::-1] for image in images]
    results = processor.image_processor.post_process_object_detection(
        outputs, threshold=threshold, target_sizes=target_sizes
    )

    # The annotator does not depend on the image, so build it once instead of per frame
    annot = sv.BoundingBoxAnnotator()

    frames = []
    for image, result in zip(images, results):
        detections = sv.Detections.from_transformers(result)
        img_annot = annot.annotate(scene=np.array(image), detections=detections)
        frames.append(img_annot)

    return frames

# Video

In [5]:
# Source video to run detection on (relative path, resolved against the working directory)
path = "Bottle.mp4"

In [6]:
# Read every frame of the video, converting each from OpenCV's BGR order
# to an RGB PIL image before collecting it.
frames = []
video_frames = sv.get_video_frames_generator(source_path=path)
for frame in tqdm(video_frames, unit="frames"):
    rgb_pixels = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    frames.append(Image.fromarray(rgb_pixels))

93frames [00:00, 128.77frames/s]


In [12]:
# To avoid OOM error I'll predict image by image:
# each call gets a one-image batch and we keep its single annotated frame.
frame_annot = [predict([frame], "bottle.")[0] for frame in frames]

In [13]:
def plot_images_in_grid(image_list, grid_shape=None, figsize=(10, 10)):
    """
    Plot a list of images in a grid layout.

    Parameters:
    - image_list: A list (or other iterable) of NumPy arrays representing images.
      If it is empty, nothing is plotted and the function returns immediately.
    - grid_shape: A tuple (rows, columns) specifying the grid layout. If None, the function
      builds a roughly square grid large enough to hold every image.
    - figsize: A tuple specifying the size of the entire figure.

    Raises:
    - ValueError: If grid_shape is given but has fewer cells than there are images.

    Example usage:
    plot_images_in_grid(image_list, grid_shape=(2, 3), figsize=(12, 8))
    """
    # Materialize once so generators work and len() is always available
    image_list = list(image_list)
    num_images = len(image_list)

    if num_images == 0:
        # Nothing to draw; this also avoids dividing by zero rows below
        return

    if grid_shape is None:
        # If grid_shape is not provided, try to create a square grid
        rows = int(np.sqrt(num_images))
        cols = int(np.ceil(num_images / rows))
    else:
        rows, cols = grid_shape
        if rows * cols < num_images:
            # Fail early with a clear message instead of part-way through plotting
            raise ValueError(
                f"grid_shape {rows}x{cols} has {rows * cols} cells "
                f"but {num_images} images were given"
            )

    plt.figure(figsize=figsize)

    for i, image in enumerate(image_list):
        plt.subplot(rows, cols, i + 1)
        # cmap only affects single-channel images; RGB(A) arrays are shown as-is
        plt.imshow(image, cmap='gray')
        plt.axis('off')

    plt.show()

In [14]:
def create_video_from_frames_rgb(frame_list, output_filename, fps):
    """
    Write a sequence of RGB frames to a video file using OpenCV.

    Parameters:
    - frame_list: A non-empty list of 3-D NumPy arrays (height, width, channels) in RGB order.
      The video dimensions are taken from the first frame.
    - output_filename: Path of the video file to create.
    - fps: Frame rate of the output video.

    Raises:
    - ValueError: If frame_list is empty.
    - IOError: If the VideoWriter could not be opened for output_filename.
    """
    if len(frame_list) == 0:
        raise ValueError("frame_list is empty; at least one frame is required")

    # Get the shape of the frames to determine video dimensions
    frame_height, frame_width, _ = frame_list[0].shape

    # Define the codec and create a VideoWriter object
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')  # You can change the codec as needed
    out = cv2.VideoWriter(output_filename, fourcc, fps, (frame_width, frame_height))

    if not out.isOpened():
        # Without this check a failed open would silently drop every frame
        out.release()
        raise IOError(f"Could not open video writer for {output_filename!r}")

    try:
        for frame in frame_list:
            # VideoWriter expects BGR, so convert each RGB frame before writing
            out.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
    finally:
        # Always release the writer so the file is finalized even if a frame fails
        out.release()

In [15]:
# Write the annotated frames (the NumPy arrays collected in frame_annot)
# to "output_video2.mp4" at 30 frames per second (fps).
# NOTE(review): 30 fps is hard-coded here; confirm it matches the source video's frame rate.
create_video_from_frames_rgb(frame_annot, "output_video2.mp4", fps=30)

In [16]:
from IPython.display import Video

# The annotated video written by create_video_from_frames_rgb
video_path = 'output_video2.mp4'

# Play the video. embed=True inlines the file's bytes into the output, so playback
# does not depend on the notebook front end being able to serve local files (e.g. on Colab).
# NOTE(review): the file is encoded with the 'mp4v' codec; confirm your browser can decode it.
Video(video_path, embed=True)

In [14]:
# Play the original (un-annotated) source video for comparison.
# embed=True inlines the file so it also plays where local files are not served (e.g. Colab).
Video(path, embed=True)

In [None]:
# Leftover path from an earlier run: /content/output_video1.mp4