# Object Detection with Grounding DINO

This notebook uses Grounding DINO to perform object detection on keyframes from videos, using previously extracted captions.

## 1. Install required libraries

In [1]:
# Install required libraries
!pip install -q torch torchvision
!pip install -q transformers
!pip install -q timm
!pip install -q huggingface_hub
!pip install -q opencv-python-headless
!pip install -q gdown
!pip install -q matplotlib
!pip install -q Pillow
!pip install -q groundingdino-py

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m95.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m79.2 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m42.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m31.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

## 2. Download data from Google Drive

In [2]:
# Configure batch and Google Drive IDs
BATCH_NAME = "L01"  # Batch name (L01, L02, L03, ...)
BATCH_ID = "14MeYV2WBWwldMDGRrpG9s7vz8triwbWr"  # ID for L01.zip
BATCH_RESULT_ID = "15AVPGtZ6W3C3H8Hc_JF3SBsrhMVUbZWU"  # ID for results file

# Create data directory if it doesn't exist
!mkdir -p data

# Download batch (keyframes) from Google Drive
print(f"Downloading batch {BATCH_NAME}...")
!gdown {BATCH_ID} -O data/{BATCH_NAME}.zip
!unzip -qq data/{BATCH_NAME}.zip -d ./

# Download results file with captions
print(f"Downloading caption results...")
!gdown {BATCH_RESULT_ID} -O data/results.zip
!unzip -qq data/results.zip -d data/

print("Data downloaded successfully!")

# Create directory for detection results
!mkdir -p detection_results

print("Data downloaded successfully!")

Downloading batch L01...
Downloading...
From (original): https://drive.google.com/uc?id=14MeYV2WBWwldMDGRrpG9s7vz8triwbWr
From (redirected): https://drive.google.com/uc?id=14MeYV2WBWwldMDGRrpG9s7vz8triwbWr&confirm=t&uuid=30d1a7d6-3851-46e9-bbf7-2082cc49b066
To: /kaggle/working/data/L01.zip
100%|████████████████████████████████████████| 527M/527M [00:05<00:00, 91.7MB/s]
Downloading caption results...
Downloading...
From: https://drive.google.com/uc?id=15AVPGtZ6W3C3H8Hc_JF3SBsrhMVUbZWU
To: /kaggle/working/data/results.zip
100%|█████████████████████████████████████████| 599k/599k [00:00<00:00, 111MB/s]
Data downloaded successfully!
Data downloaded successfully!


## 3. Import libraries

In [3]:
# Import required libraries
import os
import json
import glob
import torch
import cv2
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from PIL import Image
import numpy as np

# Handle Google Drive connection if running in Colab
try:
    from google.colab import drive
    try:
        drive.mount('/content/drive')
        print("[INFO] Google Drive connected via Colab")
    except NotImplementedError:
        print("[INFO] Current environment does not support mounting Google Drive via Colab")
        # In Kaggle, data is accessed directly from the current directory
except ImportError:
    print("[INFO] Not running in Colab, skipping Google Drive mount")

# Imports for Grounding DINO will be added in the model loading section

[INFO] Current environment does not support mounting Google Drive via Colab


## 4. Check environment

In [4]:
# Check GPU
if torch.cuda.is_available():
    device_count = torch.cuda.device_count()
    current_device = torch.cuda.current_device()
    device_name = torch.cuda.get_device_name(current_device)
    
    print(f"Number of GPUs: {device_count}")
    print(f"Current GPU: {current_device}")
    print(f"GPU name: {device_name}")
    print(f"CUDA version: {torch.version.cuda}")
else:
    print("CUDA not available. Check NVIDIA drivers and PyTorch CUDA installation.")

Number of GPUs: 2
Current GPU: 0
GPU name: Tesla T4
CUDA version: 12.4


## 5. Configure batch and video indices

In [5]:
# Configure batch processing
START_VIDEO_INDEX = 1  # Start from V001
BATCH_SIZE = 8  # Process 8 videos at a time

# Define paths
BATCH_PATH = BATCH_NAME  # Example: "L01"

# Get list of videos in the batch
videos = sorted(glob.glob(os.path.join(BATCH_PATH, "V*")))

# Define paths
BATCH_PATH = BATCH_NAME  # Example: "L01"

# Get list of videos in the batch
videos = sorted(glob.glob(os.path.join(BATCH_PATH, "V*")))
print(f"Found {len(videos)} video directories in batch {BATCH_PATH}")

# Only process videos from START_VIDEO_INDEX to START_VIDEO_INDEX + BATCH_SIZE - 1
end_idx = min(START_VIDEO_INDEX + BATCH_SIZE - 1, len(videos))
selected_videos = videos[START_VIDEO_INDEX - 1:end_idx]
print(f"Processing {len(selected_videos)} videos: {[os.path.basename(v) for v in selected_videos]}")

Found 8 video directories in batch L01
Processing 8 videos: ['V001', 'V002', 'V003', 'V004', 'V005', 'V006', 'V007', 'V008']


## 6. Load Grounding DINO model

In [6]:
# Load Grounding DINO model
device = "cuda" if torch.cuda.is_available() else "cpu"

# Use groundingdino-py directly
from groundingdino.util.inference import load_model, load_image, predict, annotate
import groundingdino.datasets.transforms as T

# Paths for model config and checkpoint
model_config_path = "GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py"
model_checkpoint_path = "weights/groundingdino_swint_ogc.pth"

# Download config and checkpoint
!mkdir -p GroundingDINO/groundingdino/config
!mkdir -p weights
!wget -q -O {model_config_path} https://github.com/IDEA-Research/GroundingDINO/raw/main/groundingdino/config/GroundingDINO_SwinT_OGC.py
!wget -q -O {model_checkpoint_path} https://github.com/IDEA-Research/GroundingDINO/releases/download/v0.1.0-alpha/groundingdino_swint_ogc.pth

# Load model
model = load_model(model_config_path, model_checkpoint_path)
model.to(device)

print(f"Grounding DINO model loaded on device: {device}")

2025-07-19 17:20:15.259894: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1752945615.448073      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1752945615.508158      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


final text_encoder_type: bert-base-uncased


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Grounding DINO model loaded on device: cuda


## 7. Helper functions for extracting objects from captions

In [7]:
# Function to extract meaningful segments from captions to use as prompts
def extract_objects_from_caption(caption):
    """
    Extract meaningful segments from captions to use as prompts for object detection
    """
    # Remove unnecessary meta text
    caption = caption.replace("The image appears to be", "")
    caption = caption.replace("The image shows", "")
    
    # List to store prompt segments
    prompts = []
    
    # Split caption into parts by line breaks
    parts = caption.split('\n')
    
    for part in parts:
        # Skip short lines or lines without content
        if len(part.strip()) < 10:
            continue
            
        # Find specific description sections
        if ':' in part and '**' in part:
            # Example: "**Top Row:** - People walking..."
            topic_parts = part.split(':')
            if len(topic_parts) > 1 and len(topic_parts[1].strip()) > 10:
                prompts.append(topic_parts[1].strip())
        elif '-' in part:
            # Split into parts by hyphens
            bullet_points = part.split('-')
            for point in bullet_points:
                if len(point.strip()) > 10:
                    prompts.append(point.strip())
        elif len(part.strip()) > 20 and part.strip().endswith('.'):
            # Get complete sentences
            sentences = part.split('.')
            for sentence in sentences:
                if len(sentence.strip()) > 20:
                    prompts.append(sentence.strip() + '.')
    
    # If no prompts were found, use the original caption
    if not prompts:
        # If caption is too long, split into smaller segments
        if len(caption) > 200:
            sentences = caption.split('.')
            for sentence in sentences:
                if len(sentence.strip()) > 20:
                    prompts.append(sentence.strip() + '.')
        else:
            prompts.append(caption)
    
    # Limit the number of prompts to avoid overload
    return prompts[:3]

# Function to detect objects with Grounding DINO
def detect_objects(image_path, object_name, box_threshold=0.35, text_threshold=0.25):
    try:
        # Load and preprocess image
        image_source, image = load_image(image_path)
        
        # Detect objects
        boxes, logits, phrases = predict(
            model=model,
            image=image,
            caption=f"Find {object_name}",
            box_threshold=box_threshold,
            text_threshold=text_threshold,
            device=device
        )
        
        # Convert to standard format
        H, W, _ = image_source.shape
        boxes_xyxy = boxes * torch.Tensor([W, H, W, H])
        boxes_xyxy = boxes_xyxy.cpu().numpy().tolist()
        
        return {
            "boxes": boxes_xyxy,
            "scores": logits.cpu().numpy().tolist(),
            "labels": phrases
        }
    except Exception as e:
        print(f"Error processing {os.path.basename(image_path)} with object '{object_name}': {e}")
        return {
            "boxes": [],
            "scores": [],
            "labels": []
        }

# Function to visualize detection results on an image
def visualize_detection(image_path, detection_results):
    image = cv2.imread(image_path)
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    
    for box, score, label in zip(detection_results["boxes"], detection_results["scores"], detection_results["labels"]):
        x1, y1, x2, y2 = map(int, box)
        
        # Draw bounding box
        cv2.rectangle(image, (x1, y1), (x2, y2), (0, 255, 0), 2)
        
        # Draw label and score
        text = f"{label}: {score:.2f}"
        cv2.putText(image, text, (x1, y1-10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)
    
    plt.figure(figsize=(12, 8))
    plt.imshow(image)
    plt.axis('off')
    plt.show()
    
    return image

# Function to calculate IoU (Intersection over Union)
def calculate_iou(box1, box2):
    """Calculate IoU between two bounding boxes"""
    # Box coordinates
    x1_1, y1_1, x2_1, y2_1 = box1
    x1_2, y1_2, x2_2, y2_2 = box2
    
    # Calculate area of each box
    area1 = (x2_1 - x1_1) * (y2_1 - y1_1)
    area2 = (x2_2 - x1_2) * (y2_2 - y1_2)
    
    # Calculate coordinates of intersection
    x1_i = max(x1_1, x1_2)
    y1_i = max(y1_1, y1_2)
    x2_i = min(x2_1, x2_2)
    y2_i = min(y2_1, y2_2)
    
    # Check if there is no intersection
    if x2_i < x1_i or y2_i < y1_i:
        return 0.0
    
    # Calculate area of intersection
    area_intersection = (x2_i - x1_i) * (y2_i - y1_i)
    
    # Calculate IoU
    iou = area_intersection / (area1 + area2 - area_intersection)
    
    return iou

# Function to filter duplicate objects and keep only the highest scoring object
def filter_objects(objects, iou_threshold=0.7, confidence_threshold=0.5):
    """Filter duplicated objects, keep only the highest scoring object for each group of overlapping boxes"""
    # If there are no objects, return empty list
    if not objects:
        return []
    
    # Filter objects based on confidence threshold
    objects = [obj for obj in objects if obj["score"] >= confidence_threshold]
    
    # Sort objects by score in descending order
    sorted_objects = sorted(objects, key=lambda x: x["score"], reverse=True)
    
    # List to store filtered objects
    filtered_objects = []
    
    # Iterate through each object
    for obj in sorted_objects:
        # Check if current object overlaps with any object in filtered_objects
        duplicate = False
        for filtered_obj in filtered_objects:
            # If same object name and IoU greater than threshold
            if obj["object"] == filtered_obj["object"] and \
               calculate_iou(obj["box"], filtered_obj["box"]) > iou_threshold:
                duplicate = True
                break
        
        # If not duplicate, add to filtered list
        if not duplicate:
            filtered_objects.append(obj)
    
    return filtered_objects

## 8. Processing and object detection

In [None]:
# Create directory to save results
!mkdir -p detection_results

# Process each video
for video_dir in selected_videos:
    video_name = os.path.basename(video_dir)
    print(f"\nProcessing video: {video_name}")
    
    # Path to the caption file, based on confirmed directory structure
    caption_file = os.path.join("data", "results", f"{BATCH_NAME}_{video_name}_caption.json")
    
    try:
        with open(caption_file, 'r', encoding='utf-8') as f:
            captions = json.load(f)
        print(f"Successfully loaded {len(captions)} keyframes from {caption_file}")
    except FileNotFoundError:
        print(f"Error: Caption file not found at {caption_file}")
        continue
    except Exception as e:
        print(f"Error reading caption file {caption_file}: {e}")
        continue
    
    # Initialize list to store results
    detection_results = []
    
    # Process each keyframe
    for item in captions:
        keyframe_name = item["keyframe"]
        caption = item["caption"]
        
        # Full path to the keyframe file. video_dir is the correct path (e.g., 'L01/V001')
        keyframe_path = os.path.join(video_dir, keyframe_name)
        
        # Extract prompt segments from caption
        prompts = extract_objects_from_caption(caption)
        
        # Initialize results for current keyframe
        keyframe_results = {
            "keyframe": keyframe_name,
            "caption": caption,
            "objects": []
        }
        
        # Detect objects using each prompt
        for prompt in prompts:
            # Detect objects
            results = detect_objects(keyframe_path, prompt)
            
            # Add results to the list
            for i, (box, score) in enumerate(zip(results["boxes"], results["scores"])):
                label = results["labels"][i] if i < len(results["labels"]) else prompt
                keyframe_results["objects"].append({
                    "prompt": prompt,  # Save the prompt that was used
                    "object": label,
                    "box": box,
                    "score": score
                })
        
        # Apply filter_objects to remove duplicate objects and filter by score
        keyframe_results["objects"] = filter_objects(keyframe_results["objects"])
        
        print(f"Keyframe {keyframe_name}: {len(keyframe_results['objects'])} objects after filtering")
        
        # Add keyframe results to the main list
        detection_results.append(keyframe_results)
    
    # Save results to JSON file
    output_file = os.path.join("detection_results", f"{BATCH_NAME}_{video_name}_detection.json")
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(detection_results, f, ensure_ascii=False, indent=4)
    
    print(f"\nSaved detection results for {video_name} to {output_file}")

print("\nObject detection completed for all videos!")


Processing video: V001
Successfully loaded 803 keyframes from data/results/L01_V001_caption.json


  return fn(*args, **kwargs)
  with torch.cuda.amp.autocast(enabled=False):


Keyframe L01_V001_000000.jpg: 0 objects after filtering
Keyframe L01_V001_000009.jpg: 3 objects after filtering
Keyframe L01_V001_000019.jpg: 0 objects after filtering
Keyframe L01_V001_000020.jpg: 3 objects after filtering
Keyframe L01_V001_000035.jpg: 5 objects after filtering
Keyframe L01_V001_000050.jpg: 1 objects after filtering
Keyframe L01_V001_000052.jpg: 0 objects after filtering
Keyframe L01_V001_000260.jpg: 13 objects after filtering
Keyframe L01_V001_000469.jpg: 1 objects after filtering
Keyframe L01_V001_000470.jpg: 1 objects after filtering
Keyframe L01_V001_000490.jpg: 1 objects after filtering
Keyframe L01_V001_000510.jpg: 1 objects after filtering
Keyframe L01_V001_000511.jpg: 0 objects after filtering
Keyframe L01_V001_000529.jpg: 0 objects after filtering
Keyframe L01_V001_000547.jpg: 0 objects after filtering
Keyframe L01_V001_000548.jpg: 2 objects after filtering
Keyframe L01_V001_000568.jpg: 0 objects after filtering
Keyframe L01_V001_000588.jpg: 1 objects after f

## 9. Compress results for download

In [None]:
# Compress results for download
!cd detection_results && zip -r ../detection_results.zip *
print("\nCreated detection_results.zip file for detection results")

# In Colab, you can download this file by clicking on the folder icon on the left