In [1]:
import av
import torch
import numpy as np
import os
from PIL import Image
from transformers import AutoProcessor, GroundingDinoForObjectDetection

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def extract_keyframes(video_path):
    """Collect the key frames of a video together with their frame positions.

    Every frame of the first video stream is decoded so that the reported
    positions count all frames, not only key frames. The first decoded frame
    is always kept, whether or not it is flagged as a key frame.

    Args:
        video_path: Location of the video, passed straight to ``av.open``.

    Returns:
        A ``(keyframe_index, keyframes)`` tuple. ``keyframe_index`` is a list
        with the 0-based decode-order position of each kept frame, and
        ``keyframes`` is ``np.array`` of the matching ``rgb24`` frame arrays
        in the same order.
    """
    keyframe_index = []
    keyframes = []
    container = av.open(video_path)
    try:
        stream = container.streams.video[0]
        for frame_index, frame in enumerate(container.decode(stream)):
            if frame_index == 0 or frame.key_frame:
                keyframes.append(frame.to_ndarray(format='rgb24'))
                keyframe_index.append(frame_index)
    finally:
        # Release the container even when stream lookup or decoding raises.
        container.close()
    return keyframe_index, np.array(keyframes)

In [3]:
# Decode the demo video once: `keyframe_index` receives the positions of the kept
# frames and `keyframe` the stacked array of those frames.
keyframe_index, keyframe = extract_keyframes('input_videos/hong_kong_airport_demo_data.mp4')

In [4]:
# Show which frame positions were kept as key frames.
print(keyframe_index)

[0, 44, 196, 348, 500, 652, 804, 956, 1108, 1260, 1412, 1564, 1716, 1868, 2020, 2172, 2324, 2476, 2628]


In [45]:
def sample_keyframes(keyframe_index, keyframes, sampling_ratio):
    """Pick an evenly spaced subset of key frames.

    Args:
        keyframe_index: Sequence of frame positions, one entry per key frame.
        keyframes: Key frames aligned with ``keyframe_index``; an ndarray or
            any sequence that ``np.asarray`` can convert.
        sampling_ratio: Fraction of the key frames to keep. The sample count
            is ``int(len(keyframe_index) * sampling_ratio)`` (truncated) and
            is capped at the number of key frames, so a ratio above 1 returns
            every frame once instead of repeating frames.

    Returns:
        A ``(keyframe_index_samples, keyframe_samples)`` tuple: the list of
        selected entries of ``keyframe_index`` and the ndarray of the
        corresponding frames, both in ascending position order.

    Raises:
        ValueError: If ``sampling_ratio`` is negative.
    """
    if sampling_ratio < 0:
        raise ValueError(f"sampling_ratio must be non-negative, got {sampling_ratio}")
    total = len(keyframe_index)
    # Cap at `total`: asking linspace for more points than positions would
    # yield repeated integer positions, i.e. duplicate frames.
    num_samples = min(int(total * sampling_ratio), total)
    sampled_indices = np.linspace(0, total - 1, num_samples, dtype=int)
    # asarray lets a plain list of frames be fancy-indexed like an ndarray.
    keyframe_samples = np.asarray(keyframes)[sampled_indices]
    keyframe_index_samples = [keyframe_index[i] for i in sampled_indices]
    return keyframe_index_samples, keyframe_samples

In [46]:
# A sampling ratio of 1 keeps every extracted key frame.
sampled_keyframe_indices, sampled_keyframes = sample_keyframes(keyframe_index, keyframe, 1)

In [47]:
# NOTE(review): a bare expression displays the repr of the whole frame array;
# `sampled_keyframes.shape` would be a lighter sanity check.
sampled_keyframes

array([[[[139, 130, 126],
         [139, 130, 126],
         [139, 130, 126],
         ...,
         [148, 137, 122],
         [140, 129, 114],
         [151, 140, 125]],

        [[ 97,  88,  84],
         [ 97,  88,  84],
         [ 97,  88,  84],
         ...,
         [145, 134, 119],
         [149, 138, 123],
         [151, 140, 125]],

        [[ 82,  73,  69],
         [ 82,  73,  69],
         [ 82,  73,  69],
         ...,
         [148, 137, 122],
         [149, 138, 123],
         [132, 121, 106]],

        ...,

        [[154, 139, 119],
         [154, 139, 119],
         [154, 139, 119],
         ...,
         [137, 179, 227],
         [136, 178, 235],
         [136, 178, 235]],

        [[154, 139, 119],
         [154, 139, 119],
         [154, 139, 119],
         ...,
         [134, 179, 235],
         [134, 178, 241],
         [134, 178, 241]],

        [[154, 139, 119],
         [154, 139, 119],
         [154, 139, 119],
         ...,
         [133, 178, 234],
        

In [57]:
# Write every sampled key frame to disk as a JPEG so it can be inspected by eye.
output_dir = 'iframes'
os.makedirs(output_dir, exist_ok=True)
for i, frame in enumerate(sampled_keyframes):
    img = Image.fromarray(frame)
    # Join directory and file name as separate components; the directory
    # literal is defined once above rather than repeated inside the f-string.
    output_path = os.path.join(output_dir, f'iframe_{i}.jpg')
    img.save(output_path)

In [5]:
# Load the Grounding DINO processor and detector from the same checkpoint;
# the detector is then moved to the GPU.
checkpoint = "IDEA-Research/grounding-dino-base"
processor = AutoProcessor.from_pretrained(checkpoint)
model = GroundingDinoForObjectDetection.from_pretrained(checkpoint)
model = model.to('cuda')

If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'].
Could not load the custom kernel for multi-scale deformable attention: Error building extension 'MultiScaleDeformableAttention': [1/4] c++ -MMD -MF ms_deform_attn_cpu.o.d -DTORCH_EXTENSION_NAME=MultiScaleDeformableAttention -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" -I/home/srijan/.conda/envs/seiden/lib/python3.12/site-packages/transformers/kernels/deformable_detr -isystem /home/srijan/.conda/envs/seiden/lib/python3.12/site-packages/torch/include -isystem /home/srijan/.conda/envs/seiden/lib/python3.12/site-packages/torch/include/torch/csrc/api/include -isystem /home/srijan/.conda/envs/seiden/lib/python3.12/site-packages/torch/include/TH -isystem /home/srijan/.conda/envs/seiden/lib/python3.12/site-packages/torch/include/THC -isystem /home/srijan/.conda/envs/seiden/include/python3.12 -D_GLIBCXX_USE_CXX11_ABI=0 -fPIC -std=c++17

In [48]:
def create_index(model, processor, sampled_keyframes, sampled_keyframe_indices, device='cuda',
                 text='pink suitcase', box_threshold=0.3, text_threshold=0.2):
    """Run text-grounded detection on each sampled frame and record the scores.

    Args:
        model: Detection model, called as ``model(**inputs)``.
        processor: Object used both to build the model inputs and to
            post-process the outputs via
            ``post_process_grounded_object_detection``.
        sampled_keyframes: Iterable of frames; the first two dimensions of
            each frame's ``shape`` are passed as its target size.
        sampled_keyframe_indices: Frame positions aligned with
            ``sampled_keyframes``; used as the keys of the result.
        device: Device the processor inputs are moved to. It should match the
            device the model lives on.
        text: Text prompt describing the object to look for.
        box_threshold: Forwarded to the post-processing step.
        text_threshold: Forwarded to the post-processing step.

    Returns:
        Dict mapping each frame position to a CPU tensor holding the
        ``'scores'`` of the detections kept for that frame (empty when
        nothing passed the thresholds).
    """
    # NOTE(review): the Grounding DINO docs recommend lowercase prompts ending
    # with a period (e.g. "pink suitcase."); the default is left as originally
    # written so existing results stay comparable. Confirm before changing.
    index = {}
    for i, frame in enumerate(sampled_keyframes):
        inputs = processor(images=frame, text=text, return_tensors="pt").to(device)
        # Inference only: no gradients are needed.
        with torch.no_grad():
            outputs = model(**inputs)
        result = processor.post_process_grounded_object_detection(
            outputs,
            inputs.input_ids,
            box_threshold=box_threshold,
            text_threshold=text_threshold,
            target_sizes=torch.tensor([[frame.shape[0], frame.shape[1]]])
        )
        # One image per call, so only the first result entry is relevant.
        index[sampled_keyframe_indices[i]] = result[0]['scores'].cpu()
    return index

In [49]:
# Score every sampled key frame; `device` is left at create_index's default.
index = create_index(model, processor, sampled_keyframes, sampled_keyframe_indices)

In [50]:
# Show the per-frame detection scores; an empty tensor means nothing was detected in that frame.
print(index)

{0: tensor([]), 44: tensor([]), 196: tensor([]), 348: tensor([]), 500: tensor([0.3449, 0.3505]), 652: tensor([]), 804: tensor([]), 956: tensor([]), 1108: tensor([]), 1260: tensor([0.3317, 0.3119]), 1412: tensor([]), 1564: tensor([0.3447]), 1716: tensor([]), 1868: tensor([]), 2020: tensor([0.3149]), 2172: tensor([0.3240]), 2324: tensor([]), 2476: tensor([0.3049]), 2628: tensor([])}
