# X-CLIP: Text-to-Video Retrieval

## XCLIPModel

In [44]:
# ---------------------------------------------------------------------------
# Checkpoint selection.
# Exactly one `model_name` assignment below is left uncommented. The note at
# the end of every option is the `clip_len` value to pair with it.
# ---------------------------------------------------------------------------

# -- General checkpoints, clip_len = 8
# model_name = "microsoft/xclip-base-patch32" # clip_len = 8
# model_name = "microsoft/xclip-base-patch16" # clip_len = 8
# model_name = "microsoft/xclip-large-patch14" # clip_len = 8

# -- General checkpoints, clip_len = 16
# model_name = "microsoft/xclip-base-patch32-16-frames" # clip_len = 16
model_name = "microsoft/xclip-base-patch16-16-frames" # clip_len = 16
# model_name = "microsoft/xclip-large-patch14-16-frames" #  clip_len = 16

# -- HMDB-51
# model_name = "microsoft/xclip-base-patch16-hmdb-2-shot" # clip_len = 32
# model_name = "microsoft/xclip-base-patch16-hmdb-4-shot" # clip_len = 32
# model_name = "microsoft/xclip-base-patch16-hmdb-8-shot" # clip_len = 32
# model_name = "microsoft/xclip-base-patch16-hmdb-16-shot" # clip_len = 32

# -- UCF-101
# model_name = "microsoft/xclip-base-patch16-ucf-2-shot" # clip_len = 32
# model_name = "microsoft/xclip-base-patch16-ucf-4-shot" # clip_len = 32
# model_name = "microsoft/xclip-base-patch16-ucf-8-shot" # clip_len = 32
# model_name = "microsoft/xclip-base-patch16-ucf-16-shot" # clip_len = 32

# -- Kinetics-400
# model_name = "microsoft/xclip-base-patch16-zero-shot" # clip_len = 8

# -- Kinetics-600
# model_name = "microsoft/xclip-base-patch16-kinetics-600" # clip_len = 8
# model_name = "microsoft/xclip-large-patch14-kinetics-600" # clip_len = 8
# model_name = "microsoft/xclip-base-patch16-kinetics-600-16-frames" # clip_len = 16

# Text prompts: every category name is dropped into the same sentence template.
label_list = [
    f"A video of action, {category}"
    for category in (
        "abuse", "arrest", "arson", "assault", "burglary",
        "explosion", "fighting", "road accident", "robbery", "shooting",
        "shoplifting", "stealing", "vandalism",
    )
]

# Value later handed to np.random.seed so the sampled frame window is repeatable.
seed = 826
# Number of frames sampled per video; keep in sync with the active checkpoint.
clip_len = 16

label_list

['A video of action, abuse',
 'A video of action, arrest',
 'A video of action, arson',
 'A video of action, assault',
 'A video of action, burglary',
 'A video of action, explosion',
 'A video of action, fighting',
 'A video of action, road accident',
 'A video of action, robbery',
 'A video of action, shooting',
 'A video of action, shoplifting',
 'A video of action, stealing',
 'A video of action, vandalism']

In [45]:
import av
import torch
import numpy as np

from transformers import AutoProcessor, AutoModel
from huggingface_hub import hf_hub_download

def read_video_pyav(container, indices):
    '''
    Decode selected frames of a video with the PyAV decoder.

    The first video stream is decoded from the beginning and decoding stops as
    soon as the frame counter passes the last requested index, so `indices` is
    expected to be in ascending order. A repeated index yields a single frame,
    and an index past the end of the stream is silently skipped, so the result
    may hold fewer frames than `len(indices)`.

    Args:
        container (`av.container.input.InputContainer`): PyAV container.
        indices (`List[int]`): List of frame indices to decode.
    Returns:
        result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
    Raises:
        ValueError: if none of the requested indices could be decoded.
    '''
    container.seek(0)
    start_index = indices[0]
    end_index = indices[-1]
    # Set membership is O(1); testing `i in ndarray` would rescan the whole
    # array for every decoded frame.
    wanted = {int(index) for index in indices}

    frames = []
    for i, frame in enumerate(container.decode(video=0)):
        if i > end_index:
            break
        if i >= start_index and i in wanted:
            # Convert right away so only the RGB arrays are kept around.
            frames.append(frame.to_ndarray(format="rgb24"))

    if not frames:
        # np.stack([]) would fail with an unhelpful "need at least one array".
        raise ValueError(
            f"No frames decoded for indices in [{start_index}, {end_index}]; "
            "the video stream may be shorter than the requested range."
        )
    return np.stack(frames)


def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
    '''
    Sample a given number of frame indices from the video.

    A window of `clip_len * frame_sample_rate` frames is placed at a random
    position (drawn from NumPy's global RNG) and `clip_len` evenly spaced
    indices are taken from it.

    Args:
        clip_len (`int`): Total number of frames to sample.
        frame_sample_rate (`int`): Sample every n-th frame.
        seg_len (`int`): Exclusive upper bound for the window's end index
            (typically the number of frames in the video).
    Returns:
        indices (`np.ndarray`): int64 array of `clip_len` sampled frame indices.
    Raises:
        ValueError: if the window does not fit, i.e.
            `clip_len * frame_sample_rate >= seg_len`.
    '''
    converted_len = int(clip_len * frame_sample_rate)
    if converted_len >= seg_len:
        # np.random.randint would fail here anyway, with only "low >= high".
        raise ValueError(
            f"Cannot sample a window of {converted_len} frames "
            f"(clip_len={clip_len} * frame_sample_rate={frame_sample_rate}) "
            f"from a segment with seg_len={seg_len}."
        )
    end_idx = np.random.randint(converted_len, seg_len)
    start_idx = end_idx - converted_len
    indices = np.linspace(start_idx, end_idx, num=clip_len)
    # linspace includes end_idx itself; pull it back so the last index stays
    # strictly below the window end.
    indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
    return indices

In [50]:
import os

# Folder that holds the sample videos to score.
dir_path = 'data/Sample/'

# os.listdir returns entries in arbitrary order; sorting keeps the order of
# the videos (and of any per-video results built from it) stable across runs.
video_path_list = sorted(os.listdir(dir_path))
video_path_list

['Abuse001_x264.mp4', '몬스터볼.mp4', '지구.mp4', '지구총.mp4', '폭발.mp4']

### Text Input

In [53]:
# Text query that every video in `video_path_list` is scored against.
label = 'shooting'

print(f'label : {label}')

# Loading the processor and the model does not depend on the video, so it is
# done once here instead of once per file inside the loop.
processor = AutoProcessor.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# One video-text similarity score per entry of `video_path_list`, same order.
similarity_list = []

for file_path in video_path_list:

    print(file_path)

    # Re-seed before each video so the randomly placed sampling window is
    # reproducible regardless of how many videos were processed before it.
    np.random.seed(seed)

    # The context manager closes the container even if decoding fails.
    with av.open(os.path.join(dir_path, file_path)) as container:
        # Frame count as reported by the first video stream.
        num_frames = container.streams.video[0].frames

        # Stride chosen so that clip_len samples span (almost) the whole video.
        frame_sample_rate = int((num_frames - 1) / clip_len)
        # print(f"frame_sample_rate : {frame_sample_rate}")
        # print(f"seg_len : {num_frames}")

        # sample clip_len frames
        indices = sample_frame_indices(
            clip_len=clip_len, frame_sample_rate=frame_sample_rate,
            seg_len=num_frames
        )
        video = read_video_pyav(container, indices)

    inputs = processor(
        text=label,
        videos=list(video),
        return_tensors="pt",
        padding=True,
    )

    # forward pass
    with torch.no_grad():
        outputs = model(**inputs)

    # this is the video-text similarity score; one video and one text are
    # passed in, so the single entry at [0][0] is read out.
    similarity = float(outputs.logits_per_video[0][0])

    similarity_list.append(similarity)

    print(f'similarity score : {similarity}')

similarity_list

label : shooting
Abuse001_x264.mp4


Unused or unrecognized kwargs: padding.


similarity score : 14.390071868896484
몬스터볼.mp4


Unused or unrecognized kwargs: padding.


similarity score : 17.46728515625
지구.mp4


Unused or unrecognized kwargs: padding.


similarity score : 15.2318115234375
지구총.mp4


Unused or unrecognized kwargs: padding.


similarity score : 21.49944496154785
폭발.mp4


Unused or unrecognized kwargs: padding.


similarity score : 16.75489044189453


[14.390071868896484,
 17.46728515625,
 15.2318115234375,
 21.49944496154785,
 16.75489044189453]