In [2]:
from transformers import VivitConfig, VivitModel

In [3]:
# Hyper-parameters for the ViViT encoder, grouped by what they control.
config = VivitConfig(
    # Input clip geometry and how it is cut into tubelets.
    image_size=224, num_frames=32, num_channels=3, tubelet_size=[2, 16, 16],
    # Transformer encoder dimensions.
    hidden_size=768, intermediate_size=3072,
    num_hidden_layers=12, num_attention_heads=12,
    hidden_act="gelu_fast", qkv_bias=True,
    # Dropout is switched off; initialisation / normalisation constants.
    hidden_dropout_prob=0, attention_probs_dropout_prob=0,
    initializer_range=0.02, layer_norm_eps=1e-6,
)

In [4]:
# Build the model from the explicit `config` object and keep the pooling layer.
# NOTE(review): this constructs from a config rather than a checkpoint, so the weights are
# presumably freshly initialised - confirm before using it for inference.
# The name `model` is assigned again by the `from_pretrained` cell further down; whichever
# cell ran last determines what `model` refers to.
model = VivitModel(config, add_pooling_layer=True)

In [1]:
import av
import numpy as np
import torch

from huggingface_hub import hf_hub_download
from transformers import VivitImageProcessor, VivitModel

# Seed NumPy's global RNG: sample_frame_indices draws the clip window with
# np.random.randint, so this makes the sampled frame indices repeatable.
np.random.seed(0)


def read_video_pyav(container, indices):
    '''
    Decode the video with PyAV decoder.

    The stream is rewound and decoded sequentially from the first frame; decoding stops as
    soon as a frame position past `indices[-1]` is reached.

    Args:
        container (`av.container.input.InputContainer`): PyAV container.
        indices (`List[int]`): List of frame indices to decode. Expected in ascending order:
            positions before `indices[0]` are skipped and nothing after `indices[-1]` is read.
    Returns:
        result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
    Raises:
        IndexError: If `indices` is empty.
        ValueError: If no requested frame was decoded (there is nothing to stack).
    '''
    frames = []
    container.seek(0)
    start_index = indices[0]
    end_index = indices[-1]
    # Build the lookup once: testing `i in indices` against a list/array rescans every
    # requested index for each decoded frame, whereas a set lookup is constant time.
    wanted = set(np.asarray(indices).tolist())
    for i, frame in enumerate(container.decode(video=0)):
        if i > end_index:
            break
        if i >= start_index and i in wanted:
            frames.append(frame)
    return np.stack([x.to_ndarray(format="rgb24") for x in frames])


def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
    '''
    Sample a given number of frame indices from the video.

    A window of `clip_len * frame_sample_rate` frames is placed at a random position
    (drawn from NumPy's global RNG) and `clip_len` evenly spaced indices are taken from it.

    Args:
        clip_len (`int`): Total number of frames to sample.
        frame_sample_rate (`int`): Sample every n-th frame.
        seg_len (`int`): Exclusive upper bound for the end of the sampled window (e.g. the
            number of frames in the video). Must be greater than `clip_len * frame_sample_rate`.
    Returns:
        indices (`np.ndarray`): int64 array of `clip_len` sampled frame indices.
    Raises:
        ValueError: If the window does not fit, i.e. `seg_len <= clip_len * frame_sample_rate`.
    '''
    converted_len = int(clip_len * frame_sample_rate)
    # np.random.randint(low, high) needs low < high; fail early with an actionable message
    # instead of NumPy's bare "low >= high" when the video is too short for the clip.
    if seg_len <= converted_len:
        raise ValueError(
            f"Cannot sample a window of {converted_len} frames "
            f"(clip_len={clip_len}, frame_sample_rate={frame_sample_rate}) "
            f"from {seg_len} frames: seg_len must be greater than clip_len * frame_sample_rate."
        )
    end_idx = np.random.randint(converted_len, seg_len)
    start_idx = end_idx - converted_len
    indices = np.linspace(start_idx, end_idx, num=clip_len)
    # The window end itself is excluded, so the last index is pulled back to end_idx - 1.
    indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
    return indices

print("Downloading videos")
# video clip consists of 300 frames (10 seconds at 30 FPS)
file_path = hf_hub_download(
    repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
)

# Open the container as a context manager so the file/decoder is closed as soon as the
# frames have been copied into a NumPy array (it was previously left open).
with av.open(file_path) as container:
    print('sample indices')
    # sample 32 frames
    indices = sample_frame_indices(clip_len=32, frame_sample_rate=1, seg_len=container.streams.video[0].frames)
    video = read_video_pyav(container=container, indices=indices)

print('Load vivit')
# NOTE: the recorded load log reports vivit.pooler.dense.* as newly initialized for this
# checkpoint, so the pooler is untrained; only last_hidden_state is read below.
image_processor = VivitImageProcessor.from_pretrained("google/vivit-b-16x2-kinetics400")
model = VivitModel.from_pretrained("google/vivit-b-16x2-kinetics400")

print('Get inputs')
# prepare video for the model
inputs = image_processor(list(video), return_tensors="pt")

print('Get outputs')
# forward pass; this is inference only, so skip autograd bookkeeping
with torch.no_grad():
    outputs = model(**inputs)
last_hidden_states = outputs.last_hidden_state
# the recorded run displayed [1, 3137, 768] here
list(last_hidden_states.shape)

2024-07-10 10:10:27.102873: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-10 10:10:27.103101: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-10 10:10:27.189769: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-10 10:10:27.388508: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Downloading videos
sample indices
Load vivit


  return self.fget.__get__(instance, owner)()
Some weights of VivitModel were not initialized from the model checkpoint at google/vivit-b-16x2-kinetics400 and are newly initialized: ['vivit.pooler.dense.bias', 'vivit.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Get inputs


  return torch.tensor(value)


Get outputs


[1, 3137, 768]

In [2]:
import torch

# Reload hidden states that were cached on disk.
# NOTE(review): no visible cell writes 'last_hidden_states.pt' - confirm where it is produced,
# otherwise this cell fails on a fresh Restart & Run All.
# weights_only=True restricts unpickling to tensors and plain containers, so a tampered
# file cannot run arbitrary code when it is loaded.
tensor = torch.load('last_hidden_states.pt', weights_only=True)

In [7]:
# First entry along dim 1 of the first item along dim 0, selected with one multi-axis index.
tensor[0, 0]

tensor([-7.6379e-02,  4.6651e-01, -2.4014e-01,  4.5017e-01,  1.1438e+00,
         1.8163e+00,  1.6684e-01,  3.2136e-01, -8.1002e-01,  5.8922e-01,
         3.8196e-01, -3.1741e-02,  1.9726e+00, -1.1832e+00, -2.0024e+00,
         1.2548e+00, -2.6207e+00,  1.4276e-02,  2.6208e-01,  1.0811e-01,
         9.7158e-01,  5.0071e-01,  3.0242e+00, -3.5963e-01,  8.6071e-01,
         1.2982e+00, -2.7555e-01, -1.0677e+00,  4.5861e-01, -4.4223e-01,
         1.1474e+00, -8.3381e-01, -8.3832e-01, -1.9045e+00,  1.0746e+00,
        -1.0963e+00, -2.1976e-01, -2.4826e-01, -4.9723e-01, -8.4550e-01,
         8.4025e-01, -8.4539e-01,  5.7962e-01,  5.8470e-01,  6.7049e-01,
        -1.2926e+00, -2.7801e-01,  2.9399e-01,  6.8319e-01, -9.9227e-01,
        -5.8361e-01,  4.0723e-02,  4.5529e-01, -4.4509e-01,  1.0076e+00,
         1.9670e-01,  1.3554e-01, -6.0439e-01,  6.8855e-02, -3.9623e-01,
         3.8921e-01, -7.2124e-01,  9.6061e-01,  1.3253e+00, -1.3934e+00,
         2.4375e-01, -1.3566e-01,  6.6577e-01,  5.6

In [4]:
# Dimensions of the loaded tensor, shown as a torch.Size.
tensor.size()

torch.Size([1, 3137, 768])