In [1]:
# !pip install transformers datasets evaluate accelerate

Collecting transformers
  Downloading transformers-4.41.2-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.8/43.8 kB[0m [31m190.0 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl.metadata (9.3 kB)
Collecting accelerate
  Downloading accelerate-0.31.0-py3-none-any.whl.metadata (19 kB)
Collecting filelock (from transformers)
  Downloading filelock-3.15.3-py3-none-any.whl.metadata (2.9 kB)
Collecting huggingface-hub<1.0,>=0.23.0 (from transformers)
  Downloading huggingface_hub-0.23.4-py3-none-any.whl.metadata (12 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2024.5.15-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.9/40.9 kB[0m [31m729.9 kB/s[0m eta [36m0:00:00[

In [38]:
import av

import numpy as np

from transformers import VivitImageProcessor, VivitModel, VivitConfig

from huggingface_hub import hf_hub_download

np.random.seed(0)


In [2]:
def read_video_pyav(container, indices):

    '''

    Decode the video with PyAV decoder.

    Args:

        container (`av.container.input.InputContainer`): PyAV container.

        indices (`List[int]`): List of frame indices to decode.

    Returns:

        result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).

    '''

    frames = []

    container.seek(0)

    start_index = indices[0]

    end_index = indices[-1]

    for i, frame in enumerate(container.decode(video=0)):

        if i > end_index:

            break

        if i >= start_index and i in indices:

            frames.append(frame)

    return np.stack([x.to_ndarray(format="rgb24") for x in frames])


def sample_frame_indices(clip_len, frame_sample_rate, seg_len):

    '''

    Sample a given number of frame indices from the video.

    Args:

        clip_len (`int`): Total number of frames to sample.

        frame_sample_rate (`int`): Sample every n-th frame.

        seg_len (`int`): Maximum allowed index of sample's last frame.

    Returns:

        indices (`List[int]`): List of sampled frame indices

    '''

    converted_len = int(clip_len * frame_sample_rate)

    end_idx = np.random.randint(converted_len, seg_len)

    start_idx = end_idx - converted_len

    indices = np.linspace(start_idx, end_idx, num=clip_len)

    indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)

    return indices

In [27]:

image_processor = VivitImageProcessor.from_pretrained("google/vivit-b-16x2-kinetics400")

model2 = VivitModel.from_pretrained("google/vivit-b-16x2-kinetics400")


Some weights of VivitModel were not initialized from the model checkpoint at google/vivit-b-16x2-kinetics400 and are newly initialized: ['vivit.pooler.dense.bias', 'vivit.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [31]:
model2

VivitModel(
  (embeddings): VivitEmbeddings(
    (patch_embeddings): VivitTubeletEmbeddings(
      (projection): Conv3d(3, 768, kernel_size=(2, 16, 16), stride=(2, 16, 16))
    )
    (dropout): Dropout(p=0.0, inplace=False)
  )
  (encoder): VivitEncoder(
    (layer): ModuleList(
      (0): VivitLayer(
        (attention): VivitAttention(
          (attention): VivitSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
          (output): VivitSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
        )
        (intermediate): VivitIntermediate(
          (dense): Linear(in_features=768, out_features=3072, bias=True)
          (dropout): Dropout(p

In [4]:
model

VivitModel(
  (embeddings): VivitEmbeddings(
    (patch_embeddings): VivitTubeletEmbeddings(
      (projection): Conv3d(3, 768, kernel_size=(2, 16, 16), stride=(2, 16, 16))
    )
    (dropout): Dropout(p=0.0, inplace=False)
  )
  (encoder): VivitEncoder(
    (layer): ModuleList(
      (0): VivitLayer(
        (attention): VivitAttention(
          (attention): VivitSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
          (output): VivitSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
        )
        (intermediate): VivitIntermediate(
          (dense): Linear(in_features=768, out_features=3072, bias=True)
          (dropout): Dropout(p

In [32]:
for name, param in model2.named_parameters():
  print(name ,param.shape)

embeddings.cls_token torch.Size([1, 1, 768])
embeddings.position_embeddings torch.Size([1, 3137, 768])
embeddings.patch_embeddings.projection.weight torch.Size([768, 3, 2, 16, 16])
embeddings.patch_embeddings.projection.bias torch.Size([768])
encoder.layer.0.attention.attention.query.weight torch.Size([768, 768])
encoder.layer.0.attention.attention.query.bias torch.Size([768])
encoder.layer.0.attention.attention.key.weight torch.Size([768, 768])
encoder.layer.0.attention.attention.key.bias torch.Size([768])
encoder.layer.0.attention.attention.value.weight torch.Size([768, 768])
encoder.layer.0.attention.attention.value.bias torch.Size([768])
encoder.layer.0.attention.output.dense.weight torch.Size([768, 768])
encoder.layer.0.attention.output.dense.bias torch.Size([768])
encoder.layer.0.intermediate.dense.weight torch.Size([3072, 768])
encoder.layer.0.intermediate.dense.bias torch.Size([3072])
encoder.layer.0.output.dense.weight torch.Size([768, 3072])
encoder.layer.0.output.dense.bias 

In [39]:
vv_config = VivitConfig()

In [40]:
vv_config

VivitConfig {
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu_fast",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "image_size": 224,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-06,
  "model_type": "vivit",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_frames": 32,
  "num_hidden_layers": 12,
  "qkv_bias": true,
  "transformers_version": "4.41.2",
  "tubelet_size": [
    2,
    16,
    16
  ]
}

In [36]:
model2.encoder.layer.attention

AttributeError: 'ModuleList' object has no attribute 'attention'

In [12]:
!pip install timm

Collecting timm
  Downloading timm-1.0.7-py3-none-any.whl.metadata (47 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.5/47.5 kB[0m [31m399.1 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Downloading timm-1.0.7-py3-none-any.whl (2.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.3/2.3 MB[0m [31m761.5 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: timm
Successfully installed timm-1.0.7


In [13]:
from urllib.request import urlopen
from PIL import Image
import timm

img = Image.open(urlopen(
    'https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/beignets-task-guide.png'
))

model = timm.create_model('vit_small_r26_s32_224.augreg_in21k_ft_in1k', pretrained=True)
model = model.eval()

model.safetensors:   0%|          | 0.00/146M [00:00<?, ?B/s]

In [14]:
model

VisionTransformer(
  (patch_embed): HybridEmbed(
    (backbone): ResNetV2(
      (stem): Sequential(
        (conv): StdConv2dSame(3, 64, kernel_size=(7, 7), stride=(2, 2), bias=False)
        (norm): GroupNormAct(
          32, 64, eps=1e-05, affine=True
          (drop): Identity()
          (act): ReLU(inplace=True)
        )
        (pool): MaxPool2dSame(kernel_size=(3, 3), stride=(2, 2), padding=(0, 0), dilation=(1, 1), ceil_mode=False)
      )
      (stages): Sequential(
        (0): ResNetStage(
          (blocks): Sequential(
            (0): Bottleneck(
              (downsample): DownsampleConv(
                (conv): StdConv2dSame(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
                (norm): GroupNormAct(
                  32, 256, eps=1e-05, affine=True
                  (drop): Identity()
                  (act): Identity()
                )
              )
              (conv1): StdConv2dSame(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      

In [15]:
for name, param in model.named_parameters():
  print(name ,param.shape)

cls_token torch.Size([1, 1, 384])
pos_embed torch.Size([1, 50, 384])
patch_embed.backbone.stem.conv.weight torch.Size([64, 3, 7, 7])
patch_embed.backbone.stem.norm.weight torch.Size([64])
patch_embed.backbone.stem.norm.bias torch.Size([64])
patch_embed.backbone.stages.0.blocks.0.downsample.conv.weight torch.Size([256, 64, 1, 1])
patch_embed.backbone.stages.0.blocks.0.downsample.norm.weight torch.Size([256])
patch_embed.backbone.stages.0.blocks.0.downsample.norm.bias torch.Size([256])
patch_embed.backbone.stages.0.blocks.0.conv1.weight torch.Size([64, 64, 1, 1])
patch_embed.backbone.stages.0.blocks.0.norm1.weight torch.Size([64])
patch_embed.backbone.stages.0.blocks.0.norm1.bias torch.Size([64])
patch_embed.backbone.stages.0.blocks.0.conv2.weight torch.Size([64, 64, 3, 3])
patch_embed.backbone.stages.0.blocks.0.norm2.weight torch.Size([64])
patch_embed.backbone.stages.0.blocks.0.norm2.bias torch.Size([64])
patch_embed.backbone.stages.0.blocks.0.conv3.weight torch.Size([256, 64, 1, 1])
p

In [19]:
b2 = model.blocks[2]
b2.attn.fused_attn = False

In [21]:
b2.attn

Attention(
  (qkv): Linear(in_features=384, out_features=1152, bias=True)
  (q_norm): Identity()
  (k_norm): Identity()
  (attn_drop): Dropout(p=0.0, inplace=False)
  (proj): Linear(in_features=384, out_features=384, bias=True)
  (proj_drop): Dropout(p=0.0, inplace=False)
)

In [24]:
model.embed_dim

384

In [25]:
model.head

Linear(in_features=384, out_features=1000, bias=True)

In [26]:
model.cls_token

Parameter containing:
tensor([[[-1.2115e-02, -7.6643e-03,  1.1881e-02,  2.6442e-03, -1.0855e-02,
           2.4157e-03, -4.0606e-03, -1.0074e-02, -6.4589e-03,  1.5418e-02,
          -6.5152e-03,  8.1977e-02,  1.6923e-03,  1.7749e-03,  4.8799e-03,
          -9.6725e-03, -7.9814e-03, -1.4743e-03,  3.0378e-03,  3.0562e-03,
          -2.2242e-03, -7.5510e-03, -7.7013e-03, -8.9757e-03, -4.6692e-04,
           1.1451e-03, -5.2634e-03, -7.0081e-03, -1.5327e-02, -1.6745e-02,
           2.1182e-03,  1.4652e-04, -1.5238e-03, -1.2412e-03, -1.0010e-02,
           1.8737e-02,  2.0203e-02, -6.4820e-03, -8.1280e-03, -1.9151e-02,
          -1.0828e-02,  3.8819e-03,  1.8269e-02, -5.9395e-03, -2.3163e-02,
           2.6079e-02, -1.5846e-02,  3.0691e-02, -2.5788e-03, -4.9555e-03,
           6.4525e-03,  1.3288e-02, -3.9641e-04,  6.5831e-04,  5.2235e-03,
          -4.2631e-03,  9.4750e-02, -8.5485e-03, -4.6355e-03,  5.3769e-03,
          -1.2403e-02, -1.0681e-02,  1.6142e-02, -2.0484e-02, -6.5551e-03,
   

In [1]:
model.embeded_dim

NameError: name 'model' is not defined