# Подгружаем оригинальную модель

In [1]:
import sys
sys.path.append('..')

from transformers import TimesformerModel, TimesformerConfig
import torch
from src.csl.models.model import SimilarityRecognizer

In [2]:
# Build the reference SimilarityRecognizer (constructed with 'base' and 16 --
# see src.csl.models.model for what these arguments mean), load its weights
# from the local checkpoint file and move the model to the GPU.
orig_model = SimilarityRecognizer('base', 16)
orig_model.load_pretrained_weights('../weights/base.pth')
orig_model.cuda()

  checkpoint = torch.load(filename, map_location="cpu")


SimilarityRecognizer(
  (feature_extractor): Timesformer(
    (model): VisionTransformer(
      (dropout): Dropout(p=0.0, inplace=False)
      (patch_embed): PatchEmbed(
        (proj): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (pos_drop): Dropout(p=0.0, inplace=False)
      (time_drop): Dropout(p=0.0, inplace=False)
      (blocks): ModuleList(
        (0): Block(
          (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
          (attn): Attention(
            (qkv): Linear(in_features=768, out_features=2304, bias=True)
            (proj): Linear(in_features=768, out_features=768, bias=True)
            (proj_drop): Dropout(p=0.0, inplace=False)
            (attn_drop): Dropout(p=0.0, inplace=False)
          )
          (temporal_norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
          (temporal_attn): Attention(
            (qkv): Linear(in_features=768, out_features=2304, bias=True)
            (proj): Linear(in_features=768, 

# Функции для преобразования словаря весов в формат transformers

In [3]:
"""Convert TimeSformer checkpoints from the original repository: https://github.com/MCG-NJU/TimeSformer"""

import numpy as np
import torch
from huggingface_hub import hf_hub_download

from transformers import TimesformerConfig, VideoMAEImageProcessor


def rename_key(name):
    """Translate one original TimeSformer parameter name to the transformers layout.

    The substitutions below are applied in a fixed order and every rule sees
    the output of the rules before it, so the order is significant and must
    not be changed.

    Args:
        name: Parameter name taken from the original checkpoint, e.g.
            ``"blocks.0.attn.proj.weight"``.

    Returns:
        The renamed key. A name that matches none of the rules is returned
        unchanged.
    """
    if "encoder." in name:
        name = name.replace("encoder.", "")
    if "cls_token" in name:
        name = name.replace("cls_token", "embeddings.cls_token")
    if "pos_embed" in name:
        name = name.replace("pos_embed", "embeddings.position_embeddings")
    if "time_embed" in name:
        name = name.replace("time_embed", "embeddings.time_embeddings")
    if "patch_embed.proj" in name:
        name = name.replace("patch_embed.proj", "embeddings.patch_embeddings.projection")
    if "patch_embed.norm" in name:
        name = name.replace("patch_embed.norm", "embeddings.norm")
    if "blocks" in name:
        name = name.replace("blocks", "encoder.layer")
    # This rule also rewrites "temporal_attn.proj", because that string
    # contains "attn.proj".
    if "attn.proj" in name:
        name = name.replace("attn.proj", "attention.output.dense")
    if "attn" in name and "bias" not in name and "temporal" not in name:
        name = name.replace("attn", "attention.self")
    if "attn" in name and "temporal" not in name:
        name = name.replace("attn", "attention.attention")
    if "temporal_norm1" in name:
        name = name.replace("temporal_norm1", "temporal_layernorm")
    # Every "attn.proj" has already been rewritten above, so this rule does
    # not fire in practice; it is kept so the rule list stays as it was.
    if "temporal_attn.proj" in name:
        name = name.replace("temporal_attn", "temporal_attention.output.dense")
    if "temporal_fc" in name:
        name = name.replace("temporal_fc", "temporal_dense")
    if "norm1" in name and "temporal" not in name:
        name = name.replace("norm1", "layernorm_before")
    if "norm2" in name:
        name = name.replace("norm2", "layernorm_after")
    if "mlp.fc1" in name:
        name = name.replace("mlp.fc1", "intermediate.dense")
    if "mlp.fc2" in name:
        name = name.replace("mlp.fc2", "output.dense")
    if "norm.weight" in name and "fc" not in name and "temporal" not in name:
        name = name.replace("norm.weight", "layernorm.weight")
    if "norm.bias" in name and "fc" not in name and "temporal" not in name:
        name = name.replace("norm.bias", "layernorm.bias")
    if "head" in name:
        name = name.replace("head", "classifier")

    return name


def convert_state_dict(orig_state_dict, config):
    """Rename every entry of ``orig_state_dict`` in place and return it.

    Keys that start with ``"feature_extractor.model."`` have that text
    removed first. Keys containing ``"qkv"`` are mapped directly onto the
    per-layer ``...attention.qkv.{weight,bias}`` names (temporal or not);
    every other key goes through ``rename_key``.

    Args:
        orig_state_dict: Mapping of parameter name to value; it is mutated.
        config: Not used by this function.

    Returns:
        The same mapping object, now holding the renamed keys.
    """
    wrapper_prefix = "feature_extractor.model."

    # Iterate over a snapshot of the keys because the dict is edited in the loop.
    for old_name in list(orig_state_dict):
        tensor = orig_state_dict.pop(old_name)

        new_name = old_name
        if new_name.startswith(wrapper_prefix):
            new_name = new_name.replace(wrapper_prefix, "")

        if "qkv" not in new_name:
            orig_state_dict[rename_key(new_name)] = tensor
            continue

        # The second dot-separated field is parsed as the layer index.
        block_index = int(new_name.split(".")[1])
        branch = "temporal_attention" if "temporal" in new_name else "attention"
        param = "weight" if "weight" in new_name else "bias"
        orig_state_dict[f"encoder.layer.{block_index}.{branch}.attention.qkv.{param}"] = tensor

    return orig_state_dict


# We will verify our results on a video of eating spaghetti
# Frame indices used: [164 168 172 176 181 185 189 193 198 202 206 210 215 219 223 227]
def prepare_video():
    """Fetch the sample clip from the Hub and return it as a list.

    Downloads ``eating_spaghetti.npy`` from the
    ``hf-internal-testing/spaghetti-video`` dataset repository, loads it with
    NumPy and returns the entries along its first axis as a Python list.
    """
    npy_path = hf_hub_download(
        repo_type="dataset",
        repo_id="hf-internal-testing/spaghetti-video",
        filename="eating_spaghetti.npy",
    )
    clip = np.load(npy_path)
    return [frame for frame in clip]


def convert_timesformer_checkpoint(config, checkpoint_path="../weights/base.pth"):
    """Build a ``TimesformerModel`` and fill it with converted original weights.

    Args:
        config: ``TimesformerConfig`` describing the architecture to build.
        checkpoint_path: Local path of the original checkpoint. Defaults to
            ``"../weights/base.pth"``, the path that was previously
            hard-coded, so existing one-argument calls behave as before.

    Returns:
        The ``TimesformerModel`` with the converted weights loaded, in eval
        mode.

    Raises:
        RuntimeError: Raised by ``load_state_dict`` when the converted keys or
            tensor shapes do not match the model built from ``config``.
    """
    model = TimesformerModel(config)

    # The checkpoint is read from local disk (nothing is downloaded here).
    # NOTE: torch.load unpickles the file -- only use it on trusted checkpoints.
    state_dict = torch.load(checkpoint_path, map_location="cpu")
    # convert_state_dict renames the keys in place; the loaded dict is not
    # used for anything else, so no defensive copy is needed.
    new_state_dict = convert_state_dict(state_dict, config)

    model.load_state_dict(new_state_dict)
    model.eval()

    # Smoke test: run one forward pass on the first 8 frames of the sample
    # clip. The result is discarded, so skip building an autograd graph.
    image_processor = VideoMAEImageProcessor(image_mean=[0.5, 0.5, 0.5], image_std=[0.5, 0.5, 0.5])
    video = prepare_video()
    inputs = image_processor(video[:8], return_tensors="pt")

    with torch.no_grad():
        model(**inputs)
    return model

# Перевод модели


В Transformers модель инициализируется некорректно: есть расхождение с оригинальным TimeSformer от Meta. В файле modeling_timesformer.py строку 
```
self.drop_path = TimeSformerDropPath(config.drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()
``` 

нужно заменить на 
```
self.drop_path = TimeSformerDropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()
```
потому что drop_path_rate должен меняться от слоя к слою (у каждого блока своё значение), а не быть одинаковым для всех слоёв. Тогда выходы будут совпадать.

In [4]:
# Base model: build a TimesformerConfig (image_size 224, patch_size 16,
# 8 frames, drop_path_rate 0.1; all other fields keep their defaults) and
# convert the original checkpoint into a TimesformerModel.
tr_config = TimesformerConfig(image_size=224, patch_size=16, num_frames=8, drop_path_rate=0.1)
model = convert_timesformer_checkpoint(tr_config)

  state_dict = torch.load(output, map_location="cpu")
  return torch.tensor(value)


In [32]:
# Small model: same input settings as the base config, but with
# hidden_size 384, intermediate_size 1536 and 6 attention heads.
tr_config = TimesformerConfig(
    image_size=224,
    patch_size=16,
    num_frames=8,
    drop_path_rate=0.1,
    hidden_size=384,
    intermediate_size=1536,
    num_attention_heads=6
)
# NOTE(review): only the config is passed here, so the checkpoint that gets
# loaded is '../weights/base.pth' -- confirm that file holds weights matching
# this smaller config, otherwise load_state_dict will reject the shapes.
model = convert_timesformer_checkpoint(tr_config)
# Alternative: load an already converted small model from disk instead.
# model = TimesformerModel.from_pretrained('../weights/csl_transformers_small/').cuda()

In [5]:
# Move the converted model to the GPU; as the last expression of the cell it
# also prints the module tree.
model.cuda()

TimesformerModel(
  (embeddings): TimesformerEmbeddings(
    (patch_embeddings): TimesformerPatchEmbeddings(
      (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
    )
    (pos_drop): Dropout(p=0.0, inplace=False)
    (time_drop): Dropout(p=0.0, inplace=False)
  )
  (encoder): TimesformerEncoder(
    (layer): ModuleList(
      (0): TimesformerLayer(
        (drop_path): Identity()
        (attention): TimeSformerAttention(
          (attention): TimesformerSelfAttention(
            (qkv): Linear(in_features=768, out_features=2304, bias=True)
            (attn_drop): Dropout(p=0.0, inplace=False)
          )
          (output): TimesformerSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
        )
        (intermediate): TimesformerIntermediate(
          (dense): Linear(in_features=768, out_features=3072, bias=True)
          (dropout): Dropout(p=0.0, inplace=False

In [7]:
# Random test input of shape (2, 3, 8, 224, 224) on the GPU.
# Presumably (batch, channels, frames, height, width) -- TODO confirm against
# SimilarityRecognizer's expected input layout.
inp = torch.randn((2, 3, 8, 224, 224)).cuda()

In [8]:
# Swap axes 1 and 2 of the input before feeding the converted model, then
# show the shape of the hidden state at sequence position 0 for each sample
# (presumably the CLS token -- TODO confirm).
model(inp.permute(0, 2, 1, 3, 4)).last_hidden_state[:, 0].shape

torch.Size([2, 768])

In [10]:
# Compare the two models on the same input: print the sum of absolute
# differences between the reference model's output and the converted model's
# hidden state at sequence position 0. A value of 0 means the outputs match
# exactly.
orig_model.eval()
with torch.no_grad():
    print(torch.abs(orig_model(inp) - model(inp.permute(0, 2, 1, 3, 4)).last_hidden_state[:, 0]).sum())

tensor(0., device='cuda:0')


# Сохраняем модель

In [10]:
# Save the converted model in the transformers format to this directory.
model.save_pretrained('../weights/csl_transformers_base')

In [2]:
# Reload the model from the directory written by save_pretrained.
model = TimesformerModel.from_pretrained('../weights/csl_transformers_base/')

In [4]:
import torch
from transformers import Conv1D

def get_specific_layer_names(model):
    """Return the names of all Linear, Embedding, Conv2d and Conv1D submodules.

    Args:
        model: Module whose ``named_modules()`` are inspected.

    Returns:
        List of module names, in the order ``named_modules()`` yields them.
    """
    wanted_types = (torch.nn.Linear, torch.nn.Embedding, torch.nn.Conv2d, Conv1D)
    return [
        module_name
        for module_name, submodule in model.named_modules()
        if isinstance(submodule, wanted_types)
    ]

# Show the collected layer names with duplicates removed; going through a set
# means the displayed order is arbitrary.
list(set(get_specific_layer_names(model)))

['encoder.layer.2.intermediate.dense',
 'encoder.layer.4.intermediate.dense',
 'encoder.layer.3.temporal_dense',
 'encoder.layer.0.temporal_dense',
 'encoder.layer.8.attention.attention.qkv',
 'encoder.layer.9.attention.output.dense',
 'encoder.layer.1.intermediate.dense',
 'encoder.layer.0.output.dense',
 'encoder.layer.8.temporal_attention.attention.qkv',
 'encoder.layer.9.temporal_attention.attention.qkv',
 'encoder.layer.0.attention.attention.qkv',
 'encoder.layer.11.temporal_dense',
 'encoder.layer.1.attention.attention.qkv',
 'encoder.layer.4.temporal_attention.output.dense',
 'encoder.layer.10.temporal_attention.output.dense',
 'encoder.layer.10.attention.output.dense',
 'encoder.layer.6.attention.attention.qkv',
 'encoder.layer.8.temporal_dense',
 'encoder.layer.3.temporal_attention.attention.qkv',
 'encoder.layer.0.temporal_attention.attention.qkv',
 'encoder.layer.11.temporal_attention.attention.qkv',
 'encoder.layer.2.temporal_attention.output.dense',
 'encoder.layer.3.outpu