In [1]:
# # requirements installation
!pip install einops
!pip install PyPI
!pip install timm

Collecting einops
  Downloading einops-0.3.2-py3-none-any.whl (25 kB)
Installing collected packages: einops
Successfully installed einops-0.3.2
Collecting PyPI
  Downloading pypi-2.1.tar.gz (997 bytes)
Building wheels for collected packages: PyPI
  Building wheel for PyPI (setup.py) ... [?25l[?25hdone
  Created wheel for PyPI: filename=pypi-2.1-py3-none-any.whl size=1354 sha256=84aff4c153fb9025e8b25348681482e7e390bf2d51733ce06b4a32dbda19c951
  Stored in directory: /root/.cache/pip/wheels/9d/58/40/27b525c0b051491cabddd0157355cd3365dfafe2e83618baa6
Successfully built PyPI
Installing collected packages: PyPI
Successfully installed PyPI-2.1
Collecting timm
  Downloading timm-0.4.12-py3-none-any.whl (376 kB)
[K     |████████████████████████████████| 376 kB 4.2 MB/s 
Installing collected packages: timm
Successfully installed timm-0.4.12


In [2]:
import math
import logging
from functools import partial
from collections import OrderedDict
from einops import rearrange, repeat      # requires install einops

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [4]:
# timm is a PyTorch deep-learning library that makes it easy to create models, load data, and so on.
from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD   # install timm
from timm.models.helpers import load_pretrained
from timm.models.layers import DropPath, to_2tuple, trunc_normal_
from timm.models.registry import register_model


In [5]:
# in_features, hidden_features and out_features are the feature (channel) sizes of the MLP layers.
# act_layer is the activation-layer class.

class Mlp(nn.Module):
  """Two-layer feed-forward network: Linear -> activation -> dropout -> Linear -> dropout.

  Args:
    in_features: size of the last dimension of the input.
    hidden_features: width of the hidden layer; a falsy value (None or 0)
      falls back to `in_features`.
    out_features: size of the last dimension of the output; a falsy value
      falls back to `in_features`.
    act_layer: zero-argument callable that builds the activation module.
    drop: dropout probability, applied after the activation and again after
      the second linear layer.
  """

  def __init__(self, in_features, hidden_features=None, out_features=None, act_layer = nn.GELU, drop=0.):
    super().__init__()
    # Unspecified (falsy) widths default to the input width.
    hidden_width = hidden_features if hidden_features else in_features
    output_width = out_features if out_features else in_features

    self.fc1 = nn.Linear(in_features, hidden_width)
    self.act = act_layer()
    self.fc2 = nn.Linear(hidden_width, output_width)
    # One Dropout module is shared by both dropout sites in forward().
    self.drop = nn.Dropout(drop)

  def forward(self, x):
    """Apply the MLP over the last dimension of `x` and return the result."""
    hidden = self.drop(self.act(self.fc1(x)))
    return self.drop(self.fc2(hidden))

In [6]:
# y = torch.ones(512, 9, 680)
# mlp = Mlp(in_features=680, hidden_features=2*680, act_layer=nn.GELU, drop=0.)
# y = mlp(y)
# y

In [7]:
class Attention(nn.Module):
  """Multi-head self-attention over a sequence of tokens.

  Args:
    dim: size of the last (channel) dimension of the input; must be divisible
      by `num_heads`.
    num_heads: number of attention heads.
    qkv_bias: whether the joint q/k/v projection has a bias term.
    qk_scale: factor applied to the q·k scores; a falsy value falls back to
      `head_dim ** -0.5`.
    attn_drop: dropout probability applied to the attention weights.
    proj_drop: dropout probability applied after the output projection.

  Raises:
    ValueError: if `dim` is not divisible by `num_heads`.
  """

  def __init__(self,dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.,):
    super().__init__()
    # Fail early with a clear message: with a non-divisible dim, the floor
    # division below would silently truncate and forward() would only fail
    # later inside reshape().
    if dim % num_heads != 0:
      raise ValueError(
          f"dim ({dim}) must be divisible by num_heads ({num_heads})"
      )
    self.num_heads = num_heads
    head_dim = dim // num_heads

    # Default scaling is 1 / sqrt(head_dim) unless an explicit qk_scale is given.
    self.scale = qk_scale or head_dim ** -0.5
    # A single linear layer produces q, k and v concatenated on the last axis.
    self.qkv = nn.Linear(dim, dim*3, bias=qkv_bias)
    self.attn_drop = nn.Dropout(attn_drop)
    self.proj = nn.Linear(dim, dim)
    self.proj_drop = nn.Dropout(proj_drop)

  def forward(self,x):
    """Apply self-attention to `x` of shape (B, N, C) and return a (B, N, C) tensor."""
    B, N, C = x.shape
    # (B, N, 3*C) -> (B, N, 3, heads, head_dim) -> (3, B, heads, N, head_dim)
    qkv = self.qkv(x).reshape(B,N,3,self.num_heads, C//self.num_heads).permute(2,0,3,1,4)
    q, k, v = qkv[0], qkv[1], qkv[2]    # each is (B, heads, N, head_dim)

    # Swapping the last two axes of k makes q @ k^T an (N, N) score matrix per
    # head: one scaled dot product for every (query token, key token) pair.
    attn = (q @ k.transpose(-2,-1)) * self.scale

    # Normalise over the key axis so each query's weights sum to 1.
    attn = attn.softmax(dim=-1)
    attn = self.attn_drop(attn)

    # (B, heads, N, head_dim) -> (B, N, heads, head_dim) -> (B, N, C): merge heads.
    x = (attn @ v).transpose(1,2).reshape(B,N,C)
    x = self.proj(x)
    x = self.proj_drop(x)
    return x


In [8]:
import numpy as np
# a = np.array([[1,1,1,1],[2,2,2,2]])
# b = a
# print(a)
# a @ b.transpose(-2, -1)
# print(b.transpose(-1, -2))
# b = b.softmax(dim=-1)
# a @ b.transpose()
# a = np.ones((4,5))
# a.transpose(-2,-1).shape
# b = 2*a
# a @ b.transpose(-1,-2)
# b.transpose(-1,-2)

## @ means matrix multiplication

In [9]:
class Block(nn.Module):
  """Pre-norm transformer block: an attention sub-layer followed by an MLP
  sub-layer, each wrapped in a residual connection.

  Args:
    dim: channel size of the tokens.
    num_heads: number of attention heads passed to `Attention`.
    mlp_ratio: hidden width of the MLP as a multiple of `dim`.
    qkv_bias, qk_scale, attn_drop: forwarded to `Attention`.
    drop: dropout probability used for the attention output projection and
      inside the MLP.
    drop_path: drop-path (stochastic depth) rate; values <= 0 disable it.
    act_layer: activation class used by the MLP.
    norm_layer: callable building a normalisation layer from `dim`.
  """

  def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=None, qk_scale=None, drop=0., attn_drop=0., drop_path=0., 
               act_layer=nn.GELU, norm_layer=nn.LayerNorm):
    super().__init__()

    # --- attention sub-layer ---
    self.norm1 = norm_layer(dim)
    self.attn = Attention(
        dim,
        num_heads=num_heads,
        qkv_bias=qkv_bias,
        qk_scale=qk_scale,
        attn_drop=attn_drop,
        proj_drop=drop,
    )

    # Stochastic depth on the residual branches; a plain pass-through when the
    # rate is not positive.
    if drop_path > 0.:
      self.drop_path = DropPath(drop_path)
    else:
      self.drop_path = nn.Identity()

    # --- MLP sub-layer ---
    self.norm2 = norm_layer(dim)
    self.mlp = Mlp(
        in_features=dim,
        hidden_features=int(dim*mlp_ratio),
        act_layer=act_layer,
        drop=drop,
    )

  def forward(self, x):
    """Return `x` after the residual attention and residual MLP sub-layers."""
    attn_branch = self.drop_path(self.attn(self.norm1(x)))
    x = x + attn_branch
    mlp_branch = self.drop_path(self.mlp(self.norm2(x)))
    return x + mlp_branch

In [10]:
class VideoTransformer(nn.Module):
  """Temporal transformer that reduces a window of per-frame feature vectors
  to a single vector.

  Input is a tensor of shape (b, num_frame, in_chans); output has shape
  (b, 1, in_chans). Per the original author's notes, the output is meant to be
  the prediction for the central frame of the window.

  Args:
    num_frame: number of frames in each input window.
    in_chans: feature size of every frame (also the output feature size).
    embed_dim_ratio: the embedding width is `embed_dim_ratio * in_chans`.
    depth: number of transformer blocks.
    num_heads, mlp_ratio, qkv_bias, qk_scale: forwarded to every `Block`.
    drop_rate: dropout probability after the positional embedding and inside
      the blocks.
    attn_drop_rate: dropout probability on the attention weights.
    drop_path_rate: maximum drop-path rate; block rates are spaced linearly
      from 0 up to this value.
    norm_layer: callable building a normalisation layer; a falsy value selects
      `nn.LayerNorm` with eps=1e-6.
  """

  def __init__(self, num_frame=9, in_chans=85, embed_dim_ratio=8, depth=4, \
               num_heads=8, mlp_ratio=2., qkv_bias=True, qk_scale=None, drop_rate=0., attn_drop_rate=0.,\
               drop_path_rate=0.2, norm_layer=None):
    super().__init__()

    if not norm_layer:
      norm_layer = partial(nn.LayerNorm, eps=1e-6)
    embed_dim = embed_dim_ratio * in_chans    # e.g. 8 * 85 = 680 with the defaults
    out_dim = in_chans

    # Per-frame linear embedding plus a learned positional embedding (one
    # vector per frame, initialised to zeros).
    self.Temporal_patch_to_embedding = nn.Linear(in_chans, embed_dim)
    self.Temporal_pos_embed = nn.Parameter(torch.zeros(1, num_frame, embed_dim))
    self.pos_drop = nn.Dropout(p=drop_rate)

    # Stochastic depth decay rule: one linearly spaced rate per block.
    drop_path_rates = torch.linspace(0, drop_path_rate, depth).tolist()

    encoder_layers = [
      Block(
          dim=embed_dim,
          num_heads=num_heads,
          mlp_ratio=mlp_ratio,
          qkv_bias=qkv_bias,
          qk_scale=qk_scale,
          drop=drop_rate,
          attn_drop=attn_drop_rate,
          drop_path=rate,
          norm_layer=norm_layer,
      )
      for rate in drop_path_rates
    ]
    self.blocks = nn.ModuleList(encoder_layers)

    self.Temporal_norm = norm_layer(embed_dim)

    # Weighted mean over frames: a kernel-size-1 Conv1d that treats the frame
    # axis as its channel axis and collapses it to a single channel.
    self.weighted_mean =torch.nn.Conv1d(in_channels=num_frame, out_channels=1, kernel_size=1)

    # Regression head mapping the embedding back to `in_chans` features.
    self.head = nn.Sequential(
        nn.LayerNorm(embed_dim),
        nn.Linear(embed_dim, out_dim),
    )

  def Temporal_forward_features(self,x):
    """Embed the frames, run the transformer blocks and pool over frames.

    Takes `x` of shape (b, num_frame, in_chans) and returns (b, 1, embed_dim).
    """
    # The unpack also rejects inputs that are not 3-D.
    batch, _, _ = x.shape

    tokens = self.Temporal_patch_to_embedding(x)    # (b, num_frame, embed_dim)

    # In-place add of the positional embedding.
    tokens += self.Temporal_pos_embed
    tokens = self.pos_drop(tokens)
    for layer in self.blocks:
      tokens = layer(tokens)

    tokens = self.Temporal_norm(tokens)
    # tokens is (b, num_frame, embed_dim); take the learned weighted mean over
    # the frame dimension, since only a single (central-frame) output is predicted.
    pooled = self.weighted_mean(tokens)
    return pooled.view(batch, 1, -1)

  def forward(self,x):
    """Map `x` of shape (b, num_frame, in_chans) to a (b, 1, in_chans) tensor."""
    batch, _, feat = x.shape

    features = self.Temporal_forward_features(x)
    prediction = self.head(features)
    # One regressed feature vector per window (the central frame's vector,
    # per the original author's note).
    return prediction.view(batch, 1, feat)





In [17]:
# Smoke-test the model on a dummy input tensor, e.g. of shape (512, 9, 85).

if __name__ == '__main__':
  # Dummy batch: 3 windows of 8 frames with 82 features each.
  # (Alternative input: A = torch.ones(512, 9, 85), used with the default
  # constructor, model = VideoTransformer().)
  A = torch.ones(3,8,82)
  model = VideoTransformer(
      num_frame=8,
      in_chans=82,
      embed_dim_ratio=8,
      depth=4,
      num_heads=8,
      mlp_ratio=2.,
      qkv_bias=True,
      qk_scale=None,
      drop_rate=0.,
      attn_drop_rate=0.,
      drop_path_rate=0.2,
      norm_layer=None,
  )

  # Move both the model and the input to the GPU when one is available.
  if torch.cuda.is_available():
    model = model.cuda()
    A = A.cuda()

  # The forward pass runs in training mode on either device.
  model.train()
  predict = model(A)

  print(predict.shape)

torch.Size([3, 1, 82])


In [None]:
# block = Block(dim=8,num_heads=8)
# y = torch.ones(512, 9, 680)
# y = block(y)