In [1]:
from models import get_masked_vit_base_patch16_224
from models import get_masked_vit_base_patch16_224_no_decoder
from utils.parser import parse_args, load_config
from torch.utils.data import default_collate
from einops import rearrange

import torch
import sys
import argparse
import numpy as np

# GPU index this notebook prefers when several GPUs are visible.
PREFERRED_GPU = 2

if torch.cuda.is_available():
    # torch.device('cuda:N') can be constructed for a GPU that does not exist;
    # the failure only shows up later, when something is moved to it. Fall back
    # to GPU 0 when the host exposes fewer devices than the preferred index.
    gpu_index = PREFERRED_GPU if PREFERRED_GPU < torch.cuda.device_count() else 0
    device = torch.device(f'cuda:{gpu_index}')
else:
    device = torch.device('cpu')

In [2]:
def parse_args(argv=None):
    """
    Build the PySlowFast-style argument parser and parse ``argv`` with it.

    Note: this definition shadows the ``parse_args`` imported from
    ``utils.parser`` in the import cell.

    Args:
        argv (list[str] | None): argument strings to parse. ``None`` (the
            default) parses an empty list, so every option takes its default
            value and ``sys.argv`` is never parsed.

    Returns:
        argparse.Namespace: namespace with the attributes
            shard_id (int): shard id for the current machine. Ranges from 0 to
                num_shards - 1. If a single machine is used, set it to 0.
            num_shards (int): number of shards used by the job.
            init_method (str): initialization method to launch the job with
                multiple devices. Options include TCP or a shared file-system.
                Details can be found at
                https://pytorch.org/docs/stable/distributed.html#tcp-initialization
            cfg_file (str): path to the config file (set through ``--cfg``).
            opts (list): additional options given after the flags; they are
                meant to overwrite the config loaded from file.
    """
    parser = argparse.ArgumentParser(
        description="Provide SlowFast video training and testing pipeline."
    )
    parser.add_argument(
        "--shard_id",
        help="The shard id of current node, Starts from 0 to num_shards - 1",
        default=0,
        type=int,
    )
    parser.add_argument(
        "--num_shards",
        help="Number of shards using by the job",
        default=1,
        type=int,
    )
    parser.add_argument(
        "--init_method",
        help="Initialization method, includes TCP or shared file-system",
        default="tcp://localhost:9999",
        type=str,
    )
    parser.add_argument(
        "--cfg",
        dest="cfg_file",
        help="Path to the config file",
        default="configs/Kinetics/SLOWFAST_4x16_R50.yaml",
        type=str,
    )
    parser.add_argument(
        "opts",
        help="See slowfast/config/defaults.py for all options",
        default=None,
        nargs=argparse.REMAINDER,
    )
    # Show the usage text when the process itself was started without arguments.
    if len(sys.argv) == 1:
        parser.print_help()
    # The process command line is deliberately not parsed: only the explicit
    # argv (or an empty list) is, so the defaults apply unless the caller
    # passes something.
    return parser.parse_args([] if argv is None else list(argv))

In [7]:
# Build the experiment config: start from the parser defaults, point them at
# the YAML file below, then override two model fields by hand.
cfg = './models/configs/Kinetics/TimeSformer_divST_8x32_224.yaml'
# `parse_args` here is the notebook-local definition (it shadows the one
# imported from utils.parser) and returns a namespace of default values.
opt = parse_args()
opt.cfg_file = cfg
# load_config comes from utils.parser; presumably it reads opt.cfg_file and
# returns the config node used below -- TODO confirm against utils/parser.py.
config = load_config(opt)
# Disabled override of the pretrained checkpoint path, kept for reference.
# config.TIMESFORMER.PRETRAINED_MODEL = './../pretrained/enc_mae_dec_vmae.pth'
# Manual overrides; the synthetic-input cell reads both values back from config.
config.MODEL.TUBELET_SIZE=2
config.MODEL.NUM_FRAMES=16

In [8]:
# Instantiate the masked ViT from the config built above.
# NOTE(review): the exact meaning of no_head / no_mask is defined in `models`
# and is not visible from this notebook -- confirm there before changing them.
model = get_masked_vit_base_patch16_224(cfg=config, no_head=True, no_mask=False)
# Move the model to the selected device; the trailing ';' suppresses the
# cell's echoed return value.
model.to(device);

In [9]:
# Synthetic input: B random clips, each shaped (3, frames, img_size, img_size).
B = 2
C = 768  # not read in this cell; kept because other cells may rely on it
img_size = 224
patch_size = 16
frames = config.MODEL.NUM_FRAMES
tubelet_size = config.MODEL.TUBELET_SIZE

# The floor divisions below would silently drop a remainder and report a wrong
# token count, so insist that the sizes divide exactly.
assert img_size % patch_size == 0, "img_size must be a multiple of patch_size"
assert frames % tubelet_size == 0, "frames must be a multiple of tubelet_size"

# Tokens per clip: spatial patches per frame times the number of tubelets.
num_tokens = (img_size // patch_size) ** 2 * (frames // tubelet_size)

# A private, seeded generator gives the same clips on every run without
# touching the global RNG state.
rng = torch.Generator().manual_seed(0)
batch = default_collate(
    [torch.randn(3, frames, img_size, img_size, generator=rng) for _ in range(B)]
)
video = batch
print(num_tokens)

1568


In [10]:
# Cut each clip into tubelets of (tubelet_size x patch_size x patch_size)
# pixels and flatten every tubelet, together with its channels, into one row.
patchify_pattern = 'b c (t p0) (h p1) (w p2) -> b (t h w) (p0 p1 p2 c)'
tubelet_dims = dict(p0=tubelet_size, p1=patch_size, p2=patch_size)

print('video', video.shape)
videos_patch = rearrange(video, patchify_pattern, **tubelet_dims)
print('videos_patch', videos_patch.shape)

video torch.Size([2, 3, 16, 224, 224])
videos_patch torch.Size([2, 1568, 1536])


In [11]:
# Move the clips to the model's device. `videos_patch` was computed from
# `video` before this line, so it is not affected by the move.
video = video.to(device)
# Forward pass. The recorded output further down shows a 3-tuple of tensors.
# NOTE(review): what mask=True switches on is defined by the model and is not
# visible from this notebook -- confirm in `models`.
out = model(video, mask=True)

In [13]:
# Reconstruction targets: the raw pixel patches at the positions selected by
# the mask the model returned as out[2] (presumably a boolean mask of shape
# (batch, num_tokens) -- TODO confirm the dtype in the model code).
B_v, _, C_v = videos_patch.shape
# out[2] lives on the model's device, while videos_patch was built before the
# clips were moved there. Recent PyTorch versions refuse to index a CPU tensor
# with a CUDA index, so bring the mask to videos_patch's device first (a no-op
# when both already share a device).
mask = out[2].to(videos_patch.device)
# The reshape groups the selected rows per sample; this relies on every sample
# selecting the same number of tokens.
label = videos_patch[mask].reshape(B_v, -1, C_v)
print('label', label.shape)

label torch.Size([2, 976, 1536])


In [14]:
# Summarise what the forward pass returned: a single tensor's shape, or the
# shape of every element when the model returned a tuple.
if not isinstance(out, tuple):
    print('returns tensor of shape:', out.shape)
else:
    print('Tuple of length:', len(out))
    for element in out:
        print(element.shape)

Tuple of length: 3
torch.Size([2, 768])
torch.Size([2, 976, 1536])
torch.Size([2, 1568])
