In [1]:
from models import get_masked_vit_base_patch16_224
from utils.parser import parse_args, load_config
from torch.utils.data import default_collate

import torch
import sys
import argparse
import numpy as np

# Prefer the first CUDA GPU when one is visible; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

In [2]:
def parse_args(argv=None):
    """
    Build the PySlowFast-style argument parser and return the parsed namespace.

    By default an empty argument list is parsed, so every option takes its
    default value regardless of how the process was launched (useful inside a
    notebook, where the real command line belongs to the kernel). Pass
    ``argv`` to parse an explicit list of argument strings instead.

    Args:
        argv (list[str] | None): argument strings to parse. ``None`` (the
            default) parses an empty list, i.e. returns pure defaults.

    Returns:
        argparse.Namespace: with the attributes
            shard_id (int): shard id for the current machine. Starts from 0
                to num_shards - 1. If a single machine is used, set it to 0.
            num_shards (int): number of shards used by the job.
            init_method (str): initialization method to launch the job with
                multiple devices. Options include TCP or a shared
                file-system; details can be found in
                https://pytorch.org/docs/stable/distributed.html#tcp-initialization
            cfg_file (str): path to the config file (set through ``--cfg``).
            opts (list[str]): additional trailing options, intended to
                override values from the config file. This function only
                collects them; it does not apply them.
    """
    parser = argparse.ArgumentParser(
        description="Provide SlowFast video training and testing pipeline."
    )
    parser.add_argument(
        "--shard_id",
        help="The shard id of current node, Starts from 0 to num_shards - 1",
        default=0,
        type=int,
    )
    parser.add_argument(
        "--num_shards",
        help="Number of shards using by the job",
        default=1,
        type=int,
    )
    parser.add_argument(
        "--init_method",
        help="Initialization method, includes TCP or shared file-system",
        default="tcp://localhost:9999",
        type=str,
    )
    parser.add_argument(
        "--cfg",
        dest="cfg_file",
        help="Path to the config file",
        default="configs/Kinetics/SLOWFAST_4x16_R50.yaml",
        type=str,
    )
    parser.add_argument(
        "opts",
        help="See slowfast/config/defaults.py for all options",
        default=None,
        nargs=argparse.REMAINDER,
    )

    if argv is None:
        # Original behaviour: show the usage text when the process itself was
        # started without arguments, then fall back to pure defaults.
        if len(sys.argv) == 1:
            parser.print_help()
        argv = []

    # sys.argv is deliberately never parsed; only the explicit list is used.
    return parser.parse_args(list(argv))

In [3]:
# Start from the parser defaults. Note that `parse_args` here resolves to the
# function defined in this notebook, which shadows the one imported from
# utils.parser in the first cell.
opt = parse_args()

# Point the options at the TimeSformer config before handing them to the loader.
cfg = './models/configs/Kinetics/TimeSformer_divST_8x32_224.yaml'
opt.cfg_file = cfg

config = load_config(opt)

In [4]:
model = get_masked_vit_base_patch16_224(cfg=config, no_head=True, no_mask=False)

1024


In [5]:
# Sizes describing the synthetic input used for the smoke test below.
B = 2
C = 768
num_tokens = 1568
img_size = 224
patch_size = 16
num_patches = img_size // patch_size  # patches along one side of the image
frames = 8

# Build B random clips and collate them into one batched tensor of shape
# (B, 3, 4, 224, 224); presumably (batch, channels, frames, height, width) --
# confirm against the model's expected layout.
# NOTE(review): each clip is created with 4 frames, while `frames` is 8 and
# num_tokens == frames * num_patches ** 2 (8 * 14 * 14 = 1568). Confirm whether
# the clips were meant to use `frames` (and `img_size`) instead of literals.
batch = default_collate([torch.randn(3, 4, 224, 224) for _ in range(B)])
video = batch

In [6]:
# Put the input clip and the model on the selected device, then run a single
# forward pass with masking enabled.
video = video.to(device)
model.to(device)

# NOTE(review): the recorded run of this cell failed inside the model with
# "shape '[392, 4, 3, 12, 42]' is invalid for input of size 2408448".
# 2408448 == 392 * 4 * 3 * 512, whereas the requested shape only holds
# 392 * 4 * 3 * (12 * 42) elements. This looks like a 512-wide feature being
# split across 12 heads (512 // 12 == 42); confirm the model's width and head
# count for the masked path.
out = model(video, mask=True)

torch.Size([392, 4, 512])


RuntimeError: shape '[392, 4, 3, 12, 42]' is invalid for input of size 2408448

In [None]:
# Summarise what the forward pass returned: for a tuple, print its length and
# the shape of each element; otherwise print the shape of the single result.
# Assumes every element (or the result itself) exposes a `.shape` attribute.
if isinstance(out, tuple):
    print('Tuple of length:', len(out))
    for item in out:
        print(item.shape)
else:
    print('returns tensor of shape:', out.shape)