### Load the model

In [16]:
import torch
# `model` is the network driven by the inference cells below. Those cells build
# a single 8-frame tensor and call `model(inputs[None, ...])`, which is the
# `slow_r50` input format; the SlowFast variants instead expect a
# [slow_pathway, fast_pathway] list. The previous `slowfast_r101` binding
# contradicted both this comment and that input, so `slow_r50` is loaded here.
model = torch.hub.load('facebookresearch/pytorchvideo', 'slow_r50', pretrained=True)
# Separate pretrained instances kept for comparison and head inspection.
# `model_slow` is a distinct object so edits to `model` do not alter it.
model_slow = torch.hub.load('facebookresearch/pytorchvideo', 'slow_r50', pretrained=True)
model50 = torch.hub.load('facebookresearch/pytorchvideo', 'slowfast_r50', pretrained=True)
x3d = torch.hub.load('facebookresearch/pytorchvideo', 'x3d_m', pretrained=True)

Using cache found in /Users/phuber/.cache/torch/hub/facebookresearch_pytorchvideo_main
Using cache found in /Users/phuber/.cache/torch/hub/facebookresearch_pytorchvideo_main
Using cache found in /Users/phuber/.cache/torch/hub/facebookresearch_pytorchvideo_main
Using cache found in /Users/phuber/.cache/torch/hub/facebookresearch_pytorchvideo_main


### Import remaining functions

In [18]:
import json
import urllib
from pytorchvideo.data.encoded_video import EncodedVideo

from torchvision.transforms import Compose, Lambda
from torchvision.transforms._transforms_video import (
    CenterCropVideo,
    NormalizeVideo,
)
from pytorchvideo.transforms import (
    ApplyTransformToKey,
    ShortSideScale,
    UniformTemporalSubsample
)

### Setup device

In [21]:
# Inference target: "cpu" here; change the string to run on a GPU instead.
device = "cpu"
# Put the network in evaluation mode and move it to the target device.
model = model.eval().to(device)

### Download the id to label mapping for the Kinetics 400 dataset

In [None]:
json_url = "https://dl.fbaipublicfiles.com/pyslowfast/dataset/class_names/kinetics_classnames.json"
json_filename = "kinetics_classnames.json"

# Download the Kinetics class-name file into the working directory.
# The submodule is imported explicitly because a bare `import urllib` does not
# guarantee that `urllib.request` has been loaded.
# The former `urllib.URLopener()` attempt is a Python 2 API that can only raise
# AttributeError on Python 3, and its bare `except:` hid every other failure.
# The download is therefore called directly so real errors surface unchanged.
import urllib.request

urllib.request.urlretrieve(json_url, json_filename)

In [81]:
# Read the downloaded JSON file; it maps label names to integer ids.
with open(json_filename, "r") as classnames_file:
    kinetics_classnames = json.load(classnames_file)

# Invert it into id -> label name, dropping any literal double quotes
# embedded in the label text.
kinetics_id_to_classname = {
    class_id: str(label).replace('"', "")
    for label, class_id in kinetics_classnames.items()
}

### Define input transform

In [17]:
# Spatial settings: short-side resize target and square crop edge.
side_size, crop_size = 256, 256
# Per-channel normalisation statistics (identical for all three channels).
mean = [0.45] * 3
std = [0.225] * 3
# Temporal settings used to pick frames and derive the clip length.
num_frames, sampling_rate, frames_per_second = 8, 8, 30

# Preprocessing steps for the slow_R50 model, applied in order to the clip
# tensor: temporal subsampling, scaling by 1/255, mean/std normalisation,
# short-side resize, centre crop.
_slow_r50_steps = [
    UniformTemporalSubsample(num_frames),
    Lambda(lambda frames: frames / 255.0),
    NormalizeVideo(mean, std),
    ShortSideScale(size=side_size),
    CenterCropVideo(crop_size=(crop_size, crop_size)),
]

# Only the "video" entry of the clip dict is run through the pipeline.
transform = ApplyTransformToKey(key="video", transform=Compose(_slow_r50_steps))

# Clip length in seconds, derived from the model-specific sampling settings:
# frames taken * source-frame stride, divided by the video frame rate.
clip_duration = num_frames * sampling_rate / frames_per_second

NameError: name 'ApplyTransformToKey' is not defined

### Run Inference

In [None]:
url_link = "https://dl.fbaipublicfiles.com/pytorchvideo/projects/archery.mp4"
video_path = 'archery.mp4'

# Download the sample video into the working directory.
# The submodule is imported explicitly because a bare `import urllib` does not
# guarantee that `urllib.request` has been loaded.
# The former `urllib.URLopener()` attempt is a Python 2 API that can only raise
# AttributeError on Python 3, and its bare `except:` hid every other failure.
# The download is therefore called directly so real errors surface unchanged.
import urllib.request

urllib.request.urlretrieve(url_link, video_path)

### Load the video and transform it to the input format required by the model.

In [85]:
# Clip window in seconds. It begins at start_sec, which should point at the
# part of the video where the action happens, and lasts clip_duration.
start_sec = 0
end_sec = start_sec + clip_duration

# Open the file with the EncodedVideo helper and decode only that window.
video = EncodedVideo.from_path(video_path)
video_data = video.get_clip(start_sec=start_sec, end_sec=end_sec)
print(video_data['video'].shape)

# Run the preprocessing pipeline over the clip and show the resulting shape.
video_data = transform(video_data)
print(video_data['video'].shape)

# Take the processed tensor and place it on the chosen device.
inputs = video_data["video"].to(device)

torch.Size([3, 64, 240, 320])
torch.Size([3, 8, 256, 256])


### Get predictions

In [None]:
# Forward pass. This is inference only, so autograd is switched off: no graph
# is recorded and the activations are not kept alive for a backward pass.
# `inputs[None, ...]` prepends a batch axis of size 1.
with torch.no_grad():
    preds = model(inputs[None, ...])
print(inputs.shape)
print(preds)
print(preds.shape)

# Normalise the raw model output over the class axis (dim=1) and keep the
# indices of the five highest-scoring classes for the single clip in the batch.
post_act = torch.nn.Softmax(dim=1)
preds = post_act(preds)
pred_classes = preds.topk(k=5).indices[0]

# Translate class ids into readable label names.
pred_class_names = [kinetics_id_to_classname[int(class_id)] for class_id in pred_classes]
print("Top 5 predicted labels: %s" % ", ".join(pred_class_names))

In [74]:
from functools import reduce
import torch.nn as nn

In [None]:
from pytorchvideo.models.head import ResNetBasicHead

# ResNet-style head whose linear projection maps 2048 features to 10 outputs.
# Pooling and dropout settings match the stock head printed further below.
classification_head = ResNetBasicHead(
    pool=nn.AvgPool3d(kernel_size=(8, 7, 7), stride=(1, 1, 1), padding=(0, 0, 0)),
    dropout=nn.Dropout(p=0.5),
    proj=nn.Linear(2048, 10),
    output_pool=nn.AdaptiveAvgPool3d(1),
)

In [77]:
# Not applied here; a head can be swapped into the network with:
#   model.blocks._modules['5'] = classification_head
import pytorchvideo.models.x3d
from pytorchvideo.models.head import ResNetBasicHead

# Same head layout as above, with the projection sized by num_classes.
num_classes = 101
cls_head = ResNetBasicHead(
    pool=nn.AvgPool3d(kernel_size=(8, 7, 7), stride=(1, 1, 1), padding=(0, 0, 0)),
    dropout=nn.Dropout(p=0.5),
    proj=nn.Linear(2048, num_classes),
    output_pool=nn.AdaptiveAvgPool3d(1),
)

# List the head itself (empty name) followed by each named submodule.
for module_name, module in cls_head.named_modules():
    print(module_name, module)

 ResNetBasicHead(
  (pool): AvgPool3d(kernel_size=(8, 7, 7), stride=(1, 1, 1), padding=(0, 0, 0))
  (dropout): Dropout(p=0.5, inplace=False)
  (proj): Linear(in_features=2048, out_features=101, bias=True)
  (output_pool): AdaptiveAvgPool3d(output_size=1)
)
pool AvgPool3d(kernel_size=(8, 7, 7), stride=(1, 1, 1), padding=(0, 0, 0))
dropout Dropout(p=0.5, inplace=False)
proj Linear(in_features=2048, out_features=101, bias=True)
output_pool AdaptiveAvgPool3d(output_size=1)


In [76]:
# Print block '5' of model_slow and each of its named submodules, for
# comparison with the custom head built above.
for module_name, module in model_slow.blocks._modules['5'].named_modules():
    print(module_name, module)

 ResNetBasicHead(
  (pool): AvgPool3d(kernel_size=(8, 7, 7), stride=(1, 1, 1), padding=(0, 0, 0))
  (dropout): Dropout(p=0.5, inplace=False)
  (proj): Linear(in_features=2048, out_features=400, bias=True)
  (output_pool): AdaptiveAvgPool3d(output_size=1)
)
pool AvgPool3d(kernel_size=(8, 7, 7), stride=(1, 1, 1), padding=(0, 0, 0))
dropout Dropout(p=0.5, inplace=False)
proj Linear(in_features=2048, out_features=400, bias=True)
output_pool AdaptiveAvgPool3d(output_size=1)


## SlowFast

In [1]:
import torch
# Load the pretrained `slowfast_r50` network from the PyTorchVideo hub repo.
model = torch.hub.load(
    'facebookresearch/pytorchvideo',
    'slowfast_r50',
    pretrained=True,
)

Using cache found in /Users/phuber/.cache/torch/hub/facebookresearch_pytorchvideo_main


In [2]:
from typing import Dict
import json
import urllib
from torchvision.transforms import Compose, Lambda
from torchvision.transforms._transforms_video import (
    CenterCropVideo,
    NormalizeVideo,
)
from pytorchvideo.data.encoded_video import EncodedVideo
from pytorchvideo.transforms import (
    ApplyTransformToKey,
    ShortSideScale,
    UniformTemporalSubsample,
    UniformCropVideo
) 

In [3]:
# Inference target: "cpu" here; change the string to run on a GPU instead.
device = "cpu"
# Evaluation mode first, then move the weights to the target device.
model = model.eval().to(device)

In [4]:
# The class-name file is expected to already be in the working directory.
json_filename = "kinetics_classnames.json"
with open(json_filename, "r") as classnames_file:
    kinetics_classnames = json.load(classnames_file)

# Invert label -> id into id -> label, removing any literal double quotes
# found in the label text.
kinetics_id_to_classname = {
    class_id: str(label).replace('"', "")
    for label, class_id in kinetics_classnames.items()
}

In [19]:
# Spatial settings: short-side resize target and crop edge.
side_size, crop_size = 256, 256
# Per-channel normalisation statistics (identical for all three channels).
mean = [0.45] * 3
std = [0.225] * 3
# Temporal settings for the fast pathway and the source frame rate.
num_frames, sampling_rate, frames_per_second = 32, 2, 30
# Frame-count ratio between the fast and slow pathways (read by PackPathway).
slowfast_alpha = 4
# Declared here but not referenced elsewhere in the visible notebook.
num_clips, num_crops = 10, 3

class PackPathway(torch.nn.Module):
    """
    Transform that converts a clip tensor into the two-element list
    ``[slow_pathway, fast_pathway]``.

    The fast pathway is the input unchanged. The slow pathway keeps
    ``frames.shape[1] // alpha`` frames taken at evenly spaced positions
    along dimension 1.

    Args:
        alpha: Frame-count ratio between the fast and slow pathways. When left
            as ``None`` (the default), the module-level ``slowfast_alpha`` is
            looked up on every call, which is the original behaviour.
    """
    def __init__(self, alpha=None):
        super().__init__()
        self.alpha = alpha

    def forward(self, frames: torch.Tensor):
        """
        Args:
            frames: Clip tensor whose dimension 1 indexes frames.

        Returns:
            ``[slow_pathway, fast_pathway]`` as a list of two tensors.
        """
        # Late-bind to the notebook-level setting unless a ratio was given.
        alpha = slowfast_alpha if self.alpha is None else self.alpha
        num_fast_frames = frames.shape[1]
        # The positions are computed on CPU exactly as before, so the selected
        # frames are unchanged. The index is then moved to the device of
        # `frames`, because index_select needs both on the same device.
        slow_indices = torch.linspace(
            0, num_fast_frames - 1, num_fast_frames // alpha
        ).long().to(frames.device)
        slow_pathway = torch.index_select(frames, 1, slow_indices)
        fast_pathway = frames
        return [slow_pathway, fast_pathway]

# Preprocessing steps for the SlowFast model, applied in order to the clip
# tensor: temporal subsampling, scaling by 1/255, mean/std normalisation,
# short-side resize, centre crop, then the split into [slow, fast] pathways.
_slowfast_steps = [
    UniformTemporalSubsample(num_frames),
    Lambda(lambda frames: frames / 255.0),
    NormalizeVideo(mean, std),
    ShortSideScale(size=side_size),
    CenterCropVideo(crop_size),
    PackPathway(),
]

# Only the "video" entry of the clip dict is run through the pipeline.
transform = ApplyTransformToKey(key="video", transform=Compose(_slowfast_steps))

# The clip length in seconds is specific to the model and follows from the
# sampling settings: num_frames frames, sampling_rate source frames apart, at
# frames_per_second. It is derived from those constants instead of hard-coded
# literals so it cannot drift when they are tuned. With the current values
# (32, 2, 30) this equals the previous literal (16 * 4) / 30.
clip_duration = (num_frames * sampling_rate) / frames_per_second

In [22]:
video_path = 'archery.mp4'

# Clip window in seconds. It begins at start_sec, which should point at the
# part of the video where the action happens, and lasts clip_duration.
start_sec = 0
end_sec = start_sec + clip_duration

# Open the file with the EncodedVideo helper and decode only that window.
video = EncodedVideo.from_path(video_path)
video_data = video.get_clip(start_sec=start_sec, end_sec=end_sec)
print(video_data["video"].shape)

# Run the preprocessing pipeline; the "video" entry becomes a list of
# pathway tensors.
video_data = transform(video_data)

# Move each pathway to the device and prepend a batch axis of size 1.
inputs = [pathway.to(device).unsqueeze(0) for pathway in video_data["video"]]
print(inputs[0].shape)
print(inputs[1].shape)

torch.Size([3, 64, 240, 320])
torch.Size([1, 3, 8, 256, 256])
torch.Size([1, 3, 32, 256, 256])


In [32]:
# Forward pass. This is inference only, so autograd is switched off: no graph
# is recorded and the activations are not kept alive for a backward pass.
with torch.no_grad():
    preds = model(inputs)

# Normalise the raw model output over the class axis (dim=1) and keep the
# indices of the five highest-scoring classes for the single clip in the batch.
post_act = torch.nn.Softmax(dim=1)
preds = post_act(preds)
pred_classes = preds.topk(k=5).indices[0]

# Translate class ids into readable label names.
pred_class_names = [kinetics_id_to_classname[int(class_id)] for class_id in pred_classes]
print("Top 5 predicted labels: %s" % ", ".join(pred_class_names))

Top 5 predicted labels: archery, throwing axe, playing paintball, golf driving, riding or walking with horse


In [20]:
# Decode the [start_sec, end_sec] window again and report the raw tensor shape.
print(
    video.get_clip(start_sec=start_sec, end_sec=end_sec)["video"].shape
)

torch.Size([3, 300, 240, 320])


In [21]:
# `wandb` is used here and in the next cell but is not imported anywhere in
# the visible notebook, so a fresh kernel would stop with a NameError.
# Importing it here keeps the cell runnable on its own.
import wandb

# Start a Weights & Biases run in the 'Sample Videos' project.
wandb.init(project='Sample Videos')

VBox(children=(Label(value=' 0.09MB of 0.09MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

In [22]:
# Log the decoded clip to the active run as an mp4 at 16 fps.
# permute(1, 0, 2, 3) swaps the first two axes of the clip tensor before the
# NumPy conversion.
wandb.log(
    {
        "video": wandb.Video(
            video.get_clip(start_sec=start_sec, end_sec=end_sec)["video"]
            .permute(1, 0, 2, 3)
            .numpy(),
            fps=16,
            format="mp4",
        )
    }
)




In [12]:
# Show the clip end time in seconds (start_sec + clip_duration).
print(end_sec)

2.1333333333333333


In [9]:
import torch.nn as nn
from pytorchvideo.models.head import (
    ResNetBasicHead,
    SequencePool,
    VisionTransformerBasicHead,
)
from pytorchvideo.models.hub import mvit_base_16x4

# Build the MViT-B 16x4 architecture without loading pretrained weights.
model = mvit_base_16x4(pretrained=False)

In [5]:
# Print the model's current head and each of its named submodules.
for module_name, module in model.head.named_modules():
    print(module_name, module)

 VisionTransformerBasicHead(
  (sequence_pool): SequencePool()
  (dropout): Dropout(p=0.5, inplace=False)
  (proj): Linear(in_features=768, out_features=400, bias=True)
)
sequence_pool SequencePool()
dropout Dropout(p=0.5, inplace=False)
proj Linear(in_features=768, out_features=400, bias=True)


In [12]:
# Replacement transformer head: "cls" sequence pooling, dropout, and a linear
# projection from 768 features to num_classes outputs.
num_classes = 100
cls_head = VisionTransformerBasicHead(
    sequence_pool=SequencePool("cls"),
    dropout=nn.Dropout(p=0.5),
    proj=nn.Linear(768, num_classes),
)

In [13]:
# Replace the model's classification head with the custom cls_head.
model.head = cls_head

In [14]:
# Print the head again to confirm the replacement took effect.
for module_name, module in model.head.named_modules():
    print(module_name, module)

 VisionTransformerBasicHead(
  (sequence_pool): SequencePool()
  (dropout): Dropout(p=0.5, inplace=False)
  (proj): Linear(in_features=768, out_features=100, bias=True)
)
sequence_pool SequencePool()
dropout Dropout(p=0.5, inplace=False)
proj Linear(in_features=768, out_features=100, bias=True)
