### Load the model

In [1]:
import torch
# Fetch the pretrained `slow_r50` video model through torch.hub.
model = torch.hub.load(
    "facebookresearch/pytorchvideo",
    "slow_r50",
    pretrained=True,
)

Using cache found in /Users/phuber/.cache/torch/hub/facebookresearch_pytorchvideo_main


### Import remaining functions

In [2]:
import json
import urllib
import urllib.request

from pytorchvideo.data.encoded_video import EncodedVideo

from torchvision.transforms import Compose, Lambda
from torchvision.transforms._transforms_video import (
    CenterCropVideo,
    NormalizeVideo,
)
from pytorchvideo.transforms import (
    ApplyTransformToKey,
    ShortSideScale,
    UniformTemporalSubsample
)

### Setup device

In [3]:
# Device that the model (and, later, the input clip) is moved to.
# Set to GPU or CPU.
device = "cpu"
# Switch to evaluation mode and place the model on the chosen device.
model = model.eval().to(device)

### Download the id to label mapping for the Kinetics 400 dataset

In [4]:
# Location of the Kinetics-400 class-name file and the local path it is saved to.
json_url = "https://dl.fbaipublicfiles.com/pyslowfast/dataset/class_names/kinetics_classnames.json"
json_filename = "kinetics_classnames.json"
# Download with the Python 3 API directly. The previous try/except first called
# the Python 2-only `urllib.URLopener` and relied on a bare `except:` to fall
# through, which also hid genuine failures (and KeyboardInterrupt).
urllib.request.urlretrieve(json_url, json_filename)

In [5]:
# Read the downloaded JSON file; it is iterated below as (label name, id) pairs.
with open(json_filename, "r") as classnames_file:
    kinetics_classnames = json.load(classnames_file)

# Invert the table into an id -> label name mapping, dropping any
# double-quote characters from the label text.
kinetics_id_to_classname = {
    class_id: str(class_name).replace('"', "")
    for class_name, class_id in kinetics_classnames.items()
}

### Define input transform

In [6]:
# Preprocessing constants used by the transform below.
side_size = 256                 # passed to ShortSideScale
mean = [0.45, 0.45, 0.45]       # normalization mean (one value per entry)
std = [0.225, 0.225, 0.225]     # normalization std (one value per entry)
crop_size = 256                 # side length of the square center crop
num_frames = 8                  # frames kept after temporal subsampling
sampling_rate = 8
frames_per_second = 30

# Note that this transform is specific to the slow_R50 model.
# It is applied only to the "video" entry of the clip dictionary.
transform = ApplyTransformToKey(
    key="video",
    transform=Compose([
        # Keep `num_frames` frames from the clip.
        UniformTemporalSubsample(num_frames),
        # Divide the values by 255 before normalizing.
        Lambda(lambda frames: frames / 255.0),
        NormalizeVideo(mean, std),
        ShortSideScale(size=side_size),
        CenterCropVideo(crop_size=(crop_size, crop_size)),
    ]),
)

# The duration of the input clip is also specific to the model:
# (8 frames * 8 sampling rate) / 30 fps, i.e. roughly 2.13 seconds.
clip_duration = (num_frames * sampling_rate) / frames_per_second

### Run Inference

In [7]:
# Example video to classify and the local path it is saved to.
url_link = "https://dl.fbaipublicfiles.com/pytorchvideo/projects/archery.mp4"
video_path = 'archery.mp4'
# Download with the Python 3 API directly. The previous try/except first called
# the Python 2-only `urllib.URLopener` and relied on a bare `except:` to fall
# through, which also hid genuine failures (and KeyboardInterrupt).
urllib.request.urlretrieve(url_link, video_path)

### Load the video and transform it to the input format required by the model.

In [10]:
# Pick the time window (in seconds) of the clip to decode.
# start_sec should correspond to where the action occurs in the video.
start_sec = 0
end_sec = start_sec + clip_duration
# NOTE(review): the model-specific end time computed on the previous line is
# immediately replaced by a fixed 3-second window -- confirm this is intended.
end_sec = 3
print(end_sec)

# Open the video file through the EncodedVideo helper and decode the window.
video = EncodedVideo.from_path(video_path)
video_data = video.get_clip(start_sec=start_sec, end_sec=end_sec)
print(video_data['video'].shape)

# Run the preprocessing transform on the clip; the shape is printed before
# and after so the effect of the transform is visible.
video_data = transform(video_data)
print(video_data['video'].shape)

# Take the transformed video tensor and move it to the desired device.
inputs = video_data["video"].to(device)

3
torch.Size([3, 90, 240, 320])
torch.Size([3, 8, 256, 256])


### Get predictions

In [10]:
# Pass the input clip through the model. `inputs[None, ...]` adds a leading
# batch dimension. The forward pass runs under `torch.no_grad()` because this
# is inference only: no autograd graph needs to be recorded.
with torch.no_grad():
    preds = model(inputs[None, ...])
print(inputs.shape)
print(preds)
print(preds.shape)

# Turn the raw scores into probabilities along dim 1 and take the five
# highest-scoring class indices for the (single) clip in the batch.
post_act = torch.nn.Softmax(dim=1)
preds = post_act(preds)
pred_classes = preds.topk(k=5).indices[0]

# Map the predicted classes to the label names
pred_class_names = [kinetics_id_to_classname[int(i)] for i in pred_classes]
print("Top 5 predicted labels: %s" % ", ".join(pred_class_names))

torch.Size([3, 8, 256, 256])
tensor([[ 7.3658e+00,  4.5376e+00,  9.3245e-01, -1.5395e+00, -1.0053e+01,
          6.9937e+01, -4.3952e+00,  2.3782e+00,  1.0018e+00,  1.5337e+00,
         -4.3518e+00, -3.4403e+00,  7.3954e-01, -1.1670e+00,  1.1805e+00,
         -1.4303e-02, -1.0332e+00,  9.5922e-01, -1.4973e+00, -3.0453e+00,
          1.1070e+00,  5.4270e+00, -1.5748e-01, -3.6760e+00,  8.3234e+00,
         -2.7329e+00, -3.2130e+00, -6.4531e+00, -2.6370e+00, -5.2812e+00,
         -3.8525e+00, -2.5389e+00, -3.4044e+00, -6.7294e+00, -1.4267e+00,
          8.0389e-01, -7.1033e+00, -4.8424e+00, -1.7084e-01,  3.4026e+00,
          8.4727e+00,  4.3644e+00,  6.1376e+00,  2.9348e+00, -1.5195e+00,
         -4.1009e-01, -2.4971e+00,  4.7193e+00,  1.3953e-01,  5.1990e+00,
          1.1383e+00,  2.0798e+00, -3.9347e+00,  4.4492e+00,  5.8390e-01,
          6.1401e+00,  8.1690e+00, -5.0845e+00, -6.2952e+00, -3.2534e+00,
         -6.2249e+00,  4.6930e+00, -1.8951e+00, -7.9065e+00, -8.9854e+00,
         

In [45]:
from functools import reduce
import torch.nn as nn

In [46]:
from pytorchvideo.models.head import ResNetBasicHead

# Build a new classification head with a 10-way linear projection.
# NOTE(review): 10 is presumably the class count of the target dataset, and the
# (8, 7, 7) pooling kernel presumably matches the feature map reaching the
# head -- confirm both.
classification_head = ResNetBasicHead(
    pool=nn.AvgPool3d(
        kernel_size=(8, 7, 7),
        stride=(1, 1, 1),
        padding=(0, 0, 0),
    ),
    dropout=nn.Dropout(p=0.5, inplace=False),
    proj=nn.Linear(in_features=2048, out_features=10, bias=True),
    output_pool=nn.AdaptiveAvgPool3d(output_size=1),
)

In [47]:
# Swap the module at index 5 of `model.blocks` for the new classification head.
# Uses the container's public indexed assignment instead of writing into the
# private `_modules` dict.
# NOTE(review): the new head has not been passed through `.eval()` / `.to(device)`
# like the rest of the model was -- confirm before running inference with it.
model.blocks[5] = classification_head

In [48]:
# List every Linear layer in the model along with its qualified name.
for module_name, module in model.named_modules():
    if not isinstance(module, torch.nn.Linear):
        continue
    print(module_name, module)

blocks.5.proj Linear(in_features=2048, out_features=10, bias=True)
