# SlowFast

*Author: FAIR PyTorchVideo*

**SlowFast networks pretrained on the Kinetics 400 dataset**


### Example Usage

#### Imports

Load the model:

In [1]:
pip install git+https://github.com/facebookresearch/fvcore.git

Collecting git+https://github.com/facebookresearch/fvcore.git
  Cloning https://github.com/facebookresearch/fvcore.git to /tmp/pip-req-build-8d6t_sjf
  Running command git clone --filter=blob:none --quiet https://github.com/facebookresearch/fvcore.git /tmp/pip-req-build-8d6t_sjf
  Resolved https://github.com/facebookresearch/fvcore.git to commit b120a6d3c01c238e6e99312916bf61d1f51cdb07
  Preparing metadata (setup.py) ... [?25ldone
Note: you may need to restart the kernel to use updated packages.


In [17]:
pip install torch

Note: you may need to restart the kernel to use updated packages.


In [18]:
pip install torchvision

Note: you may need to restart the kernel to use updated packages.


In [19]:
import torch
# Fetch the pretrained `slowfast_r50` entry point from the PyTorchVideo hub repository
model = torch.hub.load(
    repo_or_dir='facebookresearch/pytorchvideo',
    model='slowfast_r50',
    pretrained=True,
)

Using cache found in /home/jovyan/.cache/torch/hub/facebookresearch_pytorchvideo_main


Import remaining functions:

In [20]:
import json
import urllib
import urllib.request
from typing import Dict

from torchvision.transforms import Compose, Lambda
from torchvision.transforms._transforms_video import (
    CenterCropVideo,
    NormalizeVideo,
)
from pytorchvideo.data.encoded_video import EncodedVideo
from pytorchvideo.transforms import (
    ApplyTransformToKey,
    ShortSideScale,
    UniformTemporalSubsample,
    UniformCropVideo
)

#### Setup

Set the model to eval mode and move it to the desired device.

In [21]:
# Device used for inference; "cpu" here, change the string to run on a GPU instead
device = "cpu"
# Put the model in evaluation mode, then move it to the chosen device
model = model.eval().to(device)

Download the id to label mapping for the Kinetics 400 dataset on which the torch hub models were trained. This will be used to get the category label names from the predicted class ids.

In [22]:
# Remote location of the Kinetics 400 class-name file and the local filename to save it under
json_url = "https://dl.fbaipublicfiles.com/pyslowfast/dataset/class_names/kinetics_classnames.json"
json_filename = "kinetics_classnames.json"
# Download with the Python 3 API directly. The previous `urllib.URLopener` attempt
# never succeeds on Python 3, and its bare `except:` hid every other failure
# (including KeyboardInterrupt). Any download error now surfaces as-is.
# The trailing `;` keeps the notebook from echoing the return value.
urllib.request.urlretrieve(json_url, json_filename);

In [23]:
# Parse the downloaded class-name file
with open(json_filename, "r") as names_file:
    kinetics_classnames = json.load(names_file)

# Invert it into an id -> label lookup, removing any double quotes from the label text
kinetics_id_to_classname = {
    class_id: str(label).replace('"', "")
    for label, class_id in kinetics_classnames.items()
}

#### Define input transform

In [24]:
# Spatial settings: size passed to ShortSideScale and to CenterCropVideo
side_size = 256
crop_size = 256

# Per-channel normalization statistics passed to NormalizeVideo
mean = [0.45] * 3
std = [0.225] * 3

# Temporal settings: frames kept by UniformTemporalSubsample, plus the
# sampling rate and frame rate used to derive the clip duration
num_frames = 32
sampling_rate = 2
frames_per_second = 30

# Divisor PackPathway uses to pick how many frames the slow pathway keeps
slowfast_alpha = 4

# Not referenced by the cells shown in this notebook
num_clips = 10
num_crops = 3

class PackPathway(torch.nn.Module):
    """
    Split a clip tensor into the two-element input list ``[slow, fast]``.

    The fast pathway is the input tensor itself. The slow pathway keeps
    ``frames.shape[1] // slowfast_alpha`` evenly spaced positions along
    dimension 1 of the input.
    """
    def __init__(self):
        super().__init__()

    def forward(self, frames: torch.Tensor):
        total = frames.shape[1]
        # Evenly spaced positions from first to last index along dim 1,
        # truncated to integers so they can be used for indexing.
        slow_indices = torch.linspace(0, total - 1, total // slowfast_alpha).long()
        slow_pathway = frames.index_select(1, slow_indices)
        return [slow_pathway, frames]

# Preprocessing steps, applied in this order to the "video" entry of a clip dict
video_steps = [
    UniformTemporalSubsample(num_frames),   # keep num_frames frames
    Lambda(lambda x: x/255.0),              # divide values by 255
    NormalizeVideo(mean, std),
    ShortSideScale(size=side_size),
    CenterCropVideo(crop_size),
    PackPathway(),                          # split into [slow, fast]
]
transform = ApplyTransformToKey(key="video", transform=Compose(video_steps))

# Clip length in seconds, derived from the frame count, sampling rate and frame rate;
# like the settings above, it is specific to the model.
clip_duration = num_frames * sampling_rate / frames_per_second

#### Run Inference

Download an example video.

In [40]:
# Remote location of the example clip and the local filename to save it under
url_link = "https://dl.fbaipublicfiles.com/pytorchvideo/projects/archery.mp4"
video_path = 'archery.mp4'
# Download with the Python 3 API directly. The previous `urllib.URLopener` attempt
# never succeeds on Python 3, and its bare `except:` hid every other failure
# (including KeyboardInterrupt). Any download error now surfaces as-is.
# The trailing `;` keeps the notebook from echoing the return value.
urllib.request.urlretrieve(url_link, video_path);

In [41]:
pip install av

Note: you may need to restart the kernel to use updated packages.


In [55]:
# Replaces the path of the example clip downloaded earlier with a different video file.
# NOTE(review): nothing in this notebook downloads this file — presumably it must
# already exist in the working directory; confirm before a Restart & Run All.
video_path = 'v_ApplyEyeMakeup_g01_c01.avi'

Load the video and transform it to the input format required by the model.

In [70]:
# Time window (in seconds) of the clip to load; start_sec should point at
# the part of the video where the action occurs
start_sec = 0
end_sec = start_sec + clip_duration

# Open the video file and cut out the chosen window, then run it through
# the preprocessing transform
video = EncodedVideo.from_path(video_path)
video_data = transform(video.get_clip(start_sec=start_sec, end_sec=end_sec))

# Move each pathway tensor to the target device and add a leading dimension of size 1
inputs = [pathway.to(device).unsqueeze(0) for pathway in video_data["video"]]

In [69]:
# Inspect the transformed clip dictionary. The dangling `video_data.` was a syntax
# error; the recorded `dict_items([...])` output shows the call was `.items()`.
video_data.items()

dict_items([('video', [tensor([[[[-0.9127, -0.9020, -0.9065,  ...,  2.1307,  2.1307,  2.1307],
          [-0.9127, -0.9020, -0.9065,  ...,  2.1307,  2.1307,  2.1307],
          [-0.9036, -0.9020, -0.9065,  ...,  2.1307,  2.1307,  2.1307],
          ...,
          [-1.6635, -1.7514, -1.6615,  ...,  0.0497,  0.0954,  0.1208],
          [-1.8940, -1.8855, -1.6936,  ...,  0.0497,  0.0954,  0.1208],
          [-1.6473, -1.6688, -1.4789,  ...,  0.0497,  0.0954,  0.1208]],

         [[-0.9020, -0.9020, -0.9065,  ...,  2.1307,  2.1307,  2.1307],
          [-0.9020, -0.9020, -0.9065,  ...,  2.1307,  2.1307,  2.1307],
          [-0.9020, -0.9020, -0.9065,  ...,  2.1307,  2.1307,  2.1307],
          ...,
          [-1.6635, -1.7514, -1.6615,  ..., -0.1037, -0.0744, -0.0299],
          [-1.8940, -1.8855, -1.6936,  ..., -0.1037, -0.0744, -0.0299],
          [-1.6473, -1.6688, -1.4789,  ..., -0.1037, -0.0744, -0.0299]],

         [[-0.9020, -0.9020, -0.9065,  ...,  2.1307,  2.1436,  2.1481],
       

#### Get Predictions

In [57]:
# Pass the input clip through the model. Gradients are not needed for inference,
# so disable autograd to avoid building a graph that is never used.
with torch.no_grad():
    preds = model(inputs)

# Apply softmax over dim 1 and take the ids of the five highest-scoring classes
# for the first (only) item in the batch
post_act = torch.nn.Softmax(dim=1)
preds = post_act(preds)
pred_classes = preds.topk(k=5).indices[0]

# Map the predicted classes to the label names
pred_class_names = [kinetics_id_to_classname[int(i)] for i in pred_classes]
print("Top 5 predicted labels: %s" % ", ".join(pred_class_names))

Top 5 predicted labels: filling eyebrows, applying cream, waxing eyebrows, brush painting, brushing teeth


### Model Description
SlowFast model architectures are based on [1] with pretrained weights using the 8x8 setting
on the Kinetics dataset. 

| arch | depth | frame length x sample rate | top 1 | top 5 | Flops (G) | Params (M) |
| --------------- | ----------- | ----------- | ----------- | ----------- | ----------- | ----------- |
| SlowFast | R50   | 8x8                        | 76.94 | 92.69 | 65.71     | 34.57      |
| SlowFast | R101  | 8x8                        | 77.90 | 93.27 | 127.20    | 62.83      |


### References
[1] Christoph Feichtenhofer et al., "SlowFast Networks for Video Recognition"
https://arxiv.org/pdf/1812.03982.pdf

In [65]:
from IPython.display import HTML
from base64 import b64encode
# Read the clip's raw bytes; the `with` block closes the file handle as soon as
# the read finishes instead of leaving it open until garbage collection.
with open('v_ApplyEyeMakeup_g01_c01.mp4','rb') as video_file:
    mp4 = video_file.read()
# Encode the bytes as a base64 data URL so the video can be embedded inline
data_url = "data:video/mp4;base64," + b64encode(mp4).decode()


In [66]:
# Build the <video> element around the base64 data URL, then display it
# (the bare last expression is what the notebook renders)
video_markup = """
<video controls>
      <source src="%s" type="video/mp4">
</video>
""" % data_url
HTML(video_markup)