# Torch Hub Inference Tutorial

In this tutorial you'll learn:
- how to load a pretrained model using Torch Hub 
- how to run inference to classify the action in a demo video


NOTE: Currently this tutorial will only work with a local clone of the PyTorchVideo GitHub repo. 

### Import modules

In [1]:
import json 
import torch
from torchvision.transforms import Compose, Lambda
from torchvision.transforms._transforms_video import (
    CenterCropVideo,
    NormalizeVideo,
)
from pytorchvideo.data.encoded_video import EncodedVideo
from pytorchvideo.transforms import (
    ApplyTransformToKey,
    ShortSideScale,
    UniformTemporalSubsample
) 

### Setup 

Download the id to label mapping for the Kinetics 400 dataset on which the torch hub models were trained. 
This will be used to get the category label names from the predicted class ids.

In [2]:
!wget https://dl.fbaipublicfiles.com/pyslowfast/dataset/class_names/kinetics_classnames.json 

--2021-04-15 10:08:25--  https://dl.fbaipublicfiles.com/pyslowfast/dataset/class_names/kinetics_classnames.json
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 104.22.74.142, 104.22.75.142, 172.67.9.4, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|104.22.74.142|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10326 (10K) [text/plain]
Saving to: ‘kinetics_classnames.json.6’


2021-04-15 10:08:25 (23.0 MB/s) - ‘kinetics_classnames.json.6’ saved [10326/10326]



In [3]:
# Read the class-name -> class-id table from the downloaded JSON file.
with open("kinetics_classnames.json", "r") as classnames_file:
    kinetics_classnames = json.load(classnames_file)

# Invert the table into id -> label name, removing the literal double quotes
# that are embedded in some of the JSON keys.
kinetics_id_to_classname = {
    class_id: str(raw_name).replace('"', "")
    for raw_name, class_id in kinetics_classnames.items()
}

### Load Model using Torch Hub API

PyTorchVideo provides several pretrained models through Torch Hub. Available models are described in [model zoo documentation](https://github.com/facebookresearch/pytorchvideo/blob/master/docs/source/model_zoo.md#kinetics-400). 

Here we select the `slow_r50` model, which was trained using an 8x8 setting (8 frames sampled with a temporal stride of 8) on the Kinetics 400 dataset.


In [4]:
# Device on which to run the model.
# Keep using the second GPU ("cuda:1") when the machine has one, but fall back
# to the first GPU, or to the CPU, so the cell also runs on machines with
# fewer devices instead of failing when the model is moved.
if torch.cuda.device_count() > 1:
    device = "cuda:1"
elif torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Pick a pretrained model
model_name = "slow_r50"

# Local path to the parent folder of hubconf.py in the pytorchvideo codebase
path = '../'
model = torch.hub.load(path, source="local", model=model_name, pretrained=True)

# Set to eval mode and move to desired device
model = model.eval()
model = model.to(device)

### Define the transformations for the input required by the model

Before passing the video into the model we need to apply some input transforms and sample a clip of the correct duration. 

In [5]:
# Spatial and normalization settings
side_size = 256
crop_size = 256
mean = [0.45, 0.45, 0.45]
std = [0.225, 0.225, 0.225]

# Temporal sampling settings
num_frames = 8
sampling_rate = 8
frames_per_second = 30

# Note that this transform is specific to the slow_R50 model.
# If you want to try another of the torch hub models you will need to modify this transform
_video_steps = [
    UniformTemporalSubsample(num_frames),
    # Divide every value by 255.0 before normalizing with mean/std
    Lambda(lambda x: x/255.0),
    NormalizeVideo(mean, std),
    ShortSideScale(size=side_size),
    CenterCropVideo(crop_size=(crop_size, crop_size)),
]

# Only the "video" entry of the clip dictionary goes through the steps above
transform = ApplyTransformToKey(key="video", transform=Compose(_video_steps))

# The duration of the input clip is also specific to the model:
# num_frames * sampling_rate frames at frames_per_second (64 / 30 seconds here).
clip_duration = (num_frames * sampling_rate)/frames_per_second

### Load an example video
We can test the classification of an example video from the Kinetics validation set, such as this [archery video](https://www.youtube.com/watch?v=3and4vWkW4s).

In [6]:
# Download the example video file
# !wget https://dl.fbaipublicfiles.com/pytorchvideo/projects/archery.mp4 

In [7]:
# Load the example video
# video_path = "archery.mp4"  

# Select the duration of the clip to load by specifying the start and end duration
# The start_sec should correspond to where the action occurs in the video
# start_sec = 0

In [8]:
def get_inp(video_path, start_sec, clip_duration):
    """Load one clip from a video file and prepare it as model input.

    Args:
        video_path: Path of the video, passed to ``EncodedVideo.from_path``.
        start_sec: Start of the clip.
        clip_duration: Length of the clip; the clip ends at
            ``start_sec + clip_duration``.

    Returns:
        The ``"video"`` entry of the transformed clip, moved to the
        module-level ``device``.
    """
    clip_end = start_sec + clip_duration

    # Open the file and request the [start_sec, clip_end] window
    clip = EncodedVideo.from_path(video_path).get_clip(
        start_sec=start_sec,
        end_sec=clip_end,
    )

    # Run the module-level ``transform`` on the clip, then keep only the
    # video entry and place it on the target device
    return transform(clip)["video"].to(device)

In [9]:
# clip_duration

In [10]:
# inputs = get_inp(video_path, start_sec, clip_duration)

### Get model predictions

In [11]:
def get_preds(inputs, model, kinetics_id_to_classname, k=5):
    """Classify one clip and return its top-``k`` labels with their scores.

    Args:
        inputs: A single clip tensor without a batch dimension; a leading
            batch dimension of size 1 is added before the model is called.
        model: Callable mapping the batched clip to a tensor of class scores
            with the classes along dimension 1.
        kinetics_id_to_classname: Mapping from integer class id to label name.
        k: Number of predictions to return (default 5). Values larger than
            the number of classes are capped at the number of classes.

    Returns:
        A tuple ``(pred_class_names, pred_scores)``: two lists of equal
        length, ordered from the highest to the lowest softmax score.

    Raises:
        KeyError: If a predicted class id is missing from
            ``kinetics_id_to_classname``.
    """
    # Inference only: disable autograd so no graph is recorded for the
    # forward pass.
    with torch.no_grad():
        logits = model(inputs[None, ...])

        # Turn the raw scores into probabilities over the classes
        probs = torch.softmax(logits, dim=1)

        # topk raises if k exceeds the class dimension, so cap it
        top = probs.topk(k=min(k, probs.shape[1]))

    # Only one clip was passed, so read row 0 of the batch
    pred_scores = top.values[0].tolist()

    # Map the predicted classes to the label names
    pred_class_names = [
        kinetics_id_to_classname[class_id]
        for class_id in top.indices[0].tolist()
    ]
    return (pred_class_names, pred_scores)

In [12]:
def full_pipe(video_path, start_sec, clip_duration, model, kinetics_id_to_classname):
    """Run the whole clip-to-prediction pipeline for one video.

    The clip is loaded with ``get_inp`` and then classified with
    ``get_preds``.

    Returns:
        Whatever ``get_preds`` returns for the loaded clip.
    """
    clip = get_inp(video_path, start_sec, clip_duration)
    return get_preds(clip, model, kinetics_id_to_classname)

### Visualize

In [13]:
from IPython.display import HTML
from pathlib import Path

In [14]:
# video_path = '../../temp_dir2/archery.mp4'
# !cp {video_path} ./yt_vids/
# start_sec = 0
# pred_class_names = full_pipe(video_path, start_sec, clip_duration, model, kinetics_id_to_classname)

In [15]:
# from pathlib import Path

In [16]:
# video_path_html = Path('yt_vids') / Path(video_path).name

In [17]:
# video_path_html

In [18]:
def display_html(video_path_html, start_sec, end_sec, pred_class_names):
    """Build an HTML snippet with a video player and the predictions below it.

    Args:
        video_path_html: Value used as the ``src`` of the video source.
        start_sec: First value of the ``#t=start,end`` fragment appended to
            the source.
        end_sec: Second value of the ``#t=start,end`` fragment.
        pred_class_names: Object rendered after "Predicted Classes:".

    Returns:
        An ``HTML`` display object wrapping the generated markup.
    """
    markup = f"""
<video width="320" height="240" controls>
  <source src="{video_path_html}#t={start_sec},{end_sec}" type="video/mp4">
</video>
<br>
<span> Predicted Classes: {pred_class_names} </span>
"""
    return HTML(markup)

In [19]:
def viz(video_path, start_sec, clip_duration, model, kinetics_id_to_classname):
    """Classify a clip of ``video_path`` and show it next to the predictions.

    The video is copied into ``./yt_vids/`` and the returned HTML refers to
    that copy through the relative path ``yt_vids/<file name>``.

    Args:
        video_path: Path of the video file to classify and display.
        start_sec: Start of the clip.
        clip_duration: Length of the clip; the player fragment ends at
            ``start_sec + clip_duration``.
        model: Model forwarded to ``full_pipe``.
        kinetics_id_to_classname: Mapping from class id to label name,
            forwarded to ``full_pipe``.

    Returns:
        The object produced by ``display_html`` for the copied video and the
        list of ``(class name, score)`` pairs.
    """
    import shutil  # local import: only this function needs it

    # Copy with the standard library instead of the ``!cp`` shell magic: it
    # works outside IPython, handles paths containing spaces, and the target
    # folder is created when it does not exist yet.
    html_dir = Path('yt_vids')
    html_dir.mkdir(parents=True, exist_ok=True)
    video_path_html = html_dir / Path(video_path).name
    try:
        shutil.copy(video_path, video_path_html)
    except shutil.SameFileError:
        # The video already is the file inside yt_vids; nothing to copy.
        pass

    pred_class_names, pred_scores = full_pipe(video_path, start_sec, clip_duration, model, kinetics_id_to_classname)
    pred_cls_scores = list(zip(pred_class_names, pred_scores))
    print(pred_class_names, pred_scores)
    return display_html(video_path_html, start_sec, start_sec + clip_duration, pred_cls_scores)

In [20]:
# Relative path of the example video that is passed to viz
video_path = '../../temp_dir2/archery.mp4'

In [25]:
# Classify the clip that starts at second 4 and display it with the predictions
viz(video_path, 4, clip_duration, model, kinetics_id_to_classname)

['archery', 'throwing axe', 'stretching arm', 'playing paintball', 'golf driving'] [1.0, 6.124903375269271e-22, 4.0900786258189156e-25, 2.903209968671495e-26, 2.6353279045492036e-26]


In [None]:
# Classify the clip that starts at second 4 of the current video_path
viz(video_path, 4, clip_duration, model, kinetics_id_to_classname)

In [42]:
# Try another video. NOTE: this absolute path is machine-specific; point it
# at a local video file before running the cell.
video_path = '/home/arkas/Datasets/vsitu_data/vsitu_trimmed_videos_mp4/v_-33sUQVbQ24_seg_100_110_trimmed.mp4'
# Classify the clip that starts at second 6 and display it with the predictions
viz(video_path, 6, clip_duration, model, kinetics_id_to_classname)

['surfing water', 'water sliding', 'water skiing', 'canoeing or kayaking', 'tobogganing'] [0.3097037672996521, 0.2552003562450409, 0.09520424902439117, 0.054224465042352676, 0.04071252793073654]


In [32]:
# Show the raw class-name -> class-id mapping loaded from kinetics_classnames.json
kinetics_classnames

{'"sharpening knives"': 290,
 '"eating ice cream"': 115,
 '"cutting nails"': 81,
 '"changing wheel"': 53,
 '"bench pressing"': 19,
 'deadlifting': 88,
 '"eating carrots"': 111,
 'marching': 192,
 '"throwing discus"': 358,
 '"playing flute"': 231,
 '"cooking on campfire"': 72,
 '"breading or breadcrumbing"': 33,
 '"playing badminton"': 218,
 '"ripping paper"': 276,
 '"playing saxophone"': 244,
 '"milking cow"': 197,
 '"juggling balls"': 169,
 '"flying kite"': 130,
 'capoeira': 43,
 '"making jewelry"': 187,
 'drinking': 100,
 '"playing cymbals"': 228,
 '"cleaning gutters"': 61,
 '"hurling (sport)"': 161,
 '"playing organ"': 239,
 '"tossing coin"': 361,
 'wrestling': 395,
 '"driving car"': 103,
 'headbutting': 150,
 '"gymnastics tumbling"': 147,
 '"making bed"': 186,
 'abseiling': 0,
 '"holding snake"': 155,
 '"rock climbing"': 278,
 '"cooking egg"': 71,
 '"long jump"': 182,
 '"bee keeping"': 17,
 '"trimming or shaving beard"': 365,
 '"cleaning shoes"': 63,
 '"dancing gangnam style"': 86,