# ref https://github.com/facebookresearch/pytorchvideo/blob/master/tutorials/video_detection_example/video_detection_inference_tutorial.ipynb

Download the Detectron2 source zip to Google Drive (cloud storage):
https://github.com/facebookresearch/detectron2

In [1]:
!python -m pip install 'git+https://github.com/facebookresearch/detectron2.git'

Collecting git+https://github.com/facebookresearch/detectron2.git
  Cloning https://github.com/facebookresearch/detectron2.git to /tmp/pip-req-build-ciyq1oh1
  Running command git clone -q https://github.com/facebookresearch/detectron2.git /tmp/pip-req-build-ciyq1oh1
Collecting yacs>=0.1.6
  Downloading yacs-0.1.8-py3-none-any.whl (14 kB)
Collecting fvcore<0.1.6,>=0.1.5
  Downloading fvcore-0.1.5.post20210825.tar.gz (49 kB)
[K     |████████████████████████████████| 49 kB 2.8 MB/s 
[?25hCollecting iopath<0.1.10,>=0.1.7
  Downloading iopath-0.1.9-py3-none-any.whl (27 kB)
Collecting omegaconf>=2.1
  Downloading omegaconf-2.1.1-py3-none-any.whl (74 kB)
[K     |████████████████████████████████| 74 kB 3.2 MB/s 
[?25hCollecting hydra-core>=1.1
  Downloading hydra_core-1.1.1-py3-none-any.whl (145 kB)
[K     |████████████████████████████████| 145 kB 40.5 MB/s 
[?25hCollecting black==21.4b2
  Downloading black-21.4b2-py3-none-any.whl (130 kB)
[K     |████████████████████████████████| 130 

In [2]:
# Mount Google Drive at /content/gdrive (a remount is forced) so that the
# project files stored on the drive can be accessed from this runtime.
from google.colab import drive
drive.mount("/content/gdrive", force_remount=True)

Mounted at /content/gdrive


In [3]:
# Switch to the project folder on the mounted drive and list its contents.
# Later cells use paths relative to this folder (e.g. ava_action_list.txt, ./frames).
%cd /content/gdrive/MyDrive/slowfast
%ls

/content/gdrive/MyDrive/slowfast
Anaconda3-5.2.0-Linux-x86_64.sh  IMG_2291.MOV
ava_action_list.txt              IMG_4515.MOV
ckpt.t7                          [0m[01;34m__pycache__[0m/
darknet.py                       slowfast_colab.ipynb
DeepSort_and_KeyPoints.ipynb     slowfast_HIIT.avi
deep_sort_pytorch-master.zip     Slow-Fast-pytorch-implementation-master.zip
deepsort_slowfast.ipynb          slowfast_weight.pth
detectron2-master.zip            visualization.py
fasterRCNN_slowfast.ipynb        workout.mp4
[01;34mframes[0m/                          yolov3.cfg
IMG_2291.avi                     yolov3.weights


In [4]:
# Run the local visualization.py as a script (VideoVisualizer is imported from it further down).
# NOTE(review): the saved notebook shows no output for this cell; presumably a sanity check. Confirm it is still needed.
!python visualization.py

In [5]:
!pip install pytorchvideo

Collecting pytorchvideo
  Downloading pytorchvideo-0.1.2.tar.gz (115 kB)
[K     |████████████████████████████████| 115 kB 5.3 MB/s 
Collecting av
  Downloading av-8.0.3-cp37-cp37m-manylinux2010_x86_64.whl (37.2 MB)
[K     |████████████████████████████████| 37.2 MB 29 kB/s 
[?25hCollecting parameterized
  Downloading parameterized-0.8.1-py2.py3-none-any.whl (26 kB)
Building wheels for collected packages: pytorchvideo
  Building wheel for pytorchvideo (setup.py) ... [?25l[?25hdone
  Created wheel for pytorchvideo: filename=pytorchvideo-0.1.2-py3-none-any.whl size=166136 sha256=67ae8227e320775dd45703fd5c8b5e9aa8a9720d98d07aa403348765ac5c686f
  Stored in directory: /root/.cache/pip/wheels/4c/f5/ee/ee524d0e7df398623c64816181cd6a99fb9b7eab56d0608ace
Successfully built pytorchvideo
Installing collected packages: parameterized, av, pytorchvideo
Successfully installed av-8.0.3 parameterized-0.8.1 pytorchvideo-0.1.2


In [6]:
from functools import partial
import numpy as np
import imageio
import cv2
import torch

import detectron2
from detectron2.config import get_cfg
from detectron2 import model_zoo
from detectron2.engine import DefaultPredictor

import pytorchvideo
from pytorchvideo.transforms.functional import (
    uniform_temporal_subsample,
    short_side_scale_with_boxes,
    clip_boxes_to_image,
)
from torchvision.transforms._functional_video import normalize
from pytorchvideo.data.ava import AvaLabeledVideoFramePaths
from pytorchvideo.models.hub import slow_r50_detection,slowfast_r50_detection # Another option is slowfast_r50_detection

from visualization import VideoVisualizer

  "The _functional_video module is deprecated. Please use the functional module instead."


#Load Model using Torch Hub API

PyTorchVideo provides several pretrained models through Torch Hub. Available models are described in model zoo documentation.

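Here we select the slowfast_r50_detection model (the SLOWFAST_8x8_R50 detection checkpoint from the AVA model zoo). The lighter slow_r50_detection model, which was trained using a 4x16 setting on the Kinetics 400 dataset and fine-tuned on the AVA V2.2 actions dataset, is available as an alternative.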

NOTE: to run on GPU in Google Colab, in the menu bar select: Runtime -> Change runtime type -> Hardware Accelerator -> GPU

In [7]:
# Use the GPU when one is available and fall back to the CPU otherwise, so the
# notebook does not fail on runtimes without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Load the pretrained SlowFast detection model from Torch Hub.
# The Slow-only alternative is slow_r50_detection(True); when switching, also
# adjust num_frames and slow_fast_alpha in ava_inference_transform.
video_model = slowfast_r50_detection(True)
# Inference only: switch to eval mode, then move the weights to the chosen device.
video_model = video_model.eval().to(device)

Downloading: "https://dl.fbaipublicfiles.com/pytorchvideo/model_zoo/ava/SLOWFAST_8x8_R50_DETECTION.pyth" to /root/.cache/torch/hub/checkpoints/SLOWFAST_8x8_R50_DETECTION.pyth


  0%|          | 0.00/259M [00:00<?, ?B/s]

#Load an off-the-shelf Detectron2 object detector

We use the object detector to detect bounding boxes for the people. These bounding boxes are later fed into our video action detection model. For more details, please refer to Detectron2's object detection tutorials.

To install Detectron2, please follow the instructions at https://github.com/facebookresearch/detectron2/blob/master/INSTALL.md

In [8]:
# Build a Detectron2 Faster R-CNN (R50-FPN, 3x schedule) predictor from the model zoo.
cfg = get_cfg()
cfg.merge_from_file(model_zoo.get_config_file("COCO-Detection/faster_rcnn_R_50_FPN_3x.yaml"))
# Detector-side score cut-off; get_person_bboxes applies its own, stricter
# score filter on top of this one.
cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.55  # set threshold for this model
cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url("COCO-Detection/faster_rcnn_R_50_FPN_3x.yaml")
# Run the detector on the same device as the video model, so choosing 'cpu'
# for `device` works for the whole pipeline and not only for the action model.
cfg.MODEL.DEVICE = device
predictor = DefaultPredictor(cfg)

# This method takes in an image and generates the bounding boxes for people in the image.
def get_person_bboxes(inp_img, predictor, score_thresh=0.75, person_class_id=0):
    """Detect people in a single frame and return their bounding boxes.

    Args:
        inp_img: torch tensor holding one image; the notebook passes a frame
            permuted to (H, W, C). It is converted to a numpy array before
            being handed to ``predictor``.
            NOTE(review): Detectron2's DefaultPredictor documents BGR input,
            while the decoded video frames are presumably RGB; confirm the
            channel order.
        predictor: callable returning a dict whose ``'instances'`` entry
            exposes ``has()``, ``pred_boxes``, ``scores`` and ``pred_classes``.
        score_thresh: detections with a score at or below this value are dropped.
        person_class_id: class index that is kept (0 in this notebook's detector).

    Returns:
        CPU float tensor of shape (N, 4) with the kept boxes. An empty (0, 4)
        tensor is returned when the predictor output lacks boxes, scores or
        classes, so callers can rely on ``len(result) == 0`` instead of
        hitting an error on a missing field.
    """
    predictions = predictor(inp_img.cpu().detach().numpy())['instances'].to('cpu')

    required_fields = ("pred_boxes", "scores", "pred_classes")
    if not all(predictions.has(field) for field in required_fields):
        return torch.zeros((0, 4))

    # Build the selection mask purely from torch tensors; a boolean tensor is
    # the documented index type for Boxes and avoids mixing numpy with torch.
    keep = (predictions.pred_classes == person_class_id) & (predictions.scores > score_thresh)
    return predictions.pred_boxes[keep].tensor.cpu()

model_final_280758.pkl: 167MB [00:06, 25.3MB/s]                           


#Define the transformations for the input required by the model

Before passing the video and bounding boxes into the model, we need to apply some input transforms and sample the clip down to the number of frames the model expects.

Below we define a method that pre-processes the clip and bounding boxes. It generates inputs for either the Slow (ResNet) or the SlowFast model, depending on the value of the parameter slow_fast_alpha.

In [9]:
def ava_inference_transform(
    clip,
    boxes,
    #num_frames = 4, #if using slow_r50_detection, change this to 4
    num_frames = 32, #if using slowfast_r50_detection, change this to 32
    crop_size = 256,
    data_mean = [0.45, 0.45, 0.45],
    data_std = [0.225, 0.225, 0.225],
    #slow_fast_alpha = None, #if using slow_r50_detection, change this to None
    slow_fast_alpha = 4, #if using slowfast_r50_detection, change this to 4
):
    """Pre-process one video clip and its person boxes for the action model.

    Args:
        clip: video tensor with pixel values in [0, 255]. Dim 1 is treated as
            time and dims 2/3 as height/width (clips in this notebook are
            shaped (3, T, H, W)).
        boxes: array-like of [x1, y1, x2, y2] boxes in pixel coordinates of
            ``clip`` (x in [0, width], y in [0, height]).
        num_frames: number of frames kept by uniform temporal subsampling.
        crop_size: target length of the short image side after rescaling.
        data_mean: per-channel mean used for normalization.
        data_std: per-channel standard deviation used for normalization.
        slow_fast_alpha: ratio between the frame counts of the fast and slow
            pathways. ``None`` yields a single tensor instead of two pathways.

    Returns:
        Tuple ``(clip, boxes, ori_boxes)``: the transformed clip (a tensor, or
        the list ``[slow_pathway, fast_pathway]`` when ``slow_fast_alpha`` is
        set), the rescaled boxes as a torch tensor, and a numpy copy of the
        boxes as they were passed in.
    """
    boxes = np.array(boxes)
    ori_boxes = boxes.copy()

    # Subsample in time, then map pixel values from [0, 255] to [0, 1].
    frames = uniform_temporal_subsample(clip, num_frames).float() / 255.0

    # Keep the boxes inside the original frame before rescaling.
    boxes = clip_boxes_to_image(boxes, frames.shape[2], frames.shape[3])

    # Rescale so the short side equals crop_size (Non-local and STRG use 256);
    # the boxes are scaled together with the frames.
    frames, boxes = short_side_scale_with_boxes(frames, size=crop_size, boxes=boxes)

    # Standardize with the dataset mean / std.
    mean = np.array(data_mean, dtype=np.float32)
    std = np.array(data_std, dtype=np.float32)
    frames = normalize(frames, mean, std)

    # Clip once more, this time against the rescaled frame size.
    boxes = clip_boxes_to_image(boxes, frames.shape[2], frames.shape[3])

    if slow_fast_alpha is None:
        model_input = frames
    else:
        # SlowFast: the fast pathway sees every frame, the slow pathway an
        # evenly spaced subset that is slow_fast_alpha times smaller.
        num_fast_frames = frames.shape[1]
        slow_indices = torch.linspace(
            0, num_fast_frames - 1, num_fast_frames // slow_fast_alpha
        ).long()
        model_input = [torch.index_select(frames, 1, slow_indices), frames]

    return model_input, torch.from_numpy(boxes), ori_boxes

#Setup
Download the id to label mapping for the AVA V2.2 dataset on which the Torch Hub models were finetuned. This will be used to get the category label names from the predicted class ids.

Create a visualizer to visualize and plot the results (labels + bounding boxes).

In [10]:
# Create an id to label name mapping
# (read from ava_action_list.txt in the current working directory).
label_map, allowed_class_ids = AvaLabeledVideoFramePaths.read_label_map('ava_action_list.txt')
# Create a video visualizer that can plot bounding boxes and visualize actions on bboxes.
# NOTE(review): 81 is presumably the class count including the extra index-0
# column that is prepended to the predictions later; confirm against VideoVisualizer.
video_visualizer = VideoVisualizer(81, label_map, top_k=3, mode="thres",thres=0.5)

#Load an example video

In [12]:
# Load the video
# video_name is reused further down when the annotated output video is written.
video_name = './IMG_2291.MOV'
# Open the file through PyTorchVideo's EncodedVideo; clips are pulled from it with get_clip().
encoded_vid = pytorchvideo.data.encoded_video.EncodedVideo.from_path(video_name)
print('Completed loading encoded video.')

Completed loading encoded video.


In [None]:
# Inspect the tensor shape of the clip spanning seconds 0 to 1 of the video.
encoded_vid.get_clip(0,1)['video'].shape

torch.Size([3, 31, 540, 960])


#Generate bounding boxes and action predictions for all clips in the video.

In [None]:
clip_duration = 1.0 # Duration of clip used for each inference step.
gif_imgs = []  # Not filled below; retained in case other cells still reference it.
picture_count = 0  # Running index used to name the frames written to ./frames.

# Video predictions are generated at an interval of 1 sec from 0 seconds to the end of the video.
# NOTE: clips in which no person is detected are skipped entirely, so no frames
# are written for them and the frame sequence on disk can be shorter than the source video.
for time_stamp in range(int(encoded_vid.duration)):  # time stamps in video for which clip is sampled.
    print("Generating predictions for time stamp: {} sec".format(time_stamp))

    # Generate clip around the designated time stamp
    inp_imgs = encoded_vid.get_clip(
        time_stamp - clip_duration/2.0, # start second
        time_stamp + clip_duration/2.0  # end second
    )
    inp_imgs = inp_imgs['video']

    # Generate people bbox predictions using Detectron2's off-the-shelf pre-trained predictor.
    # We use the middle image in each clip to generate the bounding boxes;
    # the frame is permuted from (C, H, W) to (H, W, C) for the detector.
    inp_img = inp_imgs[:,inp_imgs.shape[1]//2,:,:]
    inp_img = inp_img.permute(1,2,0)

    # Predicted boxes are of the form List[(x_1, y_1, x_2, y_2)]
    predicted_boxes = get_person_bboxes(inp_img, predictor)
    if len(predicted_boxes) == 0:
        print("Skipping clip, no person detected at time stamp: ", time_stamp)
        continue

    # Preprocess clip and bounding boxes for video action recognition.
    inputs, inp_boxes, _ = ava_inference_transform(inp_imgs, predicted_boxes.numpy())
    # Prepend data sample id for each bounding box.
    # For more details refer to the RoIAlign in Detectron2
    inp_boxes = torch.cat([torch.zeros(inp_boxes.shape[0],1), inp_boxes], dim=1)

    # Generate action predictions for the bounding boxes in the clip.
    # The model here takes in the pre-processed video clip and the detected bounding boxes.
    if isinstance(inputs, list):
        inputs = [inp.unsqueeze(0).to(device) for inp in inputs]
    else:
        inputs = inputs.unsqueeze(0).to(device)
    # Inference only: disable autograd so no graph is kept alive for every clip.
    with torch.no_grad():
        preds = video_model(inputs, inp_boxes.to(device))

    preds = preds.to('cpu')
    # The model is trained on AVA and AVA labels are 1 indexed so, prepend 0 to convert to 0 index.
    preds = torch.cat([torch.zeros(preds.shape[0],1), preds], dim=1)

    # Plot predictions on the video and save for later visualization.
    # The visualizer is given frames as (T, H, W, C) scaled to [0, 1].
    inp_imgs = inp_imgs.permute(1,2,3,0)
    inp_imgs = inp_imgs/255.0
    out_img_pred = video_visualizer.draw_clip_range(inp_imgs, preds, predicted_boxes)

    out_img_pred = np.array(out_img_pred)
    for index in range(out_img_pred.shape[0]):
        img = (255*out_img_pred[index]).astype(np.uint8)
        # cv2.imwrite expects BGR; the frames are presumably RGB, so swap the channels.
        img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
        frame_path = './frames/output_%s.jpg'%(picture_count)
        # cv2.imwrite reports failure only through its return value
        # (e.g. when ./frames does not exist), so check it explicitly.
        if not cv2.imwrite(frame_path, img):
            raise IOError("Could not write frame to %s" % frame_path)
        picture_count += 1
print("Finished generating predictions.")

Generating predictions for time stamp: 0 sec


  b = self.tensor[item]


Generating predictions for time stamp: 1 sec
Generating predictions for time stamp: 2 sec
Generating predictions for time stamp: 3 sec
Generating predictions for time stamp: 4 sec
Generating predictions for time stamp: 5 sec
Generating predictions for time stamp: 6 sec
Generating predictions for time stamp: 7 sec
Generating predictions for time stamp: 8 sec
Generating predictions for time stamp: 9 sec
Generating predictions for time stamp: 10 sec
Generating predictions for time stamp: 11 sec
Generating predictions for time stamp: 12 sec
Generating predictions for time stamp: 13 sec
Generating predictions for time stamp: 14 sec
Finished generating predictions.


#Save predictions as video
The generated video consists of bounding boxes with predicted actions for each bounding box.

In [None]:
# Read one of the saved frames to obtain the output frame size.
img = cv2.imread('./frames/output_0.jpg')
if img is None:
    raise FileNotFoundError("No annotated frames found: ./frames/output_0.jpg is missing or unreadable")
size = (img.shape[1],img.shape[0])  # (width, height) of the saved frames

# Frame count of the source video; used only as an upper bound for the loop below.
cap = cv2.VideoCapture(video_name)
total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
cap.release()

# Take the frame rate from the source video so playback speed matches.
vid = imageio.get_reader(video_name, 'ffmpeg')
fps = vid.get_meta_data()['fps']
vid.close()

# Video codec tag; 'mp4v' matches the .mp4 container used for the output file.
fourcc = cv2.VideoWriter_fourcc(*"mp4v")
# Create the writer from (file name, codec, frame rate, frame size); the size must match the saved frames.
videoWrite = cv2.VideoWriter('./%s_modellargeoutput.mp4'%(video_name),fourcc,fps,size)

try:
    for i in range(0, total_frames):
        # Frames are named output_0.jpg, output_1.jpg, ... in consecutive order.
        fileName = "./frames/output_"+str(i)+'.jpg'
        img = cv2.imread(fileName)
        if img is None:
            # Fewer frames were written than the source video contains
            # (clips can be skipped), so the first missing index marks the end.
            break
        videoWrite.write(img)  # append the frame to the output video
finally:
    # Always finalize the file, even if reading a frame fails midway.
    videoWrite.release()

FileNotFoundError: ignored