In [6]:
!pip install av
!conda install -y -c conda-forge ipywidgets
!jupyter nbextension enable --py widgetsnbextension

Looking in indexes: http://mirrors.tencentyun.com/pypi/simple
Collecting av
  Downloading http://mirrors.tencentyun.com/pypi/packages/41/b7/4b1095af7f8e87c0f54fc0a3de9472d09583eaf2e904a60f0817819fff11/av-8.0.3-cp36-cp36m-manylinux2010_x86_64.whl (37.2 MB)
[K     |████████████████████████████████| 37.2 MB 499 kB/s eta 0:00:01    |█▋                              | 1.9 MB 8.6 MB/s eta 0:00:05     |██████████████████████████████▊ | 35.8 MB 499 kB/s eta 0:00:03
[?25hInstalling collected packages: av
Successfully installed av-8.0.3


In [1]:
import os
import torch
import torch.nn as nn
import numpy as np
from typing import Dict, Union
from tqdm import tqdm
import models.r21d.transforms.rgb_transforms as T
from torchvision.models.video import r2plus1d_18
from torchvision.transforms import Compose
from torchvision.io.video import read_video
from utils.utils import (action_on_extraction, form_list_from_user_input,
                         form_slices, reencode_video_with_diff_fps, show_predictions_on_dataset)
# --- Preprocessing configuration used by ExtractR21D's transform pipeline ---

# Size handed to T.Resize before the central crop is taken.
# NOTE(review): presumably (height, width) -- confirm against T.Resize.
PRE_CENTRAL_CROP_SIZE = (128, 171)

# Statistics handed to T.Normalize (one value per channel).
KINETICS_MEAN = [
    0.43216,
    0.394666,
    0.37645,
]
KINETICS_STD = [
    0.22803,
    0.22145,
    0.216989,
]

# Side length of the square region kept by T.CenterCrop.
CENTRAL_CROP_MIN_SIDE_SIZE = 112

# --- Fallbacks used when step_size / stack_size are not given to ExtractR21D ---
DEFAULT_R21D_STEP_SIZE = 16
DEFAULT_R21D_STACK_SIZE = 16
class ExtractR21D(nn.Module):
    """Feature extractor built on torchvision's pretrained ``r2plus1d_18``.

    The network's final ``fc`` layer is replaced by ``Identity`` so that calling
    the model returns the features that used to feed the classifier; the
    original ``fc`` layer is kept as ``model_class`` and is only used when
    ``show_pred`` is enabled.

    Calling the module (``extractor(indices)``) walks over the given positions
    of the directory listing, skips videos whose output file already exists,
    extracts features for the rest and hands them to ``action_on_extraction``.

    Args:
        video_path: directory whose entries are treated as the videos to process.
        output_path: directory passed to ``action_on_extraction`` and used for
            the "already extracted" check.
        step_size: step passed to ``form_slices``; defaults to
            ``DEFAULT_R21D_STEP_SIZE`` when ``None``.
        stack_size: stack size passed to ``form_slices``; defaults to
            ``DEFAULT_R21D_STACK_SIZE`` when ``None``.
        tmp_path: directory handed to ``reencode_video_with_diff_fps``; it is
            only used when ``extraction_fps`` is set to something other than
            ``None``.
    """

    # Positions in ``file_names_list`` that ``forward`` never processes.
    # NOTE(review): 36 was hard-coded in the original loop, presumably because
    # that video made the transform step blow up. The position depends on the
    # order returned by ``os.listdir`` -- confirm it is still the right entry.
    SKIP_INDICES = frozenset({36})

    def __init__(self, video_path, output_path,
                 step_size=None, stack_size=None, tmp_path='./tmp'):
        super(ExtractR21D, self).__init__()
        self.file_names_list = os.listdir(video_path)
        # os.path.join works whether or not ``video_path`` ends with a separator.
        self.file_path_list = [os.path.join(video_path, file_name)
                               for file_name in self.file_names_list]
        self.step_size = DEFAULT_R21D_STEP_SIZE if step_size is None else step_size
        self.stack_size = DEFAULT_R21D_STACK_SIZE if stack_size is None else stack_size
        self.on_extraction = 'save_numpy'
        self.transforms = Compose([
            T.ToFloatTensorInZeroOne(),
            T.Resize(PRE_CENTRAL_CROP_SIZE),
            T.Normalize(mean=KINETICS_MEAN, std=KINETICS_STD),
            T.CenterCrop((CENTRAL_CROP_MIN_SIDE_SIZE, CENTRAL_CROP_MIN_SIDE_SIZE))
        ])
        self.show_pred = False
        self.output_path = output_path
        # ``extract`` reads ``tmp_path`` when ``extraction_fps`` is set; it was
        # previously never defined, which raised AttributeError in that case.
        self.tmp_path = tmp_path
        self.extraction_fps = None
        self.feature_type = 'r21d_rgb'
        self.model = r2plus1d_18(pretrained=True)
        # Keep the classification head aside and make the backbone emit features.
        self.model_class = self.model.fc
        self.model.fc = torch.nn.Identity()
        # Modules are created in training mode. Switch to inference mode so the
        # BatchNorm layers use their pretrained running statistics instead of
        # per-clip batch statistics (and do not update them during extraction).
        self.eval()

    @staticmethod
    def _feature_path(output_path, file_name):
        """Return the ``.npy`` path checked for an already-extracted video.

        Only the final extension of ``file_name`` is replaced. (``str.strip('.mp4')``
        removes any of the characters ``.``, ``m``, ``p``, ``4`` from BOTH ends,
        which mangles names that start or end with one of them.)
        NOTE(review): assumes ``action_on_extraction`` writes ``<stem>.npy``
        into ``output_path`` -- confirm against that helper.
        """
        stem, _extension = os.path.splitext(file_name)
        return os.path.join(output_path, stem + '.npy')

    def forward(self, indices):
        """Extract and store features for the videos at the given positions.

        Args:
            indices: tensor of integer positions into ``file_path_list``. Its
                ``.device`` is also the device the clips are moved to before
                being fed to the model.

        Returns:
            None. Results are handed to ``action_on_extraction``.
        """
        device = indices.device
        for idx in tqdm(indices):
            # Plain int: usable for list indexing and set membership.
            idx = int(idx)
            if idx in self.SKIP_INDICES:
                continue
            feature_path = self._feature_path(self.output_path, self.file_names_list[idx])
            if os.path.exists(feature_path):
                print(feature_path, ' alread exists')
                continue

            video_file = self.file_path_list[idx]
            feats_dict = self.extract(device, self.model, self.model_class, video_file)
            action_on_extraction(feats_dict, video_file, self.output_path, self.on_extraction)

    def extract(self, device: torch.device, model: torch.nn.Module, classifier: torch.nn.Module,
                video_path: Union[str, None] = None
                ) -> Dict[str, np.ndarray]:
        """Compute features for a single video.

        Args:
            device: device each clip is moved to before the forward pass.
            model: network applied to each clip.
            classifier: head applied to the model output when ``show_pred`` is on.
            video_path: path of the video to read.

        Returns:
            A dict with a single entry, ``self.feature_type``, holding a numpy
            array built from the model outputs of all clips, in slice order.
        """
        # take the video, change fps and save to the tmp folder
        if self.extraction_fps is not None:
            video_path = reencode_video_with_diff_fps(video_path, self.tmp_path, self.extraction_fps)

        # read a video (audio and metadata are not used)
        rgb, _audio, _info = read_video(video_path, pts_unit='sec')
        # prepare data (first -- transform, then -- unsqueeze)
        # mafp: this step blew up
        # NOTE(review): presumably out of memory, since the whole video is
        # transformed at once -- confirm.
        rgb = self.transforms(rgb)
        rgb = rgb.unsqueeze(0)
        # Split dimension 2 into (start, end) windows.
        # NOTE(review): assumes the transformed tensor is (1, C, T, H, W) -- confirm.
        slices = form_slices(rgb.size(2), self.stack_size, self.step_size)

        vid_feats = []

        # inference only: no autograd bookkeeping for any clip
        with torch.no_grad():
            for start_idx, end_idx in slices:
                output = model(rgb[:, :, start_idx:end_idx, :, :].to(device))
                vid_feats.extend(output.tolist())

                # show predictions on kinetics dataset (might be useful for debugging)
                if self.show_pred:
                    logits = classifier(output)
                    print(f'{video_path} @ frames ({start_idx}, {end_idx})')
                    show_predictions_on_dataset(logits, 'kinetics')

        feats_dict = {
            self.feature_type: np.array(vid_feats),
        }

        return feats_dict

In [2]:
# Where the input videos live and where the extracted features should go.
video_path = '/home/tione/notebook/dataset/videos/video_5k/train_5k/'
# video_path = './extractor_test/'
output_path = '/home/tione/notebook/dataset/r21d/'

# Building the extractor lists `video_path` and loads the pretrained network.
extractor = ExtractR21D(
    video_path=video_path,
    output_path=output_path,
)

In [3]:
# One index per listed video, created directly on the GPU: ExtractR21D.forward
# reads the compute device from this tensor.
indices = torch.arange(len(extractor.file_path_list), device='cuda')

# Move the extractor's modules to the GPU as well (the returned module is the
# cell's displayed value).
extractor.to('cuda')

ExtractR21D(
  (model): VideoResNet(
    (stem): R2Plus1dStem(
      (0): Conv3d(3, 45, kernel_size=(1, 7, 7), stride=(1, 2, 2), padding=(0, 3, 3), bias=False)
      (1): BatchNorm3d(45, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU(inplace=True)
      (3): Conv3d(45, 64, kernel_size=(3, 1, 1), stride=(1, 1, 1), padding=(1, 0, 0), bias=False)
      (4): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (5): ReLU(inplace=True)
    )
    (layer1): Sequential(
      (0): BasicBlock(
        (conv1): Sequential(
          (0): Conv2Plus1D(
            (0): Conv3d(64, 144, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1), bias=False)
            (1): BatchNorm3d(144, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv3d(144, 64, kernel_size=(3, 1, 1), stride=(1, 1, 1), padding=(1, 0, 0), bias=False)
          )
          (1): BatchNorm3d(64, e

In [4]:
# Debugging aid: show the path stored at position 35 of the extractor's file list.
extractor.file_path_list[35]

'/home/tione/notebook/dataset/videos/video_5k/train_5k/bbb760adf099faa1515f3a1400363465.mp4'

In [None]:
# Run the extraction over every index. ExtractR21D.forward reports and skips
# videos whose output file already exists, and shows progress through tqdm.
extractor(indices)

  0%|          | 1/5000 [00:27<37:46:58, 27.21s/it]

/home/tione/notebook/dataset/r21d/3cacaddece4c28adeb2c4af9dbd0e89c.npy  alread exists
/home/tione/notebook/dataset/r21d/d6003f9a5fb29ca73cc64d9895f0d1ae.npy  alread exists
/home/tione/notebook/dataset/r21d/035bb046761f754705902e6e3c6133e7.npy  alread exists
/home/tione/notebook/dataset/r21d/983f03fbaa0022b18ca8a687c9fbd535.npy  alread exists
/home/tione/notebook/dataset/r21d/5d4596503b99ee8f1342543a71b5627f.npy  alread exists
/home/tione/notebook/dataset/r21d/c2906db45588e9648f37770e6e3f2577.npy  alread exists


  0%|          | 8/5000 [00:33<26:45:50, 19.30s/it]

/home/tione/notebook/dataset/r21d/a5ef52e90254ba404d388f520d1c084e.npy  alread exists
/home/tione/notebook/dataset/r21d/aa19531cb0524686eefcca8005e4bdb8.npy  alread exists
/home/tione/notebook/dataset/r21d/02d1949d6153d625a78721cf75130d5f.npy  alread exists
/home/tione/notebook/dataset/r21d/0015c7e3437265d2f49b0263675bdf52.npy  alread exists
/home/tione/notebook/dataset/r21d/64034291d857f775a248daf152fe4813.npy  alread exists
/home/tione/notebook/dataset/r21d/050d3d2583b4bf4c5ee28952df022cbe.npy  alread exists
/home/tione/notebook/dataset/r21d/ec242567405100af68e9ae00aa53fa23.npy  alread exists
/home/tione/notebook/dataset/r21d/569661cce63d3e3e8ec552ad643122d7.npy  alread exists
/home/tione/notebook/dataset/r21d/90eef2b2560c92a0ab34d66ec1b842a9.npy  alread exists
/home/tione/notebook/dataset/r21d/85de53f21dee3db750dcf83ffa1fd93b.npy  alread exists
/home/tione/notebook/dataset/r21d/30915e8ffa54212794bc4843e13a0e3f.npy  alread exists
/home/tione/notebook/dataset/r21d/2b59602855f68ff5d6d4

  0%|          | 24/5000 [00:53<19:11:29, 13.88s/it]

/home/tione/notebook/dataset/r21d/cd648426ded2af9e4c3761adf3a6b507.npy  alread exists
/home/tione/notebook/dataset/r21d/ed39314ca9ce1a3498bc029966ba23a8.npy  alread exists
/home/tione/notebook/dataset/r21d/7c56c996c8fea236b30e5b7b3f42ee16.npy  alread exists
/home/tione/notebook/dataset/r21d/c271d20ac0fc5d22bd0a3a7410ff670b.npy  alread exists
/home/tione/notebook/dataset/r21d/73e4fe01ec576541a9d130e49d8ac599.npy  alread exists
/home/tione/notebook/dataset/r21d/b0154c94c989a90e673c910406dadf69.npy  alread exists
/home/tione/notebook/dataset/r21d/fe770d133b6d2ecc985f998bdf89691b.npy  alread exists
/home/tione/notebook/dataset/r21d/e8c560d99cbc3857d3a37630ab6151f5.npy  alread exists
/home/tione/notebook/dataset/r21d/09ffa72f6fa2f793bfbce98628ff00f5.npy  alread exists
/home/tione/notebook/dataset/r21d/a4a71f753d0f8c55b70c9fd2ea5d3a40.npy  alread exists


  1%|          | 35/5000 [01:21<14:28:18, 10.49s/it]

/home/tione/notebook/dataset/r21d/bbb760adf099faa1515f3a1400363465.npy  alread exists


  1%|          | 39/5000 [01:46<10:16:54,  7.46s/it]

In [7]:
# Debugging aid: decode the first listed video outside the extractor, using the
# same read_video call as ExtractR21D.extract.
rgb, audio, info = read_video(extractor.file_path_list[0], pts_unit='sec')

In [None]:
# Debugging aid: apply the extractor's preprocessing pipeline to the decoded frames.
# NOTE(review): this rebinds `rgb`, so re-running this cell without first
# re-running the read_video cell feeds already-transformed data back in.
rgb = extractor.transforms(rgb)