In [6]:
!pip install av
!conda install -y -c conda-forge ipywidgets
!jupyter nbextension enable --py widgetsnbextension

Looking in indexes: http://mirrors.tencentyun.com/pypi/simple
Collecting av
  Downloading http://mirrors.tencentyun.com/pypi/packages/41/b7/4b1095af7f8e87c0f54fc0a3de9472d09583eaf2e904a60f0817819fff11/av-8.0.3-cp36-cp36m-manylinux2010_x86_64.whl (37.2 MB)
[K     |████████████████████████████████| 37.2 MB 499 kB/s eta 0:00:01    |█▋                              | 1.9 MB 8.6 MB/s eta 0:00:05     |██████████████████████████████▊ | 35.8 MB 499 kB/s eta 0:00:03
[?25hInstalling collected packages: av
Successfully installed av-8.0.3


In [5]:
import os
import torch
import torch.nn as nn
import numpy as np
from typing import Dict, Union
from tqdm import tqdm
import models.r21d.transforms.rgb_transforms as T
from torchvision.models.video import r2plus1d_18
from torchvision.transforms import Compose
from torchvision.io.video import read_video
from utils.utils import (action_on_extraction, form_list_from_user_input,
                         form_slices, reencode_video_with_diff_fps, show_predictions_on_dataset)
PRE_CENTRAL_CROP_SIZE = (128, 171)
KINETICS_MEAN = [0.43216, 0.394666, 0.37645]
KINETICS_STD = [0.22803, 0.22145, 0.216989]
CENTRAL_CROP_MIN_SIDE_SIZE = 112
DEFAULT_R21D_STEP_SIZE = 16
DEFAULT_R21D_STACK_SIZE = 16
class ExtractR21D(nn.Module):
    
    def __init__(self,video_path,output_path,
                 step_size=None,stack_size=None):
        super(ExtractR21D,self).__init__()
        self.file_names_list = os.listdir(video_path)
        self.file_path_list = [video_path+file_name for file_name in self.file_names_list]
        self.step_size = step_size
        self.stack_size = stack_size
        self.on_extraction = 'save_numpy'
        if self.step_size is None:
            self.step_size = DEFAULT_R21D_STEP_SIZE
        if self.stack_size is None:
            self.stack_size = DEFAULT_R21D_STACK_SIZE
        self.transforms = Compose([
            T.ToFloatTensorInZeroOne(),
            T.Resize(PRE_CENTRAL_CROP_SIZE),
            T.Normalize(mean=KINETICS_MEAN, std=KINETICS_STD),
            T.CenterCrop((CENTRAL_CROP_MIN_SIDE_SIZE, CENTRAL_CROP_MIN_SIDE_SIZE))
        ])
        self.show_pred = False
        self.output_path = output_path
        self.extraction_fps = None
        self.feature_type = 'r21d_rgb'
        self.model = r2plus1d_18(pretrained=True)
        self.model_class = self.model.fc
        self.model.fc = torch.nn.Identity()
        self.big_video = []
    def forward(self,indices):
        # print(indices)
        device = indices.device
        for idx in tqdm(indices):
            print(idx)
            if(os.path.exists(self.output_path+self.file_names_list[idx].strip('.mp4')+'.npy')):
                #print(self.output_path+self.file_names_list[idx].strip('.mp4')+'.npy',' alread exists')
                continue
                
            feats_dict = self.extract(device, self.model, self.model_class, self.file_path_list[idx])
            action_on_extraction(feats_dict, self.file_path_list[idx], self.output_path, self.on_extraction)
    def extract(self, device: torch.device, model: torch.nn.Module, classifier: torch.nn.Module,
                video_path: Union[str, None] = None
                ) -> Dict[str, np.ndarray]:
        # take the video, change fps and save to the tmp folder
        if self.extraction_fps is not None:
            video_path = reencode_video_with_diff_fps(video_path, self.tmp_path, self.extraction_fps)

        # read a video
        rgb, audio, info = read_video(video_path, pts_unit='sec')
        # prepare data (first -- transform, then -- unsqueeze)
        # mafp: 这一步 爆了
        if(rgb.shape[0]>1000):
            self.big_video.append(video_path)
        rgb = self.transforms(rgb)
        rgb = rgb.unsqueeze(0)
        # slice the
        slices = form_slices(rgb.size(2), self.stack_size, self.step_size)

        vid_feats = []

        for stack_idx, (start_idx, end_idx) in enumerate(slices):
            # inference
            with torch.no_grad():
                output = model(rgb[:, :, start_idx:end_idx, :, :].to(device))
                vid_feats.extend(output.tolist())

                # show predicitons on kinetics dataset (might be useful for debugging)
                if self.show_pred:
                    logits = classifier(output)
                    print(f'{video_path} @ frames ({start_idx}, {end_idx})')
                    show_predictions_on_dataset(logits, 'kinetics')

        feats_dict = {
            self.feature_type: np.array(vid_feats),
        }

        return feats_dict

In [6]:
video_path = '/home/tione/notebook/dataset/videos/video_5k/train_5k/'
# video_path = './extractor_test/'
output_path = '/home/tione/notebook/dataset/r21d/'
extractor = ExtractR21D(video_path=video_path,output_path=output_path)

In [7]:
indices = torch.arange(len(extractor.file_path_list)).to('cuda')
extractor.to('cuda')

ExtractR21D(
  (model): VideoResNet(
    (stem): R2Plus1dStem(
      (0): Conv3d(3, 45, kernel_size=(1, 7, 7), stride=(1, 2, 2), padding=(0, 3, 3), bias=False)
      (1): BatchNorm3d(45, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU(inplace=True)
      (3): Conv3d(45, 64, kernel_size=(3, 1, 1), stride=(1, 1, 1), padding=(1, 0, 0), bias=False)
      (4): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (5): ReLU(inplace=True)
    )
    (layer1): Sequential(
      (0): BasicBlock(
        (conv1): Sequential(
          (0): Conv2Plus1D(
            (0): Conv3d(64, 144, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1), bias=False)
            (1): BatchNorm3d(144, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
            (3): Conv3d(144, 64, kernel_size=(3, 1, 1), stride=(1, 1, 1), padding=(1, 0, 0), bias=False)
          )
          (1): BatchNorm3d(64, e

In [8]:
extractor.file_path_list[38]

'/home/tione/notebook/dataset/videos/video_5k/train_5k/eea68fbf63097a89806eb342f44e4352.mp4'

In [None]:
extractor(indices)

  0%|          | 0/5000 [00:00<?, ?it/s]

tensor(0, device='cuda:0')


  0%|          | 1/5000 [00:28<39:07:17, 28.17s/it]

tensor(1, device='cuda:0')
tensor(2, device='cuda:0')
tensor(3, device='cuda:0')
tensor(4, device='cuda:0')
tensor(5, device='cuda:0')
tensor(6, device='cuda:0')
tensor(7, device='cuda:0')


  0%|          | 8/5000 [00:33<27:41:06, 19.97s/it]

tensor(8, device='cuda:0')
tensor(9, device='cuda:0')
tensor(10, device='cuda:0')
tensor(11, device='cuda:0')
tensor(12, device='cuda:0')
tensor(13, device='cuda:0')
tensor(14, device='cuda:0')
tensor(15, device='cuda:0')
tensor(16, device='cuda:0')
tensor(17, device='cuda:0')
tensor(18, device='cuda:0')
tensor(19, device='cuda:0')
tensor(20, device='cuda:0')
tensor(21, device='cuda:0')
tensor(22, device='cuda:0')
tensor(23, device='cuda:0')


  0%|          | 24/5000 [00:55<19:52:16, 14.38s/it]

tensor(24, device='cuda:0')
tensor(25, device='cuda:0')
tensor(26, device='cuda:0')
tensor(27, device='cuda:0')
tensor(28, device='cuda:0')
tensor(29, device='cuda:0')
tensor(30, device='cuda:0')
tensor(31, device='cuda:0')
tensor(32, device='cuda:0')
tensor(33, device='cuda:0')
tensor(34, device='cuda:0')


  1%|          | 35/5000 [01:22<14:55:04, 10.82s/it]

tensor(35, device='cuda:0')
tensor(36, device='cuda:0')


In [7]:
rgb, audio, info = read_video(extractor.file_path_list[39], pts_unit='sec')

In [18]:
rgb = extractor.transforms(rgb)

In [8]:
rgb.shape

torch.Size([3560, 1280, 720, 3])