In [9]:
!pip install timm

Looking in indexes: http://mirrors.tencentyun.com/pypi/simple
Collecting timm
  Downloading http://mirrors.tencentyun.com/pypi/packages/ee/08/1ccaf8d516935666b7fa5f6aaddf157c66208ea0c93bb847ae09f166354f/timm-0.4.9-py3-none-any.whl (346 kB)
[K     |████████████████████████████████| 346 kB 7.9 MB/s eta 0:00:01
Installing collected packages: timm
Successfully installed timm-0.4.9


In [14]:
import os
import cv2
video_path = '../dataset/videos/video_5k/train_5k/'
file_names = os.listdir(video_path)
save_path = '../dataset/dense_frames/train_5k/'

In [15]:
import torch
import torch.nn as nn
from torchvision import datasets, transforms as T
import torchvision.models as models
import albumentations as A
from albumentations.pytorch import ToTensorV2
from albumentations import ImageOnlyTransform
transform = T.Compose([T.Resize(256), T.CenterCrop(224), T.ToTensor()])
A_transform = A.Compose([
            A.Resize(256,256),
            A.Normalize(
                mean=[0.485, 0.456, 0.406],  # mean on ImageNet
                std=[0.229, 0.224, 0.225],  # std on ImageNet
            ),
            ToTensorV2(),
        ])

In [16]:
import cv2
import sys
import numpy as np
import torch
import torch.nn as nn
def frame_iterator_list(filename, every_ms=1000, max_num_frames=300):
    video_capture = cv2.VideoCapture()
    if not video_capture.open(filename):
        print(sys.stderr, 'Error: Cannot open video file ' + filename)
        return
    last_ts = -99999  # The timestamp of last retrieved frame.
    num_retrieved = 0

    frame_all = []
    while num_retrieved < max_num_frames:
        # Skip frames
        while video_capture.get(cv2.CAP_PROP_POS_MSEC) < every_ms + last_ts:
            if not video_capture.read()[0]:
                return frame_all

        last_ts = video_capture.get(cv2.CAP_PROP_POS_MSEC)
        has_frames, frame = video_capture.read()
        if not has_frames:
            break
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        frame = frame[:, :, ::-1]
        augmented = A_transform(image = frame)
        frame = augmented['image'].unsqueeze(0)
        frame_all.append(frame)
        num_retrieved += 1

    return frame_all

In [17]:
import timm
class CustomResNext(nn.Module):
    def __init__(self):
        super(CustomResNext, self).__init__()
        self.model = timm.create_model('resnext50_32x4d',pretrained=True)
        n_features = self.model.fc.in_features
        self.model.fc = nn.Identity()
    def forward(self,x):
        x = self.model(x)
        return x
model = CustomResNext()
model.eval()
model.to('cuda')

CustomResNext(
  (model): ResNet(
    (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (act1): ReLU(inplace=True)
    (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (layer1): Sequential(
      (0): Bottleneck(
        (conv1): Conv2d(64, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act1): ReLU(inplace=True)
        (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
        (bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act2): ReLU(inplace=True)
        (conv3): Conv2d(128, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_

In [18]:
from tqdm import tqdm
for file_name in tqdm(file_names):
    video_id = file_name[:-4]
    save_name = save_path+video_id+'.npy'
    if(os.path.exists(save_name)):
        continue
    frame_list = frame_iterator_list(video_path+file_name,every_ms=500)
    video_feature = []
    frame_batch = torch.cat(frame_list,dim=0).cuda()
    pred = model(frame_batch)
    # print(pred.shape)
    video_feature = pred.cpu().detach().numpy()
    np.save(save_name,video_feature)
    # break

  0%|          | 1/5000 [00:06<9:04:32,  6.54s/it]


RuntimeError: CUDA out of memory. Tried to allocate 58.00 MiB (GPU 0; 31.72 GiB total capacity; 30.68 GiB already allocated; 34.88 MiB free; 30.71 GiB reserved in total by PyTorch)

In [1]:
# 制作相应的路径文件txt
import linecache
import os
file_name = '../dataset/tagging/GroundTruth/datafile/val_resnet.txt'
train_good = '/home/tione/notebook/dataset/tagging/GroundTruth/datafile/train_good.txt'
val_good = '/home/tione/notebook/dataset/tagging/GroundTruth/datafile/val_good.txt'
data_num_per_sample  = 6

with open(file_name,'w') as f:
    for index in range(500):
        data_list = []
        for line_i in range(data_num_per_sample*index+1,data_num_per_sample*(index+1)):
            line = linecache.getline(val_good,line_i)
            line = line.strip('\r\n')
            data_list.append(line)
        video_file_name = os.path.basename(data_list[0])
        data_list[0] = '../dataset/frames/train_5k/'+video_file_name
        
        for i in range(len(data_list)):
            f.write(data_list[i]+'\r\n')
        f.write('\r\n')

In [4]:
# 制作相应的路径文件txt
import linecache
import os
file_name = '../dataset/tagging/GroundTruth/datafile/self_sup_VAT_R.txt'
selfsup_file = './self_sup_VAT.txt'
data_num_per_sample  = 4

with open(file_name,'w') as f:
    for index in range(10000):
        data_list = []
        for line_i in range(data_num_per_sample*index+1,data_num_per_sample*(index+1)):
            line = linecache.getline(selfsup_file,line_i)
            line = line.strip('\r\n')
            data_list.append(line)
        video_file_name = os.path.basename(data_list[0])
        if(data_list[0].split('/')[3]=='tagging_dataset_train_5k'):
            data_list[0] = '../dataset/frames/train_5k/'+video_file_name
        else:
            data_list[0] = '../dataset/frames/test_5k/'+video_file_name
        for i in range(len(data_list)):
            f.write(data_list[i]+'\r\n')
        f.write('\r\n')

In [11]:
count = len(open(selfsup_file,'rU').readlines())

  if __name__ == '__main__':


In [12]:
count

36315

In [1]:
from dataloader.dataloader import TestingDataset,MultimodaFeaturesDataset
import yaml
import linecache
config_path = './config/config.yaml'
config = yaml.load(open(config_path))
dataset = TestingDataset(config['DatasetConfig'])
train_full_path = '/home/tione/notebook/dataset/tagging/GroundTruth/datafile/train_full.txt'
train_test = './self_sup_VAT.txt'
with open(train_test,'w') as f:
    for index in range(5000):
        data_list = []
        for line_i in range(6*index+1,6*(index+1)):
            line = linecache.getline(train_full_path,line_i)
            line = line.strip('\r\n')
            data_list.append(line)
        video_path = data_list[0]+'\r\n'
        audio_path = data_list[1]+'\r\n'
        text_path = data_list[3]+'\r\n'
        f.write(video_path)
        f.write(audio_path)
        f.write(text_path)
        f.write('\r\n')
with open(train_test,'a') as f:
    for index in range(5000):
        feat_dict = dataset[index]
        video_path = feat_dict['video_path']+'\r\n'
        audio_path = feat_dict['audio_path']+'\r\n'
        text_path = feat_dict['text_path']+'\r\n'
        f.write(video_path)
        f.write(audio_path)
        f.write(text_path)
        f.write('\r\n')



In [2]:
count = len(open('./self_sup_VAT.txt','rU').readlines())

  if __name__ == '__main__':


In [3]:
count

40000