In [1]:
import os
import cv2
import numpy as np
from numpy import dot
from numpy.linalg import norm
import sys
import glob
import json
import h5py
import math
from tqdm import tqdm
import torch
import torchvision
import torchvision.transforms as transforms
import torchvision.models as models
import torchvision.ops.roi_align as roi_align
import pathlib
import torchvision.transforms as T

# Prefer the GPU, but fall back to the CPU so the notebook still runs
# (slowly) on a machine without CUDA instead of failing at model.to(device).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## Data Preparation

In [2]:
# Arguments
class args:
    """Static configuration namespace read (and partly overwritten) by later cells."""

    # --- dataset selection and locations ---
    msvd = True                    # True -> MSVD, False -> MSR-VTT
    dset = '../'                   # change based on dataset location
    features_path = '..'           # replaced in the configuration cell
    data_path = 'MSVD_data.json'.replace('MSVD', 'MSRVTT')  # 'MSRVTT_data.json'; replaced in the configuration cell
    msrvtt_csv = 'msrvtt.csv'      # replaced in the configuration cell (MSR-VTT only)

    # --- frame sampling options passed to the dataloader ---
    max_frames = 20
    feature_framerate = 1
    slice_framepos = 2
    eval_frame_order = 0

    # --- text / model options ---
    max_words = 32                 # replaced in the configuration cell
    cross_model = "cross-base"

    # --- misc ---
    output_dir = 'pretrained'
    cache_dir = ''
    local_rank = 0


In [3]:
# Load object detection model
# Fetch YOLOv5-L6 through torch.hub, move it to the selected device and
# switch it to inference mode; .to() and .eval() both return the module,
# so the calls can be chained.
model = (
    torch.hub.load('ultralytics/yolov5', 'yolov5l6', pretrained=True)
    .to(device)
    .eval()
)
print()

Using cache found in /home/oem/.cache/torch/hub/ultralytics_yolov5_master
[31m[1mrequirements:[0m tqdm>=4.64.0 not found and is required by YOLOv5, attempting auto-update...

[31m[1mrequirements:[0m protobuf<=3.20.1 not found and is required by YOLOv5, attempting auto-update...

[31m[1mrequirements:[0m 2 packages updated per /home/oem/.cache/torch/hub/ultralytics_yolov5_master/requirements.txt
[31m[1mrequirements:[0m ⚠️ [1mRestart runtime or rerun command for updates to take effect[0m

YOLOv5 🚀 2022-8-18 Python-3.8.8 torch-1.12.0+cu116 CUDA:0 (NVIDIA RTX A6000, 48685MiB)

Downloading https://github.com/ultralytics/yolov5/releases/download/v6.2/yolov5l6.pt to yolov5l6.pt...
ERROR: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
Re-attempting https://storage.googleapis.com/ultralytics/yolov5/v6.2/yolov5l6.pt to yolov5l6.pt...
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  C




In [4]:
# Import dataloader
# Execute the dataloader script that matches the selected dataset (args.msvd).
# NOTE(review): this relies on IPython's %run, which executes the script and
# leaves its top-level names (here the loader class used later in this
# notebook) in the interactive namespace. The tokens after the script path
# ("import <ClassName>") are not a Python import statement; presumably they
# are just forwarded to the script as command-line arguments -- confirm.
if args.msvd:
    %run ../dataloaders/dataloader_msvd.py import MSVD_Loader
else:
    %run ../dataloaders/dataloader_msrvtt.py import MSRVTT_RawDataLoader

In [5]:
# Set configuration
# Resolve dataset-specific paths under <args.dset>/dataset/<NAME>, push them
# into `args` for the dataloader, and choose the HDF5 output file.
#
# NOTE(review): the output file names say FASTERRCNN_RESNET50 although the
# detector loaded in this notebook is YOLOv5. The names are kept unchanged
# because code outside this notebook may look the files up by name -- confirm
# before renaming.
if args.msvd:

    dset_path = os.path.join(args.dset, 'dataset', 'MSVD')
    features_path = os.path.join(dset_path, 'raw')  # video .avi
    name_list = glob.glob(features_path + os.sep + '*')
    args.features_path = features_path

    # Build a lookup from the first to the last space-separated field of
    # every line in youtube_mapping.txt.
    url2id = {}
    data_path = os.path.join(dset_path, 'captions', 'youtube_mapping.txt')
    args.data_path = data_path
    # Context manager so the file handle is closed deterministically.
    with open(data_path, 'r') as mapping_file:
        for line in mapping_file:
            fields = line.strip().split(' ')
            url2id[fields[0]] = fields[-1]

    path_to_saved_models = "extracted/msvd"
    pathlib.Path(path_to_saved_models).mkdir(parents=True, exist_ok=True)
    save_file = path_to_saved_models + '/MSVD_OBJECT_FEAT_FASTERRCNN_RESNET50.hdf5'
    args.max_words = 30

else:

    dset_path = os.path.join(args.dset, 'dataset', 'MSRVTT')
    features_path = os.path.join(dset_path, 'raw')
    args.features_path = features_path
    data_path = os.path.join(dset_path, 'MSRVTT_data.json')
    args.data_path = data_path
    args.msrvtt_csv = os.path.join(dset_path, 'msrvtt.csv')
    name_list = glob.glob(features_path + os.sep + '*')

    path_to_saved_models = "extracted/msrvtt"
    pathlib.Path(path_to_saved_models).mkdir(parents=True, exist_ok=True)
    save_file = path_to_saved_models + '/MSRVTT_OBJECT_FEAT_FASTERRCNN_RESNET50.hdf5'
    args.max_words = 73

In [7]:
# Feature extractor
def save_features(mod, inp, outp):
    """Forward hook: append the hooked layer's output to the global `features` list.

    `features` must be bound to a list before the model is run; the
    extraction cell rebinds it for every video.
    """
    features.append(outp)


# Name of the module (as reported by model.model.model.named_modules())
# whose output is captured as the feature map.
layer_to_hook = 'model.11.cv2.act'

# Make this cell safe to re-run: without removing the hook registered by a
# previous execution, every forward pass would append to `features` twice and
# the per-frame lookup in the extraction loop would read the wrong entry.
_previous_hook_handle = globals().get('feature_hook_handle')
if _previous_hook_handle is not None:
    _previous_hook_handle.remove()

feature_hook_handle = None
for name, layer in model.model.model.named_modules():
    if name == layer_to_hook:
        feature_hook_handle = layer.register_forward_hook(save_features)
        break

# Fail here, with a clear message, instead of later with an IndexError on an
# empty `features` list.
if feature_hook_handle is None:
    raise ValueError(f"layer {layer_to_hook!r} not found in model.model.model.named_modules()")

In [9]:
# Load dataset
# Keyword arguments accepted by both loader classes; only the dataset-specific
# argument (data_path for MSVD, csv_path for MSR-VTT) differs per branch.
common_loader_kwargs = dict(
    features_path=args.features_path,
    max_words=args.max_words,
    feature_framerate=args.feature_framerate,
    max_frames=args.max_frames,
    frame_order=args.eval_frame_order,
    slice_framepos=args.slice_framepos,
    transform_type=1,
)

if args.msvd:
    videos = MSVD_Loader(data_path=args.data_path, **common_loader_kwargs)
else:
    videos = MSRVTT_RawDataLoader(csv_path=args.msrvtt_csv, **common_loader_kwargs)

Video number: 1970
Id number: 1970


## Generate Object Features

In [10]:
# Extract per-frame object features and write them to `save_file`.
#
# For every frame of every video: run the detector, take the feature map
# captured by the forward hook, RoI-align it over the detected boxes (one
# vector per box) and store a fixed-size (MAX_OBJECTS_PER_FRAME, C) float32
# array under the HDF5 key "<video_id>-<frame_index>". Frames with fewer
# detections are zero-padded; extra detections are dropped.
output_features = []
threshold = 0.5
model.conf = 0.5  # detector confidence threshold
features = None
stop = False
list_videoid = []

MAX_OBJECTS_PER_FRAME = 9      # number of rows stored per frame
to_pil = T.ToPILImage()        # stateless, so build it once instead of per frame

with torch.no_grad(), h5py.File(save_file, 'w') as f:
    for video_id, video, video_mask in tqdm(videos):
        # The hook appends to whatever list the global `features` is bound to,
        # so start every video with a fresh list.
        features = []

        # A bool in place of the video data is treated as the end-of-data marker.
        if type(video) == bool:
            stop = True
            break

        frames = video[0]

        for i in range(len(frames)):
            # Slice with i:i+1 to keep the leading axis; the slice is expected
            # to be 5-D and is collapsed to 4-D below.
            frame = torch.tensor(frames[i:i+1]).float()
            video_frame, num, channel, h, w = frame.shape
            frame = frame.view(video_frame, channel, h, w)

            img = to_pil(frame[0])

            # Keep only the current frame's feature map so captured tensors do
            # not pile up on the device for the whole video.
            features.clear()
            output = model(img)
            if not features:
                raise RuntimeError(
                    "forward hook captured no features; run the 'Feature extractor' cell first"
                )
            # The newest entry always belongs to the forward pass just run.
            fmap = features[-1]

            # Scale factor from frame pixels to feature-map cells.
            spat_scale = min(fmap.shape[2] / h, fmap.shape[3] / w)

            # First four columns of each detection row are the box corners.
            boxes = output.xyxy[0][:MAX_OBJECTS_PER_FRAME, :4]

            # Pre-allocated float32 buffer: gives a consistent dtype and shape
            # no matter how many objects were detected, and takes the feature
            # width from the hooked layer instead of a hard-coded 1024.
            frame_feats = np.zeros((MAX_OBJECTS_PER_FRAME, fmap.shape[1]), dtype=np.float32)
            if len(boxes) > 0:
                # One batched call: result is (num_boxes, C, 1, 1).
                pooled = roi_align(fmap, [boxes], output_size=1,
                                   spatial_scale=spat_scale, aligned=True)
                frame_feats[:len(boxes)] = pooled.flatten(1).float().cpu().numpy()

            f.create_dataset(video_id+'-'+str(i), data=frame_feats)

100%|███████████████████████████████████████| 1970/1970 [24:18<00:00,  1.35it/s]
