In [None]:
# Colab only: mount Google Drive at /content/drive so that the
# /content/drive/MyDrive/... video and feature paths used later in this notebook resolve.
# Comment this cell out when running outside Colab.

from google.colab import drive
drive.mount('/content/drive')

## Setup

In [None]:
%%capture
# Install CLIP and the packages listed alongside it into the *kernel's* environment.
# `%pip` is used instead of `!pip`: the shell's `pip` is whatever comes first on PATH,
# and this notebook later installs Miniconda into /usr/local, so on a re-run `!pip`
# may no longer belong to the interpreter that runs the notebook.
%pip install ftfy regex tqdm
%pip install git+https://github.com/openai/CLIP.git

In [None]:
%%capture
# Fetch the BMT repository together with its submodules and make it the working
# directory; later cells use paths relative to it (./sample, ./submodules, ./conda_env.yml).
!git clone --recursive https://github.com/SayakMukherjee/BMT.git
%cd BMT/

# Download the Miniconda installer (a Python 3.7 build, per its file name) and install it
# with prefix /usr/local; later cells call interpreters under /usr/local/envs/<env>/bin.
!wget https://repo.anaconda.com/miniconda/Miniconda3-py37_4.8.2-Linux-x86_64.sh -q --show-progress
!bash ./Miniconda3-py37_4.8.2-Linux-x86_64.sh -b -f -p /usr/local

In [None]:
import os
from pathlib import Path
import sys
# Add the site-packages directory of the /usr/local Miniconda install to the import path.
# NOTE(review): assumes packages built for Python 3.7 are importable by this kernel — confirm.
sys.path.append('/usr/local/lib/python3.7/site-packages/')
# Presumably resolved from the current working directory, i.e. the cloned BMT/ folder
# entered with `%cd BMT/` — run that cell first.
from sample.single_video_prediction import get_video_duration

In [None]:
%%capture
# feature extraction: create the conda environment described by conda_env_i3d.yml
# (its interpreter, /usr/local/envs/i3d/bin/python, runs the I3D extraction cell)
!conda env create -f ./submodules/video_features/conda_env_i3d.yml

In [None]:
# feature extraction: create the conda environment described by conda_env_vggish.yml
# (its interpreter, /usr/local/envs/vggish/bin/python, runs the VGGish extraction cell)
!conda env create -f ./submodules/video_features/conda_env_vggish.yml

In [None]:
# captioning model: create the conda environment described by the repository's conda_env.yml
!conda env create -f ./conda_env.yml

# spacy language model, downloaded with the interpreter of the `bmt` environment
# (the same interpreter that later runs ./sample/single_video_prediction.py)
!/usr/local/envs/bmt/bin/python -m spacy download en

In [None]:
# Download the GloVe archive, the captioning and proposal checkpoints and the VGGish checkpoint
!wget https://a3s.fi/swift/v1/AUTH_a235c0f452d648828f745589cde1219a/bmt/glove.840B.300d.zip -q --show-progress
!wget https://a3s.fi/swift/v1/AUTH_a235c0f452d648828f745589cde1219a/bmt/best_cap_model.pt -q --show-progress
!wget https://a3s.fi/swift/v1/AUTH_a235c0f452d648828f745589cde1219a/bmt/best_prop_model.pt -q --show-progress
!wget https://storage.googleapis.com/audioset/vggish_model.ckpt -q --show-progress

# Move every file to the location the later cells read it from.
# `-p` keeps this cell re-runnable: plain `mkdir` errors out when .vector_cache already exists.
!mkdir -p .vector_cache
!mv glove.840B.300d.zip ./.vector_cache/
!mv best_cap_model.pt ./sample/
!mv best_prop_model.pt ./sample/
!mv vggish_model.ckpt ./submodules/video_features/models/vggish/checkpoints/

## Caption Generation: Bi-Modal Transformer

Adapted from the original work by Iashin et al.: [Bi-modal Transformer](https://github.com/v-iashin/BMT)

In [None]:
# Input video on the mounted Drive, and the directory its extracted features are cached in
MY_VIDEO_PATH = '/content/drive/MyDrive/MM/tvsum/video/98MoyGZKHXc.mp4'
FEATURES_CACHE_PATH = '/content/drive/MyDrive/MM/tvsum/features'

# Duration of the video as reported by BMT's helper (passed on as --duration_in_secs)
VIDEO_DURATION = get_video_duration(MY_VIDEO_PATH)

# Common prefix of every feature file: <cache dir>/<video file name without extension>
FEATURES_PATH_STUB = str(Path(FEATURES_CACHE_PATH) / Path(MY_VIDEO_PATH).stem)

# Feature files for the RGB and optical-flow streams
FEATURE_PATH_RGB = FEATURES_PATH_STUB + '_rgb.npy'
FEATURE_PATH_FLOW = FEATURES_PATH_STUB + '_flow.npy'

In [None]:
# Both pretrained checkpoints sit in the `sample` folder of the cloned BMT repository
_BMT_SAMPLE_DIR = '/content/BMT/sample'
PROPOSAL_CKPT = f'{_BMT_SAMPLE_DIR}/best_prop_model.pt'    # passed as --prop_generator_model_path
CAPTIONING_CKPT = f'{_BMT_SAMPLE_DIR}/best_cap_model.pt'   # passed as --pretrained_cap_model_path

In [None]:
# Audio (VGGish) feature file; shares the per-video prefix with the RGB / flow feature files
FEATURE_PATH_VGGISH = '{}_vggish.npy'.format(FEATURES_PATH_STUB)

In [None]:
# Extract I3D features (visual)
# Runs main.py of the video_features submodule with the `i3d` environment's interpreter
# on MY_VIDEO_PATH, with FEATURES_CACHE_PATH as the output directory.
# NOTE(review): presumably writes the <video>_rgb.npy / <video>_flow.npy files that
# FEATURE_PATH_RGB / FEATURE_PATH_FLOW point to — confirm against the submodule.
!cd ./submodules/video_features && /usr/local/envs/i3d/bin/python main.py \
    --feature_type i3d \
    --on_extraction save_numpy \
    --device_ids 0 \
    --extraction_fps 1 \
    --step_size 1 \
    --stack_size 10 \
    --video_paths $MY_VIDEO_PATH \
    --output_path $FEATURES_CACHE_PATH

In [None]:
# Extract VGGish features (audio)
# Runs main.py of the video_features submodule with the `vggish` environment's interpreter
# on MY_VIDEO_PATH, with FEATURES_CACHE_PATH as the output directory.
# NOTE(review): presumably writes the <video>_vggish.npy file that FEATURE_PATH_VGGISH
# points to — confirm against the submodule.
!cd ./submodules/video_features && /usr/local/envs/vggish/bin/python main.py \
    --feature_type vggish \
    --on_extraction save_numpy \
    --device_ids 0 \
    --video_paths $MY_VIDEO_PATH \
    --output_path $FEATURES_CACHE_PATH

In [None]:
# captioning parameters
MAX_PROP_PER_VIDEO = 100
NMS_TIOU_THRESHOLD = 0.4

# Running single video prediction with the `bmt` environment's interpreter.
# The feature files are passed through the FEATURE_PATH_* variables so that they belong to
# MY_VIDEO_PATH, the same video VIDEO_DURATION was measured on. Hard-coding the feature
# files of another video here would pair them with the wrong duration.
!/usr/local/envs/bmt/bin/python ./sample/single_video_prediction.py \
    --prop_generator_model_path $PROPOSAL_CKPT \
    --pretrained_cap_model_path $CAPTIONING_CKPT \
    --vggish_features_path $FEATURE_PATH_VGGISH \
    --rgb_features_path $FEATURE_PATH_RGB \
    --flow_features_path $FEATURE_PATH_FLOW \
    --duration_in_secs $VIDEO_DURATION \
    --device_id 0 \
    --max_prop_per_vid $MAX_PROP_PER_VIDEO \
    --nms_tiou_thresh $NMS_TIOU_THRESHOLD

## Captions Encoding

Adapted from the text encoder section of the work by Radford et al. [Contrastive Language-Image Pre-Training (CLIP)](https://github.com/openai/CLIP)

In [None]:
# Hand-written captions as BMT needs fine-tuning on TVSum which is beyond the current scope.
# One caption per consecutive 20-second window, the first window starting at 0.0 s.

SEGMENT_SECONDS = 20.0

manual_sentences = [
    'man is talking about repair kits in cars',
    'man stops car and gets out of the car and sits on road and talks to the camera',
    'man is talking to the camera and touches the wheels and the back of the car',
    'man is talking to the camera and takes out items from the car back and sits and touches wheels',
    'man is talking to the camera and touches the wheels then he gets up and opens car front door',
    'a person’s hand and finger and wheel and car starts',
    'car wheels and car starts and man is driving car and talking to the camera',
    'man is talking to the camera and car is going forward on the road',
]

# Same record layout as before: start / end in seconds plus the caption text
captions_list = [
    {
        'start': idx * SEGMENT_SECONDS,
        'end': (idx + 1) * SEGMENT_SECONDS,
        'sentence': sentence,
    }
    for idx, sentence in enumerate(manual_sentences)
]
captions_list[0]

In [None]:
# Put the captions in chronological order (by start time), then keep only their text
newlist = list(captions_list)
newlist.sort(key=lambda d: d['start'])
list_of_sentences = [item['sentence'] for item in newlist]

In [None]:
list_of_sentences  # shown as the cell output: caption texts ordered by start time

In [None]:
# Tokenizing the list of captions using CLIP and encoding them with its text encoder

import torch
import clip

# Use the GPU when torch can see one, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

model, preprocess = clip.load("ViT-B/32", device=device)

# Tokenize every caption and move the tokens to the device the model was loaded on
text = clip.tokenize(list_of_sentences)
text = text.to(device)

# Inference only, so gradient tracking is switched off
with torch.no_grad():
    text_features = model.encode_text(text)

In [None]:
# gpu to cpu, then store the caption embeddings on Drive

from torch.functional import Tensor
import numpy as np

# Output file for the embeddings of the hand-written captions
embedding_path = '/content/drive/MyDrive/MM/tvsum/features/98MoyGZKHXc_manual_cc_embedding.npy'

# Tensor.cpu is called unbound here, i.e. with text_features as the instance
np_array_from_gpu = Tensor.cpu(text_features)
np.save(embedding_path, np_array_from_gpu)