In [1]:
import os
import torch

In [2]:
# Switch the working directory to the project checkout.
# NOTE(review): presumably done so the `src.*` imports in a later cell resolve
# relative to it — confirm.
os.chdir("/home/shuaman/video_sm/video_summarization")

In [3]:
!pwd

/home/shuaman/video_sm/video_summarization


In [4]:
from src.models import MSVA
from src.utils import VSMDataset, parse_configuration
from src.utils.utils_model import *

In [5]:
msva = MSVA()

In [6]:
# Pre-trained MSVA checkpoints. Only the SumMe one is loaded in the visible
# cells; path_weights_tvsum is defined but not used there.
# NOTE(review): the trailing number in each file name looks like the score
# reached by that checkpoint — confirm.
path_weights_summe = "/home/shuaman/video_sm/MSVA/model_weights/summe_random_non_overlap_0.5359.tar.pth"
path_weights_tvsum = "/home/shuaman/video_sm/MSVA/model_weights/tvsum_random_non_overlap_0.6271.tar.pth"

In [31]:
# Restore the SumMe checkpoint into the model.
# The tensors are deserialised onto the CPU first: load_state_dict copies the
# values into msva's existing parameters wherever those live, so the checkpoint
# does not have to be restored onto the (possibly absent) GPU it was saved from.
# NOTE(review): this cell carries execution count In [31], i.e. it was run after
# the model had been moved to `device` and just before the training loop, even
# though it is placed here — keep that in mind when re-running top to bottom.
msva.load_state_dict(torch.load(path_weights_summe, map_location="cpu"))

<All keys matched successfully>

In [7]:
# Locations of the pre-processed .h5 feature files, one entry per dataset.
# The dict is later expanded as keyword arguments to get_paths().
# The two commented-out entries point at alternative "*_processed.h5" files
# for TVSum and SumMe.
dict_paths = {
        'path_tvsum':"/data/shuaman/video_summarization/datasets/processed_datasets/eccv16_dataset_tvsum_google_pool5.h5",
        'path_summe':"/data/shuaman/video_summarization/datasets/processed_datasets/eccv16_dataset_summe_google_pool5.h5",
        'path_ovp':"/data/shuaman/video_summarization/datasets/processed_datasets/eccv16_dataset_ovp_google_pool5.h5",
        'path_youtube':"/data/shuaman/video_summarization/datasets/processed_datasets/eccv16_dataset_youtube_google_pool5.h5",
        'path_cosum':"/data/shuaman/video_summarization/datasets/processed_datasets/dataset_cosum_processed.h5",
#         'path_tvsum':"/data/shuaman/video_summarization/datasets/processed_datasets/dataset_tvsum_processed.h5",
#         'path_summe':"/data/shuaman/video_summarization/datasets/processed_datasets/dataset_summe_processed.h5",
}

In [8]:
# Build the per-feature on/off flags from the requested feature names; the
# resulting dict is displayed by the next cell and later passed to
# get_dataloaders().
dict_use_feature = get_flags_features("i3d", "googlenet")

In [9]:
dict_use_feature

{'googlenet': True,
 'resnext': False,
 'inceptionv3': False,
 'i3d_rgb': True,
 'i3d_flow': True,
 'resnet3d': False}

In [10]:
# Loader settings handed to get_dataloaders().
# NOTE(review): presumably forwarded to torch's DataLoader — confirm.
# train_step/eval_function drop the leading axis with squeeze(0), so they
# appear to rely on batch_size being 1.
params = {
        'batch_size': 1,
        'num_workers': 4
        }


In [11]:
# Parse the SumMe splits file and keep only its first entry (index 0) for this
# run; the next cell shows that an entry holds 'train_keys' and 'test_keys'.
path_split = "/home/shuaman/video_sm/video_summarization/splits/vasnet_splits/summe_splits.json"
splits = parse_configuration(path_split)
split = splits[0]

In [12]:
split.keys()

dict_keys(['train_keys', 'test_keys'])

In [13]:
# Select the dataset file(s) for SumMe in the 'canonical' setting out of the
# paths declared in dict_paths; the bare name on the last line displays the
# result.
dataset_paths = get_paths('summe', 'canonical', **dict_paths)
dataset_paths

'/data/shuaman/video_summarization/datasets/processed_datasets/eccv16_dataset_summe_google_pool5.h5'

In [14]:
# Build the train/test loaders for the chosen split and feature flags.
# NOTE(review): the last argument is a path to "transformations.pk", presumably
# a pickled set of feature transformations — confirm against get_dataloaders.
training_generator, test_generator = get_dataloaders(dataset_paths, split, 
                                                     dict_use_feature, params,
                                                    "/data/shuaman/video_summarization/datasets/processed_datasets/transformations.pk")

In [15]:
# Sanity check: count the batches produced by one full pass over the training
# loader (iterating, rather than calling len(), exercises the actual loading).
it = sum(1 for _ in training_generator)
print(it)

20


In [16]:
# Sanity check: count the batches produced by one full pass over the test
# loader (iterating, rather than calling len(), exercises the actual loading).
it = sum(1 for _ in test_generator)
print(it)

5


In [17]:
# Preferred GPU for this run. Fall back to the CPU when that GPU index does not
# exist, so the notebook still runs on machines with fewer (or no) GPUs instead
# of failing at the first .to(device).
_GPU_INDEX = 3
device = torch.device(
    f"cuda:{_GPU_INDEX}"
    if torch.cuda.is_available() and torch.cuda.device_count() > _GPU_INDEX
    else "cpu"
)

In [18]:
device

device(type='cuda', index=3)

In [19]:
# NOTE(review): 0.00005 and 0.00001 are presumably the learning rate and the
# weight decay — confirm against init_optimizer's signature.
optimizer = init_optimizer(msva, 0.00005, 0.00001)
# Mean-squared-error loss; train_step applies it to the model output and the
# normalised gtscore target.
criterion = torch.nn.MSELoss()
criterion.to(device)

MSELoss()

In [20]:
# NOTE(review): presumably best-so-far trackers (F-score, Kendall tau, Spearman,
# train/test loss) for a longer training loop. None of these names is read or
# updated in the visible cells — confirm before relying on or removing them.
sameCount = 0
max_val_fscore = 0
maxkt = 0
maxsp = 0
maxtrl = 0
maxtsl = 0
max_val_fscoreLs=[]

In [21]:
import cv2
import numpy as np
import sys 

In [22]:
def train_step(training_generator, criterion, optimizer):
    """Run one training epoch of the notebook-level ``msva`` model.

    For every ``(video_info, label)`` pair yielded by ``training_generator``:
    each entry of ``video_info`` whose key contains ``'features'`` is resized so
    that its first axis matches the length of ``label['gtscore']``, the scores
    are min-max normalised, and one optimisation step is taken on
    ``criterion(prediction, target)``.

    Relies on the globals ``msva`` (the model) and ``device``.

    Args:
        training_generator: iterable of ``(video_info, label)`` pairs. The
            leading axis of every tensor is dropped with ``squeeze(0)``, so a
            batch size of 1 is assumed.
        criterion: loss callable applied as ``criterion(prediction, target)``.
        optimizer: optimizer providing ``zero_grad()`` and ``step()``.

    Returns:
        The mean of the per-batch loss values (``np.mean`` of an empty list,
        i.e. NaN, when the generator yields nothing).
    """
    msva.train()

    avg_loss = []

    for video_info, label in training_generator:

        target = label['gtscore'].squeeze(0).cpu().numpy()
        features = [video_info[key].squeeze(0).cpu().numpy()
                    for key in video_info.keys() if 'features' in key]

        # cv2.resize takes dsize as (width, height): keep each feature's width
        # and resample its rows to the number of target scores.
        shape_desire = target.shape[0]
        features = [cv2.resize(feature, (feature.shape[1], shape_desire), interpolation=cv2.INTER_AREA)
                    for feature in features]

        features = [torch.from_numpy(feature).unsqueeze(0) for feature in features]
        target = torch.from_numpy(target).unsqueeze(0)

        # Min-max normalise the scores. A constant score vector has a
        # range of zero; dividing would give 0/0 = NaN targets and a NaN loss
        # that poisons the weights, so in that case the shifted (all-zero)
        # target is used unchanged.
        target = target - target.min()
        target_max = target.max()
        if target_max > 0:
            target = target / target_max

        target = target.float().to(device)
        features = [feature.float().to(device) for feature in features]
        seq_len = features[0].shape[1]

        y, _ = msva(features, seq_len)

        loss = criterion(y, target)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        avg_loss.append(loss.item())

    avg_loss = np.mean(np.array(avg_loss))

    return avg_loss

In [23]:
msva.to(device)

MSVA(
  (att1_3): SelfAttention(
    (K): Linear(in_features=1024, out_features=1024, bias=False)
    (Q): Linear(in_features=1024, out_features=1024, bias=False)
    (V): Linear(in_features=1024, out_features=1024, bias=False)
    (output_linear): Linear(in_features=1024, out_features=1024, bias=False)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (ka1_3): Linear(in_features=1024, out_features=365, bias=True)
  (kb): Linear(in_features=365, out_features=365, bias=True)
  (kc): Linear(in_features=365, out_features=512, bias=True)
  (kd): Linear(in_features=512, out_features=1, bias=True)
  (sig): Sigmoid()
  (relu): ReLU()
  (dropout): Dropout(p=0.5, inplace=False)
  (softmax): Softmax(dim=0)
  (layer_norm_y_1_3): LayerNorm()
  (layer_norm_y_4): LayerNorm()
  (layer_norm_kc): LayerNorm()
  (layer_norm_kd): LayerNorm()
)

In [24]:
from src.utils import generate_summary, evaluate_summary
from scipy.stats import kendalltau, spearmanr, rankdata

In [25]:
def eval_function(test_generator):
    """Evaluate the notebook-level ``msva`` model on ``test_generator``.

    For every ``(video_info, label)`` pair the features are resized to the
    length of ``label['gtscore']`` and the scores are min-max normalised,
    exactly as in training. The model output is then

    * compared with the normalised scores through an MSE loss,
    * turned into a machine summary with ``generate_summary`` and scored
      against ``label['user_summary']`` with ``evaluate_summary`` (metric
      ``'avg'`` when ``video_info['name_dataset'][0]`` is ``"tvsum"``,
      ``'max'`` otherwise),
    * rank-correlated (Spearman, Kendall tau) with the mean of the user
      summaries taken over axis 0.

    Relies on the globals ``msva`` (the model) and ``device``.

    Args:
        test_generator: iterable of ``(video_info, label)`` pairs. The leading
            axis of every tensor is dropped with ``squeeze(0)``, so a batch
            size of 1 is assumed.

    Returns:
        Tuple ``(f_score, kt, sp, avg_loss)``: the means over all evaluated
        videos of the F-measure, Kendall tau, Spearman correlation and loss.
    """
    msva.eval()

    avg_loss = []
    fms = []
    kts = []
    sps = []

    # Loop-invariant: build the loss once instead of once per video.
    criterion = torch.nn.MSELoss()
    criterion.to(device)

    with torch.no_grad():
        for video_info, label in test_generator:

            target = label['gtscore'].squeeze(0).cpu().numpy()
            features = [video_info[key].squeeze(0).cpu().numpy()
                        for key in video_info.keys() if 'features' in key]

            # cv2.resize takes dsize as (width, height): keep each feature's
            # width and resample its rows to the number of target scores.
            shape_desire = target.shape[0]
            features = [cv2.resize(feature, (feature.shape[1], shape_desire), interpolation=cv2.INTER_AREA)
                        for feature in features]

            features = [torch.from_numpy(feature).unsqueeze(0) for feature in features]
            target = torch.from_numpy(target).unsqueeze(0)

            # Min-max normalise the scores; skip the division for a constant
            # score vector, which would otherwise produce 0/0 = NaN targets.
            target = target - target.min()
            target_max = target.max()
            if target_max > 0:
                target = target / target_max

            target = target.float().to(device)
            features = [feature.float().to(device) for feature in features]

            y, _ = msva(features, shape_desire)

            test_loss = criterion(y, target)

            avg_loss.append(test_loss.item())
            summary = y[0].detach().cpu().numpy()

            machine_summary = generate_summary(summary, (video_info["change_points"].squeeze(0)).cpu().numpy(),
                                              (video_info["n_frames"].squeeze(0)).cpu().numpy(), (video_info["n_frame_per_seg"].squeeze(0)).cpu().numpy(),
                                                (video_info["picks"].squeeze(0)).cpu().numpy())

            # Converted once and reused for both the F-measure and the
            # correlation metrics.
            user_summary = label["user_summary"].squeeze(0).cpu().numpy()

            eval_metric = 'avg' if video_info["name_dataset"][0] == "tvsum" else 'max'
            fm, _, _ = evaluate_summary(machine_summary, user_summary, eval_metric)

            fms.append(fm)
            y_pred2 = machine_summary
            y_true2 = user_summary.mean(axis=0)
            pS = spearmanr(y_pred2, y_true2)[0]
            kT = kendalltau(rankdata(-np.array(y_true2)), rankdata(-np.array(y_pred2)))[0]
            kts.append(kT)
            sps.append(pS)


    f_score = np.mean(fms)
    kt = np.mean(kts)
    sp = np.mean(sps)
    avg_loss = np.mean(np.array(avg_loss))

    return f_score, kt, sp, avg_loss

In [32]:
# Train for two epochs, evaluating on the test loader after each one.
# Every iteration overwrites train_loss / f_score / kt / sp / test_loss, so the
# cells below only show the values from the final epoch.
for epoch in range(2):
    train_loss = train_step(training_generator, criterion, optimizer)
    f_score, kt, sp, test_loss = eval_function(test_generator)

In [33]:
sp

0.12407581160149403

In [34]:
kt

0.1102586827115101

In [35]:
f_score

0.43061976583898265

In [36]:
train_loss

0.03655345905572176