In [1]:
from IPython.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

In [2]:
import os
import json
import torch
import random
import pandas as pd
import numpy as np
import seaborn as sns

from data_handlers import YCDataset, SampleBatchIdx
from models import EmbeddingsMapping
from losses import compute_all_costs, compute_clust_loss, compute_alignment_loss
from dtw import drop_dtw
from utils import compute_normalization_parameters
from torch.utils.data import DataLoader
from torch import nn, log, exp
from torch.nn import functional as F
from tqdm import tqdm
from pathlib import Path
from matplotlib import pyplot as plt

opj = lambda x, y: os.path.join(x, y)

In [3]:
training_df = pd.read_csv('training_with_labels_s3dg.csv')
validation_df = pd.read_csv('validation_with_labels_s3dg.csv')

gt_training = torch.load('s3d_labelled_video_train.pkl')
gt_validation = torch.load('s3d_labelled_video_val.pkl')
# validation_df.head()
# print(gt_training)

In [4]:
# gt_training

In [5]:
batch_size = 1
train_dataset = YCDataset(training_df, video_len=775)
train_dl = DataLoader(train_dataset, batch_size=batch_size)

valid_dataset = YCDataset(validation_df, video_len=775)
valid_dl = DataLoader(valid_dataset, batch_size=batch_size)

In [6]:
# device = 'cuda:1'
# folder_name = Path('ckpts_notext_s3dg_cl4_dtw2.5_learn_adamw_batchnorm_dropout_lr_1e-5')
# state_dict = torch.load(folder_name/'best_model_state_20.pth')
# model = EmbeddingsMapping(512, video_layers=3, text_layers=0, drop_layers=2, learnable_drop=True, normalization_dataset=train_dataset, batch_norm=True)
# model.load_state_dict(state_dict['model'])
# model = model.to(device)
model = None

In [7]:
def linear_sim(x, z):
    return x @ z.T



In [8]:
def run_model_eval(vf, sf, distractor, drop_cost_type):
    with torch.no_grad():
        frame_features = model.map_video(vf)
        step_features = model.map_text(sf)
        if drop_cost_type == 'learn':
            distractor_features = model.compute_distractors(distractor)
        else:
            distractor_features = [None] * frame_features.shape[0]
        return step_features, frame_features, distractor_features
    
def recall_acc(frame_assignment, gt_assignment):
    return ((frame_assignment == gt_assignment).sum())/ gt_assignment.size

def framewise_accuracy(frame_assignment, gt_assignment, num_frames, use_unlabeled=False):
    # to discount unlabeled frames in gt
    if not use_unlabeled:
        unlabled = np.count_nonzero(gt_assignment == -1)
        num_frames = num_frames - unlabled
        fa = np.logical_and(frame_assignment == gt_assignment, gt_assignment != -1).sum()
    else:
        fa = np.count_nonzero((frame_assignment == gt_assignment))
    # framewise accuracy
    fa = fa / num_frames if num_frames != 0 else 0
    return fa

def IoU(frame_assignment, gt_assignment, num_steps):

    intersection, union = 0, 0
    for s in range(num_steps):
        intersection += np.logical_and(gt_assignment == s, frame_assignment == s).sum()
        union += np.logical_or(gt_assignment == s, frame_assignment == s).sum()
    return intersection / union


In [9]:
drop_cost_type = 'logits'
l2_normalize = False
keep_percentile = 0.15
framewise_acc = 0
recall = 0
iou = 0
for idx, batch in tqdm(enumerate(valid_dl)):
#     if idx % 100 == 0:
#         print(recall/(idx+1))
#         print(framewise_acc/(idx+1)) 
#         print(iou/(idx+1)) 
        
    id_, step_len, step_features, video_len, video_features = batch['id'], batch['step_len'], batch['step_feature'], batch['video_len'], batch['video_feature']
    
    if drop_cost_type == 'learn':
            distractors = torch.stack([ s[:size].mean(0) for s, size in zip(step_features, step_len)], dim=0).to(device) # also taking care of the distractor padding (dont worry about it later)
    else:
        distractors = [None] * batch_size

    for _, sample in enumerate(zip(id_, step_len, step_features, video_len, video_features, distractors)):
        _id, s_l, sf, v_l, vf, dif = sample
        
        if model is not None:
            model.eval()
            if drop_cost_type == 'learn':
                dif = dif.to(device)
            sf, vf, distractor = run_model_eval(vf.to(device), sf.to(device), dif, drop_cost_type)
            
            sf, vf, distractor = sf.detach().cpu(), vf.detach().cpu(), distractor
            
            if drop_cost_type == 'learn':
                distractor = distractor.detach().cpu()
            
        else:
#             need this to test no model baseline
            sf, vf, distractor = sf, vf, dif

        sim = sf[:s_l] @ vf[:v_l].T

        zx_costs, drop_costs = compute_all_costs((sf, s_l, vf, v_l, distractor), l2_normalize=l2_normalize, gamma_xz=10, drop_cost_type=drop_cost_type, keep_percentile=keep_percentile)
#         print(zx_costs.shape, drop_costs.shape)
        zx_costs, drop_costs = [t.detach().cpu().numpy() for t in [zx_costs, drop_costs]]
        sim = sim.detach().cpu().numpy()
        
        
        
        optimal_assignment = drop_dtw(zx_costs, drop_costs, return_labels=True) - 1
#         print(optimal_assignment)
#         print(gt_training[_id])
        recall += framewise_accuracy(optimal_assignment, gt_validation[_id], v_l, use_unlabeled=False)
        framewise_acc += framewise_accuracy(optimal_assignment, gt_validation[_id], v_l, use_unlabeled=True).item()
        iou += IoU(optimal_assignment, gt_validation[_id], s_l).item()
#         optimal_assignment

#         simple_assignment = np.argmax(sim, axis=0)
#         simple_assignment[drop_costs < zx_costs.min(0)] = -1
        
print(recall/len(valid_dataset))
print(framewise_acc / len(valid_dataset))
print(iou / len(valid_dataset))

436it [01:00,  7.18it/s]

tensor(0.4702)
0.5681018721346461
0.33076414795019843





In [10]:
def evaluate(batch):
    framewise_acc = 0.
    iou = 0.
    
    id_, step_len, step_features, video_len, video_features = batch['id'], batch['step_len'], batch['step_feature'], batch['video_len'], batch['video_feature']
    
    if drop_cost_type == 'learn':
        distractors = torch.stack([ s[:size].mean(0) for s, size in zip(step_features, step_len)], dim=0).to(device)
    else:
        distractors = [None] * len(id_)
    
    for _, sample in zip(id_, step_len, step_features, video_len, video_features, distractors):
        
        _id, s_l, sf, v_l, vf, dis = sample
        
        if model is not None:
            sf = sf.to(device)
            vf = vf.to(device)
            if dis is not None:
                dis = dis.to(device)
            m_sf, m_vf, m_dis = run_model_eval(vf, sf, dis)
            m_sf, m_vf = m_sf.detach().cpu().numpy(), m_vf.detach().cpu().numpy()
            if dis is not None:
                m_dis = m_dis.detach().cpu().numpy()
            
        else:
            m_sf, m_vf, m_dis = sf, vf, dis
        
        zx_costs, drop_costs = compute_all_costs((m_sf, s_l, m_vf, v_l, m_dis), l2_normalize=l2_normalize, gamma_xz=10, drop_cost_type=drop_cost_type, keep_percentile=keep_percentile)
        zx_costs, drop_costs = [t.detach().cpu().numpy() for t in [zx_costs, drop_costs]]
        optimal_assignment = drop_dtw(zx_costs, drop_costs, return_labels=True) - 1
        
        framewise_acc += framewise_accuracy(optimal_assignment, gt_validation[_id], v_l).item()
        iou += IoU(optimal_assignment, gt_validation[_id], s_l).item()
    return framewise_acc, iou