In [100]:
from data_configs import DATASETS
import argparse
import numpy as np
import json
from tqdm import tqdm
import os
import torch
import numpy as np
import torch.nn.functional as F
from lavis.models import load_model_and_preprocess
from llm_prompting import filter_and_integrate
from torchvision import transforms
from vlm_localizer import localize
import pdb

In [3]:
model, vis_processors, text_processors = load_model_and_preprocess("blip2_image_text_matching", "coco", device='cuda', is_eval=True)
vis_processors = transforms.Compose([
    t for t in vis_processors['eval'].transform.transforms if not isinstance(t, transforms.ToTensor)
])

Position interpolate from 16x16 to 26x26


In [4]:
def calc_iou(candidates, gt):
    start, end = candidates[:,0], candidates[:,1]
    s, e = gt[0], gt[1]
    inter = np.minimum(end, e) - np.maximum(start, s)
    union = np.maximum(end, e) - np.minimum(start, s)
    return inter.clip(min=0) / union

In [86]:
def select_proposal(inputs, gamma=0.6):
    weights = inputs[:, -1].clip(min=0)
    proposals = inputs[:, :-1]
    scores = np.zeros_like(weights)

    for j in range(scores.shape[0]):
        iou = calc_iou(proposals, proposals[j])
        scores[j] += (iou ** gamma * weights).sum()

    idx = np.argsort(-scores)
    return inputs[idx], idx

In [6]:
def get_args():
    parser = argparse.ArgumentParser(description='Evaluation for training-free video temporal grounding.')
    parser.add_argument('--dataset', default='charades', type=str, help='Specify the dataset. See supported datasets in data_configs.py.')
    parser.add_argument('--split', default='default', type=str, help='Specify the split. See supported splits in data_configs.py.')
    parser.add_argument('--llm_output', default=None, type=str, help='LLM prompt output. If not specified, use only VLM for evaluation.')

    return parser.parse_args()

In [7]:
def calc_scores(video_features, sentences):
    with torch.no_grad():
        text = model.tokenizer(sentences, padding='max_length', truncation=True, max_length=35, return_tensors="pt").to('cuda')                    
        text_output = model.Qformer.bert(text.input_ids, attention_mask=text.attention_mask, return_dict=True)
        text_feat = model.text_proj(text_output.last_hidden_state[:,0,:])
    
    v1 = F.normalize(text_feat, dim=-1)
    v2 = F.normalize(torch.tensor(video_features, device='cuda', dtype=v1.dtype), dim=-1)
    # 텍스트와 비디오 특징 간의 내적(유사도) 계산
    scores = torch.einsum('md,npd->mnp', v1, v2)
    scores, _ = scores.max(dim=-1)
    scores = scores.mean(dim=0, keepdim=True)

    return scores

In [101]:
def eval_with_llm(data, feature_path, stride, max_stride_factor, pad_sec=0.0):
    ious = []
    thresh = np.array([0.3, 0.5, 0.7])
    recall = np.array([0, 0, 0])

    pbar = tqdm(data.items())
    for vid, ann in pbar:
        duration = ann['duration']
        video_feature = np.load(os.path.join(feature_path, vid+'.npy'))

        for i in range(len(ann['sentences'])):
            # sub queries
            sub_query_proposals = []
            if 'query_json' in ann['response'][i]:
                relation = ann['response'][i]['relationship']
                # j의 range가 1부터 시작하는 이유는 0번째는 sub-query가 아닌 전체 query이기 때문
                for j in range(1, len(ann['response'][i]['query_json'])):
                    query_json = [{'descriptions': q} for q in ann['response'][i]['query_json'][j]['descriptions']]
                    # 하나의 description에 대해 10개 이하의 response(st:end, confidence) / 10개 이하인 이유는 10개를 뽑지만 nms에 의해 억제된 경우 그 이하의 proposal들이 반환되기 때문
                    answers = localize(video_feature, duration, query_json, stride, int(video_feature.shape[0] * max_stride_factor))
                    proposals = []
                    proposal_to_description_map = []
                    
                    # 각 description에 대한 response에서 상위 3개만 proposal에 저장 -> proposals에는 총 9개의 구간 저장
                    for t in range(3):
                        for idx, p in enumerate(answers):
                            if len(p['response']) > t:
                                proposals.append([p['response'][t]['start'], p['response'][t]['end'], p['response'][t]['confidence']])
                                proposal_to_description_map.append(query_json[idx]['descriptions']) 
                    
                    proposals = np.array(proposals)
                    proposals, selected_idx = select_proposal(np.array(proposals))
                    # 하나의 sub-query에 대해서 3개의 proposal을 선택
                    sub_query_proposals.append(proposals[:3])
                    selected_idx = selected_idx[:3]
                    
                    selected_description = [proposal_to_description_map[idx] for idx in selected_idx[:3]]

            else:
                relation = 'single-query'

            # query, 원문 쿼리 하나 + llm 생성 description 3개 => 4개의 description
            query_json = [{'descriptions': ann['sentences'][i]}]
            if 'query_json' in ann['response'][i]:
                query_json += [{'descriptions': q} for q in ann['response'][i]['query_json'][0]['descriptions']]
            answers = localize(video_feature, duration, query_json, stride, int(video_feature.shape[0] * max_stride_factor))
            proposals = []
            proposal_to_description_map = []  # description 인덱스를 추적하기 위한 리스트
            
            for t in range(3):
                for idx, p in enumerate(answers):
                    if len(p['response']) > t:
                        proposals.append([p['response'][t]['start'], p['response'][t]['end'], p['response'][t]['confidence']])
                        proposal_to_description_map.append(query_json[idx]['descriptions'])  # 해당 proposal의 description 저장

            print(sub_query_proposals)
            integrated_sub_query_proposals, index = filter_and_integrate(sub_query_proposals, relation)
            print(integrated_sub_query_proposals)
            print(index)
            print('\n')
            # 총 12개의 proposals에서 앞 7개의 proposals 가져옴 -> 각 description 별 1개씩 + 3개
            # proposals = proposals[:7]
            # proposal_to_description_map = proposal_to_description_map[:7]

            # proposals, selected_idx = select_proposal(np.array(proposals))

            # # 가장 높은 score로 선택된 proposal에 해당하는 description을 query_json에서 찾음
            # selected_description = proposal_to_description_map[selected_idx[0]]

In [102]:
with open('./dataset/activitynet/llm_outputs-parsed_query.json') as f:
    data = json.load(f)
eval_with_llm(data, './datasets/ActivityNet/',40, 1, 0.0)


  0%|          | 0/4885 [00:00<?, ?it/s]

[array([[ 0.        , 21.21154022,  1.        ],
       [ 0.        , 21.21154022,  1.        ],
       [ 0.        , 21.21154022,  1.        ]]), array([[ 0.        , 21.21154022,  1.        ],
       [ 0.        , 21.21154022,  1.        ],
       [ 0.        , 21.21154022,  1.        ]])]
[0.0, 21.21154022216797, 1.0]
[0.0, 21.21154022216797, 1.0]




  0%|          | 0/4885 [00:01<?, ?it/s]

[array([[ 0.        , 22.80240631,  1.        ],
       [ 0.        , 22.80240631,  1.        ],
       [ 0.        , 22.27211761,  1.        ]]), array([[ 2.65144231, 49.84711838,  1.        ],
       [ 2.65144231, 50.37740707,  1.        ],
       [ 5.30288462, 49.31682968,  1.        ]]), array([[ 0.        , 21.74182892,  1.        ],
       [ 0.        , 21.74182892,  1.        ],
       [ 0.        , 22.80240631,  1.        ]])]





ValueError: not enough values to unpack (expected 2, got 0)

: 