Video retrieval with CLIP: multiple frames are extracted from each video, each frame is embedded, and the frame vectors are combined into a single video vector using mean pooling.

In [9]:
import csv
import json
import os
from pprint import pprint

import numpy as np
import pandas as pd
from towhee import ops, pipe, register
from towhee import DataCollection
from towhee.operator import PyOperator
from tqdm import tqdm

from helpers import milvus_utils
from helpers.extract_frames import extract_frame, extract_n_frames
from helpers.eval_utils import twohee_data_col_to_df, get_all_eval_scores

In [10]:
# CONSTANTS

# Files
# CSV listing the sampled MSR-VTT videos (video id, video path and caption sentence)
MSRVTT_SAMPLES = "./MSRVTT_1K.csv"
# CSVs written by this notebook: the samples plus the extracted frame paths, one file per experiment
MSRVTT_SAMPLES_WITH_4_FRAMES = "./MSRVTT_1K_4_frames.csv"
MSRVTT_SAMPLES_WITH_8_FRAMES = "./MSRVTT_1K_8_frames.csv"
# file created using raw FIRE judgements, see clean_fire_judgements.ipynb
FIRE_BENCHMARK_Q_JUDGEMENTS = "./fire_benchmark_q_judgements.csv"

# Database Collections
# One Milvus collection per experiment (4-frame and 8-frame mean-pooled embeddings)
FRAME_4_RET_COLLECTION = "multi_frame_ret_4"
FRAME_8_RET_COLLECTION = "multi_frame_ret_8"

# Directories passed to the frame extractor, one per experiment
MULTI_FRAMES_4_DIR = "./frames_data/1k_multi_frames_4"
MULTI_FRAMES_8_DIR = "./frames_data/1k_multi_frames_8"

In [11]:
# Load the sampled MSR-VTT rows and preview the columns this notebook relies on
raw_samples_df = pd.read_csv(MSRVTT_SAMPLES)
raw_samples_df[['video_id', 'video_path', 'sentence']].head()

Unnamed: 0,video_id,video_path,sentence
0,video7579,./test_1k_compress/video7579.mp4,a girl wearing red top and black trouser is pu...
1,video7725,./test_1k_compress/video7725.mp4,young people sit around the edges of a room cl...
2,video9258,./test_1k_compress/video9258.mp4,a person is using a phone
3,video7365,./test_1k_compress/video7365.mp4,cartoon people are eating at a restaurant
4,video8068,./test_1k_compress/video8068.mp4,a woman on a couch talks to a a man


Before we embed video frames, we need to extract multiple frames from each of the 1000 videos: 4 frames per video for the first experiment and 8 frames per video for the second.

In [21]:
def create_multi_frame_dataset(raw_samples_df, num_frames, directory_name):
    """Extract `num_frames` frames per video and return the samples with the frame paths added.

    For every row, `extract_n_frames(video_path, directory_name, num_frames)` is called
    (presumably it writes the frames under `directory_name` and returns their paths in
    order -- confirm against `helpers.extract_frames`).

    Args:
        raw_samples_df: DataFrame with at least a `video_path` column. It is not modified.
        num_frames: Number of frames to extract per video.
        directory_name: Directory handed to `extract_n_frames`.

    Returns:
        A copy of `raw_samples_df` with the columns `frame_path_1` .. `frame_path_<num_frames>`
        and `frame_id` (the video file name without its extension) added or overwritten.

    Raises:
        IndexError: If `extract_n_frames` returns fewer than `num_frames` paths for a video.
    """
    frame_columns = [f'frame_path_{frame_no + 1}' for frame_no in range(num_frames)]
    paths_by_column = {column: [] for column in frame_columns}
    frame_ids = []

    # Collect the new values in plain lists and attach them as whole columns afterwards,
    # rather than writing cell by cell into the frame that is being iterated.
    for video_path in tqdm(raw_samples_df['video_path'], total=len(raw_samples_df)):
        # splitext keeps dots that are part of the file stem (e.g. "clip.v2.mp4" -> "clip.v2")
        frame_ids.append(os.path.splitext(os.path.basename(video_path))[0])
        all_frame_paths = extract_n_frames(video_path, directory_name, num_frames)
        for frame_no, column in enumerate(frame_columns):
            paths_by_column[column].append(all_frame_paths[frame_no])

    multi_frame_samples = raw_samples_df.copy()
    for column in frame_columns:
        multi_frame_samples[column] = paths_by_column[column]
    multi_frame_samples['frame_id'] = frame_ids

    return multi_frame_samples

In [22]:
# Extract the frames once per experiment: 4 frames per video and 8 frames per video
four_multi_frame_samples_df = create_multi_frame_dataset(
    raw_samples_df=raw_samples_df,
    num_frames=4,
    directory_name=MULTI_FRAMES_4_DIR,
)
eight_multi_frame_samples_df = create_multi_frame_dataset(
    raw_samples_df=raw_samples_df,
    num_frames=8,
    directory_name=MULTI_FRAMES_8_DIR,
)

100%|██████████| 1000/1000 [00:29<00:00, 33.55it/s]
100%|██████████| 1000/1000 [00:42<00:00, 23.67it/s]


In [24]:
# Preview the 4-frame samples with their extracted frame paths.
# NOTE(review): the saved output also lists frame_path_5..8 pointing at the 8-frame directory;
# presumably those columns were already present in the input CSV -- confirm.
four_multi_frame_samples_df.head()

Unnamed: 0.1,Unnamed: 0,key,vid_key,video_id,sentence,video_path,frame_path_1,frame_path_2,frame_path_3,frame_path_4,frame_path_5,frame_path_6,frame_path_7,frame_path_8,frame_id
0,521,ret521,msr7579,video7579,a girl wearing red top and black trouser is pu...,./test_1k_compress/video7579.mp4,./frames_data/1k_multi_frames_4/video7579_fram...,./frames_data/1k_multi_frames_4/video7579_fram...,./frames_data/1k_multi_frames_4/video7579_fram...,./frames_data/1k_multi_frames_4/video7579_fram...,./frames_data/1k_multi_frames_8/video7579_fram...,./frames_data/1k_multi_frames_8/video7579_fram...,./frames_data/1k_multi_frames_8/video7579_fram...,./frames_data/1k_multi_frames_8/video7579_fram...,video7579
1,737,ret737,msr7725,video7725,young people sit around the edges of a room cl...,./test_1k_compress/video7725.mp4,./frames_data/1k_multi_frames_4/video7725_fram...,./frames_data/1k_multi_frames_4/video7725_fram...,./frames_data/1k_multi_frames_4/video7725_fram...,./frames_data/1k_multi_frames_4/video7725_fram...,./frames_data/1k_multi_frames_8/video7725_fram...,./frames_data/1k_multi_frames_8/video7725_fram...,./frames_data/1k_multi_frames_8/video7725_fram...,./frames_data/1k_multi_frames_8/video7725_fram...,video7725
2,740,ret740,msr9258,video9258,a person is using a phone,./test_1k_compress/video9258.mp4,./frames_data/1k_multi_frames_4/video9258_fram...,./frames_data/1k_multi_frames_4/video9258_fram...,./frames_data/1k_multi_frames_4/video9258_fram...,./frames_data/1k_multi_frames_4/video9258_fram...,./frames_data/1k_multi_frames_8/video9258_fram...,./frames_data/1k_multi_frames_8/video9258_fram...,./frames_data/1k_multi_frames_8/video9258_fram...,./frames_data/1k_multi_frames_8/video9258_fram...,video9258
3,660,ret660,msr7365,video7365,cartoon people are eating at a restaurant,./test_1k_compress/video7365.mp4,./frames_data/1k_multi_frames_4/video7365_fram...,./frames_data/1k_multi_frames_4/video7365_fram...,./frames_data/1k_multi_frames_4/video7365_fram...,./frames_data/1k_multi_frames_4/video7365_fram...,./frames_data/1k_multi_frames_8/video7365_fram...,./frames_data/1k_multi_frames_8/video7365_fram...,./frames_data/1k_multi_frames_8/video7365_fram...,./frames_data/1k_multi_frames_8/video7365_fram...,video7365
4,411,ret411,msr8068,video8068,a woman on a couch talks to a a man,./test_1k_compress/video8068.mp4,./frames_data/1k_multi_frames_4/video8068_fram...,./frames_data/1k_multi_frames_4/video8068_fram...,./frames_data/1k_multi_frames_4/video8068_fram...,./frames_data/1k_multi_frames_4/video8068_fram...,./frames_data/1k_multi_frames_8/video8068_fram...,./frames_data/1k_multi_frames_8/video8068_fram...,./frames_data/1k_multi_frames_8/video8068_fram...,./frames_data/1k_multi_frames_8/video8068_fram...,video8068


In [25]:
# Preview the 8-frame samples with their extracted frame paths
eight_multi_frame_samples_df.head()

Unnamed: 0.1,Unnamed: 0,key,vid_key,video_id,sentence,video_path,frame_path_1,frame_path_2,frame_path_3,frame_path_4,frame_path_5,frame_path_6,frame_path_7,frame_path_8,frame_id
0,521,ret521,msr7579,video7579,a girl wearing red top and black trouser is pu...,./test_1k_compress/video7579.mp4,./frames_data/1k_multi_frames_8/video7579_fram...,./frames_data/1k_multi_frames_8/video7579_fram...,./frames_data/1k_multi_frames_8/video7579_fram...,./frames_data/1k_multi_frames_8/video7579_fram...,./frames_data/1k_multi_frames_8/video7579_fram...,./frames_data/1k_multi_frames_8/video7579_fram...,./frames_data/1k_multi_frames_8/video7579_fram...,./frames_data/1k_multi_frames_8/video7579_fram...,video7579
1,737,ret737,msr7725,video7725,young people sit around the edges of a room cl...,./test_1k_compress/video7725.mp4,./frames_data/1k_multi_frames_8/video7725_fram...,./frames_data/1k_multi_frames_8/video7725_fram...,./frames_data/1k_multi_frames_8/video7725_fram...,./frames_data/1k_multi_frames_8/video7725_fram...,./frames_data/1k_multi_frames_8/video7725_fram...,./frames_data/1k_multi_frames_8/video7725_fram...,./frames_data/1k_multi_frames_8/video7725_fram...,./frames_data/1k_multi_frames_8/video7725_fram...,video7725
2,740,ret740,msr9258,video9258,a person is using a phone,./test_1k_compress/video9258.mp4,./frames_data/1k_multi_frames_8/video9258_fram...,./frames_data/1k_multi_frames_8/video9258_fram...,./frames_data/1k_multi_frames_8/video9258_fram...,./frames_data/1k_multi_frames_8/video9258_fram...,./frames_data/1k_multi_frames_8/video9258_fram...,./frames_data/1k_multi_frames_8/video9258_fram...,./frames_data/1k_multi_frames_8/video9258_fram...,./frames_data/1k_multi_frames_8/video9258_fram...,video9258
3,660,ret660,msr7365,video7365,cartoon people are eating at a restaurant,./test_1k_compress/video7365.mp4,./frames_data/1k_multi_frames_8/video7365_fram...,./frames_data/1k_multi_frames_8/video7365_fram...,./frames_data/1k_multi_frames_8/video7365_fram...,./frames_data/1k_multi_frames_8/video7365_fram...,./frames_data/1k_multi_frames_8/video7365_fram...,./frames_data/1k_multi_frames_8/video7365_fram...,./frames_data/1k_multi_frames_8/video7365_fram...,./frames_data/1k_multi_frames_8/video7365_fram...,video7365
4,411,ret411,msr8068,video8068,a woman on a couch talks to a a man,./test_1k_compress/video8068.mp4,./frames_data/1k_multi_frames_8/video8068_fram...,./frames_data/1k_multi_frames_8/video8068_fram...,./frames_data/1k_multi_frames_8/video8068_fram...,./frames_data/1k_multi_frames_8/video8068_fram...,./frames_data/1k_multi_frames_8/video8068_fram...,./frames_data/1k_multi_frames_8/video8068_fram...,./frames_data/1k_multi_frames_8/video8068_fram...,./frames_data/1k_multi_frames_8/video8068_fram...,video8068


In [26]:
# We write the transformed samples data to a CSV file so it can be loaded into the load pipeline
def frame_path_cols(num_frames):
    """Return the column names `frame_path_1` .. `frame_path_<num_frames>` as a list."""
    return [f'frame_path_{position}' for position in range(1, num_frames + 1)]

# Exported column order: the frame paths first, then the id and caption columns
columns_4_multi = [*frame_path_cols(4), 'video_id', 'frame_id', 'sentence']
columns_8_multi = [*frame_path_cols(8), 'video_id', 'frame_id', 'sentence']

# Write only the selected columns, without the DataFrame index
four_multi_frame_samples_df.loc[:, columns_4_multi].to_csv(MSRVTT_SAMPLES_WITH_4_FRAMES, index=False)
eight_multi_frame_samples_df.loc[:, columns_8_multi].to_csv(MSRVTT_SAMPLES_WITH_8_FRAMES, index=False)

In [27]:
# Create one Milvus collection per experiment to store the pooled frame embeddings.
# 512 is passed as the second argument; the printed schema shows it as the vector `dim`.
milvus_utils.create_milvus_collection(FRAME_4_RET_COLLECTION, 512)
milvus_utils.create_milvus_collection(FRAME_8_RET_COLLECTION, 512)

<Collection>:
-------------
<name>: multi_frame_ret_8
<description>: video retrieval
<schema>: {'auto_id': False, 'description': 'video retrieval', 'fields': [{'name': 'id', 'description': '', 'type': <DataType.INT64: 5>, 'is_primary': True, 'auto_id': False}, {'name': 'embedding', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 512}}]}

In [72]:
def get_frame_loader_csv(num_frames):
    """Create a CSV row generator for a samples file with `num_frames` frame-path columns.

    Args:
        num_frames: Number of `frame_path_<i>` columns (1-based) to read from each row.

    Returns:
        A generator function `read_frame_loader_csv(csv_path, encoding='utf-8-sig')` that
        yields one tuple per CSV row:
        `(numeric_id, frame_path_1, ..., frame_path_<num_frames>)`,
        where `numeric_id` is the `frame_id` value with its `video` prefix removed,
        converted to `int`.
    """
    f_path_col_names = [f"frame_path_{i+1}" for i in range(num_frames)]
    id_prefix = 'video'

    def read_frame_loader_csv(csv_path, encoding='utf-8-sig'):
        """Yield `(numeric_id, *frame_paths)` for every row of `csv_path`.

        Raises:
            ValueError: If a `frame_id` does not start with `video` or its remainder is
                not an integer.
            KeyError: If `frame_id` or one of the expected frame-path columns is missing.
        """
        # newline='' is what the csv module expects, so it can handle line endings itself
        with open(csv_path, 'r', encoding=encoding, newline='') as f:
            for line in csv.DictReader(f):
                raw_id = line['frame_id']
                # Blindly slicing off len('video') characters would turn e.g. 'clip123'
                # into the id 23, so reject unexpected ids instead.
                if not raw_id.startswith(id_prefix):
                    raise ValueError(
                        f"frame_id {raw_id!r} in {csv_path!r} does not start with {id_prefix!r}"
                    )
                cleaned_id = raw_id[len(id_prefix):]

                # This yields cleaned_id, frame_path_1, frame_path_2, ... frame_path_num_frames
                yield (int(cleaned_id),) + tuple(line[col] for col in f_path_col_names)

    return read_frame_loader_csv

In [73]:
# Sanity check: pull the first row from the 8-frame loader to inspect the yielded tuple
I = get_frame_loader_csv(8)(MSRVTT_SAMPLES_WITH_8_FRAMES)
next(I)

(7579,
 './frames_data/1k_multi_frames_8/video7579_frame_001_of_008.jpg',
 './frames_data/1k_multi_frames_8/video7579_frame_002_of_008.jpg',
 './frames_data/1k_multi_frames_8/video7579_frame_003_of_008.jpg',
 './frames_data/1k_multi_frames_8/video7579_frame_004_of_008.jpg',
 './frames_data/1k_multi_frames_8/video7579_frame_005_of_008.jpg',
 './frames_data/1k_multi_frames_8/video7579_frame_006_of_008.jpg',
 './frames_data/1k_multi_frames_8/video7579_frame_007_of_008.jpg',
 './frames_data/1k_multi_frames_8/video7579_frame_008_of_008.jpg')

In [74]:
# Create loader pipelines for 4 and 8 multi frames
def build_multi_frame_loader_pipeline(num_frames, collection_name, csv_loader, device='mps'):
    """Build a towhee pipeline that stores one mean-pooled CLIP vector per video in Milvus.

    Pipeline stages, in order:
      1. `csv_loader` expands the input CSV into rows of `(frame_id, f_path_1, ..., f_path_N)`.
      2. Every frame path is decoded to an RGB image.
      3. Every image is embedded with the CLIP image encoder (`clip_vit_base_patch16`).
      4. The N frame vectors are averaged element-wise and the mean is L2-normalised.
      5. `(frame_id, vec)` is inserted into `collection_name`.

    Args:
        num_frames: Number of frames per video (N); must match what `csv_loader` yields.
        collection_name: Milvus collection the pooled vectors are inserted into.
        csv_loader: Generator function yielding `(frame_id, *frame_paths)` per CSV row.
        device: Device string passed to the CLIP operator.

    Returns:
        The assembled towhee pipeline; call it with the path of the samples CSV.
    """
    frame_numbers = range(1, num_frames + 1)
    path_cols = tuple(f'f_path_{n}' for n in frame_numbers)
    vec_cols = tuple(f'vec{n}' for n in frame_numbers)

    pipeline = (
        pipe.input('csv_file')
        .flat_map('csv_file', ('frame_id',) + path_cols, csv_loader)
    )
    # Same node order as the hand-written version: decode every frame, then embed every frame
    for n in frame_numbers:
        pipeline = pipeline.map(f'f_path_{n}', f'img{n}', ops.image_decode.cv2('rgb'))
    for n in frame_numbers:
        pipeline = pipeline.map(
            f'img{n}',
            f'vec{n}',
            ops.image_text_embedding.clip(model_name='clip_vit_base_patch16', modality='image', device=device),
        )

    return (
        pipeline
        # Mean pooling over the per-frame vectors
        .map(vec_cols, 'vec', lambda *frame_vecs: np.mean(frame_vecs, axis=0))
        # L2-normalise the pooled vector
        .map('vec', 'vec', lambda x: x / np.linalg.norm(x))
        .map(('frame_id', 'vec'), (), ops.ann_insert.milvus_client(collection_name=collection_name))
        .output('frame_id')
    )


four_frame_csv_loader = get_frame_loader_csv(4)
multi_4_frame_loader_pipeline = build_multi_frame_loader_pipeline(
    4, FRAME_4_RET_COLLECTION, four_frame_csv_loader
)

eight_frame_csv_loader = get_frame_loader_csv(8)
multi_8_frame_loader_pipeline = build_multi_frame_loader_pipeline(
    8, FRAME_8_RET_COLLECTION, eight_frame_csv_loader
)

2025-04-17 14:26:53,194 - 8454604864 - connectionpool.py-connectionpool:544 - DEBUG: https://huggingface.co:443 "HEAD /openai/clip-vit-base-patch16/resolve/main/config.json HTTP/1.1" 200 0
2025-04-17 14:26:53,235 - 8454604864 - connectionpool.py-connectionpool:544 - DEBUG: https://huggingface.co:443 "HEAD /openai/clip-vit-base-patch16/resolve/main/config.json HTTP/1.1" 200 0
2025-04-17 14:26:53,280 - 8454604864 - connectionpool.py-connectionpool:544 - DEBUG: https://huggingface.co:443 "HEAD /openai/clip-vit-base-patch16/resolve/main/model.safetensors HTTP/1.1" 404 0
2025-04-17 14:26:53,285 - 23280021504 - connectionpool.py-connectionpool:1049 - DEBUG: Starting new HTTPS connection (1): huggingface.co:443
2025-04-17 14:26:53,475 - 8454604864 - connectionpool.py-connectionpool:544 - DEBUG: https://huggingface.co:443 "HEAD /openai/clip-vit-base-patch16/resolve/main/tokenizer_config.json HTTP/1.1" 200 0
2025-04-17 14:26:53,570 - 8454604864 - connectionpool.py-connectionpool:544 - DEBUG: ht

In [75]:
# Run both loader pipelines.
# Each call reads its samples CSV, embeds the listed frames and inserts the pooled
# vectors into the pipeline's Milvus collection.
multi_4_frame_loader_pipeline(MSRVTT_SAMPLES_WITH_4_FRAMES)
print('4 frame loader pipeline finished.')
multi_8_frame_loader_pipeline(MSRVTT_SAMPLES_WITH_8_FRAMES)
print('8 frame loader pipeline finished.')

2025-04-17 14:27:06,803 - 17664241664 - node.py-node:167 - INFO: Begin to run Node-_input
2025-04-17 14:27:06,810 - 17697894400 - node.py-node:167 - INFO: Begin to run Node-read_frame_loader_csv-0


2025-04-17 14:27:06,811 - 17664241664 - node.py-node:167 - INFO: Begin to run Node-image-decode/cv2-1
2025-04-17 14:27:06,815 - 17681068032 - node.py-node:167 - INFO: Begin to run Node-image-decode/cv2-2
2025-04-17 14:27:06,820 - 17731547136 - node.py-node:167 - INFO: Begin to run Node-image-decode/cv2-3
2025-04-17 14:27:06,852 - 17714720768 - node.py-node:167 - INFO: Begin to run Node-image-decode/cv2-4
2025-04-17 14:27:06,856 - 17765199872 - node.py-node:167 - INFO: Begin to run Node-image-text-embedding/clip-5
2025-04-17 14:27:06,874 - 17748373504 - node.py-node:167 - INFO: Begin to run Node-image-text-embedding/clip-6
2025-04-17 14:27:06,885 - 17782026240 - node.py-node:167 - INFO: Begin to run Node-image-text-embedding/clip-7
2025-04-17 14:27:06,895 - 17697894400 - node.py-node:167 - INFO: Begin to run Node-image-text-embedding/clip-8
2025-04-17 14:27:06,899 - 17798852608 - node.py-node:167 - INFO: Begin to run Node-lambda-9
2025-04-17 14:27:06,902 - 17815678976 - node.py-node:167

4 frame loader pipeline finished.


2025-04-17 14:30:18,703 - 18093535232 - node.py-node:167 - INFO: Begin to run Node-lambda-17
2025-04-17 14:30:18,707 - 18144014336 - node.py-node:167 - INFO: Begin to run Node-lambda-18
2025-04-17 14:30:18,709 - 18110361600 - node.py-node:167 - INFO: Begin to run Node-ann-insert/milvus-client-19
2025-04-17 14:30:18,712 - 18160840704 - node.py-node:167 - INFO: Begin to run Node-_output


8 frame loader pipeline finished.


2025-04-17 14:55:51,518 - 17664241664 - node.py-node:167 - INFO: Begin to run Node-_input
2025-04-17 14:55:51,520 - 17681068032 - node.py-node:167 - INFO: Begin to run Node-read_frame_loader_csv-0
2025-04-17 14:55:51,523 - 17731547136 - node.py-node:167 - INFO: Begin to run Node-image-decode/cv2-1
2025-04-17 14:55:51,524 - 17714720768 - node.py-node:167 - INFO: Begin to run Node-image-decode/cv2-3
2025-04-17 14:55:51,524 - 17664241664 - node.py-node:167 - INFO: Begin to run Node-image-decode/cv2-4
2025-04-17 14:55:51,524 - 17765199872 - node.py-node:167 - INFO: Begin to run Node-image-text-embedding/clip-5
2025-04-17 14:55:51,525 - 17748373504 - node.py-node:167 - INFO: Begin to run Node-image-text-embedding/clip-6
2025-04-17 14:55:51,525 - 17782026240 - node.py-node:167 - INFO: Begin to run Node-image-text-embedding/clip-7
2025-04-17 14:55:51,527 - 17697894400 - node.py-node:167 - INFO: Begin to run Node-lambda-9
2025-04-17 14:55:51,527 - 17798852608 - node.py-node:167 - INFO: Begin t

In [82]:
def read_frame_search_csv(csv_file):
    """Yield `(frame_id, sentence)` for every row of the samples CSV.

    Args:
        csv_file: Path of a CSV file with `frame_id` and `sentence` columns. It is read as
            `utf-8-sig`, so a leading byte-order mark is ignored.

    Yields:
        Tuples of the raw `frame_id` string and the caption `sentence`.

    Raises:
        KeyError: If a row lacks the `frame_id` or `sentence` column.
    """
    # newline='' lets the csv module handle line endings itself, so line breaks inside
    # quoted captions are returned unchanged.
    with open(csv_file, 'r', encoding='utf-8-sig', newline='') as f:
        for line in csv.DictReader(f):
            yield line['frame_id'], line['sentence']
            
def build_frame_search_pipeline(collection_name):
    """Build a towhee text-to-video search pipeline over one Milvus collection.

    Pipeline stages, in order:
      1. `read_frame_search_csv` expands the input CSV into `(rel_frame_id, query)` rows.
      2. The query text is embedded with the CLIP text encoder and L2-normalised.
      3. The 15 nearest entries of `collection_name` are retrieved.
      4. The result list is cut into its top-1, top-5, top-10 and top-15 prefixes.

    Args:
        collection_name: Milvus collection to search.

    Returns:
        The assembled towhee pipeline; call it with the path of the query CSV.
    """
    return (
        pipe.input('csv_file')
        .flat_map('csv_file', ('rel_frame_id', 'query'), read_frame_search_csv)
        .map('query', 'vec', ops.image_text_embedding.clip(model_name='clip_vit_base_patch16', modality='text', device='mps'))
        .map('vec', 'vec', lambda x: x / np.linalg.norm(x))
        .map('vec', 'top15_raw_res', ops.ann_search.milvus_client(collection_name=collection_name, limit=15))
        .map('top15_raw_res', ('top1', 'top5', 'top10', 'top15'), lambda x: (x[:1], x[:5], x[:10], x[:15]))
        .output('rel_frame_id', 'query', 'top1', 'top5', 'top10', 'top15')
    )


# Identical search pipelines; only the collection that is queried differs
multi_4_frame_search_pipeline = build_frame_search_pipeline(FRAME_4_RET_COLLECTION)
multi_8_frame_search_pipeline = build_frame_search_pipeline(FRAME_8_RET_COLLECTION)

2025-04-17 14:46:50,584 - 8454604864 - connectionpool.py-connectionpool:544 - DEBUG: https://huggingface.co:443 "HEAD /openai/clip-vit-base-patch16/resolve/main/config.json HTTP/1.1" 200 0
2025-04-17 14:46:50,666 - 8454604864 - connectionpool.py-connectionpool:544 - DEBUG: https://huggingface.co:443 "HEAD /openai/clip-vit-base-patch16/resolve/main/config.json HTTP/1.1" 200 0
2025-04-17 14:46:50,708 - 8454604864 - connectionpool.py-connectionpool:544 - DEBUG: https://huggingface.co:443 "HEAD /openai/clip-vit-base-patch16/resolve/main/model.safetensors HTTP/1.1" 404 0
2025-04-17 14:46:50,713 - 13520564224 - connectionpool.py-connectionpool:1049 - DEBUG: Starting new HTTPS connection (1): huggingface.co:443
2025-04-17 14:46:50,767 - 13520564224 - connectionpool.py-connectionpool:544 - DEBUG: https://huggingface.co:443 "GET /api/models/openai/clip-vit-base-patch16 HTTP/1.1" 200 3499
2025-04-17 14:46:50,838 - 13520564224 - connectionpool.py-connectionpool:544 - DEBUG: https://huggingface.co

In [83]:
# Run the 4-frame search pipeline over the captions in the samples CSV and preview the results
multi_4_frame_msr_results = DataCollection(multi_4_frame_search_pipeline(MSRVTT_SAMPLES_WITH_4_FRAMES))
multi_4_frame_msr_results.show()

2025-04-17 14:46:52,278 - 13473230848 - node.py-node:167 - INFO: Begin to run Node-_input
2025-04-17 14:46:52,280 - 13490057216 - node.py-node:167 - INFO: Begin to run Node-read_frame_search_csv-0
2025-04-17 14:46:52,281 - 13473230848 - node.py-node:167 - INFO: Begin to run Node-image-text-embedding/clip-1
2025-04-17 14:46:52,282 - 13523709952 - node.py-node:167 - INFO: Begin to run Node-lambda-2
2025-04-17 14:46:52,283 - 13506883584 - node.py-node:167 - INFO: Begin to run Node-ann-search/milvus-client-3
2025-04-17 14:46:52,287 - 13540536320 - node.py-node:167 - INFO: Begin to run Node-lambda-4
2025-04-17 14:46:52,289 - 15042899968 - node.py-node:167 - INFO: Begin to run Node-_output


rel_frame_id,query,top1,top5,top10,top15
video7579,a girl wearing red top and black trouser is putting a sweater on a dog,"[[7579, 1.3646539449691772]] len=1","[[7579, 1.3646539449691772],[9451, 1.4305797815322876],[9603, 1.4387223720550537],[9405, 1.4451756477355957],...] len=5","[[7579, 1.3646539449691772],[9451, 1.4305797815322876],[9603, 1.4387223720550537],[9405, 1.4451756477355957],...] len=10","[[7579, 1.3646539449691772],[9451, 1.4305797815322876],[9603, 1.4387223720550537],[9405, 1.4451756477355957],...] len=15"
video7725,young people sit around the edges of a room clapping and raising their arms while others dance in the center during a party,"[[8441, 1.4233133792877197]] len=1","[[8441, 1.4233133792877197],[7444, 1.4263453483581543],[7725, 1.4346461296081543],[8339, 1.4430490732192993],...] len=5","[[8441, 1.4233133792877197],[7444, 1.4263453483581543],[7725, 1.4346461296081543],[8339, 1.4430490732192993],...] len=10","[[8441, 1.4233133792877197],[7444, 1.4263453483581543],[7725, 1.4346461296081543],[8339, 1.4430490732192993],...] len=15"
video9258,a person is using a phone,"[[9257, 1.4228744506835938]] len=1","[[9257, 1.4228744506835938],[9697, 1.4255503416061401],[9829, 1.4330558776855469],[9258, 1.4334453344345093],...] len=5","[[9257, 1.4228744506835938],[9697, 1.4255503416061401],[9829, 1.4330558776855469],[9258, 1.4334453344345093],...] len=10","[[9257, 1.4228744506835938],[9697, 1.4255503416061401],[9829, 1.4330558776855469],[9258, 1.4334453344345093],...] len=15"
video7365,cartoon people are eating at a restaurant,"[[9777, 1.3961538076400757]] len=1","[[9777, 1.3961538076400757],[7365, 1.4227542877197266],[9537, 1.4308252334594727],[7741, 1.4411455392837524],...] len=5","[[9777, 1.3961538076400757],[7365, 1.4227542877197266],[9537, 1.4308252334594727],[7741, 1.4411455392837524],...] len=10","[[9777, 1.3961538076400757],[7365, 1.4227542877197266],[9537, 1.4308252334594727],[7741, 1.4411455392837524],...] len=15"
video8068,a woman on a couch talks to a a man,"[[7724, 1.3794373273849487]] len=1","[[7724, 1.3794373273849487],[7234, 1.4042013883590698],[7341, 1.4079279899597168],[9347, 1.408487319946289],...] len=5","[[7724, 1.3794373273849487],[7234, 1.4042013883590698],[7341, 1.4079279899597168],[9347, 1.408487319946289],...] len=10","[[7724, 1.3794373273849487],[7234, 1.4042013883590698],[7341, 1.4079279899597168],[9347, 1.408487319946289],...] len=15"


In [84]:
# Run the 8-frame search pipeline over the captions in the samples CSV and preview the results
multi_8_frame_msr_results = DataCollection(multi_8_frame_search_pipeline(MSRVTT_SAMPLES_WITH_8_FRAMES))
multi_8_frame_msr_results.show()

2025-04-17 14:47:21,407 - 13456404480 - node.py-node:167 - INFO: Begin to run Node-_input
2025-04-17 14:47:21,409 - 14935945216 - node.py-node:167 - INFO: Begin to run Node-read_frame_search_csv-0
2025-04-17 14:47:21,410 - 13456404480 - node.py-node:167 - INFO: Begin to run Node-image-text-embedding/clip-1
2025-04-17 14:47:21,411 - 15059726336 - node.py-node:167 - INFO: Begin to run Node-lambda-2
2025-04-17 14:47:21,412 - 15093379072 - node.py-node:167 - INFO: Begin to run Node-ann-search/milvus-client-3
2025-04-17 14:47:21,413 - 15110205440 - node.py-node:167 - INFO: Begin to run Node-lambda-4
2025-04-17 14:47:21,416 - 15127031808 - node.py-node:167 - INFO: Begin to run Node-_output


rel_frame_id,query,top1,top5,top10,top15
video7579,a girl wearing red top and black trouser is putting a sweater on a dog,"[[7579, 1.360194444656372]] len=1","[[7579, 1.360194444656372],[9451, 1.4077376127243042],[7730, 1.4435832500457764],[9034, 1.4514113664627075],...] len=5","[[7579, 1.360194444656372],[9451, 1.4077376127243042],[7730, 1.4435832500457764],[9034, 1.4514113664627075],...] len=10","[[7579, 1.360194444656372],[9451, 1.4077376127243042],[7730, 1.4435832500457764],[9034, 1.4514113664627075],...] len=15"
video7725,young people sit around the edges of a room clapping and raising their arms while others dance in the center during a party,"[[7725, 1.3989646434783936]] len=1","[[7725, 1.3989646434783936],[7444, 1.4208428859710693],[8556, 1.4406566619873047],[8441, 1.4462244510650635],...] len=5","[[7725, 1.3989646434783936],[7444, 1.4208428859710693],[8556, 1.4406566619873047],[8441, 1.4462244510650635],...] len=10","[[7725, 1.3989646434783936],[7444, 1.4208428859710693],[8556, 1.4406566619873047],[8441, 1.4462244510650635],...] len=15"
video9258,a person is using a phone,"[[9257, 1.422074317932129]] len=1","[[9257, 1.422074317932129],[9697, 1.4267330169677734],[9258, 1.4296905994415283],[8945, 1.4394712448120117],...] len=5","[[9257, 1.422074317932129],[9697, 1.4267330169677734],[9258, 1.4296905994415283],[8945, 1.4394712448120117],...] len=10","[[9257, 1.422074317932129],[9697, 1.4267330169677734],[9258, 1.4296905994415283],[8945, 1.4394712448120117],...] len=15"
video7365,cartoon people are eating at a restaurant,"[[9777, 1.3951191902160645]] len=1","[[9777, 1.3951191902160645],[9537, 1.4230724573135376],[7365, 1.4236586093902588],[7741, 1.4339861869812012],...] len=5","[[9777, 1.3951191902160645],[9537, 1.4230724573135376],[7365, 1.4236586093902588],[7741, 1.4339861869812012],...] len=10","[[9777, 1.3951191902160645],[9537, 1.4230724573135376],[7365, 1.4236586093902588],[7741, 1.4339861869812012],...] len=15"
video8068,a woman on a couch talks to a a man,"[[7724, 1.3635627031326294]] len=1","[[7724, 1.3635627031326294],[7341, 1.4104743003845215],[9347, 1.4147610664367676],[7685, 1.4171432256698608],...] len=5","[[7724, 1.3635627031326294],[7341, 1.4104743003845215],[9347, 1.4147610664367676],[7685, 1.4171432256698608],...] len=10","[[7724, 1.3635627031326294],[7341, 1.4104743003845215],[9347, 1.4147610664367676],[7685, 1.4171432256698608],...] len=15"


In [95]:
# Convert the 4-frame search results to a DataFrame and persist them for later analysis
multi_4_frame_msr_results_df = twohee_data_col_to_df(multi_4_frame_msr_results)
# Create the output directory if needed so the export also works on a fresh checkout
os.makedirs('query_results', exist_ok=True)
multi_4_frame_msr_results_df.to_csv('query_results/multi_4_frame_queries_msrvtt.csv', index=False)
multi_4_frame_msr_results_df

Unnamed: 0,rel_frame_id,query,top1,top5,top10,top15,ground_truth
0,video7579,a girl wearing red top and black trouser is pu...,"[[7579, 1.3646539449691772]]","[[7579, 1.3646539449691772], [9451, 1.43057978...","[[7579, 1.3646539449691772], [9451, 1.43057978...","[[7579, 1.3646539449691772], [9451, 1.43057978...",7579
1,video7725,young people sit around the edges of a room cl...,"[[8441, 1.4233133792877197]]","[[8441, 1.4233133792877197], [7444, 1.42634534...","[[8441, 1.4233133792877197], [7444, 1.42634534...","[[8441, 1.4233133792877197], [7444, 1.42634534...",7725
2,video9258,a person is using a phone,"[[9257, 1.4228744506835938]]","[[9257, 1.4228744506835938], [9697, 1.42555034...","[[9257, 1.4228744506835938], [9697, 1.42555034...","[[9257, 1.4228744506835938], [9697, 1.42555034...",9258
3,video7365,cartoon people are eating at a restaurant,"[[9777, 1.3961538076400757]]","[[9777, 1.3961538076400757], [7365, 1.42275428...","[[9777, 1.3961538076400757], [7365, 1.42275428...","[[9777, 1.3961538076400757], [7365, 1.42275428...",7365
4,video8068,a woman on a couch talks to a a man,"[[7724, 1.3794373273849487]]","[[7724, 1.3794373273849487], [7234, 1.40420138...","[[7724, 1.3794373273849487], [7234, 1.40420138...","[[7724, 1.3794373273849487], [7234, 1.40420138...",8068
...,...,...,...,...,...,...,...
995,video7034,man in black shirt is holding a baby upside do...,"[[9885, 1.4678269624710083]]","[[9885, 1.4678269624710083], [9320, 1.47455167...","[[9885, 1.4678269624710083], [9320, 1.47455167...","[[9885, 1.4678269624710083], [9320, 1.47455167...",7034
996,video7568,the queen of england is seen walking with an e...,"[[7568, 1.2751219272613525]]","[[7568, 1.2751219272613525], [7116, 1.45783209...","[[7568, 1.2751219272613525], [7116, 1.45783209...","[[7568, 1.2751219272613525], [7116, 1.45783209...",7568
997,video7979,people talking about a fight,"[[7211, 1.4365811347961426]]","[[7211, 1.4365811347961426], [7501, 1.43993902...","[[7211, 1.4365811347961426], [7501, 1.43993902...","[[7211, 1.4365811347961426], [7501, 1.43993902...",7979
998,video7356,a vehicle with details on what comes with it b...,"[[7356, 1.3352746963500977]]","[[7356, 1.3352746963500977], [8819, 1.39669334...","[[7356, 1.3352746963500977], [8819, 1.39669334...","[[7356, 1.3352746963500977], [8819, 1.39669334...",7356


In [96]:
# Convert the 8-frame search results to a DataFrame and persist them for later analysis
multi_8_frame_msr_results_df = twohee_data_col_to_df(multi_8_frame_msr_results)
# Create the output directory if needed so the export also works on a fresh checkout
os.makedirs('query_results', exist_ok=True)
multi_8_frame_msr_results_df.to_csv('query_results/multi_8_frame_queries_msrvtt.csv', index=False)
multi_8_frame_msr_results_df

Unnamed: 0,rel_frame_id,query,top1,top5,top10,top15,ground_truth
0,video7579,a girl wearing red top and black trouser is pu...,"[[7579, 1.360194444656372]]","[[7579, 1.360194444656372], [9451, 1.407737612...","[[7579, 1.360194444656372], [9451, 1.407737612...","[[7579, 1.360194444656372], [9451, 1.407737612...",7579
1,video7725,young people sit around the edges of a room cl...,"[[7725, 1.3989646434783936]]","[[7725, 1.3989646434783936], [7444, 1.42084288...","[[7725, 1.3989646434783936], [7444, 1.42084288...","[[7725, 1.3989646434783936], [7444, 1.42084288...",7725
2,video9258,a person is using a phone,"[[9257, 1.422074317932129]]","[[9257, 1.422074317932129], [9697, 1.426733016...","[[9257, 1.422074317932129], [9697, 1.426733016...","[[9257, 1.422074317932129], [9697, 1.426733016...",9258
3,video7365,cartoon people are eating at a restaurant,"[[9777, 1.3951191902160645]]","[[9777, 1.3951191902160645], [9537, 1.42307245...","[[9777, 1.3951191902160645], [9537, 1.42307245...","[[9777, 1.3951191902160645], [9537, 1.42307245...",7365
4,video8068,a woman on a couch talks to a a man,"[[7724, 1.3635627031326294]]","[[7724, 1.3635627031326294], [7341, 1.41047430...","[[7724, 1.3635627031326294], [7341, 1.41047430...","[[7724, 1.3635627031326294], [7341, 1.41047430...",8068
...,...,...,...,...,...,...,...
995,video7034,man in black shirt is holding a baby upside do...,"[[9522, 1.4664983749389648]]","[[9522, 1.4664983749389648], [9320, 1.47163987...","[[9522, 1.4664983749389648], [9320, 1.47163987...","[[9522, 1.4664983749389648], [9320, 1.47163987...",7034
996,video7568,the queen of england is seen walking with an e...,"[[7568, 1.2713391780853271]]","[[7568, 1.2713391780853271], [7116, 1.43614828...","[[7568, 1.2713391780853271], [7116, 1.43614828...","[[7568, 1.2713391780853271], [7116, 1.43614828...",7568
997,video7979,people talking about a fight,"[[7211, 1.4263927936553955]]","[[7211, 1.4263927936553955], [7835, 1.44549965...","[[7211, 1.4263927936553955], [7835, 1.44549965...","[[7211, 1.4263927936553955], [7835, 1.44549965...",7979
998,video7356,a vehicle with details on what comes with it b...,"[[7356, 1.3739084005355835]]","[[7356, 1.3739084005355835], [9358, 1.39286637...","[[7356, 1.3739084005355835], [9358, 1.39286637...","[[7356, 1.3739084005355835], [9358, 1.39286637...",7356


In [90]:
# Get eval metrics for both experiments so they can be compared side by side
# (`pprint` is imported in the notebook's import cell)
print("Metrics for 4 frame results")
pprint(get_all_eval_scores(multi_4_frame_msr_results_df))
print("Metrics for 8 frame results")
pprint(get_all_eval_scores(multi_8_frame_msr_results_df))

Metrics for 4 frame results
{'map': 0.4118039682539681,
 'ndcg@1': 0.311,
 'ndcg@10': 0.46854068360509477,
 'ndcg@5': 0.4313712173413061,
 'recall@1': 0.311,
 'recall@10': 0.651,
 'recall@5': 0.537}
Metrics for 8 frame results
{'map': 0.42762857142857114,
 'ndcg@1': 0.332,
 'ndcg@10': 0.48185493150879904,
 'ndcg@5': 0.44768332562758373,
 'recall@1': 0.332,
 'recall@10': 0.656,
 'recall@5': 0.551}


# Try evaluation against queries from FIRE benchmark

We are working with a sample of MSR-VTT, and our evaluation pipeline supports only one relevant query per video. Hence, we need to filter the full FIRE benchmark so that it includes only the videos we have sampled and only those with a single relevant result.

The `FIRE_BENCHMARK_Q_JUDGEMENTS` file is created in the notebook `./clean_fire_judgements.ipynb`.

In [94]:
# Run query pipeline using FIRE


# CSV parser function and pipeline recreated since the FIRE csv uses `video_id` instead of `frame_id`
def read_frame_search_fire_csv(csv_file):
    """Yield one ``(video_id, sentence)`` pair per data row of a FIRE judgements CSV.

    Args:
        csv_file: Path to a CSV file whose header row contains the
            ``video_id`` and ``sentence`` columns.

    Yields:
        tuple: ``(video_id, sentence)`` for each row, in file order.

    Raises:
        KeyError: If the header has no ``video_id`` or ``sentence`` column.
    """
    # 'utf-8-sig' drops a leading byte-order mark so it cannot end up glued to
    # the first header name. newline='' disables universal-newline translation,
    # as the csv module requires for file objects; without it, line breaks
    # embedded in quoted sentences would be rewritten before the reader sees them.
    with open(csv_file, 'r', encoding='utf-8-sig', newline='') as f:
        for row in csv.DictReader(f):
            yield row['video_id'], row['sentence']
            
def _build_fire_search_pipeline(collection_name, device='mps'):
    """Build a text-to-video search pipeline over one Milvus collection.

    The 4-frame and 8-frame experiments use the same stages and differ only in
    the collection they search, so both pipelines are produced by this builder.

    Args:
        collection_name: Name of the Milvus collection to search.
        device: Device string passed to the CLIP text-embedding operator.
            Defaults to 'mps', the value the notebook was originally run with.

    Returns:
        A pipeline that takes the path of a FIRE query CSV and outputs, per
        query, ``rel_frame_id``, ``query`` and the first 1/5/10/15 search hits.
    """
    return (
        pipe.input('csv_file')
        .flat_map('csv_file', ('rel_frame_id', 'query'), read_frame_search_fire_csv)
        .map('query', 'vec', ops.image_text_embedding.clip(model_name='clip_vit_base_patch16', modality='text', device=device))
        # Scale the query embedding by its norm before searching
        # (assumes a 1-D vector, i.e. unit L2 length afterwards; TODO confirm).
        .map('vec', 'vec', lambda x: x / np.linalg.norm(x))
        # NOTE: despite its name, 'top10_raw_res' holds up to 15 hits (limit=15).
        .map('vec', 'top10_raw_res', ops.ann_search.milvus_client(collection_name=collection_name, limit=15))
        .map('top10_raw_res', ('top1', 'top5', 'top10', 'top15'), lambda x: (x[:1], x[:5], x[:10], x[:15]))
        .output('rel_frame_id', 'query', 'top1', 'top5', 'top10', 'top15')
    )


multi_4_frame_search_fire_pipeline = _build_fire_search_pipeline(FRAME_4_RET_COLLECTION)
multi_8_frame_search_fire_pipeline = _build_fire_search_pipeline(FRAME_8_RET_COLLECTION)

# Run every FIRE query against both collections.
fire_query_results_4 = DataCollection(multi_4_frame_search_fire_pipeline(FIRE_BENCHMARK_Q_JUDGEMENTS))
fire_query_results_8 = DataCollection(multi_8_frame_search_fire_pipeline(FIRE_BENCHMARK_Q_JUDGEMENTS))



2025-04-17 14:58:15,028 - 8454604864 - connectionpool.py-connectionpool:544 - DEBUG: https://huggingface.co:443 "HEAD /openai/clip-vit-base-patch16/resolve/main/config.json HTTP/1.1" 200 0
2025-04-17 14:58:15,070 - 8454604864 - connectionpool.py-connectionpool:544 - DEBUG: https://huggingface.co:443 "HEAD /openai/clip-vit-base-patch16/resolve/main/config.json HTTP/1.1" 200 0
2025-04-17 14:58:15,115 - 8454604864 - connectionpool.py-connectionpool:544 - DEBUG: https://huggingface.co:443 "HEAD /openai/clip-vit-base-patch16/resolve/main/model.safetensors HTTP/1.1" 404 0
2025-04-17 14:58:15,120 - 16535318528 - connectionpool.py-connectionpool:1049 - DEBUG: Starting new HTTPS connection (1): huggingface.co:443
2025-04-17 14:58:15,184 - 16535318528 - connectionpool.py-connectionpool:544 - DEBUG: https://huggingface.co:443 "GET /api/models/openai/clip-vit-base-patch16 HTTP/1.1" 200 3499
2025-04-17 14:58:15,638 - 16535318528 - connectionpool.py-connectionpool:544 - DEBUG: https://huggingface.co

In [97]:
# Save FIRE queries
# DataFrame.to_csv does not create missing parent directories, so make sure the
# output folder exists before writing (a no-op when it is already there).
os.makedirs('query_results', exist_ok=True)

fire_query_results_4_df = twohee_data_col_to_df(fire_query_results_4)
fire_query_results_4_df.to_csv('query_results/multi_4_frame_queries_fire.csv', index=False)
fire_query_results_8_df = twohee_data_col_to_df(fire_query_results_8)
fire_query_results_8_df.to_csv('query_results/multi_8_frame_queries_fire.csv', index=False)


In [98]:
# Score both FIRE runs first, then print each metrics dict under its heading
multi_4_fire_scores = get_all_eval_scores(fire_query_results_4_df)
multi_8_fire_scores = get_all_eval_scores(fire_query_results_8_df)

for fire_heading, fire_scores in (
    ("FIRE Metrics for 4 frame results", multi_4_fire_scores),
    ("FIRE Metrics for 8 frame results", multi_8_fire_scores),
):
    print(fire_heading)
    pprint(fire_scores)

FIRE Metrics for 4 frame results
{'map': 0.5466370943281771,
 'ndcg@1': 0.445859872611465,
 'ndcg@10': 0.5940961890411308,
 'ndcg@5': 0.5800861023425389,
 'recall@1': 0.445859872611465,
 'recall@10': 0.7420382165605095,
 'recall@5': 0.697452229299363}
FIRE Metrics for 8 frame results
{'map': 0.5742796481649985,
 'ndcg@1': 0.49044585987261147,
 'ndcg@10': 0.616343918349418,
 'ndcg@5': 0.5961737289559949,
 'recall@1': 0.49044585987261147,
 'recall@10': 0.7484076433121019,
 'recall@5': 0.6878980891719745}


## Final scores for multi-frame-based video retrieval  

In [103]:
# Row label -> query-results frame, in the order the rows appear in the table
labelled_results = {
    '4-Frames - MSRVTT Annotations': multi_4_frame_msr_results_df,
    '8-Frames - MSRVTT Annotations': multi_8_frame_msr_results_df,
    '4-Frames - FIRE Queries': fire_query_results_4_df,
    '8-Frames - FIRE Queries': fire_query_results_8_df,
}

# One metrics dict per row; the row labels become the index
scores_df = pd.DataFrame(
    [get_all_eval_scores(results_df) for results_df in labelled_results.values()],
    index=list(labelled_results),
)

styled_scores = (
    scores_df.style
    .set_caption("Multi-Frame-based Retrieval Comparison")
    .format("{:.3f}")
)

styled_scores

Unnamed: 0,recall@1,recall@5,recall@10,map,ndcg@1,ndcg@5,ndcg@10
4-Frames - MSRVTT Annotations,0.311,0.537,0.651,0.412,0.311,0.431,0.469
8-Frames - MSRVTT Annotations,0.332,0.551,0.656,0.428,0.332,0.448,0.482
4-Frames - FIRE Queries,0.446,0.697,0.742,0.547,0.446,0.58,0.594
8-Frames - FIRE Queries,0.49,0.688,0.748,0.574,0.49,0.596,0.616
