Video retrieval by embedding single frames using CLIP

In [None]:
import os
import csv
from towhee import ops, pipe
from towhee import DataCollection
from tqdm import tqdm
import pandas as pd
import numpy as np
from helpers import milvus_utils
from helpers.extract_frames import extract_frame
from helpers.eval_utils import twohee_data_col_to_df, get_all_eval_scores

In [2]:
# CONSTANTS

# Files
# Sampled MSR-VTT annotations; read below for its video_id / video_path / sentence columns
MSRVTT_SAMPLES = "./MSRVTT_1K.csv"
# Written by this notebook: one row per video with the path and id of its extracted frame
MSRVTT_SAMPLES_SINGLE_FRAME = "./MSRVTT_1K_frames_single.csv"
# Directory the extracted .jpg frames are written to
SINGLE_FRAMES_DIR = "./1k_frames_single"
# File created using raw FIRE judgements, see clean_fire_judgements.ipynb
FIRE_BENCHMARK_Q_JUDGEMENTS = "./fire_benchmark_q_judgements.csv" 


# Database Collections
# Name of the Milvus collection that stores the frame embeddings
FRAME_RET_COLLECTION = "single_frame_ret"

In [3]:
# Load the sampled MSR-VTT annotations and preview the three columns this notebook relies on
raw_samples_df = pd.read_csv(MSRVTT_SAMPLES)
raw_samples_df[['video_id', 'video_path', 'sentence']].head()

Unnamed: 0,video_id,video_path,sentence
0,video7579,./test_1k_compress/video7579.mp4,a girl wearing red top and black trouser is pu...
1,video7725,./test_1k_compress/video7725.mp4,young people sit around the edges of a room cl...
2,video9258,./test_1k_compress/video9258.mp4,a person is using a phone
3,video7365,./test_1k_compress/video7365.mp4,cartoon people are eating at a restaurant
4,video8068,./test_1k_compress/video8068.mp4,a woman on a couch talks to a a man


## Embedding Single Video Frames

Before we embed video frames, we need to extract and/or construct a single frame from each of the 1000 videos.

In [4]:
def create_single_frame_dataset(raw_samples_df):
    """Extract one frame per video and record where each frame was written.

    For every row, the video at ``video_path`` is handed to ``extract_frame``
    together with a target path of the form
    ``<SINGLE_FRAMES_DIR>/<video file name without extension>.jpg``.

    Args:
        raw_samples_df: DataFrame with a ``video_path`` column. It is not
            modified; the function works on a copy.

    Returns:
        A copy of ``raw_samples_df`` with two extra columns:
        ``frame_path`` (target image path) and ``frame_id`` (the video file
        name without its extension).
    """
    # Whether extract_frame creates a missing output directory is not visible
    # here, so make sure it exists before the first frame is written.
    os.makedirs(SINGLE_FRAMES_DIR, exist_ok=True)

    frame_samples = raw_samples_df.copy()
    frame_paths = []
    frame_ids = []
    for video_path in tqdm(frame_samples['video_path'], total=len(frame_samples)):
        # splitext drops only the final extension, so a name such as
        # "clip.v2.mp4" keeps its full stem instead of being cut at the first dot
        frame_id = os.path.splitext(os.path.basename(video_path))[0]
        frame_path = os.path.join(SINGLE_FRAMES_DIR, frame_id) + ".jpg"
        # extract frame
        extract_frame(video_path, frame_path)
        frame_paths.append(frame_path)
        frame_ids.append(frame_id)

    # Assign positionally in one go: the columns exist even for an empty input
    # and do not depend on the index labels being unique.
    frame_samples['frame_path'] = frame_paths
    frame_samples['frame_id'] = frame_ids
    return frame_samples

In [5]:
# Extract one frame per video; the result is the annotations table plus frame_path / frame_id
single_frame_samples_df = create_single_frame_dataset(raw_samples_df)
single_frame_samples_df

  0%|          | 0/1000 [00:00<?, ?it/s]

100%|██████████| 1000/1000 [00:10<00:00, 96.95it/s]


Unnamed: 0.1,Unnamed: 0,key,vid_key,video_id,sentence,video_path,frame_path,frame_id
0,521,ret521,msr7579,video7579,a girl wearing red top and black trouser is pu...,./test_1k_compress/video7579.mp4,./1k_frames_single/video7579.jpg,video7579
1,737,ret737,msr7725,video7725,young people sit around the edges of a room cl...,./test_1k_compress/video7725.mp4,./1k_frames_single/video7725.jpg,video7725
2,740,ret740,msr9258,video9258,a person is using a phone,./test_1k_compress/video9258.mp4,./1k_frames_single/video9258.jpg,video9258
3,660,ret660,msr7365,video7365,cartoon people are eating at a restaurant,./test_1k_compress/video7365.mp4,./1k_frames_single/video7365.jpg,video7365
4,411,ret411,msr8068,video8068,a woman on a couch talks to a a man,./test_1k_compress/video8068.mp4,./1k_frames_single/video8068.jpg,video8068
...,...,...,...,...,...,...,...,...
995,106,ret106,msr7034,video7034,man in black shirt is holding a baby upside do...,./test_1k_compress/video7034.mp4,./1k_frames_single/video7034.jpg,video7034
996,270,ret270,msr7568,video7568,the queen of england is seen walking with an e...,./test_1k_compress/video7568.mp4,./1k_frames_single/video7568.jpg,video7568
997,860,ret860,msr7979,video7979,people talking about a fight,./test_1k_compress/video7979.mp4,./1k_frames_single/video7979.jpg,video7979
998,435,ret435,msr7356,video7356,a vehicle with details on what comes with it b...,./test_1k_compress/video7356.mp4,./1k_frames_single/video7356.jpg,video7356


We write the transformed sample data to a CSV file so it can be fed to the load pipeline.

In [6]:
# Keep only the columns needed downstream; index=False leaves the DataFrame index out of the file
single_frame_samples_df[['video_id', 'frame_path', 'frame_id', 'sentence']].to_csv(MSRVTT_SAMPLES_SINGLE_FRAME, index=False)

We create a Milvus collection to store the frame embeddings.

In [7]:
# Create the collection in Milvus to store image embeddings
# 512 is the dimension of the collection's embedding vector field
milvus_utils.create_milvus_collection(FRAME_RET_COLLECTION, 512)

<Collection>:
-------------
<name>: single_frame_ret
<description>: video retrieval
<schema>: {'auto_id': False, 'description': 'video retrieval', 'fields': [{'name': 'id', 'description': '', 'type': <DataType.INT64: 5>, 'is_primary': True, 'auto_id': False}, {'name': 'embedding', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 512}}]}

In [8]:
def read_frame_loader_csv(csv_path, encoding='utf-8-sig'):
    """Yield ``(numeric_id, frame_path)`` for every row of the frame CSV.

    Args:
        csv_path: Path to a CSV file with ``frame_id`` and ``frame_path``
            columns.
        encoding: Text encoding used to open the file. The default
            ``utf-8-sig`` also accepts files that start with a UTF-8 BOM.

    Yields:
        Tuples of ``(int, str)``: the frame id with a leading ``"video"``
        prefix removed and converted to ``int``, and the frame path.

    Raises:
        ValueError: If what remains of a ``frame_id`` is not an integer.
        KeyError: If a required column is missing.
    """
    prefix = 'video'
    # newline='' lets the csv module do its own line handling, which is
    # required for quoted fields that contain line breaks
    with open(csv_path, 'r', encoding=encoding, newline='') as f:
        for row in csv.DictReader(f):
            frame_id = row['frame_id']
            # Strip the prefix only when it is really there, so an id that is
            # already numeric is not silently truncated
            if frame_id.startswith(prefix):
                frame_id = frame_id[len(prefix):]
            yield int(frame_id), row['frame_path']

# Load pipeline: CSV rows -> image -> CLIP image embedding -> unit-norm vector -> Milvus insert
single_frame_loader_pipeline = (
    pipe.input('csv_file')
    # one pipeline row per CSV line: (numeric frame id, path to the frame image)
    .flat_map('csv_file', ('frame_id', 'frame_path'), read_frame_loader_csv)
    .map('frame_path', 'img', ops.image_decode.cv2('rgb'))
    # NOTE(review): device='mps' is hard-coded; presumably this has to change on machines without that backend
    .map('img', 'vec', ops.image_text_embedding.clip(model_name='clip_vit_base_patch16', modality='image', device='mps'))
    # scale each embedding to unit norm before it is stored
    .map('vec', 'vec', lambda x: x / np.linalg.norm(x))
    # the insert step produces no output columns
    .map(('frame_id', 'vec'), (), ops.ann_insert.milvus_client(collection_name=FRAME_RET_COLLECTION))
    # nothing is returned to the caller; the pipeline is run for the insert step
    .output()
)

2025-04-17 12:33:54,946 - 8454604864 - connectionpool.py-connectionpool:1049 - DEBUG: Starting new HTTPS connection (1): huggingface.co:443
2025-04-17 12:33:55,028 - 8454604864 - connectionpool.py-connectionpool:544 - DEBUG: https://huggingface.co:443 "HEAD /openai/clip-vit-base-patch16/resolve/main/config.json HTTP/1.1" 200 0
2025-04-17 12:33:55,065 - 8454604864 - connectionpool.py-connectionpool:544 - DEBUG: https://huggingface.co:443 "HEAD /openai/clip-vit-base-patch16/resolve/main/config.json HTTP/1.1" 200 0
2025-04-17 12:33:55,105 - 8454604864 - connectionpool.py-connectionpool:544 - DEBUG: https://huggingface.co:443 "HEAD /openai/clip-vit-base-patch16/resolve/main/model.safetensors HTTP/1.1" 404 0
2025-04-17 12:33:55,107 - 14456926208 - connectionpool.py-connectionpool:1049 - DEBUG: Starting new HTTPS connection (1): huggingface.co:443
2025-04-17 12:33:55,162 - 14456926208 - connectionpool.py-connectionpool:544 - DEBUG: https://huggingface.co:443 "GET /api/models/openai/clip-vit-

In [9]:
# Run the load pipeline over the single-frame CSV written above
single_frame_loader_pipeline(MSRVTT_SAMPLES_SINGLE_FRAME)

2025-04-17 12:33:55,938 - 15585062912 - node.py-node:167 - INFO: Begin to run Node-_input
2025-04-17 12:33:55,939 - 15601889280 - node.py-node:167 - INFO: Begin to run Node-read_frame_loader_csv-0
2025-04-17 12:33:55,939 - 15618715648 - node.py-node:167 - INFO: Begin to run Node-image-decode/cv2-1
2025-04-17 12:33:55,939 - 15635542016 - node.py-node:167 - INFO: Begin to run Node-image-text-embedding/clip-2
2025-04-17 12:33:55,939 - 15652368384 - node.py-node:167 - INFO: Begin to run Node-lambda-3
2025-04-17 12:33:55,940 - 15669194752 - node.py-node:167 - INFO: Begin to run Node-ann-insert/milvus-client-4
2025-04-17 12:33:55,940 - 15585062912 - node.py-node:167 - INFO: Begin to run Node-_output


<towhee.runtime.data_queue.DataQueue at 0x316715e70>

The 1000 single-frame embeddings are now loaded! Next, we query the collection using the MSR-VTT annotations as queries.

In [15]:
def read_frame_search_csv(csv_file, encoding='utf-8-sig'):
    """Yield ``(frame_id, sentence)`` for every row of the single-frame CSV.

    Args:
        csv_file: Path to a CSV file with ``frame_id`` and ``sentence``
            columns.
        encoding: Text encoding used to open the file. The default
            ``utf-8-sig`` also accepts files that start with a UTF-8 BOM.

    Yields:
        Tuples of ``(str, str)``: the raw ``frame_id`` value and the
        ``sentence`` used as the text query.

    Raises:
        KeyError: If a required column is missing.
    """
    # newline='' keeps line breaks inside quoted sentences exactly as stored;
    # without it the text layer rewrites "\r\n" to "\n" before csv sees it
    with open(csv_file, 'r', encoding=encoding, newline='') as f:
        for row in csv.DictReader(f):
            yield row['frame_id'], row['sentence']
            
# Search pipeline: CSV rows -> CLIP text embedding -> unit-norm vector -> Milvus search -> top-k slices
frame_search_pipeline = (
    pipe.input('csv_file')
    # one pipeline row per CSV line: the frame_id the sentence belongs to, and the sentence as the query
    .flat_map('csv_file', ('rel_frame_id', 'query'), read_frame_search_csv)
    # NOTE(review): device='mps' is hard-coded; presumably this has to change on machines without that backend
    .map('query', 'vec', ops.image_text_embedding.clip(model_name='clip_vit_base_patch16', modality='text', device='mps'))
    # scale each query embedding to unit norm, matching what the load pipeline does for frames
    .map('vec', 'vec', lambda x: x / np.linalg.norm(x))
    # request 15 results per query
    .map('vec', 'top15_raw_res', ops.ann_search.milvus_client(collection_name=FRAME_RET_COLLECTION, limit=15))
    # top1/top5/top10/top15 are prefixes of the same result list
    .map('top15_raw_res', ('top1', 'top5', 'top10', 'top15'), lambda x: (x[:1], x[:5], x[:10], x[:15]))
    .output('rel_frame_id', 'query', 'top1', 'top5', 'top10', 'top15')
)

# Run every MSR-VTT sentence as a query and wrap the pipeline output in a DataCollection
queries_ret_dc = DataCollection(frame_search_pipeline(MSRVTT_SAMPLES_SINGLE_FRAME))

2025-04-17 12:38:57,399 - 8454604864 - connectionpool.py-connectionpool:544 - DEBUG: https://huggingface.co:443 "HEAD /openai/clip-vit-base-patch16/resolve/main/config.json HTTP/1.1" 200 0
2025-04-17 12:38:57,430 - 8454604864 - connectionpool.py-connectionpool:544 - DEBUG: https://huggingface.co:443 "HEAD /openai/clip-vit-base-patch16/resolve/main/config.json HTTP/1.1" 200 0
2025-04-17 12:38:57,488 - 8454604864 - connectionpool.py-connectionpool:544 - DEBUG: https://huggingface.co:443 "HEAD /openai/clip-vit-base-patch16/resolve/main/model.safetensors HTTP/1.1" 404 0
2025-04-17 12:38:57,500 - 15829102592 - connectionpool.py-connectionpool:1049 - DEBUG: Starting new HTTPS connection (1): huggingface.co:443
2025-04-17 12:38:57,590 - 15829102592 - connectionpool.py-connectionpool:544 - DEBUG: https://huggingface.co:443 "GET /api/models/openai/clip-vit-base-patch16 HTTP/1.1" 200 3499
2025-04-17 12:38:57,602 - 8454604864 - connectionpool.py-connectionpool:544 - DEBUG: https://huggingface.co:

In [16]:
# Preview the retrieval results for the MSR-VTT queries
queries_ret_dc.show()

rel_frame_id,query,top1,top5,top10,top15
video7579,a girl wearing red top and black trouser is putting a sweater on a dog,"[[7579, 1.4025176763534546]] len=1","[[7579, 1.4025176763534546],[7113, 1.4259119033813477],[8044, 1.4578959941864014],[7361, 1.4616591930389404],...] len=5","[[7579, 1.4025176763534546],[7113, 1.4259119033813477],[8044, 1.4578959941864014],[7361, 1.4616591930389404],...] len=10","[[7579, 1.4025176763534546],[7113, 1.4259119033813477],[8044, 1.4578959941864014],[7361, 1.4616591930389404],...] len=15"
video7725,young people sit around the edges of a room clapping and raising their arms while others dance in the center during a party,"[[8441, 1.3825130462646484]] len=1","[[8441, 1.3825130462646484],[7725, 1.4156618118286133],[9908, 1.4577584266662598],[9600, 1.4639201164245605],...] len=5","[[8441, 1.3825130462646484],[7725, 1.4156618118286133],[9908, 1.4577584266662598],[9600, 1.4639201164245605],...] len=10","[[8441, 1.3825130462646484],[7725, 1.4156618118286133],[9908, 1.4577584266662598],[9600, 1.4639201164245605],...] len=15"
video9258,a person is using a phone,"[[7728, 1.4489479064941406]] len=1","[[7728, 1.4489479064941406],[7029, 1.4623463153839111],[9258, 1.4660117626190186],[9257, 1.4693928956985474],...] len=5","[[7728, 1.4489479064941406],[7029, 1.4623463153839111],[9258, 1.4660117626190186],[9257, 1.4693928956985474],...] len=10","[[7728, 1.4489479064941406],[7029, 1.4623463153839111],[9258, 1.4660117626190186],[9257, 1.4693928956985474],...] len=15"
video7365,cartoon people are eating at a restaurant,"[[8911, 1.4093154668807983]] len=1","[[8911, 1.4093154668807983],[9777, 1.4120123386383057],[7365, 1.4395549297332764],[7747, 1.4573276042938232],...] len=5","[[8911, 1.4093154668807983],[9777, 1.4120123386383057],[7365, 1.4395549297332764],[7747, 1.4573276042938232],...] len=10","[[8911, 1.4093154668807983],[9777, 1.4120123386383057],[7365, 1.4395549297332764],[7747, 1.4573276042938232],...] len=15"
video8068,a woman on a couch talks to a a man,"[[9793, 1.3899463415145874]] len=1","[[9793, 1.3899463415145874],[7724, 1.4295322895050049],[7549, 1.4335342645645142],[8074, 1.438852071762085],...] len=5","[[9793, 1.3899463415145874],[7724, 1.4295322895050049],[7549, 1.4335342645645142],[8074, 1.438852071762085],...] len=10","[[9793, 1.3899463415145874],[7724, 1.4295322895050049],[7549, 1.4335342645645142],[8074, 1.438852071762085],...] len=15"


In [21]:
# Convert the DataCollection to a DataFrame; the helper's result also has a ground_truth column
single_frame_queries_msrvtt = twohee_data_col_to_df(queries_ret_dc)
single_frame_queries_msrvtt

Unnamed: 0,rel_frame_id,query,top1,top5,top10,top15,ground_truth
0,video7579,a girl wearing red top and black trouser is pu...,"[[7579, 1.4025176763534546]]","[[7579, 1.4025176763534546], [7113, 1.42591190...","[[7579, 1.4025176763534546], [7113, 1.42591190...","[[7579, 1.4025176763534546], [7113, 1.42591190...",7579
1,video7725,young people sit around the edges of a room cl...,"[[8441, 1.3825130462646484]]","[[8441, 1.3825130462646484], [7725, 1.41566181...","[[8441, 1.3825130462646484], [7725, 1.41566181...","[[8441, 1.3825130462646484], [7725, 1.41566181...",7725
2,video9258,a person is using a phone,"[[7728, 1.4489479064941406]]","[[7728, 1.4489479064941406], [7029, 1.46234631...","[[7728, 1.4489479064941406], [7029, 1.46234631...","[[7728, 1.4489479064941406], [7029, 1.46234631...",9258
3,video7365,cartoon people are eating at a restaurant,"[[8911, 1.4093154668807983]]","[[8911, 1.4093154668807983], [9777, 1.41201233...","[[8911, 1.4093154668807983], [9777, 1.41201233...","[[8911, 1.4093154668807983], [9777, 1.41201233...",7365
4,video8068,a woman on a couch talks to a a man,"[[9793, 1.3899463415145874]]","[[9793, 1.3899463415145874], [7724, 1.42953228...","[[9793, 1.3899463415145874], [7724, 1.42953228...","[[9793, 1.3899463415145874], [7724, 1.42953228...",8068
...,...,...,...,...,...,...,...
995,video7034,man in black shirt is holding a baby upside do...,"[[9037, 1.4728624820709229]]","[[9037, 1.4728624820709229], [9028, 1.47589492...","[[9037, 1.4728624820709229], [9028, 1.47589492...","[[9037, 1.4728624820709229], [9028, 1.47589492...",7034
996,video7568,the queen of england is seen walking with an e...,"[[7568, 1.2998905181884766]]","[[7568, 1.2998905181884766], [8306, 1.51000714...","[[7568, 1.2998905181884766], [8306, 1.51000714...","[[7568, 1.2998905181884766], [8306, 1.51000714...",7568
997,video7979,people talking about a fight,"[[8490, 1.4737529754638672]]","[[8490, 1.4737529754638672], [7352, 1.47550010...","[[8490, 1.4737529754638672], [7352, 1.47550010...","[[8490, 1.4737529754638672], [7352, 1.47550010...",7979
998,video7356,a vehicle with details on what comes with it b...,"[[7597, 1.4469205141067505]]","[[7597, 1.4469205141067505], [7701, 1.46081280...","[[7597, 1.4469205141067505], [7701, 1.46081280...","[[7597, 1.4469205141067505], [7701, 1.46081280...",7356


In [23]:
# Save query results for further evaluation
# to_csv does not create missing parent directories, so make sure the target folder exists
os.makedirs('query_results', exist_ok=True)
single_frame_queries_msrvtt.to_csv('query_results/single_frame_queries_msrvtt.csv', index=False)

In [25]:
# Compute the retrieval metrics for the MSR-VTT queries
get_all_eval_scores(single_frame_queries_msrvtt)

{'recall@1': 0.255,
 'recall@5': 0.466,
 'recall@10': 0.565,
 'map': 0.3457964285714284,
 'ndcg@1': 0.255,
 'ndcg@5': 0.36551375797361213,
 'ndcg@10': 0.39791424686211235}

### Try evaluation against queries from FIRE benchmark

We are working with a sample of MSR-VTT, and our evaluation pipeline supports only one relevant video per query. We therefore filter the full FIRE benchmark to include only videos we have sampled and queries with a single relevant result.

FIRE_BENCHMARK_Q_JUDGEMENTS is created in the notebook `./clean_fire_judgements.ipynb`

In [32]:
# Run query pipeline using FIRE


# CSV parser function and pipeline recreated since the FIRE csv uses `video_id` instead of `frame_id`
def read_frame_search_fire_csv(csv_file, encoding='utf-8-sig'):
    """Yield ``(video_id, sentence)`` for every row of the FIRE judgements CSV.

    Args:
        csv_file: Path to a CSV file with ``video_id`` and ``sentence``
            columns.
        encoding: Text encoding used to open the file. The default
            ``utf-8-sig`` also accepts files that start with a UTF-8 BOM.

    Yields:
        Tuples of ``(str, str)``: the raw ``video_id`` value and the
        ``sentence`` used as the text query.

    Raises:
        KeyError: If a required column is missing.
    """
    # newline='' keeps line breaks inside quoted sentences exactly as stored;
    # without it the text layer rewrites "\r\n" to "\n" before csv sees it
    with open(csv_file, 'r', encoding=encoding, newline='') as f:
        for row in csv.DictReader(f):
            yield row['video_id'], row['sentence']
            
# Same search pipeline as for the MSR-VTT queries, fed by the FIRE CSV reader
frame_search_fire_pipeline = (
    pipe.input('csv_file')
    # one pipeline row per CSV line: the video_id the sentence belongs to, and the sentence as the query
    .flat_map('csv_file', ('rel_frame_id', 'query'), read_frame_search_fire_csv)
    # NOTE(review): device='mps' is hard-coded; presumably this has to change on machines without that backend
    .map('query', 'vec', ops.image_text_embedding.clip(model_name='clip_vit_base_patch16', modality='text', device='mps'))
    # scale each query embedding to unit norm, matching what the load pipeline does for frames
    .map('vec', 'vec', lambda x: x / np.linalg.norm(x))
    # limit=15 results are requested, so the intermediate column is named top15 (it was top10)
    .map('vec', 'top15_raw_res', ops.ann_search.milvus_client(collection_name=FRAME_RET_COLLECTION, limit=15))
    # top1/top5/top10/top15 are prefixes of the same result list
    .map('top15_raw_res', ('top1', 'top5', 'top10', 'top15'), lambda x: (x[:1], x[:5], x[:10], x[:15]))
    .output('rel_frame_id', 'query', 'top1', 'top5', 'top10', 'top15')
)

# Run every FIRE sentence as a query and wrap the pipeline output in a DataCollection
fire_query_results_single_frame_dc = DataCollection(frame_search_fire_pipeline(FIRE_BENCHMARK_Q_JUDGEMENTS))

2025-04-17 12:47:36,332 - 8454604864 - connectionpool.py-connectionpool:544 - DEBUG: https://huggingface.co:443 "HEAD /openai/clip-vit-base-patch16/resolve/main/config.json HTTP/1.1" 200 0
2025-04-17 12:47:36,363 - 8454604864 - connectionpool.py-connectionpool:544 - DEBUG: https://huggingface.co:443 "HEAD /openai/clip-vit-base-patch16/resolve/main/config.json HTTP/1.1" 200 0
2025-04-17 12:47:36,410 - 8454604864 - connectionpool.py-connectionpool:544 - DEBUG: https://huggingface.co:443 "HEAD /openai/clip-vit-base-patch16/resolve/main/model.safetensors HTTP/1.1" 404 0
2025-04-17 12:47:36,416 - 16080465920 - connectionpool.py-connectionpool:1049 - DEBUG: Starting new HTTPS connection (1): huggingface.co:443
2025-04-17 12:47:36,511 - 16080465920 - connectionpool.py-connectionpool:544 - DEBUG: https://huggingface.co:443 "GET /api/models/openai/clip-vit-base-patch16 HTTP/1.1" 200 3499
2025-04-17 12:47:36,518 - 8454604864 - connectionpool.py-connectionpool:544 - DEBUG: https://huggingface.co:

In [33]:
# Preview the retrieval results for the FIRE queries
fire_query_results_single_frame_dc.show()

rel_frame_id,query,top1,top5,top10,top15
video8469,two parrots in a bird cage one white chick and on green adult,"[[8469, 1.4449390172958374]] len=1","[[8469, 1.4449390172958374],[7849, 1.4497870206832886],[7822, 1.4854648113250732],[9014, 1.4984512329101562],...] len=5","[[8469, 1.4449390172958374],[7849, 1.4497870206832886],[7822, 1.4854648113250732],[9014, 1.4984512329101562],...] len=10","[[8469, 1.4449390172958374],[7849, 1.4497870206832886],[7822, 1.4854648113250732],[9014, 1.4984512329101562],...] len=15"
video9687,a man chopping lobster and taking off the shell,"[[7820, 1.40888512134552]] len=1","[[7820, 1.40888512134552],[9742, 1.4197094440460205],[9687, 1.4254179000854492],[8834, 1.4288145303726196],...] len=5","[[7820, 1.40888512134552],[9742, 1.4197094440460205],[9687, 1.4254179000854492],[8834, 1.4288145303726196],...] len=10","[[7820, 1.40888512134552],[9742, 1.4197094440460205],[9687, 1.4254179000854492],[8834, 1.4288145303726196],...] len=15"
video7698,two women are walking in a parking lot,"[[7558, 1.4385546445846558]] len=1","[[7558, 1.4385546445846558],[9039, 1.4457066059112549],[7698, 1.4519243240356445],[8044, 1.4523491859436035],...] len=5","[[7558, 1.4385546445846558],[9039, 1.4457066059112549],[7698, 1.4519243240356445],[8044, 1.4523491859436035],...] len=10","[[7558, 1.4385546445846558],[9039, 1.4457066059112549],[7698, 1.4519243240356445],[8044, 1.4523491859436035],...] len=15"
video9503,a woman is talking about how jeans with patches or rips is trendy,"[[9503, 1.4195761680603027]] len=1","[[9503, 1.4195761680603027],[8825, 1.4488005638122559],[9039, 1.4948625564575195],[9698, 1.4951632022857666],...] len=5","[[9503, 1.4195761680603027],[8825, 1.4488005638122559],[9039, 1.4948625564575195],[9698, 1.4951632022857666],...] len=10","[[9503, 1.4195761680603027],[8825, 1.4488005638122559],[9039, 1.4948625564575195],[9698, 1.4951632022857666],...] len=15"
video8903,a naked child runs through a field,"[[9031, 1.3999378681182861]] len=1","[[9031, 1.3999378681182861],[9805, 1.4242286682128906],[8125, 1.4620842933654785],[8903, 1.4644503593444824],...] len=5","[[9031, 1.3999378681182861],[9805, 1.4242286682128906],[8125, 1.4620842933654785],[8903, 1.4644503593444824],...] len=10","[[9031, 1.3999378681182861],[9805, 1.4242286682128906],[8125, 1.4620842933654785],[8903, 1.4644503593444824],...] len=15"


In [34]:
# Convert the DataCollection to a DataFrame; the helper's result also has a ground_truth column
single_frame_queries_fire = twohee_data_col_to_df(fire_query_results_single_frame_dc)
single_frame_queries_fire

Unnamed: 0,rel_frame_id,query,top1,top5,top10,top15,ground_truth
0,video8469,two parrots in a bird cage one white chick and...,"[[8469, 1.4449390172958374]]","[[8469, 1.4449390172958374], [7849, 1.44978702...","[[8469, 1.4449390172958374], [7849, 1.44978702...","[[8469, 1.4449390172958374], [7849, 1.44978702...",8469
1,video9687,a man chopping lobster and taking off the shell,"[[7820, 1.40888512134552]]","[[7820, 1.40888512134552], [9742, 1.4197094440...","[[7820, 1.40888512134552], [9742, 1.4197094440...","[[7820, 1.40888512134552], [9742, 1.4197094440...",9687
2,video7698,two women are walking in a parking lot,"[[7558, 1.4385546445846558]]","[[7558, 1.4385546445846558], [9039, 1.44570660...","[[7558, 1.4385546445846558], [9039, 1.44570660...","[[7558, 1.4385546445846558], [9039, 1.44570660...",7698
3,video9503,a woman is talking about how jeans with patche...,"[[9503, 1.4195761680603027]]","[[9503, 1.4195761680603027], [8825, 1.44880056...","[[9503, 1.4195761680603027], [8825, 1.44880056...","[[9503, 1.4195761680603027], [8825, 1.44880056...",9503
4,video8903,a naked child runs through a field,"[[9031, 1.3999378681182861]]","[[9031, 1.3999378681182861], [9805, 1.42422866...","[[9031, 1.3999378681182861], [9805, 1.42422866...","[[9031, 1.3999378681182861], [9805, 1.42422866...",8903
...,...,...,...,...,...,...,...
309,video9801,selena gomez clips of her videos and her danci...,"[[9801, 1.3812708854675293]]","[[9801, 1.3812708854675293], [8073, 1.41013109...","[[9801, 1.3812708854675293], [8073, 1.41013109...","[[9801, 1.3812708854675293], [8073, 1.41013109...",9801
310,video8267,video of gymasts practicing to roll,"[[8339, 1.3414828777313232]]","[[8339, 1.3414828777313232], [9771, 1.35292196...","[[8339, 1.3414828777313232], [9771, 1.35292196...","[[8339, 1.3414828777313232], [9771, 1.35292196...",8267
311,video8928,a woman walking along side a river in a bikini,"[[8928, 1.4537211656570435]]","[[8928, 1.4537211656570435], [7025, 1.45391392...","[[8928, 1.4537211656570435], [7025, 1.45391392...","[[8928, 1.4537211656570435], [7025, 1.45391392...",8928
312,video7782,a little girl talking to her and is scared,"[[7542, 1.4020490646362305]]","[[7542, 1.4020490646362305], [9404, 1.40517711...","[[7542, 1.4020490646362305], [9404, 1.40517711...","[[7542, 1.4020490646362305], [9404, 1.40517711...",7782


In [35]:
# Save the FIRE query results for further evaluation
# to_csv does not create missing parent directories, so make sure the target folder exists
os.makedirs('query_results', exist_ok=True)
single_frame_queries_fire.to_csv('query_results/single_frame_queries_fire.csv', index=False)

In [None]:
# Compute the retrieval metrics for the FIRE queries
get_all_eval_scores(single_frame_queries_fire)

{'recall@1': 0.3853503184713376,
 'recall@5': 0.5828025477707006,
 'recall@10': 0.6528662420382165,
 'map': 0.4671317359215448,
 'ndcg@1': 0.3853503184713376,
 'ndcg@5': 0.4886329774890437,
 'ndcg@10': 0.5116411440087274}

## Final Scores for Single Frame Retrieval

In [49]:
# Compute the metric dicts for both query sets
msrvtt_ans_scores = get_all_eval_scores(single_frame_queries_msrvtt)
fire_ans_scores = get_all_eval_scores(single_frame_queries_fire)
# Combine the scores into a single DataFrame for better presentation
# (one row per query set, one column per metric)
scores_df = pd.DataFrame([msrvtt_ans_scores, fire_ans_scores], index=['MSRVTT', 'FIRE'])

# Style the DataFrame for better visualization: add a caption and show three decimals
styled_scores = scores_df.style.set_caption("Model Metrics Comparison").format("{:.3f}")

styled_scores

Unnamed: 0,recall@1,recall@5,recall@10,map,ndcg@1,ndcg@5,ndcg@10
MSRVTT,0.255,0.466,0.565,0.346,0.255,0.366,0.398
FIRE,0.385,0.583,0.653,0.467,0.385,0.489,0.512
