Video retrieval by embedding multiple frames per video using CLIP

In [1]:
import os
import csv
from towhee import ops, pipe, register
from towhee.operator import PyOperator
from towhee import DataCollection
from tqdm import tqdm
import pandas as pd
import json
import numpy as np
from helpers import milvus_utils
from helpers.extract_frames import extract_frame, extract_n_frames

Connected to Milvus server at port 19530


In [23]:
# CONSTANTS

# Files
# Sampled MSR-VTT videos; read below for its video_id, video_path and sentence columns
MSRVTT_SAMPLES = "./MSRVTT_1K.csv"
# The same samples plus the extracted frame paths; written by this notebook and read by the pipelines
MSRVTT_SAMPLES_WITH_FRAMES = "./MSRVTT_1K_frames.csv"
# file created using raw FIRE judgements, see clean_fire_judgements.ipynb
FIRE_BENCHMARK_Q_JUDGEMENTS = "./fire_benchmark_q_judgements.csv" 

# Database Collections
# Milvus collection that receives one averaged frame embedding per video
FRAME_RET_COLLECTION = "msrvtt_multi_frame_ret_1"

In [3]:
# Load the sampled videos and preview the columns used in the rest of the notebook
raw_samples_df = pd.read_csv(MSRVTT_SAMPLES)
raw_samples_df[['video_id', 'video_path', 'sentence']].head()

Unnamed: 0,video_id,video_path,sentence
0,video7579,./test_1k_compress/video7579.mp4,a girl wearing red top and black trouser is pu...
1,video7725,./test_1k_compress/video7725.mp4,young people sit around the edges of a room cl...
2,video9258,./test_1k_compress/video9258.mp4,a person is using a phone
3,video7365,./test_1k_compress/video7365.mp4,cartoon people are eating at a restaurant
4,video8068,./test_1k_compress/video8068.mp4,a woman on a couch talks to a a man


Before we embed video frames, we need to extract several equally spaced frames (four per video in this notebook) from each of the 1000 videos.

In [20]:
import cv2


def extract_n_frames_2(video_path, output_folder, n=10):
    """
    Extract n equally spaced frames from a video file and save them to a directory.

    Frames are written as ``<video_name>_frame_<i>_of_<n>.jpg`` where ``i`` is the
    1-based position of the frame among the requested frame indices.

    Args:
        video_path (str): Path to the input video file
        output_folder (str): Path to the output directory to save frames
        n (int): Number of equally spaced frames to extract (default: 10)

    Returns:
        list[str]: Paths of the frames that were actually written, in frame
        order. The list is shorter than ``n`` when the video has fewer than
        ``n`` frames or when a frame could not be read or saved, and empty
        when the video could not be opened.
    """
    # Create the output folder if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)

    # Open the video file
    video = cv2.VideoCapture(video_path)
    frame_paths = []
    try:
        if not video.isOpened():
            print(f"Failed to open video {video_path}")
            return frame_paths

        # Get the total number of frames in the video
        total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))

        if total_frames <= n:
            # If video has fewer frames than requested, extract all frames
            frame_indices = list(range(total_frames))
        else:
            # Calculate indices of equally spaced frames
            frame_indices = [int(i * total_frames / n) for i in range(n)]

        # Get the video filename for naming the frames
        video_name = os.path.splitext(os.path.basename(video_path))[0]

        for i, frame_index in enumerate(frame_indices):
            # Seek to the desired frame and read it
            video.set(cv2.CAP_PROP_POS_FRAMES, frame_index)
            ret, frame = video.read()

            # Skip (do not abort) when a single frame cannot be read
            if not ret:
                print(f"Failed to read frame at index {frame_index}")
                continue

            output_filename = f"{video_name}_frame_{i+1:03d}_of_{n:03d}.jpg"
            output_path = os.path.join(output_folder, output_filename)

            # Only report paths that were really written to disk
            if not cv2.imwrite(output_path, frame):
                print(f"Failed to write frame to {output_path}")
                continue
            frame_paths.append(output_path)
    finally:
        # Release the capture even if reading or writing raised
        video.release()

    return frame_paths

In [21]:
# Extract a fixed number of equally spaced frames per video and record the
# resulting paths on the corresponding row of raw_samples_df.
images_dir = "./test_1k_images_3"
frames_per_video = 4

for row_idx, sample in tqdm(raw_samples_df.iterrows(), total=len(raw_samples_df)):
    video_path = sample['video_path']
    all_frame_paths = extract_n_frames_2(video_path, images_dir, frames_per_video)

    # The loader pipeline needs every frame_path_* column filled, so fail with a
    # clear message instead of an opaque unpacking error.
    if len(all_frame_paths) != frames_per_video:
        raise ValueError(
            f"Expected {frames_per_video} frames for {video_path}, got {len(all_frame_paths)}")

    for frame_no, frame_path in enumerate(all_frame_paths, start=1):
        raw_samples_df.at[row_idx, f'frame_path_{frame_no}'] = frame_path

    # frame_id is the video file name without its extension
    raw_samples_df.at[row_idx, 'frame_id'] = os.path.basename(video_path).split('.')[0]

# Now this should contain new columns with the frame paths and frame_id
raw_samples_df

100%|██████████| 1000/1000 [00:22<00:00, 45.05it/s]


Unnamed: 0.1,Unnamed: 0,key,vid_key,video_id,sentence,video_path,frame_path_1,frame_path_2,frame_path_3,frame_path_4,frame_id
0,521,ret521,msr7579,video7579,a girl wearing red top and black trouser is pu...,./test_1k_compress/video7579.mp4,./test_1k_images_3/video7579_frame_001_of_004.jpg,./test_1k_images_3/video7579_frame_002_of_004.jpg,./test_1k_images_3/video7579_frame_003_of_004.jpg,./test_1k_images_3/video7579_frame_004_of_004.jpg,video7579
1,737,ret737,msr7725,video7725,young people sit around the edges of a room cl...,./test_1k_compress/video7725.mp4,./test_1k_images_3/video7725_frame_001_of_004.jpg,./test_1k_images_3/video7725_frame_002_of_004.jpg,./test_1k_images_3/video7725_frame_003_of_004.jpg,./test_1k_images_3/video7725_frame_004_of_004.jpg,video7725
2,740,ret740,msr9258,video9258,a person is using a phone,./test_1k_compress/video9258.mp4,./test_1k_images_3/video9258_frame_001_of_004.jpg,./test_1k_images_3/video9258_frame_002_of_004.jpg,./test_1k_images_3/video9258_frame_003_of_004.jpg,./test_1k_images_3/video9258_frame_004_of_004.jpg,video9258
3,660,ret660,msr7365,video7365,cartoon people are eating at a restaurant,./test_1k_compress/video7365.mp4,./test_1k_images_3/video7365_frame_001_of_004.jpg,./test_1k_images_3/video7365_frame_002_of_004.jpg,./test_1k_images_3/video7365_frame_003_of_004.jpg,./test_1k_images_3/video7365_frame_004_of_004.jpg,video7365
4,411,ret411,msr8068,video8068,a woman on a couch talks to a a man,./test_1k_compress/video8068.mp4,./test_1k_images_3/video8068_frame_001_of_004.jpg,./test_1k_images_3/video8068_frame_002_of_004.jpg,./test_1k_images_3/video8068_frame_003_of_004.jpg,./test_1k_images_3/video8068_frame_004_of_004.jpg,video8068
...,...,...,...,...,...,...,...,...,...,...,...
995,106,ret106,msr7034,video7034,man in black shirt is holding a baby upside do...,./test_1k_compress/video7034.mp4,./test_1k_images_3/video7034_frame_001_of_004.jpg,./test_1k_images_3/video7034_frame_002_of_004.jpg,./test_1k_images_3/video7034_frame_003_of_004.jpg,./test_1k_images_3/video7034_frame_004_of_004.jpg,video7034
996,270,ret270,msr7568,video7568,the queen of england is seen walking with an e...,./test_1k_compress/video7568.mp4,./test_1k_images_3/video7568_frame_001_of_004.jpg,./test_1k_images_3/video7568_frame_002_of_004.jpg,./test_1k_images_3/video7568_frame_003_of_004.jpg,./test_1k_images_3/video7568_frame_004_of_004.jpg,video7568
997,860,ret860,msr7979,video7979,people talking about a fight,./test_1k_compress/video7979.mp4,./test_1k_images_3/video7979_frame_001_of_004.jpg,./test_1k_images_3/video7979_frame_002_of_004.jpg,./test_1k_images_3/video7979_frame_003_of_004.jpg,./test_1k_images_3/video7979_frame_004_of_004.jpg,video7979
998,435,ret435,msr7356,video7356,a vehicle with details on what comes with it b...,./test_1k_compress/video7356.mp4,./test_1k_images_3/video7356_frame_001_of_004.jpg,./test_1k_images_3/video7356_frame_002_of_004.jpg,./test_1k_images_3/video7356_frame_003_of_004.jpg,./test_1k_images_3/video7356_frame_004_of_004.jpg,video7356


In [22]:
# We write the transformed samples data to a CSV file so it can be loaded into the load pipeline.
# index=False keeps the pandas index out of the file.
raw_samples_df[['video_id', 'frame_path_1', 'frame_path_2', 'frame_path_3', 'frame_path_4', 'frame_id', 'sentence']].to_csv(MSRVTT_SAMPLES_WITH_FRAMES, index=False)

In [24]:
# Create the collection in Milvus to store image embeddings.
# 512 is the vector dimension (it shows up as 'dim' in the printed schema); presumably
# the output size of the CLIP model used by the pipelines — TODO confirm
milvus_utils.create_milvus_collection(FRAME_RET_COLLECTION, 512)

<Collection>:
-------------
<name>: msrvtt_multi_frame_ret_1
<description>: video retrieval
<schema>: {'auto_id': False, 'description': 'video retrieval', 'fields': [{'name': 'id', 'description': '', 'type': <DataType.INT64: 5>, 'is_primary': True, 'auto_id': False}, {'name': 'embedding', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 512}}]}

In [27]:
def read_frame_loader_csv(csv_path, encoding='utf-8-sig'):
    """
    Yield one record per row of the frames CSV for the ingestion pipeline.

    Args:
        csv_path (str): CSV with the columns frame_id and frame_path_1..frame_path_4.
        encoding (str): File encoding; the default 'utf-8-sig' also strips a leading BOM.

    Yields:
        tuple: (numeric id, frame_path_1, frame_path_2, frame_path_3, frame_path_4),
        where the numeric id is frame_id with its first len('video') characters
        removed and converted to int.

    Raises:
        ValueError: If the remainder of frame_id is not an integer.
    """
    # newline='' is what the csv module expects; it keeps line breaks inside
    # quoted fields intact instead of letting text mode rewrite them.
    with open(csv_path, 'r', encoding=encoding, newline='') as f:
        for line in csv.DictReader(f):
            raw_id = line['frame_id']
            # frame ids look like "video7579"; the collection uses the integer part as key
            cleaned_id = raw_id[len('video'):]
            yield (
                int(cleaned_id),
                line['frame_path_1'],
                line['frame_path_2'],
                line['frame_path_3'],
                line['frame_path_4'],
            )

# Ingestion pipeline: read each CSV row, decode its four frames, embed every frame
# with CLIP, average the four embeddings, scale the mean to unit length and insert
# it into the Milvus collection under the numeric video id.
_loader_stage = (
    pipe.input('csv_file')
    .flat_map('csv_file', ('frame_id', 'f_path_1', 'f_path_2','f_path_3', 'f_path_4'), read_frame_loader_csv)
)

# All decode nodes are added before the embedding nodes, one operator instance per frame slot.
for _path_col, _img_col in (('f_path_1', 'img1'), ('f_path_2', 'img2'), ('f_path_3', 'img3'), ('f_path_4', 'img4')):
    _loader_stage = _loader_stage.map(_path_col, _img_col, ops.image_decode.cv2('rgb'))

for _img_col, _vec_col in (('img1', 'vec1'), ('img2', 'vec2'), ('img3', 'vec3'), ('img4', 'vec4')):
    _loader_stage = _loader_stage.map(
        _img_col,
        _vec_col,
        ops.image_text_embedding.clip(model_name='clip_vit_base_patch16', modality='image', device='mps'),
    )

frame_loader_pipeline = (
    _loader_stage
    .map(('vec1', 'vec2', 'vec3', 'vec4'), 'vec', lambda v1, v2, v3, v4: np.mean([v1, v2, v3, v4], axis=0))
    .map('vec', 'vec', lambda x: x / np.linalg.norm(x))
    .map(('frame_id', 'vec'), (), ops.ann_insert.milvus_client(collection_name=FRAME_RET_COLLECTION))
    .output()
)

2025-04-16 23:40:15,873 - 8454604864 - connectionpool.py-connectionpool:544 - DEBUG: https://huggingface.co:443 "HEAD /openai/clip-vit-base-patch16/resolve/main/config.json HTTP/1.1" 200 0
2025-04-16 23:40:15,944 - 8454604864 - connectionpool.py-connectionpool:544 - DEBUG: https://huggingface.co:443 "HEAD /openai/clip-vit-base-patch16/resolve/main/config.json HTTP/1.1" 200 0
2025-04-16 23:40:16,005 - 8454604864 - connectionpool.py-connectionpool:544 - DEBUG: https://huggingface.co:443 "HEAD /openai/clip-vit-base-patch16/resolve/main/model.safetensors HTTP/1.1" 404 0
2025-04-16 23:40:16,008 - 18983612416 - connectionpool.py-connectionpool:1049 - DEBUG: Starting new HTTPS connection (1): huggingface.co:443
2025-04-16 23:40:16,099 - 18983612416 - connectionpool.py-connectionpool:544 - DEBUG: https://huggingface.co:443 "GET /api/models/openai/clip-vit-base-patch16 HTTP/1.1" 200 3499
2025-04-16 23:40:16,277 - 18983612416 - connectionpool.py-connectionpool:544 - DEBUG: https://huggingface.co

In [28]:
# Run the ingestion pipeline over the frames CSV written above; its last stage inserts
# one vector per row into FRAME_RET_COLLECTION
frame_loader_pipeline(MSRVTT_SAMPLES_WITH_FRAMES)

2025-04-16 23:40:21,793 - 20839493632 - node.py-node:167 - INFO: Begin to run Node-_input
2025-04-16 23:40:21,793 - 21464379392 - node.py-node:167 - INFO: Begin to run Node-read_frame_loader_csv-0
2025-04-16 23:40:21,793 - 21481205760 - node.py-node:167 - INFO: Begin to run Node-image-decode/cv2-1
2025-04-16 23:40:21,794 - 21498032128 - node.py-node:167 - INFO: Begin to run Node-image-decode/cv2-2
2025-04-16 23:40:21,794 - 21514858496 - node.py-node:167 - INFO: Begin to run Node-image-decode/cv2-3
2025-04-16 23:40:21,794 - 21531684864 - node.py-node:167 - INFO: Begin to run Node-image-decode/cv2-4
2025-04-16 23:40:21,795 - 20839493632 - node.py-node:167 - INFO: Begin to run Node-image-text-embedding/clip-5
2025-04-16 23:40:21,795 - 21565337600 - node.py-node:167 - INFO: Begin to run Node-image-text-embedding/clip-6
2025-04-16 23:40:21,796 - 21582163968 - node.py-node:167 - INFO: Begin to run Node-image-text-embedding/clip-7
2025-04-16 23:40:21,796 - 21548511232 - node.py-node:167 - INF

<towhee.runtime.data_queue.DataQueue at 0x39ed33ac0>

In [29]:
def read_frame_search_csv(csv_file, id_column='frame_id'):
    """
    Yield (relevant id, query sentence) pairs from a query CSV.

    Args:
        csv_file (str): UTF-8 CSV (a leading BOM is tolerated) containing the
            id column and a 'sentence' column.
        id_column (str): Name of the column holding the id of the relevant
            video. Defaults to 'frame_id'; pass 'video_id' for files that use
            that header instead.

    Yields:
        tuple[str, str]: The raw id string and the sentence of each row.
    """
    # newline='' is what the csv module expects; it keeps line breaks inside
    # quoted sentences intact instead of letting text mode rewrite them.
    with open(csv_file, 'r', encoding='utf-8-sig', newline='') as f:
        for line in csv.DictReader(f):
            yield line[id_column], line['sentence']
            
# Text-to-video search pipeline: embed each query sentence with the CLIP text
# encoder, scale it to unit length, fetch the 10 nearest stored vectors and expose
# the first 1, 5 and 10 hits as separate columns.
_query_encoder = ops.image_text_embedding.clip(model_name='clip_vit_base_patch16', modality='text', device='mps')
_collection_search = ops.ann_search.milvus_client(collection_name=FRAME_RET_COLLECTION, limit=10)

frame_search_pipeline = (
    pipe.input('csv_file')
    .flat_map('csv_file', ('rel_frame_id', 'query'), read_frame_search_csv)
    .map('query', 'vec', _query_encoder)
    .map('vec', 'vec', lambda x: x / np.linalg.norm(x))
    .map('vec', 'top10_raw_res', _collection_search)
    .map('top10_raw_res', ('top1', 'top5', 'top10'), lambda x: (x[:1], x[:5], x[:10]))
    .output('rel_frame_id', 'query', 'top1', 'top5', 'top10')
)



2025-04-16 23:43:48,760 - 8454604864 - connectionpool.py-connectionpool:544 - DEBUG: https://huggingface.co:443 "HEAD /openai/clip-vit-base-patch16/resolve/main/config.json HTTP/1.1" 200 0
2025-04-16 23:43:48,845 - 8454604864 - connectionpool.py-connectionpool:544 - DEBUG: https://huggingface.co:443 "HEAD /openai/clip-vit-base-patch16/resolve/main/config.json HTTP/1.1" 200 0
2025-04-16 23:43:48,878 - 8454604864 - connectionpool.py-connectionpool:544 - DEBUG: https://huggingface.co:443 "HEAD /openai/clip-vit-base-patch16/resolve/main/model.safetensors HTTP/1.1" 404 0
2025-04-16 23:43:48,881 - 18807287808 - connectionpool.py-connectionpool:1049 - DEBUG: Starting new HTTPS connection (1): huggingface.co:443
2025-04-16 23:43:48,971 - 8454604864 - connectionpool.py-connectionpool:544 - DEBUG: https://huggingface.co:443 "HEAD /openai/clip-vit-base-patch16/resolve/main/tokenizer_config.json HTTP/1.1" 200 0
2025-04-16 23:43:48,994 - 18807287808 - connectionpool.py-connectionpool:544 - DEBUG: h

2025-04-16 23:43:49,679 - 18807287808 - connectionpool.py-connectionpool:544 - DEBUG: https://huggingface.co:443 "HEAD /openai/clip-vit-base-patch16/resolve/refs%2Fpr%2F10/model.safetensors HTTP/1.1" 302 0


In [30]:
# Use every sample's own caption as a query and collect the search results
ret_dc = DataCollection(frame_search_pipeline(MSRVTT_SAMPLES_WITH_FRAMES))

2025-04-16 23:43:52,744 - 18807287808 - node.py-node:167 - INFO: Begin to run Node-_input
2025-04-16 23:43:52,768 - 18842988544 - node.py-node:167 - INFO: Begin to run Node-read_frame_search_csv-0
2025-04-16 23:43:52,779 - 18859814912 - node.py-node:167 - INFO: Begin to run Node-image-text-embedding/clip-1
2025-04-16 23:43:52,781 - 18876641280 - node.py-node:167 - INFO: Begin to run Node-lambda-2
2025-04-16 23:43:52,783 - 18893467648 - node.py-node:167 - INFO: Begin to run Node-ann-search/milvus-client-3
2025-04-16 23:43:52,838 - 18910294016 - node.py-node:167 - INFO: Begin to run Node-lambda-4
2025-04-16 23:43:52,838 - 18807287808 - node.py-node:167 - INFO: Begin to run Node-_output


In [31]:
# Preview the results; each hit is an [id, score] pair (presumably a distance, since the
# values grow with rank — TODO confirm against the collection's metric)
ret_dc.show()

rel_frame_id,query,top1,top5,top10
video7579,a girl wearing red top and black trouser is putting a sweater on a dog,"[[7579, 1.3646539449691772]] len=1","[[7579, 1.3646539449691772],[9451, 1.4305797815322876],[9603, 1.4387223720550537],[9405, 1.4451756477355957],...] len=5","[[7579, 1.3646539449691772],[9451, 1.4305797815322876],[9603, 1.4387223720550537],[9405, 1.4451756477355957],...] len=10"
video7725,young people sit around the edges of a room clapping and raising their arms while others dance in the center during a party,"[[8441, 1.4233133792877197]] len=1","[[8441, 1.4233133792877197],[7444, 1.4263453483581543],[7725, 1.4346461296081543],[8339, 1.4430490732192993],...] len=5","[[8441, 1.4233133792877197],[7444, 1.4263453483581543],[7725, 1.4346461296081543],[8339, 1.4430490732192993],...] len=10"
video9258,a person is using a phone,"[[9257, 1.4228744506835938]] len=1","[[9257, 1.4228744506835938],[9697, 1.4255503416061401],[9829, 1.4330558776855469],[9258, 1.4334453344345093],...] len=5","[[9257, 1.4228744506835938],[9697, 1.4255503416061401],[9829, 1.4330558776855469],[9258, 1.4334453344345093],...] len=10"
video7365,cartoon people are eating at a restaurant,"[[9777, 1.3961538076400757]] len=1","[[9777, 1.3961538076400757],[7365, 1.4227542877197266],[9537, 1.4308252334594727],[7741, 1.4411455392837524],...] len=5","[[9777, 1.3961538076400757],[7365, 1.4227542877197266],[9537, 1.4308252334594727],[7741, 1.4411455392837524],...] len=10"
video8068,a woman on a couch talks to a a man,"[[7724, 1.3794373273849487]] len=1","[[7724, 1.3794373273849487],[7234, 1.4042013883590698],[7341, 1.4079279899597168],[9347, 1.408487319946289],...] len=5","[[7724, 1.3794373273849487],[7234, 1.4042013883590698],[7341, 1.4079279899597168],[9347, 1.408487319946289],...] len=10"


In [32]:
# TODO remove this, import from helpers and rerun the whole notebook

def twohee_data_col_to_df(twohee_data_collection):
    """
    Convert a search-result DataCollection into a DataFrame with a ground-truth column.

    Args:
        twohee_data_collection: Object exposing ``to_list()``; every returned
            item is turned into a row via ``vars(item)``.

    Returns:
        pd.DataFrame: One row per item plus an integer 'ground_truth' column,
        obtained by dropping the first len('video') characters of the relevant
        id and converting the rest to int. 'rel_frame_id' is used when present,
        otherwise 'rel_video_id'.

    Raises:
        ValueError: If neither 'rel_frame_id' nor 'rel_video_id' is a column.
    """
    res_df = pd.DataFrame([vars(r) for r in twohee_data_collection.to_list()])

    # Pick the id column; rel_frame_id wins when both are present. The previous
    # if/if/else chain raised whenever only rel_video_id existed.
    if 'rel_frame_id' in res_df.columns:
        id_column = 'rel_frame_id'
    elif 'rel_video_id' in res_df.columns:
        id_column = 'rel_video_id'
    else:
        raise ValueError("No rel_video_id or rel_frame_id found in the DataCollection")

    # Add ground truth column, e.g. "video7579" -> 7579
    res_df['ground_truth'] = res_df[id_column].apply(
        lambda x: int(x[len('video'):]))
    return res_df.copy()


def average_precision(ground_truth, predictions):
    """
    Average Precision (AP) of one ranked result list against a single relevant id.

    Args:
        ground_truth (int): Id of the relevant video.
        predictions (list): Predicted video ids, best match first.

    Returns:
        float: Mean of precision@rank over the ranks where the relevant id
        appears, or 0 when it does not appear at all.
    """
    # 1-based ranks at which the relevant id was returned
    hit_ranks = [
        rank
        for rank, candidate in enumerate(predictions, start=1)
        if candidate == ground_truth
    ]
    if not hit_ranks:
        return 0

    precision_total = 0
    for hit_number, rank in enumerate(hit_ranks, start=1):
        # precision at this rank = hits seen so far / items seen so far
        precision_total += hit_number / rank
    return precision_total / len(hit_ranks)


def calculate_mean_average_precision(df):
    """
    Mean Average Precision (MAP) over all queries in the dataframe.

    Args:
        df (pd.DataFrame): Needs the columns 'ground_truth' and 'top10', where
            'top10' holds result entries whose first element is the predicted id.

    Returns:
        float: Mean of the per-query AP scores, or 0 for an empty dataframe.
    """
    # One AP score per query, computed on the ids of the top-10 hits
    ap_scores = [
        average_precision(row['ground_truth'], [hit[0] for hit in row['top10']])
        for _, row in df.iterrows()
    ]
    if not ap_scores:
        return 0
    return sum(ap_scores) / len(ap_scores)


def calculate_recall(df):
    """
    Calculate recall@1, recall@5, and recall@10 for the given dataframe.

    Args:
        df (pd.DataFrame): DataFrame containing the columns 'ground_truth',
            'top1', 'top5' and 'top10'; the top-k columns hold result entries
            whose first element is the predicted id.

    Returns:
        dict: Keys 'recall@1', 'recall@5' and 'recall@10', each the fraction of
        rows whose ground truth appears in the matching top-k column. All three
        are 0 for an empty dataframe, matching the MAP and NDCG helpers.
    """
    column_for_metric = {
        'recall@1': 'top1',
        'recall@5': 'top5',
        'recall@10': 'top10',
    }
    total_queries = len(df)

    # Avoid dividing by zero when there are no queries at all
    if total_queries == 0:
        return {metric: 0 for metric in column_for_metric}

    hit_counts = {metric: 0 for metric in column_for_metric}
    for _, row in df.iterrows():
        ground_truth = row['ground_truth']
        for metric, column in column_for_metric.items():
            if any(pred[0] == ground_truth for pred in row[column]):
                hit_counts[metric] += 1

    return {metric: hits / total_queries for metric, hits in hit_counts.items()}


def ndcg_score(ground_truth, predictions, k=10):
    """
    Normalized Discounted Cumulative Gain (NDCG) of one query with binary relevance.

    Args:
        ground_truth (int): Id of the relevant video.
        predictions (list): Ranked results, each entry starting with the
            predicted id, e.g. [(id, score), ...].
        k (int): Only the first k predictions are scored.

    Returns:
        float: DCG of the ranking divided by the DCG of the same relevance
        values sorted best-first, or 0 when none of the first k predictions
        is relevant.
    """
    def _discounted_gain(gains):
        # rank is 1-based, so the discount is log2(rank + 1)
        return sum(gain / np.log2(rank + 1) for rank, gain in enumerate(gains, start=1))

    # Binary gain per position: 1 for the relevant id, 0 otherwise
    gains = []
    for candidate in predictions[:k]:
        gains.append(1 if candidate[0] == ground_truth else 0)

    achieved = _discounted_gain(gains)
    best_possible = _discounted_gain(sorted(gains, reverse=True))

    if best_possible > 0:
        return achieved / best_possible
    return 0

# call this function to get the NDCG score for each query


def calculate_ndcg(df, k=10):
    """
    Mean NDCG@k over all queries in the dataframe.

    Args:
        df (pd.DataFrame): Needs the columns 'ground_truth' and 'top10'.
        k (int): Cut-off passed on to ndcg_score for every query.

    Returns:
        float: Average of the per-query NDCG scores, or 0 for an empty dataframe.
    """
    # The cut-off is applied inside ndcg_score, so the full top-10 list is handed over
    per_query = [
        ndcg_score(row['ground_truth'], row['top10'], k)
        for _, row in df.iterrows()
    ]
    if not per_query:
        return 0
    return sum(per_query) / len(per_query)


def get_all_eval_scores(df):
    """
    Collect every evaluation metric for a results dataframe.

    Args:
        df (pd.DataFrame): Dataframe accepted by calculate_recall,
            calculate_mean_average_precision and calculate_ndcg.

    Returns:
        dict: Scores keyed, in this order, by 'recall@1', 'recall@5',
        'recall@10', 'map', 'ndcg@1', 'ndcg@5' and 'ndcg@10'.
    """
    recall_scores = calculate_recall(df)
    eval_scores = {
        name: recall_scores[name]
        for name in ('recall@1', 'recall@5', 'recall@10')
    }

    eval_scores['map'] = calculate_mean_average_precision(df)

    # NDCG at the same cut-offs as recall
    for name, cutoff in zip(('ndcg@1', 'ndcg@5', 'ndcg@10'), (1, 5, 10)):
        eval_scores[name] = calculate_ndcg(df, k=cutoff)

    return eval_scores


In [33]:
# Convert the search results to a DataFrame (adds the integer ground_truth column) and display it
twohee_data_col_to_df(ret_dc)

Unnamed: 0,rel_frame_id,query,top1,top5,top10,ground_truth
0,video7579,a girl wearing red top and black trouser is pu...,"[[7579, 1.3646539449691772]]","[[7579, 1.3646539449691772], [9451, 1.43057978...","[[7579, 1.3646539449691772], [9451, 1.43057978...",7579
1,video7725,young people sit around the edges of a room cl...,"[[8441, 1.4233133792877197]]","[[8441, 1.4233133792877197], [7444, 1.42634534...","[[8441, 1.4233133792877197], [7444, 1.42634534...",7725
2,video9258,a person is using a phone,"[[9257, 1.4228744506835938]]","[[9257, 1.4228744506835938], [9697, 1.42555034...","[[9257, 1.4228744506835938], [9697, 1.42555034...",9258
3,video7365,cartoon people are eating at a restaurant,"[[9777, 1.3961538076400757]]","[[9777, 1.3961538076400757], [7365, 1.42275428...","[[9777, 1.3961538076400757], [7365, 1.42275428...",7365
4,video8068,a woman on a couch talks to a a man,"[[7724, 1.3794373273849487]]","[[7724, 1.3794373273849487], [7234, 1.40420138...","[[7724, 1.3794373273849487], [7234, 1.40420138...",8068
...,...,...,...,...,...,...
995,video7034,man in black shirt is holding a baby upside do...,"[[9885, 1.4678269624710083]]","[[9885, 1.4678269624710083], [9320, 1.47455167...","[[9885, 1.4678269624710083], [9320, 1.47455167...",7034
996,video7568,the queen of england is seen walking with an e...,"[[7568, 1.2751219272613525]]","[[7568, 1.2751219272613525], [7116, 1.45783209...","[[7568, 1.2751219272613525], [7116, 1.45783209...",7568
997,video7979,people talking about a fight,"[[7211, 1.4365811347961426]]","[[7211, 1.4365811347961426], [7501, 1.43993902...","[[7211, 1.4365811347961426], [7501, 1.43993902...",7979
998,video7356,a vehicle with details on what comes with it b...,"[[7356, 1.3352746963500977]]","[[7356, 1.3352746963500977], [8819, 1.39669334...","[[7356, 1.3352746963500977], [8819, 1.39669334...",7356


In [34]:
# Recall@k, MAP and NDCG@k when each video's own caption is used as the query
get_all_eval_scores(twohee_data_col_to_df(ret_dc))

{'recall@1': 0.311,
 'recall@5': 0.537,
 'recall@10': 0.651,
 'map': 0.4118039682539681,
 'ndcg@1': 0.311,
 'ndcg@5': 0.4313712173413061,
 'ndcg@10': 0.46854068360509477}

# Try evaluation against queries from FIRE benchmark

We are working with a sample of MSR-VTT, and our evaluation pipeline supports only one relevant video per query. Hence we need to filter the full FIRE benchmark so that it only includes queries whose relevant video is in our sample and that have a single relevant result.

FIRE_BENCHMARK_Q_JUDGEMENTS is created in the notebook `./clean_fire_judgements.ipynb`

In [None]:
# Run query pipeline using FIRE


# CSV parser function and pipeline recreated since the FIRE csv uses `video_id` instead of `frame_id`
def read_frame_search_fire_csv(csv_file):
    """
    Yield (relevant video id, query sentence) pairs from the FIRE judgements CSV.

    Args:
        csv_file (str): UTF-8 CSV (a leading BOM is tolerated) with the
            columns 'video_id' and 'sentence'.

    Yields:
        tuple[str, str]: The raw video_id string and the sentence of each row.
    """
    # newline='' is what the csv module expects; it keeps line breaks inside
    # quoted sentences intact instead of letting text mode rewrite them.
    with open(csv_file, 'r', encoding='utf-8-sig', newline='') as f:
        for line in csv.DictReader(f):
            yield line['video_id'], line['sentence']
            
# Same search stages as the caption search pipeline, fed by the FIRE CSV reader:
# embed the query text, scale it to unit length, fetch the 10 nearest stored
# vectors and expose the first 1, 5 and 10 hits.
_fire_query_encoder = ops.image_text_embedding.clip(model_name='clip_vit_base_patch16', modality='text', device='mps')
_fire_collection_search = ops.ann_search.milvus_client(collection_name=FRAME_RET_COLLECTION, limit=10)

frame_search_fire_pipeline = (
    pipe.input('csv_file')
    .flat_map('csv_file', ('rel_frame_id', 'query'), read_frame_search_fire_csv)
    .map('query', 'vec', _fire_query_encoder)
    .map('vec', 'vec', lambda x: x / np.linalg.norm(x))
    .map('vec', 'top10_raw_res', _fire_collection_search)
    .map('top10_raw_res', ('top1', 'top5', 'top10'), lambda x: (x[:1], x[:5], x[:10]))
    .output('rel_frame_id', 'query', 'top1', 'top5', 'top10')
)
            
# Run every FIRE query against the collection and preview the hits.
# NOTE(review): the saved output of this cell lists only 3 hits per query (len=3 for top5
# and top10) although the search asks for limit=10 — presumably it stems from an earlier
# run against a different collection state; rerun the notebook top to bottom to confirm.
fire_query_results = DataCollection(frame_search_fire_pipeline(FIRE_BENCHMARK_Q_JUDGEMENTS))
fire_query_results.show()

2025-04-16 17:45:31,823 - 8454604864 - connectionpool.py-connectionpool:544 - DEBUG: https://huggingface.co:443 "HEAD /openai/clip-vit-base-patch16/resolve/main/config.json HTTP/1.1" 200 0
2025-04-16 17:45:31,928 - 8454604864 - connectionpool.py-connectionpool:544 - DEBUG: https://huggingface.co:443 "HEAD /openai/clip-vit-base-patch16/resolve/main/config.json HTTP/1.1" 200 0
2025-04-16 17:45:31,979 - 8454604864 - connectionpool.py-connectionpool:544 - DEBUG: https://huggingface.co:443 "HEAD /openai/clip-vit-base-patch16/resolve/main/model.safetensors HTTP/1.1" 404 0
2025-04-16 17:45:32,042 - 17460719616 - connectionpool.py-connectionpool:544 - DEBUG: https://huggingface.co:443 "GET /api/models/openai/clip-vit-base-patch16 HTTP/1.1" 200 3499
2025-04-16 17:45:32,094 - 8454604864 - connectionpool.py-connectionpool:544 - DEBUG: https://huggingface.co:443 "HEAD /openai/clip-vit-base-patch16/resolve/main/tokenizer_config.json HTTP/1.1" 200 0
2025-04-16 17:45:32,188 - 8454604864 - connectionp

rel_frame_id,query,top1,top5,top10
video8469,two parrots in a bird cage one white chick and on green adult,"[[8469, 1.4449390172958374]] len=1","[[8469, 1.4449390172958374],[7849, 1.4497870206832886],[7822, 1.4854648113250732]] len=3","[[8469, 1.4449390172958374],[7849, 1.4497870206832886],[7822, 1.4854648113250732]] len=3"
video9687,a man chopping lobster and taking off the shell,"[[7820, 1.40888512134552]] len=1","[[7820, 1.40888512134552],[9742, 1.4197094440460205],[9687, 1.4254179000854492]] len=3","[[7820, 1.40888512134552],[9742, 1.4197094440460205],[9687, 1.4254179000854492]] len=3"
video7698,two women are walking in a parking lot,"[[7558, 1.4385546445846558]] len=1","[[7558, 1.4385546445846558],[9039, 1.4457066059112549],[7698, 1.4519243240356445]] len=3","[[7558, 1.4385546445846558],[9039, 1.4457066059112549],[7698, 1.4519243240356445]] len=3"
video9503,a woman is talking about how jeans with patches or rips is trendy,"[[9503, 1.4195761680603027]] len=1","[[9503, 1.4195761680603027],[8825, 1.4488005638122559],[9039, 1.4948625564575195]] len=3","[[9503, 1.4195761680603027],[8825, 1.4488005638122559],[9039, 1.4948625564575195]] len=3"
video8903,a naked child runs through a field,"[[9031, 1.3999378681182861]] len=1","[[9031, 1.3999378681182861],[9805, 1.4242286682128906],[8125, 1.4620842933654785]] len=3","[[9031, 1.3999378681182861],[9805, 1.4242286682128906],[8125, 1.4620842933654785]] len=3"


In [55]:
# Recall@k, MAP and NDCG@k for the FIRE benchmark queries
get_all_eval_scores(twohee_data_col_to_df(fire_query_results))

{'recall@1': 0.3853503184713376,
 'recall@5': 0.5222929936305732,
 'recall@10': 0.5222929936305732,
 'map': 0.4437367303609342,
 'ndcg@1': 0.3853503184713376,
 'ndcg@5': 0.46382902575068446,
 'ndcg@10': 0.46382902575068446}