In [1]:
from sentence_transformers import SentenceTransformer, util
import numpy as np
from pathlib import Path

# Model loaded under the name `img_model`; it is not used in the cells visible here.
# NOTE(review): presumably the image-side encoder that pairs with `text_model` — confirm.
img_model = SentenceTransformer('clip-ViT-B-32')
# Model used to encode the text queries in the search cell (`text_model.encode(...)`).
text_model = SentenceTransformer('sentence-transformers/clip-ViT-B-32-multilingual-v1')

def load_clip_features(folder_path):
    """Load every ``*.npy`` file in ``folder_path`` into integer-indexed dicts.

    Files are visited in sorted path order so that the index assigned to each
    file is the same on every run and every platform (``Path.glob`` itself
    yields entries in arbitrary filesystem order).

    Args:
        folder_path: Directory (str or path-like) containing ``.npy`` files.

    Returns:
        A tuple ``(embeddings, embeddings_map)`` where ``embeddings[i]`` is the
        array loaded from the i-th file and ``embeddings_map[i]`` is that
        file's path as a string. Both dicts are empty when no ``.npy`` file is
        found.
    """
    embeddings = {}
    embeddings_map = {}

    # sorted() pins the index -> file assignment; enumerate replaces the manual counter.
    for i, file in enumerate(sorted(Path(folder_path).glob('*.npy'))):
        embeddings[i] = np.load(str(file))
        embeddings_map[i] = str(file)
    return embeddings, embeddings_map

  from tqdm.autonotebook import tqdm, trange







In [2]:
# Forward slash instead of a backslash: '\c' is an invalid escape sequence in a
# normal string literal, and a backslash separator only resolves on Windows.
folder_path = 'data/clip-features-32'
embeddings, embeddings_map = load_clip_features(folder_path)

In [3]:
import numpy as np
import pandas as pd
import os

def load_keyframes_map(folder_path):
    """Read every ``*.csv`` file in ``folder_path`` into a DataFrame.

    Args:
        folder_path: Directory (str or path-like) containing ``.csv`` files.

    Returns:
        Dict mapping each file's name without its final extension to the
        DataFrame parsed from it (first row used as the header).
    """
    csv_paths = Path(folder_path).glob('*.csv')
    return {
        csv_path.stem: pd.read_csv(csv_path, index_col=None, header=0)
        for csv_path in csv_paths
    }

In [4]:
# Forward slash instead of a backslash: '\m' is an invalid escape sequence in a
# normal string literal, and a backslash separator only resolves on Windows.
folder_path = 'data/map-keyframes'
keyframes_map = load_keyframes_map(folder_path)

In [5]:
from sentence_transformers import SentenceTransformer, util
from PIL import Image, ImageFile
import requests
import torch
import os

# Free-text queries; for each one the best-matching keyframe across all loaded
# feature files is reported.
texts = ["A big pink fish figure with together with many other fish figures",
         "A doctor holding the patient hand, the doctor is wearing a black watch, the patient has white cloth wrap around an arm",
         "Big eye"]

for text in texts:
    # NOTE(review): the float16 cast presumably matches the dtype of the stored
    # feature arrays — confirm against the .npy files.
    text_embedding = text_model.encode(text).astype(np.float16)

    # Cosine similarity can be negative, so start below any attainable score
    # instead of at 0; otherwise an all-negative result would silently report
    # video 0 / keyframe 0 with a fabricated score of 0.
    max_score = float('-inf')
    max_score_video_idx = None
    max_score_keyframe_idx = None

    for i in range(len(embeddings)):
        # `score` is indexed as score[0][k] below: one row for the query,
        # one column per row of embeddings[i].
        score = util.cos_sim(text_embedding, embeddings[i])
        max_img_idx = torch.argmax(score)
        best_in_file = score[0][max_img_idx].item()

        if best_in_file > max_score:
            max_score = best_in_file
            max_score_video_idx = i
            max_score_keyframe_idx = max_img_idx.item()

    if max_score_video_idx is None:
        # Nothing was loaded (or no comparable score); avoid a KeyError below.
        print("No embeddings available for query:", text)
        continue

    file = os.path.basename(embeddings_map[max_score_video_idx])
    base, extension = os.path.splitext(file)
    print(max_score, " - ", base)

    # NOTE(review): the lookup uses index + 1, so column 'n' is presumably a
    # 1-based keyframe number — confirm against the CSV files.
    keyframes = keyframes_map[base]
    print(keyframes.loc[keyframes['n'] == max_score_keyframe_idx + 1])

0.3046875  -  L01_V021
     n  pts_time   fps  frame_idx
64  65    223.24  25.0       5581
0.359619140625  -  L02_V011
     n  pts_time   fps  frame_idx
77  78    305.76  25.0       7644
0.288330078125  -  L10_V007
       n  pts_time   fps  frame_idx
267  268     967.4  25.0      24185
