In [31]:
from pathlib import Path
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import os

# Folder scanned for query *.txt files by load_query().
# NOTE: later cells reassign `folder_path` to other directories, so re-run this
# line before calling load_query(folder_path) again.
folder_path = "data/pack-pretest"

def load_query(folder_path):
    """Load every ``*.txt`` query in ``folder_path`` and translate it.

    Each file's text is run through the ``VietAI/envit5-translation`` seq2seq
    model and the query type is taken from the file name: the part after the
    last ``-`` of the name without its extension (``foo-kis.txt`` -> ``kis``).

    Args:
        folder_path: directory to scan (non-recursively) for ``*.txt`` files.

    Returns:
        list: one ``[translated_text, query_type]`` pair per file, ordered by
        file path so repeated runs give the same ordering.
    """
    import torch  # local import: only needed here to pick the device

    # Set up translation model; fall back to CPU when no GPU is available.
    model_name = "VietAI/envit5-translation"
    device = "cuda" if torch.cuda.is_available() else "cpu"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

    # Tag the model puts in front of its output; only a *leading* tag is
    # stripped so that text such as "Ben: ..." is left intact.
    # NOTE(review): the model card appears to expect a "vi: " tag on the input
    # as well — confirm whether the query text should be prefixed.
    output_tag = "en: "

    queries = []
    for query_path in sorted(Path(folder_path).glob('*.txt')):
        # Get query text. Assumes the files are UTF-8 (optionally with a BOM)
        # — TODO confirm; the platform default encoding is not reliable for
        # non-ASCII text.
        with open(query_path, "r", encoding="utf-8-sig") as query_file:
            raw_lines = query_file.read().splitlines()
        # Join lines with a space so words at line breaks do not get fused.
        text = " ".join(line.strip() for line in raw_lines if line.strip())

        input_ids = tokenizer(text, return_tensors="pt", padding=True).input_ids.to(device)
        model_output = model.generate(input_ids, max_length=1024)
        text_en = tokenizer.batch_decode(model_output, skip_special_tokens=True)[0]
        if text_en.startswith(output_tag):
            text_en = text_en[len(output_tag):]

        # Get query type
        query_type = query_path.stem.split("-")[-1]

        queries.append([text_en, query_type])
    return queries
# Translate all query files once; this loads the translation model, so it is slow.
queries = load_query(folder_path)



In [36]:
# Sanity check: how many queries were loaded, then the first [text, type] pair.
print(f"{len(queries)}")
print(f"{queries[0]}")

6
['A black, ice-powered boat. The boat is powered by a propeller engine that blows out the back from above. The boat is a rescue vehicle for a victim who fell into an icy lake.', 'kis']


In [37]:
from sentence_transformers import SentenceTransformer, util
import numpy as np
from pathlib import Path

# CLIP image encoder. Not referenced by the retrieval cells visible in this notebook.
img_model = SentenceTransformer('clip-ViT-B-32')
# Text encoder used by the retrieval cells: text_model.encode(...) output is compared
# against the precomputed .npy embeddings with cosine similarity.
# NOTE(review): presumably the .npy features come from clip-ViT-B-32 so that both
# sides share one embedding space — confirm.
text_model = SentenceTransformer('sentence-transformers/clip-ViT-B-32-multilingual-v1')

def load_clip_features(folder_path):
    """Load every ``*.npy`` file directly inside ``folder_path``.

    Files are visited in sorted path order so that the integer index given to
    each file is the same on every run and every machine.

    Args:
        folder_path: directory to scan (non-recursively) for ``*.npy`` files.

    Returns:
        tuple: ``(embeddings, embeddings_map)`` where both are dicts keyed by
        consecutive integers starting at 0; ``embeddings[i]`` is the array
        loaded from the file whose path string is ``embeddings_map[i]``.
        Both dicts are empty when no ``*.npy`` file is found.
    """
    embeddings = {}
    embeddings_map = {}

    for index, feature_path in enumerate(sorted(Path(folder_path).glob('*.npy'))):
        embeddings[index] = np.load(str(feature_path))
        embeddings_map[index] = str(feature_path)
    return embeddings, embeddings_map






In [38]:
# Use a forward slash: it is accepted on every OS (Windows included), whereas a
# backslash inside a normal string literal forms an escape sequence and is not a
# path separator outside Windows.
folder_path = 'data/clip-features-32'
embeddings, embeddings_map = load_clip_features(folder_path)

In [39]:
# Sanity check: both dicts should report the same number of entries.
print(len(embeddings), len(embeddings_map), sep="\n")

363
363


In [45]:
import numpy as np
import pandas as pd
import os

def load_keyframes_map(folder_path):
    """Read every ``*.csv`` file directly inside ``folder_path`` into a DataFrame.

    Args:
        folder_path: directory to scan (non-recursively) for CSV files.

    Returns:
        dict: maps each file's name (directory and extension removed) to the
        DataFrame parsed from it, using the first row as the header and no
        index column.
    """
    return {
        os.path.splitext(os.path.basename(csv_path))[0]: pd.read_csv(csv_path, index_col=None, header=0)
        for csv_path in Path(folder_path).glob('*.csv')
    }

In [46]:
# Use a forward slash: it is accepted on every OS (Windows included), whereas a
# backslash inside a normal string literal forms an escape sequence and is not a
# path separator outside Windows.
folder_path = 'data/map-keyframes'
keyframes_map = load_keyframes_map(folder_path)

In [54]:
from sentence_transformers import SentenceTransformer, util
from PIL import Image, ImageFile
import requests
import torch
import os

def find_best_keyframe(text_embedding, video_embeddings):
    """Find the stored embedding row with the highest cosine similarity to the text.

    Args:
        text_embedding: embedding of the query text.
        video_embeddings: dict mapping an index to one array of embeddings
            (assumed to hold one row per keyframe of a video — TODO confirm).

    Returns:
        tuple: ``(score, video_idx, keyframe_idx)`` — the best similarity, the
        dict key of the array containing it, and the 0-based row inside that
        array. Ties keep the first match. With an empty dict the result is
        ``(-inf, 0, 0)``.
    """
    # Start from -inf, not 0: cosine similarity can be negative, and a 0
    # baseline would silently report entry 0 when nothing scores above 0.
    best_score = float("-inf")
    best_video_idx = 0
    best_keyframe_idx = 0

    for video_idx, keyframe_embeddings in video_embeddings.items():
        scores = util.cos_sim(text_embedding, keyframe_embeddings)
        keyframe_idx = torch.argmax(scores).item()
        score = scores[0][keyframe_idx].item()
        if score > best_score:
            best_score = score
            best_video_idx = video_idx
            best_keyframe_idx = keyframe_idx
    return best_score, best_video_idx, best_keyframe_idx


for query_text, query_type in queries:
    # Only 'kis' queries are handled here; every other type is skipped.
    if query_type != 'kis':
        continue

    # Cast to float16 before comparing against the stored embeddings
    # (presumably to match their dtype — TODO confirm).
    text_embedding = text_model.encode(query_text).astype(np.float16)
    max_score, max_score_video_idx, max_score_keyframe_idx = find_best_keyframe(text_embedding, embeddings)

    # The embedding file name without its extension is the key into keyframes_map.
    base = os.path.splitext(os.path.basename(embeddings_map[max_score_video_idx]))[0]
    video_keyframes = keyframes_map[base]
    print(max_score, " - ", base)
    # Column 'n' is matched against the 0-based row index + 1.
    print(video_keyframes.loc[video_keyframes['n'] == max_score_keyframe_idx + 1])

0.334716796875  -  L09_V017
       n  pts_time   fps  frame_idx
206  207     860.4  25.0      21510
0.31103515625  -  L05_V029
       n  pts_time   fps  frame_idx
175  176     755.0  25.0      18875
0.303466796875  -  L02_V031
       n  pts_time   fps  frame_idx
116  117    410.88  25.0      10272


In [None]:
from sentence_transformers import SentenceTransformer, util
from PIL import Image, ImageFile
import requests
import torch
import os

# Ad-hoc English descriptions to try against the stored keyframe embeddings.
texts = ["A big pink fish figure with together with many other fish figures",
         "A doctor holding the patient hand, the doctor is wearing a black watch, the patient has white cloth wrap around an arm",
         "Big eye"]

for text in texts:
    # Cast to float16 before comparing against the stored embeddings
    # (presumably to match their dtype — TODO confirm).
    text_embedding = text_model.encode(text).astype(np.float16)

    # Start from -inf, not 0: cosine similarity can be negative, and a 0
    # baseline would silently report video 0 / keyframe 0 in that case.
    max_score = float("-inf")
    max_score_video_idx = 0
    max_score_keyframe_idx = 0

    for i in range(len(embeddings)):
        score = util.cos_sim(text_embedding, embeddings[i])
        max_img_idx = torch.argmax(score).item()
        candidate_score = score[0][max_img_idx].item()

        if candidate_score > max_score:
            max_score = candidate_score
            max_score_video_idx = i
            max_score_keyframe_idx = max_img_idx

    # The embedding file name without its extension is the key into keyframes_map.
    base = os.path.splitext(os.path.basename(embeddings_map[max_score_video_idx]))[0]
    print(max_score, " - ", base)
    # Column 'n' is matched against the 0-based row index + 1.
    print(keyframes_map[base].loc[keyframes_map[base]['n'] == max_score_keyframe_idx + 1])