In [10]:
import numpy as np
from pathlib import Path

def load_clip_features(folder_path):
    """Load every ``*.npy`` file directly inside ``folder_path`` as float32.

    Files are visited in sorted path order so that the integer index given to
    each file is the same on every run; the order produced by ``Path.glob``
    itself depends on the filesystem.

    Args:
        folder_path: Directory searched (non-recursively) for ``.npy`` files.

    Returns:
        A tuple ``(embeddings, embeddings_map)`` of two dicts that share the
        same consecutive integer keys starting at 0. ``embeddings[i]`` is the
        array loaded from the i-th file, cast to ``np.float32``;
        ``embeddings_map[i]`` is that file's path as a string. Both dicts are
        empty when no ``.npy`` file is found.
    """
    embeddings = {}
    embeddings_map = {}

    for index, file in enumerate(sorted(Path(folder_path).glob('*.npy'))):
        embeddings[index] = np.load(str(file)).astype(np.float32)
        embeddings_map[index] = str(file)
    return embeddings, embeddings_map

In [11]:
# Load the .npy arrays under data/clip-features-32. `embeddings` and
# `embeddings_map` share integer keys: the former holds the float32 array,
# the latter the path of the file it was loaded from.
folder_path = 'data/clip-features-32'
embeddings, embeddings_map = load_clip_features(folder_path)

In [12]:
import numpy as np
import pandas as pd
import os

def load_keyframes_map(folder_path):
    """Read every ``*.csv`` directly inside ``folder_path`` into a DataFrame.

    Args:
        folder_path: Directory searched (non-recursively) for ``.csv`` files.

    Returns:
        A dict mapping each file's base name (file name without its final
        extension) to the DataFrame parsed from it, using the first row as the
        header and no index column.
    """
    csv_paths = Path(folder_path).glob('*.csv')
    return {
        os.path.splitext(os.path.basename(csv_path))[0]: pd.read_csv(csv_path, index_col=None, header=0)
        for csv_path in csv_paths
    }

In [13]:
# Load one DataFrame per CSV under data/map-keyframes, keyed by the CSV's
# base name. Later cells look rows up through the 'n' and 'frame_idx' columns.
folder_path = 'data/map-keyframes'
keyframes_map = load_keyframes_map(folder_path)

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from pathlib import Path
import os

# Directory holding the query *.txt files that load_query reads.
# NOTE: this rebinds the `folder_path` global used by earlier cells.
folder_path = "data/pack3-groupA"

def load_query(folder_path, device='cuda'):
    """Read, translate and classify every ``*.txt`` query in ``folder_path``.

    For each file the non-empty lines are joined with single spaces, the text
    is run through the ``VietAI/envit5-translation`` seq2seq model, and a
    leading ``"en: "`` tag is removed from the decoded output. The query type
    is the part of the file's base name after the last ``-``.

    Args:
        folder_path: Directory searched (non-recursively) for ``.txt`` files.
        device: Torch device string the model and token ids are moved to.
            Defaults to ``'cuda'``, matching the previous hard-coded value.

    Returns:
        A list with one ``[translated_text, query_type, base_name]`` entry per
        file, in sorted file-path order.
    """
    queries = []
    lang_tag = "en: "

    # Set up translation model
    model_name = "VietAI/envit5-translation"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    model = model.to(device)

    # Sorted so the order of `queries` does not depend on the filesystem.
    for query_path in sorted(Path(folder_path).glob('*.txt')):
        # Explicit encoding: the platform default may not decode the query
        # text. 'utf-8-sig' also accepts files that start with a BOM.
        with open(query_path, "r", encoding="utf-8-sig") as query_file:
            lines = query_file.read().splitlines()

        # Join with a space so the last word of one line is not fused with
        # the first word of the next.
        text = " ".join(line.strip() for line in lines if line.strip())

        input_ids = tokenizer(text, return_tensors="pt", padding=True).input_ids.to(device)
        model_output = model.generate(input_ids, max_length=512)
        text_en = tokenizer.batch_decode(model_output, skip_special_tokens=True)[0]
        # Strip the tag only at the start; a blanket replace would also eat
        # "en: " inside the sentence (e.g. "children: ").
        if text_en.startswith(lang_tag):
            text_en = text_en[len(lang_tag):]

        # Get query type
        base = query_path.stem
        query_type = base.split("-")[-1]

        queries.append([text_en, query_type, base])
    return queries
# Each entry of `queries` is [translated_text, query_type, file_base_name].
queries = load_query(folder_path)

In [15]:
from PIL import Image, ImageFile
import numpy as np
import matplotlib.pyplot as plt

def read_image_from_path(path, size):
    """Return the image at ``path`` as an RGB NumPy array resized to ``size``.

    Args:
        path: Location of the image file, passed to ``Image.open``.
        size: Target size, passed unchanged to ``Image.resize``.

    Returns:
        ``np.array`` of the converted and resized image.
    """
    rgb_image = Image.open(path).convert('RGB')
    resized_image = rgb_image.resize(size)
    return np.array(resized_image)

def plot_results(results, query_folder_path, keyframes_map, result_sample=50):
    """Show the first ``result_sample`` results as a two-column image grid.

    Args:
        results: Sequence of result rows. Element 0 is the video base name,
            element 1 the keyframe number (matched against column ``n``) and
            element 2 the score.
        query_folder_path: Root folder of the keyframe images. Each image is
            read from
            ``<query_folder_path>/keyframes_<prefix>/keyframes/<base>/<nnn>.jpg``
            where ``<prefix>`` is the part of ``base`` before the first ``_``.
        keyframes_map: Dict mapping a base name to a DataFrame with an ``n``
            column.
        result_sample: Maximum number of results drawn. Defaults to 50, the
            previously hard-coded value.

    Raises:
        KeyError: If a result's base name is missing from ``keyframes_map``.
        IndexError: If a keyframe number has no row in its DataFrame.
    """
    results = results[:result_sample]
    num_images = len(results)
    if num_images == 0:
        # Nothing to draw; a zero-height figure is not valid.
        return

    rows = (num_images + 1) // 2
    # Two inches per row: the same density the fixed (10, 50) figure gave for
    # a full page of 50 results, without the empty space when there are fewer.
    fig = plt.figure(figsize=(10, 2 * rows))

    for position, result in enumerate(results, start=1):
        base, keyframe_id, score = result[0], result[1], round(result[2], 3)

        frames = keyframes_map[base]
        # Doubles as a check that the keyframe exists in the map.
        keyframe_number = frames.loc[frames['n'] == keyframe_id, 'n'].values[0]
        keyframe_file = "%03d" % keyframe_number + ".jpg"
        chunk_id = "keyframes_" + base.split('_')[0]
        query_path = os.path.join(query_folder_path, chunk_id, "keyframes", base, keyframe_file)

        ax = fig.add_subplot(rows, 2, position)
        ax.imshow(read_image_from_path(query_path, size=(1024, 1024)))
        # Label from the known components instead of fixed positions in the
        # path, which were only right for a two-component root folder.
        ax.set_title(f"Top {position} - {score}: {base} - {keyframe_file}")
        ax.axis("off")
    plt.tight_layout()
    plt.show()

In [16]:
import csv

def save_results(results, output_data_path, keyframes_map, query_name):
    """Write one query's results to ``<output_data_path>/<query_name>.csv``.

    The output directory is created if it does not exist. The query type is
    the part of ``query_name`` after the last ``-``:

    * ``"kis"`` rows are written as ``base, frame_idx``.
    * ``"qa"`` rows are written as ``base, frame_idx, answer`` where the
      answer is element 3 of the result row.
    * Any other type produces an empty file.

    Args:
        results: Iterable of result rows. Element 0 is the video base name
            and element 1 the keyframe number (matched against column ``n``).
        output_data_path: Directory the CSV is written into.
        keyframes_map: Dict mapping a base name to a DataFrame with ``n`` and
            ``frame_idx`` columns.
        query_name: Query base name; also the output file's base name.

    Raises:
        KeyError: If a result's base name is missing from ``keyframes_map``.
        IndexError: If a keyframe number has no row in its DataFrame.
    """
    import os  # local import: the cell defining this function only imports csv

    query_type = query_name.split("-")[-1]
    if output_data_path:
        os.makedirs(output_data_path, exist_ok=True)
    output_file_path = os.path.join(output_data_path, query_name + ".csv")

    with open(output_file_path, 'w', newline='') as file:
        writer = csv.writer(file, delimiter=',')

        for result in results:
            base, keyframe_id = result[0], result[1]
            frames = keyframes_map[base]
            frame_idx = frames.loc[frames['n'] == keyframe_id, 'frame_idx'].values[0]

            if query_type == "qa":
                writer.writerow([base, frame_idx, result[3]])
            elif query_type == "kis":
                writer.writerow([base, frame_idx])

In [17]:
from sentence_transformers import SentenceTransformer, util
import torch
import nltk
from PIL import Image
import re
# nltk.download('punkt')

# SentenceTransformer instances already built by _get_kis_text_model, by name.
_KIS_TEXT_MODEL_CACHE = {}


def _get_kis_text_model(model_name='sentence-transformers/clip-ViT-B-32'):
    """Return the SentenceTransformer for ``model_name``, building it once."""
    if model_name not in _KIS_TEXT_MODEL_CACHE:
        _KIS_TEXT_MODEL_CACHE[model_name] = SentenceTransformer(model_name)
    return _KIS_TEXT_MODEL_CACHE[model_name]


def process_kis(query_text, embeddings, sample_per_video, results_size, text_model=None):
    """Rank keyframes of every video against ``query_text`` by cosine score.

    For each entry of ``embeddings`` the ``sample_per_video`` best-scoring
    rows are taken, and the overall best ``results_size`` candidates are kept.

    NOTE: file names are resolved through the notebook-level global
    ``embeddings_map`` (index -> file path); it must be defined before this
    function is called.

    Args:
        query_text: Text encoded with the text model.
        embeddings: Mapping with keys ``0 .. len(embeddings) - 1``; each value
            is passed to ``util.cos_sim`` together with the text embedding.
        sample_per_video: Maximum number of candidates taken per entry. Capped
            at the number of scores available for that entry.
        results_size: Maximum number of rows returned.
        text_model: Optional object with an ``encode(text)`` method. When
            omitted, a ``clip-ViT-B-32`` SentenceTransformer is built on first
            use and reused on later calls instead of being reloaded per query.

    Returns:
        A list of ``[file_base, keyframe_id, score]`` rows sorted by ascending
        score, where ``keyframe_id`` is the 1-based row index within the
        entry. Empty when ``results_size`` is not positive.
    """
    import heapq

    print(query_text)

    if results_size <= 0:
        return []

    if text_model is None:
        text_model = _get_kis_text_model()
    text_embedding = text_model.encode(query_text)

    # Min-heap of (score, insertion_order, row). The insertion counter keeps
    # ties in first-seen order and stops the heap from comparing the rows.
    best = []
    inserted = 0

    for i in range(len(embeddings)):
        scores = util.cos_sim(text_embedding, embeddings[i]).flatten()
        # topk raises if k exceeds the number of scores for this entry.
        k = min(sample_per_video, scores.numel())
        top_scores, top_indices = torch.topk(scores, k)

        file_base = os.path.splitext(os.path.basename(embeddings_map[i]))[0]

        for score_raw, row_index in zip(top_scores.tolist(), top_indices.tolist()):
            entry = (score_raw, inserted, [file_base, row_index + 1, score_raw])
            inserted += 1

            if len(best) < results_size:
                heapq.heappush(best, entry)
            elif score_raw > best[0][0]:
                # Evict the current lowest score in favour of the new one.
                heapq.heapreplace(best, entry)

    return [row for _, _, row in sorted(best)]

def process_qa(query_text, query_folder_path, embeddings, sample_per_video, results_size, processor_qa, model_qa):
    """Retrieve keyframes for a QA query and ask the VQA model about each one.

    The sentences of ``query_text`` that contain ``?`` are joined into the
    question, followed by ``" Output in only numbers."``. Candidate keyframes
    come from ``process_kis`` called with the full ``query_text``. For every
    candidate, the digits found in the model's decoded answer are concatenated
    and appended to that candidate's row.

    NOTE: keyframe rows are resolved through the notebook-level global
    ``keyframes_map``; it must be defined before this function is called.

    Args:
        query_text: Full query text (description plus question).
        query_folder_path: Root folder of the keyframe images. Each image is
            read from
            ``<query_folder_path>/keyframes_<prefix>/keyframes/<base>/<nnn>.jpg``.
        embeddings: Forwarded to ``process_kis``.
        sample_per_video: Forwarded to ``process_kis``.
        results_size: Forwarded to ``process_kis``.
        processor_qa: Callable that builds model inputs from an image and a
            question, and provides ``decode``.
        model_qa: Model providing ``generate``. Inputs are moved to
            ``model_qa.device`` when that attribute exists, else to ``"cuda"``.

    Returns:
        The rows returned by ``process_kis``, each extended in place to
        ``[file_base, keyframe_id, score, answer_digits]``.
    """
    # Only sentences containing '?' are put to the VQA model.
    questions = [sentence for sentence in nltk.sent_tokenize(query_text) if '?' in sentence]
    query_question = " ".join(questions)
    query_question += " Output in only numbers."

    # Retrieval uses the whole query text; a variant that passed only the
    # non-question sentences was disabled in the original code.
    results = process_kis(query_text, embeddings, sample_per_video, results_size)
    print(query_question)

    # Follow the model's own device rather than assuming CUDA.
    device = getattr(model_qa, "device", "cuda")

    for result in results:
        base, keyframe_id = result[0], result[1]
        frames = keyframes_map[base]
        # Doubles as a check that the keyframe exists in the map.
        keyframe_number = frames.loc[frames['n'] == keyframe_id, 'n'].values[0]
        keyframe_file = "%03d" % keyframe_number + ".jpg"
        chunk_id = "keyframes_" + base.split('_')[0]
        query_path = os.path.join(query_folder_path, chunk_id, "keyframes", base, keyframe_file)

        # convert() returns a separate image, so the file can be closed here.
        with Image.open(query_path) as raw_image:
            image = raw_image.convert('RGB')

        inputs = processor_qa(image, query_question, return_tensors="pt").to(device)
        out = model_qa.generate(**inputs)
        generated_text = processor_qa.decode(out[0], skip_special_tokens=True)
        # NOTE(review): several numbers in one answer are concatenated
        # ("2 and 3" -> "23"); confirm this is the intended answer format.
        numbers = re.findall(r'\d+', generated_text)
        result.append("".join(numbers))
    return results

In [None]:
from sentence_transformers import SentenceTransformer, util
from transformers import BlipProcessor, BlipForQuestionAnswering
import open_clip
from PIL import Image, ImageFile
import requests
import tensorflow as tf
import matplotlib.pyplot as plt
import torch
import os

# Models created once for the whole run. `text_model` is not referenced again
# in this cell; the BLIP processor/model pair is handed to process_qa.
text_model = SentenceTransformer('sentence-transformers/clip-ViT-B-32')
processor_qa = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
model_qa = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base").to("cuda")

# Run configuration.
query_folder_path = "data/keyframes"      # root folder of the keyframe images
output_folder_path = "data/submission"    # one CSV per query is written here
results_size = 100                        # rows kept per query
sample_per_video = 3                      # candidates taken from each video
i = 1  # incremented once per query; not read anywhere else in this cell

# Each query is [translated_text, query_type, query_name]. Queries whose type
# is neither 'kis' nor 'qa' are only printed.
for query in queries:
    query_text = query[0]
    query_type = query[1]
    query_name = query[2]
    print("Query: ", query_name)

    if (query_type == 'kis'):
        results = process_kis(query_text, embeddings, sample_per_video, results_size)
        # kis rows end with the score, so sort on the last element, best first.
        results = sorted(results, key = lambda item: item[-1], reverse=True)
        plot_results(results, query_folder_path, keyframes_map)
        save_results(results, output_folder_path, keyframes_map, query_name)
    elif query_type == 'qa':
        results = process_qa(query_text, query_folder_path, embeddings, sample_per_video, results_size, processor_qa, model_qa)
        # process_qa appends the answer after the score, so the score is the
        # second-to-last element of each row.
        results = sorted(results, key = lambda item: item[-2], reverse=True)
        plot_results(results, query_folder_path, keyframes_map)
        save_results(results, output_folder_path, keyframes_map, query_name)
    i += 1