In [None]:
!pip -q install faiss-cpu

In [None]:
import faiss

In [None]:
import os

# Folder that holds the encoded key-frame shard files.
root = '/kaggle/input/stage2b-encoded-keyframes'
# List the folder and sort in place so shards are always visited in the same order.
shard_files = os.listdir(root)
shard_files.sort()
# Prefix every file name with the folder to obtain a usable path.
shard_paths = list(map(lambda name: os.path.join(root, name), shard_files))
# Show the resolved shard paths as the cell output.
shard_paths

In [None]:
# Pick the torch device name used by later cells
import torch

# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
# Report which device was picked.
print('device: ', device)

In [None]:
# Function to add vectors from a shard index to the merged index
def add_vectors_from_shard(shard_filename, target_index=None):
    """
    Reads vectors from a shard index file and adds them to a destination index.

    :param shard_filename: Path to the shard index file.
    :param target_index: Index that receives the vectors. When omitted, the
        notebook-level ``merged_index`` is used, so existing one-argument
        calls keep working unchanged.
    """
    # Resolve the destination at call time: the global only has to exist by
    # the time the function is called, not when it is defined.
    if target_index is None:
        target_index = merged_index
    # Load the FAISS index from the shard file
    shard_index = faiss.read_index(shard_filename)
    # An empty shard has nothing to contribute
    if shard_index.ntotal == 0:
        return
    # `reconstruct_n` returns the stored vectors for ids 0 .. ntotal-1
    vectors = shard_index.reconstruct_n(0, shard_index.ntotal)
    # Append the vectors to the destination index
    target_index.add(vectors)

In [None]:
%%time

import numpy as np

# Create a FAISS index (HNSWFlat) for each shard. The index type can be changed (or benchmarked).
# Note: HNSW indexing is not implemented on GPU.

num_neighbors = 32  # Number of neighbors to link in HNSW

# Define the folder to store FAISS index files
bin_collection = 'FAISS_index_folder'
# Create the folder if it does not exist
os.makedirs(bin_collection, exist_ok=True)

# Iterate over each shard to create and save an index
for i, path in enumerate(shard_paths):
    print(f'Working with: {path}')
    # Load straight into host memory: the data is converted to NumPy right
    # below, so staging it on a GPU first would only add a device round trip
    # and occupy GPU memory.
    encoded_frames = torch.load(path,
                                map_location="cpu",
                                weights_only=True)
    print(f'Number of images: {len(encoded_frames)}')
    # The last axis is taken as the vector dimension
    dimension = encoded_frames.shape[-1]
    # Create an HNSWFlat index with the specified dimension and number of neighbors
    index_hnsw = faiss.IndexHNSWFlat(dimension, num_neighbors)
    # Hand FAISS a C-contiguous float32 array; `ascontiguousarray` only copies
    # when the dtype or memory layout actually has to change.
    vector_data = np.ascontiguousarray(encoded_frames.detach().cpu().numpy(),
                                       dtype=np.float32)
    # Add the shard vectors to the FAISS index
    index_hnsw.add(vector_data)
    # Save the index to a binary file; the zero-padded counter keeps the
    # file names in shard order when they are sorted as strings.
    faiss.write_index(index_hnsw, f"{bin_collection}/shard_index_hnsw_{i:05d}.bin")

In [None]:
# Merge and store indexes
# Collect all `.bin` files in the `bin_collection` directory and sort them
shard_filenames = sorted(
    os.path.join(bin_collection, f)
    for f in os.listdir(bin_collection)
    if f.endswith('.bin')
)
# Fail early with an actionable message instead of an IndexError on the next line
if not shard_filenames:
    raise FileNotFoundError(
        f"No .bin shard indexes found in '{bin_collection}'; "
        "run the shard indexing cell first."
    )
# Load the first index to get the dimension of the vectors and set up the merged index
index = faiss.read_index(shard_filenames[0])
dimension = index.d  # Get the dimension of the vectors from the index
# Create the final merged index using HNSWFlat with the same dimension and number of neighbors
merged_index = faiss.IndexHNSWFlat(dimension, num_neighbors)
# Add the vectors of every shard, in sorted file-name order, to the merged index
for shard_filename in shard_filenames:
    add_vectors_from_shard(shard_filename)
# Save the final merged index to a file
faiss.write_index(merged_index, "merged_index_hnsw_baseline_v0.bin")

In [None]:
# Key-frame folders for video batches L01 .. L12.
# Every batch follows the same naming scheme, so the paths are generated
# instead of hand-written; this also drops the stray trailing '.' that the
# hand-written L07 folder path carried.
roots = [
    f'/kaggle/input/videos-l{batch:02d}/key_frame_folder_videos-l{batch:02d}'
    for batch in range(1, 13)
]
# Each annotation file sits next to its key-frame folder: same path plus '.json'.
annotations = [f'{folder}.json' for folder in roots]

In [None]:
import json

# Combine the per-batch annotation files into one dictionary keyed by a
# running counter ("0", "1", ...) and save it as `image_info_dict.json`.
# NOTE(review): presumably these keys are meant to line up with vector
# positions in the merged FAISS index — confirm the shard order matches the
# order of `roots` / `annotations`.
key_frame_folder = {}
i = 0
# The loop variable is `frame_root` rather than `root` so the shard directory
# stored in `root` by an earlier cell is not overwritten.
for frame_root, annotation in zip(roots, annotations):
    # Load annotation (read as UTF-8, matching how the merged file is written)
    with open(annotation, 'r', encoding='utf-8') as openfile:
        index_caption_hashtag_dict = json.load(openfile)
    print(f'Load {annotation}: DONE with total {len(index_caption_hashtag_dict)} images!')
    for info in index_caption_hashtag_dict.values():
        frame_ID = info['frame_ID']
        # NOTE(review): this rewrites every 'content' substring of the stored
        # path, not only a leading directory — confirm no frame path contains
        # 'content' elsewhere.
        frame_path = frame_root + info['frame_path'].replace('content', 'kaggle/working')
        video_ID = info['video_ID']
        timestamp = info['timestamp']
        # Keys are the running counter as a string
        key_frame_folder[str(i)] = {
            "frame_ID": frame_ID,
            "frame_path": frame_path,
            "video_ID": video_ID,
            "timestamp": timestamp,
        }
        i += 1
# Write the merged annotation; ensure_ascii=False keeps non-ASCII text readable
with open("image_info_dict.json", "w", encoding='utf-8') as outfile:
    json.dump(key_frame_folder, outfile, ensure_ascii=False)
print(f'Save new annotation: DONE with total {len(key_frame_folder)} images!')