In [1]:
import warnings
# Install a blanket "ignore" filter: with no category/message/module given,
# this suppresses every warning class for the rest of the kernel session.
# NOTE(review): presumably meant to keep model-loading output readable, but it
# also hides deprecation and numerical warnings from numpy and other
# libraries — consider narrowing it with `category=` or `module=`.
warnings.filterwarnings('ignore')

In [5]:
from modules.import_llave import load_llave_model
from modules.embed_utils import embed_frame
import os
import numpy as np 
import json 
import glob
from tqdm import tqdm
import yaml 

In [None]:
# Read the pipeline settings from ../config.yaml (resolved against the current
# working directory of the kernel).
# The encoding is explicit so that non-ASCII characters in configured paths are
# decoded the same way on every platform instead of using the locale default.
with open('../config.yaml', 'r', encoding='utf-8') as f:
    # An empty YAML document parses to None; fall back to an empty dict so the
    # validation below reports the problem clearly.
    config = yaml.safe_load(f) or {}

if not isinstance(config, dict):
    raise TypeError(
        f"../config.yaml must contain a top-level mapping, got {type(config).__name__}"
    )

# Fail early, naming every missing setting at once, rather than hitting a bare
# KeyError on the first absent key.
missing_keys = [
    key
    for key in ('root_folder', 'output_embedding_file', 'output_metadata_file')
    if key not in config
]
if missing_keys:
    raise KeyError(f"../config.yaml is missing required keys: {missing_keys}")

# Folder that is searched recursively for .jpg frames by the embedding cell.
root_folder = config['root_folder']
# Destination of the raw float32 embedding matrix (.bin).
output_embedding_file = config['output_embedding_file']
# Destination of the JSON sidecar describing the matrix (shape, dtype, paths).
output_metadata_file = config['output_metadata_file']

In [10]:
# Discover frames
# Done before loading the model so an empty or misconfigured root_folder fails
# immediately instead of after the (slow) model load. Sorting gives a
# reproducible order for the rows of the saved matrix.
image_paths = sorted(glob.glob(os.path.join(root_folder, "**", "*.jpg"), recursive=True))
if not image_paths:
    raise FileNotFoundError(f"No .jpg files found under {root_folder!r}")

# Load model
tokenizer, model, image_processor, device = load_llave_model()
print(f'Embed with {device}')

# Embed
all_embeddings = []
all_paths = []
failed_paths = []

for image_path in tqdm(image_paths, desc='Embedding frames'):
    try:
        emb = embed_frame(image_path, model, tokenizer, image_processor, device)
        # Cast to float32 so the on-disk dtype matches the "dtype" field
        # recorded in the metadata file.
        all_embeddings.append(emb.astype(np.float32))
        all_paths.append(image_path)
    except Exception as e:
        # Deliberate best-effort: one unreadable frame must not abort the whole
        # run. The path is recorded so the failures can be summarised below.
        failed_paths.append(image_path)
        print(f'[WARNING] Failed to embed {image_path} : {e}')

if failed_paths:
    print(f"[WARNING] {len(failed_paths)} of {len(image_paths)} images could not be embedded.")

# np.vstack raises an unhelpful "need at least one array" ValueError on an
# empty list, so report the real cause explicitly.
if not all_embeddings:
    raise RuntimeError(
        f"None of the {len(image_paths)} images under {root_folder!r} could be embedded; nothing to save."
    )

# === Save to .bin and .json ===
# Create the output folders up front; otherwise a missing directory would only
# be discovered after all embeddings have been computed.
for output_file in (output_embedding_file, output_metadata_file):
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

# NOTE(review): assumes every embedding stacks to one row per image, so that
# row i corresponds to all_paths[i] — confirm against embed_frame's output shape.
embedding_matrix = np.vstack(all_embeddings)
# tofile writes raw bytes with no header; the shape and dtype needed to read
# the file back are stored in the JSON metadata written next.
embedding_matrix.tofile(output_embedding_file)

with open(output_metadata_file, "w", encoding="utf-8") as f:
    json.dump({
        "shape": list(embedding_matrix.shape),
        "dtype": "float32",
        "paths": all_paths
    }, f, indent=2, ensure_ascii=False)

print(f"\nEmbedded {len(all_paths)} images.")
print(f"Saved: {output_embedding_file} and {output_metadata_file}")


Loaded LLaVA model: zhibinlan/LLaVE-0.5B


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
You are using a model of type llava to instantiate a model of type llava_qwen. This is not supported for all configurations of models and can yield errors.


Loading vision tower: google/siglip-so400m-patch14-384


Some weights of LlavaQwenForCausalLM were not initialized from the model checkpoint at zhibinlan/LLaVE-0.5B and are newly initialized: ['lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model Class: LlavaQwenForCausalLM
Embed with cuda


Embedding frames: 100%|██████████| 488/488 [03:20<00:00,  2.44it/s]


Embedded 488 images.
Saved: /home/quocthinh/Desktop/pythonCode/AIC_2025/llave_embeddings.bin and /home/quocthinh/Desktop/pythonCode/AIC_2025/llave_metadata.json



