In [1]:
# %% [code]
!pip install git+https://github.com/huggingface/transformers accelerate

Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-gwngagj_
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers /tmp/pip-req-build-gwngagj_
  Resolved https://github.com/huggingface/transformers to commit 3927ffed31e3c0d2929bf98bd05b7c61fcc48b62
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting huggingface-hub==1.0.0.rc5 (from transformers==5.0.0.dev0)
  Downloading huggingface_hub-1.0.0rc5-py3-none-any.whl.metadata (14 kB)
Collecting tokenizers<=0.23.0,>=0.22.0 (from transformers==5.0.0.dev0)
  Downloading tokenizers-0.22.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.1

In [None]:
from kaggle_secrets import UserSecretsClient
from huggingface_hub import login
import os

# Read the Hugging Face access token from the Kaggle secret store once and
# reuse it, instead of querying the secret store a second time.
user_secrets = UserSecretsClient()
hf_token = user_secrets.get_secret("YOUR_ACCESS_KEY")

# Expose the credentials through environment variables for later cells.
# NOTE(review): huggingface_hub is documented to pick up HF_TOKEN from the
# environment, which is presumably why the explicit login below is disabled.
os.environ["HF_TOKEN"] = hf_token
os.environ["HF_USERNAME"] = "YOUR_USERNAME"

# login(token=hf_token)

In [3]:
import torch
from transformers import AutoModel, AutoProcessor, AutoImageProcessor
from PIL import Image
import pandas as pd
from tqdm import tqdm
import requests
from io import BytesIO
import numpy as np
import re
import torch.nn.functional as F


# --- Model and data locations -------------------------------------------
# Hub identifier of the checkpoint loaded below.
MODEL_ID = "facebook/dinov3-vith16plus-pretrain-lvd1689m"
# Local path named after the checkpoint (not referenced in the visible cells).
SAVE_PATH = "./dinov3-vith16plus-pretrain-lvd1689m"
# CSV that supplies the `image_link` and `sample_id` columns.
DATA_PATH = "/kaggle/input/amlc2025/student_resource/dataset/test.csv"

# --- Run settings ---------------------------------------------------------
# Number of images sent through the model per forward pass.
BATCH_SIZE = 128
# When True, only a small prefix of the shard is processed.
DEBUG = False

# --- Sharding across Kaggle accounts ---------------------------------------
# Total number of accounts/shards the CSV is split into.
TOTAL_SHARDS = 10
# 1-based index of the shard this account handles; change it per account.
SHARD_NUMBER = 10

# --- Device -----------------------------------------------------------------
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# Image preprocessor that matches the checkpoint.
processor = AutoImageProcessor.from_pretrained(MODEL_ID)

# Load the checkpoint with automatic device placement, then switch it to
# evaluation mode (the notebook only runs inference).
model = AutoModel.from_pretrained(MODEL_ID, device_map="auto")
model = model.eval()

# Validate the shard configuration before slicing: an out-of-range
# SHARD_NUMBER would otherwise select an empty slice without any error here.
if TOTAL_SHARDS < 1 or not 1 <= SHARD_NUMBER <= TOTAL_SHARDS:
    raise ValueError(
        f"SHARD_NUMBER must be between 1 and TOTAL_SHARDS, got {SHARD_NUMBER}/{TOTAL_SHARDS}"
    )

total_df = pd.read_csv(DATA_PATH)
total_rows = len(total_df)

# Every shard gets `rows_per_shard` consecutive rows; SHARD_NUMBER is 1-based.
rows_per_shard = total_rows // TOTAL_SHARDS
start_idx = (SHARD_NUMBER - 1) * rows_per_shard
# The last shard also absorbs the remainder left by the integer division.
end_idx = start_idx + rows_per_shard if SHARD_NUMBER < TOTAL_SHARDS else total_rows

df = total_df.iloc[start_idx:end_idx].reset_index(drop=True)
print(f"Processing shard {SHARD_NUMBER}/{TOTAL_SHARDS}: {len(df)} rows")

# Number of rows of the shard to process. In DEBUG mode only a short prefix is
# used, clamped to the shard size so the batching loop never builds an empty
# trailing batch when the shard holds fewer rows than the debug cap.
DEBUG_ROW_LIMIT = 200
lim = min(len(df), DEBUG_ROW_LIMIT) if DEBUG else len(df)

# Per-row results collected by the batching loop: one embedding vector and one
# sample id per processed row, in the same order.
all_image_embeddings = []
all_ids = []

# Upper bound (seconds) for a single image download, so one unresponsive host
# cannot stall the whole run.
IMAGE_REQUEST_TIMEOUT_S = 30


def _fetch_image(image_url):
    """Download ``image_url`` and return it as an RGB PIL image.

    Args:
        image_url: Value taken from the ``image_link`` column; anything that is
            not an ``http://`` / ``https://`` string is rejected.

    Returns:
        The downloaded image converted to RGB.

    Raises:
        ValueError: If ``image_url`` is not an http(s) URL string.
        Exception: Whatever ``requests`` or PIL raise for a failed download,
            a non-success HTTP status, or an undecodable payload.
    """
    if not isinstance(image_url, str) or not image_url.startswith(("http://", "https://")):
        raise ValueError("Invalid image URL")

    # Read the whole body through `.content` rather than the raw stream: the
    # connection is released once the body is consumed, and requests applies
    # any transfer decoding (e.g. gzip) before PIL sees the bytes.
    response = requests.get(image_url, timeout=IMAGE_REQUEST_TIMEOUT_S)
    response.raise_for_status()
    return Image.open(BytesIO(response.content)).convert("RGB")


for batch_start in tqdm(range(0, lim, BATCH_SIZE)):
    batch_df = df.iloc[batch_start : min(batch_start + BATCH_SIZE, lim)]

    # Download every image of the batch. A failed download is replaced by a
    # black placeholder so the embeddings stay aligned one-to-one with the rows.
    images_to_process = []
    for image_url in batch_df["image_link"]:
        try:
            image = _fetch_image(image_url)
        except Exception as e:
            print(f"Failed to process image {image_url}. Using a black dummy image instead. Error: {e}")
            image = Image.new('RGB', (224, 224), color='black')
        images_to_process.append(image)

    inputs = processor(
        images=images_to_process,
        return_tensors="pt"
    ).to(model.device)

    with torch.no_grad():
        outputs = model(**inputs)
        image_embeddings = outputs.pooler_output

        # L2-normalize each embedding vector.
        image_embeddings = F.normalize(image_embeddings, p=2, dim=-1)

    # Copy the batch to host memory once instead of once per row, then record
    # one embedding and one sample id per row, in row order.
    all_image_embeddings.extend(image_embeddings.cpu().numpy())
    all_ids.extend(batch_df["sample_id"].tolist())

    # Drop references to the per-batch objects before the next iteration.
    del inputs, outputs, image_embeddings, images_to_process

# Turn the per-row lists into arrays: embeddings become one 2D array (one row
# per sample) and the ids a 1D array in the same order.
all_ids = np.array(all_ids)
all_image_embeddings = np.stack(all_image_embeddings)

# Write the image embeddings and the sample ids to separate, shard-numbered
# .npy files in the working directory.
embeddings_file = f"image_embeddings_{SHARD_NUMBER}.npy"
ids_file = f"sample_ids_{SHARD_NUMBER}.npy"
np.save(embeddings_file, all_image_embeddings)
np.save(ids_file, all_ids)

# Report the saved shapes as a quick sanity check.
print("Saved image embeddings shape:", all_image_embeddings.shape)
print("Saved sample IDs shape:", all_ids.shape)

config.json:   0%|          | 0.00/744 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.36G [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/585 [00:00<?, ?B/s]

Processing shard 10/10: 7500 rows


100%|██████████| 59/59 [39:06<00:00, 39.78s/it]

Saved image embeddings shape: (7500, 1280)
Saved sample IDs shape: (7500,)





In [4]:
# del model, processor