In [1]:
!pip install flash_attn

Collecting flash_attn
  Downloading flash_attn-2.7.4.post1.tar.gz (6.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.0/6.0 MB[0m [31m48.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch->flash_attn)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch->flash_attn)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch->flash_attn)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch->flash_attn)
  Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch->flash_attn)
  Downloading nvidia_cusolver_c

In [2]:
import os
import pickle
import numpy as np
from tqdm import tqdm
import torch
import torch.nn as nn
from PIL import Image
from transformers import AutoModel, AutoProcessor
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

# Định nghĩa các đường dẫn
BASE_DIR = '/kaggle/input/vitextvqa/ViTextVQA_images'
OUTPUT_DIR = '/kaggle/working/'
IMAGE_DIR = os.path.join(BASE_DIR, 'st_images')
FEATURES_FILE = os.path.join(OUTPUT_DIR, 'internvit_features.pkl')

# Đảm bảo thư mục đầu ra tồn tại
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Dataset tùy chỉnh để tải ảnh
class ImageDataset(Dataset):
    def __init__(self, image_dir):
        self.image_dir = image_dir
        self.image_files = [f for f in os.listdir(image_dir) if f.endswith(('.jpg', '.jpeg', '.png'))]
        # Transform để resize và chuyển ảnh thành tensor
        self.transform = transforms.Compose([
            transforms.Resize((448, 448)),  # Resize về 448x448 (phù hợp InternViT)
            transforms.ToTensor(),  # Chuyển PIL Image thành tensor (C, H, W)
            transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])  # Chuẩn hóa
        ])

    def __len__(self):
        return len(self.image_files)

    def __getitem__(self, idx):
        img_name = self.image_files[idx]
        img_path = os.path.join(self.image_dir, img_name)
        image = Image.open(img_path).convert("RGB")
        image_id = os.path.splitext(img_name)[0]
        # Chuyển ảnh thành tensor
        image_tensor = self.transform(image)
        return image_tensor, image_id  # Chỉ trả về tensor và image_id

# Tải mô hình InternViT và processor
model_name = "OpenGVLab/InternViT-300M-448px-V2_5"
processor = AutoProcessor.from_pretrained(
    model_name,
    trust_remote_code=True,
    use_flash_attn=False
)
model = AutoModel.from_pretrained(
    model_name,
    trust_remote_code=True,
    use_flash_attn=False
)

# Chuyển mô hình sang GPU chính (cuda:0)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = model.to(device)  # Đảm bảo mô hình nằm trên cuda:0
if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs!")
    model = nn.DataParallel(model, device_ids=[0, 1])  # Sử dụng cả 2 GPU
model.eval()

# Hàm chuyển tensor về PIL Image
def tensor_to_pil(tensor, device):
    # Denormalize và chuyển tensor về numpy
    mean = torch.tensor([0.5, 0.5, 0.5]).view(3, 1, 1).to(device)
    std = torch.tensor([0.5, 0.5, 0.5]).view(3, 1, 1).to(device)
    tensor = tensor * std + mean
    tensor = tensor.clamp(0, 1) * 255
    tensor = tensor.permute(1, 2, 0).cpu().numpy().astype(np.uint8)
    return Image.fromarray(tensor)

# Hàm trích xuất đặc trưng cho một batch ảnh
def extract_feature_batch(image_tensors, device):
    # Chuyển batch tensor thành list PIL Image
    pil_images = [tensor_to_pil(tensor, device) for tensor in image_tensors]
    # Xử lý với processor
    inputs = processor(images=pil_images, return_tensors="pt")
    inputs = {k: v.to(device) for k, v in inputs.items()}
    
    with torch.no_grad():
        outputs = model(**inputs)
        features = outputs.last_hidden_state[:, 0, :]  # Lấy [CLS] token
    
    return features.cpu().numpy()

# Làm sạch bộ nhớ GPU trước khi chạy
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# Tạo dataset và dataloader
dataset = ImageDataset(IMAGE_DIR)
batch_size = 16  # Tùy chỉnh batch size tùy theo bộ nhớ GPU
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False, num_workers=4, pin_memory=True)

# Dictionary để lưu đặc trưng
features = {}

# Trích xuất đặc trưng với tiến trình
for batch_image_tensors, batch_image_ids in tqdm(dataloader, desc="Extracting features"):
    # Chuyển batch tensor sang cuda:0
    batch_image_tensors = batch_image_tensors.to(device)
    # Trích xuất đặc trưng
    batch_features = extract_feature_batch(batch_image_tensors, device)
    
    # Lưu đặc trưng vào dictionary
    for image_id, feature in zip(batch_image_ids, batch_features):
        features[image_id] = feature[np.newaxis, :]  # Thêm chiều để giữ shape (1, d)

# Lưu đặc trưng vào file pickle
with open(FEATURES_FILE, 'wb') as f:
    pickle.dump(features, f)

print(f"Features extracted and saved to {FEATURES_FILE}")

2025-05-09 15:37:01.669320: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746805022.073264      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746805022.187259      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


preprocessor_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/748 [00:00<?, ?B/s]

configuration_intern_vit.py:   0%|          | 0.00/5.55k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/OpenGVLab/InternViT-300M-448px-V2_5:
- configuration_intern_vit.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


modeling_intern_vit.py:   0%|          | 0.00/14.9k [00:00<?, ?B/s]

flash_attention.py:   0%|          | 0.00/3.46k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/OpenGVLab/InternViT-300M-448px-V2_5:
- flash_attention.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/OpenGVLab/InternViT-300M-448px-V2_5:
- modeling_intern_vit.py
- flash_attention.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/608M [00:00<?, ?B/s]

Using 2 GPUs!


Extracting features: 100%|██████████| 1048/1048 [43:01<00:00,  2.46s/it]


Features extracted and saved to /kaggle/working/internvit_features.pkl


In [3]:
features

{'7981': array([[-5.0049367, -5.770723 , -3.9500704, ..., -4.9654665, -1.5438336,
          3.4893723]], dtype=float32),
 '12666': array([[-4.156481 , -6.2520065, -5.095504 , ..., -4.622704 , -1.9338377,
          4.7668686]], dtype=float32),
 '13288': array([[-4.7622185, -3.9988837, -3.43635  , ..., -3.1324687, -2.4866867,
          2.5917804]], dtype=float32),
 '6234': array([[-4.9264054, -4.4495225, -3.7982697, ..., -2.7575755, -2.77636  ,
          3.4331894]], dtype=float32),
 '1269': array([[-4.7131944, -4.1748834, -3.6044164, ..., -3.466506 , -2.4241233,
          2.9179087]], dtype=float32),
 '3863': array([[-4.2772937, -3.8957767, -3.505686 , ..., -4.7293744, -1.8038578,
          2.999613 ]], dtype=float32),
 '6241': array([[-4.717731 , -4.069887 , -3.3911355, ..., -3.319418 , -2.901861 ,
          3.2657278]], dtype=float32),
 '10304': array([[-5.7573047, -3.6249733, -3.2914774, ..., -3.462882 , -2.066247 ,
          2.6245692]], dtype=float32),
 '623': array([[-5.174476 , -

In [4]:
print(features['7981'].shape)  # Ví dụ: (1, 768) hoặc (1, 1024)

(1, 1024)


In [5]:
feature = features['7981'].squeeze(0)  # Chuyển từ (1, d) sang (d,)
feature

array([-5.0049367, -5.770723 , -3.9500704, ..., -4.9654665, -1.5438336,
        3.4893723], dtype=float32)