## BLIP-2 Feature Extraction for MMT (Per-frame CLS Token, 1408D)

In [1]:
import os
import glob
import torch
import numpy as np
from PIL import Image
from tqdm import tqdm
from transformers import Blip2Processor, Blip2Model

2025-05-23 19:33:14.323836: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748028794.582883      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748028794.658899      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


### Configuration

In [2]:
# --- Inputs -----------------------------------------------------------------
# Root folder of pre-extracted frames; the extraction cell expects one
# sub-folder of .jpg files per video ID underneath it.
FRAME_DIR = "/kaggle/input/sota-mrsvtt-test-frame/msrvtt_test_frames_1fps"
# Local BLIP-2 checkpoint directory handed to `from_pretrained`.
MODEL_DIR = "/kaggle/input/image-caption-models/blip2-opt-2.7b"

# --- Outputs ----------------------------------------------------------------
# Destination for the per-video feature files (one .npy per video).
OUTPUT_DIR = "/kaggle/working/blip2_features_seq_test"

# Create the output folder up front; harmless if it already exists.
os.makedirs(OUTPUT_DIR, exist_ok=True)

### Load BLIP-2 Model (ViT-G/14, 1408D CLS)

In [3]:
# Image pre-processing pipeline stored alongside the checkpoint.
processor = Blip2Processor.from_pretrained(MODEL_DIR)

# Half-precision weights; device_map="auto" leaves device placement to the library.
model = Blip2Model.from_pretrained(MODEL_DIR, device_map="auto", torch_dtype=torch.float16)

# Switch to evaluation mode for inference. `eval()` returns the model itself, so
# as the last expression of the cell it also echoes the architecture.
model.eval()

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Blip2Model(
  (vision_model): Blip2VisionModel(
    (embeddings): Blip2VisionEmbeddings(
      (patch_embedding): Conv2d(3, 1408, kernel_size=(14, 14), stride=(14, 14))
    )
    (encoder): Blip2Encoder(
      (layers): ModuleList(
        (0-38): 39 x Blip2EncoderLayer(
          (self_attn): Blip2Attention(
            (dropout): Dropout(p=0.0, inplace=False)
            (qkv): Linear(in_features=1408, out_features=4224, bias=True)
            (projection): Linear(in_features=1408, out_features=1408, bias=True)
          )
          (layer_norm1): LayerNorm((1408,), eps=1e-05, elementwise_affine=True)
          (mlp): Blip2MLP(
            (activation_fn): GELUActivation()
            (fc1): Linear(in_features=1408, out_features=6144, bias=True)
            (fc2): Linear(in_features=6144, out_features=1408, bias=True)
          )
          (layer_norm2): LayerNorm((1408,), eps=1e-05, elementwise_affine=True)
        )
      )
    )
    (post_layernorm): LayerNorm((1408,), eps=1e-05, 

### Extract and Save Per-Frame CLS Features

In [4]:
BATCH_SIZE = 8  # number of frames pushed through the vision encoder per forward pass


def _load_rgb(path):
    """Open the image at ``path`` and return an in-memory RGB copy.

    The context manager releases the underlying file handle as soon as the
    pixel data has been converted, instead of leaving it to garbage collection.
    """
    with Image.open(path) as img:
        return img.convert("RGB")


def extract_cls_features(frame_paths, batch_size=BATCH_SIZE):
    """Compute the vision-encoder CLS embedding of every frame of one video.

    Args:
        frame_paths: Non-empty list of image paths; rows of the result follow
            this order.
        batch_size: How many frames are encoded per forward pass.

    Returns:
        float32 NumPy array of shape ``[num_frames, hidden_dim]`` holding
        position 0 (the CLS token) of the encoder's ``last_hidden_state`` for
        each frame.
    """
    # Follow the device/dtype the vision tower was actually loaded with rather
    # than hard-coding them. Weights that were offloaded can sit on the "meta"
    # device, in which case we keep the previous behaviour of targeting CUDA.
    ref_param = next(model.vision_model.parameters())
    device = torch.device("cuda") if ref_param.device.type == "meta" else ref_param.device
    dtype = ref_param.dtype

    chunks = []
    for start in range(0, len(frame_paths), batch_size):
        images = [_load_rgb(p) for p in frame_paths[start:start + batch_size]]
        inputs = processor(images=images, return_tensors="pt")
        pixel_values = inputs["pixel_values"].to(device=device, dtype=dtype)

        with torch.no_grad():
            hidden = model.vision_model(pixel_values=pixel_values).last_hidden_state

        # Keep only the CLS position, upcast to float32, and move it off the
        # GPU right away so nothing accumulates in device memory.
        chunks.append(hidden[:, 0, :].float().cpu())

    return torch.cat(chunks, dim=0).numpy()


video_ids = sorted(os.listdir(FRAME_DIR))
videos_without_frames = []

for video_id in tqdm(video_ids, desc="Processing videos"):
    save_path = os.path.join(OUTPUT_DIR, f"{video_id}_video.npy")

    # Resume support: a finished file is never recomputed. Delete the file (or
    # the whole OUTPUT_DIR) to force re-extraction.
    if os.path.exists(save_path):
        continue

    # NOTE(review): plain lexicographic sort — this is only the temporal order
    # if frame numbers in the file names are zero-padded; confirm against the
    # frame-extraction step.
    frame_paths = sorted(glob.glob(os.path.join(FRAME_DIR, video_id, "*.jpg")))
    if not frame_paths:
        videos_without_frames.append(video_id)
        continue

    # Sequence of per-frame CLS tokens, shape [num_frames, hidden_dim]
    video_feat_seq = extract_cls_features(frame_paths)

    # Write to a temporary name first and rename afterwards, so an interrupted
    # run cannot leave a truncated .npy that the resume check would then skip.
    tmp_path = save_path + ".tmp"
    with open(tmp_path, "wb") as fh:
        np.save(fh, video_feat_seq)
    os.replace(tmp_path, save_path)

# Surface entries that produced no feature file instead of skipping them silently.
if videos_without_frames:
    print(
        f"No .jpg frames found for {len(videos_without_frames)} entries "
        f"(first 10 shown): {videos_without_frames[:10]}"
    )

Processing videos: 100%|██████████| 2990/2990 [35:31<00:00,  1.40it/s]


### Done
Per-frame CLS features are saved in `/kaggle/working/blip2_features_seq_test`, one `<video_id>_video.npy` file per video with shape `[num_frames, 1408]`, ready to be used as MMT input.