## Test for direct pass to the full model

In [2]:
############################ Test for AutoModel ############################
# test: what's inside the hidden_states of the whole model
from transformers import AutoProcessor, AutoModel
import torch
from PIL import Image
# proc = AutoProcessor.from_pretrained(model_name)

# load model + processor
# model_name = "google/paligemma2-3b-pt-224"
# model = AutoModel.from_pretrained(
#     model_name, device_map="auto"
# )
############################ Test for AutoModel ############################

############################ Test for PaliGemmaForConditionalGeneration ############################

from transformers import AutoProcessor, PaliGemmaForConditionalGeneration

device = "cuda:0"
model_id = "google/paligemma2-3b-mix-224"  # or your specific version
proc = AutoProcessor.from_pretrained(model_id)
model = PaliGemmaForConditionalGeneration.from_pretrained(model_id).to(device)
############################ Test for PaliGemmaForConditionalGeneration ############################

# dummy input
img = Image.new("RGB", (224, 224), color="gray")
# pass to the full model
enc = proc(images=img, text="<image>", return_tensors="pt").to(device)

with torch.inference_mode():
    out = model(**enc, output_hidden_states=True, return_dict=True)

print(type(out))
print(out.keys())          # if it's a ModelOutput, this shows named attributes
print([k for k in dir(out) if not k.startswith("_")])  # inspect methods/attrs


  from .autonotebook import tqdm as notebook_tqdm
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.49it/s]


<class 'transformers.models.paligemma.modeling_paligemma.PaliGemmaCausalLMOutputWithPast'>
odict_keys(['logits', 'past_key_values', 'hidden_states', 'image_hidden_states'])
['attentions', 'clear', 'copy', 'fromkeys', 'get', 'hidden_states', 'image_hidden_states', 'items', 'keys', 'logits', 'loss', 'move_to_end', 'past_key_values', 'pop', 'popitem', 'setdefault', 'to_tuple', 'update', 'values']


In [3]:
out_full = out
out_full.hidden_states[0].shape

torch.Size([1, 258, 2304])

In [4]:
out_full.image_hidden_states[0].shape

torch.Size([256, 2304])

## pass to vision_tower -> proj_layer -> language_model (more VRAM demanding)

In [5]:
############################ Test for AutoModel ############################

from transformers import AutoProcessor, AutoModel
import torch
from PIL import Image

# pass to the visual encoder and then text decoder

# # load model + processor
# model_name = "google/paligemma2-3b-pt-224"
# proc = AutoProcessor.from_pretrained(model_name)
# model = AutoModel.from_pretrained(
#     model_name, device_map="auto"
# )
############################ Test for AutoModel ############################


############################ Test for PaliGemmaForConditionalGeneration ############################
from transformers import AutoProcessor, PaliGemmaForConditionalGeneration

device = "cuda:0"
model_id = "google/paligemma2-3b-mix-224"  # or your specific version
processor = AutoProcessor.from_pretrained(model_id)
model = PaliGemmaForConditionalGeneration.from_pretrained(model_id).to(device)
############################ Test for PaliGemmaForConditionalGeneration ############################

# dummy input
img = Image.new("RGB", (224, 224), color="gray")
# pass to the full model
enc = proc(images=img, text="<image>", return_tensors="pt").to(device)

with torch.inference_mode():
    out = model(**enc, output_hidden_states=True, return_dict=True)
    
    
# dummy input
# Pass to encoder, transformation and decoder
img = Image.new("RGB", (224, 224), color="gray")
enc = proc(images=img, text="<image>", return_tensors="pt") # images are passed to vision tower, texts to tokeniser
# Here we use proc to wrap them
# enc = proc(images=img, return_tensors="pt")
px = enc["pixel_values"].to(device, non_blocking=True)
vout = model.vision_tower(pixel_values=px, output_hidden_states=False, return_dict=False)[0]
print(f"vout.shape: {vout.shape}")
proj=model.multi_modal_projector(vout)

tok_embeds = model.language_model.embed_tokens(enc["input_ids"][...,256*1:].to(device, non_blocking=True))
inputs_embeds = torch.cat([proj, tok_embeds], dim=1)
lm_inputs = {
        "inputs_embeds": inputs_embeds,             # vision embeddings
        "attention_mask": enc["attention_mask"],
    }
out = model.language_model(**lm_inputs, output_hidden_states=True, return_dict=True)
hs_tuple = out.hidden_states

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.53it/s]


vout.shape: torch.Size([1, 256, 1152])


In [6]:
tok_embeds.shape

torch.Size([1, 2, 2304])

In [7]:
inputs_embeds.shape

torch.Size([1, 258, 2304])

In [8]:
hs_tuple[0].shape

torch.Size([1, 258, 2304])

In [17]:

import torch.nn.functional as F

def mse_similarity(tensor1, tensor2):
    mse = F.mse_loss(tensor1, tensor2)
    return mse.item()

mse = mse_similarity(torch.cat(out_full.hidden_states, dim=0).cpu(), torch.cat(hs_tuple, dim=0).cpu())
print(f"MSE: {mse:.4f}")  # Lower = more similar, 0 = identical

MSE: 307.7573


## The two modules (PaliGemmaForConditionalGeneration and AutoModel) can give the same dimension for final outputs

In [42]:
len(hs_tuple),hs_tuple[0].shape

(27, torch.Size([1, 256, 2304]))

In [46]:
# vout[0].shape, len(vout), len(vout[1]), vout[1][1].shape
vout[0].shape

torch.Size([256, 1152])

In [1]:
import torch
import gc

# Clear PyTorch cache
torch.cuda.empty_cache()

# Delete specific tensors if you have them
# del your_tensor_variable

# Force garbage collection
gc.collect()

# Check VRAM usage
if torch.cuda.is_available():
    print(f"VRAM allocated: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
    print(f"VRAM reserved: {torch.cuda.memory_reserved() / 1024**3:.2f} GB")

VRAM allocated: 0.00 GB
VRAM reserved: 0.00 GB


## Vision twoer has 1152 hid_dim, language_model 2304

In [47]:
model.multi_modal_projector

PaliGemmaMultiModalProjector(
  (linear): Linear(in_features=1152, out_features=2304, bias=True)
)

In [49]:
model

PaliGemmaForConditionalGeneration(
  (model): PaliGemmaModel(
    (vision_tower): SiglipVisionModel(
      (vision_model): SiglipVisionTransformer(
        (embeddings): SiglipVisionEmbeddings(
          (patch_embedding): Conv2d(3, 1152, kernel_size=(14, 14), stride=(14, 14), padding=valid)
          (position_embedding): Embedding(256, 1152)
        )
        (encoder): SiglipEncoder(
          (layers): ModuleList(
            (0-26): 27 x SiglipEncoderLayer(
              (layer_norm1): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
              (self_attn): SiglipAttention(
                (k_proj): Linear(in_features=1152, out_features=1152, bias=True)
                (v_proj): Linear(in_features=1152, out_features=1152, bias=True)
                (q_proj): Linear(in_features=1152, out_features=1152, bias=True)
                (out_proj): Linear(in_features=1152, out_features=1152, bias=True)
              )
              (layer_norm2): LayerNorm((1152,), eps=1e-06, elem

In [50]:
out.image_hidden_states

AttributeError: 'BaseModelOutputWithPast' object has no attribute 'image_hidden_states'

In [13]:
out.hidden_states[0].shape, out.image_hidden_states.shape

NameError: name 'out' is not defined