In [1]:
import os

import requests
import torch
from PIL import Image
from transformers import AutoProcessor, AutoModel, AutoModelForVision2Seq

In [2]:
model_id = "google/paligemma-3b-mix-224"
# SECURITY: never hardcode access tokens in a notebook. The token is read from
# the HF_TOKEN environment variable; when it is unset, `None` is passed and
# transformers falls back to the credentials stored by `huggingface-cli login`.
# Any token that was previously committed in this cell must be treated as
# leaked and revoked.
processor = AutoProcessor.from_pretrained(model_id, token=os.environ.get("HF_TOKEN"))

In [3]:
# SECURITY: the access token comes from the HF_TOKEN environment variable
# instead of being hardcoded (None falls back to the cached CLI login).
model = AutoModelForVision2Seq.from_pretrained(model_id, token=os.environ.get("HF_TOKEN"))

`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [4]:
import PIL
# Text instruction that is paired with each image handed to the processor.
prompt = "Describe this image"

In [5]:
def extract_logits_vector(inputs):
    """Return the next-token logits for one processed (prompt, image) pair.

    The forward pass is spelled out step by step (vision tower -> multimodal
    projector -> merge with the text embeddings -> language model) so that the
    intermediate input embeddings can be inspected; a slice of them is printed
    as a debugging aid.

    Args:
        inputs: Processor output exposing ``pixel_values``, ``input_ids`` and
            ``attention_mask`` and supporting ``.to(device)``.

    Returns:
        The logits of the first batch element at the last sequence position
        (``logits[0, -1, :]``). The tensor is still attached to the autograd
        graph, so callers must ``.detach()`` it before converting to numpy.
    """
    # Follow the device the model actually lives on. Picking 'cuda' whenever it
    # is available breaks with a device mismatch when the model was left on CPU.
    inputs = inputs.to(model.device)

    vision_output = model.vision_tower(inputs['pixel_values'])
    multimodal_projector_output = model.multi_modal_projector(vision_output['last_hidden_state'])
    # These contain <image> + prompt embeddings.
    text_embeddings = model.get_input_embeddings()(inputs['input_ids'])
    # NOTE(review): private transformers helper; its signature is tied to the
    # installed transformers version -- confirm after any upgrade.
    merge_output = model._merge_input_ids_with_image_features(
        multimodal_projector_output,
        text_embeddings,
        input_ids=inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        labels=None,
        token_type_ids=None,
        cache_position=None,
    )

    # The third element (labels) is unused here because labels=None was passed.
    inputs_embeds, attention_mask, _labels, position_ids = merge_output
    print(f"input_embeds[0, 5, :10] shape: {inputs_embeds.shape}\n\t{inputs_embeds[0, 5, :10]}")
    outputs = model.language_model(
        attention_mask=attention_mask,
        position_ids=position_ids,
        inputs_embeds=inputs_embeds,
    )

    # Only the distribution over the next token (last position) is of interest.
    return outputs.logits[0, -1, :]

In [6]:
# Load the test image inside a context manager so the file handle is released;
# convert() returns an independent in-memory RGB copy that outlives the file.
with Image.open('inseq/extra_samu/data/image_test.png') as image_file:
    img = image_file.convert('RGB')
# Keyword arguments: the positional order of (text, images) is not the same in
# every transformers release, so naming them keeps the call unambiguous.
image_inputs = processor(text=prompt, images=img)
last_logit_image = extract_logits_vector(image_inputs)

input_embeds[0, 5, :10] shape: torch.Size([1, 261, 2048])
	tensor([ 0.0268, -0.0039, -0.0020, -0.0092,  0.0053, -0.0089, -0.0032,  0.0156,
        -0.0121,  0.0059], grad_fn=<SliceBackward0>)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
# Tokenization goes through the processor's tokenizer (the processor object is
# only used for decode elsewhere in this notebook and has no `encode`).
processor.tokenizer.encode('Describe this image\ncongruent angles')

In [None]:
# Same text joined with a space instead of a newline, encoded via the tokenizer
# (the processor itself has no `encode`).
processor.tokenizer.encode('Describe this image congruent angles')

In [7]:
# Baseline input: Image.new without a colour argument yields an all-black image.
black_img = Image.new('RGB', (100, 100))
# Keyword arguments keep the call independent of the processor's positional order.
black_inputs = processor(text=prompt, images=black_img)
last_logit_black = extract_logits_vector(black_inputs)

input_embeds[0, 5, :10] shape: torch.Size([1, 261, 2048])
	tensor([ 0.0472,  0.0010,  0.0035, -0.0238,  0.0039, -0.0368, -0.0042,  0.0173,
        -0.0120,  0.0135], grad_fn=<SliceBackward0>)


In [8]:
# Show the arg-max next token and the raw logits, first for the real image and
# then for the black baseline.
print(
    f"Prediction: {processor.decode(torch.argmax(last_logit_image))}\nlogits: {last_logit_image}\n\n",
    f"Prediction: {processor.decode(torch.argmax(last_logit_black))}\nlogits: {last_logit_black}",
    sep="\n",
)

Prediction: con
logits: tensor([  0.4453,   5.9641, -11.5594,  ...,   0.3837,   0.3817,   0.3834],
       grad_fn=<SliceBackward0>)


Prediction: un
logits: tensor([  0.2063,   3.2383, -13.9596,  ...,   0.1951,   0.1927,   0.1930],
       grad_fn=<SliceBackward0>)


In [9]:
# Normalise both 1-D logit vectors into probability distributions (softmax over axis 0).
image_probs, black_probs = (
    logit_vector.softmax(dim=0)
    for logit_vector in (last_logit_image, last_logit_black)
)

In [10]:
from scipy.stats import entropy

# scipy's entropy(pk, qk) computes the KL divergence D(pk || qk). It is not
# symmetric, so both directions are reported.
# .cpu() makes the conversion work even when the tensors live on an accelerator
# (Tensor.numpy() only accepts CPU tensors); each tensor is converted once.
image_probs_np = image_probs.detach().cpu().numpy()
black_probs_np = black_probs.detach().cpu().numpy()
kl_i_b = entropy(image_probs_np, black_probs_np)
kl_b_i = entropy(black_probs_np, image_probs_np)
print(f"kl divergence should be either: {kl_i_b} or {kl_b_i}")

kl divergence should be either: 4.857807636260986 or 3.772852897644043


#### Investigate the different tokenization options

In [34]:
# Decode the arg-max token while keeping special tokens; the surrounding dashes
# make any leading/trailing whitespace in the decoded text visible.
print(f"We have decoded:\n\t-{processor.decode(torch.argmax(last_logit_image), skip_special_tokens=False)}-")

We have decoded:
	-con-


In [41]:
# Inspect the token ids fed to the model. The recorded output starts with a long
# run of id 257152 (presumably the <image> placeholder token -- TODO confirm).
image_inputs['input_ids']

tensor([[257152, 257152, 257152, 257152, 257152, 257152, 257152, 257152, 257152,
         257152, 257152, 257152, 257152, 257152, 257152, 257152, 257152, 257152,
         257152, 257152, 257152, 257152, 257152, 257152, 257152, 257152, 257152,
         257152, 257152, 257152, 257152, 257152, 257152, 257152, 257152, 257152,
         257152, 257152, 257152, 257152, 257152, 257152, 257152, 257152, 257152,
         257152, 257152, 257152, 257152, 257152, 257152, 257152, 257152, 257152,
         257152, 257152, 257152, 257152, 257152, 257152, 257152, 257152, 257152,
         257152, 257152, 257152, 257152, 257152, 257152, 257152, 257152, 257152,
         257152, 257152, 257152, 257152, 257152, 257152, 257152, 257152, 257152,
         257152, 257152, 257152, 257152, 257152, 257152, 257152, 257152, 257152,
         257152, 257152, 257152, 257152, 257152, 257152, 257152, 257152, 257152,
         257152, 257152, 257152, 257152, 257152, 257152, 257152, 257152, 257152,
         257152, 257152, 257

In [45]:
# Token id 108 decodes to '\n'. Per the original note, '\n' is appended to the
# prompt automatically, so it is expected that the first generated token does
# not start with a space.
processor.decode(108)

'\n'

In [35]:
# Logit the model assigns to the token(s) that 'con' (no leading space) encodes to.
last_logit_image[processor.tokenizer.encode('con')]

tensor([13.2995], grad_fn=<IndexBackward0>)

In [36]:
# Same lookup for ' con' with a leading space, to compare the two tokenizations.
last_logit_image[processor.tokenizer.encode(' con')]

tensor([5.0420], grad_fn=<IndexBackward0>)

In [37]:
# Token ids the tokenizer produces for the whole word 'congruent'.
processor.tokenizer.encode('congruent')

[759, 16780, 579]

**Finish generation**

In [49]:
# Let the model continue from the processed (prompt, image) inputs, capped at
# 10 newly generated tokens.
output_generation = model.generate(**image_inputs, max_new_tokens=10)

In [59]:
# Compare the prompt length with the full generated sequence length.
print(
    f"Shape of input_generation:\n\t{image_inputs['input_ids'].shape}",
    f"Shape of output_generation:\n\t{output_generation.shape}",
    sep="\n",
)

Shape of input_generation:
	torch.Size([1, 261])
Shape of output_generation:
	torch.Size([1, 267])


In [62]:
# Slice off the prompt *before* decoding so that only the newly generated
# tokens are decoded (instead of decoding every prompt token and discarding it).
prompt_length = image_inputs['input_ids'].shape[1]
[processor.decode(x) + '-' for x in output_generation[0, prompt_length:]] # 6 generated tokens!

['con-', 'gru-', 'ent-', ' angles-', ' .-', '<eos>-']

### Compare with the logits obtained via inseq

In [8]:
# Extract information for the generation with the image.
# The forward pass is unrolled here (rather than calling extract_logits_vector)
# because the intermediate tensors -- image_inputs_embeds, attention_mask and
# position_ids -- are reused by later cells.
with Image.open('inseq/extra_samu/data/image_test.png') as image_file:
    # convert() returns an in-memory RGB copy, so the file can be closed afterwards.
    img = image_file.convert('RGB')
# Keyword arguments keep the call independent of the processor's positional order.
image_inputs = processor(text=prompt, images=img)
vision_output = model.vision_tower(image_inputs['pixel_values'])
multimodal_projector_output = model.multi_modal_projector(vision_output['last_hidden_state'])
text_embeddings = model.get_input_embeddings()(image_inputs['input_ids']) # These contain <image> + prompt embeddings.
merge_output = model._merge_input_ids_with_image_features(
    multimodal_projector_output,
    text_embeddings,
    input_ids=image_inputs['input_ids'],
    attention_mask=image_inputs['attention_mask'],
    labels=None,
    token_type_ids=None,
    cache_position=None,
)

image_inputs_embeds, attention_mask, labels, position_ids = merge_output
#print(f"input_embeds[0, 5, :10] shape: {image_inputs_embeds.shape}\n\t{image_inputs_embeds[0, 5, :10]}")
outputs_image = model.language_model(attention_mask=attention_mask,
                                     position_ids=position_ids,
                                     inputs_embeds=image_inputs_embeds)
logits_image = outputs_image.logits
# Logits for the next token: first batch element, last sequence position.
last_logit_image = logits_image[0, -1, :]

In [7]:
# Extract information for the generation with the black image.
# Unrolled forward pass: black_inputs_embeds, attention_mask and position_ids
# are reused by later cells.
black_img = Image.new('RGB', (100, 100))
# Keyword arguments keep the call independent of the processor's positional order.
black_inputs = processor(text=prompt, images=black_img)
vision_output = model.vision_tower(black_inputs['pixel_values'])
multimodal_projector_output = model.multi_modal_projector(vision_output['last_hidden_state'])
text_embeddings = model.get_input_embeddings()(black_inputs['input_ids']) # These contain <image> + prompt embeddings.
merge_output = model._merge_input_ids_with_image_features(
    multimodal_projector_output,
    text_embeddings,
    input_ids=black_inputs['input_ids'],
    attention_mask=black_inputs['attention_mask'],
    labels=None,
    token_type_ids=None,
    cache_position=None,
)

black_inputs_embeds, attention_mask, labels, position_ids = merge_output
#print(f"input_embeds[0, 5, :10] shape: {black_inputs_embeds.shape}\n\t{black_inputs_embeds[0, 5, :10]}")
outputs_black = model.language_model(attention_mask=attention_mask,
                                     position_ids=position_ids,
                                     inputs_embeds=black_inputs_embeds)
logits_black = outputs_black.logits
# Logits for the next token: first batch element, last sequence position.
last_logit_black = logits_black[0, -1, :]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


- - - 
Load the logits saved from inseq and compare them with the ones computed in this notebook. Note: the saved files were not regenerated, so ignore this comparison for now.

In [10]:
# Load the logit vectors that were dumped to plain-text files under inseq/ and
# wrap them as tensors (np.loadtxt yields float64, hence float64 tensors).
import numpy as np
import torch
batch_logits = torch.from_numpy(np.loadtxt('inseq/original_logits.txt')) # Black
contrast_batch_logits = torch.from_numpy(np.loadtxt('inseq/contrast_logits.txt')) # Image

In [64]:
# NOTE (original author): the saved file holds the logits of the *last*
# generation step, which is why this comparison gives the wrong result.
batch_logits.shape

torch.Size([257216])

In [11]:
# Arg-max token of the saved logits labelled "Black" where they are loaded.
processor.decode([torch.argmax(batch_logits)])

'<eos>'

In [12]:
# Arg-max token of the saved logits labelled "Image" where they are loaded.
processor.decode([torch.argmax(contrast_batch_logits)])

'<eos>'

In [39]:
# First five logits for the black image: saved inseq values vs. the ones computed here.
print(
    f"From inseq logits for black image are: {batch_logits[0:5]}",
    f"From here logits for black image are:  {last_logit_black[0:5]}",
    sep="\n",
)

From inseq logits for black image are: tensor([-1.7957, 13.0201, -9.0444, -1.8154, -2.3267], dtype=torch.float64)
From here logits for black image are:  tensor([  0.2063,   3.2383, -13.9596,   0.1729,  -0.0149],
       grad_fn=<SliceBackward0>)


In [40]:
# First five logits for the real image: saved inseq values vs. the ones computed here.
print(
    f"From inseq logits for image are: {contrast_batch_logits[0:5]}",
    f"From here logits for image are:  {last_logit_image[0:5]}",
    sep="\n",
)

From inseq logits for image are: tensor([-1.3816, 13.3314, -9.0542, -1.4042, -1.7697], dtype=torch.float64)
From here logits for image are:  tensor([  0.4453,   5.9641, -11.5594,   0.4139,   0.9109],
       grad_fn=<SliceBackward0>)


- - - 
**Check the embeddings**

In [6]:
# Load the input embeddings that were dumped to plain-text files under inseq/
# and wrap them as tensors (np.loadtxt yields float64).
import numpy as np
batch_embeddings = torch.from_numpy(np.loadtxt('inseq/original_inputs_embeddings.txt')) # Black
contrast_batch_embeddings = torch.from_numpy(np.loadtxt('inseq/contrast_inputs_embeddings.txt')) # Image

In [14]:
# Shape of the saved embeddings (no leading batch axis in the recorded output).
batch_embeddings.shape

torch.Size([261, 2048])

In [16]:
# Same slice (position 5, first 10 features) that extract_logits_vector prints,
# so the values can be compared by eye.
batch_embeddings[5, :10]

tensor([ 0.0472,  0.0010,  0.0035, -0.0238,  0.0039, -0.0368, -0.0042,  0.0173,
        -0.0120,  0.0135], dtype=torch.float64)

In [22]:
# Drop the leading axis when it has size 1 so both tensors line up with the
# 2-D embeddings loaded from disk.
black_inputs_embeds, image_inputs_embeds = (
    embeds.squeeze(0)
    for embeds in (black_inputs_embeds, image_inputs_embeds)
)

In [29]:
# Tensor.all() reduces inside torch; the built-in all() would iterate over
# every element in Python.
bool((black_inputs_embeds == batch_embeddings).all()) # Black embeddings from here are same as those in inseq!

True

In [31]:
# Tensor.all() reduces inside torch; the built-in all() would iterate over
# every element in Python.
bool((image_inputs_embeds == contrast_batch_embeddings).all()) # Image embeddings from here are same as those in inseq!

True

In [8]:
# Generation with inseq embeddings (black image).
# The sequence length is taken from the saved embeddings instead of being
# hard-coded, so the mask and position ids are always cropped to match them.
seq_len = batch_embeddings.shape[0]
outputs = model.language_model(attention_mask=attention_mask[:, :, :seq_len, :seq_len],
                               position_ids=position_ids[:, :seq_len],
                               # Saved embeddings are float64 and lack the batch axis.
                               inputs_embeds=batch_embeddings.unsqueeze(0).to(torch.float32))

In [12]:
# Arg-max next token from the most recent language-model call (last position).
processor.decode(torch.argmax(outputs.logits[0, -1, :]))

'un'

In [13]:
# Generation with inseq embeddings (real image).
# The sequence length is taken from the saved embeddings instead of being
# hard-coded, so the mask and position ids are always cropped to match them.
seq_len = contrast_batch_embeddings.shape[0]
outputs = model.language_model(attention_mask=attention_mask[:, :, :seq_len, :seq_len],
                               position_ids=position_ids[:, :seq_len],
                               # Saved embeddings are float64 and lack the batch axis.
                               inputs_embeds=contrast_batch_embeddings.unsqueeze(0).to(torch.float32))

In [14]:
# Arg-max next token from the most recent language-model call (last position).
processor.decode(torch.argmax(outputs.logits[0, -1, :]))

'con'

### Repeat for all steps:
**TODO:** CHECK HERE

In [8]:
# End-of-sequence token string as defined by the tokenizer.
eos_token = processor.tokenizer.eos_token

In [14]:
# For next step
prompt = 'Describe this image'
# Context manager releases the file handle; convert() returns an in-memory RGB copy.
with Image.open('inseq/extra_samu/data/image_test.png') as image_file:
    img = image_file.convert('RGB')
# Keyword arguments keep the call independent of the processor's positional order.
image_inputs = processor(text=prompt, images=img)
last_logit_image = extract_logits_vector(image_inputs)

input_embeds[0, 5, :10] shape: torch.Size([1, 261, 2048])
	tensor([ 0.0268, -0.0039, -0.0020, -0.0092,  0.0053, -0.0089, -0.0032,  0.0156,
        -0.0121,  0.0059], grad_fn=<SliceBackward0>)


In [15]:
import torch
# Decode the highest-scoring next token for the real image.
# NOTE(review): the recorded output of this cell ('in') differs from the 'con'
# recorded for the same prompt and image earlier in the notebook -- re-run from
# a fresh kernel to confirm which one is current.
processor.decode(torch.argmax(last_logit_image))

'in'

In [None]:
# Baseline input: Image.new without a colour argument yields an all-black image.
black_img = Image.new('RGB', (100, 100))
# Keyword arguments keep the call independent of the processor's positional order.
black_inputs = processor(text=prompt, images=black_img)
last_logit_black = extract_logits_vector(black_inputs)