In [1]:
import torch
import torch.nn.functional as F
import requests
from PIL import Image
from transformers import AutoProcessor, LlavaForConditionalGeneration
from datasets import load_dataset

import matplotlib.pyplot as plt
from PIL import Image
from tqdm import tqdm
import logging
logger = logging.getLogger(__name__)

# Re-wrap the class-level `_sample` method with torch.enable_grad so that gradient
# tracking is switched on while that method runs.
# NOTE(review): presumably needed because generation otherwise runs with gradients
# disabled -- confirm against the installed transformers version.
func_to_enable_grad = '_sample'
_unwrapped_method = getattr(LlavaForConditionalGeneration, func_to_enable_grad)
_grad_enabled_method = torch.enable_grad(_unwrapped_method)
setattr(LlavaForConditionalGeneration, func_to_enable_grad, _grad_enabled_method)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model_id = "llava-hf/llava-1.5-7b-hf"

# Loading options: float16 weights, low CPU memory usage while loading, and the
# "eager" attention implementation.
# NOTE(review): the captured output of this cell reports `torch_dtype` as deprecated
# in favour of `dtype`; kept as-is here for compatibility with older versions.
load_options = {
    "torch_dtype": torch.float16,
    "low_cpu_mem_usage": True,
    "attn_implementation": "eager",
}
model = LlavaForConditionalGeneration.from_pretrained(model_id, **load_options)
model = model.to(0)  # move the model to device index 0

#--------------------------------------------------
# Ask the vision tower to return its attention weights.
model.vision_tower.config.output_attentions = True

# Buffer that the language-model attention hook appends to.
model.enc_attn_weights = []
# Hooked module outputs: attn_output, attn_weights, past_key_value
def forward_hook(module, inputs, output):
    """Forward hook that records a self-attention module's attention weights.

    Reads the attention weights from ``output[1]``. When they are present, the
    tensor is marked as requiring grad, asked to retain its grad, and a detached
    CPU copy is appended to ``model.enc_attn_weights``. When they are ``None``,
    an error is logged and nothing is recorded.

    Args:
        module: The module the hook is registered on (unused).
        inputs: Positional inputs of the module's forward call (unused).
        output: The module's forward output; index 1 is read as the attention weights.

    Returns:
        ``output``, unchanged.
    """
    attn_weights = output[1]

    if attn_weights is not None:
        attn_weights.requires_grad_(True)
        attn_weights.retain_grad()
        # Only a detached CPU copy is kept in the buffer; the live tensor stays in the graph.
        model.enc_attn_weights.append(attn_weights.detach().cpu())
    else:
        logger.error(
            ("Attention weights were not returned for the encoder. "
            "To enable, set output_attentions=True in the forward pass of the model. ")
        )

    return output

# Attach `forward_hook` to the self-attention module of every language-model layer and
# keep the returned handles.
hooks_pre_encoder = [
    layer.self_attn.register_forward_hook(forward_hook)
    for layer in model.language_model.layers
]
hooks_encoder = []  # not populated in this cell

# Buffer that the vision-tower attention hook appends to.
model.enc_attn_weights_vit = []

def forward_hook_image_processor(module, inputs, output):
    """Forward hook that records a vision self-attention module's attention weights.

    Reads the attention weights from ``output[1]``. When they are present, the
    tensor is marked as requiring grad, asked to retain its grad, and the tensor
    itself (not a copy) is appended to ``model.enc_attn_weights_vit``. When they
    are ``None``, a warning is logged and nothing is recorded.

    Args:
        module: The module the hook is registered on (unused).
        inputs: Positional inputs of the module's forward call (unused).
        output: The module's forward output; index 1 is read as the attention weights.

    Returns:
        ``output``, unchanged.
    """
    vit_attn_weights = output[1]

    if vit_attn_weights is not None:
        vit_attn_weights.requires_grad_(True)
        vit_attn_weights.retain_grad()
        # Stored without detaching, unlike the language-model hook.
        model.enc_attn_weights_vit.append(vit_attn_weights)
    else:
        logger.warning(
            ("Attention weights were not returned for the vision model. "
             "Relevancy maps will not be calculated for the vision model. " 
             "To enable, set output_attentions=True in the forward pass of vision_tower. ")
        )

    return output

# Attach `forward_hook_image_processor` to the self-attention module of every vision
# encoder layer and keep the returned handles.
hooks_pre_encoder_vit = [
    vit_layer.self_attn.register_forward_hook(forward_hook_image_processor)
    for vit_layer in model.vision_tower.vision_model.encoder.layers
]
#--------------------------------------------------

processor = AutoProcessor.from_pretrained(model_id)

# For a "gemma" language model the end-of-sequence id is the first id produced by
# tokenizing '<end_of_turn>'; otherwise the tokenizer's own eos_token_id is used.
_is_gemma = model.language_model.config.model_type == "gemma"
eos_token_id = (
    processor.tokenizer('<end_of_turn>', add_special_tokens=False).input_ids[0]
    if _is_gemma
    else processor.tokenizer.eos_token_id
)

`torch_dtype` is deprecated! Use `dtype` instead!
Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 25.35it/s]
  return t.to(
Fetching 2 files: 100%|██████████████████████████████████████████████████████████████████████████| 2/2 [00:00<?, ?it/s]
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [2]:
# Load the saved generation and convert it to a plain Python list.
tokens = torch.load("full_generation.pt").tolist()

# Decode one token id at a time so every entry maps to exactly one token.
decoded_tokens = []
for token_id in tokens:
    decoded_tokens.append(processor.decode([token_id]))

# Show each position next to its decoded text; repr() keeps whitespace visible.
for i, tok in enumerate(decoded_tokens):
    print(f"{i}: {repr(tok)}")

  untyped_storage = torch.UntypedStorage(self.size(), device=device)


NameError: name 'processor' is not defined

In [6]:
import os

# Despite its name, `file` holds the output *directory*; the name is kept so that any
# other cell referring to it keeps working.
file = r"saved/vit_attn"

# torch.save does not create missing parent directories, so create it up front.
os.makedirs(file, exist_ok=True)
torch.save(tokens, os.path.join(file, "lmao.pt"))

In [4]:
# Single user turn: a text question followed by an image placeholder.
_user_turn = {
    "role": "user",
    "content": [
        {"type": "text", "text": "What is the colour of this object?"},
        {"type": "image"},
    ],
}
conversation = [_user_turn]

# Assistant turn whose text is the decoding of tokens[596:600] from the loaded generation.
_assistant_text = processor.decode(tokens[596:600])
_assistant_turn = {
    "role": "assistant",
    "content": [{"type": "text", "text": _assistant_text}],
}

# New list, so `conversation` itself is left untouched.
conv_step = [*conversation, _assistant_turn]
conv_step

[{'role': 'user',
  'content': [{'type': 'text', 'text': 'What is the colour of this object?'},
   {'type': 'image'}]},
 {'role': 'assistant',
  'content': [{'type': 'text', 'text': 'The object is red'}]}]

In [5]:
# NOTE(review): hard-coded absolute local path -- the notebook only runs on the machine
# that has this file; consider a path relative to a configurable data directory.
image_file = r"C:\Users\Dreamcore\Downloads\61-gPGG7umL._UF894,1000_QL80_.jpg"
raw_image = Image.open(image_file).convert("RGB")

# Render the conversation (user turn + partial assistant answer) into a prompt and
# build the model inputs on device 0, casting floating-point tensors to float16.
prompt_appended = processor.apply_chat_template(conv_step, add_generation_prompt=False, return_tensors="pt")
inputs_appended = processor(images=raw_image, text=prompt_appended, return_tensors='pt').to(0, torch.float16)

# The attention hooks append to these buffers on every forward pass. Empty them first so
# that re-running this cell neither accumulates stale entries nor keeps old graph-attached
# vision-attention tensors alive, and so the count printed below refers to this pass only.
model.enc_attn_weights.clear()
model.enc_attn_weights_vit.clear()

# Plain forward pass. The generate()-only options (return_dict_in_generate,
# output_scores, eos_token_id) were dropped: they configure the generation loop
# and are not forward() arguments.
output_appended = model(**inputs_appended,
                        use_cache=False,
                        output_attentions=True,
                        output_hidden_states=True)
print(len(model.enc_attn_weights))

32


In [6]:
print(output_appended.logits.shape)

# Take the logits at the last sequence position and keep the single highest-scoring id.
_last_position_logits = output_appended.logits[:, -1]
topk = _last_position_logits.topk(k=1, dim=-1)

# Decode the selected id(s), one row of indices at a time.
for ids in topk.indices:
    print(processor.tokenizer.batch_decode(ids))

torch.Size([1, 601, 32064])
['in']
