In [1]:
import io
import os

import matplotlib.pyplot as plt
import numpy as np
import polars as pl
import seaborn as sns
import torch
import torch.nn as nn
import torch.nn.functional as F
from PIL import Image
from sklearn.decomposition import PCA
from tqdm import tqdm
from transformers import AutoProcessor, AutoModelForVision2Seq

import lens

In [2]:
# Directory that receives one `<index>_saved_dictionary.npz` archive per sample.
SAVE_PATH = 'test_save/'
# Create it up front (no-op when it already exists) so a fresh checkout does not
# fail with FileNotFoundError on the first np.savez call.
os.makedirs(SAVE_PATH, exist_ok=True)

In [4]:
# This notebook only runs forward passes, so switch autograd off globally.
torch.set_grad_enabled(mode=False)

<torch.autograd.grad_mode.set_grad_enabled at 0x15d9a6690>

In [5]:
# Pick the best available accelerator: CUDA first, then Apple MPS, else CPU.
# `torch.backends.mps.is_available()` is the documented availability check and
# also exists on PyTorch versions that lack `torch.mps.is_available`.
if torch.cuda.is_available():
    device = 'cuda'
elif torch.backends.mps.is_available():
    device = 'mps'
else:
    device = 'cpu'

In [6]:
# Load the processor and the model from the same checkpoint.
MODEL_ID = "HuggingFaceTB/SmolVLM-500M-Instruct"

processor = AutoProcessor.from_pretrained(MODEL_ID)
model = AutoModelForVision2Seq.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
    # Left disabled, as in the original cell:
    # _attn_implementation="flash_attention_2" if device == "cuda" else "eager"
)
# Move the weights to the device chosen earlier in the notebook.
model = model.to(device)

Some kwargs in processor config are unused and will not have any effect: image_seq_len. 


In [7]:
def _attach_lens(target, **extra):
    """Build a ``lens.LogitLens`` on ``target`` and register it right away.

    Every lens in this notebook shares the same ``lm_head``, tokenizer and
    processor; only the wrapped module(s) and optional keyword flags differ.
    """
    probe = lens.LogitLens(
        target,
        model.lm_head,
        tokenizer=processor.tokenizer,
        processor=processor,
        **extra,
    )
    probe.register()
    return probe


# Language-model decoder layers (the only lens created with output_attentions=True).
text_lens = _attach_lens(model.model.text_model.layers, output_attentions=True)

# Vision encoder layers.
vision_lens = _attach_lens(model.model.vision_model.encoder.layers)

# Connector module between the vision and text parts of the model.
projector_lens = _attach_lens(model.model.connector)

# Patch-embedding module at the input of the vision model.
vision_embedding_lens = _attach_lens(model.model.vision_model.embeddings.patch_embedding)

In [8]:
# Read the MMStar parquet file from the Hugging Face hub and keep a random
# subset of 250 rows, drawn with a fixed seed.
mmstar_source = 'hf://datasets/Lin-Chen/MMStar/mmstar.parquet'
df = pl.read_parquet(mmstar_source).sample(n=250, seed=42)

In [9]:
# Write the sampled subset to a local parquet file in the working directory.
df.write_parquet('sampled_data_250.parquet')

In [10]:
def to_np(data):
    """Collect every activation held by a lens into one float16 NumPy array.

    Args:
        data: Object exposing an ``activations`` mapping whose values are torch
            tensors (assumed to share one shape so they form a regular array —
            TODO confirm against ``lens.LogitLens``).

    Returns:
        ``np.ndarray`` of dtype ``float16`` with one leading entry per stored
        activation, in the mapping's iteration order.
    """
    # `.float()` upcasts first because the model is loaded in bfloat16, which
    # NumPy cannot represent. Each tensor is converted with `.numpy()` so NumPy
    # receives real arrays rather than a list of torch tensors, which can be
    # very slow to convert.
    # NOTE(review): values above the float16 range become inf in this cast.
    return np.array(
        [x.cpu().float().numpy() for x in data.activations.values()],
        dtype=np.float16,
    )


# Lenses to reset after each sample, in the order the original cell used.
active_lenses = (text_lens, vision_embedding_lens, vision_lens, projector_lens)

# Rows are unpacked positionally, so this depends on the parquet column order.
for i, question, answer, category, l2_category, image, meta_info in tqdm(df.iter_rows(), total=len(df)):
    # The image column is decoded from its in-memory bytes with PIL.
    image = Image.open(io.BytesIO(image))

    # One user turn (image + question) followed by an assistant turn that is
    # left open after "The answer is " so the model continues from there.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": question},
            ],
        },
        {
            "role": "assistant",
            "content": [{"type": "text", "text": "The answer is "}],
        },
    ]

    prompt = processor.apply_chat_template(messages, continue_final_message=True, add_generation_prompt=False)
    inputs = processor(text=prompt, images=[image], return_tensors="pt").to(device)

    try:
        # A single forward pass; presumably the registered lenses record their
        # activations during this call — verify against `lens.LogitLens`.
        outputs = model(**inputs)

        result = {
            'text_lens': to_np(text_lens),
            'vision_lens': to_np(vision_lens),
            'embedding_lens': to_np(vision_embedding_lens),
            'projector_lens': to_np(projector_lens),
        }

        # os.path.join keeps the path valid whether or not SAVE_PATH ends in "/".
        np.savez(os.path.join(SAVE_PATH, f'{i}_saved_dictionary.npz'), **result)
    finally:
        # Always reset the lenses, even when this sample failed, so stale
        # activations are never carried into the next sample or a re-run.
        for probe in active_lenses:
            probe.cleanup()

100%|██████████| 250/250 [22:12<00:00,  5.33s/it]


In [11]:
# Inspect the text lens after the loop; the recorded output is an empty dict,
# i.e. no activations remain once the last cleanup() has run.
text_lens.activations

{}