In [1]:
import torch, requests, math
from io import BytesIO
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt

from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration, BitsAndBytesConfig

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# ----------------------------
# 0) ‡πÄ‡∏ï‡∏£‡∏µ‡∏¢‡∏°‡∏≠‡∏∏‡∏õ‡∏Å‡∏£‡∏ì‡πå
# ----------------------------
device = "cuda" if torch.cuda.is_available() else "cpu"
print("device:", device)

device: cuda


In [None]:
# ----------------------------
# 1) ‡πÇ‡∏´‡∏•‡∏î‡πÇ‡∏°‡πÄ‡∏î‡∏• + ‡πÇ‡∏õ‡∏£‡πÄ‡∏ã‡∏™‡πÄ‡∏ã‡∏≠‡∏£‡πå
# ----------------------------
processor = AutoProcessor.from_pretrained(
    "Qwen/Qwen2.5-VL-3B-Instruct",
    trust_remote_code=True,
)

bnb_cfg = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)

model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2.5-VL-3B-Instruct",
    dtype="auto",         # ‡∏ñ‡πâ‡∏≤‡πÉ‡∏ä‡πâ torch>=2.4 ‡πÅ‡∏ô‡∏∞‡∏ô‡∏≥ dtype=torch.float16/bfloat16
    device_map="auto",
    trust_remote_code=True,
    offload_folder="offload",   # ‡∏ö‡∏≤‡∏á‡∏û‡∏≤‡∏£‡∏≤‡∏°‡∏¥‡πÄ‡∏ï‡∏≠‡∏£‡πå‡∏¢‡πâ‡∏≤‡∏¢‡πÑ‡∏õ disk
    quantization_config=bnb_cfg,
).eval()

In [4]:
# ----------------------------
# 2) ‡πÇ‡∏´‡∏•‡∏î‡∏†‡∏≤‡∏û‡∏ó‡∏î‡∏™‡∏≠‡∏ö + ‡∏Ñ‡∏≥‡∏™‡∏±‡πà‡∏á
# ----------------------------
url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
image = Image.open(BytesIO(requests.get(url, timeout=15).content)).convert("RGB").resize((224, 224))

messages = [{
    "role": "user",
    "content": [
        {"type": "image", "image": image},
        {"type": "text", "text": "Describe this image."}
    ],
}]

In [5]:
# ----------------------------
# 3) Inference ‡πÉ‡∏´‡πâ‡πÄ‡∏´‡πá‡∏ô‡∏ú‡∏•‡∏Ç‡πâ‡∏≠‡∏Ñ‡∏ß‡∏≤‡∏°‡∏à‡∏£‡∏¥‡∏á
# ----------------------------
chat_text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

# ‡∏™‡∏£‡πâ‡∏≤‡∏á batch inputs (‡∏ó‡∏±‡πâ‡∏á‡∏Ç‡πâ‡∏≠‡∏Ñ‡∏ß‡∏≤‡∏°+‡∏†‡∏≤‡∏û) ‡∏™‡∏≥‡∏´‡∏£‡∏±‡∏ö generate
inputs = processor(
    text=[chat_text],
    images=[image],
    videos=None,
    padding=True,
    return_tensors="pt",
)
inputs = {k: v.to(model.device) for k, v in inputs.items()}

with torch.inference_mode():
    gen_ids = model.generate(
        **inputs,
        max_new_tokens=16,
        do_sample=False,        # deterministic
        use_cache=True,
    )

# ‡∏ï‡∏±‡∏î prompt ‡∏≠‡∏≠‡∏Å ‡πÄ‡∏´‡∏•‡∏∑‡∏≠‡πÄ‡∏â‡∏û‡∏≤‡∏∞‡∏ó‡∏µ‡πà‡πÇ‡∏°‡πÄ‡∏î‡∏• "‡∏û‡∏π‡∏î‡∏ï‡πà‡∏≠"
trimmed = [out[len(inp):] for inp, out in zip(inputs["input_ids"], gen_ids)]
texts = processor.batch_decode(trimmed, skip_special_tokens=True)
print("Model output:", texts[0].strip())

The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Model output: The image depicts a serene beach scene during what appears to be either sunrise or sunset


In [None]:
# ----------------------------
# 4) ‡∏î‡∏∂‡∏á Visual Attention ‡∏à‡∏≤‡∏Å Vision Tower (ViT)
#    - ‡πÉ‡∏ä‡πâ‡∏û‡∏¥‡∏Å‡πÄ‡∏ã‡∏•‡∏à‡∏≤‡∏Å image_processor ‡πÇ‡∏î‡∏¢‡∏ï‡∏£‡∏á
#    - ‡∏Ç‡∏≠ output_attentions=True ‡πÄ‡∏û‡∏∑‡πà‡∏≠‡∏î‡∏∂‡∏á‡πÅ‡∏ú‡∏ô‡∏ó‡∏µ‡πà attention ‡∏ó‡∏∏‡∏Å‡πÄ‡∏•‡πÄ‡∏¢‡∏≠‡∏£‡πå
# ----------------------------

print(model.config.to_dict().keys())
print(model.config.output_attentions)   # ‡∏Ñ‡πà‡∏≤ default (True/False)

dict_keys(['vision_config', 'text_config', 'image_token_id', 'video_token_id', 'return_dict', 'output_hidden_states', 'torchscript', 'dtype', 'pruned_heads', 'tie_word_embeddings', 'chunk_size_feed_forward', 'is_encoder_decoder', 'is_decoder', 'cross_attention_hidden_size', 'add_cross_attention', 'tie_encoder_decoder', 'architectures', 'finetuning_task', 'id2label', 'label2id', 'task_specific_params', 'problem_type', 'tokenizer_class', 'prefix', 'bos_token_id', 'pad_token_id', 'eos_token_id', 'sep_token_id', 'decoder_start_token_id', 'max_length', 'min_length', 'do_sample', 'early_stopping', 'num_beams', 'temperature', 'top_k', 'top_p', 'typical_p', 'repetition_penalty', 'length_penalty', 'no_repeat_ngram_size', 'encoder_no_repeat_ngram_size', 'bad_words_ids', 'num_return_sequences', 'output_scores', 'return_dict_in_generate', 'forced_bos_token_id', 'forced_eos_token_id', 'remove_invalid_values', 'exponential_decay_length_penalty', 'suppress_tokens', 'begin_suppress_tokens', 'num_beam_

In [None]:
with torch.inference_mode():
    out = model(
        **inputs,
        output_attentions=True,
        return_dict=True,
        use_cache=False
    )

print(type(out.attentions))

<class 'tuple'>


In [None]:
print(len(out.attentions))      # ‡∏à‡∏≥‡∏ô‡∏ß‡∏ô‡πÄ‡∏•‡πÄ‡∏¢‡∏≠‡∏£‡πå
print(out.attentions)           # ‡πÄ‡∏•‡πÄ‡∏¢‡∏≠‡∏£‡πå‡∏™‡∏∏‡∏î‡∏ó‡πâ‡∏≤‡∏¢

36
(None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None)



üîé ‡∏õ‡∏±‡∏ç‡∏´‡∏≤‡∏Ç‡∏≠‡∏á‡∏£‡∏∏‡πà‡∏ô Instruct

* **Qwen2.5-VL-3B-Instruct** ‡πÄ‡∏õ‡πá‡∏ô‡∏£‡∏∏‡πà‡∏ô‡∏ó‡∏µ‡πà fine-tune ‡πÄ‡∏û‡∏∑‡πà‡∏≠‡∏™‡∏ô‡∏ó‡∏ô‡∏≤ (chat) ‡πÅ‡∏•‡∏∞‡∏ï‡∏≠‡∏ö‡∏Ñ‡∏≥‡∏ñ‡∏≤‡∏°
* ‡∏ó‡∏µ‡∏°‡∏ú‡∏π‡πâ‡∏û‡∏±‡∏í‡∏ô‡∏≤‡∏õ‡∏¥‡∏î `output_attentions` ‚Üí ‡πÄ‡∏ß‡∏•‡∏≤‡∏Ç‡∏≠ attention ‡∏à‡∏∞‡πÑ‡∏î‡πâ `None`
* hook ‡∏Å‡πá‡∏¢‡∏±‡∏á‡πÑ‡∏°‡πà‡πÄ‡∏à‡∏≠‡∏Ñ‡πà‡∏≤ ‡πÄ‡∏û‡∏£‡∏≤‡∏∞ attention weights ‡∏ñ‡∏π‡∏Å‡∏ó‡∏¥‡πâ‡∏á‡πÑ‡∏õ‡πÉ‡∏ô forward

‚úÖ ‡∏ß‡∏¥‡∏ò‡∏µ‡πÅ‡∏Å‡πâ
* ‡∏ï‡πâ‡∏≠‡∏á‡πÄ‡∏õ‡∏•‡∏µ‡πà‡∏¢‡∏ô‡πÑ‡∏õ‡πÉ‡∏ä‡πâ base ‡πÅ‡∏ó‡∏ô instruct "Qwen/Qwen2.5-VL-3B"
* ‡πÅ‡∏ï‡πà‡∏ú‡∏π‡πâ‡∏û‡∏±‡∏í‡∏ô‡∏≤‡∏î‡∏±‡∏ô‡∏•‡∏ö‡πÇ‡∏°‡πÄ‡∏î‡∏• base ‡πÑ‡∏õ‡∏à‡∏≤‡∏Å huggingface ‡πÅ‡∏•‡πâ‡∏ß ‚Üí ‡∏û‡∏≠‡πÑ‡∏õ‡∏ï‡∏≤‡∏°‡∏î‡∏π‡πÉ‡∏ô github ‡∏ñ‡∏∂‡∏á‡∏à‡∏∞‡πÉ‡∏ä‡πâ‡∏ä‡∏∑‡πà‡∏≠ Qwen2.5-VL ‡πÅ‡∏ï‡πà‡πÄ‡∏ô‡∏∑‡πâ‡∏≠‡πÉ‡∏ô‡∏Å‡∏•‡∏≤‡∏¢‡πÄ‡∏õ‡πá‡∏ô Instruct ‡πÑ‡∏õ‡πÅ‡∏•‡πâ‡∏ß T_T

In [None]:
# # ----------------------------
# # 5) ‡∏£‡∏µ‡πÄ‡∏ä‡∏õ‡πÄ‡∏õ‡πá‡∏ô‡∏Å‡∏£‡∏¥‡∏î‡πÅ‡∏û‡∏ï‡∏ä‡πå ‡πÅ‡∏•‡πâ‡∏ß‡∏≠‡∏±‡∏û‡∏™‡πÄ‡∏Å‡∏•‡∏ó‡∏±‡∏ö‡∏†‡∏≤‡∏û
# #    - ‡∏Ñ‡∏≥‡∏ô‡∏ß‡∏ì‡∏Ç‡∏ô‡∏≤‡∏î‡∏Å‡∏£‡∏¥‡∏î‡∏à‡∏≤‡∏Å image_processor
# #    - ‡∏™‡∏≥‡∏´‡∏£‡∏±‡∏ö ViT base (patch=14x14 ‡∏ó‡∏µ‡πà 224), ‡πÅ‡∏ï‡πà‡πÄ‡∏£‡∏≤‡∏≠‡πà‡∏≤‡∏ô‡∏à‡∏≤‡∏Å‡∏û‡∏¥‡∏Å‡πÄ‡∏ã‡∏•‡∏à‡∏£‡∏¥‡∏á‡∏õ‡∏•‡∏≠‡∏î‡∏†‡∏±‡∏¢‡∏Å‡∏ß‡πà‡∏≤
# # ----------------------------
# # ‡∏´‡∏≤ H,W ‡∏´‡∏•‡∏±‡∏á preprocess ‡πÅ‡∏•‡πâ‡∏ß‡∏Ñ‡∏≥‡∏ô‡∏ß‡∏ì‡∏à‡∏≥‡∏ô‡∏ß‡∏ô‡πÅ‡∏û‡∏ï‡∏ä‡πå‡πÇ‡∏î‡∏¢‡∏î‡∏π‡∏à‡∏≤‡∏Å config ‡∏Ç‡∏≠‡∏á vision_model
# # ‡∏ñ‡πâ‡∏≤ vision_model.config.patch_size ‡∏°‡∏µ‡∏≠‡∏¢‡∏π‡πà ‡πÉ‡∏´‡πâ‡πÉ‡∏ä‡πâ
# if hasattr(vision_model.config, "image_size"):
#     img_size_proc = vision_model.config.image_size  # ‡∏õ‡∏Å‡∏ï‡∏¥ 224 ‡∏´‡∏£‡∏∑‡∏≠ 448 (‡∏Ç‡∏∂‡πâ‡∏ô‡∏Å‡∏±‡∏ö processor)
# else:
#     # ‡∏™‡∏≥‡∏£‡∏≠‡∏á: ‡πÉ‡∏ä‡πâ‡∏Ç‡∏ô‡∏≤‡∏î‡∏Ç‡∏≠‡∏á pixel_values
#     _, _, Hpx, Wpx = vision_inputs["pixel_values"].shape
#     img_size_proc = max(Hpx, Wpx)

# patch = getattr(vision_model.config, "patch_size", 14)  # ‡∏™‡∏≥‡∏£‡∏≠‡∏á‡∏Ñ‡πà‡∏≤ 14 ‡∏ñ‡πâ‡∏≤‡πÑ‡∏°‡πà‡∏°‡∏µ‡∏£‡∏∞‡∏ö‡∏∏
# Hp = math.ceil(img_size_proc / patch)
# Wp = math.ceil(img_size_proc / patch)

# # ‡∏ï‡∏£‡∏ß‡∏à‡∏ß‡πà‡∏≤‡∏à‡∏≥‡∏ô‡∏ß‡∏ô‡πÇ‡∏ó‡πÄ‡∏Ñ‡πá‡∏ô‡∏ï‡∏£‡∏á‡∏Å‡∏±‡∏ö‡∏Å‡∏£‡∏¥‡∏î‡πÑ‡∏´‡∏° ‡πÑ‡∏°‡πà‡∏ï‡∏£‡∏á‡∏Å‡πá‡∏õ‡∏£‡∏±‡∏ö‡πÅ‡∏ö‡∏ö‡∏¢‡∏∑‡∏î‡∏´‡∏¢‡∏∏‡πà‡∏ô
# if attn_mean.numel() != Hp * Wp:
#     # ‡∏ö‡∏≤‡∏á‡∏Ñ‡∏£‡∏±‡πâ‡∏á‡∏°‡∏µ token ‡πÑ‡∏°‡πà‡∏Ñ‡∏£‡∏ö‡πÄ‡∏ï‡πá‡∏°‡∏Å‡∏£‡∏¥‡∏î‡πÄ‡∏û‡∏£‡∏≤‡∏∞ padding; ‡∏•‡∏≠‡∏á‡πÄ‡∏î‡∏≤‡∏ß‡πà‡∏≤‡∏à‡∏≥‡∏ô‡∏ß‡∏ô‡∏à‡∏£‡∏¥‡∏á‡∏Ñ‡∏∑‡∏≠ (N-1)
#     # ‡πÅ‡∏•‡πâ‡∏ß‡πÉ‡∏™‡πà‡∏Ñ‡πà‡∏≤ sqrt ‡∏•‡∏á‡∏ï‡∏±‡∏ß‡πÄ‡∏û‡∏∑‡πà‡∏≠‡∏´‡∏≤‡∏Å‡∏£‡∏¥‡∏î‡∏™‡∏µ‡πà‡πÄ‡∏´‡∏•‡∏µ‡πà‡∏¢‡∏°‡∏ó‡∏µ‡πà‡πÉ‡∏Å‡∏•‡πâ‡πÄ‡∏Ñ‡∏µ‡∏¢‡∏á‡∏ó‡∏µ‡πà‡∏™‡∏∏‡∏î
#     Np = attn_mean.numel()
#     s = int(round(math.sqrt(Np)))
#     Hp = s
#     Wp = Np // s

# fmap = attn_mean.reshape(Hp, Wp).numpy()
# fmap = (fmap - fmap.min()) / (fmap.max() - fmap.min() + 1e-6)

# # ‡∏≠‡∏±‡∏û‡∏™‡πÄ‡∏Å‡∏• heatmap ‡πÉ‡∏´‡πâ‡πÄ‡∏ó‡πà‡∏≤‡∏Ç‡∏ô‡∏≤‡∏î‡∏†‡∏≤‡∏û‡∏ï‡πâ‡∏ô‡∏â‡∏ö‡∏±‡∏ö
# img_w, img_h = image.size
# fmap_img = np.array(Image.fromarray((fmap * 255).astype(np.uint8)).resize((img_w, img_h), Image.BILINEAR)) / 255.0

In [None]:
# # ----------------------------
# # 6) ‡∏ß‡∏≤‡∏î heatmap ‡∏ã‡πâ‡∏≠‡∏ô‡∏ó‡∏±‡∏ö + ‡∏ö‡∏±‡∏ô‡∏ó‡∏∂‡∏Å‡∏•‡∏á‡πÄ‡∏Ñ‡∏£‡∏∑‡πà‡∏≠‡∏á
# # ----------------------------
# plt.figure(figsize=(8, 6))
# plt.imshow(image)
# plt.imshow(fmap_img, alpha=0.5, cmap="jet")  # ‡∏ã‡πâ‡∏≠‡∏ô‡∏ó‡∏±‡∏ö
# plt.axis("off")
# plt.tight_layout()
# out_path = "attention_overlay.png"
# plt.savefig(out_path, dpi=150)
# plt.close()
# print(f"Saved visual attention overlay to: {out_path}")