# DINOv2 Attention Rollout Demo

本 Notebook 展示如何：
1) 加载预训练的 DINOv2 ViT 并提取图像特征；
2) 使用 **Attention Rollout** 可视化注意力图。

> 模型：Hugging Face `facebook/dinov2-small`（支持 CPU/GPU）

👉 你可以：
- 直接跑我们提供的示例图片；
- 或者上传你自己的机器人采集图像进行可视化。

In [None]:
!pip -q install --upgrade torch torchvision transformers pillow matplotlib opencv-python
import torch, torchvision
from transformers import AutoImageProcessor, Dinov2Model
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
import requests, io, cv2
from typing import List
# Report the installed torch build, then pick the device that later cells use.
print('Torch:', torch.__version__, '| CUDA available:', torch.cuda.is_available())
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [None]:
MODEL_ID = 'facebook/dinov2-small'  # ~22M parameters; plenty for a classroom demo
processor = AutoImageProcessor.from_pretrained(MODEL_ID)
# Attention Rollout needs the per-head attention probabilities of every layer.
# Fused attention backends (e.g. SDPA) do not return them, so request the eager
# implementation explicitly instead of relying on the library default.
model = Dinov2Model.from_pretrained(
    MODEL_ID,
    output_attentions=True,
    attn_implementation='eager',
).to(device).eval()
print('Loaded', MODEL_ID)

In [None]:
def load_image_from_url(url: str, size: int = 518) -> Image.Image:
    """Load an image from an http(s) URL or a local file path as a square RGB image.

    Args:
        url: Either an ``http://`` / ``https://`` URL or a path to an image
            file on disk (the sample list in this notebook uses relative
            paths such as ``content/cat.jpg``).
        size: Side length in pixels of the returned square image. The default
            of 518 is a multiple of 14, the patch size this notebook assumes
            for the DINOv2 small model.

    Returns:
        The image converted to RGB and resized to ``(size, size)``; the aspect
        ratio is not preserved.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        OSError: If a local path cannot be opened or decoded as an image.
    """
    if url.lower().startswith(('http://', 'https://')):
        response = requests.get(url, timeout=10)
        # Fail on 4xx/5xx here instead of letting PIL choke on an HTML error page.
        response.raise_for_status()
        source = io.BytesIO(response.content)
    else:
        # Anything without an http(s) scheme is treated as a local file path.
        source = url

    # convert() returns a new, fully loaded image, so the file handle can be closed.
    with Image.open(source) as raw:
        img = raw.convert('RGB')
    return img.resize((size, size))

def preprocess(img: Image.Image):
    """Turn a PIL image into model inputs on the active device.

    The processor's own resize and centre crop are disabled so the tensor fed
    to the model keeps the size of ``img``. Callers in this notebook already
    resize to 518x518; letting the processor shrink and crop the image again
    would make the attention map describe only a centre crop while it is
    drawn over the full image.

    Args:
        img: RGB image, already resized by the caller (ideally to a multiple
            of the model patch size).

    Returns:
        A dict mapping each processor output name to its tensor, moved to
        ``device``.
    """
    # NOTE(review): assumes the processor's preprocess call accepts
    # do_resize / do_center_crop overrides — confirm for the installed version.
    inputs = processor(
        images=img,
        return_tensors='pt',
        do_resize=False,
        do_center_crop=False,
    )
    return {k: v.to(device) for k, v in inputs.items()}

def attention_rollout(attentions: List[torch.Tensor], head_fusion: str = 'mean', start_layer: int = 0):
    """Compute Attention Rollout scores from the CLS token to every token.

    For each selected layer the heads are fused into a single (B, T, T) matrix,
    the identity is added to account for the residual connection, rows are
    re-normalised to sum to 1, and the layer matrices are multiplied together
    (later layers on the left). Reference: Abnar & Zuidema, 2020.

    Args:
        attentions: Sequence of per-layer attention tensors, each of shape
            (B, num_heads, T, T), ordered from the first layer to the last.
        head_fusion: How to combine heads: ``'mean'``, ``'max'`` or ``'min'``.
        start_layer: Index of the first layer included in the rollout; earlier
            layers are skipped.

    Returns:
        Tensor of shape (B, T): row 0 (the CLS token, assumed to sit at
        index 0) of the rolled-out attention matrix.

    Raises:
        ValueError: If ``head_fusion`` is not a supported mode, or if
            ``start_layer`` leaves no layers to roll out.
    """
    fuse_heads = {
        'mean': lambda a: a.mean(dim=1),
        'max': lambda a: a.max(dim=1).values,
        'min': lambda a: a.min(dim=1).values,
    }.get(head_fusion)
    if fuse_heads is None:
        raise ValueError('head_fusion must be one of [mean, max, min]')

    layers = attentions[start_layer:]
    if not layers:
        # Without this guard the failure would be an opaque IndexError below.
        raise ValueError(
            f'start_layer={start_layer} leaves no attention layers to roll out '
            f'(got {len(attentions)} layers)'
        )

    with torch.no_grad():
        rollout = None
        for layer_attn in layers:
            fused = fuse_heads(layer_attn)  # (B, T, T)
            # Add I for the residual path, then make every row sum to 1 again.
            identity = torch.eye(fused.size(-1), device=fused.device).unsqueeze(0)
            fused = fused + identity
            fused = fused / fused.sum(dim=-1, keepdim=True)
            # Left-multiply so the newest layer is applied on top of the ones before it.
            rollout = fused if rollout is None else torch.bmm(fused, rollout)

        # Row 0 holds how much the CLS token draws from each token.
        return rollout[:, 0]  # (B, T)

def show_attention_on_image(img: Image.Image, attn_map_2d: np.ndarray, alpha: float = 0.5):
    """Overlay a 2-D attention map on an image as a jet heat map and display it.

    Args:
        img: Image to draw on; its pixel array is blended directly with the
            3-channel heat map.
        attn_map_2d: 2-D array of attention scores (any resolution).
        alpha: Weight of the heat map in the blend; the image gets ``1 - alpha``.
    """
    width, height = img.size

    # Stretch the map to the image resolution, then min-max scale it to [0, 1].
    heatmap = cv2.resize(attn_map_2d, (width, height))
    lowest = heatmap.min()
    heatmap = (heatmap - lowest) / (heatmap.max() - lowest + 1e-8)

    # Colour-map the scores, drop the alpha channel and convert to 8-bit RGB.
    coloured = plt.cm.jet(heatmap)[..., :3] * 255
    coloured = coloured.astype(np.uint8)

    blended = cv2.addWeighted(np.array(img), 1 - alpha, coloured, alpha, 0)

    plt.figure(figsize=(6,6))
    plt.axis('off')
    plt.imshow(blended)
    plt.show()

In [None]:
# Rollout settings used by the demo loop.
HEAD_FUSION = 'mean'   # one of: 'mean' | 'max' | 'min'
START_LAYER = 0        # first layer included in the rollout (tune to compare shallow vs. deep layers)

# Sample images (replace with the location of images captured by your robot).
# NOTE(review): these entries are relative file paths, not http(s) URLs —
# confirm the loader used by the demo loop accepts them.
IMAGE_URLS = [
    'content/cat.jpg',    # cat
    'content/dog.jpg',    # dog
    'content/car.jpg',    # car
    'content/plane.jpg',  # airplane
]
print('Loaded', len(IMAGE_URLS), 'sample image URLs.')

In [None]:
# Run every sample image through the model, print its feature shapes and
# overlay its Attention Rollout heat map on the image.
for i, url in enumerate(IMAGE_URLS):
    print(f'\n=== Image {i+1}: {url} ===')
    img = load_image_from_url(url)
    inputs = preprocess(img)
    with torch.no_grad():
        outputs = model(**inputs)  # outputs: last_hidden_state, pooler_output, attentions

    # 1) Global features: the CLS token (index 0) or the mean over all tokens
    cls_feat = outputs.last_hidden_state[:, 0]              # (B, D)
    mean_feat = outputs.last_hidden_state.mean(dim=1)       # (B, D)
    print('CLS feature shape:', tuple(cls_feat.shape), '| mean-pooled shape:', tuple(mean_feat.shape))

    # 2) Attention Rollout
    # outputs.attentions: presumably one (B, heads, T, T) tensor per layer — TODO confirm
    attentions = [a.detach() for a in outputs.attentions]
    cls_to_tokens = attention_rollout(attentions, head_fusion=HEAD_FUSION, start_layer=START_LAYER)  # (B, T)

    # Drop the CLS entry (index 0) so only patch tokens remain, then fold them
    # into a square grid whose side is derived from the token count.
    # NOTE(review): the original note expected a 518x518 input with patch=14,
    # i.e. 37x37=1369 patches + 1 CLS => T=1370; the real T depends on the size
    # `preprocess` actually feeds the model — confirm. Only the first batch
    # item is visualised.
    b, t = cls_to_tokens.shape
    patch_scores = cls_to_tokens[:, 1:]  # (B, T - 1)
    grid = int(np.sqrt(patch_scores.shape[1]))
    attn_map = patch_scores.reshape(b, grid, grid).cpu().numpy()[0]
    show_attention_on_image(img, attn_map)

print('\nDone.')

In [None]:
# Optional: upload your own images and visualise them (handy during class).
# This cell imports google.colab, so it only runs inside Google Colab.
from google.colab import files
uploaded = files.upload()  # pick one or more images
for fname in uploaded:
    # Decode the uploaded bytes, force RGB and resize to the 518x518 demo size.
    img = Image.open(io.BytesIO(uploaded[fname])).convert('RGB').resize((518, 518))
    inputs = preprocess(img)
    with torch.no_grad():
        outputs = model(**inputs)
    attentions = [a.detach() for a in outputs.attentions]
    # Uses fixed rollout settings ('mean', layer 0) rather than HEAD_FUSION / START_LAYER.
    cls_to_tokens = attention_rollout(attentions, head_fusion='mean', start_layer=0)
    # Drop the CLS entry (index 0) and fold the patch scores into a square grid.
    patch_scores = cls_to_tokens[:, 1:]
    grid = int(np.sqrt(patch_scores.shape[1]))
    # reshape(1, ...) assumes a single image per forward pass.
    attn_map = patch_scores.reshape(1, grid, grid).cpu().numpy()[0]
    print('Visualizing:', fname)
    show_attention_on_image(img, attn_map)

## 讲解要点速记
- **Feature Extraction**：`last_hidden_state` 的 `CLS` 向量可作全局表征，或对 token 平均得到 `mean-pooled` 特征。
- **Attention Rollout**：将每层注意力做 head 融合（mean/max/min），加上单位矩阵后归一化并逐层相乘，得到 CLS→各 token 的“信息流”强度。
- **观察**：浅层更像边缘/纹理，深层更聚焦物体与上下文语义区域。课堂可切换 `START_LAYER`（增大后只累乘该层及之后较深层的注意力）和 `HEAD_FUSION`（mean/max/min 三种 head 融合方式）来展现差异。