# DINOv2 Dense Feature Colorization (PCA & KMeans)

本 demo 生成 **和截图类似的“彩色区域图”**：对每个 patch 的 DINOv2 特征做 **PCA→RGB 伪彩** 或 **KMeans 聚类上色**。

- 支持 **图片** 或 **短视频**（前 N 帧），PCA/KMeans 在第一帧拟合，保证后续帧颜色一致。
- 可用于课堂对比：DINOv2 的特征更整洁、语义更一致。

In [None]:
#@title Install deps
import subprocess
import sys

# Install into the interpreter that is actually running this code. A bare
# `pip` found on PATH can belong to a different environment than the kernel.
# The call is deliberately best-effort (no check=True): if the install fails,
# e.g. offline with the packages already present, the imports below still get
# their chance to succeed.
subprocess.run([
    sys.executable, '-m', 'pip', 'install', '-q', '--upgrade',
    'torch', 'torchvision', 'transformers', 'scikit-learn', 'pillow',
    'opencv-python', 'matplotlib', 'imageio',
])

import contextlib
import glob
import io
import math
import os

import cv2
import imageio
import matplotlib.pyplot as plt
import numpy as np
import requests
import torch
from PIL import Image
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from transformers import AutoImageProcessor, Dinov2Model

# Run on the GPU when one is visible to torch, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print('Torch:', torch.__version__, '| CUDA:', torch.cuda.is_available())

In [None]:
#@title Utils: I/O, feature extraction, colorization
def load_image_from_url(url, size=None):
    """Download an image and return it as an RGB PIL image.

    Args:
        url: HTTP(S) address of the image.
        size: Optional edge length. When given, the image is resized to
            ``(size, size)`` (aspect ratio is not preserved).

    Returns:
        A ``PIL.Image.Image`` in RGB mode.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    resp = requests.get(url, timeout=10)
    # Fail here with a clear HTTP error instead of letting PIL choke on an
    # HTML error page.
    resp.raise_for_status()
    img = Image.open(io.BytesIO(resp.content)).convert('RGB')
    if size is not None:
        img = img.resize((size, size))
    return img


def video_to_frames(url, out_dir='/content/frames', max_frames=80):
    """Download a video and decode its first ``max_frames`` frames.

    Every decoded frame is also written to ``out_dir`` as a zero-padded
    ``NNNNN.jpg`` file.

    Args:
        url: HTTP(S) address of the video file.
        out_dir: Directory that receives the JPEG frames (created if needed).
        max_frames: Upper bound on the number of frames to decode.

    Returns:
        ``(frames, out_dir)`` where ``frames`` is a list of RGB ``uint8``
        arrays as produced by OpenCV.

    Raises:
        requests.HTTPError: If the download answers with an error status.
        OSError: If OpenCV cannot open the downloaded file.
    """
    os.makedirs(out_dir, exist_ok=True)
    vid_path = '/content/input.mp4'

    # Plain-Python download (no IPython shell magic), streamed to disk so the
    # whole video never has to sit in memory.
    with requests.get(url, stream=True, timeout=30) as resp:
        resp.raise_for_status()
        with open(vid_path, 'wb') as fh:
            for chunk in resp.iter_content(chunk_size=1 << 20):
                fh.write(chunk)

    cap = cv2.VideoCapture(vid_path)
    frames = []
    try:
        if not cap.isOpened():
            raise OSError(f'OpenCV could not open the downloaded video: {vid_path}')
        while len(frames) < max_frames:
            ok, bgr = cap.read()
            if not ok:
                break
            # cv2.imwrite expects BGR, which is what cap.read() returns.
            cv2.imwrite(f"{out_dir}/{len(frames):05d}.jpg", bgr)
            frames.append(cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB))
    finally:
        cap.release()
    return frames, out_dir


MODEL_ID = 'facebook/dinov2-small'  # speed/quality trade-off; base/large variants can be used too
processor = AutoImageProcessor.from_pretrained(MODEL_ID)
model = Dinov2Model.from_pretrained(MODEL_ID, output_attentions=False).to(device).eval()


@torch.no_grad()
def dinov2_patch_tokens(img_pil):
    """Run DINOv2 on one image and return its patch tokens as a 2-D grid.

    The processor's resize and center-crop steps are switched off so the
    model sees the image at the size the caller prepared. With the
    checkpoint defaults the input would be rescaled regardless of that size
    (NOTE(review): expected to be 224x224 for the public DINOv2 checkpoints;
    confirm against the checkpoint's preprocessor_config.json).

    Args:
        img_pil: Input image (a PIL image at the call sites in this notebook).

    Returns:
        ``(tokens, (Hproc, Wproc))`` where ``tokens`` is a NumPy array of
        shape ``(grid_h, grid_w, D)`` and ``(Hproc, Wproc)`` is the spatial
        size of the tensor that was fed to the model.

    Raises:
        ValueError: If the number of patch tokens cannot be arranged as a grid.
    """
    inputs = processor(
        images=img_pil,
        return_tensors='pt',
        do_resize=False,
        do_center_crop=False,
    )
    pixel_values = inputs['pixel_values'].to(device)  # (1, 3, H, W)
    Hproc, Wproc = pixel_values.shape[2], pixel_values.shape[3]

    out = model(pixel_values=pixel_values)
    tokens = out.last_hidden_state[0, 1:, :]  # drop the leading CLS token -> (N, D)
    N, D = tokens.shape

    # Derive the grid from the input size and patch size so that non-square
    # inputs work; fall back to a square grid if the counts disagree.
    patch = model.config.patch_size
    grid_h, grid_w = Hproc // patch, Wproc // patch
    if grid_h * grid_w != N:
        side = int(round(N ** 0.5))
        if side * side != N:
            raise ValueError(
                f'Cannot arrange {N} patch tokens as a grid for a {Hproc}x{Wproc} input'
            )
        grid_h = grid_w = side

    tokens = tokens.reshape(grid_h, grid_w, D).detach().cpu().numpy()
    return tokens, (Hproc, Wproc)


def pca_colorize(tokens_g_g_d, pca=None):
    """Map patch features to pseudo-colors through a 3-component PCA.

    Args:
        tokens_g_g_d: Array of shape ``(grid_h, grid_w, D)``.
        pca: A fitted PCA to reuse, or ``None`` to fit a new one on this input.

    Returns:
        ``(rgb, pca)`` where ``rgb`` is a ``uint8`` array of shape
        ``(grid_h, grid_w, 3)`` and ``pca`` is the fitted projection.

    The 1st/99th-percentile range used to scale the components into [0, 1] is
    computed the first time a PCA object is used and cached on it as
    ``_color_range``. Later calls with the same object reuse that range, so a
    given feature maps to the same color on every frame instead of being
    re-stretched per frame.
    """
    grid_h, grid_w, dim = tokens_g_g_d.shape
    feats = tokens_g_g_d.reshape(-1, dim)  # (grid_h * grid_w, D)

    if pca is None:
        pca = PCA(n_components=3, random_state=0)
        proj = pca.fit_transform(feats)
    else:
        proj = pca.transform(feats)

    color_range = getattr(pca, '_color_range', None)
    if color_range is None:
        # Percentile normalisation: robust against a few extreme patches.
        lo = np.percentile(proj, 1, axis=0, keepdims=True)
        hi = np.percentile(proj, 99, axis=0, keepdims=True)
        color_range = (lo, hi)
        pca._color_range = color_range
    lo, hi = color_range

    scaled = np.clip((proj - lo) / (hi - lo + 1e-6), 0, 1)
    rgb = (scaled.reshape(grid_h, grid_w, 3) * 255).astype(np.uint8)
    return rgb, pca


def kmeans_colorize(tokens_g_g_d, k=6, kmeans=None):
    """Cluster patch features with KMeans and paint each cluster one color.

    Args:
        tokens_g_g_d: Array of shape ``(grid_h, grid_w, D)``.
        k: Number of clusters used when a new model is fitted.
        kmeans: A fitted KMeans to reuse, or ``None`` to fit a new one.

    Returns:
        ``(rgb, kmeans)`` where ``rgb`` is a ``uint8`` array of shape
        ``(grid_h, grid_w, 3)`` and ``kmeans`` is the fitted model.
    """
    grid_h, grid_w, dim = tokens_g_g_d.shape
    feats = tokens_g_g_d.reshape(-1, dim)

    if kmeans is None:
        kmeans = KMeans(n_clusters=k, n_init=10, random_state=0)
        labels = kmeans.fit_predict(feats)
    else:
        labels = kmeans.predict(feats)
        # Size the palette from the model that produced the labels, so a
        # mismatching `k` argument cannot index past the end of the palette.
        k = kmeans.n_clusters
    labels = labels.reshape(grid_h, grid_w)

    # Fixed palette: red/green follow sin/cos of an evenly spaced phase and
    # blue ramps linearly with the cluster index.
    palette = np.stack([
        np.array(
            [
                np.sin(i * 2 * np.pi / k) * 127 + 128,
                np.cos(i * 2 * np.pi / k) * 127 + 128,
                (i * 255) // k,
            ],
            dtype=np.uint8,
        )
        for i in range(k)
    ], axis=0)
    rgb = palette[labels]
    return rgb, kmeans


def upsample_to_image(rgb_grid, out_hw, method=cv2.INTER_NEAREST):
    """Resize a patch-grid color map to ``out_hw = (height, width)``.

    ``method`` is passed to ``cv2.resize`` as the interpolation flag.
    """
    height, width = out_hw
    # cv2.resize takes the target size as (width, height).
    return cv2.resize(rgb_grid, (width, height), interpolation=method)


def show_side_by_side(title_left, img_left, title_right, img_right, size=6):
    """Show two images next to each other with titles and hidden axes."""
    plt.figure(figsize=(size * 2, size))
    panels = ((title_left, img_left), (title_right, img_right))
    for col, (title, img) in enumerate(panels, start=1):
        plt.subplot(1, 2, col)
        plt.imshow(img)
        plt.title(title)
        plt.axis('off')
    plt.show()

## 选择图片或视频（默认视频）

- `RUN_VIDEO=True`：下载短视频并取前 `MAX_FRAMES` 帧；
- `RUN_VIDEO=False`：对一张图片做彩色化。

In [None]:
#@title Config
# Colab form parameters: the `#@param` markers must stay on the assignment line.
RUN_VIDEO = True  #@param {type:'boolean'}
MAX_FRAMES = 80   #@param {type:'integer'}

# Still image used when RUN_VIDEO is False (labelled a cat example by the author).
IMAGE_URL = 'https://images.pexels.com/photos/1108099/pexels-photo-1108099.jpeg'

# Pedestrian clip from the OpenCV samples; swap in the URL of your own video
# (e.g. a dog clip) if you like.
VIDEO_URL = 'https://github.com/opencv/opencv/blob/master/samples/data/vtest.avi?raw=true'

# Edge length frames are resized to before feature extraction. 518 is the
# usual DINOv2 size: 518 / 14 = 37, i.e. a 37x37 patch grid at patch size 14
# when the model sees the frame at this resolution. Other multiples of 14
# (e.g. 392) work as well.
INPUT_SIZE = 518

In [None]:
#@title Run (image / video) and colorize features
KMEANS_K = 6     # number of KMeans clusters, used for fitting and in the plot title
GIF_FRAME_MS = 80  # display time per GIF frame in milliseconds (12.5 fps)

if RUN_VIDEO:
    frames_rgb, frame_dir = video_to_frames(VIDEO_URL, max_frames=MAX_FRAMES)
    if not frames_rgb:
        # Without this guard the next line dies with an opaque IndexError.
        raise RuntimeError(f'No frames could be decoded from {VIDEO_URL!r}')
    first_img = Image.fromarray(frames_rgb[0]).resize((INPUT_SIZE, INPUT_SIZE))
else:
    first_img = load_image_from_url(IMAGE_URL, size=INPUT_SIZE)

# 1) Extract patch features from the first frame/image and fit PCA / KMeans on it.
tokens_first, (Hp, Wp) = dinov2_patch_tokens(first_img)
pca_map_first, pca = pca_colorize(tokens_first, pca=None)
kmeans_map_first, kmeans = kmeans_colorize(tokens_first, k=KMEANS_K, kmeans=None)
pca_vis_first = upsample_to_image(pca_map_first, (Hp, Wp), method=cv2.INTER_NEAREST)
kmeans_vis_first = upsample_to_image(kmeans_map_first, (Hp, Wp), method=cv2.INTER_NEAREST)
show_side_by_side('Input (1st frame/image)', first_img, 'PCA pseudo-color', pca_vis_first, size=5)
show_side_by_side('Input (1st frame/image)', first_img, f'KMeans (k={KMEANS_K}) color', kmeans_vis_first, size=5)

# 2) Video only: keep the PCA / KMeans fitted on the first frame and apply
#    them to every frame, so the coloring uses one shared projection/clustering.
gif_frames_pca = []
gif_frames_km = []
if RUN_VIDEO:
    for rgb in frames_rgb:
        img = Image.fromarray(rgb).resize((INPUT_SIZE, INPUT_SIZE))
        tok, (Hp, Wp) = dinov2_patch_tokens(img)
        pca_map, _ = pca_colorize(tok, pca=pca)
        km_map, _ = kmeans_colorize(tok, k=KMEANS_K, kmeans=kmeans)
        pca_vis = upsample_to_image(pca_map, (Hp, Wp), method=cv2.INTER_NEAREST)
        km_vis = upsample_to_image(km_map, (Hp, Wp), method=cv2.INTER_NEAREST)

        # Side-by-side panel (original frame left, color map right), like a
        # lecture-slide screenshot. The left half is shared by both GIFs, so
        # it is resized only once per frame.
        left = cv2.resize(rgb, (Wp, Hp))
        gif_frames_pca.append(Image.fromarray(np.hstack([left, pca_vis])))
        gif_frames_km.append(Image.fromarray(np.hstack([left, km_vis])))

    # Export the GIFs with Pillow: its `duration` is always milliseconds,
    # whereas imageio's `duration` changed from seconds to milliseconds
    # between releases, so a fixed value there is not portable.
    for gif_path, gif_frames in (
        ('/content/dinov2_pca.gif', gif_frames_pca),
        ('/content/dinov2_kmeans.gif', gif_frames_km),
    ):
        gif_frames[0].save(
            gif_path,
            save_all=True,
            append_images=gif_frames[1:],
            duration=GIF_FRAME_MS,
            loop=0,  # loop forever
        )
    print('Saved GIFs: /content/dinov2_pca.gif , /content/dinov2_kmeans.gif')

### 讲解提示

- **彩色来源**：每个 patch 的高维特征经 PCA 压到 3 维并映射到 RGB；不同部件（头/背/腿/地面）在特征空间分布不同 → 颜色不同。
- **时间一致**：PCA/KMeans 在第一帧拟合，后续帧用同一投影/聚类，颜色在时间上保持稳定。
- **对比热力图**：这不是注意力热力图，而是**特征空间的色彩编码**；能更直观对比 DINO 与 DINOv2 的表征差异。