# Grad-Cam  可视化与可解释
## 一张照片例子

In [11]:
model_path = "/workspace/models/best_model/yolo11m-cls-best_v8.pt"
img_path = "/workspace/models/SAHI/run_v7/air2_0729-0813_04/raw_data_sliced_merge/06_track_galleries_v9/id_0001_x1_swd/01_07-29_13:37_0729_1337_640.jpg"
out_path = "/workspace/models/gradcam"

In [27]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import os, glob, cv2, torch, numpy as np
from pathlib import Path
from ultralytics import YOLO

# grad-cam 家族（按需开关）
from pytorch_grad_cam import GradCAM, GradCAMPlusPlus, EigenCAM
from pytorch_grad_cam.utils.image import show_cam_on_image
from pytorch_grad_cam.utils.model_targets import ClassifierOutputTarget

# ==================== 你可以在这里配置 ====================
MODEL = model_path              # 你的 yolov11-cls 权重
IMGS  = img_path            # 文件/通配/目录
IMG_SIZE = 224                 # 和训练一致：常见 224/256/384/512
DO_METHODS = ("gradcam", "gradcam++", "eigen")  # 要导出的 CAM 类型
TARGET_CLASS = "swd"            # None=自动Top1；或 int 类别id；或 str 类名
SAVE_ORIG_OVERLAY = False       # 同时在“原始分辨率”上叠加一份热力图
OUT_DIR = out_path            # 输出目录
ALPHA = 0.7                   # 叠加透明度（show_cam_on_image 内部使用）
# =========================================================

# ---------- 稳妥的日志与断言工具 ----------
def _assert_tensor4d_rgb01(x: torch.Tensor):
    assert x.ndim == 4 and x.shape[0] == 1, f"Expect [1,3,H,W], got {tuple(x.shape)}"
    assert x.shape[1] == 3, f"Expect 3 channels RGB, got {x.shape[1]}"
    assert x.dtype in (torch.float32, torch.float16, torch.bfloat16), f"Expect float tensor, got {x.dtype}"

# ---------- 预处理：优先用 ultralytics 官方 classify_transforms ----------
def build_preprocess(imgsz: int):
    """
    返回 preprocess(img_bgr) -> (rgb01_square, tensor, meta)
    meta 用于把 CAM 回投影回原图：包含 scale 和 padding。
    如果官方 transforms 不可用，则回退到 letterbox_square。
    """
    try:
        from ultralytics.data.augment import classify_transforms
        tfm = classify_transforms(imgsz=imgsz)  # 返回一个可调用对象
        def _preprocess(img_bgr):
            img_rgb = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)
            out = tfm(image=img_rgb)            # dict {'img': tensor[C,H,W], ...}
            tensor = out["img"].unsqueeze(0)    # [1,3,H,W], float, 0~1
            # 我们不知道官方是否做了crop，这里只保证方形尺寸与张量一致
            H, W = tensor.shape[2:]
            rgb01_square = cv2.resize(img_rgb, (W, H), interpolation=cv2.INTER_LINEAR).astype(np.float32)/255.0
            meta = dict(mode="unknown_square", size=(H, W))  # 用于后续安全叠加
            return rgb01_square, tensor, meta
        return _preprocess
    except Exception:
        # —— 回退：我们自己做“等比缩放 + 居中填充”的正方形 letterbox —— #
        def letterbox_square(img_bgr, new_size, color=(114,114,114)):
            h, w = img_bgr.shape[:2]
            scale = min(new_size / h, new_size / w)
            nh, nw = int(round(h * scale)), int(round(w * scale))
            resized = cv2.resize(img_bgr, (nw, nh), interpolation=cv2.INTER_LINEAR)
            top = (new_size - nh) // 2
            bottom = new_size - nh - top
            left = (new_size - nw) // 2
            right = new_size - nw - left
            padded = cv2.copyMakeBorder(resized, top, bottom, left, right,
                                        borderType=cv2.BORDER_CONSTANT, value=color)
            meta = dict(mode="letterbox", scale=scale, pad=(top, bottom, left, right), resized_hw=(nh, nw))
            return padded, meta

        def _preprocess(img_bgr):
            sq_bgr, meta = letterbox_square(img_bgr, imgsz)
            rgb01 = cv2.cvtColor(sq_bgr, cv2.COLOR_BGR2RGB).astype(np.float32) / 255.0
            tensor = torch.from_numpy(rgb01).permute(2,0,1).unsqueeze(0)  # [1,3,H,W]
            return rgb01, tensor, meta
        return _preprocess

# ---------- 把方形 CAM 回投影到原始图尺寸（仅对 letterbox 回退路径严格正确） ----------
def cam_to_original(grayscale_cam, orig_shape, meta):
    H0, W0 = orig_shape[:2]
    if meta.get("mode") != "letterbox":
        # 官方 transforms 情况不保证有精确逆变换；退化到简单 resize 回原图大小
        cam_resized = cv2.resize(grayscale_cam, (W0, H0), interpolation=cv2.INTER_LINEAR)
        return cam_resized
    # letterbox: 先把 CAM 裁掉 padding，再按 1/scale 放大回原尺寸
    top, bottom, left, right = meta["pad"]
    nh, nw = meta["resized_hw"]
    size = top + nh + bottom  # == left + nw + right == square size
    # CAM 先缩放到 square，再去掉边
    cam_sq = cv2.resize(grayscale_cam, (size, size), interpolation=cv2.INTER_LINEAR)
    cam_cropped = cam_sq[top: top+nh, left: left+nw]
    cam_orig = cv2.resize(cam_cropped, (W0, H0), interpolation=cv2.INTER_LINEAR)
    return cam_orig

# ---------- 选择“最后一个 Conv2d” ----------
def find_last_conv_layer(module: torch.nn.Module):
    last = None
    for m in module.modules():
        if isinstance(m, torch.nn.Conv2d):
            last = m
    if last is None:
        raise RuntimeError("未在模型中找到 Conv2d 层，请确认这是分类权重。")
    return last

# ---------- 统一输出：[N,C] logits ----------
class LogitsOnly(torch.nn.Module):
    def __init__(self, base: torch.nn.Module):
        super().__init__()
        self.base = base
    def forward(self, x):
        out = self.base(x)
        if torch.is_tensor(out) and out.ndim == 2:
            return out
        if isinstance(out, (list, tuple)):
            for t in out:
                if torch.is_tensor(t) and t.ndim == 2:
                    return t
            for t in out:
                if torch.is_tensor(t):
                    return t
        if isinstance(out, dict):
            if "logits" in out and torch.is_tensor(out["logits"]):
                return out["logits"]
            for v in out.values():
                if torch.is_tensor(v) and v.ndim == 2:
                    return v
        raise TypeError(f"无法从模型输出提取 logits（got {type(out)})")

# ---------- 根据类名/类id 解析目标类别 ----------
def resolve_target_class(target, names):
    if target is None:
        return None
    if isinstance(target, int):
        return target
    if isinstance(target, str):
        if isinstance(names, dict):
            inv = {v: k for k, v in names.items()}
            if target in inv:
                return inv[target]
    raise ValueError(f"TARGET_CLASS 无法解析：{target!r}")

# ---------- 主流程 ----------
def main():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    os.makedirs(OUT_DIR, exist_ok=True)

    # 1) 加载 YOLOv11-cls
    wrapper = YOLO(MODEL)
    base: torch.nn.Module = wrapper.model.eval().to(device)  # 原始 nn.Module
    net  = LogitsOnly(base).eval().to(device)                 # 统一输出
    target_layer = find_last_conv_layer(base)
    class_names = getattr(wrapper, "names", None)

    # 2) 预处理
    preprocess = build_preprocess(IMG_SIZE)

    # 3) 收集图片
    if os.path.isdir(IMGS):
        paths = sorted(sum([glob.glob(str(Path(IMGS)/p)) for p in ("*.jpg","*.jpeg","*.png","*.bmp","*.webp")], []))
    elif any(ch in IMGS for ch in "*?[]"):
        paths = sorted(glob.glob(IMGS))
    else:
        paths = [IMGS]
    assert paths, f"未找到图片：{IMGS}"

    # 4) 目标类解析（允许 None / int / str）
    target_id = resolve_target_class(TARGET_CLASS, class_names)

    # 5) CAM 方法映射
    method_map = {
        "gradcam": GradCAM,
        "gradcam++": GradCAMPlusPlus,
        "eigen": EigenCAM
    }
    methods = [m for m in DO_METHODS if m in method_map]
    assert methods, f"DO_METHODS 设置有误：{DO_METHODS}"

    # 6) 推理 + CAM
    for p in paths:
        img_bgr = cv2.imread(p)
        if img_bgr is None:
            print(f"[跳过] 无法读取 {p}")
            continue
        rgb01_square, x, meta = preprocess(img_bgr)   # rgb01:[H,W,3] in [0,1]; x:[1,3,H,W]
        x = x.to(device)
        _assert_tensor4d_rgb01(x)

        # ====== 关键：CAM 前必须允许梯度 ======
        torch.set_grad_enabled(True)
        net.zero_grad(set_to_none=True)
        for param in net.parameters():
            param.requires_grad_(True)
        x.requires_grad_(True)

        # 自动 Top1 或指定类
        cam_targets = None if target_id is None else [ClassifierOutputTarget(target_id)]

        # ====== 做 CAM（逐方法导出） ======
        for mname in methods:
            CAM = method_map[mname]
            with CAM(model=net, target_layers=[target_layer]) as cam:
                grayscale_cam = cam(input_tensor=x, targets=cam_targets)[0]  # HxW, 0~1

            # 方形图上叠加
            vis_square = show_cam_on_image(rgb01_square.copy(), grayscale_cam, use_rgb=True, image_weight=ALPHA)
            stem = Path(p).stem
            out1 = os.path.join(OUT_DIR, f"{stem}_{mname}_square.jpg")
            cv2.imwrite(out1, cv2.cvtColor(vis_square, cv2.COLOR_RGB2BGR))

            # 原图上叠加（尽可能复原）
            if SAVE_ORIG_OVERLAY:
                cam_orig = cam_to_original(grayscale_cam, img_bgr.shape, meta)
                cam_orig_rgb = show_cam_on_image(
                    cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB).astype(np.float32)/255.0,
                    cam_orig, use_rgb=True, image_weight=ALPHA)
                out2 = os.path.join(OUT_DIR, f"{stem}_{mname}_orig.jpg")
                cv2.imwrite(out2, cv2.cvtColor(cam_orig_rgb, cv2.COLOR_RGB2BGR))

        # ====== 单独做一次无梯度前向，只为显示 Top1 名称/概率 ======
        with torch.no_grad():
            logits = net(x.detach())
            probs  = torch.softmax(logits.float(), dim=1)
            top1   = int(probs.argmax(dim=1).item())
            top1_p = float(probs[0, top1].item())
        top1_name = class_names.get(top1, top1) if isinstance(class_names, dict) else top1
        target_disp = (class_names.get(target_id, target_id) if isinstance(class_names, dict) else target_id) if target_id is not None else f"Top1({top1_name})"

        print(f"[OK] {p} | Pred={top1_name} ({top1_p:.3f}) | CAM for {target_disp} -> {OUT_DIR}")
    print("全部完成。")

if __name__ == "__main__":
    main()


[OK] /workspace/models/SAHI/run_v7/air2_0729-0813_04/raw_data_sliced_merge/06_track_galleries_v9/id_0001_x1_swd/01_07-29_13:37_0729_1337_640.jpg | Pred=others (0.576) | CAM for swd -> /workspace/models/gradcam
全部完成。


## 批量执行

In [38]:
# model_path = "/workspace/models/best_model/yolo11m-cls-best_v8.pt"
# model_path = "/workspace/models/best_model/yolo11m-cls-best_v7.pt"
# model_path = "/workspace/models/best_model/yolo11m-cls-best_v6.pt"
model_path = "/workspace/models/best_model/yolo11n-cls-best_v5.pt"
# model_path = "/workspace/models/best_model/yolo11m-cls-best_v4.pt"
# model_path = "/workspace/models/best_model/yolo11s-cls-best_v3.pt"
# model_path = "/workspace/models/best_model/yolo11s-cls-best_v2.pt"

imgs_folder_path = "/workspace/models/runs_yolov11_cls/cropped_objects/swd"
out_path = "/workspace/models/gradcam"

In [39]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import os, glob, cv2, torch, numpy as np
from pathlib import Path
from ultralytics import YOLO

from pytorch_grad_cam import GradCAM, GradCAMPlusPlus, EigenCAM
from pytorch_grad_cam.utils.image import show_cam_on_image
from pytorch_grad_cam.utils.model_targets import ClassifierOutputTarget

# ==================== 配置 ====================
MODEL = model_path               # yolov11-cls 权重
IMGS  = imgs_folder_path         # 目录 / 文件 / 通配
IMG_SIZE = 224                   # 与训练一致：224/256/384/512
DO_METHODS = ("gradcam", "gradcam++", "eigen")
TARGET_CLASS = "swd"               # None=自动Top1；或 int 类别id；或 str 类名
SAVE_ORIG_OVERLAY = False        # 同时在原图大小上叠加一份
OUT_DIR = out_path               # 基础输出目录
ALPHA = 0.7                      # 叠加透明度（show_cam_on_image 内部使用）
# =================================================

def _assert_tensor4d_rgb01(x: torch.Tensor):
    assert x.ndim == 4 and x.shape[0] == 1, f"Expect [1,3,H,W], got {tuple(x.shape)}"
    assert x.shape[1] == 3, f"Expect 3 channels RGB, got {x.shape[1]}"
    assert x.dtype in (torch.float32, torch.float16, torch.bfloat16), f"Expect float tensor, got {x.dtype}"

def build_preprocess(imgsz: int):
    try:
        from ultralytics.data.augment import classify_transforms
        tfm = classify_transforms(imgsz=imgsz)
        def _preprocess(img_bgr):
            img_rgb = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)
            out = tfm(image=img_rgb)
            tensor = out["img"].unsqueeze(0)
            H, W = tensor.shape[2:]
            rgb01_square = cv2.resize(img_rgb, (W, H), interpolation=cv2.INTER_LINEAR).astype(np.float32)/255.0
            meta = dict(mode="unknown_square", size=(H, W))
            return rgb01_square, tensor, meta
        return _preprocess
    except Exception:
        def letterbox_square(img_bgr, new_size, color=(114,114,114)):
            h, w = img_bgr.shape[:2]
            scale = min(new_size / h, new_size / w)
            nh, nw = int(round(h * scale)), int(round(w * scale))
            resized = cv2.resize(img_bgr, (nw, nh), interpolation=cv2.INTER_LINEAR)
            top = (new_size - nh) // 2
            bottom = new_size - nh - top
            left = (new_size - nw) // 2
            right = new_size - nw - left
            padded = cv2.copyMakeBorder(resized, top, bottom, left, right,
                                        borderType=cv2.BORDER_CONSTANT, value=color)
            meta = dict(mode="letterbox", scale=scale, pad=(top,bottom,left,right), resized_hw=(nh,nw))
            return padded, meta
        def _preprocess(img_bgr):
            sq_bgr, meta = letterbox_square(img_bgr, imgsz)
            rgb01 = cv2.cvtColor(sq_bgr, cv2.COLOR_BGR2RGB).astype(np.float32) / 255.0
            tensor = torch.from_numpy(rgb01).permute(2,0,1).unsqueeze(0)
            return rgb01, tensor, meta
        return _preprocess

def cam_to_original(grayscale_cam, orig_shape, meta):
    H0, W0 = orig_shape[:2]
    if meta.get("mode") != "letterbox":
        return cv2.resize(grayscale_cam, (W0, H0), interpolation=cv2.INTER_LINEAR)
    top, bottom, left, right = meta["pad"]
    nh, nw = meta["resized_hw"]
    size = top + nh + bottom
    cam_sq = cv2.resize(grayscale_cam, (size, size), interpolation=cv2.INTER_LINEAR)
    cam_cropped = cam_sq[top: top+nh, left: left+nw]
    cam_orig = cv2.resize(cam_cropped, (W0, H0), interpolation=cv2.INTER_LINEAR)
    return cam_orig

def find_last_conv_layer(module: torch.nn.Module):
    last = None
    for m in module.modules():
        if isinstance(m, torch.nn.Conv2d):
            last = m
    if last is None:
        raise RuntimeError("未在模型中找到 Conv2d 层，请确认是分类权重。")
    return last

class LogitsOnly(torch.nn.Module):
    def __init__(self, base: torch.nn.Module):
        super().__init__()
        self.base = base
    def forward(self, x):
        out = self.base(x)
        if torch.is_tensor(out) and out.ndim == 2:
            return out
        if isinstance(out, (list, tuple)):
            for t in out:
                if torch.is_tensor(t) and t.ndim == 2:
                    return t
            for t in out:
                if torch.is_tensor(t):
                    return t
        if isinstance(out, dict):
            if "logits" in out and torch.is_tensor(out["logits"]):
                return out["logits"]
            for v in out.values():
                if torch.is_tensor(v) and v.ndim == 2:
                    return v
        raise TypeError(f"无法从模型输出提取 logits（got {type(out)})")

def resolve_target_class(target, names):
    if target is None:
        return None
    if isinstance(target, int):
        return target
    if isinstance(target, str):
        if isinstance(names, dict):
            inv = {v: k for k, v in names.items()}
            if target in inv:
                return inv[target]
    raise ValueError(f"TARGET_CLASS 无法解析：{target!r}")

def main():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    os.makedirs(OUT_DIR, exist_ok=True)

    wrapper = YOLO(MODEL)
    base: torch.nn.Module = wrapper.model.eval().to(device)
    net  = LogitsOnly(base).eval().to(device)
    target_layer = find_last_conv_layer(base)
    class_names = getattr(wrapper, "names", None)

    preprocess = build_preprocess(IMG_SIZE)

    if os.path.isdir(IMGS):
        paths = sorted(sum([glob.glob(str(Path(IMGS)/p)) for p in ("*.jpg","*.jpeg","*.png","*.bmp","*.webp")], []))
    elif any(ch in IMGS for ch in "*?[]"):
        paths = sorted(glob.glob(IMGS))
    else:
        paths = [IMGS]
    assert paths, f"未找到图片：{IMGS}"

    target_id = resolve_target_class(TARGET_CLASS, class_names)

    method_map = {
        "gradcam": GradCAM,
        "gradcam++": GradCAMPlusPlus,
        "eigen": EigenCAM
    }
    methods = [m for m in DO_METHODS if m in method_map]
    assert methods, f"DO_METHODS 设置有误：{DO_METHODS}"

    # —— 为每个方法准备子目录 —— #
    method_dirs = {}
    for mname in methods:
        mdir = os.path.join(OUT_DIR, mname)
        os.makedirs(mdir, exist_ok=True)
        method_dirs[mname] = mdir

    for p in paths:
        img_bgr = cv2.imread(p)
        if img_bgr is None:
            print(f"[跳过] 无法读取 {p}")
            continue
        rgb01_square, x, meta = preprocess(img_bgr)
        x = x.to(device)
        _assert_tensor4d_rgb01(x)

        torch.set_grad_enabled(True)
        net.zero_grad(set_to_none=True)
        for param in net.parameters():
            param.requires_grad_(True)
        x.requires_grad_(True)

        cam_targets = None if target_id is None else [ClassifierOutputTarget(target_id)]
        stem = Path(p).stem

        for mname in methods:
            CAM = method_map[mname]
            with CAM(model=net, target_layers=[target_layer]) as cam:
                grayscale_cam = cam(input_tensor=x, targets=cam_targets)[0]

            vis_square = show_cam_on_image(rgb01_square.copy(), grayscale_cam, use_rgb=True, image_weight=ALPHA)
            out_square = os.path.join(method_dirs[mname], f"{stem}_square.jpg")
            cv2.imwrite(out_square, cv2.cvtColor(vis_square, cv2.COLOR_RGB2BGR))

            if SAVE_ORIG_OVERLAY:
                cam_orig = cam_to_original(grayscale_cam, img_bgr.shape, meta)
                cam_orig_rgb = show_cam_on_image(
                    cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB).astype(np.float32)/255.0,
                    cam_orig, use_rgb=True, image_weight=ALPHA)
                out_orig = os.path.join(method_dirs[mname], f"{stem}_orig.jpg")
                cv2.imwrite(out_orig, cv2.cvtColor(cam_orig_rgb, cv2.COLOR_RGB2BGR))

        # 单独做一次无梯度前向，只为打印信息
        with torch.no_grad():
            logits = net(x.detach())
            probs  = torch.softmax(logits.float(), dim=1)
            top1   = int(probs.argmax(dim=1).item())
            top1_p = float(probs[0, top1].item())
        top1_name = class_names.get(top1, top1) if isinstance(class_names, dict) else top1
        target_disp = (class_names.get(target_id, target_id) if isinstance(class_names, dict) else target_id) if target_id is not None else f"Top1({top1_name})"
        print(f"[OK] {p} | Pred={top1_name} ({top1_p:.3f}) | Methods -> {', '.join(methods)} in {OUT_DIR}")

    print("全部完成。输出目录：", OUT_DIR)

if __name__ == "__main__":
    main()


[OK] /workspace/models/runs_yolov11_cls/cropped_objects/swd/01_06-16_06_02_0616_0602_580.jpg | Pred=others (0.572) | Methods -> gradcam, gradcam++, eigen in /workspace/models/gradcam
[OK] /workspace/models/runs_yolov11_cls/cropped_objects/swd/01_07-16_06_38_0716_0638_820.jpg | Pred=others (0.572) | Methods -> gradcam, gradcam++, eigen in /workspace/models/gradcam
[OK] /workspace/models/runs_yolov11_cls/cropped_objects/swd/02_06-16_06_12_0616_0612_580.jpg | Pred=others (0.572) | Methods -> gradcam, gradcam++, eigen in /workspace/models/gradcam
[OK] /workspace/models/runs_yolov11_cls/cropped_objects/swd/02_07-16_07_52_0716_0752_820.jpg | Pred=others (0.572) | Methods -> gradcam, gradcam++, eigen in /workspace/models/gradcam
[OK] /workspace/models/runs_yolov11_cls/cropped_objects/swd/03_07-16_08_44_0716_0844_820.jpg | Pred=others (0.573) | Methods -> gradcam, gradcam++, eigen in /workspace/models/gradcam
[OK] /workspace/models/runs_yolov11_cls/cropped_objects/swd/04_07-16_08_48_0716_0848_

KeyboardInterrupt: 