In [21]:
import os
import random
from typing import List, Tuple
from pathlib import Path
import numpy as np
from PIL import Image
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from torch.nn.functional import cross_entropy
from torchvision.datasets import ImageFolder
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm
from torchvision import transforms
from transformers import AutoProcessor, AutoModel
import timm
import shutil

# ---------- 1. Device ----------
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("device:", device)

# ---------- 2. Data location ----------
# IMPORTANT: point this at the folder that contains the four class sub-folders.
root_dir = r"D:/OneDriveFiles/OneDrive/人工智能基础期末/dataset2/"

# Expected directory layout:
# root_dir/
#   classA/
#   classB/
#   classC/
#   classD/

# ---------- 3. Training hyperparameters ----------
batch_size, num_workers, num_epochs = 64, 0, 30
# lr: only the linear head is trained, so it can be a bit larger.
lr, weight_decay = 1e-3, 1e-2

# ---------- 4. Random seeds (so every run produces the same split) ----------
seed = 42
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(seed)
if device == "cuda":
    torch.cuda.manual_seed_all(seed)

# Local checkpoint directory of the pretrained model.
local_model_dir = r"/root/人工智能基础期末作业/medsig_model"  # TODO: change to your own path

# Train / validation image folders and the square input resolution fed to the encoder.
train_dir = r"/root/人工智能基础期末作业/data_split/train"
val_dir = r"/root/人工智能基础期末作业/data_split/val"
image_size = 448

device: cuda


In [22]:
# Source dataset root; it is listed below to discover the class folders.
RAW_DIR = Path("/root/人工智能基础期末作业/dataset2/")
OUT_DIR = Path("/root/人工智能基础期末作业/data_split")    # the train/val split is written under this folder

# Raw directory listing: every entry is returned, including non-directory
# files (this cell's output shows a stray ".DS_Store").
classes = os.listdir(RAW_DIR)
classes

['class 0', 'class 1', 'class 2', 'class 3', '.DS_Store']

In [23]:
# Keep only the entries of RAW_DIR that are directories (one per class);
# stray files in the dataset root are dropped.
classes = list(filter(lambda name: (RAW_DIR / name).is_dir(), os.listdir(RAW_DIR)))
print("发现的类别：", classes)

# Make sure OUT_DIR/<phase>/<class> exists for every class and both phases.
for cls in classes:
    for phase in ["train", "val"]:
        (OUT_DIR / phase / cls).mkdir(parents=True, exist_ok=True)


发现的类别： ['class 0', 'class 1', 'class 2', 'class 3']


In [24]:
# train_ratio = 0.8   # 训练集占 80%
# val_ratio   = 0.2   # 验证集占 20%，train_ratio + val_ratio 应该 = 1
# device = "cuda" if torch.cuda.is_available() else "cpu"
# print("使用设备:", device)


# for cls in classes:
#     src_dir = RAW_DIR / cls
#     files = [f for f in os.listdir(src_dir)
#              if f.lower().endswith((".png", ".jpg", ".jpeg"))]

#     random.shuffle(files)
#     n = len(files)
#     n_train = int(n * train_ratio)
#     # val 集就是剩下的
#     train_files = files[:n_train]
#     val_files   = files[n_train:]

#     print(f"{cls}: 总数 {n}, 训练 {len(train_files)}, 验证 {len(val_files)}")

#     # 复制到目标文件夹（想省空间可以用 shutil.move）
#     for fname in train_files:
#         shutil.copy(src_dir / fname, OUT_DIR / "train" / cls / fname)
#     for fname in val_files:
#         shutil.copy(src_dir / fname, OUT_DIR / "val" / cls / fname)

# print("划分完成，已保存到", OUT_DIR)


In [25]:
# %% Load MedSigLIP from the local directory and pull the vision tower out of the multimodal model
print("Loading base model from local dir:", local_model_dir)

raw_model = AutoModel.from_pretrained(local_model_dir, local_files_only=True)
print("raw_model class:", type(raw_model))

# ---- Key step: keep only the vision encoder ----
if not (hasattr(raw_model, "vision_model") or hasattr(raw_model, "get_image_features")):
    raise RuntimeError(
        "在 raw_model 里找不到 vision_model 或 get_image_features，"
        "请 print(raw_model) 看看结构，然后再定位视觉塔。"
    )

if hasattr(raw_model, "vision_model"):
    # Preferred: a dedicated `vision_model` sub-module.
    img_encoder = raw_model.vision_model
    print("Use raw_model.vision_model as image encoder.")
else:
    # Fallback: the model itself exposes get_image_features.
    img_encoder = raw_model
    print("Use raw_model itself as image encoder (get_image_features).")

img_encoder.to(device)
img_encoder.eval()

# ---- Probe the embedding width with a dummy forward pass ----
with torch.no_grad():
    dummy = torch.zeros(1, 3, image_size, image_size).to(device)   # [1, 3, image_size, image_size]

    try:
        out = img_encoder(pixel_values=dummy)   # keyword call first
    except TypeError:
        out = img_encoder(dummy)                # retry positionally

    # Feature priority: image_embeds > pooler_output > mean of last_hidden_state > raw tensor.
    for _field in ("image_embeds", "pooler_output"):
        if hasattr(out, _field):
            feats = getattr(out, _field)
            break
    else:
        if hasattr(out, "last_hidden_state"):
            feats = out.last_hidden_state.mean(dim=1)   # average over dim 1
        elif isinstance(out, torch.Tensor):
            feats = out
        else:
            print("Unknown output type:", type(out))
            print(out)
            raise RuntimeError(
                "无法从 img_encoder 的输出中找到特征，请 print(out) 再调整逻辑。"
            )

# The last dimension of the selected features is the embedding width.
embed_dim = feats.shape[-1]
print("image embed dim:", embed_dim)


# ---- Classification wrapper: vision tower + linear head ----
class MedSigVisionClassifier(nn.Module):
    """Image classifier made of an image encoder followed by one linear layer.

    Args:
        img_encoder: module called with the image batch (keyword
            ``pixel_values`` first, positional as a fallback).
        embed_dim: size of the feature vector fed to the head.
        num_classes: number of output logits.
    """

    def __init__(self, img_encoder, embed_dim, num_classes):
        super().__init__()
        self.encoder = img_encoder
        self.head = nn.Linear(embed_dim, num_classes)

    @staticmethod
    def _select_features(encoded):
        """Pick the feature tensor out of whatever the encoder returned.

        Priority: ``image_embeds``, ``pooler_output``, mean over dim 1 of
        ``last_hidden_state``, or the output itself when it is a tensor.

        Raises:
            RuntimeError: if none of the above applies.
        """
        if hasattr(encoded, "image_embeds"):
            return encoded.image_embeds
        if hasattr(encoded, "pooler_output"):
            return encoded.pooler_output
        if hasattr(encoded, "last_hidden_state"):
            return encoded.last_hidden_state.mean(dim=1)
        if isinstance(encoded, torch.Tensor):
            return encoded
        raise RuntimeError("encoder 输出里找不到特征，需根据实际结构单独处理。")

    def forward(self, pixel_values):
        """Return class logits for a batch of images.

        Args:
            pixel_values: image batch passed straight to the encoder.

        Returns:
            Tensor of logits with ``num_classes`` entries in the last dimension.
        """
        try:
            encoded = self.encoder(pixel_values=pixel_values)
        except TypeError:
            # Encoder does not accept the keyword; retry positionally.
            encoded = self.encoder(pixel_values)

        feats = self._select_features(encoded)

        # L2-normalise the features (CLIP-style) before the linear head;
        # the epsilon guards against division by zero.
        feats = feats / (feats.norm(dim=-1, keepdim=True) + 1e-6)
        return self.head(feats)


# ==== Build the classifier and wrap it in DataParallel when several GPUs exist ====
model = MedSigVisionClassifier(img_encoder, embed_dim, 4)

if torch.cuda.device_count() > 1:
    print("使用", torch.cuda.device_count(), "张 GPU 进行 DataParallel")
    model = nn.DataParallel(model)   # splits each batch across the GPUs

model = model.to(device)

# DataParallel wraps the real network; reach through the wrapper when present.
if isinstance(model, nn.DataParallel):
    core = model.module
else:
    core = model

# Baseline setup: freeze the vision tower and train the head only.
core.encoder.requires_grad_(False)
core.head.requires_grad_(True)

# Count parameters on the unwrapped model.
total_params = 0
trainable_params = 0
for _param in core.parameters():
    total_params += _param.numel()
    if _param.requires_grad:
        trainable_params += _param.numel()
print(f"总参数量: {total_params:,}")
print(f"当前可训练参数量(仅 head): {trainable_params:,}")


Loading base model from local dir: /root/人工智能基础期末作业/medsig_model
raw_model class: <class 'transformers.models.siglip.modeling_siglip.SiglipModel'>
Use raw_model.vision_model as image encoder.
image embed dim: 1152
使用 2 张 GPU 进行 DataParallel
总参数量: 428,570,052
当前可训练参数量(仅 head): 4,612


In [26]:
# %% DataLoaders with a hand-written image_size x image_size transform pipeline
# Grayscale medical images -> 3 channels, resize, normalise.

mean = [0.5] * 3
std  = [0.5] * 3


def _build_transform(augment):
    """Compose the preprocessing pipeline; `augment` adds random flip and rotation."""
    steps = [
        transforms.Grayscale(num_output_channels=3),
        transforms.Resize((image_size, image_size)),
    ]
    if augment:
        steps += [
            transforms.RandomHorizontalFlip(),
            transforms.RandomRotation(10),
        ]
    steps += [
        transforms.ToTensor(),
        transforms.Normalize(mean=mean, std=std),
    ]
    return transforms.Compose(steps)


train_transform = _build_transform(augment=True)
val_transform = _build_transform(augment=False)

train_dataset = ImageFolder(train_dir, transform=train_transform)
val_dataset   = ImageFolder(val_dir,   transform=val_transform)

print("Classes:", train_dataset.classes)
print("train samples:", len(train_dataset))
print("val samples:", len(val_dataset))

# Options shared by both loaders; only the shuffling differs.
_loader_opts = dict(batch_size=batch_size, num_workers=num_workers, pin_memory=True)

train_loader = DataLoader(train_dataset, shuffle=True, **_loader_opts)
val_loader = DataLoader(val_dataset, shuffle=False, **_loader_opts)


Classes: ['class 0', 'class 1', 'class 2', 'class 3']
train samples: 5841
val samples: 1462


In [27]:
import torch.nn as nn

# ==== Reach the real model behind an optional DataParallel wrapper ====
core = model.module if isinstance(model, nn.DataParallel) else model

# ===== Number of leading encoder blocks to keep frozen =====
# Blocks [0, K) stay frozen; blocks [K, len(blocks)) are fine-tuned.
# With 27 blocks and K = 24 this trains the LAST 3 blocks (27 - 24 = 3).
K = 24

# 1. Freeze the whole encoder first. Everything in the vision tower that is
#    not explicitly unfrozen below therefore stays frozen.
for p in core.encoder.parameters():
    p.requires_grad = False

# 2. Grab the list of transformer blocks (note: encoder.encoder.layers).
blocks = core.encoder.encoder.layers
print("block 数量:", len(blocks))      # expected: 27

# 3. Unfreeze block K and every block after it. The first K blocks need no
#    extra handling because step 1 already froze them.
for block in blocks[K:]:
    for p in block.parameters():
        p.requires_grad = True

# 4. The classification head must always be trainable.
for p in core.head.parameters():
    p.requires_grad = True

# 5. Report parameter counts on the unwrapped model.
total_params = sum(p.numel() for p in core.parameters())
trainable_params = sum(p.numel() for p in core.parameters() if p.requires_grad)
print(f"总参数量: {total_params:,}")
print(f"当前可训练参数量: {trainable_params:,}")

# 6. Build the optimizer only after the freeze/unfreeze flags are final, so it
#    receives exactly the trainable parameters.
optimizer = torch.optim.AdamW(
    [p for p in model.parameters() if p.requires_grad],  # `model` works with or without DataParallel
    lr=1e-4,
    weight_decay=1e-2,
)


block 数量: 27
总参数量: 428,570,052
当前可训练参数量: 45,723,124


In [28]:
# %% Loss and LR schedule (the train / validation functions follow below)
criterion = nn.CrossEntropyLoss()

# The optimizer is deliberately NOT rebuilt here. The freeze/unfreeze cell
# above already creates `optimizer` (AdamW, lr=1e-4, weight_decay=1e-2) over
# exactly the parameters it left trainable. Constructing a second AdamW here
# with the head-only `lr` (1e-3) silently replaced that optimizer and applied
# the larger learning rate to the unfrozen transformer blocks.

# Cosine decay of the learning rate over `num_epochs` scheduler steps.
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
    optimizer,
    T_max=num_epochs,
)


def train_one_epoch(epoch: int):
    """Run one optimisation pass over ``train_loader``.

    Uses the notebook-level ``model``, ``optimizer``, ``criterion`` and
    ``device``.

    Args:
        epoch: epoch number, used only in the progress-bar label.

    Returns:
        ``(mean_loss, accuracy)`` over all samples seen in this pass.
    """
    model.train()
    loss_sum, n_correct, n_seen = 0.0, 0, 0

    progress = tqdm(train_loader, desc=f"Epoch {epoch} [train]")
    for batch_imgs, batch_labels in progress:
        batch_imgs = batch_imgs.to(device, non_blocking=True)
        batch_labels = batch_labels.to(device, non_blocking=True)

        # Standard step: clear gradients, forward, backward, update.
        optimizer.zero_grad()
        logits = model(batch_imgs)
        loss = criterion(logits, batch_labels)
        loss.backward()
        optimizer.step()

        # Weight the batch loss by the batch size so the running value is a
        # per-sample average.
        loss_sum += loss.item() * batch_imgs.size(0)
        preds = torch.max(logits, dim=1)[1]
        n_correct += (preds == batch_labels).sum().item()
        n_seen += batch_labels.size(0)

        progress.set_postfix({
            "loss": f"{loss_sum/n_seen:.4f}",
            "acc":  f"{n_correct/n_seen:.4f}",
        })

    return loss_sum / n_seen, n_correct / n_seen


@torch.no_grad()
def eval_one_epoch(epoch: int):
    """Evaluate the notebook-level ``model`` on ``val_loader`` without gradients.

    Args:
        epoch: epoch number, used only in the progress-bar label.

    Returns:
        ``(mean_loss, accuracy)`` over all validation samples.
    """
    model.eval()
    loss_sum, n_correct, n_seen = 0.0, 0, 0

    progress = tqdm(val_loader, desc=f"Epoch {epoch} [val]  ")
    for batch_imgs, batch_labels in progress:
        batch_imgs = batch_imgs.to(device, non_blocking=True)
        batch_labels = batch_labels.to(device, non_blocking=True)

        logits = model(batch_imgs)
        loss = criterion(logits, batch_labels)

        # Per-sample weighting keeps the running average exact even when the
        # last batch is smaller.
        loss_sum += loss.item() * batch_imgs.size(0)
        preds = torch.max(logits, dim=1)[1]
        n_correct += (preds == batch_labels).sum().item()
        n_seen += batch_labels.size(0)

        progress.set_postfix({
            "loss": f"{loss_sum/n_seen:.4f}",
            "acc":  f"{n_correct/n_seen:.4f}",
        })

    return loss_sum / n_seen, n_correct / n_seen


In [29]:
# %% Main training loop: bookkeeping state
# Per-epoch metric lines are appended to this log file by the training loop.
log_path = "trainlog_23.log"
# Best validation accuracy seen so far and the weights that produced it.
best_val_acc = 0.0
best_state_dict = None

In [30]:
# Release cached, unused GPU memory held by PyTorch's allocator before training starts.
torch.cuda.empty_cache()

In [31]:
# Train for `num_epochs` epochs: log every epoch and remember the weights of
# the epoch with the best validation accuracy.
for epoch in range(1, num_epochs + 1):
    train_loss, train_acc = train_one_epoch(epoch)
    val_loss, val_acc     = eval_one_epoch(epoch)
    line = (
        f"[Epoch {epoch}] "
        f"train_loss={train_loss:.4f} train_acc={train_acc:.4f} | "
        f"val_loss={val_loss:.4f} val_acc={val_acc:.4f}"
    )

    scheduler.step()

    print(line)

    with open(log_path, "a", encoding="utf-8") as f:
        f.write(line + "\n")

    if val_acc > best_val_acc:
        best_val_acc = val_acc
        # state_dict() returns tensors that share storage with the live
        # parameters, so keeping the dict itself would let later epochs
        # overwrite the "best" weights. Take an independent CPU copy.
        # Snapshot the unwrapped model so the keys carry no "module." prefix
        # and load directly into a plain MedSigVisionClassifier.
        core = model.module if isinstance(model, nn.DataParallel) else model
        best_state_dict = {
            name: tensor.detach().cpu().clone()
            for name, tensor in core.state_dict().items()
        }

print("Best val_acc:", best_val_acc)

# Persist the best snapshot; the accuracy is embedded in the file name.
if best_state_dict is not None:
    save_path = f"./medsiglip448_cls_best_acc{best_val_acc:.4f}.pth"
    torch.save(best_state_dict, save_path)
    print("Saved best model to:", save_path)

Epoch 1 [train]:   0%|          | 0/92 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [19]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report

def eval_and_confmat(model, loader, device, class_names):
    """Evaluate ``model`` on ``loader`` and show confusion matrices.

    Plots the count matrix and the row-normalised matrix, then prints the
    per-class precision / recall / F1 report.

    Args:
        model: classifier returning logits with classes on dim 1.
        loader: iterable yielding ``(inputs, labels)`` batches.
        device: device the batches are moved to before the forward pass.
        class_names: display names; label ids are ``0 .. len(class_names) - 1``.

    Returns:
        ``(cm, cm_norm)``: the count and the row-normalised confusion matrices.
    """
    model.eval()
    true_batches, pred_batches = [], []
    with torch.no_grad():
        for inputs, targets in loader:
            inputs = inputs.to(device)
            targets = targets.to(device)
            batch_pred = model(inputs).argmax(dim=1)
            true_batches.append(targets.cpu().numpy())
            pred_batches.append(batch_pred.cpu().numpy())

    y_true = np.concatenate(true_batches)
    y_pred = np.concatenate(pred_batches)
    label_ids = list(range(len(class_names)))

    def _show(matrix, value_format, title):
        """Render one confusion matrix with the shared colour map."""
        display = ConfusionMatrixDisplay(confusion_matrix=matrix, display_labels=class_names)
        display.plot(cmap="Blues", values_format=value_format)
        plt.title(title)
        plt.show()

    # 1) Raw confusion matrix (counts).
    cm = confusion_matrix(y_true, y_pred, labels=label_ids)
    _show(cm, "d", "Confusion Matrix (counts)")

    # 2) Row-normalised matrix: each row sums to 1, so the diagonal is per-class recall.
    cm_norm = confusion_matrix(y_true, y_pred, labels=label_ids, normalize="true")
    _show(cm_norm, ".2f", "Confusion Matrix (normalized by true class)")

    # 3) Per-class precision / recall / F1.
    print(classification_report(y_true, y_pred, target_names=class_names, digits=4))

    return cm, cm_norm


In [None]:
# Unwrap DataParallel (if present) so the saved keys belong to the bare model.
if isinstance(model, nn.DataParallel):
    core = model.module
else:
    core = model

save_path = "medsig_best_epoch_V2.pth"   # arbitrary file name

# Write the model's current weights to disk.
torch.save(core.state_dict(), save_path)
print("已保存当前这一轮的参数到:", save_path)


In [None]:
# Continue training for a few extra epochs on top of the first run.
log_path = "trainlog_23.log"
torch.cuda.empty_cache()
epochs_append = 10

# The first run labelled its epochs 1..num_epochs, so numbering resumes at
# num_epochs + 1 (starting at num_epochs would log "Epoch 30" twice).
first_extra_epoch = num_epochs + 1

# NOTE(review): `scheduler` was built with T_max=num_epochs. After num_epochs
# steps a cosine schedule sits at its minimum learning rate, and stepping it
# further moves the rate back up. Confirm this is the schedule you want for
# the extra epochs.
for epoch in range(first_extra_epoch, first_extra_epoch + epochs_append):
    train_loss, train_acc = train_one_epoch(epoch)
    val_loss, val_acc     = eval_one_epoch(epoch)
    line = (
        f"[Epoch {epoch}] "
        f"train_loss={train_loss:.4f} train_acc={train_acc:.4f} | "
        f"val_loss={val_loss:.4f} val_acc={val_acc:.4f}"
    )

    scheduler.step()

    print(line)

    with open(log_path, "a", encoding="utf-8") as f:
        f.write(line + "\n")

    if val_acc > best_val_acc:
        best_val_acc = val_acc
        # state_dict() tensors share storage with the live parameters; take an
        # independent CPU copy so later epochs cannot overwrite the snapshot.
        # Use the unwrapped model so keys carry no "module." prefix.
        core = model.module if isinstance(model, nn.DataParallel) else model
        best_state_dict = {
            name: tensor.detach().cpu().clone()
            for name, tensor in core.state_dict().items()
        }

print("Best val_acc:", best_val_acc)

# Persist the best snapshot; the accuracy is embedded in the file name.
if best_state_dict is not None:
    save_path = f"./medsiglip448_cls_best_acc{best_val_acc:.4f}.pth"
    torch.save(best_state_dict, save_path)
    print("Saved best model to:", save_path)


In [None]:
import torch
import torch.nn as nn

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 1. Build a model with the same structure as the one that was trained --------
# Use your own model class and arguments, e.g.:
# model_core = MedSigVisionClassifier(img_encoder, embed_dim, num_classes)
torch.cuda.empty_cache()
model_core = MedSigVisionClassifier(img_encoder, embed_dim, 4)  # same arguments as at training time
model_core = model_core.to(device)

# 2. Read the checkpoint (a state_dict) ---------------------------------------
state_dict_path = "medsiglip448_cls_best_acc0.9770.pth"   # change to your own file name
state_dict = torch.load(state_dict_path, map_location=device)

# Checkpoints saved from an nn.DataParallel wrapper prefix every key with
# "module.", which a bare MedSigVisionClassifier rejects. Strip the prefix
# only when every key has it, so plain checkpoints pass through untouched.
dp_prefix = "module."
if state_dict and all(key.startswith(dp_prefix) for key in state_dict):
    state_dict = {key[len(dp_prefix):]: value for key, value in state_dict.items()}

# 3. Load the weights into the model ------------------------------------------
model_core.load_state_dict(state_dict)

# 4. Optionally wrap in DataParallel to run on several GPUs
if torch.cuda.device_count() > 1:
    model = nn.DataParallel(model_core)
else:
    model = model_core

model.eval()
print("✅ 模型加载完毕，可以直接用 model 做推理/验证")


In [18]:
@torch.no_grad()
def eval_once(loader):
    """Print the accuracy of the notebook-level ``model`` over ``loader``.

    Args:
        loader: iterable yielding ``(images, labels)`` batches.
    """
    model.eval()
    hits, seen = 0, 0
    for batch_imgs, batch_labels in loader:
        batch_imgs = batch_imgs.to(device)
        batch_labels = batch_labels.to(device)

        predicted = model(batch_imgs).argmax(dim=1)
        hits += (predicted == batch_labels).sum().item()
        seen += batch_labels.size(0)
    print("acc =", hits / seen)

# For example, run it on val_loader (or a test_loader).
eval_once(val_loader)


acc = 0.9762084592145015
