In [1]:
# wsi_stage_baseline.py
import os, sys, math
from pathlib import Path
import numpy as np
import pandas as pd
import openslide
from skimage import color, filters, morphology, measure, exposure
from skimage.color import hed_from_rgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier

# ===== Config: set this to your data root (the folder holding the "stage 1..4 train/test" directories) =====
ROOT = r"C:\Users\mxjli\Desktop\image"

# ===== 工具函数 =====
def read_best_level_thumbnail(slide_path, min_dim=3000):
    """Read a downsampled thumbnail of a whole-slide image with OpenSlide.

    The full-resolution level is never loaded; OpenSlide is asked for a
    thumbnail whose longer side is about ``min_dim`` pixels.

    Args:
        slide_path: Path of a slide file that OpenSlide can open.
        min_dim: Target length (in pixels) of the thumbnail's longer side.

    Returns:
        The thumbnail converted to a NumPy array (uint8 RGB).
    """
    slide = openslide.OpenSlide(slide_path)
    try:
        w, h = slide.dimensions
        scale = max(w, h) / float(min_dim)
        # Clamp to >= 1 px so an extreme aspect ratio cannot request a
        # zero-sized thumbnail.
        new_w = max(1, int(w / scale))
        new_h = max(1, int(h / scale))
        img = slide.get_thumbnail((new_w, new_h))  # PIL image
    finally:
        # Release the file handle even when reading the thumbnail fails.
        slide.close()
    return np.asarray(img)

def tissue_mask(rgb):
    """Build a rough tissue mask by discarding the bright slide background.

    Pixels whose HSV value channel lies below 98% of the Otsu threshold are
    kept; holes and connected components smaller than 256 px are cleaned up.
    """
    brightness = color.rgb2hsv(rgb)[..., 2]
    cutoff = filters.threshold_otsu(brightness) * 0.98
    foreground = brightness < cutoff  # darker than the white background
    foreground = morphology.remove_small_holes(foreground, 256)
    return morphology.remove_small_objects(foreground, 256)

def he_nuclei_mask(rgb, tissue):
    """Segment strongly hematoxylin-stained pixels inside the tissue mask.

    The image is colour-deconvolved with ``hed_from_rgb``; the hematoxylin
    channel is contrast-stretched between the 2nd and 98th percentile of its
    tissue values and thresholded with Otsu's method (tissue pixels only).

    Returns:
        (nuc, H): the cleaned boolean mask and the rescaled hematoxylin channel.
    """
    # skimage expects a float image in [0, 1]
    unit_rgb = np.clip(rgb / 255.0, 0, 1)
    hema = color.separate_stains(unit_rgb, hed_from_rgb)[..., 0]

    low = np.percentile(hema[tissue], 2)
    high = np.percentile(hema[tissue], 98)
    hema = exposure.rescale_intensity(hema, in_range=(low, high))

    # Larger values mean darker staining; keep tissue pixels above Otsu.
    in_tissue = hema[tissue]
    cutoff = filters.threshold_otsu(in_tissue)
    nuclei = np.zeros_like(hema, dtype=bool)
    nuclei[tissue] = in_tissue > cutoff

    # Morphological clean-up
    nuclei = morphology.remove_small_objects(nuclei, 64)
    nuclei = morphology.binary_opening(nuclei, morphology.disk(2))
    return nuclei, hema

def wsi_features(svs_path):
    """Compute slide-level hand-crafted features from a thumbnail.

    Returns:
        None when fewer than 5000 thumbnail pixels are tissue; otherwise a dict
        with ``tumor_frac`` (stained-mask pixels / tissue pixels),
        ``largest_cc_frac`` (largest connected component / tissue pixels),
        ``cc_count_per_mpx`` (components per megapixel of thumbnail) and the
        mean/std of the hematoxylin channel inside tissue.
    """
    thumb = read_best_level_thumbnail(svs_path)
    tissue = tissue_mask(thumb)
    tissue_area = tissue.sum()
    if tissue_area < 5000:  # too little tissue to analyse
        return None

    stained, h_channel = he_nuclei_mask(thumb, tissue)

    # Connected components of the stained mask
    regions = measure.regionprops(measure.label(stained))
    biggest = max((region.area for region in regions), default=0)
    megapixels = thumb.shape[0] * thumb.shape[1] / 1_000_000.0

    h_in_tissue = h_channel[tissue]
    return {
        "tumor_frac": stained.sum() / tissue_area,
        "largest_cc_frac": biggest / tissue_area,
        "cc_count_per_mpx": len(regions) / megapixels,
        "h_mean": float(h_in_tissue.mean()),
        "h_std": float(h_in_tissue.std()),
    }

def scan_split(split_dir):
    """Return the feature table for every stage folder of one split.

    The split is inferred from ``split_dir``: "train" when the path text
    contains "train" (case-insensitive), otherwise "test". For each of
    "stage 1".."stage 4" the folder "<stage> <split>" is looked up first
    inside ``split_dir`` and then in its parent, which covers layouts where
    the stage folders sit directly in the root directory.

    Args:
        split_dir: ``str`` or ``Path`` naming the split directory.

    Returns:
        pandas.DataFrame with one row per ``*.svs`` / ``*.tif`` slide for which
        ``wsi_features`` returned features, plus ``path``, ``label`` (0..3) and
        ``stage`` columns. Empty when no stage folder exists.
    """
    split_path = Path(split_dir)
    # str() so that Path arguments work too (Path has no .lower()).
    split = "train" if "train" in str(split_dir).lower() else "test"

    rows = []
    for stage_name in ["stage 1", "stage 2", "stage 3", "stage 4"]:
        folder_name = f"{stage_name} {split}"
        candidates = (split_path / folder_name, split_path.parent / folder_name)
        d = next((c for c in candidates if c.exists()), None)
        if d is None:
            continue

        label = int(stage_name.split()[1]) - 1  # 0..3
        for p in list(d.glob("*.svs")) + list(d.glob("*.tif")):
            feats = wsi_features(str(p))
            if feats is None:
                continue
            feats.update(dict(path=str(p), label=label, stage=stage_name))
            rows.append(feats)
    return pd.DataFrame(rows)

def load_dataset(root):
    """Collect train and test features for the side-by-side folder layout.

    ``root`` is expected to hold folders such as "stage 1 train" and
    "stage 1 test" next to each other. Each split is scanned once through
    ``scan_split``.

    Args:
        root: Directory containing the "stage <n> train/test" folders.

    Returns:
        pandas.DataFrame with the rows of both splits and an added ``split``
        column ("train" / "test"); an empty DataFrame when nothing was found.
    """
    frames = []
    for split in ("train", "test"):
        # scan_split infers the split from the path text and falls back to the
        # parent directory, so "<root>/<split>" itself does not have to exist.
        # A str is passed because scan_split calls .lower() on its argument.
        # NOTE: a root path that itself contains "train" makes scan_split treat
        # both calls as the train split.
        part = scan_split(str(Path(root) / split))
        if part.empty:
            continue
        part["split"] = split
        frames.append(part)
    if not frames:
        # pd.concat raises on an empty list; return an empty table instead.
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

# ===== 主流程：提特征 -> 训练 -> 评估 =====
if __name__ == "__main__":
    feature_cols = ["tumor_frac", "largest_cc_frac", "cc_count_per_mpx", "h_mean", "h_std"]

    # Extract features for every slide in the "stage <n> <split>" folders
    # that sit directly under ROOT.
    rows = []
    for stage in [1, 2, 3, 4]:
        for split in ["train", "test"]:
            d = Path(ROOT) / f"stage {stage} {split}"
            if not d.exists():
                continue
            for p in list(d.glob("*.svs")) + list(d.glob("*.tif")):
                feats = wsi_features(str(p))
                if feats is None:
                    continue
                feats.update(dict(path=str(p), label=stage - 1, split=split))
                rows.append(feats)
    df = pd.DataFrame(rows)
    print("Feature rows:", df.shape)
    if df.empty:
        # An empty table has no "split" column; stop with a clear message
        # instead of an opaque KeyError further down.
        raise SystemExit(f"No usable slides found under {ROOT}")
    df.to_csv("wsi_stage_features.csv", index=False)

    # Train on the train rows and evaluate on the test rows; without any test
    # rows fall back to a stratified random split.
    if "test" in df["split"].unique():
        train_df = df[df["split"] == "train"].copy()
        test_df = df[df["split"] == "test"].copy()
    else:
        train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df["label"])

    Xtr = train_df[feature_cols].values
    ytr = train_df["label"].values
    Xte = test_df[feature_cols].values
    yte = test_df["label"].values

    clf = RandomForestClassifier(n_estimators=300, max_depth=None, random_state=0, n_jobs=-1)
    clf.fit(Xtr, ytr)
    ypr = clf.predict(Xte)

    print("\nConfusion matrix:\n", confusion_matrix(yte, ypr))
    # zero_division=0 reports 0 for classes that were never predicted instead
    # of emitting a warning.
    print("\nReport:\n", classification_report(yte, ypr, digits=4, zero_division=0))

KeyboardInterrupt: 

In [5]:
# wsi_stage_baseline_mpp.py
import os
from pathlib import Path
import numpy as np
import pandas as pd
import openslide
from skimage import color, filters, morphology, measure, exposure
from skimage.color import hed_from_rgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

# ===== Path: set this to your data root (holds the "stage 1..4 train/test" folders) =====
ROOT = r"C:\Users\mxjli\Desktop\image"

# When True, slides without mpp metadata are skipped; when False they are kept with only the ratio features (physical-area features become NaN)
REQUIRE_MPP = True

def read_thumbnail_and_geometry(svs_path, min_dim=3000):
    """Read a thumbnail plus the geometry needed to convert it to physical units.

    Args:
        svs_path: Path of a slide file that OpenSlide can open.
        min_dim: Target length (in pixels) of the thumbnail's longer side.

    Returns:
        (rgb, scale, mpp_x, mpp_y) where ``rgb`` is the thumbnail as a NumPy
        array, ``scale`` is the thumbnail-pixel -> level-0-pixel ratio and
        ``mpp_x`` / ``mpp_y`` are microns per level-0 pixel, or None when the
        slide does not carry the standard OpenSlide mpp properties.
    """
    slide = openslide.OpenSlide(svs_path)
    try:
        W0, H0 = slide.dimensions
        # Thumbnail px -> level-0 px ratio. Clamped at 1 because the thumbnail
        # is only ever shrunk (PIL's thumbnail() does not enlarge), so a slide
        # smaller than min_dim comes back at its native size.
        scale = max(max(W0, H0) / float(min_dim), 1.0)
        # At least 1 px per side so extreme aspect ratios stay valid.
        new_w, new_h = max(1, int(W0 / scale)), max(1, int(H0 / scale))
        img = slide.get_thumbnail((new_w, new_h))
        # mpp (microns per pixel): vendors use different fields; only the
        # standard OpenSlide properties are read here.
        mpp_x = slide.properties.get("openslide.mpp-x", None)
        mpp_y = slide.properties.get("openslide.mpp-y", None)
    finally:
        # Release the file handle even when reading fails.
        slide.close()
    mpp_x = float(mpp_x) if mpp_x not in (None, "") else None
    mpp_y = float(mpp_y) if mpp_y not in (None, "") else None
    return np.asarray(img), scale, mpp_x, mpp_y

def tissue_mask(rgb):
    """Build a rough tissue mask by discarding the bright slide background.

    Pixels whose HSV value channel lies below 98% of the Otsu threshold are
    kept; holes and connected components smaller than 256 px are cleaned up.
    """
    brightness = color.rgb2hsv(rgb)[..., 2]
    cutoff = filters.threshold_otsu(brightness) * 0.98
    foreground = brightness < cutoff
    foreground = morphology.remove_small_holes(foreground, 256)
    return morphology.remove_small_objects(foreground, 256)

def he_nuclei_mask(rgb, tissue):
    """Segment strongly hematoxylin-stained pixels inside the tissue mask.

    The hematoxylin channel from ``hed_from_rgb`` colour deconvolution is
    stretched between the 2nd and 98th percentile of its tissue values, then
    thresholded with Otsu's method computed on tissue pixels only.

    Returns:
        (nuc, H): the cleaned boolean mask and the rescaled hematoxylin channel.
    """
    unit_rgb = np.clip(rgb / 255.0, 0, 1)
    hema = color.separate_stains(unit_rgb, hed_from_rgb)[..., 0]

    low = np.percentile(hema[tissue], 2)
    high = np.percentile(hema[tissue], 98)
    hema = exposure.rescale_intensity(hema, in_range=(low, high))

    in_tissue = hema[tissue]
    cutoff = filters.threshold_otsu(in_tissue)
    nuclei = np.zeros_like(hema, dtype=bool)
    nuclei[tissue] = in_tissue > cutoff

    nuclei = morphology.remove_small_objects(nuclei, 64)
    nuclei = morphology.binary_opening(nuclei, morphology.disk(2))
    return nuclei, hema

def wsi_features(svs_path):
    """Compute ratio features and, when mpp is known, physical-area features.

    Returns:
        None when the slide lacks valid mpp while ``REQUIRE_MPP`` is set, or
        when fewer than 5000 thumbnail pixels are tissue. Otherwise a dict with
        the resolution-independent ratios (``tumor_frac``, ``largest_cc_frac``,
        ``h_mean``, ``h_std``), the raw component count ``cc_count_thumb`` and
        the area features in mm^2 / cm^2 (NaN with ``has_mpp=0`` when mpp is
        missing).
    """
    rgb, scale, mpp_x, mpp_y = read_thumbnail_and_geometry(svs_path)
    has_mpp = None not in (mpp_x, mpp_y) and mpp_x > 0 and mpp_y > 0
    if REQUIRE_MPP and not has_mpp:
        return None  # skip slides without usable mpp

    tissue = tissue_mask(rgb)
    tissue_px = int(tissue.sum())
    if tissue_px < 5000:
        return None
    stained, h_channel = he_nuclei_mask(rgb, tissue)

    # --- pixel counts on the thumbnail ---
    stained_px = int(stained.sum())
    regions = measure.regionprops(measure.label(stained))
    n_regions = len(regions)
    biggest_px = max((region.area for region in regions), default=0)

    # --- ratio features (independent of resolution) ---
    h_in_tissue = h_channel[tissue]
    feats = {
        "tumor_frac": stained_px / tissue_px,
        "largest_cc_frac": biggest_px / tissue_px,
        "h_mean": float(h_in_tissue.mean()),
        "h_std": float(h_in_tissue.std()),
        "cc_count_thumb": n_regions,
    }

    if not has_mpp:
        feats.update(
            tissue_area_mm2=np.nan, tumor_area_mm2=np.nan, largest_cc_mm2=np.nan,
            tissue_area_cm2=np.nan, tumor_area_cm2=np.nan, largest_cc_cm2=np.nan,
            cc_per_cm2=np.nan, has_mpp=0,
        )
        return feats

    # --- physical areas (need mpp and scale) ---
    # mpp is microns/pixel, so one level-0 pixel covers mpp_x*mpp_y um^2,
    # i.e. that value * 1e-6 in mm^2.
    px_area_mm2 = (mpp_x * mpp_y) * 1e-6
    # One thumbnail pixel stands for scale^2 level-0 pixels.
    s2 = scale * scale
    tissue_mm2 = tissue_px * s2 * px_area_mm2
    stained_mm2 = stained_px * s2 * px_area_mm2
    biggest_mm2 = biggest_px * s2 * px_area_mm2
    tissue_cm2 = tissue_mm2 / 100.0

    feats.update(
        tissue_area_mm2=tissue_mm2,
        tumor_area_mm2=stained_mm2,
        largest_cc_mm2=biggest_mm2,
        tissue_area_cm2=tissue_cm2,
        tumor_area_cm2=stained_mm2 / 100.0,
        largest_cc_cm2=biggest_mm2 / 100.0,
        # component density per cm^2 of tissue (denominator floored at 1e-6)
        cc_per_cm2=n_regions / max(tissue_cm2, 1e-6),
        has_mpp=1,
    )
    return feats

def collect_rows(root):
    """Build the feature table for all "stage <n> <split>" folders under ``root``.

    Every ``*.svs`` / ``*.tif`` file is passed to ``wsi_features``; slides for
    which it returns None are skipped. Each row also carries ``path``,
    ``label`` (stage - 1) and ``split``.
    """
    base = Path(root)
    records = []
    for stage in (1, 2, 3, 4):
        for split in ("train", "test"):
            folder = base / f"stage {stage} {split}"
            if folder.exists():
                slide_paths = [*folder.glob("*.svs"), *folder.glob("*.tif")]
                for slide_path in slide_paths:
                    record = wsi_features(str(slide_path))
                    if record is not None:
                        record.update(path=str(slide_path), label=stage - 1, split=split)
                        records.append(record)
    return pd.DataFrame(records)

if __name__ == "__main__":
    df = collect_rows(ROOT)
    print("Feature rows:", df.shape)
    if df.empty:
        # An empty table has no "has_mpp"/"split" columns; stop with a clear
        # message instead of a KeyError.
        raise SystemExit(f"No usable slides found under {ROOT}")
    df.to_csv("wsi_stage_features_mpp.csv", index=False)

    ratio_cols = ["tumor_frac", "largest_cc_frac", "h_mean", "h_std"]
    physical_cols = ["tumor_area_mm2", "largest_cc_mm2", "tissue_area_mm2", "cc_per_cm2"]

    # Prefer the samples that carry physical features; with fewer than 8 of
    # them fall back to all rows and the ratio features only.
    have_mpp = df[df["has_mpp"] == 1]
    if len(have_mpp) >= 8:
        pool, feature_cols = have_mpp, ratio_cols + physical_cols
    else:
        pool, feature_cols = df, ratio_cols

    # Use the predefined train/test folders when test rows exist, otherwise a
    # stratified random split of the pool.
    if "test" in df["split"].unique():
        train_df = pool[pool["split"] == "train"]
        test_df = pool[pool["split"] == "test"]
    else:
        train_df, test_df = train_test_split(pool, test_size=0.2, random_state=42, stratify=pool["label"])

    Xtr = train_df[feature_cols].values
    ytr = train_df["label"].values
    Xte = test_df[feature_cols].values
    yte = test_df["label"].values

    clf = RandomForestClassifier(n_estimators=400, random_state=0, n_jobs=-1)
    clf.fit(Xtr, ytr)
    ypr = clf.predict(Xte)

    print("Using features:", feature_cols)
    print("\nConfusion matrix:\n", confusion_matrix(yte, ypr))
    # zero_division=0 reports 0 for classes that were never predicted instead
    # of emitting a warning.
    print("\nReport:\n", classification_report(yte, ypr, digits=4, zero_division=0))

Feature rows: (154, 16)
Using features: ['tumor_frac', 'largest_cc_frac', 'h_mean', 'h_std', 'tumor_area_mm2', 'largest_cc_mm2', 'tissue_area_mm2', 'cc_per_cm2']

Confusion matrix:
 [[2 1 1 0]
 [3 1 0 0]
 [0 2 0 0]
 [0 2 0 0]]

Report:
               precision    recall  f1-score   support

           0     0.4000    0.5000    0.4444         4
           1     0.1667    0.2500    0.2000         4
           2     0.0000    0.0000    0.0000         2
           3     0.0000    0.0000    0.0000         2

    accuracy                         0.2500        12
   macro avg     0.1417    0.1875    0.1611        12
weighted avg     0.1889    0.2500    0.2148        12



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [11]:
# 改进后的 wsi_stage_baseline_mpp.py（增加CNN特征 + RGB直方图 + 平衡权重）
import os
from pathlib import Path
import numpy as np
import pandas as pd
import openslide
from PIL import Image
from skimage import color, filters, morphology, measure, exposure
from skimage.color import hed_from_rgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import torch
import torchvision.transforms as T
from torchvision.models import resnet18
import torch.nn as nn

# ===== Path settings =====
ROOT = r"C:\Users\mxjli\Desktop\image"
REQUIRE_MPP = True

# ===== CNN model initialisation =====
class ResNetFeatureExtractor:
    """Pretrained ResNet-18 with its final layer replaced by ``nn.Identity``.

    ``extract`` resizes a PIL image to 224x224, normalises it and returns the
    network output for that single image as a NumPy array.
    """

    def __init__(self):
        backbone = resnet18(pretrained=True)
        backbone.fc = nn.Identity()  # expose the pooled features instead of class scores
        self.model = backbone.eval()
        preprocessing_steps = [
            T.Resize((224, 224)),
            T.ToTensor(),
            T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ]
        self.transform = T.Compose(preprocessing_steps)

    def extract(self, pil_img):
        """Return the feature vector of ``pil_img`` (batch dimension removed)."""
        batch = self.transform(pil_img).unsqueeze(0)  # add batch dimension
        with torch.no_grad():
            embedding = self.model(batch)
        return embedding.squeeze(0).numpy()

cnn_extractor = ResNetFeatureExtractor()

# ===== 图像分析部分 =====
def read_thumbnail_and_geometry(svs_path, min_dim=3000):
    """Read a thumbnail (PIL and array form) plus scale and mpp metadata.

    Args:
        svs_path: Path of a slide file that OpenSlide can open.
        min_dim: Target length (in pixels) of the thumbnail's longer side.

    Returns:
        (img, rgb, scale, mpp_x, mpp_y): the PIL thumbnail, the same image as
        a NumPy array, the thumbnail-pixel -> level-0-pixel ratio, and the
        microns-per-pixel values (None when the OpenSlide mpp properties are
        missing or empty).
    """
    slide = openslide.OpenSlide(svs_path)
    try:
        W0, H0 = slide.dimensions
        # Clamped at 1: the thumbnail is only ever shrunk (PIL's thumbnail()
        # does not enlarge), so small slides come back at native size.
        scale = max(max(W0, H0) / float(min_dim), 1.0)
        # At least 1 px per side so extreme aspect ratios stay valid.
        new_w, new_h = max(1, int(W0 / scale)), max(1, int(H0 / scale))
        img = slide.get_thumbnail((new_w, new_h))
        mpp_x = slide.properties.get("openslide.mpp-x", None)
        mpp_y = slide.properties.get("openslide.mpp-y", None)
    finally:
        # Release the file handle even when reading fails.
        slide.close()
    mpp_x = float(mpp_x) if mpp_x not in (None, "") else None
    mpp_y = float(mpp_y) if mpp_y not in (None, "") else None
    return img, np.asarray(img), scale, mpp_x, mpp_y

def tissue_mask(rgb):
    """Build a rough tissue mask by discarding the bright slide background.

    Pixels whose HSV value channel lies below 98% of the Otsu threshold are
    kept; holes and connected components smaller than 256 px are cleaned up.
    """
    brightness = color.rgb2hsv(rgb)[..., 2]
    cutoff = filters.threshold_otsu(brightness) * 0.98
    foreground = brightness < cutoff
    foreground = morphology.remove_small_holes(foreground, 256)
    return morphology.remove_small_objects(foreground, 256)

def he_nuclei_mask(rgb, tissue):
    """Segment strongly hematoxylin-stained pixels inside the tissue mask.

    The hematoxylin channel from ``hed_from_rgb`` colour deconvolution is
    stretched between the 2nd and 98th percentile of its tissue values, then
    thresholded with Otsu's method computed on tissue pixels only.

    Returns:
        (nuc, H): the cleaned boolean mask and the rescaled hematoxylin channel.
    """
    unit_rgb = np.clip(rgb / 255.0, 0, 1)
    hema = color.separate_stains(unit_rgb, hed_from_rgb)[..., 0]

    low = np.percentile(hema[tissue], 2)
    high = np.percentile(hema[tissue], 98)
    hema = exposure.rescale_intensity(hema, in_range=(low, high))

    in_tissue = hema[tissue]
    cutoff = filters.threshold_otsu(in_tissue)
    nuclei = np.zeros_like(hema, dtype=bool)
    nuclei[tissue] = in_tissue > cutoff

    nuclei = morphology.remove_small_objects(nuclei, 64)
    nuclei = morphology.binary_opening(nuclei, morphology.disk(2))
    return nuclei, hema

def rgb_hist_features(rgb):
    """Return 32-bin density histograms of the R, G and B channels.

    The result maps ``"<channel>_hist_<bin>"`` (channel in r/g/b, bin 0..31)
    to the density of that bin over the value range 0..255.
    """
    feats = {}
    for channel_idx, channel_name in enumerate("rgb"):
        density, _edges = np.histogram(
            rgb[..., channel_idx], bins=32, range=(0, 255), density=True
        )
        for bin_idx, value in enumerate(density):
            feats[f"{channel_name}_hist_{bin_idx}"] = value
    return feats

def wsi_features(svs_path):
    """Compute hand-crafted, colour-histogram and CNN features for one slide.

    Returns:
        None when the slide lacks valid mpp while ``REQUIRE_MPP`` is set, or
        when fewer than 5000 thumbnail pixels are tissue. Otherwise a dict
        holding the ratio features, ``cc_count_thumb``, the RGB histogram
        bins, the physical-area features (NaN with ``has_mpp=0`` when mpp is
        missing) and the ``cnn_<i>`` values extracted from the whole thumbnail.
    """
    pil_img, rgb, scale, mpp_x, mpp_y = read_thumbnail_and_geometry(svs_path)
    has_mpp = bool(mpp_x and mpp_y and mpp_x > 0 and mpp_y > 0)
    if REQUIRE_MPP and not has_mpp:
        return None

    tissue = tissue_mask(rgb)
    tissue_px = int(tissue.sum())
    if tissue_px < 5000:
        return None
    stained, h_channel = he_nuclei_mask(rgb, tissue)

    stained_px = int(stained.sum())
    regions = measure.regionprops(measure.label(stained))
    biggest_px = max([region.area for region in regions], default=0)
    n_regions = len(regions)
    h_in_tissue = h_channel[tissue]

    feats = {
        "tumor_frac": stained_px / tissue_px,
        "largest_cc_frac": biggest_px / tissue_px,
        "h_mean": float(h_in_tissue.mean()),
        "h_std": float(h_in_tissue.std()),
        "cc_count_thumb": n_regions,
    }
    feats.update(rgb_hist_features(rgb))

    if has_mpp:
        # mm^2 covered by one level-0 pixel; one thumbnail pixel stands for
        # scale^2 level-0 pixels.
        px_area_mm2 = (mpp_x * mpp_y) * 1e-6
        s2 = scale * scale
        tissue_mm2 = tissue_px * s2 * px_area_mm2
        stained_mm2 = stained_px * s2 * px_area_mm2
        biggest_mm2 = biggest_px * s2 * px_area_mm2
        tissue_cm2 = tissue_mm2 / 100
        feats.update(
            tissue_area_mm2=tissue_mm2,
            tumor_area_mm2=stained_mm2,
            largest_cc_mm2=biggest_mm2,
            tissue_area_cm2=tissue_cm2,
            tumor_area_cm2=stained_mm2 / 100,
            largest_cc_cm2=biggest_mm2 / 100,
            cc_per_cm2=n_regions / max(tissue_cm2, 1e-6),
            has_mpp=1,
        )
    else:
        feats.update(
            tissue_area_mm2=np.nan, tumor_area_mm2=np.nan, largest_cc_mm2=np.nan,
            tissue_area_cm2=np.nan, tumor_area_cm2=np.nan, largest_cc_cm2=np.nan,
            cc_per_cm2=np.nan, has_mpp=0,
        )

    # Append the CNN features, one column per output value.
    cnn_vector = cnn_extractor.extract(pil_img)
    for idx, value in enumerate(cnn_vector):
        feats[f"cnn_{idx}"] = value

    return feats

def collect_rows(root):
    """Build the feature table for all "stage <n> <split>" folders under ``root``.

    Every ``*.svs`` / ``*.tif`` file is passed to ``wsi_features``; slides for
    which it returns None are skipped. Each row also carries ``path``,
    ``label`` (stage - 1) and ``split``.
    """
    base = Path(root)
    records = []
    for stage in (1, 2, 3, 4):
        for split in ("train", "test"):
            folder = base / f"stage {stage} {split}"
            if folder.exists():
                slide_paths = [*folder.glob("*.svs"), *folder.glob("*.tif")]
                for slide_path in slide_paths:
                    record = wsi_features(str(slide_path))
                    if record is not None:
                        record.update(path=str(slide_path), label=stage - 1, split=split)
                        records.append(record)
    return pd.DataFrame(records)

if __name__ == "__main__":
    df = collect_rows(ROOT)
    if df.empty:
        # An empty table has no feature columns; stop with a clear message
        # instead of a KeyError on the filter below.
        raise SystemExit(f"No usable slides found under {ROOT}")
    # Drop samples with too little tissue. Rows whose area is NaN (no mpp)
    # also fail this comparison and are removed.
    df = df[df["tissue_area_cm2"] > 0.1]
    print("Feature rows:", df.shape)
    if df.empty:
        raise SystemExit("No slide has more than 0.1 cm^2 of tissue")
    df.to_csv("wsi_stage_features_enhanced.csv", index=False)

    have_mpp = df[df["has_mpp"] == 1]
    pool = have_mpp if len(have_mpp) >= 8 else df

    # Use the predefined train/test folders when test rows exist, otherwise a
    # stratified random split of the pool.
    if "test" in df["split"].unique():
        train_df = pool[pool["split"] == "train"]
        test_df = pool[pool["split"] == "test"]
    else:
        train_df, test_df = train_test_split(pool, test_size=0.2, stratify=pool["label"], random_state=42)

    # str.startswith accepts a tuple, so one call covers every feature family.
    feature_prefixes = (
        "tumor_", "largest_cc", "h_", "tissue_area", "cc_per",
        "r_hist", "g_hist", "b_hist", "cnn_",
    )
    feature_cols = [col for col in train_df.columns if col.startswith(feature_prefixes)]

    Xtr = train_df[feature_cols].values
    ytr = train_df["label"].values
    Xte = test_df[feature_cols].values
    yte = test_df["label"].values

    # The scaler is fitted on the training rows only and then applied to both.
    scaler = StandardScaler()
    Xtr = scaler.fit_transform(Xtr)
    Xte = scaler.transform(Xte)

    clf = RandomForestClassifier(n_estimators=400, class_weight="balanced", random_state=0, n_jobs=-1)
    clf.fit(Xtr, ytr)
    ypr = clf.predict(Xte)

    print("\nConfusion matrix:\n", confusion_matrix(yte, ypr))
    print("\nReport:\n", classification_report(yte, ypr, digits=4, zero_division=0))



Feature rows: (117, 624)

Confusion matrix:
 [[2 4 1 0]
 [2 4 4 0]
 [0 5 0 0]
 [0 2 0 0]]

Report:
               precision    recall  f1-score   support

           0     0.5000    0.2857    0.3636         7
           1     0.2667    0.4000    0.3200        10
           2     0.0000    0.0000    0.0000         5
           3     0.0000    0.0000    0.0000         2

    accuracy                         0.2500        24
   macro avg     0.1917    0.1714    0.1709        24
weighted avg     0.2569    0.2500    0.2394        24



In [3]:
import os
from pathlib import Path
import numpy as np
import pandas as pd
import openslide
from skimage import color, filters, morphology, measure, exposure, transform
from skimage.color import hed_from_rgb

import torch
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as T

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

# ===== Root directory =====
ROOT = r"C:\Users\mxjli\Desktop\image"
REQUIRE_MPP = True

# ===== CNN feature extractor =====
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

resnet = models.resnet18(pretrained=True)
resnet.fc = nn.Identity()  # drop the final classification layer
resnet = resnet.to(device)
resnet.eval()

# Image preprocessing: array -> PIL -> 224x224 -> tensor -> per-channel normalisation
# (mean/std are presumably the ImageNet statistics matching the pretrained weights)
transform_cnn = T.Compose([
    T.ToPILImage(),
    T.Resize((224,224)),
    T.ToTensor(),
    T.Normalize(mean=[0.485, 0.456, 0.406],
                std=[0.229, 0.224, 0.225])
])

def extract_cnn_feature(rgb_crop):
    """Run an RGB image through ``transform_cnn`` and ``resnet``.

    Returns the network output for the single image as a flat NumPy array.
    """
    batch = transform_cnn(rgb_crop).unsqueeze(0).to(device)
    with torch.no_grad():
        embedding = resnet(batch)
    return embedding.cpu().numpy().flatten()

# ===== 你的原始函数（保留不变）=====
def read_thumbnail_and_geometry(svs_path, min_dim=3000):
    """Read a thumbnail plus scale and mpp metadata of a slide.

    Args:
        svs_path: Path of a slide file that OpenSlide can open.
        min_dim: Target length (in pixels) of the thumbnail's longer side.

    Returns:
        (rgb, scale, mpp_x, mpp_y): the thumbnail as a NumPy array, the
        thumbnail-pixel -> level-0-pixel ratio, and the microns-per-pixel
        values (None when the OpenSlide mpp properties are missing or empty).
    """
    slide = openslide.OpenSlide(svs_path)
    try:
        W0, H0 = slide.dimensions
        # Clamped at 1: the thumbnail is only ever shrunk (PIL's thumbnail()
        # does not enlarge), so small slides come back at native size.
        scale = max(max(W0, H0) / float(min_dim), 1.0)
        # At least 1 px per side so extreme aspect ratios stay valid.
        new_w, new_h = max(1, int(W0 / scale)), max(1, int(H0 / scale))
        img = slide.get_thumbnail((new_w, new_h))
        mpp_x = slide.properties.get("openslide.mpp-x", None)
        mpp_y = slide.properties.get("openslide.mpp-y", None)
    finally:
        # Release the file handle even when reading fails.
        slide.close()
    mpp_x = float(mpp_x) if mpp_x not in (None, "") else None
    mpp_y = float(mpp_y) if mpp_y not in (None, "") else None
    return np.asarray(img), scale, mpp_x, mpp_y

def tissue_mask(rgb):
    """Build a rough tissue mask by discarding the bright slide background.

    Pixels whose HSV value channel lies below 98% of the Otsu threshold are
    kept; holes and connected components smaller than 256 px are cleaned up.
    """
    brightness = color.rgb2hsv(rgb)[..., 2]
    cutoff = filters.threshold_otsu(brightness) * 0.98
    foreground = brightness < cutoff
    foreground = morphology.remove_small_holes(foreground, 256)
    return morphology.remove_small_objects(foreground, 256)

def he_nuclei_mask(rgb, tissue):
    """Segment strongly hematoxylin-stained pixels inside the tissue mask.

    The hematoxylin channel from ``hed_from_rgb`` colour deconvolution is
    stretched between the 2nd and 98th percentile of its tissue values, then
    thresholded with Otsu's method computed on tissue pixels only.

    Returns:
        (nuc, H): the cleaned boolean mask and the rescaled hematoxylin channel.
    """
    unit_rgb = np.clip(rgb / 255.0, 0, 1)
    hema = color.separate_stains(unit_rgb, hed_from_rgb)[..., 0]

    low = np.percentile(hema[tissue], 2)
    high = np.percentile(hema[tissue], 98)
    hema = exposure.rescale_intensity(hema, in_range=(low, high))

    in_tissue = hema[tissue]
    cutoff = filters.threshold_otsu(in_tissue)
    nuclei = np.zeros_like(hema, dtype=bool)
    nuclei[tissue] = in_tissue > cutoff

    nuclei = morphology.remove_small_objects(nuclei, 64)
    nuclei = morphology.binary_opening(nuclei, morphology.disk(2))
    return nuclei, hema

# ===== 提取特征（手工 + CNN）=====
def wsi_features(svs_path):
    """Compute hand-crafted features and a CNN feature vector for one slide.

    Returns:
        None when the slide lacks valid mpp while ``REQUIRE_MPP`` is set, or
        when fewer than 5000 thumbnail pixels are tissue. Otherwise a tuple
        ``(feats, cnn_feat)``: ``feats`` is a dict with ``tumor_frac``,
        ``h_mean`` and ``h_std``; ``cnn_feat`` is the CNN output for the
        bounding box of the stained mask (512 zeros when the mask is empty).
    """
    rgb, scale, mpp_x, mpp_y = read_thumbnail_and_geometry(svs_path)
    has_mpp = (mpp_x is not None) and (mpp_y is not None) and (mpp_x > 0) and (mpp_y > 0)
    if REQUIRE_MPP and not has_mpp:
        return None

    mask = tissue_mask(rgb)
    if mask.sum() < 5000:
        return None
    nuc, H = he_nuclei_mask(rgb, mask)

    # Hand-crafted features
    tissue_px_thumb = int(mask.sum())
    tumor_px_thumb = int(nuc.sum())
    h_vals = H[mask]
    feats = dict(
        tumor_frac=tumor_px_thumb / tissue_px_thumb,
        h_mean=float(h_vals.mean()),
        h_std=float(h_vals.std()),
    )

    # ===== CNN features on the bounding box of the stained mask =====
    ys, xs = np.where(nuc)
    if xs.size > 0:
        # max() is the last *included* index while a slice excludes its stop,
        # hence the +1; without it the last row/column is dropped and a mask
        # confined to one row or column yields an empty crop.
        crop = rgb[ys.min():ys.max() + 1, xs.min():xs.max() + 1]
        cnn_feat = extract_cnn_feature(crop)
    else:
        # Nothing detected -> all-zero vector, float32 like the CNN output.
        cnn_feat = np.zeros(512, dtype=np.float32)

    return feats, cnn_feat

def collect_rows(root):
    """Build the feature table for all "stage <n> <split>" folders under ``root``.

    Every ``*.svs`` / ``*.tif`` file is passed to ``wsi_features``; slides for
    which it returns None are skipped. A row holds ``path``, ``label``
    (stage - 1), ``split``, the hand-crafted features and one ``cnn_<i>``
    column per CNN value.
    """
    base = Path(root)
    records = []
    for stage in (1, 2, 3, 4):
        for split in ("train", "test"):
            folder = base / f"stage {stage} {split}"
            if folder.exists():
                slide_paths = [*folder.glob("*.svs"), *folder.glob("*.tif")]
                for slide_path in slide_paths:
                    result = wsi_features(str(slide_path))
                    if result is not None:
                        handcrafted, cnn_vector = result
                        record = {"path": str(slide_path), "label": stage - 1, "split": split}
                        record.update(handcrafted)
                        # Spread the CNN vector over individual columns.
                        record.update({f"cnn_{idx}": value for idx, value in enumerate(cnn_vector)})
                        records.append(record)
    return pd.DataFrame(records)

# ===== 主程序 =====
if __name__ == "__main__":
    df = collect_rows(ROOT)
    print("Feature rows:", df.shape)
    if df.empty:
        # An empty table has no "split" column; stop with a clear message
        # instead of failing on the column lookup below.
        raise SystemExit(f"No usable slides found under {ROOT}")
    df.to_csv("wsi_stage_features_cnn.csv", index=False)

    train_df = df[df["split"] == "train"]
    test_df = df[df["split"] == "test"]
    if test_df.empty:
        # No predefined test folders: fall back to a stratified random split
        # so the evaluation below has something to predict on.
        train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df["label"])

    feature_cols = [c for c in df.columns if c.startswith("cnn_")] + ["tumor_frac", "h_mean", "h_std"]

    Xtr, ytr = train_df[feature_cols].values, train_df["label"].values
    Xte, yte = test_df[feature_cols].values, test_df["label"].values

    clf = RandomForestClassifier(n_estimators=400, random_state=0, n_jobs=-1, class_weight="balanced")
    clf.fit(Xtr, ytr)
    ypr = clf.predict(Xte)

    print("\nConfusion matrix:\n", confusion_matrix(yte, ypr))
    # zero_division=0 reports 0 for classes that were never predicted instead
    # of emitting a warning.
    print("\nReport:\n", classification_report(yte, ypr, digits=4, zero_division=0))




Feature rows: (133, 518)

Confusion matrix:
 [[1 6 0 0]
 [1 7 2 0]
 [0 6 1 0]
 [1 1 0 0]]

Report:
               precision    recall  f1-score   support

           0     0.3333    0.1429    0.2000         7
           1     0.3500    0.7000    0.4667        10
           2     0.3333    0.1429    0.2000         7
           3     0.0000    0.0000    0.0000         2

    accuracy                         0.3462        26
   macro avg     0.2542    0.2464    0.2167        26
weighted avg     0.3141    0.3462    0.2872        26



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [5]:
import os
from pathlib import Path
import numpy as np
import pandas as pd
import openslide
from skimage import color, filters, morphology, measure, exposure, transform
from skimage.color import hed_from_rgb

import torch
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as T

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

# ===== Root directory =====
ROOT = r"C:\Users\mxjli\Desktop\image"
REQUIRE_MPP = True

# ===== CNN feature extractor =====
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

resnet = models.resnet18(pretrained=True)
resnet.fc = nn.Identity()  # drop the final classification layer
resnet = resnet.to(device)
resnet.eval()

# Image preprocessing: array -> PIL -> 224x224 -> tensor -> per-channel normalisation
# (mean/std are presumably the ImageNet statistics matching the pretrained weights)
transform_cnn = T.Compose([
    T.ToPILImage(),
    T.Resize((224,224)),
    T.ToTensor(),
    T.Normalize(mean=[0.485, 0.456, 0.406],
                std=[0.229, 0.224, 0.225])
])

def extract_cnn_feature(rgb_crop):
    """Run an RGB image through ``transform_cnn`` and ``resnet``.

    Returns the network output for the single image as a flat NumPy array.
    """
    batch = transform_cnn(rgb_crop).unsqueeze(0).to(device)
    with torch.no_grad():
        embedding = resnet(batch)
    return embedding.cpu().numpy().flatten()

# ===== 读取与分割 =====
def read_thumbnail_and_geometry(svs_path, min_dim=3000):
    """Read a thumbnail plus scale and mpp metadata of a slide.

    Args:
        svs_path: Path of a slide file that OpenSlide can open.
        min_dim: Target length (in pixels) of the thumbnail's longer side.

    Returns:
        (rgb, scale, mpp_x, mpp_y): the thumbnail as a NumPy array, the
        thumbnail-pixel -> level-0-pixel ratio, and the microns-per-pixel
        values (None when the OpenSlide mpp properties are missing or empty).
    """
    slide = openslide.OpenSlide(svs_path)
    try:
        W0, H0 = slide.dimensions
        # Clamped at 1: the thumbnail is only ever shrunk (PIL's thumbnail()
        # does not enlarge), so small slides come back at native size.
        scale = max(max(W0, H0) / float(min_dim), 1.0)
        # At least 1 px per side so extreme aspect ratios stay valid.
        new_w, new_h = max(1, int(W0 / scale)), max(1, int(H0 / scale))
        img = slide.get_thumbnail((new_w, new_h))
        mpp_x = slide.properties.get("openslide.mpp-x", None)
        mpp_y = slide.properties.get("openslide.mpp-y", None)
    finally:
        # Release the file handle even when reading fails.
        slide.close()
    mpp_x = float(mpp_x) if mpp_x not in (None, "") else None
    mpp_y = float(mpp_y) if mpp_y not in (None, "") else None
    return np.asarray(img), scale, mpp_x, mpp_y

def tissue_mask(rgb):
    """Build a rough tissue mask by discarding the bright slide background.

    Pixels whose HSV value channel lies below 98% of the Otsu threshold are
    kept; holes and connected components smaller than 256 px are cleaned up.
    """
    brightness = color.rgb2hsv(rgb)[..., 2]
    cutoff = filters.threshold_otsu(brightness) * 0.98
    foreground = brightness < cutoff
    foreground = morphology.remove_small_holes(foreground, 256)
    return morphology.remove_small_objects(foreground, 256)

def he_nuclei_mask(rgb, tissue):
    """Segment strongly hematoxylin-stained pixels inside the tissue mask.

    The hematoxylin channel from ``hed_from_rgb`` colour deconvolution is
    stretched between the 2nd and 98th percentile of its tissue values, then
    thresholded with Otsu's method computed on tissue pixels only.

    Returns:
        (nuc, H): the cleaned boolean mask and the rescaled hematoxylin channel.
    """
    unit_rgb = np.clip(rgb / 255.0, 0, 1)
    hema = color.separate_stains(unit_rgb, hed_from_rgb)[..., 0]

    low = np.percentile(hema[tissue], 2)
    high = np.percentile(hema[tissue], 98)
    hema = exposure.rescale_intensity(hema, in_range=(low, high))

    in_tissue = hema[tissue]
    cutoff = filters.threshold_otsu(in_tissue)
    nuclei = np.zeros_like(hema, dtype=bool)
    nuclei[tissue] = in_tissue > cutoff

    nuclei = morphology.remove_small_objects(nuclei, 64)
    nuclei = morphology.binary_opening(nuclei, morphology.disk(2))
    return nuclei, hema

# ===== 提取特征（手工 + CNN）=====
def wsi_features(svs_path):
    """Compute hand-crafted features and a CNN feature vector for one slide.

    Returns:
        None when the slide lacks valid mpp while ``REQUIRE_MPP`` is set, or
        when fewer than 5000 thumbnail pixels are tissue. Otherwise a tuple
        ``(feats, cnn_feat)``: ``feats`` is a dict with ``tumor_frac``,
        ``h_mean`` and ``h_std``; ``cnn_feat`` is the CNN output for the
        bounding box of the stained mask (512 zeros when the mask is empty).
    """
    rgb, scale, mpp_x, mpp_y = read_thumbnail_and_geometry(svs_path)
    has_mpp = (mpp_x is not None) and (mpp_y is not None) and (mpp_x > 0) and (mpp_y > 0)
    if REQUIRE_MPP and not has_mpp:
        return None

    mask = tissue_mask(rgb)
    if mask.sum() < 5000:
        return None
    nuc, H = he_nuclei_mask(rgb, mask)

    # Hand-crafted features
    tissue_px_thumb = int(mask.sum())
    tumor_px_thumb = int(nuc.sum())
    h_vals = H[mask]
    feats = dict(
        tumor_frac=tumor_px_thumb / tissue_px_thumb,
        h_mean=float(h_vals.mean()),
        h_std=float(h_vals.std()),
    )

    # CNN features on the bounding box of the stained mask
    ys, xs = np.where(nuc)
    if xs.size > 0:
        # max() is the last *included* index while a slice excludes its stop,
        # hence the +1; without it the last row/column is dropped and a mask
        # confined to one row or column yields an empty crop.
        crop = rgb[ys.min():ys.max() + 1, xs.min():xs.max() + 1]
        cnn_feat = extract_cnn_feature(crop)
    else:
        cnn_feat = np.zeros(512, dtype=np.float32)  # fallback when nothing is detected

    return feats, cnn_feat

def collect_rows(root):
    """Build the three-class feature table for the folders under ``root``.

    Stage folders are mapped to labels as 1 -> 0, 2 and 3 -> 1, 4 -> 2.
    Every ``*.svs`` / ``*.tif`` file is passed to ``wsi_features``; slides for
    which it returns None are skipped. A row holds ``path``, ``label``,
    ``split``, the hand-crafted features and one ``cnn_<i>`` column per CNN
    value.
    """
    # === three-class mapping: 1->0, 2/3->1, 4->2 ===
    stage_to_label = {1: 0, 2: 1, 3: 1, 4: 2}

    base = Path(root)
    records = []
    for stage, label in stage_to_label.items():
        for split in ("train", "test"):
            folder = base / f"stage {stage} {split}"
            if folder.exists():
                slide_paths = [*folder.glob("*.svs"), *folder.glob("*.tif")]
                for slide_path in slide_paths:
                    result = wsi_features(str(slide_path))
                    if result is not None:
                        handcrafted, cnn_vector = result
                        record = {"path": str(slide_path), "label": label, "split": split}
                        record.update(handcrafted)
                        record.update({f"cnn_{idx}": value for idx, value in enumerate(cnn_vector)})
                        records.append(record)
    return pd.DataFrame(records)

# ===== 主程序 =====
if __name__ == "__main__":
    df = collect_rows(ROOT)
    print("Feature rows:", df.shape)
    if df.empty:
        # An empty table has no "label"/"split" columns; stop with a clear
        # message instead of a KeyError.
        raise SystemExit(f"No usable slides found under {ROOT}")
    print("Class distribution (0=stage1, 1=stage2&3, 2=stage4):\n",
          df["label"].value_counts().sort_index())
    df.to_csv("wsi_stage_features_cnn_3class.csv", index=False)

    train_df = df[df["split"] == "train"]
    test_df = df[df["split"] == "test"]
    if test_df.empty:
        # No predefined test folders: fall back to a stratified random split
        # so the evaluation below has something to predict on.
        train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df["label"])

    feature_cols = [c for c in df.columns if c.startswith("cnn_")] + ["tumor_frac", "h_mean", "h_std"]

    Xtr, ytr = train_df[feature_cols].values, train_df["label"].values
    Xte, yte = test_df[feature_cols].values, test_df["label"].values

    clf = RandomForestClassifier(
        n_estimators=400, random_state=0, n_jobs=-1, class_weight="balanced"
    )
    clf.fit(Xtr, ytr)
    ypr = clf.predict(Xte)

    print("\nConfusion matrix:\n", confusion_matrix(yte, ypr, labels=[0,1,2]))
    # zero_division=0 reports 0 for classes that were never predicted instead
    # of emitting a warning.
    print("\nReport:\n", classification_report(
        yte, ypr, labels=[0,1,2],
        target_names=["stage1","stage2&3","stage4"], digits=4, zero_division=0))




Feature rows: (133, 518)
Class distribution (0=stage1, 1=stage2&3, 2=stage4):
 label
0    35
1    89
2     9
Name: count, dtype: int64

Confusion matrix:
 [[ 0  7  0]
 [ 0 17  0]
 [ 0  2  0]]

Report:
               precision    recall  f1-score   support

      stage1     0.0000    0.0000    0.0000         7
    stage2&3     0.6538    1.0000    0.7907        17
      stage4     0.0000    0.0000    0.0000         2

    accuracy                         0.6538        26
   macro avg     0.2179    0.3333    0.2636        26
weighted avg     0.4275    0.6538    0.5170        26



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [8]:
import os
from pathlib import Path
import numpy as np
import pandas as pd
import openslide
from skimage import color, filters, morphology, measure, exposure
from skimage.color import hed_from_rgb

import torch
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as T
from PIL import Image

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# ===== Root directory =====
ROOT = r"C:\Users\mxjli\Desktop\image"
REQUIRE_MPP = True  # when True, slides without usable MPP metadata are skipped

# ===== CNN feature extractor =====
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# `pretrained=True` is deprecated since torchvision 0.13. Prefer the explicit
# weights enum (IMAGENET1K_V1 is what `pretrained=True` resolves to) and fall
# back to the legacy flag only on older releases that lack the enum.
try:
    resnet = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)
except AttributeError:
    resnet = models.resnet18(pretrained=True)
resnet.fc = nn.Identity()  # drop the final classification layer
resnet = resnet.to(device)
resnet.eval()

# Preprocessing (the input will be a PIL image)
transform_cnn = T.Compose([
    T.Resize((224, 224)),
    T.ToTensor(),
    T.Normalize(mean=[0.485, 0.456, 0.406],
                std=[0.229, 0.224, 0.225])
])

def extract_cnn_feature(img):
    """Embed a single image with the module-level ``resnet``.

    Args:
        img: A PIL image, a ``numpy.ndarray`` or a ``torch.Tensor``. Arrays and
            tensors are first converted to PIL; any non-RGB image is converted
            to RGB before ``transform_cnn`` is applied.

    Returns:
        The network output for a batch of one, moved to the CPU and flattened
        into a 1-D NumPy array.
    """
    if isinstance(img, torch.Tensor):
        img = T.ToPILImage()(img)
    elif isinstance(img, np.ndarray):
        img = Image.fromarray(img)

    if getattr(img, "mode", None) != "RGB":
        img = img.convert("RGB")

    batch = transform_cnn(img).unsqueeze(0).to(device)
    with torch.no_grad():  # inference only, no autograd graph needed
        embedding = resnet(batch)
    return embedding.cpu().numpy().flatten()

def extract_tumor_region_with_padding(pil_img, tumor_mask, padding_ratio=0.12):
    """Crop the padded bounding box of ``tumor_mask`` and resize it to 224x224.

    Args:
        pil_img: Image object exposing ``width``, ``height``, ``crop(box)`` and
            ``resize(size)`` (a PIL image in this file).
        tumor_mask: 2-D array whose truthy pixels mark the region of interest;
            rows are indexed as y and columns as x.
        padding_ratio: Fraction of the box width/height added on each side
            before clamping to the image bounds.

    Returns:
        The cropped region resized to 224x224, or the whole image resized to
        224x224 when the mask has no truthy pixel.
    """
    ys, xs = np.nonzero(tumor_mask)
    if xs.size == 0:
        return pil_img.resize((224, 224))

    # The crop box uses exclusive right/lower bounds, so the largest index
    # needs +1; otherwise the last row/column is cut off and a one-pixel-wide
    # mask yields an empty crop.
    left, right = int(xs.min()), int(xs.max()) + 1
    top, bottom = int(ys.min()), int(ys.max()) + 1

    pad_x = int(padding_ratio * (right - left))
    pad_y = int(padding_ratio * (bottom - top))

    left = max(left - pad_x, 0)
    top = max(top - pad_y, 0)
    right = min(right + pad_x, pil_img.width)
    bottom = min(bottom + pad_y, pil_img.height)
    return pil_img.crop((left, top, right, bottom)).resize((224, 224))

# ===== 读取与分割 =====
def read_thumbnail_and_geometry(svs_path, min_dim=3000):
    """Read a down-scaled overview of a slide together with its geometry.

    Args:
        svs_path: Path of a slide file that OpenSlide can open.
        min_dim: Requested length, in pixels, of the thumbnail's longer side.

    Returns:
        ``(rgb, scale, mpp_x, mpp_y)``: the thumbnail as a NumPy array, the
        level-0 long side divided by ``min_dim``, and the ``openslide.mpp-x`` /
        ``openslide.mpp-y`` properties as floats (``None`` when absent or empty).
    """
    slide = openslide.OpenSlide(svs_path)
    try:
        W0, H0 = slide.dimensions
        scale = max(W0, H0) / float(min_dim)
        # Clamp to 1 px so a very elongated slide cannot request a zero-sized side.
        new_w = max(int(W0 / scale), 1)
        new_h = max(int(H0 / scale), 1)
        img = slide.get_thumbnail((new_w, new_h))
        mpp_x = slide.properties.get("openslide.mpp-x", None)
        mpp_y = slide.properties.get("openslide.mpp-y", None)
    finally:
        # Release the slide handle even when reading the thumbnail fails.
        slide.close()
    mpp_x = float(mpp_x) if mpp_x not in (None, "") else None
    mpp_y = float(mpp_y) if mpp_y not in (None, "") else None
    return np.asarray(img), scale, mpp_x, mpp_y

def tissue_mask(rgb):
    """Return a boolean mask of pixels darker than the bright background.

    The HSV value channel is thresholded slightly below its Otsu threshold
    (factor 0.98); holes and objects are then cleaned up with a size
    argument of 256.
    """
    value = color.rgb2hsv(rgb)[..., 2]
    cutoff = filters.threshold_otsu(value) * 0.98
    foreground = value < cutoff
    foreground = morphology.remove_small_holes(foreground, 256)
    return morphology.remove_small_objects(foreground, 256)

def he_nuclei_mask(rgb, tissue):
    """Segment strongly hematoxylin-stained pixels inside the tissue mask.

    Args:
        rgb: RGB image; it is divided by 255 and clipped to [0, 1].
        tissue: Boolean mask selecting the pixels to analyse.

    Returns:
        ``(nuc, H)``: the cleaned boolean mask and the rescaled hematoxylin
        channel it was thresholded from.
    """
    stains = color.separate_stains(np.clip(rgb / 255.0, 0, 1), hed_from_rgb)
    hema = stains[..., 0]

    # Stretch using the 2nd/98th percentiles measured inside the tissue only.
    lo, hi = (np.percentile(hema[tissue], q) for q in (2, 98))
    hema = exposure.rescale_intensity(hema, in_range=(lo, hi))

    inside = hema[tissue]
    nuclei = np.zeros_like(hema, dtype=bool)
    nuclei[tissue] = inside > filters.threshold_otsu(inside)

    nuclei = morphology.remove_small_objects(nuclei, 64)
    nuclei = morphology.binary_opening(nuclei, morphology.disk(2))
    return nuclei, hema

# ===== 提取特征（手工 + CNN）=====
def wsi_features(svs_path):
    """Compute hand-crafted and CNN features for one slide.

    Returns:
        ``None`` when ``REQUIRE_MPP`` is set and the slide has no positive MPP
        in both axes, or when the tissue mask has fewer than 5000 pixels.
        Otherwise ``(feats, cnn_feat)`` where ``feats`` holds ``tumor_frac``,
        ``h_mean`` and ``h_std`` and ``cnn_feat`` is the embedding of the
        padded bounding-box crop of the nuclei mask.
    """
    rgb, _scale, mpp_x, mpp_y = read_thumbnail_and_geometry(svs_path)
    mpp_ok = all(v is not None and v > 0 for v in (mpp_x, mpp_y))
    if REQUIRE_MPP and not mpp_ok:
        return None

    tissue = tissue_mask(rgb)
    tissue_px = int(tissue.sum())
    if tissue_px < 5000:
        return None
    nuclei, hema = he_nuclei_mask(rgb, tissue)

    # Hand-crafted statistics, all measured on the thumbnail.
    hema_in_tissue = hema[tissue]
    feats = {
        "tumor_frac": int(nuclei.sum()) / max(tissue_px, 1),
        "h_mean": float(hema_in_tissue.mean()),
        "h_std": float(hema_in_tissue.std()),
    }

    # CNN feature: bounding box of the nuclei mask plus padding.
    crop = extract_tumor_region_with_padding(
        Image.fromarray(rgb), tumor_mask=nuclei, padding_ratio=0.12
    )
    return feats, extract_cnn_feature(crop)

def collect_rows(root):
    """Build the feature table for every slide found under ``root``.

    Looks for folders named ``stage {1..4} {train|test}`` directly under
    ``root`` and processes their ``*.svs`` and ``*.tif`` files. Stages are
    merged into three labels: 1 -> 0, 2/3 -> 1, 4 -> 2. Slides for which
    ``wsi_features`` returns ``None`` are skipped.

    Returns:
        A DataFrame with ``path``, ``label``, ``split``, the hand-crafted
        features and one ``cnn_{i}`` column per embedding value; empty when no
        slide was usable.
    """
    stage_to_label = {1: 0, 2: 1, 3: 1, 4: 2}
    rows = []
    for stage, label in stage_to_label.items():
        for split in ("train", "test"):
            d = Path(root) / f"stage {stage} {split}"
            if not d.exists():
                continue
            # Directory listing order depends on the filesystem; sorting keeps
            # the row order (and so the CSV and the fitted model) reproducible.
            slide_paths = sorted(d.glob("*.svs")) + sorted(d.glob("*.tif"))
            for p in slide_paths:
                res = wsi_features(str(p))
                if res is None:
                    continue
                feats, cnn_feat = res
                row = dict(path=str(p), label=label, split=split)
                row.update(feats)
                row.update((f"cnn_{i}", v) for i, v in enumerate(cnn_feat))
                rows.append(row)
    return pd.DataFrame(rows)

# ===== 主程序 =====
if __name__ == "__main__":
    # Build one feature row per slide (hand-crafted statistics + CNN embedding).
    df = collect_rows(ROOT)
    print("Feature rows:", df.shape)
    if df.empty:
        # Without this guard the script dies below with an opaque KeyError on "label".
        raise SystemExit(f"No usable slides found under {ROOT!r}")
    print("Class distribution (0=stage1, 1=stage2&3, 2=stage4):\n",
          df["label"].value_counts().sort_index())
    df.to_csv("wsi_stage_features_cnn_3class_padding_fixed.csv", index=False)

    # Bracket indexing avoids any clash between a column name and a DataFrame attribute.
    train_df = df[df["split"] == "train"]
    test_df = df[df["split"] == "test"]

    feature_cols = [c for c in df.columns if c.startswith("cnn_")] + [
        "tumor_frac", "h_mean", "h_std"
    ]
    Xtr, ytr = train_df[feature_cols].values, train_df["label"].values
    Xte, yte = test_df[feature_cols].values, test_df["label"].values

    clf = RandomForestClassifier(
        n_estimators=400, random_state=0, n_jobs=-1, class_weight="balanced"
    )
    clf.fit(Xtr, ytr)
    ypr = clf.predict(Xte)

    print("\nConfusion matrix:\n", confusion_matrix(yte, ypr, labels=[0, 1, 2]))
    # zero_division=0 reports the same numbers but suppresses the
    # undefined-metric warnings emitted when a class is never predicted.
    print("\nReport:\n", classification_report(
        yte, ypr, labels=[0, 1, 2],
        target_names=["stage1", "stage2&3", "stage4"], digits=4,
        zero_division=0))




Feature rows: (133, 518)
Class distribution (0=stage1, 1=stage2&3, 2=stage4):
 label
0    35
1    89
2     9
Name: count, dtype: int64

Confusion matrix:
 [[ 1  6  0]
 [ 0 17  0]
 [ 0  2  0]]

Report:
               precision    recall  f1-score   support

      stage1     1.0000    0.1429    0.2500         7
    stage2&3     0.6800    1.0000    0.8095        17
      stage4     0.0000    0.0000    0.0000         2

    accuracy                         0.6923        26
   macro avg     0.5600    0.3810    0.3532        26
weighted avg     0.7138    0.6923    0.5966        26



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [2]:
import os
from pathlib import Path
import numpy as np
import pandas as pd
import openslide
from skimage import color, filters, morphology, measure, exposure
from skimage.color import hed_from_rgb

import torch
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as T
from PIL import Image
from collections import Counter

from sklearn.ensemble import HistGradientBoostingClassifier
# from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# ===== Root directory =====
ROOT = r"C:\Users\mxjli\Desktop\image"
REQUIRE_MPP = True  # when True, slides without usable MPP metadata are skipped

# ===== CNN feature extractor =====
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# `pretrained=True` is deprecated since torchvision 0.13. Prefer the explicit
# weights enum (IMAGENET1K_V1 is what `pretrained=True` resolves to) and fall
# back to the legacy flag only on older releases that lack the enum.
try:
    resnet = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)
except AttributeError:
    resnet = models.resnet18(pretrained=True)
resnet.fc = nn.Identity()  # drop the final classification layer
resnet = resnet.to(device).eval()

# Preprocessing (the input will be a PIL image)
transform_cnn = T.Compose([
    T.Resize((224, 224)),
    T.ToTensor(),
    T.Normalize(mean=[0.485, 0.456, 0.406],
                std=[0.229, 0.224, 0.225])
])

def extract_cnn_feature(img):
    """Embed one image (PIL / ndarray / Tensor) with the module-level ``resnet``.

    Arrays and tensors are converted to PIL first, and any non-RGB image is
    converted to RGB before ``transform_cnn`` runs. Returns the flattened
    network output as a NumPy array (512 values per the file's own notes).
    """
    if isinstance(img, torch.Tensor):
        img = T.ToPILImage()(img)
    elif isinstance(img, np.ndarray):
        img = Image.fromarray(img)

    if getattr(img, "mode", None) != "RGB":
        img = img.convert("RGB")

    batch = transform_cnn(img).unsqueeze(0).to(device)
    with torch.no_grad():  # inference only
        embedding = resnet(batch)
    return embedding.cpu().numpy().flatten()  # [512]

# ---------- Top-K 病灶裁剪 + 池化 ----------
def extract_topk_crops(pil_img, nuc_mask, k=3, pad_ratio=0.12, out_size=224):
    """Crop the ``k`` largest connected components of ``nuc_mask``.

    Each component's bounding box is enlarged by ``pad_ratio`` of its width
    and height on every side, clamped to the image, cropped and resized to
    ``out_size`` x ``out_size``.

    Returns:
        A list of crops, largest component first. When no component yields a
        crop, the list holds the whole image resized instead.
    """
    regions = measure.regionprops(measure.label(nuc_mask))
    regions.sort(key=lambda r: r.area, reverse=True)

    img_w, img_h = pil_img.width, pil_img.height
    target = (out_size, out_size)
    crops = []
    for region in regions[:k]:
        top, left, bottom, right = region.bbox  # (min_row, min_col, max_row, max_col)
        box_h, box_w = bottom - top, right - left
        if box_h <= 0 or box_w <= 0:
            continue
        pad_x, pad_y = int(pad_ratio * box_w), int(pad_ratio * box_h)
        box = (
            max(left - pad_x, 0),
            max(top - pad_y, 0),
            min(right + pad_x, img_w),
            min(bottom + pad_y, img_h),
        )
        crops.append(pil_img.crop(box).resize(target))

    # Fallback: no component found, use the whole image.
    return crops or [pil_img.resize(target)]

def cnn_features_pooled(crops):
    """Pool per-crop embeddings into one vector: element-wise mean, then max.

    The result is the mean vector concatenated with the max vector, so it is
    twice as long as a single embedding ([1024] for 512-d embeddings).
    """
    stacked = np.stack([extract_cnn_feature(crop) for crop in crops], axis=0)  # [k, 512]
    pooled_mean = stacked.mean(axis=0)
    pooled_max = stacked.max(axis=0)
    return np.concatenate((pooled_mean, pooled_max), axis=0)  # [1024]
# ------------------------------------------

def read_thumbnail_and_geometry(svs_path, min_dim=3000):
    """Read a down-scaled overview of a slide together with its geometry.

    Args:
        svs_path: Path of a slide file that OpenSlide can open.
        min_dim: Requested length, in pixels, of the thumbnail's longer side.

    Returns:
        ``(rgb, scale, mpp_x, mpp_y)``: the thumbnail as a NumPy array, the
        level-0 long side divided by ``min_dim``, and the ``openslide.mpp-x`` /
        ``openslide.mpp-y`` properties as floats (``None`` when absent or empty).
    """
    slide = openslide.OpenSlide(svs_path)
    try:
        W0, H0 = slide.dimensions
        scale = max(W0, H0) / float(min_dim)
        # Clamp to 1 px so a very elongated slide cannot request a zero-sized side.
        new_w = max(int(W0 / scale), 1)
        new_h = max(int(H0 / scale), 1)
        img = slide.get_thumbnail((new_w, new_h))
        mpp_x = slide.properties.get("openslide.mpp-x", None)
        mpp_y = slide.properties.get("openslide.mpp-y", None)
    finally:
        # Release the slide handle even when reading the thumbnail fails.
        slide.close()
    mpp_x = float(mpp_x) if mpp_x not in (None, "") else None
    mpp_y = float(mpp_y) if mpp_y not in (None, "") else None
    return np.asarray(img), scale, mpp_x, mpp_y

def tissue_mask(rgb):
    """Return a boolean mask of pixels darker than the bright background.

    The HSV value channel is thresholded slightly below its Otsu threshold
    (factor 0.98); holes and objects are then cleaned up with a size
    argument of 256.
    """
    value = color.rgb2hsv(rgb)[..., 2]
    cutoff = filters.threshold_otsu(value) * 0.98
    foreground = value < cutoff
    foreground = morphology.remove_small_holes(foreground, 256)
    return morphology.remove_small_objects(foreground, 256)

def he_nuclei_mask(rgb, tissue):
    """Segment strongly hematoxylin-stained pixels inside the tissue mask.

    Args:
        rgb: RGB image; it is divided by 255 and clipped to [0, 1].
        tissue: Boolean mask selecting the pixels to analyse.

    Returns:
        ``(nuc, H)``: the cleaned boolean mask and the rescaled hematoxylin
        channel it was thresholded from.
    """
    stains = color.separate_stains(np.clip(rgb / 255.0, 0, 1), hed_from_rgb)
    hema = stains[..., 0]

    # Stretch using the 2nd/98th percentiles measured inside the tissue only.
    lo, hi = (np.percentile(hema[tissue], q) for q in (2, 98))
    hema = exposure.rescale_intensity(hema, in_range=(lo, hi))

    inside = hema[tissue]
    nuclei = np.zeros_like(hema, dtype=bool)
    nuclei[tissue] = inside > filters.threshold_otsu(inside)

    nuclei = morphology.remove_small_objects(nuclei, 64)
    nuclei = morphology.binary_opening(nuclei, morphology.disk(2))
    return nuclei, hema

def wsi_features(svs_path):
    """Compute connected-component statistics and pooled CNN features for a slide.

    Returns:
        ``None`` when ``REQUIRE_MPP`` is set and the slide has no positive MPP
        in both axes, or when the tissue mask has fewer than 5000 pixels.
        Otherwise ``(feats, cnn_feat)``: ``feats`` holds ``tumor_frac``,
        ``largest_cc_frac``, ``cc_count``, ``cc_small_frac`` and ``frag_ratio``;
        ``cnn_feat`` is the pooled embedding of the top-3 component crops.
    """
    rgb, _scale, mpp_x, mpp_y = read_thumbnail_and_geometry(svs_path)
    mpp_ok = all(v is not None and v > 0 for v in (mpp_x, mpp_y))
    if REQUIRE_MPP and not mpp_ok:
        return None

    tissue = tissue_mask(rgb)
    tissue_px = int(tissue.sum())
    if tissue_px < 5000:
        return None
    nuclei, _hema = he_nuclei_mask(rgb, tissue)

    # Hand-crafted statistics over the connected components of the nuclei mask.
    regions = measure.regionprops(measure.label(nuclei))
    n_regions = len(regions)
    if n_regions:
        areas = np.array([r.area for r in regions], dtype=np.float32)
        perims = np.array([r.perimeter for r in regions], dtype=np.float32)
        largest_px = int(areas.max())
        # "Small" means below 0.1% of the tissue area.
        n_small = int((areas < 0.001 * tissue_px).sum())
        frag = float(perims.sum() / (areas.sum() + 1e-6))
    else:
        largest_px, n_small, frag = 0, 0, 0.0

    denom = max(tissue_px, 1)
    feats = {
        "tumor_frac": int(nuclei.sum()) / denom,
        "largest_cc_frac": largest_px / denom,
        "cc_count": n_regions,
        "cc_small_frac": n_small / max(n_regions, 1),
        "frag_ratio": frag,
    }

    # CNN: top-K crops + pooling (K=3).
    crops = extract_topk_crops(Image.fromarray(rgb), nuc_mask=nuclei, k=3, pad_ratio=0.12)
    return feats, cnn_features_pooled(crops)  # [1024]

def collect_rows(root):
    """Build the feature table for every slide found under ``root``.

    Looks for folders named ``stage {1..4} {train|test}`` directly under
    ``root`` and processes their ``*.svs`` and ``*.tif`` files. Stages are
    merged into three labels: 1 -> 0, 2/3 -> 1, 4 -> 2. Slides for which
    ``wsi_features`` returns ``None`` are skipped.

    Returns:
        A DataFrame with ``path``, ``label``, ``split``, the hand-crafted
        features and one ``cnn_{i}`` column per embedding value; empty when no
        slide was usable.
    """
    stage_to_label = {1: 0, 2: 1, 3: 1, 4: 2}
    rows = []
    for stage, label in stage_to_label.items():
        for split in ("train", "test"):
            d = Path(root) / f"stage {stage} {split}"
            if not d.exists():
                continue
            # Directory listing order depends on the filesystem; sorting keeps
            # the row order (and so the CSV and the fitted model) reproducible.
            slide_paths = sorted(d.glob("*.svs")) + sorted(d.glob("*.tif"))
            for p in slide_paths:
                res = wsi_features(str(p))
                if res is None:
                    continue
                feats, cnn_feat = res
                row = dict(path=str(p), label=label, split=split)
                row.update(feats)
                row.update((f"cnn_{i}", v) for i, v in enumerate(cnn_feat))
                rows.append(row)
    return pd.DataFrame(rows)

# ===== 主程序 =====
if __name__ == "__main__":
    # Build one feature row per slide (component statistics + pooled CNN embedding).
    df = collect_rows(ROOT)
    print("Feature rows:", df.shape)
    if df.empty:
        # Without this guard the script dies below with an opaque KeyError on "label".
        raise SystemExit(f"No usable slides found under {ROOT!r}")
    print("Class distribution (0=stage1, 1=stage2&3, 2=stage4):\n",
          df["label"].value_counts().sort_index())
    df.to_csv("wsi_stage_features_topk.csv", index=False)

    train_df = df[df["split"] == "train"].copy()
    test_df = df[df["split"] == "test"].copy()

    feature_cols = [c for c in df.columns if c.startswith("cnn_")] + [
        "tumor_frac", "largest_cc_frac", "cc_count", "cc_small_frac", "frag_ratio"
    ]

    Xtr = train_df[feature_cols].values
    ytr = train_df["label"].values
    Xte = test_df[feature_cols].values
    yte = test_df["label"].values

    # ===== Imbalance handling: "balanced" weights, further boosted for stage1/stage4 =====
    cnt = Counter(ytr)
    base_bal = {k: len(ytr) / (3.0 * cnt[k]) for k in cnt}  # balanced
    boost = {0: 1.6, 1: 1.0, 2: 1.8}                        # tunable: heavier stage1/4
    weights = np.array([base_bal[c] * boost.get(c, 1.0) for c in ytr], dtype=np.float32)

    # === Model: HGB. random_state is fixed to match the other cells of this file.
    clf = HistGradientBoostingClassifier(
        max_depth=6, learning_rate=0.06, max_iter=600,
        l2_regularization=1.0, min_samples_leaf=20, random_state=0
    )
    clf.fit(Xtr, ytr, sample_weight=weights)

    # Alternative kept from the original notebook (a random forest with the same class weights):
    # rf = RandomForestClassifier(n_estimators=800, random_state=0, n_jobs=-1, class_weight={0:base_bal[0]*boost[0],1:base_bal[1],2:base_bal.get(2,1.0)*boost[2]})
    # rf.fit(Xtr, ytr); clf = rf

    # ===== Small prediction-time bias to raise stage1/stage4 recall =====
    # predict_proba columns follow clf.classes_, which only contains labels seen
    # in training. Look columns up by label instead of assuming column i is
    # class i, and translate argmax indices back to labels.
    proba = clf.predict_proba(Xte)
    pred_boost = {0: 1.20, 2: 1.25}  # stage1: 1.10~1.40, stage4: 1.10~1.50 are sensible ranges
    for col, cls in enumerate(clf.classes_):
        proba[:, col] *= pred_boost.get(int(cls), 1.0)
    ypr = clf.classes_[proba.argmax(axis=1)]

    print("\nConfusion matrix:\n", confusion_matrix(yte, ypr, labels=[0, 1, 2]))
    # zero_division=0 reports the same numbers but suppresses the
    # undefined-metric warnings emitted when a class is never predicted.
    print("\nReport:\n", classification_report(
        yte, ypr, labels=[0, 1, 2],
        target_names=["stage1", "stage2&3", "stage4"], digits=4,
        zero_division=0))




KeyboardInterrupt: 

In [3]:
import os
from pathlib import Path
import numpy as np
import pandas as pd
import openslide
from skimage import color, filters, morphology, measure, exposure
from skimage.color import hed_from_rgb

import torch
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as T
from PIL import Image
from collections import Counter

from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix

# ===== Root directory =====
ROOT = r"C:\Users\mxjli\Desktop\image"
REQUIRE_MPP = True  # when True, slides without usable MPP metadata are skipped

# ===== CNN feature extractor =====
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# `pretrained=True` is deprecated since torchvision 0.13. Prefer the explicit
# weights enum (IMAGENET1K_V1 is what `pretrained=True` resolves to) and fall
# back to the legacy flag only on older releases that lack the enum.
try:
    resnet = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)
except AttributeError:
    resnet = models.resnet18(pretrained=True)
resnet.fc = nn.Identity()  # drop the final classification layer
resnet = resnet.to(device).eval()

# Preprocessing applied to every PIL crop before it reaches the network
transform_cnn = T.Compose([
    T.Resize((224, 224)),
    T.ToTensor(),
    T.Normalize(mean=[0.485, 0.456, 0.406],
                std=[0.229, 0.224, 0.225])
])

def extract_cnn_feature(img):
    """Embed one image (PIL / ndarray / Tensor) with the module-level ``resnet``.

    Arrays and tensors are converted to PIL first, and any non-RGB image is
    converted to RGB before ``transform_cnn`` runs. Returns the flattened
    network output as a NumPy array (512 values per the file's own notes).
    """
    if isinstance(img, torch.Tensor):
        img = T.ToPILImage()(img)
    elif isinstance(img, np.ndarray):
        img = Image.fromarray(img)

    if getattr(img, "mode", None) != "RGB":
        img = img.convert("RGB")

    batch = transform_cnn(img).unsqueeze(0).to(device)
    with torch.no_grad():  # inference only
        embedding = resnet(batch)
    return embedding.cpu().numpy().flatten()  # [512]

# ---------- Top-K 病灶裁剪 + 池化 ----------
def extract_topk_crops(pil_img, nuc_mask, k=3, pad_ratio=0.12, out_size=224):
    """Crop the ``k`` largest connected components of ``nuc_mask``.

    Each component's bounding box is enlarged by ``pad_ratio`` of its width
    and height on every side, clamped to the image, cropped and resized to
    ``out_size`` x ``out_size``.

    Returns:
        A list of crops, largest component first. When no component yields a
        crop, the list holds the whole image resized instead.
    """
    regions = measure.regionprops(measure.label(nuc_mask))
    regions.sort(key=lambda r: r.area, reverse=True)

    img_w, img_h = pil_img.width, pil_img.height
    target = (out_size, out_size)
    crops = []
    for region in regions[:k]:
        top, left, bottom, right = region.bbox  # (min_row, min_col, max_row, max_col)
        box_h, box_w = bottom - top, right - left
        if box_h <= 0 or box_w <= 0:
            continue
        pad_x, pad_y = int(pad_ratio * box_w), int(pad_ratio * box_h)
        box = (
            max(left - pad_x, 0),
            max(top - pad_y, 0),
            min(right + pad_x, img_w),
            min(bottom + pad_y, img_h),
        )
        crops.append(pil_img.crop(box).resize(target))

    # Fallback: no component found, use the whole image.
    return crops or [pil_img.resize(target)]

def cnn_features_pooled(crops):
    """Pool per-crop embeddings into one vector: element-wise mean, then max.

    The result is the mean vector concatenated with the max vector, so it is
    twice as long as a single embedding ([1024] for 512-d embeddings).
    """
    stacked = np.stack([extract_cnn_feature(crop) for crop in crops], axis=0)  # [k, 512]
    pooled_mean = stacked.mean(axis=0)
    pooled_max = stacked.max(axis=0)
    return np.concatenate((pooled_mean, pooled_max), axis=0)  # [1024]
# ------------------------------------------

def read_thumbnail_and_geometry(svs_path, min_dim=3000):
    """Read a down-scaled overview of a slide together with its geometry.

    Args:
        svs_path: Path of a slide file that OpenSlide can open.
        min_dim: Requested length, in pixels, of the thumbnail's longer side.

    Returns:
        ``(rgb, scale, mpp_x, mpp_y)``: the thumbnail as a NumPy array, the
        level-0 long side divided by ``min_dim``, and the ``openslide.mpp-x`` /
        ``openslide.mpp-y`` properties as floats (``None`` when absent or empty).
    """
    slide = openslide.OpenSlide(svs_path)
    try:
        W0, H0 = slide.dimensions
        scale = max(W0, H0) / float(min_dim)
        # Clamp to 1 px so a very elongated slide cannot request a zero-sized side.
        new_w = max(int(W0 / scale), 1)
        new_h = max(int(H0 / scale), 1)
        img = slide.get_thumbnail((new_w, new_h))
        mpp_x = slide.properties.get("openslide.mpp-x", None)
        mpp_y = slide.properties.get("openslide.mpp-y", None)
    finally:
        # Release the slide handle even when reading the thumbnail fails.
        slide.close()
    mpp_x = float(mpp_x) if mpp_x not in (None, "") else None
    mpp_y = float(mpp_y) if mpp_y not in (None, "") else None
    return np.asarray(img), scale, mpp_x, mpp_y

def tissue_mask(rgb):
    """Return a boolean mask of pixels darker than the bright background.

    The HSV value channel is thresholded slightly below its Otsu threshold
    (factor 0.98); holes and objects are then cleaned up with a size
    argument of 256.
    """
    value = color.rgb2hsv(rgb)[..., 2]
    cutoff = filters.threshold_otsu(value) * 0.98
    foreground = value < cutoff
    foreground = morphology.remove_small_holes(foreground, 256)
    return morphology.remove_small_objects(foreground, 256)

def he_nuclei_mask(rgb, tissue):
    """Segment strongly hematoxylin-stained pixels inside the tissue mask.

    Args:
        rgb: RGB image; it is divided by 255 and clipped to [0, 1].
        tissue: Boolean mask selecting the pixels to analyse.

    Returns:
        ``(nuc, H)``: the cleaned boolean mask and the rescaled hematoxylin
        channel it was thresholded from.
    """
    stains = color.separate_stains(np.clip(rgb / 255.0, 0, 1), hed_from_rgb)
    hema = stains[..., 0]

    # Stretch using the 2nd/98th percentiles measured inside the tissue only.
    lo, hi = (np.percentile(hema[tissue], q) for q in (2, 98))
    hema = exposure.rescale_intensity(hema, in_range=(lo, hi))

    inside = hema[tissue]
    nuclei = np.zeros_like(hema, dtype=bool)
    nuclei[tissue] = inside > filters.threshold_otsu(inside)

    nuclei = morphology.remove_small_objects(nuclei, 64)
    nuclei = morphology.binary_opening(nuclei, morphology.disk(2))
    return nuclei, hema

def wsi_features(svs_path):
    """Compute connected-component statistics and pooled CNN features for a slide.

    Returns:
        ``None`` when ``REQUIRE_MPP`` is set and the slide has no positive MPP
        in both axes, or when the tissue mask has fewer than 5000 pixels.
        Otherwise ``(feats, cnn_feat)``: ``feats`` holds ``tumor_frac``,
        ``largest_cc_frac``, ``cc_count``, ``cc_small_frac`` and ``frag_ratio``;
        ``cnn_feat`` is the pooled embedding of the top-3 component crops.
    """
    rgb, _scale, mpp_x, mpp_y = read_thumbnail_and_geometry(svs_path)
    mpp_ok = all(v is not None and v > 0 for v in (mpp_x, mpp_y))
    if REQUIRE_MPP and not mpp_ok:
        return None

    tissue = tissue_mask(rgb)
    tissue_px = int(tissue.sum())
    if tissue_px < 5000:
        return None
    nuclei, _hema = he_nuclei_mask(rgb, tissue)

    # Hand-crafted statistics over the connected components of the nuclei mask.
    regions = measure.regionprops(measure.label(nuclei))
    n_regions = len(regions)
    if n_regions:
        areas = np.array([r.area for r in regions], dtype=np.float32)
        perims = np.array([r.perimeter for r in regions], dtype=np.float32)
        largest_px = int(areas.max())
        # "Small" means below 0.1% of the tissue area.
        n_small = int((areas < 0.001 * tissue_px).sum())
        frag = float(perims.sum() / (areas.sum() + 1e-6))
    else:
        largest_px, n_small, frag = 0, 0, 0.0

    denom = max(tissue_px, 1)
    feats = {
        "tumor_frac": int(nuclei.sum()) / denom,
        "largest_cc_frac": largest_px / denom,
        "cc_count": n_regions,
        "cc_small_frac": n_small / max(n_regions, 1),
        "frag_ratio": frag,
    }

    # CNN: top-K crops + pooling (K=3).
    crops = extract_topk_crops(Image.fromarray(rgb), nuc_mask=nuclei, k=3, pad_ratio=0.12)
    return feats, cnn_features_pooled(crops)  # [1024]

def collect_rows(root):
    """Build the four-class feature table for every slide found under ``root``.

    Looks for folders named ``stage {1..4} {train|test}`` directly under
    ``root`` and processes their ``*.svs`` and ``*.tif`` files. The label is
    ``stage - 1`` (1->0, 2->1, 3->2, 4->3). Slides for which ``wsi_features``
    returns ``None`` are skipped.

    Returns:
        A DataFrame with ``path``, ``label``, ``split``, the hand-crafted
        features and one ``cnn_{i}`` column per embedding value; empty when no
        slide was usable.
    """
    rows = []
    for stage in (1, 2, 3, 4):
        label = stage - 1
        for split in ("train", "test"):
            d = Path(root) / f"stage {stage} {split}"
            if not d.exists():
                continue
            # Directory listing order depends on the filesystem; sorting keeps
            # the row order (and so the CSV and the fitted model) reproducible.
            slide_paths = sorted(d.glob("*.svs")) + sorted(d.glob("*.tif"))
            for p in slide_paths:
                res = wsi_features(str(p))
                if res is None:
                    continue
                feats, cnn_feat = res
                row = dict(path=str(p), label=label, split=split)
                row.update(feats)
                row.update((f"cnn_{i}", v) for i, v in enumerate(cnn_feat))
                rows.append(row)
    return pd.DataFrame(rows)

# ===== 主程序 =====
if __name__ == "__main__":
    # Build one feature row per slide (component statistics + pooled CNN embedding).
    df = collect_rows(ROOT)
    print("Feature rows:", df.shape)
    if df.empty:
        # Without this guard the script dies below with an opaque KeyError on "label".
        raise SystemExit(f"No usable slides found under {ROOT!r}")
    print("Class distribution (0=stage1, 1=stage2, 2=stage3, 3=stage4):\n",
          df["label"].value_counts().sort_index())
    df.to_csv("wsi_stage_features_topk_4class.csv", index=False)

    train_df = df[df["split"] == "train"].copy()
    test_df = df[df["split"] == "test"].copy()

    feature_cols = [c for c in df.columns if c.startswith("cnn_")] + [
        "tumor_frac", "largest_cc_frac", "cc_count", "cc_small_frac", "frag_ratio"
    ]

    Xtr = train_df[feature_cols].values
    ytr = train_df["label"].values
    Xte = test_df[feature_cols].values
    yte = test_df["label"].values

    # ===== Class-imbalance weights, computed for 4 classes =====
    counts = train_df["label"].value_counts()
    n_classes = 4
    class_weight_map = {c: len(train_df) / (n_classes * counts[c]) for c in counts.index}
    # Optional extra weight for the two end classes (stage1 / stage4); tunable.
    for k, mult in {0: 1.4, 3: 1.5}.items():
        if k in class_weight_map:
            class_weight_map[k] *= mult
    weights = train_df["label"].map(class_weight_map).values

    clf = HistGradientBoostingClassifier(
        max_depth=6, learning_rate=0.06, max_iter=600,
        l2_regularization=1.0, min_samples_leaf=20, random_state=0
    )
    clf.fit(Xtr, ytr, sample_weight=weights)

    # ===== Prediction with a small boost for the two end classes =====
    # predict_proba columns follow clf.classes_, which only contains labels seen
    # in training. Look columns up by label instead of assuming column i is
    # class i, and translate argmax indices back to labels.
    proba = clf.predict_proba(Xte)
    pred_boost = {0: 1.10, 3: 1.15}  # stage1 / stage4; adjust or remove as needed
    for col, cls in enumerate(clf.classes_):
        proba[:, col] *= pred_boost.get(int(cls), 1.0)
    proba = proba / np.clip(proba.sum(1, keepdims=True), 1e-12, None)
    ypr = clf.classes_[proba.argmax(axis=1)]

    print("\nConfusion matrix:\n", confusion_matrix(yte, ypr, labels=[0, 1, 2, 3]))
    # zero_division=0 reports the same numbers but suppresses undefined-metric
    # warnings when a class is never predicted.
    print("\nReport:\n", classification_report(
        yte, ypr, labels=[0, 1, 2, 3],
        target_names=["stage1", "stage2", "stage3", "stage4"], digits=4,
        zero_division=0))


Feature rows: (133, 1032)
Class distribution (0=stage1, 1=stage2, 2=stage3, 3=stage4):
 label
0    35
1    51
2    38
3     9
Name: count, dtype: int64

Confusion matrix:
 [[1 4 0 0]
 [2 2 1 0]
 [0 2 3 0]
 [0 1 0 1]]

Report:
               precision    recall  f1-score   support

      stage1     0.3333    0.2000    0.2500         5
      stage2     0.2222    0.4000    0.2857         5
      stage3     0.7500    0.6000    0.6667         5
      stage4     1.0000    0.5000    0.6667         2

    accuracy                         0.4118        17
   macro avg     0.5764    0.4250    0.4673        17
weighted avg     0.5016    0.4118    0.4321        17



In [2]:
import os
from pathlib import Path
import numpy as np
import pandas as pd
import openslide
from skimage import color, filters, morphology, measure, exposure
from skimage.color import hed_from_rgb

import torch
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as T
from PIL import Image
from collections import Counter

from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix

# ===== Root directory =====
ROOT = r"C:\Users\mxjli\Desktop\image"
REQUIRE_MPP = True  # when True, slides without usable MPP metadata are skipped

# ===== CNN feature extractor =====
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# `pretrained=True` is deprecated since torchvision 0.13. Prefer the explicit
# weights enum (IMAGENET1K_V1 is what `pretrained=True` resolves to) and fall
# back to the legacy flag only on older releases that lack the enum.
try:
    resnet = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)
except AttributeError:
    resnet = models.resnet18(pretrained=True)
resnet.fc = nn.Identity()  # drop the final classification layer
resnet = resnet.to(device).eval()

# Preprocessing applied to every PIL crop before it reaches the network
transform_cnn = T.Compose([
    T.Resize((224, 224)),
    T.ToTensor(),
    T.Normalize(mean=[0.485, 0.456, 0.406],
                std=[0.229, 0.224, 0.225])
])

def extract_cnn_feature(img):
    """Embed one image (PIL / ndarray / Tensor) with the module-level ``resnet``.

    Arrays and tensors are converted to PIL first, and any non-RGB image is
    converted to RGB before ``transform_cnn`` runs. Returns the flattened
    network output as a NumPy array (512 values per the file's own notes).
    """
    if isinstance(img, torch.Tensor):
        img = T.ToPILImage()(img)
    elif isinstance(img, np.ndarray):
        img = Image.fromarray(img)

    if getattr(img, "mode", None) != "RGB":
        img = img.convert("RGB")

    batch = transform_cnn(img).unsqueeze(0).to(device)
    with torch.no_grad():  # inference only
        embedding = resnet(batch)
    return embedding.cpu().numpy().flatten()  # [512]

# ---------- Top-K 病灶裁剪 + 池化 ----------
def extract_topk_crops(pil_img, nuc_mask, k=3, pad_ratio=0.12, out_size=224):
    """Crop the ``k`` largest connected components of ``nuc_mask``.

    Each component's bounding box is enlarged by ``pad_ratio`` of its width
    and height on every side, clamped to the image, cropped and resized to
    ``out_size`` x ``out_size``.

    Returns:
        A list of crops, largest component first. When no component yields a
        crop, the list holds the whole image resized instead.
    """
    regions = measure.regionprops(measure.label(nuc_mask))
    regions.sort(key=lambda r: r.area, reverse=True)

    img_w, img_h = pil_img.width, pil_img.height
    target = (out_size, out_size)
    crops = []
    for region in regions[:k]:
        top, left, bottom, right = region.bbox  # (min_row, min_col, max_row, max_col)
        box_h, box_w = bottom - top, right - left
        if box_h <= 0 or box_w <= 0:
            continue
        pad_x, pad_y = int(pad_ratio * box_w), int(pad_ratio * box_h)
        box = (
            max(left - pad_x, 0),
            max(top - pad_y, 0),
            min(right + pad_x, img_w),
            min(bottom + pad_y, img_h),
        )
        crops.append(pil_img.crop(box).resize(target))

    # Fallback: no component found, use the whole image.
    return crops or [pil_img.resize(target)]

def cnn_features_pooled(crops):
    """Pool per-crop embeddings into one vector: element-wise mean, then max.

    The result is the mean vector concatenated with the max vector, so it is
    twice as long as a single embedding ([1024] for 512-d embeddings).
    """
    stacked = np.stack([extract_cnn_feature(crop) for crop in crops], axis=0)  # [k, 512]
    pooled_mean = stacked.mean(axis=0)
    pooled_max = stacked.max(axis=0)
    return np.concatenate((pooled_mean, pooled_max), axis=0)  # [1024]
# ------------------------------------------

def read_thumbnail_and_geometry(svs_path, min_dim=3000):
    """Read a down-scaled overview of a slide together with its geometry.

    Args:
        svs_path: Path of a slide file that OpenSlide can open.
        min_dim: Requested length, in pixels, of the thumbnail's longer side.

    Returns:
        ``(rgb, scale, mpp_x, mpp_y)``: the thumbnail as a NumPy array, the
        level-0 long side divided by ``min_dim``, and the ``openslide.mpp-x`` /
        ``openslide.mpp-y`` properties as floats (``None`` when absent or empty).
    """
    slide = openslide.OpenSlide(svs_path)
    try:
        W0, H0 = slide.dimensions
        scale = max(W0, H0) / float(min_dim)
        # Clamp to 1 px so a very elongated slide cannot request a zero-sized side.
        new_w = max(int(W0 / scale), 1)
        new_h = max(int(H0 / scale), 1)
        img = slide.get_thumbnail((new_w, new_h))
        mpp_x = slide.properties.get("openslide.mpp-x", None)
        mpp_y = slide.properties.get("openslide.mpp-y", None)
    finally:
        # Release the slide handle even when reading the thumbnail fails.
        slide.close()
    mpp_x = float(mpp_x) if mpp_x not in (None, "") else None
    mpp_y = float(mpp_y) if mpp_y not in (None, "") else None
    return np.asarray(img), scale, mpp_x, mpp_y

def tissue_mask(rgb):
    """Return a boolean mask of pixels darker than the bright background.

    The HSV value channel is thresholded slightly below its Otsu threshold
    (factor 0.98); holes and objects are then cleaned up with a size
    argument of 256.
    """
    value = color.rgb2hsv(rgb)[..., 2]
    cutoff = filters.threshold_otsu(value) * 0.98
    foreground = value < cutoff
    foreground = morphology.remove_small_holes(foreground, 256)
    return morphology.remove_small_objects(foreground, 256)

def he_nuclei_mask(rgb, tissue):
    """Segment strongly hematoxylin-stained pixels inside the tissue mask.

    Args:
        rgb: RGB image; it is divided by 255 and clipped to [0, 1].
        tissue: Boolean mask selecting the pixels to analyse.

    Returns:
        ``(nuc, H)``: the cleaned boolean mask and the rescaled hematoxylin
        channel it was thresholded from.
    """
    stains = color.separate_stains(np.clip(rgb / 255.0, 0, 1), hed_from_rgb)
    hema = stains[..., 0]

    # Stretch using the 2nd/98th percentiles measured inside the tissue only.
    lo, hi = (np.percentile(hema[tissue], q) for q in (2, 98))
    hema = exposure.rescale_intensity(hema, in_range=(lo, hi))

    inside = hema[tissue]
    nuclei = np.zeros_like(hema, dtype=bool)
    nuclei[tissue] = inside > filters.threshold_otsu(inside)

    nuclei = morphology.remove_small_objects(nuclei, 64)
    nuclei = morphology.binary_opening(nuclei, morphology.disk(2))
    return nuclei, hema

def wsi_features(svs_path):
    """Compute connected-component statistics and pooled CNN features for a slide.

    Returns:
        ``None`` when ``REQUIRE_MPP`` is set and the slide has no positive MPP
        in both axes, or when the tissue mask has fewer than 5000 pixels.
        Otherwise ``(feats, cnn_feat)``: ``feats`` holds ``tumor_frac``,
        ``largest_cc_frac``, ``cc_count``, ``cc_small_frac`` and ``frag_ratio``;
        ``cnn_feat`` is the pooled embedding of the top-3 component crops.
    """
    rgb, _scale, mpp_x, mpp_y = read_thumbnail_and_geometry(svs_path)
    mpp_ok = all(v is not None and v > 0 for v in (mpp_x, mpp_y))
    if REQUIRE_MPP and not mpp_ok:
        return None

    tissue = tissue_mask(rgb)
    tissue_px = int(tissue.sum())
    if tissue_px < 5000:
        return None
    nuclei, _hema = he_nuclei_mask(rgb, tissue)

    # Hand-crafted statistics over the connected components of the nuclei mask.
    regions = measure.regionprops(measure.label(nuclei))
    n_regions = len(regions)
    if n_regions:
        areas = np.array([r.area for r in regions], dtype=np.float32)
        perims = np.array([r.perimeter for r in regions], dtype=np.float32)
        largest_px = int(areas.max())
        # "Small" means below 0.1% of the tissue area.
        n_small = int((areas < 0.001 * tissue_px).sum())
        frag = float(perims.sum() / (areas.sum() + 1e-6))
    else:
        largest_px, n_small, frag = 0, 0, 0.0

    denom = max(tissue_px, 1)
    feats = {
        "tumor_frac": int(nuclei.sum()) / denom,
        "largest_cc_frac": largest_px / denom,
        "cc_count": n_regions,
        "cc_small_frac": n_small / max(n_regions, 1),
        "frag_ratio": frag,
    }

    # CNN: top-K crops + pooling (K=3).
    crops = extract_topk_crops(Image.fromarray(rgb), nuc_mask=nuclei, k=3, pad_ratio=0.12)
    return feats, cnn_features_pooled(crops)  # [1024]

def collect_rows(root):
    """Build the four-class feature table for every slide found under ``root``.

    Looks for folders named ``stage {1..4} {train|test}`` directly under
    ``root`` and processes their ``*.svs`` and ``*.tif`` files. The label is
    ``stage - 1`` (1->0, 2->1, 3->2, 4->3). Slides for which ``wsi_features``
    returns ``None`` are skipped.

    Returns:
        A DataFrame with ``path``, ``label``, ``split``, the hand-crafted
        features and one ``cnn_{i}`` column per embedding value; empty when no
        slide was usable.
    """
    rows = []
    for stage in (1, 2, 3, 4):
        label = stage - 1
        for split in ("train", "test"):
            d = Path(root) / f"stage {stage} {split}"
            if not d.exists():
                continue
            # Directory listing order depends on the filesystem; sorting keeps
            # the row order (and so the CSV and the fitted model) reproducible.
            slide_paths = sorted(d.glob("*.svs")) + sorted(d.glob("*.tif"))
            for p in slide_paths:
                res = wsi_features(str(p))
                if res is None:
                    continue
                feats, cnn_feat = res
                row = dict(path=str(p), label=label, split=split)
                row.update(feats)
                row.update((f"cnn_{i}", v) for i, v in enumerate(cnn_feat))
                rows.append(row)
    return pd.DataFrame(rows)

# ===== 主程序 =====
if __name__ == "__main__":
    # Build one feature row per slide (component statistics + pooled CNN embedding).
    df = collect_rows(ROOT)
    print("Feature rows:", df.shape)
    if df.empty:
        # Without this guard the script dies below with an opaque KeyError on "label".
        raise SystemExit(f"No usable slides found under {ROOT!r}")
    print("Class distribution (0=stage1, 1=stage2, 2=stage3, 3=stage4):\n",
          df["label"].value_counts().sort_index())
    df.to_csv("wsi_stage_features_topk_4class.csv", index=False)

    train_df = df[df["split"] == "train"].copy()
    test_df = df[df["split"] == "test"].copy()

    feature_cols = [c for c in df.columns if c.startswith("cnn_")] + [
        "tumor_frac", "largest_cc_frac", "cc_count", "cc_small_frac", "frag_ratio"
    ]

    Xtr = train_df[feature_cols].values
    ytr = train_df["label"].values
    Xte = test_df[feature_cols].values
    yte = test_df["label"].values

    # ===== Class-imbalance weights, computed for 4 classes =====
    counts = train_df["label"].value_counts()
    n_classes = 4
    class_weight_map = {c: len(train_df) / (n_classes * counts[c]) for c in counts.index}
    # Optional extra weight for the two end classes (stage1 / stage4); tunable.
    for k, mult in {0: 1.4, 3: 1.5}.items():
        if k in class_weight_map:
            class_weight_map[k] *= mult
    weights = train_df["label"].map(class_weight_map).values

    clf = HistGradientBoostingClassifier(
        max_depth=6, learning_rate=0.06, max_iter=600,
        l2_regularization=1.0, min_samples_leaf=20, random_state=0
    )
    clf.fit(Xtr, ytr, sample_weight=weights)

    # ===== Prediction with a small boost for the two end classes =====
    # predict_proba columns follow clf.classes_, which only contains labels seen
    # in training. Look columns up by label instead of assuming column i is
    # class i, and translate argmax indices back to labels.
    proba = clf.predict_proba(Xte)
    pred_boost = {0: 1.10, 3: 1.15}  # stage1 / stage4; adjust or remove as needed
    for col, cls in enumerate(clf.classes_):
        proba[:, col] *= pred_boost.get(int(cls), 1.0)
    proba = proba / np.clip(proba.sum(1, keepdims=True), 1e-12, None)
    ypr = clf.classes_[proba.argmax(axis=1)]

    print("\nConfusion matrix:\n", confusion_matrix(yte, ypr, labels=[0, 1, 2, 3]))
    # zero_division=0 reports the same numbers but suppresses undefined-metric
    # warnings when a class is never predicted.
    print("\nReport:\n", classification_report(
        yte, ypr, labels=[0, 1, 2, 3],
        target_names=["stage1", "stage2", "stage3", "stage4"], digits=4,
        zero_division=0))




Feature rows: (133, 1032)
Class distribution (0=stage1, 1=stage2, 2=stage3, 3=stage4):
 label
0    35
1    51
2    38
3     9
Name: count, dtype: int64

Confusion matrix:
 [[2 3 1 2]
 [3 4 1 0]
 [2 3 2 1]
 [0 1 0 1]]

Report:
               precision    recall  f1-score   support

      stage1     0.2857    0.2500    0.2667         8
      stage2     0.3636    0.5000    0.4211         8
      stage3     0.5000    0.2500    0.3333         8
      stage4     0.2500    0.5000    0.3333         2

    accuracy                         0.3462        26
   macro avg     0.3498    0.3750    0.3386        26
weighted avg     0.3729    0.3462    0.3398        26



In [3]:
import os
from pathlib import Path
import numpy as np
import pandas as pd
import openslide
from skimage import color, filters, morphology, measure, exposure
from skimage.color import hed_from_rgb

import torch
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as T
from PIL import Image

from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix

# =================== Basic configuration ===================
# Root folder that holds the per-stage sub-folders scanned by collect_rows
# ("stage <n> train" / "stage <n> test").
ROOT = r"C:\Users\mxjli\Desktop\image"
# When True, wsi_features() returns None for slides lacking a positive
# microns-per-pixel value, so only slides with physical-area features are kept.
REQUIRE_MPP = True

# Run the CNN on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# ResNet-18 with pretrained weights; replacing the final fully connected layer
# with Identity makes the network output the pooled feature vector instead of
# class logits.
resnet = models.resnet18(pretrained=True)
resnet.fc = nn.Identity()
# eval() puts the network in inference mode.
resnet = resnet.to(device).eval()

# Preprocessing applied to every PIL image before it is fed to the CNN:
# resize to 224x224, convert to a tensor, then normalise each channel with the
# ImageNet statistics commonly used with torchvision's pretrained models.
transform_cnn = T.Compose([
    T.Resize((224, 224)),
    T.ToTensor(),
    T.Normalize(mean=[0.485, 0.456, 0.406],
                std=[0.229, 0.224, 0.225])
])

def extract_cnn_feature(img):
    """Embed one image with the headless ResNet and return a flat vector.

    Args:
        img: A NumPy array, a torch tensor, or a PIL image. Arrays and
            tensors are converted to PIL first; anything whose mode is not
            "RGB" is converted to RGB.

    Returns:
        A 1-D NumPy array holding the network output for the image
        (512 values for the ResNet-18 backbone configured in this file).
    """
    if isinstance(img, np.ndarray):
        pil = Image.fromarray(img)
    elif isinstance(img, torch.Tensor):
        pil = T.ToPILImage()(img)
    else:
        pil = img

    if getattr(pil, "mode", None) != "RGB":
        pil = pil.convert("RGB")

    # Add a batch dimension and move the tensor to the model's device.
    batch = transform_cnn(pil).unsqueeze(0).to(device)
    with torch.no_grad():  # inference only, no autograd bookkeeping
        embedding = resnet(batch)
    return embedding.cpu().numpy().flatten()

# ----------- Top-K lesion crops + pooling (more robust) -----------
def extract_topk_crops(pil_img, nuc_mask, k=5, pad_ratio=0.18, out_size=224):
    """Crop the k largest connected components of a mask out of an image.

    Args:
        pil_img: PIL image the crops are taken from.
        nuc_mask: Binary mask that is labelled into connected components.
        k: Maximum number of components (largest area first) to crop.
        pad_ratio: Fraction of the bounding-box width/height added as padding
            on each side, clamped to the image borders.
        out_size: Side length of the square every crop is resized to.

    Returns:
        A list of PIL images of size (out_size, out_size). If no component
        yields a crop, the list holds the whole image resized instead.
    """
    img_w, img_h = pil_img.width, pil_img.height
    target = (out_size, out_size)

    regions = sorted(
        measure.regionprops(measure.label(nuc_mask)),
        key=lambda region: region.area,
        reverse=True,
    )

    crops = []
    for region in regions[:k]:
        top, left, bottom, right = region.bbox
        box_h = bottom - top
        box_w = right - left
        if box_h <= 0 or box_w <= 0:
            continue
        pad_x = int(pad_ratio * box_w)
        pad_y = int(pad_ratio * box_h)
        # PIL crop box is (left, upper, right, lower), clamped to the image.
        box = (
            max(left - pad_x, 0),
            max(top - pad_y, 0),
            min(right + pad_x, img_w),
            min(bottom + pad_y, img_h),
        )
        crops.append(pil_img.crop(box).resize(target))

    # Fallback: use the whole image when nothing was cropped.
    return crops or [pil_img.resize(target)]

def cnn_features_pooled(crops):
    """Embed every crop and pool the embeddings into one descriptor.

    Args:
        crops: Iterable of images accepted by extract_cnn_feature.

    Returns:
        A 1-D array: the element-wise mean of the per-crop embeddings
        followed by their element-wise maximum (twice the embedding length).
    """
    stacked = np.stack([extract_cnn_feature(crop) for crop in crops], axis=0)
    mean_part = stacked.mean(axis=0)
    max_part = stacked.max(axis=0)
    return np.concatenate((mean_part, max_part), axis=0)
# ---------------------------------------------------

def read_thumbnail_and_geometry(svs_path, min_dim=3000):
    """Read a downsampled RGB thumbnail plus the geometry needed for physical areas.

    Args:
        svs_path: Path of a slide that OpenSlide can open.
        min_dim: Target length, in pixels, of the thumbnail's longer side.

    Returns:
        A tuple (rgb, scale, mpp_x, mpp_y):
            rgb: The thumbnail as a NumPy array.
            scale: Level-0 pixels per thumbnail pixel along one axis, never
                less than 1.0.
            mpp_x, mpp_y: The "openslide.mpp-x" / "openslide.mpp-y" properties
                as floats, or None when a property is missing or empty.
    """
    slide = openslide.OpenSlide(svs_path)
    try:
        W0, H0 = slide.dimensions
        # A thumbnail is never upsampled, so a slide smaller than min_dim
        # comes back at native size. Clamp the factor to 1.0 so areas derived
        # from it are not underestimated.
        scale = max(max(W0, H0) / float(min_dim), 1.0)
        # Keep both target dimensions at least 1 px for extreme aspect ratios.
        new_w = max(int(W0 / scale), 1)
        new_h = max(int(H0 / scale), 1)
        img = slide.get_thumbnail((new_w, new_h))
        mpp_x = slide.properties.get("openslide.mpp-x", None)
        mpp_y = slide.properties.get("openslide.mpp-y", None)
    finally:
        # Release the slide handle even if reading fails.
        slide.close()
    mpp_x = float(mpp_x) if mpp_x not in (None, "") else None
    mpp_y = float(mpp_y) if mpp_y not in (None, "") else None
    return np.asarray(img), scale, mpp_x, mpp_y

def tissue_mask(rgb):
    """Build a binary tissue mask by thresholding the HSV value channel.

    Pixels darker than 98% of the Otsu threshold of the V channel are kept;
    holes and objects below the 256 size threshold are then removed.

    Args:
        rgb: RGB image array.

    Returns:
        Boolean mask with the same height and width as the input.
    """
    value = color.rgb2hsv(rgb)[..., 2]
    cutoff = filters.threshold_otsu(value) * 0.98
    foreground = value < cutoff
    foreground = morphology.remove_small_holes(foreground, 256)
    return morphology.remove_small_objects(foreground, 256)

def he_nuclei_mask(rgb, tissue):
    """Segment strongly hematoxylin-stained pixels inside the tissue mask.

    Args:
        rgb: RGB image array with values on a 0-255 scale.
        tissue: Boolean mask selecting the pixels to analyse.

    Returns:
        A tuple (nuc, H): the cleaned boolean mask of above-threshold pixels
        and the rescaled hematoxylin channel it was derived from.
    """
    # Stain separation is run on the image scaled into [0, 1].
    stains = color.separate_stains(np.clip(rgb / 255.0, 0, 1), hed_from_rgb)
    hema = stains[..., 0]

    # Stretch the channel between the 2nd and 98th percentile of tissue pixels.
    low = np.percentile(hema[tissue], 2)
    high = np.percentile(hema[tissue], 98)
    hema = exposure.rescale_intensity(hema, in_range=(low, high))

    # Otsu threshold computed on, and applied to, tissue pixels only.
    in_tissue = hema[tissue]
    cutoff = filters.threshold_otsu(in_tissue)
    nuclei = np.zeros_like(hema, dtype=bool)
    nuclei[tissue] = in_tissue > cutoff

    # Morphological clean-up: drop tiny objects, then open with a radius-2 disk.
    nuclei = morphology.remove_small_objects(nuclei, 64)
    nuclei = morphology.binary_opening(nuclei, morphology.disk(2))
    return nuclei, hema

def wsi_features(svs_path):
    """Compute hand-crafted and CNN features for one whole-slide image.

    The nuclei mask from he_nuclei_mask is used as the "tumor" region for all
    statistics.

    Args:
        svs_path: Path of the slide file.

    Returns:
        None when REQUIRE_MPP is set and the slide has no positive mpp values,
        or when the tissue mask covers fewer than 5000 thumbnail pixels.
        Otherwise a tuple (feats, cnn_feat): a dict of scalar features and the
        pooled CNN descriptor of the top-5 component crops.
    """
    rgb, scale, mpp_x, mpp_y = read_thumbnail_and_geometry(svs_path)
    has_mpp = all(v is not None and v > 0 for v in (mpp_x, mpp_y))
    if REQUIRE_MPP and not has_mpp:
        return None

    mask = tissue_mask(rgb)
    if mask.sum() < 5000:
        return None
    nuc, H = he_nuclei_mask(rgb, mask)

    # ------ Statistics in thumbnail pixel space ------
    tissue_px_thumb = int(mask.sum())
    tumor_px_thumb = int(nuc.sum())
    props = measure.regionprops(measure.label(nuc))
    cc_count = len(props)
    if props:
        areas = np.array([p.area for p in props], dtype=np.float32)
        perims = np.array([p.perimeter for p in props], dtype=np.float32)
    else:
        areas = np.array([])
        perims = np.array([])

    if cc_count:
        largest_cc_px = int(areas.max())
        # Components below 0.1% of the tissue area count as "small".
        cc_small = int((areas < 0.001 * tissue_px_thumb).sum())
        # Total perimeter over total area as a fragmentation measure.
        frag_ratio = float(perims.sum() / (areas.sum() + 1e-6))
    else:
        largest_cc_px = 0
        cc_small = 0
        frag_ratio = 0.0

    tissue_denom = max(tissue_px_thumb, 1)
    feats = {
        "tumor_frac": tumor_px_thumb / tissue_denom,
        "largest_cc_frac": largest_cc_px / tissue_denom,
        "cc_count": cc_count,
        "cc_small_frac": cc_small / max(cc_count, 1),
        "frag_ratio": frag_ratio,
    }

    # ------ Physical scale (mm^2 / cm^2), tied to tumor size ------
    if has_mpp:
        px_area_mm2 = (mpp_x * mpp_y) * 1e-6  # um^2 per level-0 pixel -> mm^2
        s2 = scale * scale                    # level-0 pixels per thumbnail pixel

        def to_mm2(px_count):
            return px_count * s2 * px_area_mm2

        tissue_area_mm2 = to_mm2(tissue_px_thumb)
        tumor_area_mm2 = to_mm2(tumor_px_thumb)
        largest_cc_mm2 = to_mm2(largest_cc_px)
        tissue_area_cm2 = tissue_area_mm2 / 100.0
        physical = {
            "tissue_area_mm2": tissue_area_mm2,
            "tumor_area_mm2": tumor_area_mm2,
            "largest_cc_mm2": largest_cc_mm2,
            "tissue_area_cm2": tissue_area_cm2,
            "tumor_area_cm2": tumor_area_mm2 / 100.0,
            "largest_cc_cm2": largest_cc_mm2 / 100.0,
            "cc_per_cm2": cc_count / max(tissue_area_cm2, 1e-6),
            "has_mpp": 1,
        }
    else:
        # No usable mpp: physical features are NaN and the flag is 0.
        physical = {
            "tissue_area_mm2": np.nan,
            "tumor_area_mm2": np.nan,
            "largest_cc_mm2": np.nan,
            "tissue_area_cm2": np.nan,
            "tumor_area_cm2": np.nan,
            "largest_cc_cm2": np.nan,
            "cc_per_cm2": np.nan,
            "has_mpp": 0,
        }
    feats.update(physical)

    # ------ CNN: top-K crops + pooling ------
    crops = extract_topk_crops(Image.fromarray(rgb), nuc_mask=nuc, k=5, pad_ratio=0.18)
    cnn_feat = cnn_features_pooled(crops)

    return feats, cnn_feat

def collect_rows(root):
    """Extract features for every slide under the per-stage folders of root.

    Folders are named "stage <n> <split>" for n in 1..4 and split in
    train/test; missing folders are skipped, as are slides for which
    wsi_features returns None.

    Args:
        root: Directory containing the stage folders.

    Returns:
        A DataFrame with one row per processed slide: path, label (stage - 1),
        split, the scalar features, and one "cnn_<i>" column per CNN value.
    """
    rows = []
    for stage in (1, 2, 3, 4):
        for split in ("train", "test"):
            folder = Path(root) / f"stage {stage} {split}"
            if not folder.exists():
                continue
            slide_paths = [*folder.glob("*.svs"), *folder.glob("*.tif")]
            for slide_path in slide_paths:
                result = wsi_features(str(slide_path))
                if result is None:
                    continue
                feats, cnn_feat = result
                row = {
                    "path": str(slide_path),
                    "label": stage - 1,
                    "split": split,
                    **feats,
                }
                row.update({f"cnn_{i}": v for i, v in enumerate(cnn_feat)})
                rows.append(row)
    return pd.DataFrame(rows)

# =================== Training and evaluation ===================
if __name__ == "__main__":
    # Extract features for every slide, report the class balance and cache
    # the feature table on disk.
    df = collect_rows(ROOT)
    print("Feature rows:", df.shape)
    print("Class distribution (0=stage1, 1=stage2, 2=stage3, 3=stage4):\n",
          df["label"].value_counts().sort_index())
    df.to_csv("wsi_stage_features_topk_4class_phys.csv", index=False)

    train_df = df[df.split=="train"].copy()
    test_df  = df[df.split=="test"].copy()

    # Feature columns: CNN descriptor + pixel-ratio statistics + physical areas.
    # Physical areas are NaN for slides without mpp; the histogram gradient
    # boosting classifier accepts NaN inputs.
    feature_cols = [c for c in df.columns if c.startswith("cnn_")] + [
        "tumor_frac","largest_cc_frac","cc_count","cc_small_frac","frag_ratio",
        "tumor_area_mm2","largest_cc_mm2","tissue_area_mm2","cc_per_cm2",
        "tumor_area_cm2","largest_cc_cm2","tissue_area_cm2"
    ]

    Xtr = train_df[feature_cols].values
    ytr = train_df["label"].values
    Xte = test_df[feature_cols].values
    yte = test_df["label"].values

    # ===== Class imbalance: inverse-frequency weights, then an extra boost
    # for stage 1 and stage 4 =====
    counts = train_df["label"].value_counts()
    n_classes = 4
    class_weight_map = {c: len(train_df) / (n_classes * counts[c]) for c in counts.index}
    EXTRA_BOOST = {0: 2.0, 3: 3.0}  # more aggressive: stage1 x2, stage4 x3
    for k, mult in EXTRA_BOOST.items():
        if k in class_weight_map:
            class_weight_map[k] *= mult
    weights = train_df["label"].map(class_weight_map).values

    # ===== HGB hyper-parameters (tunable; suggested ranges in the comments) =====
    clf = HistGradientBoostingClassifier(
        max_depth=7,           # 6~8
        max_iter=800,          # 600~1200
        learning_rate=0.06,    # 0.05~0.1
        min_samples_leaf=16,   # 10~40
        l2_regularization=0.5, # 0.0~1.0
        random_state=0
    )
    clf.fit(Xtr, ytr, sample_weight=weights)

    # ===== Post-hoc class bias (make stage 1 / stage 4 a bit "louder") =====
    # predict_proba columns follow clf.classes_, which only lists labels seen
    # during fit. Index the bias by clf.classes_ and map the argmax back
    # through it so predictions stay valid labels even if a stage is absent
    # from the training split.
    proba = clf.predict_proba(Xte)  # (N, len(clf.classes_))
    BIAS = np.array([1.25, 1.00, 1.00, 1.40], dtype=np.float32)
    proba = proba * BIAS[clf.classes_][None, :]
    proba = proba / np.clip(proba.sum(1, keepdims=True), 1e-12, None)
    ypr = clf.classes_[proba.argmax(1)]

    print("\nConfusion matrix:\n", confusion_matrix(yte, ypr, labels=[0,1,2,3]))
    print("\nReport:\n", classification_report(
        yte, ypr, labels=[0,1,2,3],
        target_names=["stage1","stage2","stage3","stage4"], digits=4))




Feature rows: (133, 1040)
Class distribution (0=stage1, 1=stage2, 2=stage3, 3=stage4):
 label
0    35
1    51
2    38
3     9
Name: count, dtype: int64

Confusion matrix:
 [[4 4 0 0]
 [3 3 2 0]
 [2 4 2 0]
 [0 1 0 1]]

Report:
               precision    recall  f1-score   support

      stage1     0.4444    0.5000    0.4706         8
      stage2     0.2500    0.3750    0.3000         8
      stage3     0.5000    0.2500    0.3333         8
      stage4     1.0000    0.5000    0.6667         2

    accuracy                         0.3846        26
   macro avg     0.5486    0.4062    0.4426        26
weighted avg     0.4444    0.3846    0.3910        26

