In [1]:
import openslide
import numpy as np
from PIL import Image
from skimage import color, filters, morphology
import os
import pandas as pd
from pathlib import Path
import re

# ========= Input: Stage 1 WSIs =========
# Option A: put the slides in one directory; *.svs and *.tif files are collected
SVS_DIR = r"D:\stage 1"
svs_paths = [str(p) for p in Path(SVS_DIR).glob("*.svs")]
svs_paths += [str(p) for p in Path(SVS_DIR).glob("*.tif")]
# Option B: list the slide paths by hand (when not scanning a directory)
# svs_paths = [
#     r"C:\...\patientA.svs",
#     r"C:\...\patientB.svs",
#     ...
# ]

# Output directory. Its folder name is used as the manifest label below;
# "Stage1" is only a fallback for an empty folder name.
out_dir = r"D:\stage 1切片"
os.makedirs(out_dir, exist_ok=True)
label = Path(out_dir).name.strip() or "Stage1"

# ========= Parameters =========
LOW_LEVEL = 2                   # low-resolution pyramid level used for the tissue mask & ROI
PATCH_SIZE = 256
STEP = 256
TISSUE_COV_THR = 0.60           # minimum fraction of a patch covered by the tissue mask
ROI_PAD_LOW = 8                 # ROI padding, in pixels of the low-resolution level
MIN_STD = 10.0                  # minimum grayscale standard deviation (0-255 scale)
MIN_EDGE = 0.012                # minimum edge density (0-1)
EDGE_THR = 8                    # gradient threshold passed to edge_density_u8
MIN_RANGE = 14.0                # minimum dynamic range (max - min gray value)

def edge_density_u8(gray_u8: np.ndarray, thr: int = 8) -> float:
    """Return the fraction of pixels whose gradient strength exceeds ``thr``.

    The strength of a pixel is the absolute difference to its right-hand
    neighbour plus the absolute difference to the pixel above it; the last
    column is dropped, so the score is taken over an ``(h, w - 1)`` grid.

    Args:
        gray_u8: 2-D grayscale image.
        thr: Strength a pixel must exceed to count as an edge.

    Returns:
        Share of edge pixels, between 0.0 and 1.0.
    """
    # Work in int16 so that uint8 subtraction cannot wrap around.
    signed = gray_u8.astype(np.int16)
    rows, cols = gray_u8.shape

    strength = np.zeros((rows, cols - 1), dtype=np.int16)
    strength += np.abs(np.diff(signed, axis=1))        # horizontal differences
    vertical = np.abs(np.diff(signed, axis=0))         # shape (rows - 1, cols)
    strength[1:, :] += vertical[:, :cols - 1]          # first row has no pixel above
    return float(np.mean(strength > thr))

def gen_positions(start, end, patch, step):
    """Return the patch origins that tile ``[start, end)`` along one axis.

    Origins advance from ``start`` in increments of ``step``. If the regular
    grid does not already end at ``end - patch``, that origin is appended so
    the final patch sits flush with ``end``.

    Args:
        start: First admissible origin.
        end: Exclusive end of the span to cover.
        patch: Patch edge length.
        step: Stride between consecutive origins (must be positive).

    Returns:
        Ascending list of integer origins; never empty for a positive step.
    """
    # Clamp the last origin to ``start``: for a span shorter than one patch the
    # unclamped ``end - patch`` lies before ``start`` (possibly negative).
    last = max(end - patch, start)
    xs = list(range(start, last + 1, step))
    if xs[-1] != last:
        xs.append(last)
    return xs

records = []          # one manifest row per saved patch
total_patches = 0     # patches saved across all slides

for svs_path in svs_paths:
    try:
        slide = openslide.OpenSlide(svs_path)
    except Exception as e:
        # One unreadable slide must not abort the whole batch: report and move on.
        print(f"❌ 无法打开：{svs_path}，原因：{e}")
        continue

    # try/finally releases the slide handle on every exit path, including the
    # early `continue`s and any exception raised while reading or saving.
    try:
        W0, H0 = slide.dimensions
        lvl = min(LOW_LEVEL, slide.level_count - 1)  # fall back to the coarsest level available
        low_w, low_h = slide.level_dimensions[lvl]
        down = float(slide.level_downsamples[lvl])   # keep as float to avoid integer-division error

        # === Step 1: tissue mask + ROI on the low-resolution level ===
        low_img = slide.read_region((0, 0), lvl, (low_w, low_h)).convert("RGB")
        low_np = np.array(low_img)

        # Tissue = darker than the Otsu threshold OR saturation above 20 (0-255 scale)
        hsv = color.rgb2hsv(low_np)
        s_ch = (hsv[..., 1] * 255).astype(np.uint8)
        gray_low = color.rgb2gray(low_np)
        th = filters.threshold_otsu(gray_low)
        mask_otsu = (gray_low < th)
        mask_sat = (s_ch > 20)
        mask = mask_otsu | mask_sat
        mask = morphology.remove_small_objects(mask, 50)
        mask = morphology.remove_small_holes(mask, 50)

        ys, xs = np.where(mask)
        if xs.size == 0 or ys.size == 0:
            print(f"❌ 无组织区域：{svs_path}")
            continue

        # Bounding box of the mask, padded by ROI_PAD_LOW and clipped to the level
        x0_low = max(0, xs.min() - ROI_PAD_LOW)
        x1_low = min(low_w, xs.max() + 1 + ROI_PAD_LOW)
        y0_low = max(0, ys.min() - ROI_PAD_LOW)
        y1_low = min(low_h, ys.max() + 1 + ROI_PAD_LOW)

        # Map the ROI back to level-0 coordinates and clip to the slide
        x0 = int(round(x0_low * down)); x1 = int(round(x1_low * down))
        y0 = int(round(y0_low * down)); y1 = int(round(y1_low * down))
        x0 = max(0, x0); y0 = max(0, y0)
        x1 = min(W0, x1); y1 = min(H0, y1)

        if x1 - x0 < PATCH_SIZE or y1 - y0 < PATCH_SIZE:
            print(f"❌ ROI 小于 patch：{svs_path}")
            continue

        xs0 = gen_positions(x0, x1, PATCH_SIZE, STEP)
        ys0 = gen_positions(y0, y1, PATCH_SIZE, STEP)

        # wsi_id = file name without its suffix
        wsi_id = Path(svs_path).stem

        # Footprint of one patch on the low-res level; loop-invariant, so computed once
        w_low = int(round(PATCH_SIZE / down))
        h_low = int(round(PATCH_SIZE / down))

        # === Step 2: grid tiling + coverage & quality filters ===
        saved_this = 0
        for y in ys0:
            for x in xs0:
                # Locate the patch's window in the low-res mask
                x_low = int(round(x / down))
                y_low = int(round(y / down))
                x_low = min(max(0, x_low), low_w - w_low)
                y_low = min(max(0, y_low), low_h - h_low)

                mask_crop = mask[y_low:y_low + h_low, x_low:x_low + w_low]
                tissue_ratio = float(mask_crop.mean())
                if tissue_ratio < TISSUE_COV_THR:
                    continue

                patch = slide.read_region((x, y), 0, (PATCH_SIZE, PATCH_SIZE)).convert("RGB")
                gray = np.array(patch.convert("L"), dtype=np.uint8)

                # Quality filters: contrast, edge density, dynamic range
                gray_std = float(gray.std())
                if gray_std < MIN_STD:
                    continue
                if edge_density_u8(gray, thr=EDGE_THR) < MIN_EDGE:
                    continue
                dyn_range = float(gray.max() - gray.min())
                if dyn_range < MIN_RANGE:
                    continue

                # File name carries wsi_id so patches can later be grouped per WSI
                fname = f"{wsi_id}_x{x}_y{y}.png"
                fpath = os.path.join(out_dir, fname)
                patch.save(fpath)

                records.append([fpath, label, wsi_id, x, y, tissue_ratio, gray_std,
                                dyn_range])
                saved_this += 1
                total_patches += 1

        print(f"✅ {wsi_id} 切割完成：{saved_this} 个有效 patch")
    finally:
        slide.close()

# Save the manifest (utf-8-sig prepends a BOM, which helps spreadsheet tools detect UTF-8)
df = pd.DataFrame(records, columns=["patch_path", "label", "wsi_id", "x0", "y0",
                                    "tissue_cov", "std", "dyn_range"])
csv_path = os.path.join(out_dir, "patch_manifest.csv")
df.to_csv(csv_path, index=False, encoding="utf-8-sig")
# NOTE: len(svs_paths) counts every discovered file, including skipped slides.
print(f"\n总计：{len(svs_paths)} 张 WSI，{total_patches} 个有效 patch")
print(f"标签清单已保存：{csv_path}")


✅ TCGA-A1-A0SB-01Z-00-DX1.B34C267B-CAAA-4AB6-AD5C-276C26F997A1 切割完成：21579 个有效 patch
✅ TCGA-A2-A04N-01Z-00-DX1.9E9B7DB0-1CF1-4631-8292-A9DBCA0BD37C 切割完成：33140 个有效 patch
✅ TCGA-A2-A0SX-01Z-00-DX1.219A994C-8974-4458-98FA-FB1F14868E04 切割完成：29594 个有效 patch
✅ TCGA-A2-A0T3-01Z-00-DX1.5E96BC87-F4FB-4ABA-8D81-FAB7F4A80661 切割完成：31735 个有效 patch
✅ TCGA-A7-A26E-01Z-00-DX1.BA4A7E28-0563-4C23-82D0-AB34A2E79AE3 切割完成：19916 个有效 patch
✅ TCGA-AC-A5EI-01Z-00-DX1.A174D91A-730E-460C-AE73-46CA1E5B177B 切割完成：10921 个有效 patch
✅ TCGA-AO-A03M-01Z-00-DX1.9998A9A0-D0A6-48FC-80FB-AE597CB9E8AA 切割完成：15827 个有效 patch
✅ TCGA-AO-A03U-01Z-00-DX1.AE2B55F3-8BA1-4546-82B7-4D2292BE1C78 切割完成：7689 个有效 patch
✅ TCGA-AR-A1AJ-01Z-00-DX1.34B9FCF0-74D8-4328-9B5A-698AD57EDA85 切割完成：19807 个有效 patch
✅ TCGA-B6-A0IP-01Z-00-DX1.3723A250-AA4B-4AFE-A692-C1311E9BA268 切割完成：46513 个有效 patch
✅ TCGA-B6-A0RN-01Z-00-DX1.0D02A3FB-D694-4A5B-80C1-CF1469E29BFD 切割完成：41728 个有效 patch
✅ TCGA-B6-A1KI-01Z-00-DX1.EFAA08A8-02EC-4A9B-AC6B-79BF2DC5A2AA 切割完成：22068 个有效

In [3]:
# ========= 阶段2（Stage 2）WSI Patch 生成脚本 =========
# 依赖：openslide-python, numpy, pandas, scikit-image
# pip install openslide-python numpy pandas scikit-image

import os
from pathlib import Path

import numpy as np
import pandas as pd
import openslide
from skimage import color, filters, morphology

# ========= Input: put the slides in one directory; *.svs and *.tif are collected =========
SVS_DIR = r"C:\Users\mxjli\PyCharmMiscProject\阶段2"  # <-- directory with the raw stage-2 WSIs
svs_paths = [str(p) for p in Path(SVS_DIR).glob("*.svs")]
svs_paths += [str(p) for p in Path(SVS_DIR).glob("*.tif")]

# When not scanning a directory, list the paths by hand:
# svs_paths = [
#     r"C:\...\patientA.svs",
#     r"C:\...\patientB.svs",
# ]

# ========= Output directory (suggested name: Stage 2) =========
out_dir = r"C:\Users\mxjli\PyCharmMiscProject\stage2"
os.makedirs(out_dir, exist_ok=True)
label = (Path(out_dir).name.strip() or "Stage2")  # label written to the exported manifest (stage 2)

# ========= Parameters (adjust as needed) =========
LOW_LEVEL = 2            # low-resolution pyramid level used for the tissue mask & ROI
PATCH_SIZE = 256
STEP = 256
TISSUE_COV_THR = 0.60    # minimum fraction of a patch covered by the tissue mask
ROI_PAD_LOW = 8          # ROI padding, in pixels of the low-resolution level
MIN_STD = 10.0           # minimum grayscale standard deviation (0-255 scale)
MIN_EDGE = 0.012         # minimum edge density (0-1)
EDGE_THR = 8             # gradient threshold passed to edge_density_u8
MIN_RANGE = 14.0         # minimum dynamic range (max - min gray value)

def edge_density_u8(gray_u8: np.ndarray, thr: int = 8) -> float:
    """Return the fraction of pixels whose gradient strength exceeds ``thr``.

    The strength of a pixel is the absolute difference to its right-hand
    neighbour plus the absolute difference to the pixel above it; the last
    column is dropped, so the score is taken over an ``(h, w - 1)`` grid.

    Args:
        gray_u8: 2-D grayscale image.
        thr: Strength a pixel must exceed to count as an edge.

    Returns:
        Share of edge pixels, between 0.0 and 1.0.
    """
    # Work in int16 so that uint8 subtraction cannot wrap around.
    signed = gray_u8.astype(np.int16)
    rows, cols = gray_u8.shape

    strength = np.zeros((rows, cols - 1), dtype=np.int16)
    strength += np.abs(np.diff(signed, axis=1))        # horizontal differences
    vertical = np.abs(np.diff(signed, axis=0))         # shape (rows - 1, cols)
    strength[1:, :] += vertical[:, :cols - 1]          # first row has no pixel above
    return float(np.mean(strength > thr))

def gen_positions(start, end, patch, step):
    """Return the patch origins that tile ``[start, end)`` along one axis.

    Origins advance from ``start`` in increments of ``step``. If the regular
    grid does not already end at ``end - patch``, that origin is appended so
    the final patch sits flush with ``end``.

    Args:
        start: First admissible origin.
        end: Exclusive end of the span to cover.
        patch: Patch edge length.
        step: Stride between consecutive origins (must be positive).

    Returns:
        Ascending list of integer origins; never empty for a positive step.
    """
    # Clamp the last origin to ``start``: for a span shorter than one patch the
    # unclamped ``end - patch`` lies before ``start`` (possibly negative).
    last = max(end - patch, start)
    xs = list(range(start, last + 1, step))
    if xs[-1] != last:
        xs.append(last)
    return xs

records = []          # one manifest row per saved patch
total_patches = 0     # patches saved across all slides

for svs_path in svs_paths:
    try:
        slide = openslide.OpenSlide(svs_path)
    except Exception as e:
        # Best effort: report the unreadable slide and keep processing the rest.
        print(f"❌ 无法打开：{svs_path}，原因：{e}")
        continue

    # try/finally releases the slide handle on every exit path, including the
    # early `continue`s and any exception raised while reading or saving.
    try:
        W0, H0 = slide.dimensions
        lvl = min(LOW_LEVEL, slide.level_count - 1)  # fall back to the coarsest level available
        low_w, low_h = slide.level_dimensions[lvl]
        down = float(slide.level_downsamples[lvl])   # keep as float to avoid integer-division error

        # === Step 1: tissue mask + ROI on the low-resolution level ===
        low_img = slide.read_region((0, 0), lvl, (low_w, low_h)).convert("RGB")
        low_np = np.array(low_img)

        # Tissue = darker than the Otsu threshold OR saturation above 20 (0-255 scale)
        hsv = color.rgb2hsv(low_np)
        s_ch = (hsv[..., 1] * 255).astype(np.uint8)
        gray_low = color.rgb2gray(low_np)
        th = filters.threshold_otsu(gray_low)
        mask_otsu = (gray_low < th)
        mask_sat = (s_ch > 20)
        mask = mask_otsu | mask_sat
        mask = morphology.remove_small_objects(mask, 50)
        mask = morphology.remove_small_holes(mask, 50)

        ys, xs = np.where(mask)
        if xs.size == 0 or ys.size == 0:
            print(f"❌ 无组织区域：{svs_path}")
            continue

        # Bounding box of the mask, padded by ROI_PAD_LOW and clipped to the level
        x0_low = max(0, xs.min() - ROI_PAD_LOW)
        x1_low = min(low_w, xs.max() + 1 + ROI_PAD_LOW)
        y0_low = max(0, ys.min() - ROI_PAD_LOW)
        y1_low = min(low_h, ys.max() + 1 + ROI_PAD_LOW)

        # Map the ROI back to level-0 coordinates and clip to the slide
        x0 = int(round(x0_low * down)); x1 = int(round(x1_low * down))
        y0 = int(round(y0_low * down)); y1 = int(round(y1_low * down))
        x0 = max(0, x0); y0 = max(0, y0)
        x1 = min(W0, x1); y1 = min(H0, y1)

        if x1 - x0 < PATCH_SIZE or y1 - y0 < PATCH_SIZE:
            print(f"❌ ROI 小于 patch：{svs_path}")
            continue

        xs0 = gen_positions(x0, x1, PATCH_SIZE, STEP)
        ys0 = gen_positions(y0, y1, PATCH_SIZE, STEP)

        # wsi_id = file name without its suffix
        wsi_id = Path(svs_path).stem

        # Footprint of one patch on the low-res level; loop-invariant, so computed once
        w_low = int(round(PATCH_SIZE / down))
        h_low = int(round(PATCH_SIZE / down))

        # === Step 2: grid tiling + coverage & quality filters ===
        saved_this = 0
        for y in ys0:
            for x in xs0:
                # Locate the patch's window in the low-res mask
                x_low = int(round(x / down))
                y_low = int(round(y / down))
                x_low = min(max(0, x_low), low_w - w_low)
                y_low = min(max(0, y_low), low_h - h_low)

                mask_crop = mask[y_low:y_low + h_low, x_low:x_low + w_low]
                tissue_ratio = float(mask_crop.mean())
                if tissue_ratio < TISSUE_COV_THR:
                    continue

                patch = slide.read_region((x, y), 0, (PATCH_SIZE, PATCH_SIZE)).convert("RGB")
                gray = np.array(patch.convert("L"), dtype=np.uint8)

                # Quality filters: contrast, edge density, dynamic range
                gray_std = float(gray.std())
                if gray_std < MIN_STD:
                    continue
                if edge_density_u8(gray, thr=EDGE_THR) < MIN_EDGE:
                    continue
                dyn_range = float(gray.max() - gray.min())
                if dyn_range < MIN_RANGE:
                    continue

                # File name carries wsi_id so patches can later be grouped per WSI
                fname = f"{wsi_id}_x{x}_y{y}.png"
                fpath = os.path.join(out_dir, fname)
                patch.save(fpath)

                records.append([fpath, label, wsi_id, x, y, tissue_ratio, gray_std,
                                dyn_range])
                saved_this += 1
                total_patches += 1

        print(f"✅ {wsi_id}（阶段2）切割完成：{saved_this} 个有效 patch")
    finally:
        slide.close()

# Save the manifest (utf-8-sig prepends a BOM, which helps spreadsheet tools detect UTF-8)
df = pd.DataFrame(records, columns=["patch_path", "label", "wsi_id", "x0", "y0",
                                    "tissue_cov", "std", "dyn_range"])
csv_path = os.path.join(out_dir, "patch_manifest.csv")
df.to_csv(csv_path, index=False, encoding="utf-8-sig")
# NOTE: len(svs_paths) counts every discovered file, including skipped slides.
print(f"\n总计：{len(svs_paths)} 张 WSI，{total_patches} 个有效 patch")
print(f"阶段2标签清单已保存：{csv_path}")


✅ TCGA-AO-A1KO-01Z-00-222（阶段2）切割完成：15113 个有效 patch
✅ TCGA-C8-A8HP-01Z-00-222222（阶段2）切割完成：23255 个有效 patch
✅ TCGA-E9-A1NH-01Z-00-2（阶段2）切割完成：20835 个有效 patch
✅ TCGA-E9-A227-01Z-00-22（阶段2）切割完成：17992 个有效 patch
❌ 无法打开：C:\Users\mxjli\PyCharmMiscProject\阶段2\TCGA-E9-A3QA-01Z-00-2222.svs，原因：Unsupported or missing image file

总计：5 张 WSI，77195 个有效 patch
阶段2标签清单已保存：C:\Users\mxjli\PyCharmMiscProject\stage2\patch_manifest.csv


In [4]:
# ========= 阶段3（Stage 3）WSI Patch 生成脚本 =========
# 依赖：openslide-python, numpy, pandas, scikit-image
# pip install openslide-python numpy pandas scikit-image

import os
from pathlib import Path

import numpy as np
import pandas as pd
import openslide
from skimage import color, filters, morphology

# ========= Input: put the slides in one directory; *.svs and *.tif are collected =========
SVS_DIR = r"C:\Users\mxjli\PyCharmMiscProject\阶段3"  # <-- directory with the raw stage-3 WSIs
svs_paths = [str(p) for p in Path(SVS_DIR).glob("*.svs")]
svs_paths += [str(p) for p in Path(SVS_DIR).glob("*.tif")]

# When not scanning a directory, list the paths by hand:
# svs_paths = [
#     r"C:\...\patientA.svs",
#     r"C:\...\patientB.svs",
# ]

# ========= Output directory (suggested name: Stage 3) =========
out_dir = r"C:\Users\mxjli\PyCharmMiscProject\stage3"
os.makedirs(out_dir, exist_ok=True)
label = (Path(out_dir).name.strip() or "Stage3")  # label written to the exported manifest (stage 3)

# ========= Parameters (adjust as needed) =========
LOW_LEVEL = 2            # low-resolution pyramid level used for the tissue mask & ROI
PATCH_SIZE = 256
STEP = 256
TISSUE_COV_THR = 0.60    # minimum fraction of a patch covered by the tissue mask
ROI_PAD_LOW = 8          # ROI padding, in pixels of the low-resolution level
MIN_STD = 10.0           # minimum grayscale standard deviation (0-255 scale)
MIN_EDGE = 0.012         # minimum edge density (0-1)
EDGE_THR = 8             # gradient threshold passed to edge_density_u8
MIN_RANGE = 14.0         # minimum dynamic range (max - min gray value)

def edge_density_u8(gray_u8: np.ndarray, thr: int = 8) -> float:
    """Return the fraction of pixels whose gradient strength exceeds ``thr``.

    The strength of a pixel is the absolute difference to its right-hand
    neighbour plus the absolute difference to the pixel above it; the last
    column is dropped, so the score is taken over an ``(h, w - 1)`` grid.

    Args:
        gray_u8: 2-D grayscale image.
        thr: Strength a pixel must exceed to count as an edge.

    Returns:
        Share of edge pixels, between 0.0 and 1.0.
    """
    # Work in int16 so that uint8 subtraction cannot wrap around.
    signed = gray_u8.astype(np.int16)
    rows, cols = gray_u8.shape

    strength = np.zeros((rows, cols - 1), dtype=np.int16)
    strength += np.abs(np.diff(signed, axis=1))        # horizontal differences
    vertical = np.abs(np.diff(signed, axis=0))         # shape (rows - 1, cols)
    strength[1:, :] += vertical[:, :cols - 1]          # first row has no pixel above
    return float(np.mean(strength > thr))

def gen_positions(start, end, patch, step):
    """Return the patch origins that tile ``[start, end)`` along one axis.

    Origins advance from ``start`` in increments of ``step``. If the regular
    grid does not already end at ``end - patch``, that origin is appended so
    the final patch sits flush with ``end``.

    Args:
        start: First admissible origin.
        end: Exclusive end of the span to cover.
        patch: Patch edge length.
        step: Stride between consecutive origins (must be positive).

    Returns:
        Ascending list of integer origins; never empty for a positive step.
    """
    # Clamp the last origin to ``start``: for a span shorter than one patch the
    # unclamped ``end - patch`` lies before ``start`` (possibly negative).
    last = max(end - patch, start)
    xs = list(range(start, last + 1, step))
    if xs[-1] != last:
        xs.append(last)
    return xs

records = []          # one manifest row per saved patch
total_patches = 0     # patches saved across all slides

for svs_path in svs_paths:
    try:
        slide = openslide.OpenSlide(svs_path)
    except Exception as e:
        # Best effort: report the unreadable slide and keep processing the rest.
        print(f"❌ 无法打开：{svs_path}，原因：{e}")
        continue

    # try/finally releases the slide handle on every exit path, including the
    # early `continue`s and any exception raised while reading or saving.
    try:
        W0, H0 = slide.dimensions
        lvl = min(LOW_LEVEL, slide.level_count - 1)  # fall back to the coarsest level available
        low_w, low_h = slide.level_dimensions[lvl]
        down = float(slide.level_downsamples[lvl])   # keep as float to avoid integer-division error

        # === Step 1: tissue mask + ROI on the low-resolution level ===
        low_img = slide.read_region((0, 0), lvl, (low_w, low_h)).convert("RGB")
        low_np = np.array(low_img)

        # Tissue = darker than the Otsu threshold OR saturation above 20 (0-255 scale)
        hsv = color.rgb2hsv(low_np)
        s_ch = (hsv[..., 1] * 255).astype(np.uint8)
        gray_low = color.rgb2gray(low_np)
        th = filters.threshold_otsu(gray_low)
        mask_otsu = (gray_low < th)
        mask_sat = (s_ch > 20)
        mask = mask_otsu | mask_sat
        mask = morphology.remove_small_objects(mask, 50)
        mask = morphology.remove_small_holes(mask, 50)

        ys, xs = np.where(mask)
        if xs.size == 0 or ys.size == 0:
            print(f"❌ 无组织区域：{svs_path}")
            continue

        # Bounding box of the mask, padded by ROI_PAD_LOW and clipped to the level
        x0_low = max(0, xs.min() - ROI_PAD_LOW)
        x1_low = min(low_w, xs.max() + 1 + ROI_PAD_LOW)
        y0_low = max(0, ys.min() - ROI_PAD_LOW)
        y1_low = min(low_h, ys.max() + 1 + ROI_PAD_LOW)

        # Map the ROI back to level-0 coordinates and clip to the slide
        x0 = int(round(x0_low * down)); x1 = int(round(x1_low * down))
        y0 = int(round(y0_low * down)); y1 = int(round(y1_low * down))
        x0 = max(0, x0); y0 = max(0, y0)
        x1 = min(W0, x1); y1 = min(H0, y1)

        if x1 - x0 < PATCH_SIZE or y1 - y0 < PATCH_SIZE:
            print(f"❌ ROI 小于 patch：{svs_path}")
            continue

        xs0 = gen_positions(x0, x1, PATCH_SIZE, STEP)
        ys0 = gen_positions(y0, y1, PATCH_SIZE, STEP)

        # wsi_id = file name without its suffix
        wsi_id = Path(svs_path).stem

        # Footprint of one patch on the low-res level; loop-invariant, so computed once
        w_low = int(round(PATCH_SIZE / down))
        h_low = int(round(PATCH_SIZE / down))

        # === Step 2: grid tiling + coverage & quality filters ===
        saved_this = 0
        for y in ys0:
            for x in xs0:
                # Locate the patch's window in the low-res mask
                x_low = int(round(x / down))
                y_low = int(round(y / down))
                x_low = min(max(0, x_low), low_w - w_low)
                y_low = min(max(0, y_low), low_h - h_low)

                mask_crop = mask[y_low:y_low + h_low, x_low:x_low + w_low]
                tissue_ratio = float(mask_crop.mean())
                if tissue_ratio < TISSUE_COV_THR:
                    continue

                patch = slide.read_region((x, y), 0, (PATCH_SIZE, PATCH_SIZE)).convert("RGB")
                gray = np.array(patch.convert("L"), dtype=np.uint8)

                # Quality filters: contrast, edge density, dynamic range
                gray_std = float(gray.std())
                if gray_std < MIN_STD:
                    continue
                if edge_density_u8(gray, thr=EDGE_THR) < MIN_EDGE:
                    continue
                dyn_range = float(gray.max() - gray.min())
                if dyn_range < MIN_RANGE:
                    continue

                # File name carries wsi_id so patches can later be grouped per WSI
                fname = f"{wsi_id}_x{x}_y{y}.png"
                fpath = os.path.join(out_dir, fname)
                patch.save(fpath)

                records.append([fpath, label, wsi_id, x, y, tissue_ratio, gray_std,
                                dyn_range])
                saved_this += 1
                total_patches += 1

        print(f"✅ {wsi_id}（阶段3）切割完成：{saved_this} 个有效 patch")
    finally:
        slide.close()

# Save the manifest (utf-8-sig prepends a BOM, which helps spreadsheet tools detect UTF-8)
df = pd.DataFrame(records, columns=["patch_path", "label", "wsi_id", "x0", "y0",
                                    "tissue_cov", "std", "dyn_range"])
csv_path = os.path.join(out_dir, "patch_manifest.csv")
df.to_csv(csv_path, index=False, encoding="utf-8-sig")
# NOTE: len(svs_paths) counts every discovered file, including skipped slides.
print(f"\n总计：{len(svs_paths)} 张 WSI，{total_patches} 个有效 patch")
print(f"阶段3标签清单已保存：{csv_path}")


✅ TCGA-A2-A0YG-01Z-00-333（阶段3）切割完成：26222 个有效 patch
✅ TCGA-A8-A08X-01Z-00-3（阶段3）切割完成：8181 个有效 patch
✅ TCGA-AR-A1AR-01Z-00-33（阶段3）切割完成：21983 个有效 patch

总计：3 张 WSI，56386 个有效 patch
阶段3标签清单已保存：C:\Users\mxjli\PyCharmMiscProject\stage3\patch_manifest.csv


In [None]:
# ========= 阶段3（Stage 3）WSI Patch 生成脚本 =========
# 依赖：openslide-python, numpy, pandas, scikit-image
# pip install openslide-python numpy pandas scikit-image

import os
from pathlib import Path

import numpy as np
import pandas as pd
import openslide
from skimage import color, filters, morphology

# ========= Input: put the slides in one directory; *.svs and *.tif are collected =========
SVS_DIR = r"C:\Users\mxjli\PyCharmMiscProject\阶段3"  # <-- directory with the raw stage-3 WSIs
svs_paths = [str(p) for p in Path(SVS_DIR).glob("*.svs")]
svs_paths += [str(p) for p in Path(SVS_DIR).glob("*.tif")]

# When not scanning a directory, list the paths by hand:
# svs_paths = [
#     r"C:\...\patientA.svs",
#     r"C:\...\patientB.svs",
# ]

# ========= Output directory (suggested name: Stage 3) =========
out_dir = r"C:\Users\mxjli\PyCharmMiscProject\stage3"
os.makedirs(out_dir, exist_ok=True)
label = (Path(out_dir).name.strip() or "Stage3")  # label written to the exported manifest (stage 3)

# ========= Parameters (adjust as needed) =========
LOW_LEVEL = 2            # low-resolution pyramid level used for the tissue mask & ROI
PATCH_SIZE = 256
STEP = 256
TISSUE_COV_THR = 0.60    # minimum fraction of a patch covered by the tissue mask
ROI_PAD_LOW = 8          # ROI padding, in pixels of the low-resolution level
MIN_STD = 10.0           # minimum grayscale standard deviation (0-255 scale)
MIN_EDGE = 0.012         # minimum edge density (0-1)
EDGE_THR = 8             # gradient threshold passed to edge_density_u8
MIN_RANGE = 14.0         # minimum dynamic range (max - min gray value)

def edge_density_u8(gray_u8: np.ndarray, thr: int = 8) -> float:
    """Return the fraction of pixels whose gradient strength exceeds ``thr``.

    The strength of a pixel is the absolute difference to its right-hand
    neighbour plus the absolute difference to the pixel above it; the last
    column is dropped, so the score is taken over an ``(h, w - 1)`` grid.

    Args:
        gray_u8: 2-D grayscale image.
        thr: Strength a pixel must exceed to count as an edge.

    Returns:
        Share of edge pixels, between 0.0 and 1.0.
    """
    # Work in int16 so that uint8 subtraction cannot wrap around.
    signed = gray_u8.astype(np.int16)
    rows, cols = gray_u8.shape

    strength = np.zeros((rows, cols - 1), dtype=np.int16)
    strength += np.abs(np.diff(signed, axis=1))        # horizontal differences
    vertical = np.abs(np.diff(signed, axis=0))         # shape (rows - 1, cols)
    strength[1:, :] += vertical[:, :cols - 1]          # first row has no pixel above
    return float(np.mean(strength > thr))

def gen_positions(start, end, patch, step):
    """Return the patch origins that tile ``[start, end)`` along one axis.

    Origins advance from ``start`` in increments of ``step``. If the regular
    grid does not already end at ``end - patch``, that origin is appended so
    the final patch sits flush with ``end``.

    Args:
        start: First admissible origin.
        end: Exclusive end of the span to cover.
        patch: Patch edge length.
        step: Stride between consecutive origins (must be positive).

    Returns:
        Ascending list of integer origins; never empty for a positive step.
    """
    # Clamp the last origin to ``start``: for a span shorter than one patch the
    # unclamped ``end - patch`` lies before ``start`` (possibly negative).
    last = max(end - patch, start)
    xs = list(range(start, last + 1, step))
    if xs[-1] != last:
        xs.append(last)
    return xs

records = []          # one manifest row per saved patch
total_patches = 0     # patches saved across all slides

for svs_path in svs_paths:
    try:
        slide = openslide.OpenSlide(svs_path)
    except Exception as e:
        # Best effort: report the unreadable slide and keep processing the rest.
        print(f"❌ 无法打开：{svs_path}，原因：{e}")
        continue

    # try/finally releases the slide handle on every exit path, including the
    # early `continue`s and any exception raised while reading or saving.
    try:
        W0, H0 = slide.dimensions
        lvl = min(LOW_LEVEL, slide.level_count - 1)  # fall back to the coarsest level available
        low_w, low_h = slide.level_dimensions[lvl]
        down = float(slide.level_downsamples[lvl])   # keep as float to avoid integer-division error

        # === Step 1: tissue mask + ROI on the low-resolution level ===
        low_img = slide.read_region((0, 0), lvl, (low_w, low_h)).convert("RGB")
        low_np = np.array(low_img)

        # Tissue = darker than the Otsu threshold OR saturation above 20 (0-255 scale)
        hsv = color.rgb2hsv(low_np)
        s_ch = (hsv[..., 1] * 255).astype(np.uint8)
        gray_low = color.rgb2gray(low_np)
        th = filters.threshold_otsu(gray_low)
        mask_otsu = (gray_low < th)
        mask_sat = (s_ch > 20)
        mask = mask_otsu | mask_sat
        mask = morphology.remove_small_objects(mask, 50)
        mask = morphology.remove_small_holes(mask, 50)

        ys, xs = np.where(mask)
        if xs.size == 0 or ys.size == 0:
            print(f"❌ 无组织区域：{svs_path}")
            continue

        # Bounding box of the mask, padded by ROI_PAD_LOW and clipped to the level
        x0_low = max(0, xs.min() - ROI_PAD_LOW)
        x1_low = min(low_w, xs.max() + 1 + ROI_PAD_LOW)
        y0_low = max(0, ys.min() - ROI_PAD_LOW)
        y1_low = min(low_h, ys.max() + 1 + ROI_PAD_LOW)

        # Map the ROI back to level-0 coordinates and clip to the slide
        x0 = int(round(x0_low * down)); x1 = int(round(x1_low * down))
        y0 = int(round(y0_low * down)); y1 = int(round(y1_low * down))
        x0 = max(0, x0); y0 = max(0, y0)
        x1 = min(W0, x1); y1 = min(H0, y1)

        if x1 - x0 < PATCH_SIZE or y1 - y0 < PATCH_SIZE:
            print(f"❌ ROI 小于 patch：{svs_path}")
            continue

        xs0 = gen_positions(x0, x1, PATCH_SIZE, STEP)
        ys0 = gen_positions(y0, y1, PATCH_SIZE, STEP)

        # wsi_id = file name without its suffix
        wsi_id = Path(svs_path).stem

        # Footprint of one patch on the low-res level; loop-invariant, so computed once
        w_low = int(round(PATCH_SIZE / down))
        h_low = int(round(PATCH_SIZE / down))

        # === Step 2: grid tiling + coverage & quality filters ===
        saved_this = 0
        for y in ys0:
            for x in xs0:
                # Locate the patch's window in the low-res mask
                x_low = int(round(x / down))
                y_low = int(round(y / down))
                x_low = min(max(0, x_low), low_w - w_low)
                y_low = min(max(0, y_low), low_h - h_low)

                mask_crop = mask[y_low:y_low + h_low, x_low:x_low + w_low]
                tissue_ratio = float(mask_crop.mean())
                if tissue_ratio < TISSUE_COV_THR:
                    continue

                patch = slide.read_region((x, y), 0, (PATCH_SIZE, PATCH_SIZE)).convert("RGB")
                gray = np.array(patch.convert("L"), dtype=np.uint8)

                # Quality filters: contrast, edge density, dynamic range
                gray_std = float(gray.std())
                if gray_std < MIN_STD:
                    continue
                if edge_density_u8(gray, thr=EDGE_THR) < MIN_EDGE:
                    continue
                dyn_range = float(gray.max() - gray.min())
                if dyn_range < MIN_RANGE:
                    continue

                # File name carries wsi_id so patches can later be grouped per WSI
                fname = f"{wsi_id}_x{x}_y{y}.png"
                fpath = os.path.join(out_dir, fname)
                patch.save(fpath)

                records.append([fpath, label, wsi_id, x, y, tissue_ratio, gray_std,
                                dyn_range])
                saved_this += 1
                total_patches += 1

        print(f"✅ {wsi_id}（阶段3）切割完成：{saved_this} 个有效 patch")
    finally:
        slide.close()

# Save the manifest (utf-8-sig prepends a BOM, which helps spreadsheet tools detect UTF-8)
df = pd.DataFrame(records, columns=["patch_path", "label", "wsi_id", "x0", "y0",
                                    "tissue_cov", "std", "dyn_range"])
csv_path = os.path.join(out_dir, "patch_manifest.csv")
df.to_csv(csv_path, index=False, encoding="utf-8-sig")
# NOTE: len(svs_paths) counts every discovered file, including skipped slides.
print(f"\n总计：{len(svs_paths)} 张 WSI，{total_patches} 个有效 patch")
print(f"阶段3标签清单已保存：{csv_path}")


In [5]:
# ========= 第四阶段（Stage 4）WSI Patch 生成脚本 =========
# 依赖：openslide-python, numpy, pandas, scikit-image
# pip install openslide-python numpy pandas scikit-image

import os
from pathlib import Path

import numpy as np
import pandas as pd
import openslide
from skimage import color, filters, morphology

# ========= Input: put the slides in one directory; *.svs and *.tif are collected =========
SVS_DIR = r"C:\Users\mxjli\PyCharmMiscProject\阶段4"  # <-- directory with the raw stage-4 WSIs
svs_paths = [str(p) for p in Path(SVS_DIR).glob("*.svs")]
svs_paths += [str(p) for p in Path(SVS_DIR).glob("*.tif")]

# When not scanning a directory, list the paths by hand:
# svs_paths = [
#     r"C:\...\patientA.svs",
#     r"C:\...\patientB.svs",
# ]

# ========= Output directory (suggested name: Stage 4) =========
out_dir = r"C:\Users\mxjli\PyCharmMiscProject\stage4"
os.makedirs(out_dir, exist_ok=True)
label = (Path(out_dir).name.strip() or "Stage4")  # label written to the exported manifest (stage 4)

# ========= Parameters (adjust as needed) =========
LOW_LEVEL = 2            # low-resolution pyramid level used for the tissue mask & ROI
PATCH_SIZE = 256
STEP = 256
TISSUE_COV_THR = 0.60    # minimum fraction of a patch covered by the tissue mask
ROI_PAD_LOW = 8          # ROI padding, in pixels of the low-resolution level
MIN_STD = 10.0           # minimum grayscale standard deviation (0-255 scale)
MIN_EDGE = 0.012         # minimum edge density (0-1)
EDGE_THR = 8             # gradient threshold passed to edge_density_u8
MIN_RANGE = 14.0         # minimum dynamic range (max - min gray value)

def edge_density_u8(gray_u8: np.ndarray, thr: int = 8) -> float:
    """Return the fraction of pixels whose gradient strength exceeds ``thr``.

    The strength of a pixel is the absolute difference to its right-hand
    neighbour plus the absolute difference to the pixel above it; the last
    column is dropped, so the score is taken over an ``(h, w - 1)`` grid.

    Args:
        gray_u8: 2-D grayscale image.
        thr: Strength a pixel must exceed to count as an edge.

    Returns:
        Share of edge pixels, between 0.0 and 1.0.
    """
    # Work in int16 so that uint8 subtraction cannot wrap around.
    signed = gray_u8.astype(np.int16)
    rows, cols = gray_u8.shape

    strength = np.zeros((rows, cols - 1), dtype=np.int16)
    strength += np.abs(np.diff(signed, axis=1))        # horizontal differences
    vertical = np.abs(np.diff(signed, axis=0))         # shape (rows - 1, cols)
    strength[1:, :] += vertical[:, :cols - 1]          # first row has no pixel above
    return float(np.mean(strength > thr))

def gen_positions(start, end, patch, step):
    """Return the patch origins that tile ``[start, end)`` along one axis.

    Origins advance from ``start`` in increments of ``step``. If the regular
    grid does not already end at ``end - patch``, that origin is appended so
    the final patch sits flush with ``end``.

    Args:
        start: First admissible origin.
        end: Exclusive end of the span to cover.
        patch: Patch edge length.
        step: Stride between consecutive origins (must be positive).

    Returns:
        Ascending list of integer origins; never empty for a positive step.
    """
    # Clamp the last origin to ``start``: for a span shorter than one patch the
    # unclamped ``end - patch`` lies before ``start`` (possibly negative).
    last = max(end - patch, start)
    xs = list(range(start, last + 1, step))
    if xs[-1] != last:
        xs.append(last)
    return xs

records = []          # one manifest row per saved patch
total_patches = 0     # patches saved across all slides

for svs_path in svs_paths:
    try:
        slide = openslide.OpenSlide(svs_path)
    except Exception as e:
        # Best effort: report the unreadable slide and keep processing the rest.
        print(f"❌ 无法打开：{svs_path}，原因：{e}")
        continue

    # try/finally releases the slide handle on every exit path, including the
    # early `continue`s and any exception raised while reading or saving.
    try:
        W0, H0 = slide.dimensions
        lvl = min(LOW_LEVEL, slide.level_count - 1)  # fall back to the coarsest level available
        low_w, low_h = slide.level_dimensions[lvl]
        down = float(slide.level_downsamples[lvl])   # keep as float to avoid integer-division error

        # === Step 1: tissue mask + ROI on the low-resolution level ===
        low_img = slide.read_region((0, 0), lvl, (low_w, low_h)).convert("RGB")
        low_np = np.array(low_img)

        # Tissue = darker than the Otsu threshold OR saturation above 20 (0-255 scale)
        hsv = color.rgb2hsv(low_np)
        s_ch = (hsv[..., 1] * 255).astype(np.uint8)
        gray_low = color.rgb2gray(low_np)
        th = filters.threshold_otsu(gray_low)
        mask_otsu = (gray_low < th)
        mask_sat = (s_ch > 20)
        mask = mask_otsu | mask_sat
        mask = morphology.remove_small_objects(mask, 50)
        mask = morphology.remove_small_holes(mask, 50)

        ys, xs = np.where(mask)
        if xs.size == 0 or ys.size == 0:
            print(f"❌ 无组织区域：{svs_path}")
            continue

        # Bounding box of the mask, padded by ROI_PAD_LOW and clipped to the level
        x0_low = max(0, xs.min() - ROI_PAD_LOW)
        x1_low = min(low_w, xs.max() + 1 + ROI_PAD_LOW)
        y0_low = max(0, ys.min() - ROI_PAD_LOW)
        y1_low = min(low_h, ys.max() + 1 + ROI_PAD_LOW)

        # Map the ROI back to level-0 coordinates and clip to the slide
        x0 = int(round(x0_low * down)); x1 = int(round(x1_low * down))
        y0 = int(round(y0_low * down)); y1 = int(round(y1_low * down))
        x0 = max(0, x0); y0 = max(0, y0)
        x1 = min(W0, x1); y1 = min(H0, y1)

        if x1 - x0 < PATCH_SIZE or y1 - y0 < PATCH_SIZE:
            print(f"❌ ROI 小于 patch：{svs_path}")
            continue

        xs0 = gen_positions(x0, x1, PATCH_SIZE, STEP)
        ys0 = gen_positions(y0, y1, PATCH_SIZE, STEP)

        # wsi_id = file name without its suffix
        wsi_id = Path(svs_path).stem

        # Footprint of one patch on the low-res level; loop-invariant, so computed once
        w_low = int(round(PATCH_SIZE / down))
        h_low = int(round(PATCH_SIZE / down))

        # === Step 2: grid tiling + coverage & quality filters ===
        saved_this = 0
        for y in ys0:
            for x in xs0:
                # Locate the patch's window in the low-res mask
                x_low = int(round(x / down))
                y_low = int(round(y / down))
                x_low = min(max(0, x_low), low_w - w_low)
                y_low = min(max(0, y_low), low_h - h_low)

                mask_crop = mask[y_low:y_low + h_low, x_low:x_low + w_low]
                tissue_ratio = float(mask_crop.mean())
                if tissue_ratio < TISSUE_COV_THR:
                    continue

                patch = slide.read_region((x, y), 0, (PATCH_SIZE, PATCH_SIZE)).convert("RGB")
                gray = np.array(patch.convert("L"), dtype=np.uint8)

                # Quality filters: contrast, edge density, dynamic range
                gray_std = float(gray.std())
                if gray_std < MIN_STD:
                    continue
                if edge_density_u8(gray, thr=EDGE_THR) < MIN_EDGE:
                    continue
                dyn_range = float(gray.max() - gray.min())
                if dyn_range < MIN_RANGE:
                    continue

                # File name carries wsi_id so patches can later be grouped per WSI
                fname = f"{wsi_id}_x{x}_y{y}.png"
                fpath = os.path.join(out_dir, fname)
                patch.save(fpath)

                records.append([fpath, label, wsi_id, x, y, tissue_ratio, gray_std,
                                dyn_range])
                saved_this += 1
                total_patches += 1

        print(f"✅ {wsi_id}（第四阶段）切割完成：{saved_this} 个有效 patch")
    finally:
        slide.close()

# Save the manifest (utf-8-sig prepends a BOM, which helps spreadsheet tools detect UTF-8)
df = pd.DataFrame(records, columns=["patch_path", "label", "wsi_id", "x0", "y0",
                                    "tissue_cov", "std", "dyn_range"])
csv_path = os.path.join(out_dir, "patch_manifest.csv")
df.to_csv(csv_path, index=False, encoding="utf-8-sig")
# NOTE: len(svs_paths) counts every discovered file, including skipped slides.
print(f"\n总计：{len(svs_paths)} 张 WSI，{total_patches} 个有效 patch")
print(f"第四阶段标签清单已保存：{csv_path}")


✅ TCGA-A8-A08T-01Z-00-44（第四阶段）切割完成：11569 个有效 patch
✅ TCGA-AC-A62V-01Z-00-4444（第四阶段）切割完成：4210 个有效 patch
✅ TCGA-B6-A0IB-01Z-00-444（第四阶段）切割完成：41376 个有效 patch
✅ TCGA-LL-A73Z-01Z-00-4444444（第四阶段）切割完成：14058 个有效 patch

总计：4 张 WSI，71213 个有效 patch
第四阶段标签清单已保存：C:\Users\mxjli\PyCharmMiscProject\stage4\patch_manifest.csv


In [1]:
# ========= 阶段2（Stage 2）WSI Patch 生成脚本（每张WSI随机最多1000张） =========
# 依赖：openslide-python, numpy, pandas, scikit-image
# pip install openslide-python numpy pandas scikit-image

import os
import random
from pathlib import Path

import numpy as np
import pandas as pd
import openslide
from skimage import color, filters, morphology

# ========= Input: put the slides in one directory; *.svs and *.tif are collected =========
SVS_DIR = r"G:\stage 2"  # <-- directory with the raw stage-2 WSIs
svs_paths = [str(p) for p in Path(SVS_DIR).glob("*.svs")]
svs_paths += [str(p) for p in Path(SVS_DIR).glob("*.tif")]

# ========= Output directory (suggested name: Stage 2) =========
out_dir = r"G:\stage 22"
os.makedirs(out_dir, exist_ok=True)
label = (Path(out_dir).name.strip() or "Stage2")  # label for the exported manifest (stage 2)

# ========= Parameters (kept at their original values) =========
LOW_LEVEL = 2            # low-resolution pyramid level used for the tissue mask & ROI
PATCH_SIZE = 256
STEP = 256
TISSUE_COV_THR = 0.60    # minimum fraction of a patch covered by the tissue mask
ROI_PAD_LOW = 8          # ROI padding, in pixels of the low-resolution level
MIN_STD = 10.0           # minimum grayscale standard deviation (0-255 scale)
MIN_EDGE = 0.012         # minimum edge density (0-1)
EDGE_THR = 8             # gradient threshold passed to edge_density_u8
MIN_RANGE = 14.0         # minimum dynamic range (max - min gray value)

# ========= New: cap on the number of randomly exported patches per WSI =========
MAX_PATCHES_PER_WSI = 1000

def edge_density_u8(gray_u8: np.ndarray, thr: int = 8) -> float:
    """Return the fraction of pixels whose gradient strength exceeds ``thr``.

    The strength of a pixel is the absolute difference to its right-hand
    neighbour plus the absolute difference to the pixel above it; the last
    column is dropped, so the score is taken over an ``(h, w - 1)`` grid.

    Args:
        gray_u8: 2-D grayscale image.
        thr: Strength a pixel must exceed to count as an edge.

    Returns:
        Share of edge pixels, between 0.0 and 1.0.
    """
    # Work in int16 so that uint8 subtraction cannot wrap around.
    signed = gray_u8.astype(np.int16)
    rows, cols = gray_u8.shape

    strength = np.zeros((rows, cols - 1), dtype=np.int16)
    strength += np.abs(np.diff(signed, axis=1))        # horizontal differences
    vertical = np.abs(np.diff(signed, axis=0))         # shape (rows - 1, cols)
    strength[1:, :] += vertical[:, :cols - 1]          # first row has no pixel above
    return float(np.mean(strength > thr))

def gen_positions(start, end, patch, step):
    """Return patch origins that tile the interval ``[start, end)``.

    Origins advance from ``start`` in increments of ``step``. If the regular
    grid does not end exactly on the last origin at which a full patch still
    fits (``end - patch``), that origin is appended so the far edge is covered.

    When the interval is shorter than one patch, the only origin returned is
    ``start``; no origin before ``start`` is ever produced.

    Args:
        start: First admissible origin (inclusive).
        end: End of the interval (exclusive).
        patch: Patch length along this axis.
        step: Stride between consecutive origins; must be positive.

    Returns:
        Ascending list of integer origins, never empty.

    Raises:
        ValueError: If ``step`` is not positive.
    """
    if step <= 0:
        raise ValueError("step must be a positive integer")

    # Clamp once and use the same bound for both the range and the tail check,
    # so a too-short interval cannot yield an origin in front of `start`.
    last = max(end - patch, start)
    positions = list(range(start, last + 1, step))
    if positions[-1] != last:
        positions.append(last)
    return positions

# Per slide: build a tissue mask on a low-resolution level, derive the tissue
# bounding box, walk a patch grid over it at level 0, keep the patches that
# pass the coverage / contrast filters, randomly keep at most
# MAX_PATCHES_PER_WSI of them, save those as PNG and record them in a manifest.
records = []        # one manifest row per saved patch
total_patches = 0   # number of patches saved across all slides

for svs_path in svs_paths:
    try:
        slide = openslide.OpenSlide(svs_path)
    except Exception as e:
        print(f"❌ 无法打开：{svs_path}，原因：{e}")
        continue

    # try/finally releases the slide handle on every exit path, including an
    # exception raised while reading a region or saving a patch.
    try:
        W0, H0 = slide.dimensions
        lvl = min(LOW_LEVEL, slide.level_count - 1)
        low_w, low_h = slide.level_dimensions[lvl]
        down = float(slide.level_downsamples[lvl])  # float avoids integer-division error

        # === Step 1: low-level mask + ROI ===
        low_img = slide.read_region((0, 0), lvl, (low_w, low_h)).convert("RGB")
        low_np = np.array(low_img)

        # Tissue detection: pixels darker than the Otsu threshold OR with
        # saturation above 20 (on a 0-255 scale), then small specks / holes removed.
        hsv = color.rgb2hsv(low_np)
        s_ch = (hsv[..., 1] * 255).astype(np.uint8)
        gray_low = color.rgb2gray(low_np)
        th = filters.threshold_otsu(gray_low)
        mask_otsu = (gray_low < th)
        mask_sat = (s_ch > 20)
        mask = mask_otsu | mask_sat
        mask = morphology.remove_small_objects(mask, 50)
        mask = morphology.remove_small_holes(mask, 50)

        ys, xs = np.where(mask)
        if xs.size == 0 or ys.size == 0:
            print(f"❌ 无组织区域：{svs_path}")
            continue

        # Padded bounding box of the mask, clipped to the low-level image.
        x0_low = max(0, xs.min() - ROI_PAD_LOW)
        x1_low = min(low_w, xs.max() + 1 + ROI_PAD_LOW)
        y0_low = max(0, ys.min() - ROI_PAD_LOW)
        y1_low = min(low_h, ys.max() + 1 + ROI_PAD_LOW)

        # Map the box back to level-0 coordinates and clip to the slide.
        x0 = max(0, int(round(x0_low * down)))
        y0 = max(0, int(round(y0_low * down)))
        x1 = min(W0, int(round(x1_low * down)))
        y1 = min(H0, int(round(y1_low * down)))

        if x1 - x0 < PATCH_SIZE or y1 - y0 < PATCH_SIZE:
            print(f"❌ ROI 小于 patch：{svs_path}")
            continue

        xs0 = gen_positions(x0, x1, PATCH_SIZE, STEP)
        ys0 = gen_positions(y0, y1, PATCH_SIZE, STEP)

        # wsi_id is the file name without its suffix.
        wsi_id = Path(svs_path).stem

        # Size of one patch in low-level pixels. It is constant per slide, so it
        # is computed once here. It is kept >= 1 because an empty mask crop has a
        # NaN mean, which would slip past the coverage threshold.
        w_low = max(1, int(round(PATCH_SIZE / down)))
        h_low = w_low
        # Largest admissible window origin; never negative, so the slice below
        # cannot wrap around via a negative index.
        max_x_low = max(0, low_w - w_low)
        max_y_low = max(0, low_h - h_low)

        # === Step 2: collect coordinates that pass every filter; sample later ===
        candidates = []  # each item: (x, y, tissue_ratio, std, dyn_range)
        for y in ys0:
            y_low = min(max(0, int(round(y / down))), max_y_low)
            for x in xs0:
                x_low = min(max(0, int(round(x / down))), max_x_low)

                mask_crop = mask[y_low:y_low + h_low, x_low:x_low + w_low]
                tissue_ratio = float(mask_crop.mean())
                if tissue_ratio < TISSUE_COV_THR:
                    continue

                # Read the full-resolution patch only for quality filtering;
                # it is not kept in memory.
                patch = slide.read_region((x, y), 0, (PATCH_SIZE, PATCH_SIZE)).convert("RGB")
                gray = np.array(patch.convert("L"), dtype=np.uint8)

                std_v = float(gray.std())
                if std_v < MIN_STD:
                    continue
                if edge_density_u8(gray, thr=EDGE_THR) < MIN_EDGE:
                    continue
                # Widen to int before subtracting so the uint8 difference is exact.
                dyn_range = float(int(gray.max()) - int(gray.min()))
                if dyn_range < MIN_RANGE:
                    continue

                candidates.append((x, y, tissue_ratio, std_v, dyn_range))

        if not candidates:
            print(f"⚠️ {wsi_id} 无符合条件的候选 patch")
            continue

        # Random subset of at most MAX_PATCHES_PER_WSI candidates.
        # NOTE: the generator is not seeded here, so the selection differs between runs.
        if len(candidates) > MAX_PATCHES_PER_WSI:
            sampled = random.sample(candidates, MAX_PATCHES_PER_WSI)
        else:
            sampled = candidates

        # Selected patches are read a second time and written out, which avoids
        # holding every candidate image in memory during filtering.
        saved_this = 0
        for x, y, tissue_ratio, std_v, dyn_range in sampled:
            patch = slide.read_region((x, y), 0, (PATCH_SIZE, PATCH_SIZE)).convert("RGB")
            fname = f"{wsi_id}_x{x}_y{y}.png"
            fpath = os.path.join(out_dir, fname)
            patch.save(fpath)

            records.append([fpath, label, wsi_id, x, y, tissue_ratio, std_v, dyn_range])
            saved_this += 1
            total_patches += 1
    finally:
        slide.close()

    print(f"✅ {wsi_id}（阶段2）候选 {len(candidates)} 个，随机保留 {saved_this} 个有效 patch")

# Write the manifest (utf-8-sig encoding, no index column).
df = pd.DataFrame(records, columns=["patch_path", "label", "wsi_id", "x0", "y0",
                                    "tissue_cov", "std", "dyn_range"])
csv_path = os.path.join(out_dir, "patch_manifest.csv")
df.to_csv(csv_path, index=False, encoding="utf-8-sig")
print(f"\n总计：{len(svs_paths)} 张 WSI，{total_patches} 个有效 patch")
print(f"阶段2标签清单已保存：{csv_path}")


✅ TCGA-A8-A093-01Z-00-DX1.1C8056D1-11CD-482D-9A23-3A9D1B4E63F0（阶段2）候选 4733 个，随机保留 1000 个有效 patch
✅ TCGA-AC-A2FB-01Z-00-DX1.A4D93E32-BBD7-45E4-8ACF-3724B059ECBC（阶段2）候选 15168 个，随机保留 1000 个有效 patch
✅ TCGA-AC-A62Y-01Z-00-DX1.5075F4DA-488C-40AD-BD73-6DE8953E1864（阶段2）候选 4383 个，随机保留 1000 个有效 patch
✅ TCGA-AN-A0AM-01Z-00-DX1.169CE39A-DD54-46D8-8D03-60B69A473CDB（阶段2）候选 14363 个，随机保留 1000 个有效 patch
✅ TCGA-AO-A0J8-01Z-00-DX1.9BDD4BDE-2A07-4E0C-B146-87365CA9DE3A（阶段2）候选 9851 个，随机保留 1000 个有效 patch
✅ TCGA-AO-A0JC-01Z-00-DX1.C8DD421B-9799-4FE7-9224-5EAC6ED1028E（阶段2）候选 16651 个，随机保留 1000 个有效 patch
✅ TCGA-AO-A128-01Z-00-DX1.4E6BFFBC-87AD-4ED4-959D-FEB5545400BE（阶段2）候选 10131 个，随机保留 1000 个有效 patch
✅ TCGA-B6-A0IO-01Z-00-DX1.D8898C30-4016-4983-9359-5C1507C01715（阶段2）候选 27773 个，随机保留 1000 个有效 patch
✅ TCGA-B6-A0WZ-01Z-00-DX1.6CFB236E-36F5-43D6-8DE3-C4ECBD3C14C6（阶段2）候选 26755 个，随机保留 1000 个有效 patch
✅ TCGA-BH-A0H9-01Z-00-DX1.8AE869C6-5C78-4D52-AC8B-5B6FD5FD91AA（阶段2）候选 29404 个，随机保留 1000 个有效 patch
✅ TCGA-BH-A1EY-01Z-00-D

In [3]:
# ========= Stage 4 WSI patch generation script (at most 1000 randomly sampled patches per WSI) =========
# 依赖：openslide-python, numpy, pandas, scikit-image
# pip install openslide-python numpy pandas scikit-image

import os
import random
from pathlib import Path

import numpy as np
import pandas as pd
import openslide
from skimage import color, filters, morphology

# ========= Input: scan one directory for .svs / .tif slides =========
SVS_DIR = r"G:\stage 4"  # <-- directory holding the raw Stage 4 WSIs
# All .svs matches come first, followed by all .tif matches.
svs_paths = [
    str(slide_file)
    for pattern in ("*.svs", "*.tif")
    for slide_file in Path(SVS_DIR).glob(pattern)
]

# ========= Output directory =========
out_dir = r"G:\stage 4 切割"
os.makedirs(out_dir, exist_ok=True)
# Label written into the manifest: the output folder's name, falling back to
# "Stage4" when that name is empty after stripping whitespace.
label = Path(out_dir).name.strip() or "Stage4"

# ========= Parameters (values kept unchanged) =========
LOW_LEVEL = 2            # low-resolution pyramid level used for the mask & ROI
PATCH_SIZE = 256
STEP = 256
TISSUE_COV_THR = 0.60    # minimum mask coverage for a patch
ROI_PAD_LOW = 8          # padding added around the ROI, in low-level pixels
MIN_STD = 10.0           # grayscale standard-deviation threshold (0-255 scale)
MIN_EDGE = 0.012         # edge-density threshold (0-1)
EDGE_THR = 8             # per-pixel edge threshold
MIN_RANGE = 14.0         # dynamic-range threshold

# ========= Upper bound on randomly exported patches per WSI =========
MAX_PATCHES_PER_WSI = 1000

def edge_density_u8(gray_u8: np.ndarray, thr: int = 8) -> float:
    """Measure how much of a grayscale image consists of edge pixels.

    For every position on an ``(h, w - 1)`` grid the absolute horizontal
    neighbour difference is taken. All rows but the first additionally
    receive the absolute difference to the pixel above. The result is the
    share of positions whose combined value is strictly greater than ``thr``.

    Args:
        gray_u8: 2-D grayscale image; converted to int16 first so that
            subtracting uint8 values cannot wrap around.
        thr: Strength above which a position is counted as an edge.

    Returns:
        Edge share as a Python float.
    """
    height, width = gray_u8.shape
    wide = gray_u8.astype(np.int16)

    dx = np.abs(np.diff(wide, axis=1))  # shape (height, width - 1)
    dy = np.abs(np.diff(wide, axis=0))  # shape (height - 1, width)

    combined = np.zeros((height, width - 1), dtype=np.int16)
    combined += dx
    # Row r of `dy` compares rows r + 1 and r, hence the one-row offset.
    combined[1:, :] += dy[:, :width - 1]

    is_edge = combined > thr
    return float(is_edge.mean())

def gen_positions(start, end, patch, step):
    """Return patch origins that tile the interval ``[start, end)``.

    Origins advance from ``start`` in increments of ``step``. If the regular
    grid does not end exactly on the last origin at which a full patch still
    fits (``end - patch``), that origin is appended so the far edge is covered.

    When the interval is shorter than one patch, the only origin returned is
    ``start``; no origin before ``start`` is ever produced.

    Args:
        start: First admissible origin (inclusive).
        end: End of the interval (exclusive).
        patch: Patch length along this axis.
        step: Stride between consecutive origins; must be positive.

    Returns:
        Ascending list of integer origins, never empty.

    Raises:
        ValueError: If ``step`` is not positive.
    """
    if step <= 0:
        raise ValueError("step must be a positive integer")

    # Clamp once and use the same bound for both the range and the tail check,
    # so a too-short interval cannot yield an origin in front of `start`.
    last = max(end - patch, start)
    positions = list(range(start, last + 1, step))
    if positions[-1] != last:
        positions.append(last)
    return positions

# Per slide: build a tissue mask on a low-resolution level, derive the tissue
# bounding box, walk a patch grid over it at level 0, keep the patches that
# pass the coverage / contrast filters, randomly keep at most
# MAX_PATCHES_PER_WSI of them, save those as PNG and record them in a manifest.
records = []        # one manifest row per saved patch
total_patches = 0   # number of patches saved across all slides

for svs_path in svs_paths:
    try:
        slide = openslide.OpenSlide(svs_path)
    except Exception as e:
        print(f"❌ 无法打开：{svs_path}，原因：{e}")
        continue

    # try/finally releases the slide handle on every exit path, including an
    # exception raised while reading a region or saving a patch.
    try:
        W0, H0 = slide.dimensions
        lvl = min(LOW_LEVEL, slide.level_count - 1)
        low_w, low_h = slide.level_dimensions[lvl]
        down = float(slide.level_downsamples[lvl])  # float avoids integer-division error

        # === Step 1: low-level mask + ROI ===
        low_img = slide.read_region((0, 0), lvl, (low_w, low_h)).convert("RGB")
        low_np = np.array(low_img)

        # Tissue detection: pixels darker than the Otsu threshold OR with
        # saturation above 20 (on a 0-255 scale), then small specks / holes removed.
        hsv = color.rgb2hsv(low_np)
        s_ch = (hsv[..., 1] * 255).astype(np.uint8)
        gray_low = color.rgb2gray(low_np)
        th = filters.threshold_otsu(gray_low)
        mask_otsu = (gray_low < th)
        mask_sat = (s_ch > 20)
        mask = mask_otsu | mask_sat
        mask = morphology.remove_small_objects(mask, 50)
        mask = morphology.remove_small_holes(mask, 50)

        ys, xs = np.where(mask)
        if xs.size == 0 or ys.size == 0:
            print(f"❌ 无组织区域：{svs_path}")
            continue

        # Padded bounding box of the mask, clipped to the low-level image.
        x0_low = max(0, xs.min() - ROI_PAD_LOW)
        x1_low = min(low_w, xs.max() + 1 + ROI_PAD_LOW)
        y0_low = max(0, ys.min() - ROI_PAD_LOW)
        y1_low = min(low_h, ys.max() + 1 + ROI_PAD_LOW)

        # Map the box back to level-0 coordinates and clip to the slide.
        x0 = max(0, int(round(x0_low * down)))
        y0 = max(0, int(round(y0_low * down)))
        x1 = min(W0, int(round(x1_low * down)))
        y1 = min(H0, int(round(y1_low * down)))

        if x1 - x0 < PATCH_SIZE or y1 - y0 < PATCH_SIZE:
            print(f"❌ ROI 小于 patch：{svs_path}")
            continue

        xs0 = gen_positions(x0, x1, PATCH_SIZE, STEP)
        ys0 = gen_positions(y0, y1, PATCH_SIZE, STEP)

        # wsi_id is the file name without its suffix.
        wsi_id = Path(svs_path).stem

        # Size of one patch in low-level pixels. It is constant per slide, so it
        # is computed once here. It is kept >= 1 because an empty mask crop has a
        # NaN mean, which would slip past the coverage threshold.
        w_low = max(1, int(round(PATCH_SIZE / down)))
        h_low = w_low
        # Largest admissible window origin; never negative, so the slice below
        # cannot wrap around via a negative index.
        max_x_low = max(0, low_w - w_low)
        max_y_low = max(0, low_h - h_low)

        # === Step 2: collect coordinates that pass every filter; sample later ===
        candidates = []  # each item: (x, y, tissue_ratio, std, dyn_range)
        for y in ys0:
            y_low = min(max(0, int(round(y / down))), max_y_low)
            for x in xs0:
                x_low = min(max(0, int(round(x / down))), max_x_low)

                mask_crop = mask[y_low:y_low + h_low, x_low:x_low + w_low]
                tissue_ratio = float(mask_crop.mean())
                if tissue_ratio < TISSUE_COV_THR:
                    continue

                # Read the full-resolution patch only for quality filtering;
                # it is not kept in memory.
                patch = slide.read_region((x, y), 0, (PATCH_SIZE, PATCH_SIZE)).convert("RGB")
                gray = np.array(patch.convert("L"), dtype=np.uint8)

                std_v = float(gray.std())
                if std_v < MIN_STD:
                    continue
                if edge_density_u8(gray, thr=EDGE_THR) < MIN_EDGE:
                    continue
                # Widen to int before subtracting so the uint8 difference is exact.
                dyn_range = float(int(gray.max()) - int(gray.min()))
                if dyn_range < MIN_RANGE:
                    continue

                candidates.append((x, y, tissue_ratio, std_v, dyn_range))

        if not candidates:
            print(f"⚠️ {wsi_id} 无符合条件的候选 patch")
            continue

        # Random subset of at most MAX_PATCHES_PER_WSI candidates.
        # NOTE: the generator is not seeded here, so the selection differs between runs.
        if len(candidates) > MAX_PATCHES_PER_WSI:
            sampled = random.sample(candidates, MAX_PATCHES_PER_WSI)
        else:
            sampled = candidates

        # Selected patches are read a second time and written out, which avoids
        # holding every candidate image in memory during filtering.
        saved_this = 0
        for x, y, tissue_ratio, std_v, dyn_range in sampled:
            patch = slide.read_region((x, y), 0, (PATCH_SIZE, PATCH_SIZE)).convert("RGB")
            fname = f"{wsi_id}_x{x}_y{y}.png"
            fpath = os.path.join(out_dir, fname)
            patch.save(fpath)

            records.append([fpath, label, wsi_id, x, y, tissue_ratio, std_v, dyn_range])
            saved_this += 1
            total_patches += 1
    finally:
        slide.close()

    print(f"✅ {wsi_id}（阶段4）候选 {len(candidates)} 个，随机保留 {saved_this} 个有效 patch")

# Write the manifest (utf-8-sig encoding, no index column).
df = pd.DataFrame(records, columns=["patch_path", "label", "wsi_id", "x0", "y0",
                                    "tissue_cov", "std", "dyn_range"])
csv_path = os.path.join(out_dir, "patch_manifest.csv")
df.to_csv(csv_path, index=False, encoding="utf-8-sig")
print(f"\n总计：{len(svs_paths)} 张 WSI，{total_patches} 个有效 patch")
print(f"阶段4标签清单已保存：{csv_path}")


✅ TCGA-5L-AAT1-01Z-00-DX1.F3449A5B-2AC4-4ED7-BF44-4C8946CDB47D（阶段4）候选 7886 个，随机保留 1000 个有效 patch
✅ TCGA-A2-A0SW-01Z-00-DX1.E1EA0407-B831-4D75-826E-80B82B821797（阶段4）候选 37334 个，随机保留 1000 个有效 patch
✅ TCGA-A8-A07W-01Z-00-DX1.5970CF33-D675-4AF4-800F-7182AA1A44A6（阶段4）候选 13479 个，随机保留 1000 个有效 patch
✅ TCGA-A8-A08O-01Z-00-DX1.BC87C01D-F081-41CA-939A-875C61588E88（阶段4）候选 5102 个，随机保留 1000 个有效 patch
✅ TCGA-A8-A08T-01Z-00-DX1.ABA1ABF2-DDB1-4E31-89DC-0F7821311D11（阶段4）候选 11569 个，随机保留 1000 个有效 patch
✅ TCGA-AC-A62V-01Z-00-DX1.2D8994FD-58B8-43C1-B99D-AA964E7DFD60（阶段4）候选 4210 个，随机保留 1000 个有效 patch
✅ TCGA-AN-A0FJ-01Z-00-DX1.97B60767-916E-4938-9D0B-E6C0FE1CB3FC（阶段4）候选 31395 个，随机保留 1000 个有效 patch
✅ TCGA-AO-A0J5-01Z-00-DX1.20C14D0C-1A74-4FE9-A5E6-BDDCB8DE7714（阶段4）候选 19028 个，随机保留 1000 个有效 patch
✅ TCGA-B6-A0IB-01Z-00-DX1.BAA1D655-1B80-49E2-B1EB-2ECC83DED989（阶段4）候选 41376 个，随机保留 1000 个有效 patch
✅ TCGA-LL-A73Z-01Z-00-DX1.C010142E-29C0-411D-9E0E-4B7D8A4C09BF（阶段4）候选 14058 个，随机保留 1000 个有效 patch
✅ TCGA-PL-A8LX-01A-01-D