# ST-MIL 全流程（HEST 原始数据）

适用于已经下载 HEST 原始数据（与 sCellST 使用的数据同源）的情况；如果尚未下载，可以先运行下方的“步骤 0”。
本 Notebook 会把 HEST 原始数据转换为本项目需要的格式：
- 导出 h5ad（spot 表达）
- 导出细胞中心 CSV（来自 HEST 内置分割）
- 导出细胞 patch H5
- LazySlide 生成细胞嵌入
- 构建 MIL 数据集并做前向检查

注：每个步骤都附有注释，方便你逐步验证。


**提示**：HEST 是 Hugging Face 上的 gated 数据集，下载前需要先登录 Hugging Face（例如 `huggingface-cli login`），或设置环境变量 `HF_TOKEN`。


In [1]:
from pathlib import Path
import os
import sys

# Project root. Edit this to point at your own checkout. The working directory
# is moved here, so every relative path below is resolved against ROOT.
# NOTE: when the notebook is launched from a direct subdirectory of the
# checkout, Path('..').resolve() is an alternative way to obtain ROOT.
ROOT = Path("/home/huangjinjin/Desktop/Morpho-VC").resolve()
os.chdir(ROOT)

# Make the project sources under src/ importable. The membership test keeps
# sys.path free of duplicate entries when this cell is re-run.
_src_dir = str(ROOT / 'src')
if _src_dir not in sys.path:
    sys.path.append(_src_dir)

# Vendored HEST sources (no pip install needed). They are put at the front of
# sys.path so they take precedence over any other installed copy.
hest_src = ROOT / 'third_party' / 'HEST' / 'src'
if hest_src.exists() and str(hest_src) not in sys.path:
    sys.path.insert(0, str(hest_src))

# ====== Paths you need to adjust ======
hest_dir = Path('data/hest_data')
slide_id = 'INT25'

# Output locations inside this project (relative to ROOT).
h5ad_out = Path('data/spatial_data') / f'{slide_id}.h5ad'
cell_csv = Path('data/cell_centers') / f'{slide_id}_cells.csv'
cell_patch_h5 = Path('data/cell_images') / f'{slide_id}_cell_patches.h5'
cell_emb_h5 = Path('data/cell_embeddings') / f'{slide_id}_cell_emb.h5'

# Name of the cell segmentation stored in HEST and of its coordinate system.
shape_name = 'cellvit'
coordinates_name = 'he'

# Model name handed to the LazySlide embedding step.
lazyslide_model = 'resnet50'

# CellFM
gene_vocab_path = ROOT / 'assets' / 'cellfm' / 'gene_info.csv'
# Anchored at ROOT: the working directory is ROOT, so a '../checkpoints/...'
# relative path would resolve outside the checkout.
cellfm_checkpoint = str(ROOT / 'checkpoints' / 'CellFM' / 'CellFM_80M_weight.pt')
# Forwarded to the model as `use_mock`. Presumably True substitutes a mock
# CellFM on machines without the real checkpoint -- TODO confirm.
use_mock_cellfm = False

# ====== Whether to run the download step ======
RUN_DOWNLOAD = True  # enable only when the data must be fetched (needs network)


In [2]:
# Check that the HEST package can be imported.
print('本地 HEST 路径:', hest_src, '存在:', hest_src.exists())
try:
    import hest  # noqa: F401
except ModuleNotFoundError as err:
    # Only rewrite the error when `hest` itself is missing. If one of hest's
    # own dependencies is absent, let that error surface unchanged so the
    # message names the module that actually needs installing.
    if err.name != 'hest':
        raise
    raise ModuleNotFoundError(
        '未找到 hest 模块。请确认 third_party/HEST 已放置且包含 src/hest，'
        '然后重启内核再运行。',
        name='hest',
    ) from err
else:
    print('hest 可用')


本地 HEST 路径: /home/huangjinjin/Desktop/Morpho-VC/third_party/HEST/src 存在: True


  _set_context_ca_bundle_path(ca_bundle_path)


hest 可用


## 步骤 0：可选下载 HEST 原始数据
如果你已经手动下载过，可以跳过。


In [3]:
import os

# Read the Hugging Face access token from the environment; it is None when
# the variable is unset. Exporting HF_TOKEN in the shell before starting the
# notebook is the recommended way to provide it.
hf_token = os.getenv('HF_TOKEN')

# Report only whether a non-empty token is present, never the token itself.
print('HF_TOKEN 是否已设置:', True if hf_token else False)


HF_TOKEN 是否已设置: False


In [4]:
from st_pipeline.data.hest_raw import download_hest

# Optional step: fetch the raw HEST data for the configured slide. Nothing is
# downloaded unless RUN_DOWNLOAD is switched on in the configuration cell.
if RUN_DOWNLOAD:
    download_hest(
        slide_ids=[slide_id],
        hest_dir=hest_dir,
        token=hf_token,
    )


## 步骤 1：从 HEST 导出 h5ad
该 h5ad 作为 ST 监督标签使用（spot 表达）。


In [5]:
from st_pipeline.data.hest_raw import export_h5ad_from_hest

# Export the slide's data to an h5ad file at `h5ad_out`. The call is left as
# the last expression so the notebook displays its return value.
export_h5ad_from_hest(
    output_h5ad=h5ad_out,
    hest_dir=hest_dir,
    slide_id=slide_id,
)


ERROR 1: PROJ: proj_create_from_database: /home/huangjinjin/miniconda3/envs/morpho-vc/lib/python3.11/site-packages/pyogrio/proj_data/proj.db contains DATABASE.LAYOUT.VERSION.MINOR = 5 whereas a number >= 6 is expected. It comes from another PROJ installation.
HEST read failed; retrying without tissue contours: Could not correctly detect PROJ data files installed by pyogrio wheel


PosixPath('data/spatial_data/INT25.h5ad')

## 步骤 1.1：确认 HEST 里有哪些分割结果（shape 名称）
如果 `cellvit` 不存在，请在 `shape_name` 里改成实际名称。


In [6]:
from st_pipeline.data.hest_raw import list_shape_names

# Look up which segmentation names exist for this slide, so `shape_name` can
# be checked against them before the export steps run.
available_shapes = list_shape_names(slide_id=slide_id, hest_dir=hest_dir)
print('可用分割名称:', available_shapes)


ERROR 1: PROJ: proj_create_from_database: /home/huangjinjin/miniconda3/envs/morpho-vc/lib/python3.11/site-packages/pyogrio/proj_data/proj.db contains DATABASE.LAYOUT.VERSION.MINOR = 5 whereas a number >= 6 is expected. It comes from another PROJ installation.
HEST read failed; retrying without tissue contours: Could not correctly detect PROJ data files installed by pyogrio wheel


可用分割名称: ['cellvit']


## 步骤 2：从 HEST 内置细胞分割导出细胞中心 CSV
这一步会读取 HEST 的分割 shapes（如 cellvit）并取中心点。


In [7]:
from st_pipeline.data.hest_raw import export_cells_csv_from_hest

# Write the cell-centre CSV for the chosen segmentation to `cell_csv`. The
# call is left as the last expression so the notebook displays its return
# value.
export_cells_csv_from_hest(
    output_csv=cell_csv,
    hest_dir=hest_dir,
    slide_id=slide_id,
    coordinates_name=coordinates_name,
    shape_name=shape_name,
)


ERROR 1: PROJ: proj_create_from_database: /home/huangjinjin/miniconda3/envs/morpho-vc/lib/python3.11/site-packages/pyogrio/proj_data/proj.db contains DATABASE.LAYOUT.VERSION.MINOR = 5 whereas a number >= 6 is expected. It comes from another PROJ installation.
HEST read failed; retrying without tissue contours: Could not correctly detect PROJ data files installed by pyogrio wheel


PosixPath('data/cell_centers/INT25_cells.csv')

## 步骤 3：从 HEST 直接导出细胞 patch H5
这一步会根据细胞中心坐标直接从 WSI 裁剪 patch。


In [8]:
from st_pipeline.data.hest_raw import export_cell_patches_from_hest

# Export the cell patch H5 only when it is missing; an existing file is
# reused and the export is skipped.
if not cell_patch_h5.exists():
    export_cell_patches_from_hest(
        output_h5=cell_patch_h5,
        hest_dir=hest_dir,
        slide_id=slide_id,
        coordinates_name=coordinates_name,
        shape_name=shape_name,
        target_pixel_size=0.25,
        target_patch_size=72,
    )
else:
    print(f'已存在，跳过导出: {cell_patch_h5}')


已存在，跳过导出: data/cell_images/INT25_cell_patches.h5


## 步骤 4：LazySlide 生成细胞嵌入


In [9]:
import torch

from st_pipeline.data.cell_embed_lazyslide import EmbedConfig, embed_cells

# Configuration for the cell-embedding step.
cfg = EmbedConfig(
    model_name=lazyslide_model,
    # Anchored at ROOT so the path does not depend on the checkout directory
    # being named "Morpho-VC" or on the current working directory.
    model_path=str(ROOT / 'checkpoints' / 'ResNet' / 'model.safetensors'),
    # Fall back to CPU when no GPU is available, matching the device
    # selection used by the model forward check.
    device='cuda' if torch.cuda.is_available() else 'cpu',
    batch_size=64,
    num_workers=4,
)

# Embed the exported cell patches and write the result to `cell_emb_h5`. The
# call is left as the last expression so the notebook displays its return
# value.
embed_cells(cell_patch_h5=cell_patch_h5, output_h5=cell_emb_h5, config=cfg)


  from pkg_resources import DistributionNotFound, get_distribution
  from .autonotebook import tqdm as notebook_tqdm


PosixPath('data/cell_embeddings/INT25_cell_emb.h5')

## 步骤 5：构建 MIL 数据集并检查形状


In [10]:
from st_pipeline.data.h5ad_loader import load_h5ad
from st_pipeline.data.mil_dataset import MilSpotDataset
from st_pipeline.constants import KEYS

# Load the exported h5ad.
# NOTE(review): 'HVG:1000' presumably selects 1000 highly variable genes and
# spot_radius_px=0 presumably lets the loader pick the radius -- confirm
# against load_h5ad.
data = load_h5ad(
    gene_vocab_path=gene_vocab_path,
    h5ad_path=h5ad_out,
    genes='HVG:1000',
    spot_radius_px=0,
)

# Build the MIL dataset from the loaded AnnData and the cell embeddings,
# reusing the spot radius and gene ids reported by the loader.
dataset = MilSpotDataset(
    embedding_h5=cell_emb_h5,
    adata=data.adata,
    gene_ids=data.gene_ids,
    spot_radius_px=data.spot_radius_px,
)
print('spot 数量:', len(dataset))

# Inspect the first item and print the shape of each tensor of interest.
sample = dataset[0]
for label, key in (('X 形状:', KEYS.X), ('Y_bag 形状:', KEYS.Y_BAG)):
    print(label, sample[key].shape)


spot 数量: 3416
X 形状: torch.Size([19, 2048])
Y_bag 形状: torch.Size([956])


## 步骤 6：模型前向验证（不训练）


In [11]:
import torch
from torch.utils.data import DataLoader
from st_pipeline.data.collate import mil_collate
from st_pipeline.data.gene_vocab import load_gene_vocab
from st_pipeline.model.morpho_cellfm_mil import MorphoCellfmMIL

# Pull one small, deterministic mini-batch (no shuffling) for the smoke test.
loader = DataLoader(dataset, batch_size=2, shuffle=False, collate_fn=mil_collate)
batch = next(iter(loader))

vocab_size = len(load_gene_vocab(gene_vocab_path))

model = MorphoCellfmMIL(
    # The feature dimension is the last axis of the embedding tensor.
    input_dim=batch[KEYS.X].shape[-1],
    n_genes=len(data.genes),
    cellfm_dim=1536,
    cellfm_layers=2,
    cellfm_heads=48,
    # Anchored at ROOT so the path does not depend on the checkout directory
    # being named "Morpho-VC" or on the current working directory.
    cellfm_checkpoint=str(ROOT / 'checkpoints' / 'CellFM' / 'CellFM_80M_weight.pt'),
    freeze_cellfm=True,
    use_mock=use_mock_cellfm,
    use_retention=True,
    vocab_size=vocab_size,
    dropout=0.1,
    aggregation='mean',
    dispersion='gene',
)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
# Move tensors to the target device; any non-tensor entry is left untouched
# instead of failing on a missing `.to` method.
for k in batch:
    if torch.is_tensor(batch[k]):
        batch[k] = batch[k].to(device)

# This is a forward-only check: run in eval mode (dropout disabled) and
# without autograd bookkeeping, then restore the previous train/eval mode so
# any later training code sees the model unchanged.
was_training = model.training
model.eval()
try:
    with torch.no_grad():
        mu_bag, mu_inst = model(batch)
finally:
    model.train(was_training)
print('mu_bag 形状:', mu_bag.shape)
print('mu_inst 形状:', mu_inst.shape)


Skipped 2 CellFM keys due to shape mismatch.
Missing CellFM keys: ['gene_emb', 'value_enc.value_enc.a']


mu_bag 形状: torch.Size([2, 956])
mu_inst 形状: torch.Size([22, 956])
