In [1]:
import os
import sys
import torch
import torch.nn as nn
from torchvision import transforms
import openslide
from tqdm import tqdm

In [2]:
# 对于Jupyter Notebook，适配路径获取方式
try:
    # 获取当前笔记本文件的路径
    project_root = os.path.dirname(os.path.dirname(os.path.abspath('__file__')))
    sys.path.append(project_root)
except NameError:
    # 如果失败，使用当前工作目录
    project_root = os.path.dirname(os.path.dirname(os.getcwd()))
    sys.path.append(project_root)

In [3]:
from utils.data import WSIPatchDataset 
from models.swin_unet import SwinBackbone
from torch.utils.data import DataLoader

In [4]:
def get_wsi_loader(
    wsi_path,
    patch_size=256,
    stride=256,
    level=0,
    batch_size=16,
    num_workers=8,  # 增加工作进程数，适应多卡
    pin_memory=True,
    shuffle=False
):
    dataset = WSIPatchDataset(
        wsi_path=wsi_path,
        patch_size=patch_size,
        stride=stride,
        level=level
    )
    loader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
        pin_memory=pin_memory
    )
    return loader


In [5]:
class FeatureExtractor:
    def __init__(self, backbone):
        # 检查可用GPU数量
        self.num_gpus = torch.cuda.device_count()
        print(f"发现 {self.num_gpus} 个可用GPU")
        
        # 使用DataParallel实现多卡并行
        if self.num_gpus > 1:
            self.backbone = nn.DataParallel(backbone).cuda()
        else:
            self.backbone = backbone.cuda()
            
        self.backbone.eval()

    def extract(self, loader):
        features_list = []
        coords_list = []

        with torch.no_grad():
            # 增加进度条显示
            for imgs, coords in tqdm(loader, desc="提取特征"):
                # 自动分配到多个GPU
                imgs = imgs.cuda(non_blocking=True)
                feats = self.backbone(imgs)  # 输出形状 [B, C, H', W'] 或 [B, L, C]
                features_list.append(feats.cpu())
                coords_list.append(coords)

        features = torch.cat(features_list, dim=0)
        coords = torch.cat(coords_list, dim=0)
        return features, coords

In [None]:
if __name__ == '__main__':
    wsi_dir = '/mnt/gemlab_data_2/User_database/zhushiwei/PHASE/train_images'
    save_dir = '/mnt/gemlab_data_2/User_database/zhushiwei/PHASE/test_PreceptGuide'
    features_dir = os.path.join(save_dir, 'features')
    coords_dir = os.path.join(save_dir, 'coords')
    os.makedirs(features_dir, exist_ok=True)
    os.makedirs(coords_dir, exist_ok=True)
    
    # 初始化骨干模型
    backbone = SwinBackbone(
        model_name="swin_base_patch4_window7_224",
        pretrained=True,
        out_indices=(3,)
    )

    # 初始化特征提取器（多卡支持）
    feature_extractor = FeatureExtractor(backbone)
    
    # 遍历每一张 WSI（假设为 .tiff）
    wsi_files = [f for f in os.listdir(wsi_dir) if f.endswith('.tiff') ]
    for wsi_filename in tqdm(wsi_files, desc='Extracting features from WSI'):
        wsi_path = os.path.join(wsi_dir, wsi_filename)
        wsi_name = os.path.splitext(wsi_filename)[0]
        features_path = os.path.join(features_dir, f'{wsi_name}.pt')
        coords_path = os.path.join(coords_dir, f'{wsi_name}.pt')
        
        if os.path.exists(features_path) and os.path.exists(coords_path):
            print(f'Skipping {wsi_name}, features already extracted.')
            continue
        
        try:
            # 根据GPU数量调整batch_size，每个GPU处理32个样本
            batch_size = 32 * feature_extractor.num_gpus
            loader = get_wsi_loader(
                wsi_path=wsi_path,
                patch_size=256,
                stride=256,
                level=0,
                batch_size=batch_size,
                num_workers=8,
                pin_memory=True
            )
            
            # 使用多卡提取特征
            features, coords = feature_extractor.extract(loader)
            
            # 保存特征和坐标
            torch.save(features, features_path)
            torch.save(coords, coords_path)

            print(f'WSI: {wsi_name}')
            print(f'Features shape: {features.shape}')
            print(f'Coords shape: {coords.shape}')
            
        except Exception as e:
            print(f'处理 {wsi_name} 时出错: {str(e)}')
            continue

发现 8 个可用GPU


Extracting features from WSI:   0%|                                                                                                               | 0/10615 [00:00<?, ?it/s]

In [3]:
import subprocess
import re
from datetime import datetime, timedelta

def get_gpu_processes():
    """获取占用GPU显存的进程信息"""
    try:
        # 使用更兼容的查询方式
        result = subprocess.check_output(
            ["nvidia-smi", "--query-compute-apps=pid,used_memory,gpu_uuid", 
             "--format=csv,noheader,nounits"],
            stderr=subprocess.STDOUT,
            timeout=10
        )
        
        processes = []
        for line in result.decode('utf-8').strip().split('\n'):
            if not line.strip():
                continue
                
            parts = [p.strip() for p in line.split(',')]
            if len(parts) >= 3:
                try:
                    pid = int(parts[0])
                    memory = int(parts[1])
                    uuid = parts[2]
                    processes.append((pid, memory, uuid))
                except (ValueError, IndexError):
                    continue
                    
        return processes
    except Exception as e:
        print(f"获取GPU进程出错: {str(e)}")
        return []

def get_gpu_uuid_mapping():
    """获取GPU UUID到设备ID的映射"""
    try:
        result = subprocess.check_output(
            ["nvidia-smi", "--query-gpu=index,uuid", "--format=csv,noheader,nounits"],
            stderr=subprocess.STDOUT,
            timeout=10
        )
        
        uuid_map = {}
        for line in result.decode('utf-8').strip().split('\n'):
            if not line.strip() or ',' not in line:
                continue
                
            index, uuid = line.split(',', 1)
            uuid_map[uuid.strip()] = index.strip()
            
        return uuid_map
    except Exception as e:
        print(f"获取GPU映射出错: {str(e)}")
        return {}

def parse_elapsed_time(etime):
    """解析ps命令返回的运行时间格式"""
    try:
        # 处理天-时分秒格式 (e.g., "3-08:31:03")
        if '-' in etime:
            days, time_part = etime.split('-')
            days = int(days)
        else:
            days = 0
            time_part = etime
            
        # 处理时分秒格式
        time_components = time_part.split(':')
        if len(time_components) == 3:  # HH:MM:SS
            hours, minutes, seconds = map(int, time_components)
        elif len(time_components) == 2:  # MM:SS
            hours = 0
            minutes, seconds = map(int, time_components)
        else:
            return "未知"
            
        # 转换为时间字符串
        parts = []
        if days > 0:
            parts.append(f"{days}天")
        if hours > 0:
            parts.append(f"{hours}时")
        if minutes > 0:
            parts.append(f"{minutes}分")
        if seconds > 0 or not parts:
            parts.append(f"{seconds}秒")
            
        return ''.join(parts)
    except Exception:
        return "未知"

def get_process_elapsed_time(pid):
    """获取进程运行时间"""
    try:
        result = subprocess.check_output(
            ["ps", "-p", str(pid), "-o", "etime="],
            stderr=subprocess.STDOUT,
            timeout=5
        )
        etime = result.decode('utf-8').strip()
        return parse_elapsed_time(etime) if etime else "未知"
    except Exception:
        return "未知"

def get_process_command(pid):
    """获取进程完整命令行"""
    try:
        result = subprocess.check_output(
            ["ps", "-p", str(pid), "-o", "args="],
            stderr=subprocess.STDOUT,
            timeout=5
        )
        return result.decode('utf-8').strip() or "未知命令"
    except Exception:
        return "未知命令"

def display_gpu_processes():
    """显示GPU进程信息"""
    # 获取查询时间
    query_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    print(f"查询时间: {query_time}\n")
    
    # 获取GPU和进程信息
    processes = get_gpu_processes()
    uuid_map = get_gpu_uuid_mapping()
    
    # 打印表头
    print(f"{'显卡ID':<8}{'CUDA设备':<12}{'PID':<10}{'显存(MB)':<12}{'运行时间':<18}{'进程信息'}")
    print("-" * 120)
    
    if not processes:
        print("未找到占用GPU显存的进程")
        return
        
    for pid, memory, uuid in processes:
        gpu_id = uuid_map.get(uuid, "未知")
        cuda_device = f"cuda:{gpu_id}" if gpu_id != "未知" else "未知设备"
        elapsed_time = get_process_elapsed_time(pid)
        cmd = get_process_command(pid)
        
        print(f"{gpu_id:<8}{cuda_device:<12}{pid:<10}{memory:<12}{elapsed_time:<18}{cmd}")

if __name__ == "__main__":
    display_gpu_processes()

查询时间: 2025-08-17 12:58:37

显卡ID    CUDA设备      PID       显存(MB)      运行时间              进程信息
------------------------------------------------------------------------------------------------------------------------
0       cuda:0      1806048   15986       3天20时44分30秒       python train.py /home/xh/711/config_files/config12.yaml
0       cuda:0      3684605   2840        15时48分52秒         python tools/train.py --config configs/exp_a_deformable_attention_head.py --work-dir ./work_dirs/ablations/T1_Baseline --cfg-options randomness.seed=42 resume=True model.use_backbone=False model.use_neck=False model.use_high_freq_enhancement=False model.decode_head.use_feature_fusion=False model.decode_head.use_deformable_conv=False model.decode_head.use_semantic_guidance=False model.decode_head.use_progressive_upsampling=False
1       cuda:1      3691050   29064       15时46分26秒         python tools/train.py --config configs/exp_a_deformable_attention_head.py --work-dir ./work_dirs/ablations/T1_Baseline_

In [15]:
import torch

file_path = "/mnt/gemlab_data_2/User_database/zhushiwei/PHASE/PreceptGuide/features/0a75b377181b60efd8278bce0b6260a5.pt"  
try:
    # 明确设置 weights_only=False 以处理包含自定义对象的文件
    data = torch.load(file_path, weights_only=False)  
    print("文件加载成功，加载内容的类型为:", type(data))
except Exception as e:
    print("加载文件时发生错误:", e)

加载文件时发生错误: Ran out of input


In [8]:
import math
(20-5*2.28)/math.sqrt(5*2.28*0.9772)

2.5766432743171745

In [9]:
3000-700-750-3.2

1546.8