In [None]:
#代码源自https://blog.csdn.net/AI_dataloads/article/details/134126532
import torch
from torch.utils.data import Dataset,DataLoader,TensorDataset
import numpy as np
from torch import nn
from PIL import Image, ImageDraw
from torchvision import transforms
import torchvision.models as models
import time
from matplotlib import pyplot as plt
import os
import glob
import random
import pandas as pd
from histolab.slide import Slide
import csv
import re
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.metrics import roc_curve, roc_auc_score, accuracy_score, confusion_matrix, make_scorer, auc
from sklearn.preprocessing import LabelBinarizer
from itertools import cycle
from collections import Counter

In [None]:
def model_initialization(dimension=4,freeze=True):
    """Build a pretrained ResNet-152 whose classification head is replaced.

    Args:
        dimension: Output size of the new ``fc`` layer.
        freeze: When True, every pretrained parameter gets
            ``requires_grad=False``. The new ``fc`` layer is created after
            that loop, so it is trainable either way.

    Returns:
        ``(model, params_to_update)`` where ``params_to_update`` lists the
        parameters that still have ``requires_grad=True``.
    """
    backbone = models.resnet152(weights=models.ResNet152_Weights.DEFAULT)
    trainable = not freeze
    for weight in backbone.parameters():
        weight.requires_grad = trainable
    # Swap the head last so it keeps the default requires_grad=True.
    backbone.fc = nn.Linear(backbone.fc.in_features, dimension)
    params_to_update = [weight for weight in backbone.parameters() if weight.requires_grad]
    return backbone, params_to_update
def model_load(resnet_model,device,model_pth):
    """Load saved weights into ``resnet_model`` and prepare it for inference.

    Args:
        resnet_model: Module whose architecture matches the saved state dict.
        device: Target device; also used as ``map_location`` so a checkpoint
            saved on a GPU can be loaded on a machine without that GPU.
        model_pth: Path to a file written with ``torch.save(model.state_dict(), ...)``.

    Returns:
        The same module instance, moved to ``device`` and switched to eval mode.
    """
    # weights_only=True restricts unpickling to tensors and plain containers.
    state_dict = torch.load(model_pth, map_location=device, weights_only=True)
    resnet_model.load_state_dict(state_dict)
    resnet_model.to(device)
    resnet_model.eval()
    return resnet_model

#resnet_model,params_to_update = model_initialization(4)
#print(params_to_update)

In [None]:
# Preprocessing pipelines keyed by split name. 'train' adds random augmentation;
# 'valid' only resizes, converts to RGB and normalizes.
# NOTE(review): in 'train', ColorJitter runs before the RGB conversion, so it sees
# each tile in its on-disk mode — confirm tiles are never RGBA/palette images.
data_transforms = { # alternatives for augmentation: plain PIL operations, or SMOTE-style synthetic samples
    'train':
    transforms.Compose([
        transforms.Resize([256,256]),# resize the image to 256x256
        transforms.RandomRotation(45),# random rotation by an angle drawn from [-45, 45] degrees
        #transforms.CenterCrop(256),# crop [256, 256] from the centre (disabled)
        transforms.RandomHorizontalFlip(p=0.5),# horizontal flip with probability 0.5
        transforms.RandomVerticalFlip(p=0.5),# vertical flip with probability 0.5
        transforms.ColorJitter(brightness=0.2, contrast=0.1, saturation=0.1, hue=0.1),# jitter brightness, contrast, saturation and hue
        transforms.Lambda(lambda x: x.convert('RGB')), # force 3-channel RGB
        transforms.RandomGrayscale(p=0.1),# with probability 0.1 convert to grayscale (3 channels, R=G=B)
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])# per-channel mean and standard deviation
    ]),
    'valid':
    transforms.Compose([
        transforms.Resize([256,256]),
        transforms.Lambda(lambda x: x.convert('RGB')), # force 3-channel RGB
        #transforms.Grayscale(3),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
}

In [None]:
#做了数据增强不代表 训练效果一定会变好，只能说大概率上会变好
class wsi_dataset(Dataset):
    """Tile dataset backed by a text index file.

    Each non-blank line of the index has the form ``<image path> <label>``,
    split at the LAST space so that paths containing spaces work. Surrounding
    double quotes around the path are removed.

    Attributes:
        file_path: Path of the index file.
        imgs: Image paths in file order.
        labels: Label strings in file order (converted to int64 on access).
        transform: Optional callable applied to the opened PIL image.
    """
    def __init__(self, file_path,transform=None):
        """Parse the index file.

        Raises:
            ValueError: If a non-blank line contains no space separator.
        """
        self.file_path = file_path
        self.imgs = []
        self.labels = []
        self.transform = transform
        # The notebook writes its index files as UTF-8, so read them the same
        # way instead of relying on the platform default encoding.
        with open(self.file_path, encoding="utf-8") as f:
            for line_no, raw_line in enumerate(f, start=1):
                line = raw_line.strip()
                if not line:
                    # Blank lines would otherwise become bogus ("", "") samples.
                    continue
                img_path, sep, label = line.rpartition(" ")
                if not sep:
                    raise ValueError(
                        f"{self.file_path}:{line_no}: expected '<path> <label>', got {line!r}"
                    )
                self.imgs.append(img_path.strip('\"'))
                self.labels.append(label.strip())

    def __len__(self):
        """Number of samples listed in the index file."""
        return len(self.imgs)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for sample ``idx``.

        The image is opened with PIL and passed through ``transform`` when one
        was given; the label is a 0-d int64 tensor.
        """
        image = Image.open(self.imgs[idx])
        if self.transform:
            image = self.transform(image)
        label = torch.from_numpy(np.array(self.labels[idx], dtype=np.int64))
        return image, label

In [None]:
# Collect the tile images used to build the training and validation sets.
data_folder = r"E:\THT\DTC_histology\cut"
# Recursively find every .png whose file name starts with "tile" under data_folder.
file_pattern = os.path.join(data_folder, "**", "tile*.png")
files = glob.glob(file_pattern, recursive=True)
print(len(files))

In [None]:
# Assign each tile the index of the first tissue type whose name occurs in its
# path, collecting "<path> <index>" records overall and per type.
type_list = ["tumor","stroma","normal","immune"]
all_txt = []
type_files = {t: [] for t in type_list}
for file in files:
    for i, type_name in enumerate(type_list):
        if type_name in file:
            record = f"{file} {i}"
            all_txt.append(record)
            type_files[type_name].append(record)
            break
    else:
        # No tissue type matched this path: report it and leave it unlabeled.
        print(file)
for t in type_list:
    print(f"{t}: {len(type_files[t])} files")

In [None]:
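# How should the class imbalance be handled?
# 1. SMOTE: synthesize new minority-class samples to balance the training set (e.g. SMOTE from the imblearn library).
# 2. Weighted loss: give minority classes a larger loss weight so the model pays more attention to them.
#    In PyTorch this is nn.CrossEntropyLoss(weight=...), or a custom loss such as Focal Loss.
# For now, simply split every class with the same ratio.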
# 分割数据的函数
def split_data(files, train_ratio, seed=None):
    """Randomly split ``files`` into a training and a validation list.

    The input list is NOT modified: a copy is shuffled. Shuffling the caller's
    list in place meant that re-running the cell reshuffled already-shuffled
    data, so seeding could not reproduce a split.

    Args:
        files: Sequence of items to split.
        train_ratio: Fraction (0..1) of items that go to the training list;
            the count is ``int(len(files) * train_ratio)``.
        seed: Optional seed for a private RNG. When None, the module-level
            ``random`` state is used, so ``random.seed(...)`` still applies.

    Returns:
        ``(train_files, valid_files)``.
    """
    shuffled = list(files)
    rng = random if seed is None else random.Random(seed)
    rng.shuffle(shuffled)
    train_size = int(len(shuffled) * train_ratio)
    return shuffled[:train_size], shuffled[train_size:]

# Split every tissue type with the same ratio so class proportions are kept.
train_ratio = 0.7
train_files = {}
valid_files = {}
#random.seed(17) # left disabled by the original author, who noted it did not seem to affect the shuffle
for t in type_list:
    train_part, valid_part = split_data(type_files[t],train_ratio)
    train_files[t] = train_part
    valid_files[t] = valid_part
    print(f"{t}: {len(train_files[t])},{len(valid_files[t])} files")

In [None]:
# 合并每类文件路径，并写入 train_txt 和 valid_txt
def merge_dict_to_list(file_dict, output):
    """Append every entry of every list in ``file_dict`` to ``output``.

    Values are visited in dictionary order; the keys are ignored. ``output``
    is extended in place and nothing is returned.
    """
    for file_list in file_dict.values():
        output.extend(file_list)

# Flatten each split, shuffle it, and write it to train.txt / valid.txt
# (one "<path> <label>" record per line, no trailing newline).
train_txt = []
valid_txt = []
for split_path, split_files, split_lines in (
    ("train.txt", train_files, train_txt),
    ("valid.txt", valid_files, valid_txt),
):
    merge_dict_to_list(split_files, split_lines)
    random.shuffle(split_lines)
    with open(split_path, "w", encoding="utf-8") as f:
        f.write("\n".join(split_lines))

In [None]:
# Build the tissue-classification datasets from the index files written above.
train_data = wsi_dataset(file_path = 'train.txt',transform = data_transforms['train'])
valid_data = wsi_dataset(file_path = 'valid.txt',transform = data_transforms['valid'])

train_dataloader = DataLoader(train_data, batch_size=64, shuffle=True) # batches of 64 images
valid_dataloader = DataLoader(valid_data, batch_size=64, shuffle=True) # original note: pin_memory=True is recommended on GPU (it is not set here)

'''展示训练数据集中的图片'''
# Disabled preview of one training image:
# from matplotlib import pyplot as plt
# image, Label = iter(train_dataloader).__next__( ) # take the first batch from the loader
# sample = image[2] #image
# sample = sample.permute((1, 2, 0)).numpy() # reorder tensor dimensions from CHW to HWC
# plt.imshow(sample)
# plt.show()
# print('Label is: {}'.format(Label[2].numpy()))

In [None]:
'''定义神经网络'''
class CNN(nn.Module):
    """Small three-stage convolutional classifier for 3x256x256 inputs.

    The final linear layer expects 64*64*64 features, which is what the
    convolution stack produces for a 256x256 input (two 2x2 poolings).

    Args:
        num_classes: Number of output logits. Defaults to 4.
    """
    def __init__(self, num_classes=4):
        super().__init__()
        self.conv1 = nn.Sequential(
            # 5x5 kernels with padding 2 and stride 1 keep the spatial size.
            nn.Conv2d(in_channels=3, out_channels=16, kernel_size=5, stride=1, padding=2),  # -> (16, 256, 256)
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),  # -> (16, 128, 128)
        )
        self.conv2 = nn.Sequential(
            nn.Conv2d(16, 32, 5, 1, 2),  # -> (32, 128, 128)
            nn.ReLU(),
            nn.Conv2d(32, 32, 5, 1, 2),  # -> (32, 128, 128)
            nn.ReLU(),
            nn.MaxPool2d(2),  # -> (32, 64, 64)
        )
        self.conv3 = nn.Sequential(
            nn.Conv2d(32, 64, 5, 1, 2),  # -> (64, 64, 64)
            nn.ReLU(),
        )
        self.out = nn.Linear(64 * 64 * 64, num_classes)

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)``."""
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.conv3(x)  # (batch, 64, 64, 64) for 256x256 inputs
        x = x.flatten(1)   # (batch, 64 * 64 * 64)
        return self.out(x)

In [None]:
def train(dataloader, model, loss_fn, optimizer, device=None):
    """Run one training epoch: one optimizer step per batch.

    Args:
        dataloader: Iterable yielding ``(X, y)`` batches.
        model: Module to train; switched to train mode.
        loss_fn: Callable ``loss_fn(pred, y)`` returning a scalar tensor.
        optimizer: Optimizer holding the parameters to update.
        device: Device the batches are moved to. When None, the device of the
            model's first parameter is used, so the function does not depend
            on a notebook-level ``device`` variable.
    """
    if device is None:
        device = next(model.parameters()).device
    model.train()
    for X, y in dataloader:
        X, y = X.to(device), y.to(device)
        # Call the module, not .forward(), so registered hooks run.
        pred = model(X)
        loss = loss_fn(pred, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [None]:
def test(dataloader, model, loss_fn):
    global best_acc
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct = 0, 0
    with torch.no_grad(): #一个上下文管理器，关闭梯度计算。当你确认不会调用Tensor.backward()的时候。这可以减少计算所用内存消耗。
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model.forward(X)
            test_loss += loss_fn(pred, y).item() #
            #print(X,y,pred.argmax(1),sep="||")
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
            #a = (pred.argmax(1) == y) #dim=1表示每一行中的最大值对应的索引号， dim=0表示每一列中的最大值对应的索引号
            #b = (pred.argmax(1) == y).type(torch.float)
        test_loss /= num_batches
        correct /= size
        print(f"Test result: \n Accuracy: {(100*correct)}%, Avg loss: {test_loss}")
        acc_s.append (correct)
        loss_s.append(test_loss)     
        if correct > best_acc:
            best_acc = correct

In [None]:
'''cnn卷积神经网络部分'''
device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
print(f"Using {device} device")
# Build the 4-class tissue classifier here so this cell works on a fresh kernel
# (previously resnet_model / params_to_update were never defined).
resnet_model, params_to_update = model_initialization(4)
# .to(device) returns the same module object: model and resnet_model are aliases.
model = resnet_model.to(device)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params_to_update,lr=0.001)
'''训练模型'''
best_acc = 0
epochs = 20
acc_s = []
loss_s = []
# Checkpoints go to ./stage1; create it up front and build paths with
# os.path.join instead of a backslash inside a normal string literal.
checkpoint_dir = "stage1"
os.makedirs(checkpoint_dir, exist_ok=True)
for t in range(epochs):
    start_time = time.time()
    print(f"Epoch {t+1}\n--------------------")
    train(train_dataloader, model, loss_fn, optimizer)
    torch.save(model.state_dict(), os.path.join(checkpoint_dir, "model_stage1_epochs_"+str(t)+"_train.pth"))
    test(valid_dataloader, model, loss_fn)
    # Second checkpoint after validation (file name kept for existing consumers).
    torch.save(model.state_dict(), os.path.join(checkpoint_dir, "model_stage1_epochs_"+str(t)+"_test.pth"))
    end_time = time.time()
    time_diff = end_time - start_time
    print("时间差：", time_diff)
print()

In [None]:
# Report the PyTorch / CUDA / cuDNN environment.
print(torch.__version__)
print(torch.cuda.is_available())
print(torch.version.cuda)
print(torch.backends.cudnn.version())
# Values recorded by the original author for these four prints:
#2.5.1+cu118
#True
#11.8
#90100

In [None]:
# Plot the recorded validation accuracy and loss per epoch, side by side.
# The x-axis follows the number of recorded values rather than `epochs`, so an
# interrupted training run can still be plotted.
fig, (ax_acc, ax_loss) = plt.subplots(1, 2)
ax_acc.plot(range(len(acc_s)), acc_s)
ax_acc.set_xlabel('epoch')
ax_acc.set_ylabel('accuracy')
ax_loss.plot(range(len(loss_s)), loss_s)
ax_loss.set_xlabel('epoch')
ax_loss.set_ylabel('loss')
plt.show()
print("Done!")

In [None]:
# 需预测文件路径等预处理
def pre_txt_path(image_dir,tile_dir,txt_dir,wsi_type):
    """Write one tile index per slide plus an ``index.csv`` linking slides to them.

    For every file under ``image_dir`` (searched recursively) whose name ends
    with ``wsi_type``, the tiles are expected in ``tile_dir/<slide name without
    extension>/tile*.png``. A ``<slide name>.txt`` file listing
    ``<tile path> -1`` per line is written to ``txt_dir``.

    ``index.csv`` gets one row per slide with columns ``image_path`` and
    ``txt_path``. Slides without a tile folder get ``txt_path`` = ``NA``;
    slides whose folder contains no matching tile are reported and left out.

    Args:
        image_dir: Root directory of the whole-slide images.
        tile_dir: Directory containing one sub-folder of tiles per slide.
        txt_dir: Output directory; created if it does not exist.
        wsi_type: Slide file extension including the dot, e.g. ``".mrxs"``.
    """
    os.makedirs(txt_dir, exist_ok=True)
    index_data = []
    image_paths = glob.glob(f"{image_dir}/**/*{wsi_type}", recursive=True)
    for image_path in image_paths:
        # The tile sub-folder is named after the slide file without extension.
        subfolder_name = os.path.splitext(os.path.basename(image_path))[0]
        subfolder_path = os.path.join(tile_dir, subfolder_name)
        if not os.path.isdir(subfolder_path):
            print(f"Subfolder not found for: {image_path}")
            index_data.append({'image_path': image_path, 'txt_path': 'NA'})
            continue
        # Sorted so the index file does not depend on directory listing order.
        tile_files = sorted(glob.glob(os.path.join(subfolder_path, "tile*.png")))
        if not tile_files:
            print(f"No tile files found in: {subfolder_path}")
            continue
        sub_df = pd.DataFrame({
            "tile_file_name": tile_files,
            "label": -1  # placeholder label for not-yet-predicted tiles
        })
        output_txt = os.path.join(txt_dir, f"{subfolder_name}.txt")
        index_data.append({'image_path': image_path, 'txt_path': output_txt})
        sub_df.to_csv(output_txt, sep=" ", index=False, header=False)
        print(f"Sub_df saved for {image_path} to {output_txt}")
    # Explicit columns keep the CSV header even when no slide was found.
    index_df = pd.DataFrame(index_data, columns=['image_path', 'txt_path'])
    index_df.to_csv(os.path.join(txt_dir, "index.csv"), index=False)
    print(f"Index file saved to {os.path.join(txt_dir, 'index.csv')}")

# Earlier cohorts (calls kept for reference, currently disabled):
#pre_txt_path(image_dir = r"E:\THT\HE_DTC", 
#             tile_dir = r"E:\THT\HE_DTC_try",
#             txt_dir = r"E:\THT\HE_DTC_txt",
#             wsi_type = ".mrxs")
#pre_txt_path(image_dir = r"E:\THT\HE_DTC_TCGA\slides", 
#             tile_dir = r"E:\THT\HE_DTC_TCGA\cut",
#             txt_dir = r"E:\THT\HE_DTC_TCGA\txt",
#             wsi_type = ".svs")
# Build the tile indexes for the .mrxs slides under "second".
pre_txt_path(image_dir = r"E:\THT\HE_DTC_second\second\mrxs", 
             tile_dir = r"E:\THT\HE_DTC_second\second\cut_123",
             txt_dir = r"E:\THT\HE_DTC_second\second\txt_123",
             wsi_type = ".mrxs")
#print("------------ separator ------------")
# Build the tile indexes for the .svs slides under "second\kfb\svs".
pre_txt_path(image_dir = r"E:\THT\HE_DTC_second\second\kfb\svs", 
             tile_dir = r"E:\THT\HE_DTC_second\second\cut_pudong",
             txt_dir = r"E:\THT\HE_DTC_second\second\txt_pudong",
             wsi_type = ".svs")

In [None]:
#读分割好的tiles by slide，输出预测labels.csv   #可视化
#onco-fusion/tissue-type-training/eval_tissue_tile.py/def _visualize
def get_color_from_class(pred_class):
    """Map a predicted class index to an RGB color for visualization.

    Args:
        pred_class (int): Predicted class index.
    Returns:
        tuple: ``(R, G, B)``; black ``(0, 0, 0)`` for a class outside 0-3.
    """
    palette = {
        0: (255, 0, 0),    # class 0 - red
        1: (0, 255, 0),    # class 1 - green
        2: (0, 0, 255),    # class 2 - blue
        3: (255, 255, 0),  # class 3 - yellow
    }
    try:
        return palette[pred_class]
    except KeyError:
        # Unknown class: fall back to black.
        return (0, 0, 0)
    
def make_preds_by_slide_and_visualize(model, index_df, device, output_dir, transform, 
                                      visualize=True, out_all=True, all_shuffle=True,
                                      batch_size=64, num_workers=28, n_classes=4, scale_factor=1, 
                                      wsi_type=".mrxs",sum_calc=False,pick_label=-1):
    """Predict a class for every tile of every slide and write the results.

    For each row of ``index_df`` the tile index file (``txt_path``) is loaded
    with ``wsi_dataset`` and run through ``model``. Per slide this writes
    ``<slide>.csv`` (tile path, predicted class, raw model outputs) into
    ``output_dir``, plus the optional outputs described below. Rows whose
    ``txt_path`` is NaN or does not exist are skipped with a warning.

    Args:
        model (torch.nn.Module): Trained model; switched to eval mode.
        index_df (pd.DataFrame): Index with 'image_path' and 'txt_path' columns.
        device: Device the batches are moved to.
        output_dir (str): Directory for all outputs; created if missing.
        transform (callable): Transform applied to each tile image.
        visualize (bool): Also save ``<slide>_full_pred.png``, a square canvas
            on which every tile rectangle is filled with its class color. Tile
            coordinates are parsed from the file name: the text after the last
            underscore must be ``x1-y1-x2-y2.png``.
        out_all (bool): Write ``all_file.txt`` with ``<tile path> <class>`` for
            all slides together.
        all_shuffle (bool): Shuffle those records before writing.
        batch_size (int): DataLoader batch size.
        num_workers (int): DataLoader worker count.
        n_classes (int): Number of classes for CSV header and statistics.
        scale_factor: Slide coordinates are divided by this for the canvas.
        wsi_type (str): Slide extension, replaced when building output names.
        sum_calc (bool): Write per-slide class counts/percentages
            (``<slide>_sum_out_label=<k>.txt``) and ``sum_all.csv``.
        pick_label (int): When > -1, write for each slide a tile index (label
            -1) holding only the tiles predicted as this class into
            ``output_dir/txt``, plus ``pick_index.csv``.
    """
    from PIL import Image
    # Lift PIL's maximum-pixel limit (process-wide setting).
    Image.MAX_IMAGE_PIXELS = 230000000000
    
    def scale(x,scale_factor):
        return round(x / scale_factor)

    # CSV header: tile path, predicted class, one score column per class.
    header = ['tile_file_name', 'label']
    header.extend(['score_{}'.format(k) for k in range(n_classes)])
    all_file = []
    if sum_calc:
        # Summary header: slide name, majority class, then count and percent per class.
        sum_all = [['wsi_name','out_label']]
        for i in range(n_classes):
            sum_all[0].extend([f"{i}", f"{i}(%)"])
    if pick_label>-1:
        pick_path = []
        pick_dir = os.path.join(output_dir,"txt")
        os.makedirs(pick_dir, exist_ok=True)

    os.makedirs(output_dir, exist_ok=True)
    for _, row in index_df.iterrows():
        image_path = row['image_path']
        image_name = os.path.basename(image_path)
        txt_path = row['txt_path']
        # NaN != NaN, so the first test catches a missing txt_path.
        if (txt_path != txt_path) or (not os.path.exists(txt_path)):
            print(f"Warning: {txt_path} not found. Skipping...")
            continue
        dataset = wsi_dataset(file_path=txt_path, transform=transform)
        # shuffle=False is required: tile paths are recovered below by slicing
        # dataset.imgs with the batch index.
        dataloader = DataLoader(dataset, batch_size=batch_size, 
                                num_workers=num_workers, pin_memory=True, shuffle=False)
        if visualize:
            slide = Slide(image_path,output_dir)
            # Square canvas with the longer slide side, so slide orientation
            # does not matter; black background.
            side = scale(max(slide.dimensions), scale_factor)
            full_image_array = np.zeros((side, side, 3), dtype=np.uint8)

        output_file = os.path.join(output_dir, os.path.basename(image_path).replace(wsi_type, '.csv'))
        if sum_calc:
            sum_list = [0 for i in range(n_classes)]
        if pick_label>-1:
            pick_list = []
        
        with open(output_file, 'w', newline='') as file:
            writer = csv.writer(file, delimiter=',')
            writer.writerow(header)
            model.eval()
            with torch.no_grad():
                for batch_idx,(X,y) in enumerate(dataloader):
                    X = X.to(device)
                    preds = model(X)
                    preds = preds.detach().cpu().numpy()
                    img_paths = dataset.imgs[batch_idx*batch_size:(batch_idx+1)*batch_size]
                    for img_path,pred_list in zip(img_paths,preds):
                        # The class with the largest model output wins.
                        pred_class = pred_list.argmax()
                        color = get_color_from_class(pred_class)
                        img_name = os.path.basename(img_path)
                        write_row = [img_path,pred_class]
                        write_row.extend(pred_list)
                        writer.writerow(write_row)
                        all_file.append(f"{img_path} {pred_class}")
                        if pick_label > -1:
                            if int(pred_class)==int(pick_label):
                                pick_list.append(f"{img_path} -1")
                        if sum_calc:
                            sum_list[int(pred_class)] += 1
                        if visualize:
                            # Top-left and bottom-right corners from the file name.
                            x1, y1, x2, y2 = map(int, img_name.rsplit('_', 1)[-1].replace('.png', '').split('-'))
                            full_image_array[scale(y1,scale_factor):scale(y2,scale_factor), scale(x1,scale_factor):scale(x2,scale_factor)] = color
        if pick_label > -1:
            pick_txt = os.path.join(pick_dir, os.path.basename(txt_path))
            pick_path.append({'image_path': image_path, 'txt_path': pick_txt})
            with open(pick_txt, "w", encoding="utf-8") as f:
                f.write("\n".join(pick_list))
        if sum_calc:
            majority = str(sum_list.index(max(sum_list)))
            sum_single = [image_name,majority]
            sum_file = os.path.join(output_dir, os.path.basename(image_path).replace(wsi_type,'_sum_out_label='+majority+'.txt'))
            with open(sum_file, "w") as f:
                total = sum(sum_list)
                # A slide without tiles (e.g. an empty picked index) has
                # total == 0; report 0% instead of dividing by zero.
                percentages = [(x / total) * 100 if total else 0.0 for x in sum_list]
                f.write("\n\n统计结果：\n")
                for i, (count, percent) in enumerate(zip(sum_list, percentages)):
                    f.write(f"类别 {i}: 数量 {count}, 百分比 {percent:.2f}%\n")
                    sum_single.append(str(count))
                    sum_single.append(f"{percent:.2f}%")
                f.write(f"\n总数: {total}\n")
            sum_all.append(sum_single)
        if visualize:
            full_image = Image.fromarray(full_image_array.astype(np.uint8))
            output_image_path = os.path.join(output_dir, os.path.basename(image_path).replace(wsi_type, '_full_pred.png'))
            full_image.save(output_image_path, format='PNG')
    if all_shuffle:
        random.shuffle(all_file)
    if out_all:
        all_txt = os.path.join(output_dir, "all_file.txt")
        with open(all_txt, "w", encoding="utf-8") as f:
            f.write("\n".join(all_file))
    if sum_calc:
        sum_csv = os.path.join(output_dir,"sum_all.csv")
        with open(sum_csv, "w", newline="") as f:  
            writer = csv.writer(f)  
            writer.writerows(sum_all)
    if pick_label>-1:
        # Explicit columns keep the CSV header even when no slide was processed.
        pick_index_df = pd.DataFrame(pick_path, columns=['image_path', 'txt_path'])
        pick_index_df.to_csv(os.path.join(pick_dir, "pick_index.csv"), index=False)

In [None]:
# Split the slide index into the slides named in cut_lists and all others.
index_df = pd.read_csv(r'E:\THT\HE_DTC_txt\index.csv')
cut_lists = [
    "10-03611D", "14-08954B","15-04187","10-16623C",
    "15-41593G","16-03266F","16-06309","16-11786C",
    "16-39386B","16-54346D","17-03704B","17-04172B","17-25206B",
    "17-42804B","18-14810B","18-29417D","19-45599D",
]
cut_mrxs_lists = ['E:\\THT\\HE_DTC\\'+x+".mrxs" for x in cut_lists]
# One membership mask serves both the selection and its complement.
is_cut_slide = index_df['image_path'].isin(cut_mrxs_lists)
df_filtered = index_df[is_cut_slide]
print(len(df_filtered))
df_remaining = index_df[~is_cut_slide]
print(len(df_remaining))

In [None]:
# Predict and visualize the slides selected in df_filtered, then all remaining slides.
# NOTE(review): model_epoch_10 is not defined in the visible part of the notebook —
# presumably a checkpoint loaded with model_load; confirm before Restart & Run All.
index_df = pd.read_csv(r'E:\THT\HE_DTC_txt\index.csv')
make_preds_by_slide_and_visualize(model_epoch_10, df_filtered, device, r"E:\THT\HE_DTC_output",visualize=True,out_all=False,
                                  transform=data_transforms['valid'], batch_size=64, num_workers=0, n_classes=4, scale_factor=80)

# Disabled experiment on two rows of the index:
#index_df = pd.read_csv(r'E:\THT\HE_DTC_txt\index.csv')
#index_two = index_df[-6:-4]
#print(index_df[-6:-4])
#make_preds_by_slide_and_visualize(model_epoch_10, index_two, device, r"E:\THT\HE_DTC_output\remain_visual",visualize=False,out_all=False,
#                                  transform=data_transforms['valid'], batch_size=64, num_workers=0, n_classes=4, scale_factor=1)
# Original plan: first drop the slides already used for training/validation (index_df[16:]);
# then write all outputs into a single txt as [path, pred], without visualization.
make_preds_by_slide_and_visualize(model_epoch_10, df_remaining, device, r"E:\THT\HE_DTC_output\remain",visualize=True,out_all=True,
                                  transform=data_transforms['valid'], batch_size=64, num_workers=0, n_classes=4, scale_factor=80)

In [None]:
# Attach protein-subtype labels to slides (used later to pick tumor samples).
# Known name aliases noted by the original author:
#tumor-1 == case4
#18-27078 == 18-27018
#13-03990 == 13-03980
protein_label_path = r"E:\THT\DTC_histology\protein_label.txt"
index_df = pd.read_csv(r'E:\THT\HE_DTC_txt\index.csv')
# First column of the index holds the slide paths.
slide_paths = index_df.iloc[:, 0].tolist()
# Slide name = text between the last backslash and the last dot.
wsi_origin = [x[x.rfind("\\")+1:x.rfind(".")] for x in slide_paths]
# Same name with uppercase letters and whitespace removed, for matching.
wsi_list = [re.sub(r'[A-Z\s]', '', x) for x in slide_paths]
wsi_list = [x[x.rfind("\\")+1:x.rfind(".")] for x in wsi_list]
protein_list = []
protein_dict = {}
with open(protein_label_path) as f:
    # Strip only the line terminator; slicing off the last character would
    # truncate the final line when the file has no trailing newline.
    protein_labels = [x.rstrip("\r\n") for x in f.readlines()]
# The first half of the file lists names (first two characters are dropped),
# the second half the corresponding labels in the same order.
half = len(protein_labels)//2
for i in range(half):
    protein_list.append((protein_labels[i][2:],protein_labels[i+half]))
protein_name = [x[0]for x in protein_list]
# Report slides that have no protein label.
for i in wsi_list:
    if i not in protein_name:
        print(i)
# On duplicate names the later entry wins.
label_by_name = dict(protein_list)
for i in range(len(wsi_list)):
    if wsi_list[i] in label_by_name:
        protein_dict[wsi_origin[i]] = int(label_by_name[wsi_list[i]])
print(protein_dict)

In [None]:
# Select the tiles predicted as tumor (tissue label 0) and attach the protein
# subtype of their slide as the new label.
# filtered slides --> 17 txts --> all_files.txt
filtered_file_path = "all_files.txt"
# remaining slides --> all_file.txt
remain_file_path = r"E:\THT\HE_DTC_output\remain\all_file.txt"
tumor_txts_path = [filtered_file_path,remain_file_path]
tumor_tiles_with_protein = "tumor_tiles_with_protein.txt"
tumor_tiles_list = []
protein_labels_list = []
sep_list = ["cut","try"] # the two files use different tile root folders
none_list = []
# Results are kept in lists only (nothing is written to tumor_tiles_with_protein).
for txt_path, sep in zip(tumor_txts_path, sep_list):
    with open(txt_path, 'r', encoding='utf-8') as rf:
        for line in rf:
            # Split "<tile path> <tissue label>" at the last space. Unlike fixed
            # offsets from the line end, this also handles the final line, which
            # has no trailing newline, and multi-digit labels.
            tile_path, _, tissue_label = line.rstrip("\r\n").rpartition(" ")
            if tissue_label != "0":
                continue
            # Slide name = first path component after "<sep>\" (hence the +4).
            tile_wsi_path = tile_path[tile_path.rfind(sep)+4:tile_path.rfind("tile")]
            tile_wsi_name = tile_wsi_path[:tile_wsi_path.find('\\')]
            # Look up the slide name, then the name without its last character;
            # 0 means "no label". Stored labels are shifted to start at 0.
            tile_protein_type = protein_dict.get(tile_wsi_name,protein_dict.get(tile_wsi_name[:-1],0))-1
            if tile_protein_type==-1:
                if tile_wsi_name not in none_list:
                    none_list.append(tile_wsi_name)
            else:
                tumor_tiles_list.append(tile_path)
                protein_labels_list.append(str(tile_protein_type))
# Slides without a protein label.
print(none_list)
X_train, X_test, y_train, y_test = train_test_split(tumor_tiles_list, protein_labels_list, test_size=0.3, random_state=17)

In [None]:
# Subtype model: write the tile-level training / validation index files.
# Every record, including the last one, is terminated by a newline.
with open("protein_train.txt", "w", encoding="utf-8") as f:
    f.writelines(tile+" "+label+"\n" for tile, label in zip(X_train, y_train))
with open("protein_valid.txt", "w", encoding="utf-8") as f:
    f.writelines(tile+" "+label+"\n" for tile, label in zip(X_test, y_test))

In [None]:
# Split training / validation sets by person (prefix "p" = person), so that all
# tiles of one person end up in the same split.
# NOTE(review): resum_data, get_name and reget_data are not defined in the visible
# part of the notebook — confirm they are defined before this cell runs.
resum_dict = resum_data(get_name(tumor_tiles_list,cut_pre="\\",cut_end="\\t"),protein_labels_list)
# Split the dict keys; the second argument flattens all dict values into one list.
# NOTE(review): presumably each key maps to exactly one label, otherwise the two
# sequences differ in length — verify against resum_data.
pX_train, pX_test, py_train, py_test = train_test_split(list(resum_dict.keys()),
                                                        [x for y in list(resum_dict.values()) for x in y], 
                                                        test_size=0.3, random_state=17)
with open("protein_train_by_person.txt", "w", encoding="utf-8") as ft:
    with open("protein_valid_by_person.txt", "w", encoding="utf-8") as fv:
        for i in range(len(tumor_tiles_list)):
            # A tile belongs to a split when any key of that split is a substring of its path.
            is_in_train = False
            is_in_valid = False
            for j in pX_train:
                if j in tumor_tiles_list[i]:
                    is_in_train = True
                    break
            for j in pX_test:
                if j in tumor_tiles_list[i]:
                    is_in_valid = True
                    break
            # Report tiles that matched both splits or neither.
            if not is_in_train^is_in_valid:
                print(tumor_tiles_list[i])
            # i never equals len(tumor_tiles_list) inside this loop, so the branch
            # below is never taken and every record is written with a trailing newline.
            if i==len(tumor_tiles_list):
                if is_in_train:
                    ft.write(tumor_tiles_list[i]+" "+protein_labels_list[i])
                else:
                    fv.write(tumor_tiles_list[i]+" "+protein_labels_list[i])
            else:
                # Tiles not matched to the training split go to the validation file.
                if is_in_train:
                    ft.write(tumor_tiles_list[i]+" "+protein_labels_list[i]+"\n")
                else:
                    fv.write(tumor_tiles_list[i]+" "+protein_labels_list[i]+"\n")

# Reload both files and print the intersection of the key sets of the two splits,
# their union, and the size of the union.
X_train, y_train = reget_data("protein_train_by_person.txt")
X_test, y_test = reget_data("protein_valid_by_person.txt")
print(set(list(resum_data(get_name(X_train,cut_pre="\\",cut_end="\\t"),y_train).keys()))&set(list(resum_data(get_name(X_test,cut_pre="\\",cut_end="\\t"),y_test).keys())))
print(set(list(resum_data(get_name(X_train,cut_pre="\\",cut_end="\\t"),y_train).keys()))|set(list(resum_data(get_name(X_test,cut_pre="\\",cut_end="\\t"),y_test).keys())))
print(len(set(list(resum_data(get_name(X_train,cut_pre="\\",cut_end="\\t"),y_train).keys()))|set(list(resum_data(get_name(X_test,cut_pre="\\",cut_end="\\t"),y_test).keys()))))

In [None]:
# Datasets and loaders for the protein-subtype model, built from the by-person split files.
protein_train_data = wsi_dataset(file_path = 'protein_train_by_person.txt',transform = data_transforms['train'])
protein_valid_data = wsi_dataset(file_path = 'protein_valid_by_person.txt',transform = data_transforms['valid'])

protein_train_dataloader = DataLoader(protein_train_data, batch_size=64, shuffle=True,
                                      #num_workers=4,
                                      #persistent_workers=True,
                                      pin_memory=True) # batches of 64 images
protein_valid_dataloader = DataLoader(protein_valid_data, batch_size=64, shuffle=True,
                                      #num_workers=4,
                                      #persistent_workers=True,
                                      pin_memory=True) # original note: always set pin_memory=True on GPU

In [None]:
class CNN(nn.Module):
    """Small three-stage convolutional classifier for 3 x 256 x 256 images.

    Every convolution uses a 5x5 kernel with stride 1 and padding 2, so it keeps
    the spatial size; only the two max-pool layers halve it. Shapes below are
    (channels, height, width) for a 256 x 256 input:

        conv1: (3, 256, 256)  -> (16, 128, 128)
        conv2: (16, 128, 128) -> (32, 64, 64)
        conv3: (32, 64, 64)   -> (64, 64, 64)   (no pooling)
        out:   64 * 64 * 64 flattened features -> num_classes logits

    The linear layer's input size is fixed, so other input resolutions are not
    supported.

    Args:
        num_classes: Number of output logits. Defaults to 3.
    """

    def __init__(self, num_classes=3):
        super().__init__()
        self.conv1 = nn.Sequential(
            # 3 input channels (RGB), 16 feature maps.
            nn.Conv2d(in_channels=3, out_channels=16, kernel_size=5, stride=1, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),  # 256 -> 128
        )
        self.conv2 = nn.Sequential(
            nn.Conv2d(16, 32, 5, 1, 2),
            nn.ReLU(),
            nn.Conv2d(32, 32, 5, 1, 2),
            nn.ReLU(),
            nn.MaxPool2d(2),  # 128 -> 64
        )
        self.conv3 = nn.Sequential(
            nn.Conv2d(32, 64, 5, 1, 2),
            nn.ReLU(),
        )
        # conv3 leaves the 64 x 64 spatial size untouched, hence 64 * 64 * 64 features.
        self.out = nn.Linear(64 * 64 * 64, num_classes)

    def forward(self, x):
        """Return the class logits, shape (batch_size, num_classes)."""
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.conv3(x)  # (batch_size, 64, 64, 64)
        x = x.view(x.size(0), -1)  # flatten to (batch_size, 64 * 64 * 64)
        return self.out(x)

In [None]:
# Subtype (3-class) model. With the backbone frozen the result was only 57%
# (presumably accuracy), so the whole network is fine-tuned here (freeze=False).

# --- Model, loss, optimizer ---
device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
print(f"Using {device} device")
resnet_model,params_to_update = model_initialization(dimension=3,freeze=False)
# .to(device) does not create a new module: `model` and `resnet_model` are the same
# object, so training `model` also updates `resnet_model`.
model = resnet_model.to(device)
loss_fn = nn.CrossEntropyLoss()
# Adam with weight decay (the earlier setting was lr=0.001 without weight decay).
optimizer = torch.optim.Adam(params_to_update, lr=0.003, weight_decay=1e-4)
# Halve the learning rate when the monitored loss has not improved for 2 epochs.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2)

# --- Training ---
# Build checkpoint paths with os.path.join instead of a "stage2_2\model..." literal,
# which relied on the invalid escape sequence "\m", and make sure the directory
# exists before the first torch.save.
checkpoint_dir = "stage2_2"
os.makedirs(checkpoint_dir, exist_ok=True)
best_acc = 0
epochs = 20
acc_s = []
loss_s = []
for t in range(epochs):
    start_time = time.time()
    print(f"Epoch {t+1}\n--------------------")
    train(protein_train_dataloader, model, loss_fn, optimizer)
    # Save the weights of every epoch.
    torch.save(model.state_dict(), os.path.join(checkpoint_dir, "model_stage2_epochs_"+str(t)+".pth"))
    test(protein_valid_dataloader, model, loss_fn)
    # NOTE(review): assumes test() appends the validation loss/accuracy to the
    # module-level loss_s / acc_s lists - confirm against its definition.
    scheduler.step(loss_s[-1])  # lets the scheduler lower the learning rate
    end_time = time.time()
    time_diff = end_time - start_time
    print("时间差：", time_diff)
    # Early stopping: stop once the best accuracy is not among the last 3 epochs.
    # best_acc is only assigned in this cell; whether test() updates it is not
    # visible here, so the best value is also derived from acc_s itself.
    best_seen = max([best_acc, *acc_s])
    if t > 10 and best_seen not in acc_s[-3:]:
        print("Early stopping triggered")
        break
print()

In [None]:
# Two side-by-side panels: accuracy (left) and loss (right) per epoch.
# The x axis follows the number of recorded values rather than `epochs`: the
# training loop can break early, and a fixed range(0, epochs) would then not match
# the length of acc_s / loss_s and make plt.plot raise.
plt.subplot(1, 2, 1)
plt.plot(range(len(acc_s)), acc_s)
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.subplot(1, 2, 2)
plt.plot(range(len(loss_s)), loss_s)
plt.xlabel('epoch')
plt.ylabel('loss')
plt.show()  # render the figure
print("Done!")  # training finished

In [None]:
#重新获取训练集和验证集
def reget_data(file_path):
    """Read a split file with one "<tile path> <label>" record per line.

    The label is everything after the last space, so labels longer than one
    character and paths containing spaces are handled. Blank lines are skipped.

    Args:
        file_path: Path of the UTF-8 text file to read.

    Returns:
        A tuple (X, y) of numpy arrays holding the tile paths and the labels, both
        as strings and in file order.

    Raises:
        ValueError: If a non-blank line contains no space separator.
    """
    X, y = [], []
    with open(file_path, "r", encoding="utf-8") as f:
        for line_no, line in enumerate(f, start=1):
            record = line.rstrip()  # drops the newline and any trailing whitespace
            if not record:
                continue
            path, sep, label = record.rpartition(" ")
            if not sep:
                raise ValueError(
                    f"{file_path}:{line_no}: expected '<path> <label>', got {record!r}"
                )
            X.append(path)
            y.append(label)
    return np.array(X), np.array(y)

# Load the training and validation split files.
X_train, y_train = reget_data("protein_train.txt")
X_test, y_test = reget_data("protein_valid.txt")
# Quick look: first three paths of each split, first three training labels and
# last three validation labels.
print(*(X_train[:3], X_test[:3], y_train[:3], y_test[-3:]))

In [None]:
#统计训练集和验证集中数目
from collections import Counter
#print(Counter(y_train))
#print(Counter(y_test))
def get_name(name_list,cut_pre="",cut_end=""):
    """Extract a short name from each string by trimming around two markers.

    Each string is first cut just before the first occurrence of ``cut_end``.
    What remains is then cut one character after the start of the last
    occurrence of ``cut_pre``; for a single-character marker this removes the
    marker itself.

    A marker that is empty or does not occur leaves that side untouched. The
    previous slicing dropped the last character when ``cut_end`` was missing and
    returned empty strings for the default empty markers.

    Args:
        name_list: Iterable of strings (e.g. tile paths).
        cut_pre: Marker that precedes the wanted part.
        cut_end: Marker that follows the wanted part.

    Returns:
        A list with one trimmed string per input element.
    """
    names = []
    for raw in name_list:
        end = raw.find(cut_end) if cut_end else -1
        head = raw[:end] if end != -1 else raw
        # rfind returns -1 when the marker is absent, which makes start == 0.
        start = head.rfind(cut_pre) + 1 if cut_pre else 0
        names.append(head[start:])
    return names

def resum_data(name_list,value_list):
    """Group values by name, report names with several values and print counts.

    Args:
        name_list: Names, aligned element-wise with ``value_list``.
        value_list: Values (e.g. labels) belonging to the names.

    Returns:
        If every name maps to a single distinct value: ``{name: [value]}``.
        Otherwise: ``{name: {"values": [...], "is_same": bool}}`` where
        ``values`` lists the distinct values in first-seen order.
    """
    grouped = {}
    for name, value in zip(name_list, value_list):
        distinct = grouped.setdefault(name, [])
        if value not in distinct:
            distinct.append(value)

    detailed = {}
    found_conflict = False
    for name, distinct in grouped.items():
        has_several = len(distinct) > 1
        if has_several:
            # A name carrying more than one distinct value is reported.
            print("Not same:",name,distinct)
            found_conflict = True
        detailed[name] = {
            "values": distinct,
            "is_same": len(set(distinct)) == 1 if has_several else True,
        }

    print("名称列表统计：",Counter(name_list))
    print("键值列表统计：",Counter(value_list))
    # Both return shapes hold the same value lists in the same order, so these
    # two statistics do not depend on which one is returned.
    print("全子元素统计：",Counter(value for distinct in grouped.values() for value in distinct))
    print("仅子元素统计：",Counter(tuple(distinct) for distinct in grouped.values()))
    return detailed if found_conflict else grouped
        
# Summarise the training split first, then the validation split.
for _paths, _labels in ((X_train, y_train), (X_test, y_test)):
    print(resum_data(get_name(_paths, cut_pre="\\", cut_end="\\t"), _labels))

In [None]:
# Compute per-class ROC curves and AUC for the 3-class model on the validation set.
n_classes = 3
best_epoch = 13  # index of the checkpoint evaluated here
resnet_model,params_to_update = model_initialization(dimension=n_classes,freeze=False)
device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
# os.path.join replaces the "stage2_best_54\model..." literal, which relied on the
# invalid escape sequence "\m".
model = model_load(resnet_model,device,os.path.join("stage2_best_54","model_stage2_epochs_"+str(best_epoch)+".pth"))
model.eval()
# No shuffling here; the validation data is read in dataset order.
test_loader = DataLoader(protein_valid_data, batch_size=64, shuffle=False)

# Collect class probabilities and true labels batch by batch.
y_scores = []
y_labels = []
with torch.no_grad():  # inference only, no gradients needed
    for X_batch, y_batch in test_loader:
        outputs = model(X_batch.to(device))
        # Keep the full softmax row: with three classes there is no single
        # "positive class" column to select.
        probs = torch.softmax(outputs, dim=1)
        y_scores.append(probs.cpu().numpy())
        # Labels are only stored, so they are not moved to the device first.
        y_labels.append(y_batch.cpu().numpy())
y_probs = np.concatenate(y_scores)  # expected shape: (num_samples, n_classes)
y_true = np.concatenate(y_labels)   # expected shape: (num_samples,)
y_onehot_true = LabelBinarizer().fit(range(n_classes)).transform(y_true)

# One-vs-rest ROC curve and AUC for every class.
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_true == i, y_probs[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

In [None]:
# Averaged multi-class AUC via roc_auc_score, printed in this order:
# one-vs-rest macro, one-vs-one macro, one-vs-rest micro.
for _multi_class, _average in (('ovr', 'macro'), ('ovo', 'macro'), ('ovr', 'micro')):
    print(roc_auc_score(y_true, y_probs, multi_class=_multi_class, average=_average))
print(y_true.shape, y_probs.shape)

In [None]:
# One-hot encode the true labels so that every (sample, class) pair can be pooled.
y_onehot_true = LabelBinarizer().fit(range(3)).transform(y_true)

# Micro-average ROC ("method 2"): a single curve over all flattened pairs.
micro_fpr, micro_tpr, _ = roc_curve(y_onehot_true.ravel(), y_probs.ravel())
fpr["micro"] = micro_fpr
tpr["micro"] = micro_tpr
roc_auc["micro"] = auc(micro_fpr, micro_tpr)

# Macro-average ROC ("method 1"): interpolate every per-class curve on the union
# of their false-positive rates, then average the true-positive rates.
all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
mean_tpr = sum(np.interp(all_fpr, fpr[i], tpr[i]) for i in range(n_classes)) / n_classes
fpr["macro"] = all_fpr
tpr["macro"] = mean_tpr
roc_auc["macro"] = auc(all_fpr, mean_tpr)

# Draw the averaged curves, the per-class curves and the chance diagonal.
lw = 2
plt.figure()
micro_label = 'micro-average ROC curve (area = {0:0.4f})'.format(roc_auc["micro"])
plt.plot(fpr["micro"], tpr["micro"], label=micro_label,
         color='deeppink', linestyle=':', linewidth=4)

macro_label = 'macro-average ROC curve (area = {0:0.4f})'.format(roc_auc["macro"])
plt.plot(fpr["macro"], tpr["macro"], label=macro_label,
         color='navy', linestyle=':', linewidth=4)

colors = cycle(['aqua', 'darkorange', 'cornflowerblue'])
for i in range(n_classes):
    class_label = 'ROC curve of class {0} (area = {1:0.4f})'.format(i, roc_auc[i])
    plt.plot(fpr[i], tpr[i], color=next(colors), lw=lw, label=class_label)

plt.plot([0, 1], [0, 1], 'k--', lw=lw)  # chance level
plt.xlim([-0.02, 1.0])
plt.ylim([0.0, 1.02])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Extension of Receiver operating characteristic to multi-class')
plt.legend(loc="lower right")
plt.show()

In [None]:
# Apply model 1 (4 output classes) to the "123" and "pudong" slide indexes.
# NOTE(review): `device` is not assigned in this cell; it must already exist from
# an earlier cell.
index_df_123 = pd.read_csv(r'E:\THT\HE_DTC_second\second\txt_123\index.csv')
index_df_pudong = pd.read_csv(r'E:\THT\HE_DTC_second\second\txt_pudong\index.csv')
resnet_model_epoch_10,params_no_use = model_initialization(4)
# os.path.join replaces the "stage1\model..." literal, which relied on the invalid
# escape sequence "\m".
model_epoch_10 = model_load(resnet_model_epoch_10,device,os.path.join("stage1","model_stage1_epochs_10_train.pth"))
model_epoch_10.eval()
make_preds_by_slide_and_visualize(model_epoch_10, index_df_123, device, r"E:\THT\HE_DTC_second\second\output\model1_out\123",transform=data_transforms['valid'],
                                  visualize=True,out_all=True,all_shuffle=False,batch_size=64, num_workers=0, n_classes=4, scale_factor=80,
                                  wsi_type=".mrxs",sum_calc=True,pick_label=0)
print("------------------我是分界线！-------------------")
make_preds_by_slide_and_visualize(model_epoch_10, index_df_pudong, device, r"E:\THT\HE_DTC_second\second\output\model1_out\pudong",transform=data_transforms['valid'],
                                  visualize=True,out_all=True,all_shuffle=False,batch_size=64, num_workers=0, n_classes=4, scale_factor=80,
                                  wsi_type=".svs",sum_calc=True,pick_label=0)

In [None]:
# Apply model 2 (3 output classes) to the TCGA slides listed in pick_index.csv,
# once per saved epoch, and collect each epoch's slide-level labels into a CSV.
# Alternative inputs for the "123" cohort:
#pick_index_df = pd.read_csv(r'E:\THT\HE_DTC_second\second\output\model1_out\123\txt\pick_index.csv')
#output_dir = r'E:\THT\HE_DTC_second\second\output\model2_out\123'
pick_index_df = pd.read_csv(r'E:\THT\HE_DTC_TCGA\output\model1_out\txt\pick_index.csv')
output_dir = r'E:\THT\HE_DTC_TCGA\output\model2_2out'
# Slide-level result files are named "<wsi_name>_sum_out_label=<label>.txt"
# (presumably written by make_preds_by_slide_and_visualize - confirm).
# Compiled once instead of on every iteration.
pattern = re.compile(r"(.+)_sum_out_label=(.+)\.txt")
# Epoch subsets used in earlier runs:
#for t in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 19]:
#for t in [0,3,6,18]:
# NOTE(review): range(21) expects checkpoints 0..20, while the training cell runs
# 20 epochs (indexes 0..19) - confirm that epoch 20 exists in stage2_best_54.
for t in range(21):
    resnet_model,params_to_update = model_initialization(dimension=3,freeze=False)
    # os.path.join replaces the "stage2_best_54\model..." literal, which relied on
    # the invalid escape sequence "\m".
    model = model_load(resnet_model,device,os.path.join("stage2_best_54","model_stage2_epochs_"+str(t)+".pth"))
    model.eval()
    epoch_dir = os.path.join(output_dir,str(t))
    make_preds_by_slide_and_visualize(model, pick_index_df, device, epoch_dir, transform=data_transforms['valid'],
                                      visualize=True,out_all=True,all_shuffle=False,batch_size=64, num_workers=0, n_classes=3, scale_factor=80,
                                      wsi_type=".svs",sum_calc=True)
    output_result = os.path.join(epoch_dir, 'output_result.csv')
    data = []
    for filename in os.listdir(epoch_dir):
        match = pattern.match(filename)
        if match:
            wsi_name, out_label = match.groups()
            data.append([wsi_name, out_label])
    df = pd.DataFrame(data, columns=["wsi_name", "out_label"])
    df.to_csv(output_result, index=False)
    print(f"转换完成，结果已保存至: {output_result}")
    print(df['out_label'].value_counts())

In [None]:
# Apply model 2 (3 output classes) to the validation slides listed in
# pick_index.csv and collect the slide-level labels into a CSV.
pick_index_df = pd.read_csv(r'.\valid_txt\pick_index.csv')
output_dir = r'.\valid_txt\output'
# Slide-level result files are named "<wsi_name>_sum_out_label=<label>.txt"
# (presumably written by make_preds_by_slide_and_visualize - confirm).
# Compiled once instead of on every iteration.
pattern = re.compile(r"(.+)_sum_out_label=(.+)\.txt")
for t in range(1):  # only the epoch-0 checkpoint is evaluated
    resnet_model,params_to_update = model_initialization(dimension=3,freeze=False)
    # os.path.join replaces the "stage2\model..." literal, which relied on the
    # invalid escape sequence "\m".
    model = model_load(resnet_model,device,os.path.join("stage2","model_stage2_epochs_"+str(t)+".pth"))
    model.eval()
    epoch_dir = os.path.join(output_dir,str(t))
    make_preds_by_slide_and_visualize(model, pick_index_df, device, epoch_dir, transform=data_transforms['valid'],
                                      visualize=True,out_all=True,all_shuffle=False,batch_size=64, num_workers=0, n_classes=3, scale_factor=80,
                                      wsi_type=".mrxs",sum_calc=True)
    output_result = os.path.join(epoch_dir, 'output_result.csv')
    data = []
    for filename in os.listdir(epoch_dir):
        match = pattern.match(filename)
        if match:
            wsi_name, out_label = match.groups()
            data.append([wsi_name, out_label])
    df = pd.DataFrame(data, columns=["wsi_name", "out_label"])
    df.to_csv(output_result, index=False)
    print(f"转换完成，结果已保存至: {output_result}")
    print(df['out_label'].value_counts())

In [None]:
# Reminder: E:\THT\HE_DTC_second\second still holds another batch of .mrxs slides.

# Turn the "{wsi_name}_sum_out_label={out_label}.txt" file names found in
# output_dir into a (wsi_name, out_label) table.
output_dir = r'E:\THT\HE_DTC_TCGA\output\model2_out\0'
output_result = os.path.join(output_dir, 'output_result.csv')
pattern = re.compile(r"(.+)_sum_out_label=(.+)\.txt")
_matches = (pattern.match(filename) for filename in os.listdir(output_dir))
data = [list(found.groups()) for found in _matches if found]
df = pd.DataFrame(data, columns=["wsi_name", "out_label"])
# Writing the CSV is currently disabled; the message below is printed regardless.
#df.to_csv(output_result, index=False)
print(f"转换完成，结果已保存至: {output_result}")
print(df['out_label'].value_counts())
# Output recorded from an earlier run on ...\model2_out\18:
#转换完成，结果已保存至: E:\THT\HE_DTC_TCGA\output\model2_out\18\output_result.csv
#out_label
#0    467
#2     20
#Name: count, dtype: int64

In [None]:
# Print the current time; gmtime() means the timestamp is in UTC, not local time.
print(time.strftime("%m月 %d日 %H时 %M分 %S秒", time.gmtime()))