In [12]:
import os
import torch
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader
import nibabel as nib
import numpy as np
import pandas as pd

In [13]:
# Indices of the six cross-sections kept from the third axis of every volume.
# The original author notes the selectable range as 0 to 155.
select_section = [
    30, 50, 70,
    90, 110, 140,
]

In [14]:
def _depth_before_spatial(volume):
    """Move the last axis to position 1: axes (0, 1, 2, 3) -> (0, 3, 1, 2).

    For the stacked sample built by the dataset this reorders the tensor to
    [C, D, H, W].
    """
    return volume.permute(0, 3, 1, 2)


# Preprocessing pipeline applied to every stacked sample tensor.
# ToTensor is intentionally not part of it: the dataset already hands over a
# torch tensor rather than a numpy array.
My_transform = transforms.Compose([
    transforms.Lambda(_depth_before_spatial),
    # Normalisation step. NOTE(review): with mean 0 and std 1 this computes
    # (x - 0) / 1, i.e. values pass through unchanged - replace with real
    # dataset statistics if actual normalisation is intended.
    transforms.Normalize(mean=[0], std=[1]),
])


In [0]:
class MyDataset(Dataset):
    """Multi-sequence MRI dataset yielding ``(input_tensor, label)`` pairs.

    Every sub-directory of ``root_dir`` is one sample. Its name is the subject
    id followed by a 6-character suffix (``_nifti`` per the original author's
    note), and it must contain ``<subject id>_<SEQ>.nii.gz`` for each entry of
    ``SEQUENCES``.

    Args:
        root_dir: Directory holding one sub-directory per subject.
        csv_dir: Path to the metadata CSV. It must provide the columns ``ID``
            and ``WHO CNS Grade``.
        transform: Optional callable applied to the stacked input tensor.
        sections: Optional indices selected along the third axis of every
            volume. When omitted, the module-level ``select_section`` list is
            looked up at access time (the original behaviour).
    """

    # Sequence suffixes; their order is the channel order of the input tensor.
    SEQUENCES = ('T1', 'T1c', 'T2', 'FLAIR', 'ASL')
    # Number of trailing characters stripped from a folder name to get the id.
    SUFFIX_LEN = 6

    def __init__(self, root_dir, csv_dir, transform = None, sections = None):
        self.root_dir = root_dir
        # os.listdir returns entries in arbitrary order and also lists stray
        # files / hidden folders (e.g. .ipynb_checkpoints). Keep only visible
        # directories and sort them so that index -> subject is reproducible.
        self.samples = sorted(
            entry for entry in os.listdir(root_dir)
            if not entry.startswith('.')
            and os.path.isdir(os.path.join(root_dir, entry))
        )
        self.csv_file = pd.read_csv(csv_dir)
        self.transform = transform
        self.sections = sections

    def __len__(self):
        """Return the number of sample directories found under ``root_dir``."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Load sample ``idx``.

        Returns:
            A tuple ``(input_tensor, label)``. ``input_tensor`` is a float32
            tensor stacking the five sequences along axis 0 (after
            ``transform``, if one was given); ``label`` is an ``int`` equal to
            the WHO CNS grade minus 2.

        Raises:
            IndexError: if ``idx`` is outside the sample list.
            KeyError: if the metadata CSV has no row for the subject.
        """
        folder = self.samples[idx]
        sample_path = os.path.join(self.root_dir, folder)
        subject_id = folder[:-self.SUFFIX_LEN]

        # Resolve the slice indices lazily so that the module-level default is
        # honoured exactly as before when no explicit sections were passed.
        sections = select_section if self.sections is None else list(self.sections)

        # Combine the five MRI sequences into a single input tensor.
        channels = [
            self._load_sequence(sample_path, subject_id, sequence, sections)
            for sequence in self.SEQUENCES
        ]
        input_tensor = torch.as_tensor(np.stack(channels, axis=0), dtype=torch.float32)
        # No axis reordering here; that is left to ``transform``.
        if self.transform:
            input_tensor = self.transform(input_tensor)

        label = self._label_for(subject_id)
        return input_tensor, label

    @staticmethod
    def _load_sequence(sample_path, subject_id, sequence, sections):
        """Read one NIfTI sequence and keep only ``sections`` of its third axis."""
        file_path = os.path.join(sample_path, subject_id + '_' + sequence + '.nii.gz')
        return nib.load(file_path).get_fdata()[:, :, sections]

    def _label_for(self, subject_id):
        """Look up the zero-based class label of ``subject_id`` in the CSV."""
        # Folder ids carry one more digit than the CSV ids, e.g. folder
        # UCSF-PDGM-0004 is listed as UCSF-PDGM-004, so drop the 4th-from-last
        # character before matching.
        csv_id = subject_id[:-4] + subject_id[-3:]
        grades = self.csv_file.loc[self.csv_file['ID'] == csv_id, 'WHO CNS Grade'].values
        if len(grades) == 0:
            # Deliberately not IndexError: raised from __getitem__ it would be
            # interpreted as "end of sequence" and silently truncate a plain
            # ``for sample in dataset`` loop.
            raise KeyError(f"No row with ID {csv_id!r} in the metadata CSV (sample folder id {subject_id!r})")
        # Map grades [2, 3, 4] to [0, 1, 2]; class indices for the
        # cross-entropy loss have to start at 0. Cast to a plain int so the
        # label is an integer even if pandas parsed the column as float.
        return int(grades[0]) - 2


In [15]:
# Randomly split the dataset into a training and a test subset (70 % / 30 %).
# The split is driven by a seeded generator so that the same samples land in
# each subset on every Restart & Run All.
SPLIT_SEED = 42
custom_dataset = MyDataset(root_dir='UCSF-PDGM-v3',csv_dir='data/UCSF-PDGM-metadata_v2.csv', transform = My_transform)
train_size = int(len(custom_dataset) * 0.7)
# The remainder goes to the test subset so the two sizes always sum to the total.
test_size = len(custom_dataset) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(
    custom_dataset,
    [train_size, test_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

print(len(train_dataset))
print(len(test_dataset))

In [16]:
if __name__ == '__main__':
    # Build the datasets from the pre-split Train / Test directories.
    # NOTE(review): this rebinds train_dataset / test_dataset; any earlier
    # objects bound to these names are replaced from here on.
    train_dataset = MyDataset(root_dir='data/MRI/Train',csv_dir='data/UCSF-PDGM-metadata_v2.csv', transform = My_transform)
    test_dataset = MyDataset(root_dir='data/MRI/Test',csv_dir='data/UCSF-PDGM-metadata_v2.csv', transform = My_transform)

    # Create a dataloader for each dataset. Only the training data is
    # shuffled; the test loader keeps a fixed order so evaluation runs visit
    # the samples identically every time.
    train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False)

    # Smoke test: pull a single batch and show its shape.
    samples, labels = next(iter(train_loader))
    print(samples.shape)

UCSF-PDGM-352
UCSF-PDGM-004
UCSF-PDGM-007
UCSF-PDGM-008
torch.Size([4, 240, 240, 6])
