In [1]:
# Full header of one patient file, in file order.
# NOTE(review): the grouping labels below follow the PhysioNet/CinC 2019
# challenge data description — assumed to be the source of these files; confirm.
column_names = (
    # vital signs
    ["HR", "O2Sat", "Temp", "SBP", "MAP", "DBP", "Resp", "EtCO2"]
    # laboratory values
    + ["BaseExcess", "HCO3", "FiO2", "pH", "PaCO2", "SaO2", "AST", "BUN"]
    + ["Alkalinephos", "Calcium", "Chloride", "Creatinine", "Bilirubin_direct"]
    + ["Glucose", "Lactate", "Magnesium", "Phosphate", "Potassium"]
    + ["Bilirubin_total", "TroponinI", "Hct", "Hgb", "PTT", "WBC"]
    + ["Fibrinogen", "Platelets"]
    # demographics / stay information
    + ["Age", "Gender", "Unit1", "Unit2", "HospAdmTime", "ICULOS"]
    # per-row outcome column
    + ["SepsisLabel"]
)

In [2]:
import os
import torch
import numpy as np
import csv
from matplotlib import pyplot as plt
from sklearn import model_selection
import pickle

class SEPSIS(object):
    """In-memory dataset built from per-patient CSV files.

    Every ``*.csv`` file found directly inside ``root`` is treated as one
    patient. The first ``max_steps`` rows of each file are converted to
    tensors, and one binary label per patient is derived from the
    ``SepsisLabel`` column of *all* rows of that file.

    Each element of ``self.data`` is a tuple
    ``(record_id, tt, vals, mask, sepsis_label)``:

    * ``record_id`` -- 1-based running integer; files are visited in sorted
      file-name order so the numbering is reproducible.
    * ``tt`` -- 1-D tensor holding the ``ICULOS`` value of every kept row.
    * ``vals`` -- ``(kept_rows, len(params))`` tensor of measurements, 0 where
      a value is missing.
    * ``mask`` -- same shape as ``vals``; 1 where a value was observed, else 0.
    * ``sepsis_label`` -- scalar tensor, 1 if any row has ``SepsisLabel == 1``.
    """

    # Columns that are turned into features, in tensor-column order.
    params = [
        'HR', 'O2Sat', 'Temp', 'SBP', 'MAP', 'DBP', 'Resp', 'EtCO2', 'BaseExcess', 'HCO3', 'FiO2', 'pH', 'PaCO2',
        'SaO2', 'AST', 'BUN', 'Alkalinephos', 'Calcium', 'Chloride', 'Creatinine', 'Bilirubin_direct', 'Glucose',
        'Lactate', 'Magnesium', 'Phosphate', 'Potassium', 'Bilirubin_total', 'TroponinI', 'Hct', 'Hgb', 'PTT', 'WBC',
        'Fibrinogen', 'Platelets'
    ]

    # Parameter name -> column index in the ``vals`` / ``mask`` tensors.
    params_dict = {k: i for i, k in enumerate(params)}

    def __init__(self, root, device=torch.device("cpu"), max_steps=6):
        """Load every patient file under ``root`` into memory.

        Args:
            root: Directory containing one ``.csv`` file per patient.
            device: Device on which the created tensors are placed.
            max_steps: Number of leading rows kept per patient. ``None`` keeps
                every row. Defaults to 6, the value that used to be hard-coded.
        """
        self.root = root
        self.device = device
        self.max_steps = max_steps

        self.data = []
        # Kept for backward compatibility; nothing in this class fills it.
        self.labels = []

        self.process_data()

    def _encode_row(self, row):
        """Return ``(values, observed)`` lists of floats for one CSV row."""
        values = [0.0] * len(self.params)
        observed = [0.0] * len(self.params)

        for param, idx in self.params_dict.items():
            raw = row[param]
            # DictReader yields None for fields missing from a short row.
            if raw is None or raw == '':
                continue
            try:
                val = float(raw)
            except ValueError:
                # Deliberate best effort: an unparseable cell counts as missing.
                continue
            if not np.isnan(val):
                values[idx] = val
                observed[idx] = 1.0

        return values, observed

    def process_data(self):
        """Read all patient files and store the resulting tuples in ``self.data``.

        Files without any usable data row are skipped and do not consume a
        ``record_id``.

        Raises:
            KeyError: If a row lacks ``SepsisLabel``, ``ICULOS`` or a feature column.
            ValueError: If ``SepsisLabel`` or ``ICULOS`` is not numeric.
        """
        patients = []
        record_id = 1  # record ids start at 1

        # Sorted so that record_id does not depend on OS directory order.
        for patient_file in sorted(os.listdir(self.root)):
            if not patient_file.endswith(".csv"):
                continue

            file_path = os.path.join(self.root, patient_file)
            # newline='' is what the csv module asks for when reading files.
            with open(file_path, 'r', newline='') as f:
                rows = list(csv.DictReader(f))

            # Only the leading rows are used as features.
            window = rows[:self.max_steps]
            if not window:
                # Header-only file (or max_steps == 0): nothing to stack.
                continue

            # The label looks at every row, not only the kept window.
            # float() also accepts labels written as "1.0".
            sepsis_label = int(any(float(row['SepsisLabel']) == 1 for row in rows))

            tt = []
            vals = []
            mask = []
            for row in window:
                tt.append(float(row['ICULOS']))  # ICULOS is used as the time axis
                row_vals, row_mask = self._encode_row(row)
                vals.append(row_vals)
                mask.append(row_mask)

            # Build each tensor in one call instead of element-wise writes.
            tt = torch.tensor(tt, device=self.device)
            vals = torch.tensor(vals, device=self.device)
            mask = torch.tensor(mask, device=self.device)
            sepsis_label = torch.tensor(sepsis_label)

            patients.append((record_id, tt, vals, mask, sepsis_label))
            record_id += 1

        self.data = patients

    def __getitem__(self, index):
        """Return the record tuple (or list of tuples for a slice) at ``index``."""
        return self.data[index]

    def __len__(self):
        """Return the number of loaded patient records."""
        return len(self.data)

    def visualize(self, timesteps, data, mask, plot_name):
        """Plot every parameter with more than 2 observations and save the figure.

        Args:
            timesteps: 1-D tensor of time values, one per row of ``data``.
            data: ``(rows, len(params))`` tensor of values.
            mask: Tensor shaped like ``data``; entries equal to 1 mark observed values.
            plot_name: Path passed to ``Figure.savefig``.

        Raises:
            ValueError: If no parameter has more than 2 observed values.
        """
        width = 15
        height = 15

        # .numpy() below only works on CPU tensors.
        timesteps, data, mask = timesteps.cpu(), data.cpu(), mask.cpu()

        enough_obs = (torch.sum(mask, 0) > 2).tolist()
        non_zero_idx = [i for i, keep in enumerate(enough_obs) if keep]
        n_non_zero = len(non_zero_idx)
        if n_non_zero == 0:
            raise ValueError("no parameter has more than 2 observed values; nothing to plot")

        mask = mask[:, non_zero_idx]
        data = data[:, non_zero_idx]
        params_non_zero = [self.params[i] for i in non_zero_idx]

        n_col = 3
        n_row = n_non_zero // n_col + (n_non_zero % n_col > 0)
        # squeeze=False keeps ax_list 2-D even when there is a single row.
        fig, ax_list = plt.subplots(n_row, n_col, figsize=(width, height),
                                    facecolor='white', squeeze=False)
        try:
            for i, param in enumerate(params_non_zero):
                observed = mask[:, i].long() == 1
                ax = ax_list[i // n_col, i % n_col]
                ax.plot(timesteps[observed].numpy(), data[observed, i].numpy(), marker='o')
                ax.set_title(param)

            fig.tight_layout()
            fig.savefig(plot_name)
        finally:
            # Release the figure even if drawing or saving fails.
            plt.close(fig)


In [3]:
# Parse every patient CSV under ./training/processed into CPU tensors.
dataset_obj = SEPSIS(
    root='./training/processed',
    device=torch.device("cpu"),
)

In [4]:
# Echo the dataset object; SEPSIS defines no __repr__, so this shows the default one.
dataset_obj

<__main__.SEPSIS at 0x1078afcd0>

In [5]:
# Preview only the first records; echoing the whole list would dump every
# patient tuple into the cell output.
dataset_obj.data[:2]

[(1,
  tensor([3., 4., 5., 6., 7., 8.]),
  tensor([[ 65.0000, 100.0000,  36.5000, 157.5000, 101.5000,  66.5000,  15.5000,
             0.0000,   0.0000,   0.0000,   0.0000,   7.2800,  42.0000,  99.0000,
             0.0000,  14.0000,   0.0000,   4.3600,   0.0000,   0.5100,   0.0000,
           193.5000,   0.8700,   0.0000,   0.0000,   5.2500,   0.0000,   0.0000,
            29.2000,  10.8000,  29.0000,  11.6000, 305.0000, 172.0000],
          [ 63.0000, 100.0000,   0.0000, 156.5000, 102.0000,  66.5000,  12.0000,
             0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,
             0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,
             0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,
             0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000],
          [ 69.0000, 100.0000,  37.0000, 164.0000, 108.0000,  74.0000,  15.0000,
             0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,
     

In [6]:
# Number of patient records loaded (SEPSIS.__len__ returns len(self.data)).
len(dataset_obj)

40336

In [7]:
# Copy all patient records into a plain list, then split them 80/20 into
# train and test with a fixed seed.
total_dataset = dataset_obj[:]
train_data, test_data = model_selection.train_test_split(
    total_dataset,
    train_size=0.8,
    shuffle=True,
    random_state=42,
)

In [8]:
# Unpack the first training record to inspect its fields: the running record id,
# the ICULOS time points, the value tensor, the observation mask and the label.
record_id, tt, vals, mask, labels = train_data[0]

In [9]:
# Number of time points kept for this record.
len(tt)

6

In [10]:
# Split the training portion again, 80/20, into train and validation.
# NOTE: this rebinds `train_data`, so running the cell a second time without
# re-running the train/test split shrinks the training set again.
train_data, val_data = model_selection.train_test_split(
    train_data,
    train_size=0.8,
    shuffle=True,
    random_state=42,
)

In [11]:
# Persist the complete, unsplit list of patient records.
# NOTE(review): the CSVs are read from './training/processed' but this writes
# under './training/process' — confirm the directory name is intended.
# The context manager closes the handle even if pickling fails.
with open('./training/process/total_dataset.pickle', 'wb') as file:
    pickle.dump(total_dataset, file)

In [12]:
# Persist the three splits together as one (train, val, test) tuple.
# The context manager closes the handle even if pickling fails.
with open('./training/process/train_val_test_dataset.pickle', 'wb') as file:
    pickle.dump((train_data, val_data, test_data), file)