In [1]:

!pip install vitaldb
!pip install tensorboardX

Collecting vitaldb
  Downloading vitaldb-1.4.7-py3-none-any.whl (56 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/56.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.8/56.8 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
Collecting wfdb (from vitaldb)
  Downloading wfdb-4.1.2-py3-none-any.whl (159 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m160.0/160.0 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: wfdb, vitaldb
Successfully installed vitaldb-1.4.7 wfdb-4.1.2
Collecting tensorboardX
  Downloading tensorboardX-2.6.2.2-py2.py3-none-any.whl (101 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.7/101.7 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tensorboardX
Successfully installed tensorboardX-2.6.2.2


In [2]:
 import pandas as pd
df_cases = pd.read_csv("https://api.vitaldb.net/cases")  # clinical information
df_trks = pd.read_csv("https://api.vitaldb.net/trks")    # track list
df_labs = pd.read_csv('https://api.vitaldb.net/labs')    # laboratory results

# loader

## Class Dataloader

In [3]:
import torch
import numpy as np
from pandas import read_csv
import os
from tqdm import tqdm
import torch.utils.data as dat
import matplotlib.pyplot as plt


class Dataloader:
    def __init__(self, database_wdir, nums, time_step, tw):
        self.database_wdir = database_wdir
        self.nums = nums
        self.time_step = time_step
        self.tw = tw

    def label_stat(self, case_nums=1, traindata="train"):
        """
        :param case_nums:一次性加载的样本数
        :return: 带有bis,生理特征,用药量信息的字典列表,列表长度为 case_nums
        case_id:样本id列表
        case_information:样本的生理信息表
        case_in_information:样本在信息表中的位置，如 case3:->[0], case30:->[13]
        """
        case_information = read_csv(f'/HDD_data/HYK/bis/database/new_{traindata}_clean.csv')
        case_id = self.file_name(data=traindata)
        for i in range(len(case_id)):
            case_id[i] = case_id[i].split('.')[0]  # 字符串转数字

        case_id = list(map(int, case_id))
        case_id.sort()

        data_label = []

        for i in tqdm(range(case_nums)):
            df = read_csv(f'{self.database_wdir}/{traindata}/{case_id[i]}.csv')
            x_len = int(len(df.BIS) / self.time_step)

            label = np.zeros(x_len)
            for j in range(0, x_len, 1):
                label[int(j)] = df.BIS.values[j * self.time_step]

            data_label.extend(label)

        data_label.sort()
        for i in range(len(data_label)):
            data_label[i] = int(data_label[i])
        j = 0
        label_num = list(np.zeros(100))
        for i in range(100):
            while data_label[j] == i:
                label_num[i] += 1
                j += 1
                if j == len(data_label)-10:
                    break

        import matplotlib.pyplot as plt

        plt.grid(True)
        plt.autoscale(axis='x', tight=True)
        plt.bar(list(range(100)), label_num)
        plt.xlabel("bis index")
        plt.ylabel("label nums")
        plt.show()

        return data_label, label_num

    def dataload(self, case_nums=1, traindata="train"):
        """
        :param case_nums:一次性加载的样本数
        :return: 带有bis,生理特征,用药量信息的字典列表,列表长度为 case_nums
        case_id:样本id列表
        case_information:样本的生理信息表
        case_in_information:样本在信息表中的位置，如 case3:->[0], case30:->[13]
        x1:ppf_vol
        x2:rftn_vol
        x3:pkpd_bis
        X4:bis_history
        X5:RFTN_CP
        x6-x9:body information(age, sex, height, weight)
        """
        # case_information = read_csv(f'/HDD_data/HYK/bis/database/new_{traindata}_clean.csv')
        case_information = read_csv(f'/HDD_data/HYK/bis/database/ni_dataset/info.csv')
        case_id = self.file_name(data=traindata)
        # for i in range(len(case_id)):
        #     case_id[i] = case_id[i].split('.')[0]  # 字符串转数字

        # case_id = list(map(int, case_id))


        case_id.sort()
        print("file_name:", case_id)
        case_in_information = self.information_deal(case_id, traindata)

        data_seq = [0] * case_nums
        data_label = [0] * case_nums

        for i in tqdm(range(case_nums)):
            df = read_csv(f'{self.database_wdir}/{traindata}/{case_id[i]}.csv')
            x_len = int(len(df.BIS) / self.time_step)
            # body信息读取
            age = case_information.age[case_in_information[i]]
            sex = case_information.sex[case_in_information[i]]
            height = case_information.height[case_in_information[i]]
            weight = case_information.weight[case_in_information[i]]
            body = torch.tensor([age, sex, height, weight]).float().reshape(1, 1, 4).repeat(x_len, self.tw, 1)

            # 清除异常值
            modify_RFTN = df.RFTN20_VOL.values
            modify_PPF = df.PPF20_VOL.values
            diff_RFTN = np.diff(modify_RFTN)
            diff_PPF = np.diff(modify_PPF)
            for j in range(len(diff_RFTN)):
                if diff_RFTN[j] < 0:
                    temp = (modify_RFTN[j] + modify_RFTN[j + 2]) / 2
                    df.loc[j + 1, "RFTN20_VOL"] = temp
                if diff_PPF[j] < 0:
                    temp = (modify_PPF[j] + modify_PPF[j + 2]) / 2
                    df.loc[j + 1, "PPF20_VOL"] = temp

            # 为0时刻补上-1800s的零数据
            PPF = list(np.zeros(self.tw * 10))
            PPF.extend(df.PPF20_VOL.values)
            RFTN = list(np.zeros(self.tw * 10))
            RFTN.extend(df.RFTN20_VOL.values)

            ppf_cp = list(np.zeros(self.tw * 10))
            ppf_cp.extend(df.PPF_CP.values)
            rftn_cp = list(np.zeros(self.tw * 10))
            rftn_cp.extend(df.RFTN20_CP.values)

            ppf_ce = df.PPF_CE.values
            rftn_ce = df.RFTN20_CE.values

            pkpd_bis = self.pkpd(ppf_ce, rftn_ce)
            PKPD_bis = list(np.ones(self.tw * 10)*98)
            PKPD_bis.extend(pkpd_bis)

            history_bis = df.BIS.values
            bis = list(np.zeros(self.tw * 10))
            bis.extend(history_bis)

            # 特征制作
            X1 = torch.zeros((x_len, self.tw))
            X2 = torch.zeros((x_len, self.tw))
            X3 = torch.zeros((x_len, self.tw))
            X4 = torch.zeros((x_len, self.tw))
            X5 = torch.zeros((x_len, self.tw))

            for x in range(self.tw*10, len(PPF) - self.time_step, self.time_step):
                # 从补完数据1800s（实际0s）时刻开始取数据段
                PPF_10s, RFTN_10s, BIS_10s, history_10s, RFTN_CP_10s = [], [], [], [], []
                for k in range(self.tw-1, -1, -1):
                    # 第k个10s片段, 共180个
                    PPF_10s.append((PPF[x - k * 10] - PPF[x - (k + 1) * 10]) * 0.1)
                    RFTN_10s.append((RFTN[x - k * 10] - RFTN[x - (k + 1) * 10]) * 0.1)
                    BIS_10s.append((PKPD_bis[x - k * 10]))
                    history_10s.append((bis[x - k * 10]))
                    RFTN_CP_10s.append((rftn_cp[x - k * 10]))


                X1[int((x - self.tw * 10) / self.time_step)] = torch.tensor(PPF_10s)
                X2[int((x - self.tw * 10) / self.time_step)] = torch.tensor(RFTN_10s)
                X3[int((x - self.tw * 10) / self.time_step)] = torch.tensor(BIS_10s)
                X4[int((x - self.tw * 10) / self.time_step)] = torch.tensor(history_10s)
                X5[int((x - self.tw * 10) / self.time_step)] = torch.tensor(RFTN_CP_10s)

            # bis = torch.tensor(df.BIS.values)
            # for k in range(x_len):
            #     if k * self.time_step < self.tw:
            #         X4[k, :] = torch.cat((torch.ones(self.tw - k * self.time_step) * 98, bis[:k * self.time_step]), dim=0)
            #         # X3[k, :] = torch.cat((torch.zeros(self.tw - k * self.time_step), pkpd_bis[:k * self.time_step]), dim=0)
            #         # X5[k, :] = torch.cat((torch.zeros(180 - k * self.time_step), rftn_ce[:k * self.time_step]), dim=0)
            #
            #     else:
            #         X4[k, :] = bis[k * self.time_step - self.tw:k * self.time_step]
            #         # X3[k, :] = pkpd_bis[k * self.time_step - self.tw:k * self.time_step]
            #         # X5[k, :] = rftn_ce[k * self.time_step - 180:k * self.time_step]

            seq = torch.zeros((x_len, self.tw, 5)).float()
            seq[:, :, 0] = X1  # ppf vol
            seq[:, :, 1] = X2  # rftn vol
            # 归一化
            mean = torch.mean(seq, dim=1).reshape((seq.shape[0], 1, seq.shape[2])).repeat(1, self.tw, 1)
            std = torch.std(seq, dim=1).reshape((seq.shape[0], 1, seq.shape[2])).repeat(1, self.tw, 1) + 1e-3
            seq = self.normalizition(x=seq, mu=mean, sigma=std)

            seq[:, :, 2] = X3  # pk-pd bis
            seq[:, :, 3] = X4  # ppf cp
            seq[:, :, 4] = X5  # rftn cp

            out = torch.cat((seq, body), dim=2)
            # out = torch.cat((out, seq[:, :, 2].reshape(seq.shape[0], 180, 1)), dim=2)

            data_seq[i] = out.float()
            label = np.zeros(x_len)
            for j in range(0, x_len, 1):
                label[int(j)] = df.BIS.values[j * self.time_step]

            data_label[i] = torch.tensor(label).float()

        print(f"{traindata}data load finish!", 'case_nums = ', case_nums)
        return data_seq, data_label

    def train_data_loader(self, batch=1, batch_size=1, data="train", shuffle=True):
        train_seq, train_label = self.dataload(case_nums=batch, traindata=data)
        A = train_seq[0]
        B = train_label[0]
        for i in range(1, batch):
            A = torch.cat((A, train_seq[i]), 0)
            B = torch.cat((B, train_label[i]), 0)

        torch.save(A, f"/HDD_data/HYK/bis/database/validdata.pt")
        torch.save(B, f"/HDD_data/HYK/bis/database/validlabel.pt")

        # np.save(A.data.numpy(), "/HDD_data/HYK/bis/database/traindata.npy")
        # np.save(B.data.numpy(), "/HDD_data/HYK/bis/database/trainlabel.npy")

        # train_data = dat.TensorDataset(A, B)
        # train_loader = dat.DataLoader(dataset=train_data,
        #                               batch_size=batch_size,
        #                               drop_last=True,
        #                               num_workers=4,
        #                               pin_memory=True,
        #                               shuffle=shuffle)
        # return train_loader
        return 0

    def test_data_loader(self, batch=1, batch_size=1, data="test"):
        test_seq, test_label = self.dataload(case_nums=batch, traindata=data)
        test_data = list(np.zeros(batch))
        test_loader = list(np.zeros(batch))
        for i in range(batch):
            torch.save(test_seq[i], f"/HDD_data/HYK/bis/database/test_box/testndata{i}.pt")
            torch.save(test_label[i], f"/HDD_data/HYK/bis/database/test_box/testlabel{i}.pt")
            # test_data[i] = dat.TensorDataset(test_seq[i], test_label[i])
            # test_loader[i] = dat.DataLoader(dataset=test_data[i],
            #                                 batch_size=batch_size,
            #                                 drop_last=True,
            #                                 pin_memory=True,
            #                                 num_workers=8)
        return test_loader, test_label

    def information_deal(self, people_list, data="train"):
        """
        :param people_list: 样本的id列表，如[3, 30, 67 ...]
        :return: 样本在information表中的位置
        """
        case_information = list(read_csv(f'/HDD_data/HYK/bis/database/new_{data}_clean.csv').caseid)
        case_location = list(np.zeros(len(people_list)))
        for i in range(len(people_list)):
            case_location[i] = case_information.index(people_list[i])
        return case_location  # clear3，30，36......等csv信息在information文件中的位置

    def time_devide(self, case_nums=1, traindata="test"):
        """
        :param traindata: 测试集或验证集
        :param case_nums:加载的样本数
        :return: istart:开始注射时间 istop: 停止注射时间
        """
        case_id = self.file_name(traindata)

        for i in range(len(case_id)):
            case_id[i] = case_id[i].split('.')[0]  # 字符串转数字

        case_id = list(map(int, case_id))
        case_id.sort()
        print("file_name:", case_id)
        infusion_start, infusion_stop = [0] * case_nums, [0] * case_nums
        for i in tqdm(range(case_nums)):
            df = read_csv(f'/HDD_data/HYK/bis/database/{traindata}/{case_id[i]}.csv')

            x_len = int(len(df.BIS))
            ppf = df.PPF20_VOL.values
            start_flag = True
            stop_flag = True
            for j in range(x_len):
                if ppf[j] > 0 and start_flag:
                    infusion_start[i] = j
                    start_flag = False
                if ppf[-j - 1] != ppf[-j - 2] and stop_flag:
                    infusion_stop[i] = x_len - j + 1
                    stop_flag = False
                if not start_flag and not stop_flag:
                    break

        print(f"{traindata}data load finish!", 'case_nums = ', case_nums)
        return infusion_start, infusion_stop

    def file_name(self, data):
        for root, dirs, files in os.walk(f'{self.database_wdir}/{data}'):
            return files  # 当前路径下所有非目录子文件,列表

    @staticmethod
    def pkpd(Ec1, Ec2):
        ppf_ec50 = 4.47
        rftn_ec50 = 19.3
        gamma = 1.43
        p_gamma = (Ec1/ppf_ec50 + Ec2/rftn_ec50)**gamma
        bis = 98. - 98. * p_gamma / (1 + p_gamma)
        return bis

    @staticmethod
    def normalizition(x, mu, sigma):
        # mu 均值 sigms 标准差
        x = (x - mu) / sigma
        return x

    def ceload(self, case_nums=1, traindata="test"):
        """
        :param case_nums:一次性加载的样本数
        :return: 带有bis,生理特征,用药量信息的字典列表,列表长度为 case_nums
        case_id:样本id列表
        case_information:样本的生理信息表
        case_in_information:样本在信息表中的位置，如 case3:->[0], case30:->[13]
        x1:ppf_vol
        x2:rftn_vol
        x3:pkpd_bis
        X4:RFTN_CP
        x5-x8:body information(age, sex, height, weight)
        """
        case_id = self.file_name(data=traindata)
        for i in range(len(case_id)):
            case_id[i] = case_id[i].split('.')[0]  # 字符串转数字
        case_id = list(map(int, case_id))
        case_id.sort()

        PKPD_bis = []
        for i in tqdm(range(case_nums)):
            df = read_csv(f'{self.database_wdir}/{traindata}/{case_id[i]}.csv')
            x_len = int(len(df.BIS) / self.time_step)

            ppf_ce = df.PPF_CE.values
            rftn_ce = df.RFTN20_CE.values

            pkpd_bis = self.pkpd(ppf_ce, rftn_ce)
            PKPD_bis.append(pkpd_bis)

        return PKPD_bis

    def data_save(self, case_nums=1, traindata="test"):
        case_information = read_csv(f'/HDD_data/HYK/bis/database/before_bodyinformation.csv')
        case_id = self.file_name(data=traindata)
        for i in range(len(case_id)):
            case_id[i] = case_id[i].split('.')[0]  # 字符串转数字
        case_id = list(map(int, case_id))
        case_in_information = self.information_deal(case_id, traindata)
        case_id.sort()
        X = list(range(case_nums))
        for i in tqdm(range(case_nums)):
            df = read_csv(f'{self.database_wdir}/{traindata}/{case_id[i]}.csv')
            age = case_information.age[case_in_information[i]]
            sex = case_information.sex[case_in_information[i]]
            height = case_information.height[case_in_information[i]]
            weight = case_information.weight[case_in_information[i]]

            X[i] = [
                np.median(df.BIS.values),
                df.PPF20_VOL.values[-1]*20/1000,
                df.RFTN20_VOL.values[-1]*20/1000,
                np.median(df.PPF_CE.values),
                np.median(df.RFTN20_CE.values),
                age, sex, height, weight]
        file = {}
        X = np.asarray(X)
        name = ["bis", "ppf_dose", "rftn_dose", "ppf_ce", "rftn_ce", "age", "sex", "height", "weight"]
        for j in range(len(name)):
            file[f"{name[j]}"] = X[:, j]

        import pandas as pd
        df = pd.DataFrame(dict([(k, pd.Series(v)) for k, v in file.items()]))

        df.to_csv(f'/HDD_data/HYK/bis/database/{traindata}.csv')
        return X

    def pt_load(self, dataset, batch_size):
        import gc
        if dataset == "train":
            A = torch.load("/HDD_data/HYK/bis/database/traindata.pt")
            B = torch.load("/HDD_data/HYK/bis/database/trainlabel.pt")
            train_data = dat.TensorDataset(A, B)
            train_loader = dat.DataLoader(dataset=train_data,
                                          batch_size=batch_size,
                                          drop_last=True,
                                          num_workers=4,
                                          pin_memory=True,
                                          shuffle=True)
            print("Training set loading completed")
            del A, B
            gc.collect()
            return train_loader
        elif dataset == "test":
            test_loader = list(np.zeros(76))
            B = list(np.zeros(76))
            for i in tqdm(range(76)):
                A = torch.load(f"/HDD_data/HYK/bis/database/test_box/testndata{i}.pt")
                B[i] = torch.load(f"/HDD_data/HYK/bis/database/test_box/testlabel{i}.pt")
                C = dat.TensorDataset(A, B[i])
                test_loader[i] = dat.DataLoader(
                    dataset=C,
                    batch_size=batch_size,
                    drop_last=True, )

                del A, C
                gc.collect()

            print("Testing set loading completed")
            return test_loader, B
        elif dataset == "valid":
            A = torch.load("/HDD_data/HYK/bis/database/validdata.pt")
            B = torch.load("/HDD_data/HYK/bis/database/validlabel.pt")
            train_data = dat.TensorDataset(A, B)
            valid_loader = dat.DataLoader(dataset=train_data,
                                          batch_size=batch_size,
                                          drop_last=True,
                                          num_workers=4,
                                          pin_memory=True,
                                          shuffle=True)
            print("Validation set loading completed")
            return valid_loader

    def load_all(self, vb, trb, teb):
        vaild_loader = self.pt_load(
            dataset="valid",
            batch_size=vb
        )

        train_loader = self.pt_load(
            dataset="train",
            batch_size=trb,
        )

        test_loader, test_label = self.pt_load(
            dataset="test",
            batch_size=teb,
        )
        return vaild_loader, train_loader, test_loader, test_label


def data_distribution_bar(data, label_error=None):
    """
    :param data: data will be plot in bar
    :return:
    """
    fig = plt.figure(figsize=(24, 16))

    da = plt.Rectangle((24, 0), 38, 50, color="cornsilk")
    ga = plt.Rectangle((32, 0), 14.5, 50, color="paleturquoise")
    # s = plt.Rectangle((60, 0), 30, 50, color="cornsilk")
    w = plt.Rectangle((0, 0), 100, 50, color="pink")

    if not label_error:
        ax = fig.add_subplot(111)
        plt.xlabel('BIS', fontsize=30)
    else:
        ax_error = fig.add_subplot(212)
        ax_error.add_patch(w)
        ax_error.add_patch(da)
        ax_error.add_patch(ga)

        ax_error.bar(list(range(100)), label_error[1], color='forestgreen')
        ax_error.bar(list(range(100)), label_error[0], color='salmon')
        ax_error.legend(['Few-shot region', 'Medium-shot region', 'Many-shot region', 'Baseline', 'Ours'],
                        fontsize=25, loc=1)
        plt.xlim(0, 100)
        plt.ylim(0, 50)
        plt.xticks(fontsize=30)
        plt.yticks(fontsize=30)
        plt.xlabel('BIS', fontsize=30)
        plt.ylabel('Test error', fontsize=30)

        ax = fig.add_subplot(211)

    plt.xlim(0, 100)
    plt.ylim(0, 6.5)
    plt.xticks(fontsize=30)
    plt.yticks(fontsize=30)
    plt.ylabel('Percentage(%)', fontsize=30)

    da = plt.Rectangle((24, 0), 38, 50, color="cornsilk")
    ga = plt.Rectangle((32.5, 0), 14, 50, color="paleturquoise")
    # s = plt.Rectangle((60, 0), 30, 50, color="cornsilk")
    w = plt.Rectangle((0, 0), 100, 50, color="pink")

    ax.add_patch(w)
    ax.add_patch(da)
    ax.add_patch(ga)

    # for i in range(3):
    #     ax.text(15+i*30, 6.65, '%.2f%%' % sum(data[i*30:i*30+30]),
    #             fontsize=30, ha='center', va='bottom')
    ax.text(12, 6.65, '%.2f%%' % sum(data[:24]),
            fontsize=30, ha='center', va='bottom')
    ax.text(28, 6.65, '%.2f%%' % sum(data[24:32]),
            fontsize=30, ha='center', va='bottom')
    ax.text(39.75, 6.65, '%.2f%%' % sum(data[32:46]),
            fontsize=30, ha='center', va='bottom')
    ax.text(54.25, 6.65, '%.2f%%' % sum(data[46:62]),
            fontsize=30, ha='center', va='bottom')
    ax.text(81, 6.65, '%.2f%%' % sum(data[62:]),
            fontsize=30, ha='center', va='bottom')

    ax.bar(list(range(100)), data, color='darkslateblue')
    ax.legend(['Few-shot region', 'Medium-shot region', 'Many-shot region', 'Label Percentage'],
              fontsize=25, loc=1)

    plt.savefig('/HDD_data/HYK/bis/output/test error.jpg')
    plt.show()


def error_down(e):
    e1 = np.asarray(e[0][:98])
    e2 = np.asarray(e[1][:98])
    return (e1-e2)/e2


if __name__ == "__main__":
    data = 1
    # data_distribution_bar(data, label_error=None)


# model

## baseline

### <font color='coral'>class</font> baseline

In [4]:
import numpy as np
import torch
import torch.nn as nn


class LstmModel(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.input_dim = config.input_dim
        self.memory_cell = config.memory_cell
        self.body_dim = config.body_dim
        self.n = config.n
        self.lstm1 = nn.LSTM(1, self.memory_cell, batch_first=True)
        self.lstm2 = nn.LSTM(1, self.memory_cell, batch_first=True)

        self.fc = nn.Sequential(
            nn.Linear(self.memory_cell*2+self.body_dim, config.n),
            nn.ReLU(),
            nn.Linear(config.n, 1),
        )

    def forward(self, x, b):
        #  self.lstm(input_seq.reshape(seq_len序列长度, batch批量大小, input_size特征维度), (h0, c0))
        x1, (hn, cn) = self.lstm1(x[..., 0].unsqueeze(-1))
        x2, (hn, cn) = self.lstm2(x[..., 1].unsqueeze(-1))
        x1 = x1[:, -1]
        x2 = x2[:, -1]
        x = torch.cat((x1, x2), dim=1)
        x = torch.cat((x, b), dim=1)
        x = self.fc(x)

        return x


### <font color='coral'>class </font>params

In [5]:
import argparse


class Params:
    def __init__(self):
        self.x = 1

    @staticmethod
    def lstm_params():
        parser = argparse.ArgumentParser()
        parser.add_argument('--input_dim',     default=2,              type=int)
        parser.add_argument('--memory_cell',     default=8,              type=int)
        parser.add_argument('--body_dim',   default=4,              type=int)
        parser.add_argument('--n',         default=16,              type=int)

        args = parser.parse_args()
        return args

    @staticmethod
    def trainparam():
        parser = argparse.ArgumentParser()
        parser.add_argument('--model_name', default="baseline", type=str)
        parser.add_argument('--tw',           default=180,   type=int)
        parser.add_argument('--train_batch',  default=100,   type=int)
        parser.add_argument('--vaild_batch',  default=30,    type=int)
        parser.add_argument('--test_batch',   default=76,    type=int)
        parser.add_argument('--batch_size',   default=64,    type=int)
        parser.add_argument('--train_epoch',  default=50,    type=int)
        parser.add_argument('--lr',           default=3e-4,  type=float)
        parser.add_argument('--pre_train',    default=False,  type=bool)
        parser.add_argument('--pre_tr_times', default=0,     type=int)
        parser.add_argument('--device',       default=3,     type=int)
        parser.add_argument('--best_loss',    default=80000, type=int)

        args = parser.parse_args()

        # 预训练文件路径
        pre_file = f'/home/user02/HYK/bis_transformer/output/{args.model_name}/epoch{args.pre_tr_times}.pth'
        model_file = f'/home/user02/HYK/bis_transformer/output/{args.model_name}/model/epoch{args.pre_tr_times}.pth'
        best_file = f'/home/user02/HYK/bis_transformer/output/{args.model_name}/model/best_epoch.pth'
        # 保存文件路径
        save_file = f'/home/user02/HYK/bis_transformer/output/{args.model_name}/epoch{args.pre_tr_times}.pth'

        parser.add_argument('--pre_file', default=pre_file, type=str)
        parser.add_argument('--model_file', default=model_file, type=str)
        parser.add_argument('--best_file', default=best_file, type=str)
        parser.add_argument('--save_file', default=save_file, type=str)
        args = parser.parse_args()

        return args

### <font color='coral'> class</font> params_ni</font>

In [6]:
import argparse


class Params_ni:
    def __init__(self):
        self.x = 1

    @staticmethod
    def lstm_params():
        parser = argparse.ArgumentParser()
        parser.add_argument('--input_dim',     default=2,              type=int)
        parser.add_argument('--memory_cell',     default=8,              type=int)
        parser.add_argument('--body_dim',   default=4,              type=int)
        parser.add_argument('--n',         default=16,              type=int)

        args = parser.parse_args()
        return args

    @staticmethod
    def trainparam():
        parser = argparse.ArgumentParser()
        parser.add_argument('--model_name', default="baseline", type=str)
        parser.add_argument('--tw',           default=180,   type=int)
        parser.add_argument('--train_batch',  default=100,   type=int)
        parser.add_argument('--vaild_batch',  default=30,    type=int)
        parser.add_argument('--test_batch',   default=76,    type=int)
        parser.add_argument('--batch_size',   default=64,    type=int)
        parser.add_argument('--train_epoch',  default=50,    type=int)
        parser.add_argument('--lr',           default=3e-4,  type=float)
        parser.add_argument('--pre_train',    default=True,  type=bool)
        parser.add_argument('--pre_tr_times', default=0,     type=int)
        parser.add_argument('--device',       default=1,     type=int)
        parser.add_argument('--best_loss',    default=80000, type=int)

        args = parser.parse_args()
        root = '/data/HYK/DATASET/bis/output'
        # 预训练文件路径
        pre_file = f'{root}/{args.model_name}/epoch{args.pre_tr_times}.pth'
        model_file = f'{root}/{args.model_name}/model/epoch{args.pre_tr_times}.pth'
        best_file = f'{root}/{args.model_name}/model/best_epoch.pth'
        # 保存文件路径
        save_file = f'{root}/{args.model_name}/epoch{args.pre_tr_times}.pth'

        parser.add_argument('--pre_file', default=pre_file, type=str)
        parser.add_argument('--model_file', default=model_file, type=str)
        parser.add_argument('--best_file', default=best_file, type=str)
        parser.add_argument('--save_file', default=save_file, type=str)
        args = parser.parse_args()

        return args

### <font color='coral'>class </font>class trainer

In [7]:
import numpy as np
import torch
import torch.nn as nn
from tqdm import tqdm
from tensorboardX import SummaryWriter
#from model.baseline import baseline
#from model.baseline import params
import imp



class Trainer:
    def __init__(self, config):
        self.model_name = config.model_name
        self.device = config.device
        self.epoch = config.train_epoch
        self.pre_train = config.pre_train
        self.pre_tr_times = config.pre_tr_times
        self.save_pth = f"/data/HYK/DATASET/bis/output/{config.model_name}"


        args = params.Params.lstm_params()

        self.loss_function = nn.MSELoss()
        self.model = baseline.LstmModel(config=args).cuda()

        # 参数初始化
        # self.model.apply(weights_init)

    def train(self, X, X2, lr, model_file, best_loss):
        print("train begin")

        model = self.model.train()

        if self.pre_train:
            model.load_state_dict(torch.load(model_file))
            best_loss = np.loadtxt(f"{self.save_pth}/loss.txt")[0]
            print(best_loss)
            print(self.pre_tr_times)

        optimizer = torch.optim.Adam(model.parameters(), lr=lr)

        for i in range(1, self.epoch + 1):
            loss = 0

            for seq, labels in tqdm(X):
                optimizer.zero_grad()

                labels = labels.cuda()
                seq = seq.cuda()
                x1 = seq[:, :, :2]
                # x1:(batchsize, 180, 2)
                x2 = seq[:, 0, 5:]
                rnn_out = model.forward(x1, x2)
                batchloss = self.loss_function(rnn_out, labels.unsqueeze(-1))
                # batchloss = sum(self.loss_function(rnn_out, labels.unsqueeze(-1)))
                batchloss.backward()

                optimizer.step()

                loss += batchloss.detach().item()

            vaild_loss = self.vaild_full(X=X2, model=model)
            model = model.train()

            if vaild_loss < best_loss:
                print("new")
                best_loss = vaild_loss
                np.savetxt(f"{self.save_pth}/loss.txt", np.asarray([vaild_loss, vaild_loss]))
                torch.save(model.state_dict(), f'{self.save_pth}/model/best_epoch.pth')
            torch.save(model.state_dict(), f'{self.save_pth}/model/epoch{i + self.pre_tr_times}.pth')
            print(f"{i} train loss: {loss}")
            print(f"eval loss: {vaild_loss}")

        return

    def vaild_full(self, X, model):
        model = model.eval()

        loss = 0
        for seq, labels in tqdm(X):
            seq = seq.cuda()
            labels = labels.cuda()
            x1 = seq[:, :, :2]
            x2 = seq[:, 0, 5:]

            with torch.no_grad():

                rnn_out = model.forward(x1, x2)
                batchloss = self.loss_function(rnn_out, labels.unsqueeze(-1))
                # batchloss = sum(self.loss_function(rnn_out, labels.unsqueeze(-1)))
                loss += batchloss.detach().item()

        return loss

    def test(self, X, epoch_pth, test_batch):
        print("test begin")
        test_output = []
        for _ in range(len(X)):
            test_output.append([])

        model2 = self.model.eval()
        model2.load_state_dict(torch.load(f'{epoch_pth}', map_location='cuda:0'))

        for j in tqdm(range(test_batch)):
            for seq, labels in X[j]:
                seq = seq.cuda()
                x1 = seq[:, :, :2]
                x2 = seq[:, 0, 5:]
                with torch.no_grad():
                    y_pred = model2(x1, x2)
                    # y_pred = y_pred.view(y_pred.shape[0])
                    test_output[j].extend(y_pred.squeeze(-1).tolist())

        return test_output


def weights_init(m):
    if isinstance(m, nn.Linear):
        nn.init.kaiming_normal_(m.weight, mode='fan_in', nonlinearity='relu')
        nn.init.constant_(m.bias, 0)
    # 也可以判断是否为conv2d，使用相应的初始化方式
    elif isinstance(m, nn.Conv2d):
        nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
    # 是否为批归一化层
    elif isinstance(m, nn.BatchNorm2d):
        nn.init.constant_(m.weight, 1)
        nn.init.constant_(m.bias, 0)

# mainer

##<font color='coral'>class </font>evaluate

In [8]:
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
import numpy as np


class Evalulate:
    def __init__(self, x, y, istart, istop, case_num):
        # x:预测结果 y:label len:样本长度
        self.len = case_num
        self.case_num = case_num
        self.x = x[:self.len]
        self.y = y[:self.len]
        self.MSE = nn.MSELoss()
        self.MAE = nn.L1Loss()
        self.istrat = istart
        self.istop = istop

        for i in range(case_num):
            self.x[i] = torch.tensor(self.x[i])
            self.y[i] = torch.tensor(self.y[i])
            if len(x[i]) < len(y[i]):
                self.y[i] = self.y[i][:len(x[i])]
            else:
                self.x[i] = self.x[i][:len(y[i])]

    def rateplot(self, i):
        r = self.x[i] / self.y[i]
        plt.plot(r)

    def ratelist(self):
        r = [0] * self.len
        for i in range(self.len):
            r[i] = self.x[i] / self.y[i]
            plt.plot(r[i])
        plt.show()

    def loss(self, period=0):
        # 0, 1, 2, 3 全阶段,引导期，维持期，复苏期
        t1, t2 = 0, 0

        PE = [0] * self.len
        MSE = [0] * self.len
        MAE = [0] * self.len

        MDPE, MDAPE, RMSE = [0] * self.len, [0] * self.len, [0] * self.len
        for i in range(self.len):
            if period == 0:
                t1 = self.istrat[i]
                t2 = -1
            elif period == 1:
                t1 = self.istrat[i]
                t2 = self.istrat[i] + 600
            elif period == 2:
                t1 = self.istrat[i] + 600
                t2 = self.istop[i]
            elif period == 3:
                t1 = self.istop[i]
                t2 = -1
            PE[i] = ((self.x[i][t1:t2] - self.y[i][t1:t2]) / self.x[i][t1:t2])
            MSE[i] = self.MSE(self.x[i][t1:t2].unsqueeze(-1), self.y[i][t1:t2].unsqueeze(-1))
            MAE[i] = self.MAE(self.x[i][t1:t2].unsqueeze(-1), self.y[i][t1:t2].unsqueeze(-1))
            MDPE[i], MDAPE[i], RMSE[i] = self.estimate(PE=PE[i], MSE=MSE[i])

        out = {"MDPE": MDPE,
               "MDAPE": MDAPE,
               "RMSE": RMSE,
               "MAE": MAE,
               "meanMDPE": np.mean(MDPE),
               "meanMDAPE": np.mean(MDAPE),
               "meanRMSE": np.mean(RMSE),
               "meanMAE": np.mean(MAE),
               "SD": [np.std(MDPE), np.std(MDAPE), np.std(RMSE), np.std(MAE)],
               }
        return out

    @staticmethod
    def estimate(PE, MSE):
        """
        :param PE: 每个样本的bis误差（预测bis-真实bis），输入格式：list([样本误差])
        :param MSE: 每个样本的loss， 输入格式：list([样本loss])
        :return: MDPE:误差中位数， MDAPE:绝对误差中位数， RMSE:均方差
        """
        MDPE = np.median(PE) * 100
        MDAPE = np.median(np.abs(PE)) * 100
        RMSE = np.sqrt(MSE)
        return MDPE, MDAPE, RMSE

    def test_error(self, label_num):
        t_error = []
        out, label = [], []
        for i in range(self.case_num):
            out.extend(self.x[i])
            label.extend(self.y[i])

        for i in range(len(t_error)):
            out[i] = int(out[i])
            label[i] = int(label[i])
        out = np.asarray(out)
        label = np.asarray(label)

        index = np.argsort(label)
        label = label[index]
        out = out[index]
        # t_error = out - label

        """
            label,out: 排好序的向量
            t_error:每个样本点（80万个）的误差
            label_error:从0到100，label的误差均值
        """
        for i in range(len(label)):
            label[i] = int(label[i])
        j = 0
        label_error = list(np.zeros(100))

        for i in range(100):
            label_error[i] = []
            while label[j] == i:
                # label_num[i] += 1
                label_error[i].append(out[j]-label[j])
                j += 1
                if j == len(label)-10:
                    break
            label_error[i] = np.abs(np.mean(label_error[i]))

        """
            误差图
        """
        plt.autoscale(axis='x', tight=True)
        plt.bar(list(range(100)), label_error)
        plt.xlabel("bis index")
        plt.ylabel("label nums")
        plt.show()

        """
            相关性计算
        """
        a = np.asarray(label_num)
        b = np.asarray(label_error)
        for i in range(100):
            if b[i] < 0:
                b[i] = -b[i]
        plt.subplot(2, 1, 1)
        plt.title(f"Pearson correlation: -{np.corrcoef(a, b)[0, 1]}")
        plt.bar(range(100), a, color='lightskyblue')
        plt.ylabel("sample nums")
        plt.subplot(2, 1, 2)
        plt.bar(range(100), b, color='lightcoral')
        plt.xlabel("label space")
        plt.ylabel("test error")
        plt.show()
        """
            卷积
        """
        from scipy.ndimage import convolve1d
        p = a/len(label)
        lds_kernel_window = self.get_lds_kernel_window(kernel='gaussian', ks=10, sigma=8)

        eff_label_dist = convolve1d(p, weights=lds_kernel_window, mode='constant')
        cor = np.corrcoef(eff_label_dist, b)[0, 1]
        plt.bar(range(100), eff_label_dist, color='lightcoral')
        plt.show()
        return t_error, label_error

    @staticmethod
    def get_lds_kernel_window(kernel, ks, sigma):
        from scipy.ndimage import gaussian_filter1d
        from scipy.signal.windows import triang
        assert kernel in ['gaussian', 'triang', 'laplace']
        half_ks = (ks - 1) // 2
        if kernel == 'gaussian':
            base_kernel = [0.] * half_ks + [1.] + [0.] * half_ks
            kernel_window = gaussian_filter1d(base_kernel, sigma=sigma) / max(gaussian_filter1d(base_kernel, sigma=sigma))
        elif kernel == 'triang':
            kernel_window = triang(ks)
        else:
            laplace = lambda x: np.exp(-abs(x) / sigma) / (2. * sigma)
            kernel_window = list(map(laplace, np.arange(-half_ks, half_ks + 1))) / max(map(laplace, np.arange(-half_ks, half_ks + 1)))

        return kernel_window



"""if __name__ == "__main__":
    x1 = [torch.ones(3000)*10, torch.ones(3000)*110, torch.ones(3000)*56]
    y1 = [torch.ones(3001), torch.ones(3001), torch.ones(3001)]
    e = Evalulate(x1, y1)
    MDPE, MDAPE, RMSE = e.loss()
    e.ratelist()"""

'if __name__ == "__main__":\n    x1 = [torch.ones(3000)*10, torch.ones(3000)*110, torch.ones(3000)*56]\n    y1 = [torch.ones(3001), torch.ones(3001), torch.ones(3001)]\n    e = Evalulate(x1, y1)\n    MDPE, MDAPE, RMSE = e.loss()\n    e.ratelist()'

##<font color='coral'>class </font> main_baseline

In [9]:
import torch
import matplotlib.pyplot as plt
import tqdm
#from loader import database
#import evaluate
import numpy as np
import random
#from model.baseline import trainer, params
import imp


def setup_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.deterministic = True


# 设置随机数种子
setup_seed(2)
# 训练或测试模式
mode = 'test'
# 训练参数读取
args = Params.trainparam()


if __name__ == "__main__":
    with torch.cuda.device(args.device):
        args.device = torch.device(f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu')
        if torch.cuda.is_available():
            print(f"GPU{args.device} open")
        else:
            print("cpu open")

        box = trainer.Trainer(args)
        d_box = database.Dataloader(
            database_wdir="/HDD_data/HYK/bis/database",
            time_step=1,
            nums=1,
            tw=180
        )
        # 开始训练或测试
        if mode == "train":

            test_loader, test_label = d_box.test_data_loader(
                data="test",
                batch=76,
                batch_size=128
            )

            d_box.time_step = 10
            vaild_loader = d_box.train_data_loader(
                data="test",
                batch=30,
                batch_size=512
            )

            train_loader = d_box.train_data_loader(
                batch=100,
                batch_size=1024,
            )

            box.train(
                X=train_loader,
                X2=vaild_loader,
                lr=args.lr,
                model_file=args.best_file,
                best_loss=args.best_loss
            )

            test_out = box.test(
                X=test_loader,
                epoch_pth=args.best_file,
                test_batch=76)

            ist, isp = d_box.time_devide(case_nums=76, traindata="test")
            access = evaluate.Evalulate(test_label, test_out, ist, isp, case_num=76)
            print("MDPE    MDAPE    RMSE\r")
            for i in range(4):
                print("%.2f     %.2f     %.2f" % access.loss(i))

        elif mode == "test":
            test_loader, test_label = d_box.test_data_loader(
                batch=args.test_batch,
                batch_size=76
            )

            pre_tr_times = 9
            pre_file = f'/home/user02/HYK/bis_transformer/output/baseline/model/epoch{pre_tr_times}.pth'
            test_out = box.test(
                X=test_loader,
                epoch_pth=args.best_file,
                test_batch=76)

            import statsmodels.api as sm
            lowess = sm.nonparametric.lowess
            new = list(range(76))
            for i in tqdm.tqdm(range(76)):
                axis = list(range(len(test_out[i])))
                new[i] = lowess(test_out[i], axis, frac=0.03)[:, 1]
            ist, isp = d_box.time_devide(case_nums=76, traindata="test")
            access = evaluate.Evalulate(test_label, new, ist, isp, case_num=76)
            print("MDPE    MDAPE    RMSE\r")
            for i in range(4):
                print("%.2f     %.2f     %.2f" % access.loss(i))


            plt.grid(True)
            plt.autoscale(axis='x', tight=True)
            for i in range(4,9):
                plt.figure()
                plt.plot(test_label[i])
                plt.plot(test_out[i])
                plt.show()

usage: colab_kernel_launcher.py [-h] [--model_name MODEL_NAME] [--tw TW]
                                [--train_batch TRAIN_BATCH] [--vaild_batch VAILD_BATCH]
                                [--test_batch TEST_BATCH] [--batch_size BATCH_SIZE]
                                [--train_epoch TRAIN_EPOCH] [--lr LR] [--pre_train PRE_TRAIN]
                                [--pre_tr_times PRE_TR_TIMES] [--device DEVICE]
                                [--best_loss BEST_LOSS]
colab_kernel_launcher.py: error: unrecognized arguments: -f /root/.local/share/jupyter/runtime/kernel-82714648-aaaa-4ada-81ff-a04dda4c4052.json


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
