# Task06. 自定义数据集下游任务的两种分析方法

在本节中，我们将基于 **Task05 插补后的数据集** 进行下游任务分析，主要包括：

1. 基于 XGBoost 的基线分类
2. 基于 PyPOTS SAITS 模型的时序分类

## 1. 基于插补数据的基线分类分析

### 1.1 加载插补后的数据集

首先，从之前保存的插补结果文件中加载训练集、验证集和测试集：

In [None]:
import pandas as pd

# Reload the three imputed splits (train / validation / test) saved earlier.
df_train_imputed, df_val_imputed, df_test_imputed = map(
    pd.read_csv,
    ('train_imputed.csv', 'val_imputed.csv', 'test_imputed.csv'),
)

### 1.2 提取特征和标签

In [None]:
# Split each imputed table into a feature matrix (every column except the
# identifier, timestamp and target) and the target column itself.
X_train_2d = df_train_imputed.drop(columns=['sample_id', 'timestamp', 'label'])
y_train = df_train_imputed.loc[:, 'label']

X_val_2d = df_val_imputed.drop(columns=['sample_id', 'timestamp', 'label'])
y_val = df_val_imputed.loc[:, 'label']

X_test_2d = df_test_imputed.drop(columns=['sample_id', 'timestamp', 'label'])
y_test = df_test_imputed.loc[:, 'label']

# Report the shapes of all three splits as a sanity check.
print(
    f"Train: {X_train_2d.shape}, {y_train.shape}",
    f"Val: {X_val_2d.shape}, {y_val.shape}",
    f"Test: {X_test_2d.shape}, {y_test.shape}",
    sep="\n",
)

Train: (4908, 20), (4908,)
Val: (614, 20), (614,)
Test: (614, 20), (614,)


In [2]:
# Preview the first rows of the training feature matrix.
X_train_2d.head()

Unnamed: 0,apacheadmissiondx,ethnicity,gender,GCS Total,Eyes,Motor,Verbal,admissionheight,admissionweight,age,Heart Rate,MAP (mmHg),Invasive BP Diastolic,Invasive BP Systolic,O2 Saturation,Respiratory Rate,Temperature (C),glucose,FiO2,pH
0,-0.676732,0.3022,0.918308,0.795591,0.674137,0.524492,0.771686,1.165612,0.928327,0.907871,0.538832,0.318641,-0.023323,0.19702,-0.199657,1.652006,-0.366409,-0.071124,-0.048971,0.201896
1,-0.516926,0.3022,-1.08896,0.365205,0.557349,0.474666,0.538797,-1.677569,-0.180996,1.332242,-0.89154,-0.412638,-0.616842,-0.466705,-0.407183,-0.905164,-0.806545,-0.528615,-0.226574,0.09355
2,-0.490291,0.3022,-1.08896,0.748909,0.690633,0.461668,0.76076,-0.16621,-1.057595,0.59258,1.386459,-0.678885,-1.012521,-0.289712,0.09749,0.85289,0.024736,-0.034354,-0.084154,0.227154
3,-0.730001,0.3022,-1.08896,0.557269,0.479801,0.309801,0.727504,-1.462887,-0.095642,0.59258,0.190069,0.465096,0.312368,0.427536,0.09749,3.569884,-0.524262,3.389644,0.225751,0.242584
4,-0.78327,0.3022,-1.08896,0.723559,0.775377,0.465457,0.695992,-1.248206,-0.090857,0.693584,0.750738,-0.70226,-0.682788,-0.732195,-0.793952,1.172537,0.051975,-0.165092,-0.009692,0.099024


### 1.3 基于 XGBoost 进行分类建模

In [3]:
import xgboost as xgb
from sklearn.metrics import accuracy_score, roc_auc_score

# Baseline model: an XGBoost classifier for the binary target.
model = xgb.XGBClassifier(
    objective='binary:logistic',  # binary classification (mortality: 0 or 1)
    eval_metric='logloss',        # metric monitored on the evaluation set
    n_estimators=500,
    early_stopping_rounds=10,
)

In [4]:
# Fit on the training split; the validation split is passed as the evaluation
# set and the per-round metric is printed (verbose=True).
model.fit(X_train_2d, y_train, eval_set=[(X_val_2d, y_val)], verbose=True)

[0]	validation_0-logloss:0.30800
[1]	validation_0-logloss:0.29491
[2]	validation_0-logloss:0.28646
[3]	validation_0-logloss:0.28469
[4]	validation_0-logloss:0.27681
[5]	validation_0-logloss:0.27418
[6]	validation_0-logloss:0.26841
[7]	validation_0-logloss:0.26555
[8]	validation_0-logloss:0.26772
[9]	validation_0-logloss:0.26706
[10]	validation_0-logloss:0.26751
[11]	validation_0-logloss:0.26650
[12]	validation_0-logloss:0.26636
[13]	validation_0-logloss:0.26435
[14]	validation_0-logloss:0.26369
[15]	validation_0-logloss:0.26515
[16]	validation_0-logloss:0.26655
[17]	validation_0-logloss:0.26542
[18]	validation_0-logloss:0.26486
[19]	validation_0-logloss:0.26336
[20]	validation_0-logloss:0.26200
[21]	validation_0-logloss:0.26024
[22]	validation_0-logloss:0.26033
[23]	validation_0-logloss:0.26241
[24]	validation_0-logloss:0.26131
[25]	validation_0-logloss:0.26086
[26]	validation_0-logloss:0.25906
[27]	validation_0-logloss:0.25879
[28]	validation_0-logloss:0.25807
[29]	validation_0-loglos

In [5]:
# Score the held-out test split.
y_pred_proba = model.predict_proba(X_test_2d)[:, 1]  # second probability column, used for AUC
y_pred = model.predict(X_test_2d)                    # hard class predictions

# Accuracy comes from the hard predictions, AUC from the probabilities.
auc = roc_auc_score(y_test, y_pred_proba)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}", f"AUC: {auc:.4f}", sep="\n")

Accuracy: 0.8990
AUC: 0.8344


## 2. 使用 PyPOTS SAITS 对自定义数据集进行端到端的时序建模与分类分析

### 2.1 构建 PyPOTS 数据集

In [1]:
import pypots
import numpy as np
import pandas as pd
import tsdb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from benchpots.utils.logging import logger, print_final_dataset_info
from benchpots.utils.missingness import create_missingness # 生成人工缺失值

# Select the device the model will run on: use the GPU when CUDA is available,
# otherwise fall back to the CPU so the notebook also runs on machines without a GPU.
import torch

DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
# Load the dataset from CSV.
df = pd.read_csv('synthetic_eicu.csv')

# Number of rows (time steps) each sample is truncated or padded to by pad_truncate.
max_length = 48

def pad_truncate(df, max_len=None, id_cols=('sample_id', 'label')):
    """Bring one sample's rows to a fixed length by truncating or NaN-padding.

    Args:
        df: DataFrame holding the rows of a single sample.
        max_len: Target number of rows. When None, the module-level
            ``max_length`` is used.
        id_cols: Columns that are copied from the sample's first row into the
            padding rows instead of being left as NaN. They are assumed to be
            constant within one sample; columns not present in ``df`` are
            skipped.

    Returns:
        A DataFrame with exactly the target number of rows: the first rows of
        ``df`` if it is long enough, otherwise ``df`` followed by padding rows
        whose non-id columns are NaN.
    """
    target_len = max_length if max_len is None else max_len
    n_rows = len(df)

    if n_rows >= target_len:
        # Long enough: keep the first target_len rows.
        # (Keeping the last rows would be an alternative strategy.)
        return df.iloc[:target_len]

    # Reindexing an empty slice of df creates all-missing rows while keeping
    # numeric columns numeric; a bare DataFrame(index=..., columns=...) would
    # be object-typed and turn the whole concatenated result into objects.
    padding = df.iloc[:0].reindex(range(target_len - n_rows))

    # Padding rows must still carry the sample's identity, otherwise they can
    # no longer be assigned to a sample when the data is split by sample id.
    if n_rows > 0:
        for col in id_cols:
            if col in padding.columns:
                padding[col] = df[col].iloc[0]

    return pd.concat([df, padding])

# Bring every sample to the same length. Iterating over the groups (instead of
# GroupBy.apply, which is deprecated when it operates on the grouping column)
# keeps 'sample_id' in each group on every pandas version. ignore_index=True
# gives the combined frame a fresh 0..n-1 index.
new_df = pd.concat(
    [pad_truncate(group) for _, group in df.groupby('sample_id')],
    ignore_index=True,
)

unique_sample_ids = new_df['sample_id'].unique()

# Split by sample id (80% train, then the remainder halved into validation and
# test) so that all rows of one sample end up in the same split.
train_ids, temp_ids = train_test_split(unique_sample_ids, test_size=0.2, random_state=42)
val_ids, test_ids = train_test_split(temp_ids, test_size=0.5, random_state=42)

train_df = new_df[new_df['sample_id'].isin(train_ids)]
val_df = new_df[new_df['sample_id'].isin(val_ids)]
test_df = new_df[new_df['sample_id'].isin(test_ids)]

print(f"Train DataFrame shape: {train_df.shape}")
print(f"Validation DataFrame shape: {val_df.shape}")
print(f"Test DataFrame shape: {test_df.shape}")

def separate_features_labels(df, feature_cols, label_col='label', n_steps=48):
    """Turn a long-format table into a 3-D feature array plus one label per sample.

    Args:
        df: DataFrame with a 'sample_id' column in which each sample occupies
            ``n_steps`` consecutive rows.
        feature_cols: Names of the feature columns to extract.
        label_col: Name of the label column.
        n_steps: Number of rows (time steps) per sample. Defaults to 48, the
            value this function previously hard-coded.

    Returns:
        A tuple ``(X, y)``: ``X`` has shape (n_samples, n_steps, n_features)
        and ``y`` holds the first label of each sample, in the order the
        samples appear in ``df``.

    Raises:
        ValueError: If the rows cannot be cut into blocks of ``n_steps``
            consecutive rows that each belong to exactly one distinct sample.
    """
    sample_ids = df['sample_id'].to_numpy()
    if n_steps <= 0 or len(sample_ids) % n_steps != 0:
        raise ValueError(
            f"Number of rows ({len(sample_ids)}) is not a multiple of n_steps ({n_steps})."
        )

    # The reshape below silently mixes samples unless every block of n_steps
    # rows belongs to a single sample, so verify the layout first.
    id_grid = sample_ids.reshape(-1, n_steps)
    unique_ids = df['sample_id'].unique()
    if (id_grid != id_grid[:, :1]).any() or len(unique_ids) != len(id_grid):
        raise ValueError(
            f"Every sample must occupy exactly {n_steps} consecutive rows."
        )

    X = df[feature_cols].values.reshape(-1, n_steps, len(feature_cols))
    # First label of each sample, reordered to match the sample order of X.
    y = df.groupby('sample_id')[label_col].first().loc[unique_ids].values
    return X, y

# Every column except the identifier, target and timestamp is a model feature.
feature_columns = [col for col in df.columns if col not in {'sample_id', 'label', 'timestamp'}]

# Convert each split into a feature array and a label vector.
(train_X, train_y), (val_X, val_y), (test_X, test_y) = (
    separate_features_labels(split_df.copy(), feature_columns)
    for split_df in (train_df, val_df, test_df)
)

print(f"Train features shape: {train_X.shape}, Train labels shape: {train_y.shape}")
print(f"Validation features shape: {val_X.shape}, Validation labels shape: {val_y.shape}")
print(f"Test features shape: {test_X.shape}, Test labels shape: {test_y.shape}")

# Standardize the features. The scaler is fitted on the training split only;
# each split is flattened to 2-D for scaling and then restored to its 3-D shape.
scaler = StandardScaler().fit(train_X.reshape(-1, train_X.shape[-1]))
train_X, val_X, test_X = (
    scaler.transform(split_X.reshape(-1, split_X.shape[-1])).reshape(split_X.shape)
    for split_X in (train_X, val_X, test_X)
)

# General information about the dataset, taken from the training split.
processed_dataset = {
    "n_classes": len(np.unique(train_y)),
    "n_steps": train_X.shape[-2],
    "n_features": train_X.shape[-1],
    "scaler": scaler,
}

rate = 0.1  # missing rate: 10%

# Introduce artificial missing values into each split using the 'point' pattern.
train_X = create_missingness(train_X, rate, 'point')
val_X = create_missingness(val_X, rate, 'point')
test_X = create_missingness(test_X, rate, 'point')

# Store the masked features together with the labels of every split.
processed_dataset.update({
    "train_X": train_X,
    "train_y": train_y,
    "val_X": val_X,
    "val_y": val_y,
    "test_X": test_X,
    "test_y": test_y,
})

# Assemble the dictionaries handed to the model for training, validation and testing.
dataset_for_training = {"X": train_X, "y": train_y}
dataset_for_validating = {"X": val_X, "y": val_y}
dataset_for_testing = {"X": test_X, "y": test_y}

  from .autonotebook import tqdm as notebook_tqdm


[34m
████████╗██╗███╗   ███╗███████╗    ███████╗███████╗██████╗ ██╗███████╗███████╗    █████╗ ██╗
╚══██╔══╝██║████╗ ████║██╔════╝    ██╔════╝██╔════╝██╔══██╗██║██╔════╝██╔════╝   ██╔══██╗██║
   ██║   ██║██╔████╔██║█████╗█████╗███████╗█████╗  ██████╔╝██║█████╗  ███████╗   ███████║██║
   ██║   ██║██║╚██╔╝██║██╔══╝╚════╝╚════██║██╔══╝  ██╔══██╗██║██╔══╝  ╚════██║   ██╔══██║██║
   ██║   ██║██║ ╚═╝ ██║███████╗    ███████║███████╗██║  ██║██║███████╗███████║██╗██║  ██║██║
   ╚═╝   ╚═╝╚═╝     ╚═╝╚══════╝    ╚══════╝╚══════╝╚═╝  ╚═╝╚═╝╚══════╝╚══════╝╚═╝╚═╝  ╚═╝╚═╝
ai4ts v0.0.3 - building AI for unified time-series analysis, https://time-series.ai [0m



  new_df = df.groupby('sample_id').apply(pad_truncate).reset_index(drop=True)


Train DataFrame shape: (235584, 23)
Validation DataFrame shape: (29472, 23)
Test DataFrame shape: (29472, 23)
Train features shape: (4908, 48, 20), Train labels shape: (4908,)
Validation features shape: (614, 48, 20), Validation labels shape: (614,)
Test features shape: (614, 48, 20), Test labels shape: (614,)


### 2.2 SAITS建模分析

In [3]:
from pypots.nn.functional import calc_mae
from pypots.optim import Adam
from pypots.classification import SAITS

# Build the SAITS classification model.
# Its hyperparameters can be adjusted as needed.
saits = SAITS(
    n_steps=processed_dataset['n_steps'],
    n_features=processed_dataset['n_features'],
    n_classes=processed_dataset['n_classes'],
    n_layers=1,
    d_model=256,
    d_ffn=128,
    n_heads=4,
    d_k=64,
    d_v=64,
    dropout=0.1,
    batch_size=32,
    # epochs=10 keeps this demo quick; set it to 100 or more for better performance.
    epochs=10,
    # patience=3 stops training early if the validation result doesn't improve for 3 epochs.
    # Leave it at the default None to disable early stopping.
    patience=3,
    # The optimizer. Unlike torch.optim.Optimizer, you don't have to pass the model's parameters when
    # initializing pypots.optim.Optimizer. You can also leave it to default, which initializes an Adam optimizer with lr=0.001.
    optimizer=Adam(lr=1e-3),
    # num_workers is forwarded to torch.utils.data.DataLoader: the number of subprocesses used for data loading.
    # The default 0 means data is loaded in the main process, i.e. there are no subprocesses.
    # Increase it above 1 if data loading is a bottleneck for training speed.
    num_workers=0,
    # Device to run on; here the DEVICE constant defined earlier in the notebook is passed.
    # Leaving it as None lets PyPOTS pick the best device automatically. Use 'cpu' if you have no CUDA device,
    # 'cuda:0' or 'cuda:1' to choose among several, or ['cuda:0', 'cuda:1'] to run in parallel.
    device=DEVICE,
    # Path where tensorboard files and trained model files are saved.
    saving_path="tutorial_results/classification/saits",
    # Only save the best model after training has finished.
    # Set it to "better" to save every model that improves on the previous best during training.
    model_saving_strategy="best",
)

# Training stage: fit on the training set and validate on the validation set.
saits.fit(train_set=dataset_for_training, val_set=dataset_for_validating)


2025-05-10 08:19:18 [INFO]: Using the given device: cuda
2025-05-10 08:19:18 [INFO]: Model files will be saved to tutorial_results/classification/saits/20250510_T081918
2025-05-10 08:19:18 [INFO]: Tensorboard file will be saved to tutorial_results/classification/saits/20250510_T081918/tensorboard
2025-05-10 08:19:18 [INFO]: Using customized CrossEntropy as the training loss function.
2025-05-10 08:19:18 [INFO]: Using customized PR_AUC as the validation metric function.
2025-05-10 08:19:18 [INFO]: SAITS initialized with the given hyperparameters, the number of trainable parameters: 693,170
2025-05-10 08:19:20 [INFO]: Epoch 001 - training loss (CrossEntropy): 0.2985, validation PR_AUC: 0.4365
2025-05-10 08:19:22 [INFO]: Epoch 002 - training loss (CrossEntropy): 0.2623, validation PR_AUC: 0.4418
2025-05-10 08:19:23 [INFO]: Epoch 003 - training loss (CrossEntropy): 0.2442, validation PR_AUC: 0.5476
2025-05-10 08:19:25 [INFO]: Epoch 004 - training loss (CrossEntropy): 0.2489, validation PR_

In [4]:
from pypots.nn.functional.classification import calc_binary_classification_metrics

# Run the trained model on the test split and take its classification output.
saits_results = saits.predict(dataset_for_testing)
saits_prediction = saits_results["classification"]

# Compare the predictions with the test labels and report ROC-AUC and PR-AUC.
classification_metrics = calc_binary_classification_metrics(saits_prediction, dataset_for_testing["y"])
print(
    f"SAITS在测试集上的ROC-AUC为: {classification_metrics['roc_auc']:.4f}\n",
    f"SAITS在测试集上的PR-AUC为: {classification_metrics['pr_auc']:.4f}\n",
    sep="\n",
)

SAITS在测试集上的ROC-AUC为: 0.8313

SAITS在测试集上的PR-AUC为: 0.4962

