In [None]:
# Install PyPots
!pip install pypots

In [None]:
!pip install https://github.com/WenjieDu/PyPOTS/archive/dev.zip


# **Necessary Imports**

In [2]:
import pypots
import numpy as np
import pandas as pd
import tsdb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from benchpots.utils.logging import logger, print_final_dataset_info
from benchpots.utils.missingness import create_missingness #To create artificial missingness


2025-03-19 08:15:14 [INFO]: Wrote new configs to config.ini successfully.
2025-03-19 08:15:14 [INFO]: 💫 Initialized PyPOTS Ecosystem configuration file /root/.pypots/config.ini successfully.


[34m
████████╗██╗███╗   ███╗███████╗    ███████╗███████╗██████╗ ██╗███████╗███████╗    █████╗ ██╗
╚══██╔══╝██║████╗ ████║██╔════╝    ██╔════╝██╔════╝██╔══██╗██║██╔════╝██╔════╝   ██╔══██╗██║
   ██║   ██║██╔████╔██║█████╗█████╗███████╗█████╗  ██████╔╝██║█████╗  ███████╗   ███████║██║
   ██║   ██║██║╚██╔╝██║██╔══╝╚════╝╚════██║██╔══╝  ██╔══██╗██║██╔══╝  ╚════██║   ██╔══██║██║
   ██║   ██║██║ ╚═╝ ██║███████╗    ███████║███████╗██║  ██║██║███████╗███████║██╗██║  ██║██║
   ╚═╝   ╚═╝╚═╝     ╚═╝╚══════╝    ╚══════╝╚══════╝╚═╝  ╚═╝╚═╝╚══════╝╚══════╝╚═╝╚═╝  ╚═╝╚═╝
ai4ts v0.0.3 - building AI for unified time-series analysis, https://time-series.ai [0m



# Load data

In [None]:
df = pd.read_csv('attachments/synthetic_eicu.csv')
df.head()

Unnamed: 0,sample_id,timestamp,apacheadmissiondx,ethnicity,gender,GCS Total,Eyes,Motor,Verbal,admissionheight,...,MAP (mmHg),Invasive BP Diastolic,Invasive BP Systolic,O2 Saturation,Respiratory Rate,Temperature (C),glucose,FiO2,pH,label
0,0,0,17.0,394.0,398.0,,,,,182.9,...,80.0,56.0,119.0,99.0,,,,,,0
1,0,1,17.0,394.0,398.0,,,,,182.9,...,79.0,56.0,112.0,98.0,,,,,,0
2,0,2,17.0,394.0,398.0,413.0,,,,182.9,...,75.0,56.0,112.0,98.0,20.0,35.3,,,,0
3,0,3,17.0,394.0,398.0,,,,,182.9,...,79.0,58.0,108.0,97.0,,,,,,0
4,0,4,17.0,394.0,398.0,,,,,182.9,...,76.0,55.0,111.0,91.0,,,,,,0





# Ensure Consistent Time Steps:
If your time series have varying lengths, you need to standardize them by padding shorter series with missing values (NaN) or truncating longer ones. Let’s set a maximum length, for our example we have 48 time steps denoting 48 hrs of records for each patient (you can adjust this based on your data).



In [10]:
max_length = 48

def pad_truncate(df):
    if len(df) > max_length:
        # If the DataFrame is longer than the maximum length, truncate it
        return df.iloc[:max_length]
    else:
        # If the DataFrame is shorter than the maximum length, pad it with NaN values
        padding = pd.DataFrame(
            index=range(max_length - len(df)),
            columns=df.columns
        )
        if not padding.empty:
            return pd.concat([df, padding])
        else:
            return df

# Apply padding/truncating to each sample
new_df = df.groupby('sample_id').apply(pad_truncate).reset_index(drop=True)

  new_df = df.groupby('sample_id').apply(pad_truncate).reset_index(drop=True)


# Split the data into train, test and validation sets

We have to divide our dataset into three distinct sets: a training set, a validation set, and a test set. This is a crucial step in machine learning to ensure that our model learns effectively and can generalize to new, unseen data. We'll use the train_test_split function to split our data based on unique sample_id values. This ensures that all data points belonging to the same sample stay together in the same set, preventing data leakage.


In [64]:
unique_sample_ids = new_df['sample_id'].unique()

# Split into train and a temporary set (e.g., 80% train, 20% temp)
train_ids, temp_ids = train_test_split(unique_sample_ids, test_size=0.2, random_state=42)

# Split the temporary set into validation and test (e.g., 50% val, 50% test of the temp set, which is 10% each of the original)
val_ids, test_ids = train_test_split(temp_ids, test_size=0.5, random_state=42)

train_df = new_df[new_df['sample_id'].isin(train_ids)]
val_df = new_df[new_df['sample_id'].isin(val_ids)]
test_df = new_df[new_df['sample_id'].isin(test_ids)]

print(f"Train DataFrame shape: {train_df.shape}")
print(f"Validation DataFrame shape: {val_df.shape}")
print(f"Test DataFrame shape: {test_df.shape}")



# Separate Features and Labels

def separate_features_labels(df, feature_cols, label_col='label'):
    X = df[feature_cols].values.reshape(-1, 48, len(feature_cols))
    # Get the unique sample IDs to ensure correct label extraction
    unique_ids = df['sample_id'].unique()
    # Extract the label for each unique sample ID (assuming label is consistent for each ID)
    y = df.groupby('sample_id')[label_col].first().loc[unique_ids].values
    return X, y

# Assuming 'feature_columns' is a list of your feature column names
feature_columns = [col for col in df.columns if col not in ['sample_id', 'label', 'timestamp']]

train_X, train_y = separate_features_labels(train_df.copy(), feature_columns)
val_X, val_y = separate_features_labels(val_df.copy(), feature_columns)
test_X, test_y = separate_features_labels(test_df.copy(), feature_columns)

print(f"Train features shape: {train_X.shape}, Train labels shape: {train_y.shape}")
print(f"Validation features shape: {val_X.shape}, Validation labels shape: {val_y.shape}")
print(f"Test features shape: {test_X.shape}, Test labels shape: {test_y.shape}")

Train DataFrame shape: (235584, 23)
Validation DataFrame shape: (29472, 23)
Test DataFrame shape: (29472, 23)
Train features shape: (4908, 48, 20), Train labels shape: (4908,)
Validation features shape: (614, 48, 20), Validation labels shape: (614,)
Test features shape: (614, 48, 20), Test labels shape: (614,)


# Normalize the data

In [66]:
scaler = StandardScaler()
# Flatten the data before scaling and then reshape it into time series samples
train_X = scaler.fit_transform(train_X.reshape(-1, train_X.shape[-1])).reshape(train_X.shape)
val_X = scaler.transform(val_X.reshape(-1, val_X.shape[-1])).reshape(val_X.shape)
test_X = scaler.transform(test_X.reshape(-1, test_X.shape[-1])).reshape(test_X.shape)

In [67]:
processed_dataset = {
        # general info
        "n_classes": len(np.unique(train_y)),
        "n_steps": train_X.shape[-2],
        "n_features": train_X.shape[-1],
        "scaler": scaler,
        # train set
        "train_X": train_X,
        "train_y": train_y.flatten(),
        # val set
        "val_X": val_X,
        "val_y": val_y.flatten(),
        # test set
        "test_X": test_X,
        "test_y": test_y.flatten(),
    }

# create artificial missingness


In [68]:

# hold out ground truth in the original data for evaluation
train_X_ori = train_X
val_X_ori = val_X
test_X_ori = test_X

rate = 0.1 # 10% missingness
# mask values in the train set as ground truth
train_X = create_missingness(train_X, rate, 'point')

# mask values in the validation set as ground truth
val_X = create_missingness(val_X, rate, 'point' )
# mask values in the test set as ground truth
test_X = create_missingness(test_X, rate, 'point' )


processed_dataset["train_X"] = train_X
processed_dataset["val_X"] = val_X
processed_dataset["test_X"] = test_X

processed_dataset['train_X_ori'] = train_X_ori
processed_dataset['val_X_ori'] = val_X_ori
processed_dataset['test_X_ori'] = test_X_ori

# Prepare the data for imputation


In [69]:
## calculate the mask to indicate the ground truth positions in  ori data, will be used by metric funcs to evaluate models

train_X_indicating_mask = np.isnan(train_X_ori) ^ np.isnan(train_X)
val_X_indicating_mask = np.isnan(val_X_ori) ^ np.isnan(val_X)
test_X_indicating_mask = np.isnan(test_X_ori) ^ np.isnan(test_X)

# assemble the datasets for training
dataset_for_training = {
    "X": processed_dataset['train_X'],
    'X_ori': processed_dataset['train_X_ori'],
}
# assemble the datasets for validation
dataset_for_validating = {
    "X": processed_dataset['val_X'],
    "X_ori": processed_dataset['val_X_ori'],
}
# assemble the datasets for test
dataset_for_testing = {
    "X": processed_dataset['test_X'],
    "X_ori": processed_dataset['test_X_ori'],
  }

test_X_indicating_mask = np.isnan(processed_dataset['test_X_ori']) ^ np.isnan(processed_dataset['test_X'])
# metric functions do not accpet input with NaNs, hence fill NaNs with 0
test_X_ori = np.nan_to_num(processed_dataset['test_X_ori'])

# Use BRITS imputation model from the pypots package


In [75]:
from pypots.optim import Adam
from pypots.imputation import BRITS
from pypots.utils.metrics import calc_mae

# initialize the model
brits = BRITS(
    n_steps=processed_dataset['n_steps'],
    n_features=processed_dataset['n_features'],
    rnn_hidden_size=128,
    batch_size=32,
    # here we set epochs=10 for a quick demo, you can set it to 100 or more for better performance
    epochs=5,
    # here we set patience=3 to early stop the training if the evaluting loss doesn't decrease for 3 epoches.
    # You can leave it to defualt as None to disable early stopping.
    patience=3,
    # give the optimizer. Different from torch.optim.Optimizer, you don't have to specify model's parameters when
    # initializing pypots.optim.Optimizer. You can also leave it to default. It will initilize an Adam optimizer with lr=0.001.
    optimizer=Adam(lr=1e-3),
    # this num_workers argument is for torch.utils.data.Dataloader. It's the number of subprocesses to use for data loading.
    # Leaving it to default as 0 means data loading will be in the main process, i.e. there won't be subprocesses.
    # You can increase it to >1 if you think your dataloading is a bottleneck to your model training speed
    num_workers=0,
    # just leave it to default as None, PyPOTS will automatically assign the best device for you.
    # Set it as 'cpu' if you don't have CUDA devices. You can also set it to 'cuda:0' or 'cuda:1' if you have multiple CUDA devices, even parallelly on ['cuda:0', 'cuda:1']
    device=None,
    # set the path for saving tensorboard and trained model files
    saving_path="tutorial_results/imputation/brits",
    # only save the best model after training finished.
    # You can also set it as "better" to save models performing better ever during training.
    model_saving_strategy="best",
)

# train the model on the training set, and validate it on the validating set to select the best model for testing in the next step
brits.fit(train_set=dataset_for_training, val_set=dataset_for_validating)

# the testing stage, impute the originally-missing values and artificially-missing values in the test set
test_set_imputation = brits.impute(dataset_for_testing)

# calculate mean absolute error on the ground truth (artificially-missing values)
testing_mae = calc_mae(
    test_set_imputation,
    test_X_ori,
    test_X_indicating_mask,
)
print(f"Testing mean absolute error: {testing_mae:.4f}")


2025-03-19 11:16:47 [INFO]: No given device, using default device: cpu
2025-03-19 11:16:47 [INFO]: Model files will be saved to tutorial_results/imputation/brits/20250319_T111647
2025-03-19 11:16:47 [INFO]: Tensorboard file will be saved to tutorial_results/imputation/brits/20250319_T111647/tensorboard
2025-03-19 11:16:47 [INFO]: Using customized MAE as the training loss function.
2025-03-19 11:16:47 [INFO]: Using customized MSE as the validation metric function.
2025-03-19 11:16:47 [INFO]: BRITS initialized with the given hyperparameters, the number of trainable parameters: 187,936
2025-03-19 11:18:04 [INFO]: Epoch 001 - training loss (MAE): 1.1051, validation MSE: 0.2599
2025-03-19 11:19:14 [INFO]: Epoch 002 - training loss (MAE): 0.7470, validation MSE: 0.2164
2025-03-19 11:20:44 [INFO]: Epoch 003 - training loss (MAE): 0.6702, validation MSE: 0.2038
2025-03-19 11:21:40 [INFO]: Epoch 004 - training loss (MAE): 0.6358, validation MSE: 0.1985
2025-03-19 11:22:34 [INFO]: Epoch 005 - tr

Testing mean absolute error: 0.2196


In [76]:
# impute the train and val sets
train_set_imputation = brits.impute(dataset_for_training)
val_set_imputation = brits.impute(dataset_for_validating)

In [77]:
train_set_imputation.shape

(4908, 48, 20)

# Convert 3D NumPy array to a DataFrame with original scale features.

In [92]:
def convert_to_dataframe(X, labels, sample_ids, scaler, invers_norm = False, n_steps=48):
    """
    Convert 3D NumPy array to a DataFrame with sample_id, timestamp, and original scale features.

    Parameters:
    - X: 3D NumPy array of shape (n_samples, n_steps, n_features)
    - labels: 1D NumPy array of shape (n_samples,) -> labels for each sample
    - sample_ids: 1D NumPy array with sample IDs corresponding to each sample
    - scaler: Scaler used for normalization (MinMaxScaler/StandardScaler)
    - n_steps: Number of time steps (default: 48)

    Returns:
    - DataFrame with sample_id, timestamp, features, and labels
    """
    n_samples, _, n_features = X.shape

    assert len(feature_columns) == n_features, "Number of features in X does not match feature_columns"
    assert len(labels) == n_samples, "Number of labels does not match number of samples"
    assert len(sample_ids) == n_samples, "Number of sample IDs does not match number of samples"

    # extract the last timestep record for each sample_id  to get one row per sample,
    # using the final timestep’s data (e.g., the last hour if n_steps=48 represents hourly data)

    X_last = X[:, -1, :]  # Shape: (n_samples, n_features)

    # Inverse normalization
    if invers_norm:
      X_original = scaler.inverse_transform(X_last)
    else:
      X_original = X_last


    # Create DataFrame
    df = pd.DataFrame(X_original, columns=feature_columns)
    df['sample_id'] = sample_ids
    df['timestamp'] = n_steps - 1  # Last timestep (e.g., 47 if 0-indexed)
    df['label'] = labels

    # Reorder columns: sample_id, timestamp, features, label
    df = df[['sample_id', 'timestamp'] + feature_columns + ['label']]

    return df

In [93]:
# Convert all datasets
df_train_imputed = convert_to_dataframe(train_set_imputation, train_y, train_ids, scaler)
df_val_imputed = convert_to_dataframe(val_set_imputation, val_y, val_ids, scaler)
df_test_imputed = convert_to_dataframe(test_set_imputation, test_y, test_ids, scaler)

# Check the shapes
print(df_train_imputed.shape, df_val_imputed.shape, df_test_imputed.shape)

(4908, 23) (614, 23) (614, 23)


In [94]:
df_train_imputed.head()

Unnamed: 0,sample_id,timestamp,apacheadmissiondx,ethnicity,gender,GCS Total,Eyes,Motor,Verbal,admissionheight,...,MAP (mmHg),Invasive BP Diastolic,Invasive BP Systolic,O2 Saturation,Respiratory Rate,Temperature (C),glucose,FiO2,pH,label
0,3098,47,-0.676732,0.3022,0.918308,0.661468,0.250186,0.425022,0.544028,1.156228,...,0.318641,-0.02475,0.19702,-0.199657,1.652006,-0.124427,-0.036221,0.074256,-0.176366,0
1,4221,47,-0.516926,0.3022,-1.088959,0.365205,0.297844,0.18469,0.452102,-1.677569,...,-0.361959,-0.616842,-0.466705,-0.496805,-0.015378,-0.806545,-0.528615,-0.081251,0.031246,0
2,3154,47,-0.490291,0.3022,-0.529956,0.142003,0.257404,0.208831,0.40913,-0.16621,...,-0.335883,-1.012521,-0.289712,0.09749,0.85289,0.225891,-0.109319,-0.33446,-0.10257,0
3,4041,47,-0.730001,0.3022,-1.088959,-0.109745,0.063663,-0.010945,0.152749,-1.462887,...,0.411501,0.102339,0.438261,0.09749,3.569884,-0.524262,3.389644,0.166113,0.06081,1
4,2664,47,-0.78327,0.3022,-1.088959,0.146961,0.248287,0.157304,0.318992,-1.248206,...,-0.70226,-0.682788,-0.304788,-0.793952,1.172537,0.21042,-0.160382,-0.107644,0.083878,0


#  Prepare the Data for Classification


In [95]:
# Drop sample_id and timestamp and seperate the labels
X_train_2d = df_train_imputed.drop(['sample_id', 'timestamp', 'label'], axis=1)
y_train = df_train_imputed['label']

X_val_2d = df_val_imputed.drop(['sample_id', 'timestamp', 'label'], axis=1)
y_val = df_val_imputed['label']

X_test_2d = df_test_imputed.drop(['sample_id', 'timestamp', 'label'], axis=1)
y_test = df_test_imputed['label']

print(f"Train: {X_train_2d.shape}, {y_train.shape}")
print(f"Val: {X_val_2d.shape}, {y_val.shape}")
print(f"Test: {X_test_2d.shape}, {y_test.shape}")

Train: (4908, 20), (4908,)
Val: (614, 20), (614,)
Test: (614, 20), (614,)


In [96]:
X_train_2d.head()

Unnamed: 0,apacheadmissiondx,ethnicity,gender,GCS Total,Eyes,Motor,Verbal,admissionheight,admissionweight,age,Heart Rate,MAP (mmHg),Invasive BP Diastolic,Invasive BP Systolic,O2 Saturation,Respiratory Rate,Temperature (C),glucose,FiO2,pH
0,-0.676732,0.3022,0.918308,0.661468,0.250186,0.425022,0.544028,1.156228,0.928327,0.777496,0.538832,0.318641,-0.02475,0.19702,-0.199657,1.652006,-0.124427,-0.036221,0.074256,-0.176366
1,-0.516926,0.3022,-1.088959,0.365205,0.297844,0.18469,0.452102,-1.677569,-0.180996,1.332242,-0.89154,-0.361959,-0.616842,-0.466705,-0.496805,-0.015378,-0.806545,-0.528615,-0.081251,0.031246
2,-0.490291,0.3022,-0.529956,0.142003,0.257404,0.208831,0.40913,-0.16621,-1.057595,0.59258,1.386459,-0.335883,-1.012521,-0.289712,0.09749,0.85289,0.225891,-0.109319,-0.33446,-0.10257
3,-0.730001,0.3022,-1.088959,-0.109745,0.063663,-0.010945,0.152749,-1.462887,-0.42848,0.59258,0.538832,0.411501,0.102339,0.438261,0.09749,3.569884,-0.524262,3.389644,0.166113,0.06081
4,-0.78327,0.3022,-1.088959,0.146961,0.248287,0.157304,0.318992,-1.248206,-0.461162,0.654219,0.750739,-0.70226,-0.682788,-0.304788,-0.793952,1.172537,0.21042,-0.160382,-0.107644,0.083878


In [101]:
import xgboost as xgb
from sklearn.metrics import accuracy_score, roc_auc_score

# Initialize XGBoost classifier
model = xgb.XGBClassifier(
    n_estimators=500,
    early_stopping_rounds=10,
    objective='binary:logistic',  # For binary classification (mortality: 0 or 1)
    eval_metric='logloss',        # Loss function to monitor
)


In [None]:

# Train the model with early stopping
model.fit(
    X_train_2d, y_train,
    eval_set=[(X_val_2d, y_val)],
    verbose=True
)


[0]	validation_0-logloss:0.31016
[1]	validation_0-logloss:0.28234
[2]	validation_0-logloss:0.27017
[3]	validation_0-logloss:0.25920
[4]	validation_0-logloss:0.24730
[5]	validation_0-logloss:0.24300
[6]	validation_0-logloss:0.23986
[7]	validation_0-logloss:0.23759
[8]	validation_0-logloss:0.23893
[9]	validation_0-logloss:0.23980
[10]	validation_0-logloss:0.23825
[11]	validation_0-logloss:0.23919
[12]	validation_0-logloss:0.23995
[13]	validation_0-logloss:0.24077
[14]	validation_0-logloss:0.23888
[15]	validation_0-logloss:0.23481
[16]	validation_0-logloss:0.23290
[17]	validation_0-logloss:0.23212
[18]	validation_0-logloss:0.23444
[19]	validation_0-logloss:0.23361
[20]	validation_0-logloss:0.23525
[21]	validation_0-logloss:0.23729
[22]	validation_0-logloss:0.23716
[23]	validation_0-logloss:0.23525
[24]	validation_0-logloss:0.23466
[25]	validation_0-logloss:0.23375
[26]	validation_0-logloss:0.23469
[27]	validation_0-logloss:0.23462


In [None]:
# Predict on the test set
y_pred = model.predict(X_test_2d)            # Class predictions
y_pred_proba = model.predict_proba(X_test_2d)[:, 1]  # Probabilities for AUC

# Evaluate performance
accuracy = accuracy_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred_proba)
print(f"Accuracy: {accuracy:.4f}")
print(f"AUC: {auc:.4f}")

Accuracy: 0.8958
AUC: 0.8064
