In [13]:
# Cell 1: Imports and Dataset Definition
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, TensorDataset
import numpy as np
import pandas as pd
import os

# Dataset wrapper for sequences
class SepsisDataset(Dataset):
    """In-memory dataset pairing feature windows with regression targets.

    Both inputs are copied into ``float32`` tensors once, at construction
    time, and a one-line summary of the sample count and per-sample shape is
    printed.

    Args:
        X: Array-like of samples; the first axis indexes samples.
        y: Array-like of targets, one entry per sample in ``X``.

    Raises:
        ValueError: If ``X`` and ``y`` hold a different number of samples.
    """

    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32)
        # __len__ is driven by X alone, so a shorter/longer y would otherwise go
        # unnoticed until an index error (or silently unused labels) at training time.
        if len(self.X) != len(self.y):
            raise ValueError(
                f"X and y must have the same number of samples, "
                f"got {len(self.X)} and {len(self.y)}"
            )
        print(f"Dataset created with {len(self.X)} samples, each of shape {self.X.shape[1:]}")

    def __len__(self):
        """Return the number of samples."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(features, target)`` pair stored at position ``idx``."""
        return self.X[idx], self.y[idx]


In [14]:
# Cell 2: GRU Regression Model
class GRUTimeToSepsis(nn.Module):
    """GRU regressor that turns a batch-first sequence into one scalar per sample.

    Args:
        input_size: Number of features per timestep.
        hidden_size: Width of the GRU hidden state.
        num_layers: Number of stacked GRU layers.
    """

    def __init__(self, input_size, hidden_size=64, num_layers=1):
        super().__init__()
        self.gru = nn.GRU(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        # Single-output head applied to the GRU output of the final timestep.
        self.fc = nn.Linear(in_features=hidden_size, out_features=1)

    def forward(self, x):
        """Map ``x`` of shape (batch, seq_len, input_size) to predictions of shape (batch,)."""
        sequence_out, _ = self.gru(x)         # (batch, seq_len, hidden_size)
        final_step = sequence_out[:, -1, :]   # keep only the last timestep
        prediction = self.fc(final_step)      # (batch, 1)
        return prediction.squeeze(1)          # drop the singleton output axis


In [15]:
# Cell 3: Load Preprocessed Data and Patient IDs
# Load the preprocessed dataset. It must contain every column named in
# `exclude_cols` below (identifier, time index and outcome columns).
full_df = pd.read_csv('../data/processed_sepsis_dataset.csv')

print(f"Full dataframe shape: {full_df.shape}")
print(full_df.head())

# Columns that identify rows or encode the outcome; every other column is a model input.
exclude_cols = ['Patient_ID', 'Hour', 'SepsisLabel', 'time_to_sepsis']

# Fail fast when an expected column is absent. The filter below silently ignores
# names that are not in the frame, so a renamed label column would otherwise
# slip into `features` unnoticed.
missing_cols = [col for col in exclude_cols if col not in full_df.columns]
assert not missing_cols, f"Expected columns missing from dataset: {missing_cols}"

# Extract the list of features used for modeling (original column order is kept).
features = [col for col in full_df.columns if col not in exclude_cols]

print(f"Features used: {features}")


Full dataframe shape: (1552210, 44)
   Hour        HR     O2Sat      Temp       SBP       MAP       DBP      Resp  \
0     0  0.726119 -0.693117 -1.039536 -1.107301 -0.449908 -0.138132  0.066447   
1     1  0.726119 -0.693117 -1.039536 -1.107301 -0.449908 -0.138132  0.066447   
2     2  0.266199  0.592670 -1.039536 -0.073710  0.200366 -0.138132  0.647755   
3     3  0.323689 -0.693117 -1.039536 -0.073710  0.200366 -0.138132  2.197910   
4     4  1.071059 -2.782520 -1.039536 -0.073710  0.525199 -0.138132  1.132179   

      EtCO2  BaseExcess  ...  Platelets       Age   Gender     Unit1  \
0 -0.019219    9.617977  ...   1.101793  1.289531 -1.12648 -0.655897   
1 -0.019219    9.617977  ...   1.101793  1.289531 -1.12648 -0.655897   
2 -0.019219    9.617977  ...   1.101793  1.289531 -1.12648 -0.655897   
3 -0.019219    9.617977  ...   1.101793  1.289531 -1.12648 -0.655897   
4 -0.019219    9.617977  ...   1.101793  1.289531 -1.12648 -0.655897   

      Unit2  HospAdmTime    ICULOS  SepsisLa

In [16]:
# Cell 4: Patient-level Train/Test Split
import numpy as np

# Collect every patient once, then shuffle with a fixed seed so the split is
# random but repeatable.
patient_ids = full_df['Patient_ID'].unique()
np.random.seed(42)
np.random.shuffle(patient_ids)

# The first 80% of the shuffled IDs form the training set, the rest the test set.
split_idx = int(0.8 * len(patient_ids))
train_patients, test_patients = patient_ids[:split_idx], patient_ids[split_idx:]

print(f"Number of train patients: {len(train_patients)}")
print(f"Number of test patients: {len(test_patients)}")

# Rows follow their patient, so no patient contributes rows to both frames.
train_df = full_df.loc[full_df['Patient_ID'].isin(train_patients)]
test_df = full_df.loc[full_df['Patient_ID'].isin(test_patients)]


Number of train patients: 32268
Number of test patients: 8068


In [17]:
# Cell 5: Sequence Creation Function (redefine if not already)
def create_sequences(df, patient_col='Patient_ID', hour_col='Hour', feature_cols=None, seq_length=12):
    """Build per-patient sliding windows of features with a regression target each.

    Patients are processed in order of first appearance in ``df``. For each
    patient the rows are sorted by ``hour_col`` and every run of ``seq_length``
    consecutive rows becomes one sample; its target is the ``'time_to_sepsis'``
    value of the window's last row. Patients with fewer than ``seq_length``
    rows contribute no samples.

    Args:
        df: DataFrame containing ``patient_col``, ``hour_col``,
            ``'time_to_sepsis'`` and the feature columns.
        patient_col: Column identifying the patient each row belongs to.
        hour_col: Column used to order a patient's rows.
        feature_cols: Columns used as model inputs. When ``None``, every column
            except ``patient_col``, ``hour_col``, ``'SepsisLabel'`` and
            ``'time_to_sepsis'`` is used.
        seq_length: Number of consecutive rows per window; must be >= 1.

    Returns:
        Tuple ``(X, y)`` where ``X`` has shape
        ``(n_samples, seq_length, n_features)`` and ``y`` has shape
        ``(n_samples,)``. Both keep those shapes when ``n_samples`` is 0.

    Raises:
        ValueError: If ``seq_length`` is smaller than 1.
    """
    if seq_length < 1:
        raise ValueError(f"seq_length must be a positive integer, got {seq_length}")
    if feature_cols is None:
        excluded = (patient_col, hour_col, 'SepsisLabel', 'time_to_sepsis')
        feature_cols = [c for c in df.columns if c not in excluded]

    sequences = []
    labels = []
    # One grouped pass instead of a boolean mask over the whole frame per patient.
    # sort=False yields groups in order of first appearance, matching the order
    # that iterating over df[patient_col].unique() would give.
    for _, patient_data in df.groupby(patient_col, sort=False):
        patient_data = patient_data.sort_values(hour_col)
        features_array = patient_data[feature_cols].values
        time_to_sepsis_array = patient_data['time_to_sepsis'].values
        # `end` is exclusive; the range is empty when the patient has too few rows.
        for end in range(seq_length, len(patient_data) + 1):
            sequences.append(features_array[end - seq_length:end])
            labels.append(time_to_sepsis_array[end - 1])

    if sequences:
        X = np.array(sequences)
        y = np.array(labels)
    else:
        # Keep the rank stable so callers can still read X.shape[2] on an empty result.
        X = np.empty((0, seq_length, len(feature_cols)))
        y = np.empty((0,))
    print(f"Created {len(X)} sequences each of length {seq_length}.")
    return X, y

# Window both splits with the same length and feature list; the training
# frame is processed first, then the test frame.
seq_length = 12
(X_train, y_train), (X_test, y_test) = (
    create_sequences(split_df, feature_cols=features, seq_length=seq_length)
    for split_df in (train_df, test_df)
)


Created 884728 sequences each of length 12.
Created 225463 sequences each of length 12.


In [18]:
# Cell 6: Prepare Dataloaders for Training and Testing
from torch.utils.data import DataLoader

batch_size = 128

# Wrap the windowed arrays as tensor datasets (train first, so its summary prints first).
train_dataset, test_dataset = (
    SepsisDataset(X_train, y_train),
    SepsisDataset(X_test, y_test),
)

# Training batches are shuffled; the test loader keeps the dataset order.
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)


Dataset created with 884728 samples, each of shape torch.Size([12, 40])
Dataset created with 225463 samples, each of shape torch.Size([12, 40])


In [19]:
# Cell 7: Training and Evaluation Functions

def train_model(model, dataloader, optimizer, criterion, device, log_every=10):
    """Train ``model`` for one pass over ``dataloader`` and return the mean loss.

    Each batch is moved to ``device``, followed by a forward pass, a backward
    pass and an optimizer step. Batch losses are weighted by batch size, so the
    result is a per-sample mean provided ``criterion`` returns a per-batch mean
    (assumed here; true for the default ``nn.MSELoss``).

    Args:
        model: Module to train; switched to training mode.
        dataloader: Sized iterable yielding ``(X_batch, y_batch)`` tensor pairs.
        optimizer: Optimizer bound to ``model``'s parameters.
        criterion: Loss callable taking ``(preds, targets)``.
        device: Device the batches are moved to.
        log_every: Print the batch loss every ``log_every`` batches and on the
            last batch. A falsy value (``None`` or ``0``) disables logging.

    Returns:
        Total loss divided by the number of samples actually processed.

    Raises:
        ValueError: If ``dataloader`` yields no samples.
    """
    model.train()
    total_loss = 0.0
    samples_seen = 0
    num_batches = len(dataloader)
    for batch_idx, (X_batch, y_batch) in enumerate(dataloader):
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        preds = model(X_batch)
        loss = criterion(preds, y_batch)
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        n_in_batch = X_batch.size(0)
        total_loss += batch_loss * n_in_batch
        # Count what was really iterated: with drop_last=True or a subset sampler
        # this differs from len(dataloader.dataset).
        samples_seen += n_in_batch

        is_last_batch = batch_idx == num_batches - 1
        if log_every and (batch_idx % log_every == 0 or is_last_batch):
            print(f"Batch {batch_idx+1}/{num_batches} - Loss: {batch_loss:.4f}")

    if samples_seen == 0:
        raise ValueError("dataloader yielded no samples; cannot compute a mean loss")
    return total_loss / samples_seen

def evaluate_model(model, dataloader, criterion, device, log_every=10):
    """Compute the mean loss of ``model`` over ``dataloader`` without updating it.

    The model is switched to evaluation mode and gradients are disabled.
    Batch losses are weighted by batch size, so the result is a per-sample mean
    provided ``criterion`` returns a per-batch mean (assumed here; true for the
    default ``nn.MSELoss``).

    Args:
        model: Module to evaluate.
        dataloader: Sized iterable yielding ``(X_batch, y_batch)`` tensor pairs.
        criterion: Loss callable taking ``(preds, targets)``.
        device: Device the batches are moved to.
        log_every: Print the batch loss every ``log_every`` batches and on the
            last batch. A falsy value (``None`` or ``0``) disables logging.

    Returns:
        Total loss divided by the number of samples actually processed.

    Raises:
        ValueError: If ``dataloader`` yields no samples.
    """
    model.eval()
    total_loss = 0.0
    samples_seen = 0
    num_batches = len(dataloader)
    with torch.no_grad():
        for batch_idx, (X_batch, y_batch) in enumerate(dataloader):
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            preds = model(X_batch)
            batch_loss = criterion(preds, y_batch).item()

            n_in_batch = X_batch.size(0)
            total_loss += batch_loss * n_in_batch
            # Count what was really iterated: with drop_last=True or a subset
            # sampler this differs from len(dataloader.dataset).
            samples_seen += n_in_batch

            is_last_batch = batch_idx == num_batches - 1
            if log_every and (batch_idx % log_every == 0 or is_last_batch):
                print(f"Eval Batch {batch_idx+1}/{num_batches} - Loss: {batch_loss:.4f}")

    if samples_seen == 0:
        raise ValueError("dataloader yielded no samples; cannot compute a mean loss")
    return total_loss / samples_seen


In [None]:
# Cell 8: Initialize Model, Optimizer, Criterion and Train

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Seed torch's global RNG before the model is built so that weight
# initialisation and the shuffled batch order of train_loader are repeatable
# across kernel restarts (NumPy's seed only covers the patient split).
torch.manual_seed(42)

# Hyperparameters
input_size = X_train.shape[2]  # number of features per timestep
hidden_size = 64
num_layers = 1
learning_rate = 0.001
num_epochs = 5

model = GRUTimeToSepsis(input_size, hidden_size, num_layers).to(device)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

for epoch in range(1, num_epochs+1):
    print(f"\nEpoch {epoch}/{num_epochs}")
    train_loss = train_model(model, train_loader, optimizer, criterion, device)
    print(f"Epoch {epoch} - Train Loss: {train_loss:.4f}")
    # NOTE(review): the per-epoch "validation" loss is computed on test_loader.
    # That is harmless while nothing is tuned on it, but carve a separate
    # validation split out of the training patients before using this number
    # for early stopping or model selection.
    val_loss = evaluate_model(model, test_loader, criterion, device)
    print(f"Epoch {epoch} - Validation Loss: {val_loss:.4f}")

# Save the trained model (weights only, via its state_dict)
torch.save(model.state_dict(), "gru_time_to_sepsis.pth")
print("Saved model to gru_time_to_sepsis.pth")


Using device: cpu

Epoch 1/5
Batch 1/6912 - Loss: 552.3064
Batch 11/6912 - Loss: 539.0088
Batch 21/6912 - Loss: 527.3292
Batch 31/6912 - Loss: 467.8586
Batch 41/6912 - Loss: 373.0977
Batch 51/6912 - Loss: 304.3159
Batch 61/6912 - Loss: 278.6742
Batch 71/6912 - Loss: 242.8077
Batch 81/6912 - Loss: 218.2467
Batch 91/6912 - Loss: 189.0996
Batch 101/6912 - Loss: 172.9069
Batch 111/6912 - Loss: 160.2070
Batch 121/6912 - Loss: 146.5423
Batch 131/6912 - Loss: 133.5807
Batch 141/6912 - Loss: 121.6431
Batch 151/6912 - Loss: 109.1778
Batch 161/6912 - Loss: 99.9287
Batch 171/6912 - Loss: 92.1254
Batch 181/6912 - Loss: 82.1348
Batch 191/6912 - Loss: 74.9256
Batch 201/6912 - Loss: 71.2886
Batch 211/6912 - Loss: 63.9621
Batch 221/6912 - Loss: 54.9045
Batch 231/6912 - Loss: 48.9568
Batch 241/6912 - Loss: 43.7858
Batch 251/6912 - Loss: 45.7541
Batch 261/6912 - Loss: 37.1774
Batch 271/6912 - Loss: 34.9693
Batch 281/6912 - Loss: 32.0364


In [None]:
# Cell 9: Testing – Evaluate on Test Set

# Score the trained model on the test loader and report the resulting loss.
test_loss = evaluate_model(
    model=model,
    dataloader=test_loader,
    criterion=criterion,
    device=device,
)
print(f"Final Test Loss (MSE): {test_loss:.4f}")
