In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
data_filtered = pd.read_csv("/content/drive/My Drive/vital_sign_text.csv")

  data_filtered = pd.read_csv("/content/drive/My Drive/vital_sign_text.csv")


In [None]:
data_cleaned = data_filtered.dropna(subset=['text'])

In [None]:
data_cleaned

Unnamed: 0.1,Unnamed: 0,subject_id,hadm_id,deathtime,stay_id,charttime,temperature,heartrate,resprate,o2sat,sbp,dbp,rhythm,pain,hospital_expire_flag,death,text
0,0,10000032,22595853,,33258284,2180-05-06 23:04:00,97.7,79.0,16.0,98.0,107.0,60.0,,0,0,0,EXAMINATION: CHEST (PA AND LAT)\n\nINDICATION...
1,1,10000032,22841357,,38112554,2180-06-26 18:42:00,97.9,76.0,18.0,95.0,95.0,64.0,,5,0,0,EXAMINATION: LIVER OR GALLBLADDER US (SINGLE ...
2,2,10000032,22841357,,38112554,2180-06-26 20:54:00,97.9,86.0,17.0,93.0,96.0,57.0,,,0,0,EXAMINATION: LIVER OR GALLBLADDER US (SINGLE ...
3,3,10000032,25742920,,35968195,2180-08-05 23:50:00,98.5,96.0,17.0,100.0,102.0,58.0,,,0,0,INDICATION: ___ year old woman with cirrhosis...
4,4,10000032,25742920,,35968195,2180-08-06 01:07:00,98.1,91.0,18.0,99.0,98.0,60.0,,,0,0,INDICATION: ___ year old woman with cirrhosis...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
735144,735144,16993438,21110385,,31378381,2169-03-12 15:03:00,99.1,63.0,18.0,96.0,97.0,40.0,,3,0,0,HISTORY: Left lower leg swelling. Evaluate f...
735145,735145,16993438,21110385,,31378381,2169-03-12 15:23:00,,56.0,21.0,95.0,89.0,35.0,,,0,0,HISTORY: Left lower leg swelling. Evaluate f...
735146,735146,16993438,21110385,,31378381,2169-03-12 15:54:00,,66.0,18.0,96.0,81.0,40.0,,0,0,0,HISTORY: Left lower leg swelling. Evaluate f...
735147,735147,16993438,21110385,,31378381,2169-03-12 16:31:00,98.1,58.0,20.0,95.0,94.0,36.0,,,0,0,HISTORY: Cellulitis.\n\nTECHNIQUE: 2 views o...


In [None]:
death_counts = data_cleaned['death'].value_counts()

# 打印结果
print(death_counts)

death
0    347941
1       115
Name: count, dtype: int64


In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score
import numpy as np

# 假设你已经有data_cleaned的DataFrame
class VitalSignsDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        row = self.data.iloc[index]
        text = row['text']
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True
        )
        # 处理vital signs，确保其为float类型并处理NaN
        vital_signs = row[['temperature', 'heartrate', 'resprate', 'o2sat', 'sbp', 'dbp']].astype(float).fillna(0.0).values
        vital_signs = torch.tensor(vital_signs, dtype=torch.float)

        label = torch.tensor(row['death'], dtype=torch.float)

        return {
            'input_ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
            'vital_signs': vital_signs,
            'label': label
        }

class BertVitalSignsModel(nn.Module):
    def __init__(self):
        super(BertVitalSignsModel, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.fc1 = nn.Linear(self.bert.config.hidden_size + 6, 128)  # 假设有6个vital sign特征
        self.fc2 = nn.Linear(128, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask, vital_signs):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        cls_output = outputs.pooler_output  # 使用正确的pooler输出
        combined_input = torch.cat((cls_output, vital_signs), dim=1)
        x = torch.relu(self.fc1(combined_input))
        x = self.fc2(x)
        x = self.sigmoid(x)
        return x

def collate_fn(batch):
    input_ids = torch.stack([item['input_ids'] for item in batch])
    attention_mask = torch.stack([item['attention_mask'] for item in batch])
    vital_signs = torch.stack([item['vital_signs'] for item in batch])
    labels = torch.stack([item['label'] for item in batch])
    return {'input_ids': input_ids, 'attention_mask': attention_mask, 'vital_signs': vital_signs, 'label': labels}



In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 加载数据和tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
max_len = 128

# 标准化vital signs
scaler = StandardScaler()
data_cleaned[['temperature', 'heartrate', 'resprate', 'o2sat', 'sbp', 'dbp']] = scaler.fit_transform(data_cleaned[['temperature', 'heartrate', 'resprate', 'o2sat', 'sbp', 'dbp']])

# 数据集分割
train_data, test_data = train_test_split(data_cleaned, test_size=0.2, random_state=42)
train_dataset = VitalSignsDataset(train_data, tokenizer, max_len)
test_dataset = VitalSignsDataset(test_data, tokenizer, max_len)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False, collate_fn=collate_fn)

model = BertVitalSignsModel().to(device)
criterion = nn.BCELoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
from torch.optim.lr_scheduler import StepLR

scheduler = StepLR(optimizer, step_size=1, gamma=0.9)  # 每个epoch后学习率减少

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_cleaned[['temperature', 'heartrate', 'resprate', 'o2sat', 'sbp', 'dbp']] = scaler.fit_transform(data_cleaned[['temperature', 'heartrate', 'resprate', 'o2sat', 'sbp', 'dbp']])


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
num_epochs = 5
model.train()
for epoch in range(num_epochs):
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        vital_signs = batch['vital_signs'].to(device)
        labels = batch['label'].unsqueeze(1).to(device)  # [8, 1]

        outputs = model(input_ids, attention_mask, vital_signs).squeeze().unsqueeze(1)  # [8] -> [8, 1]
        loss = criterion(outputs, labels)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  # 梯度裁剪
        optimizer.step()
    scheduler.step()  # 更新学习率

# 模型评估
model.eval()
all_labels = []
all_preds = []
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        vital_signs = batch['vital_signs'].to(device)
        labels = batch['label'].unsqueeze(1).to(device)  # [8, 1]

        outputs = model(input_ids, attention_mask, vital_signs).squeeze().unsqueeze(1)  # [8] -> [8, 1]
        preds = outputs.cpu().numpy()
        all_labels.extend(labels.cpu().numpy())
        all_preds.extend(preds)

# 计算 Accuracy 和 AUROC
all_labels = np.array(all_labels)
all_preds = np.array(all_preds)
accuracy = accuracy_score(all_labels, [1 if p > 0.5 else 0 for p in all_preds])
auroc = roc_auc_score(all_labels, all_preds)

print(f"Accuracy: {accuracy}")
print(f"AUROC: {auroc}")

Accuracy: 0.9995834051600299
AUROC: 0.7673354198830602


Epoch 1/3


RuntimeError: all elements of input should be between 0 and 1

In [None]:
import numpy as np
numeric_columns = data.select_dtypes(include=[np.number]).columns

# 用列的平均值替换数值列中的NaN值
data[numeric_columns] = data[numeric_columns].apply(lambda x: x.fillna(x.mean()), axis=0)

In [None]:
data.head()

Unnamed: 0,hadm_id,temperature,heartrate,resprate,o2sat,sbp,dbp,rhythm,pain,death
0,22595853,97.7,79.0,16.0,98.0,107.0,60.0,,0.0,0
1,22841357,97.9,76.0,18.0,95.0,95.0,64.0,,5.0,0
2,22841357,97.9,86.0,17.0,93.0,96.0,57.0,,,0
3,25742920,98.5,96.0,17.0,100.0,102.0,58.0,,,0
4,25742920,98.1,91.0,18.0,99.0,98.0,60.0,,,0


In [None]:
X = data.drop(columns=[ 'rhythm', 'pain'])
#y = data['death']

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_auc_score
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from torch.utils.data import DataLoader, Dataset

# Handle missing values by filling with the median of each column

# Select relevant features and target
features = [col for col in X.columns if col != 'hamd_id' and col != 'death']  # Exclude 'hamd_id' and target from features
scaler = MinMaxScaler()
X[features] = scaler.fit_transform(X[features])

# Create sequences for LSTM input based on hamd_id
def create_sequences_grouped_by_hamd_id(df, features, target):
    sequences = []
    for hamd_id, group in df.groupby('hadm_id'):
        group_X = group[features].values
        group_y = group[target].values
        for i in range(1, len(group)):
            sequences.append((group_X[:i+1], group_y[i]))
    return sequences

target_column = 'death'  # Replace with your actual target column name

sequences = create_sequences_grouped_by_hamd_id(X, features, target_column)

# Split sequences into features and targets
X_sequences = [seq[0] for seq in sequences]
y_sequences = [seq[1] for seq in sequences]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_sequences, y_sequences, test_size=0.2, random_state=42)

# Custom Dataset class
class TimeSeriesDataset(Dataset):
    def __init__(self, X, y):
        self.X = [torch.tensor(seq, dtype=torch.float32) for seq in X]
        self.y = torch.tensor(y, dtype=torch.float32)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

train_dataset = TimeSeriesDataset(X_train, y_train)
test_dataset = TimeSeriesDataset(X_test, y_test)

def collate_fn(batch):
    batch_X, batch_y = zip(*batch)
    lengths = [len(x) for x in batch_X]
    padded_X = pad_sequence(batch_X, batch_first=True)
    batch_y = torch.tensor(batch_y, dtype=torch.float32)
    return padded_X, batch_y, lengths

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)

# Define LSTM model
class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(LSTMModel, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, dropout=0.2)
        self.fc = nn.Linear(hidden_size, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x, lengths):
        h0 = torch.zeros(num_layers, x.size(0), hidden_size).to(device)
        c0 = torch.zeros(num_layers, x.size(0), hidden_size).to(device)
        packed_input = pack_padded_sequence(x, lengths, batch_first=True, enforce_sorted=False)
        packed_output, (hn, cn) = self.lstm(packed_input, (h0, c0))
        out, _ = pad_packed_sequence(packed_output, batch_first=True)
        out = self.fc(out[:, -1, :])
        out = self.sigmoid(out)
        return out

# Model parameters
input_size = len(features)
hidden_size = 50
num_layers = 2
output_size = 1

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = LSTMModel(input_size, hidden_size, num_layers, output_size).to(device)

criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train the model
num_epochs = 20
for epoch in range(num_epochs):
    model.train()
    for X_batch, y_batch, lengths in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)

        outputs = model(X_batch, lengths)
        loss = criterion(outputs, y_batch.unsqueeze(1))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

# Evaluate the model
model.eval()
y_true, y_pred = [], []
with torch.no_grad():
    for X_batch, y_batch, lengths in test_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        outputs = model(X_batch, lengths)
        y_true.extend(y_batch.cpu().numpy())
        y_pred.extend(outputs.cpu().numpy())

# Calculate AUROC
y_pred = np.array(y_pred)
auroc = roc_auc_score(y_true, y_pred)
print(f'AUROC: {auroc:.4f}')

# Convert predictions to binary outcome for other evaluation metrics
y_pred_binary = y_pred > 0.5

# Print evaluation metrics
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_true, y_pred_binary))
print(classification_report(y_true, y_pred_binary))

Epoch [1/20], Loss: 0.0005
Epoch [2/20], Loss: 0.0003
Epoch [3/20], Loss: 0.0003
Epoch [4/20], Loss: 0.0003
Epoch [5/20], Loss: 0.0002
Epoch [6/20], Loss: 0.0005
Epoch [7/20], Loss: 0.0002
Epoch [8/20], Loss: 0.0002
Epoch [9/20], Loss: 0.0004
Epoch [10/20], Loss: 0.0005
Epoch [11/20], Loss: 0.0001
Epoch [12/20], Loss: 0.0002
Epoch [13/20], Loss: 0.0002
Epoch [14/20], Loss: 0.0003
Epoch [15/20], Loss: 0.0002
Epoch [16/20], Loss: 0.0002
Epoch [17/20], Loss: 0.0006
Epoch [18/20], Loss: 0.0006
Epoch [19/20], Loss: 0.0002
Epoch [20/20], Loss: 0.0002
AUROC: 0.4365
[[170387      0]
 [    31      0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00    170387
         1.0       0.00      0.00      0.00        31

    accuracy                           1.00    170418
   macro avg       0.50      0.50      0.50    170418
weighted avg       1.00      1.00      1.00    170418



  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
i = 0
for hamd_id, group in X.groupby('hadm_id'):
  print(hamd_id)
  print(group)
  i+=1
  if i == 5:
    break

0.0
       hadm_id  temperature  heartrate  resprate     o2sat       sbp  \
46718      0.0     0.100304   0.075605  0.000169  0.009703  0.102917   
46719      0.0     0.099190   0.065524  0.000169  0.010003  0.085900   

            dbp  death  
46718  0.000494      0  
46719  0.000515      0  
5.000105001151667e-07
             hadm_id  temperature  heartrate  resprate     o2sat       sbp  \
727616  5.000105e-07     0.099899   0.072581  0.000169  0.009903  0.105348   
727617  5.000105e-07     0.099696   0.063508  0.000191  0.010003  0.115883   
727618  5.000105e-07     0.099113   0.060484  0.000212  0.010003  0.114263   
727619  5.000105e-07     0.099113   0.060484  0.000191  0.010003  0.121556   
727620  5.000105e-07     0.099393   0.053427  0.000191  0.009903  0.114263   

             dbp  death  
727616  0.000474      0  
727617  0.000587      0  
727618  0.000525      0  
727619  0.000577      0  
727620  0.000783      0  
3.8000798014969916e-06
         hadm_id  temperature  hea

In [None]:
y_true

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_auc_score
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from torch.utils.data import DataLoader, Dataset

# Handle missing values by filling with the median of each column
X = X.fillna(X.median())

# Select relevant features and target
features = [col for col in X.columns if col != 'hadm_id' and col != 'death']  # Exclude 'hamd_id' and target from features
scaler = MinMaxScaler()
X[features] = scaler.fit_transform(X[features])

# Create sequences for LSTM input based on hamd_id
def create_sequences_grouped_by_hamd_id(df, features, target):
    sequences = []
    for hamd_id, group in df.groupby('hadm_id'):
        group_X = group[features].values
        group_y = group[target].values[-1]  # Use the last value as the target
        sequences.append((group_X, group_y))
    return sequences

target_column = 'death'  # Replace with your actual target column name

sequences = create_sequences_grouped_by_hamd_id(X, features, target_column)

# Split sequences into features and targets
X_sequences = [seq[0] for seq in sequences]
y_sequences = [seq[1] for seq in sequences]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_sequences, y_sequences, test_size=0.2, random_state=42)

# Custom Dataset class
class TimeSeriesDataset(Dataset):
    def __init__(self, X, y):
        self.X = [torch.tensor(seq, dtype=torch.float32) for seq in X]
        self.y = torch.tensor(y, dtype=torch.float32)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

train_dataset = TimeSeriesDataset(X_train, y_train)
test_dataset = TimeSeriesDataset(X_test, y_test)

def collate_fn(batch):
    batch_X, batch_y = zip(*batch)
    lengths = [len(x) for x in batch_X]
    padded_X = pad_sequence(batch_X, batch_first=True)
    batch_y = torch.tensor(batch_y, dtype=torch.float32)
    return padded_X, batch_y, lengths

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)

# Define LSTM model
class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(LSTMModel, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, dropout=0.2)
        self.fc = nn.Linear(hidden_size, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x, lengths):
        h0 = torch.zeros(num_layers, x.size(0), hidden_size).to(device)
        c0 = torch.zeros(num_layers, x.size(0), hidden_size).to(device)
        packed_input = pack_padded_sequence(x, lengths, batch_first=True, enforce_sorted=False)
        packed_output, (hn, cn) = self.lstm(packed_input, (h0, c0))
        out, _ = pad_packed_sequence(packed_output, batch_first=True)
        out = self.fc(out[:, -1, :])  # Take the output of the last time step
        out = self.sigmoid(out)
        return out

# Model parameters
input_size = len(features)
hidden_size = 50
num_layers = 2
output_size = 1

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = LSTMModel(input_size, hidden_size, num_layers, output_size).to(device)

criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train the model
num_epochs = 20
for epoch in range(num_epochs):
    model.train()
    for X_batch, y_batch, lengths in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)

        outputs = model(X_batch, lengths)
        loss = criterion(outputs, y_batch.unsqueeze(1))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

# Evaluate the model
model.eval()
y_true, y_pred = [], []
with torch.no_grad():
    for X_batch, y_batch, lengths in test_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        outputs = model(X_batch, lengths)
        y_true.extend(y_batch.cpu().numpy())
        y_pred.extend(outputs.cpu().numpy())

# Calculate AUROC
y_pred = np.array(y_pred)
auroc = roc_auc_score(y_true, y_pred)
print(f'AUROC: {auroc:.4f}')

# Convert predictions to binary outcome for other evaluation metrics
y_pred_binary = y_pred > 0.5

# Print evaluation metrics
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_true, y_pred_binary))
print(classification_report(y_true, y_pred_binary))

Epoch [1/20], Loss: 0.0294
Epoch [2/20], Loss: 0.0037
Epoch [3/20], Loss: 0.0008
Epoch [4/20], Loss: 0.0004
Epoch [5/20], Loss: 0.0003
Epoch [6/20], Loss: 0.0004
Epoch [7/20], Loss: 0.0003
Epoch [8/20], Loss: 0.0003
Epoch [9/20], Loss: 0.0005
Epoch [10/20], Loss: 0.0003
Epoch [11/20], Loss: 0.0005
Epoch [12/20], Loss: 0.0003
Epoch [13/20], Loss: 0.0002
Epoch [14/20], Loss: 0.0002
Epoch [15/20], Loss: 0.0003
Epoch [16/20], Loss: 0.0003
Epoch [17/20], Loss: 0.0004
Epoch [18/20], Loss: 0.0004
Epoch [19/20], Loss: 0.0003
Epoch [20/20], Loss: 0.0006
AUROC: 0.5247
[[39422     0]
 [   12     0]]
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     39422
         1.0       0.00      0.00      0.00        12

    accuracy                           1.00     39434
   macro avg       0.50      0.50      0.50     39434
weighted avg       1.00      1.00      1.00     39434



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
len(y_true)

39434

In [None]:
i = 0
for hamd_id, group in X.groupby('hadm_id'):
  print(hamd_id)
  print(group)
  i+=1
  if i == 5:
    break

0.0
       hadm_id  temperature  heartrate  resprate     o2sat       sbp  \
46718      0.0     0.100304   0.075605  0.000169  0.009703  0.102917   
46719      0.0     0.099190   0.065524  0.000169  0.010003  0.085900   

            dbp  death  
46718  0.000494      0  
46719  0.000515      0  
5.000105001151667e-07
             hadm_id  temperature  heartrate  resprate     o2sat       sbp  \
727616  5.000105e-07     0.099899   0.072581  0.000169  0.009903  0.105348   
727617  5.000105e-07     0.099696   0.063508  0.000191  0.010003  0.115883   
727618  5.000105e-07     0.099113   0.060484  0.000212  0.010003  0.114263   
727619  5.000105e-07     0.099113   0.060484  0.000191  0.010003  0.121556   
727620  5.000105e-07     0.099393   0.053427  0.000191  0.009903  0.114263   

             dbp  death  
727616  0.000474      0  
727617  0.000587      0  
727618  0.000525      0  
727619  0.000577      0  
727620  0.000783      0  
3.8000798014969916e-06
         hadm_id  temperature  hea

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from torch.utils.data import DataLoader, Dataset

# 处理缺失值
X = X.fillna(X.median())

# 选择特征和目标
features = [col for col in X.columns if col != 'hadm_id' and col != 'death']
scaler = MinMaxScaler()
X[features] = scaler.fit_transform(X[features])

# 创建LSTM输入序列
def create_sequences_grouped_by_hadm_id(df, features, target):
    sequences = []
    for hadm_id, group in df.groupby('hadm_id'):
        if len(group) > 1:  # 确保有足够的数据点
            group_X = group[features].values
            group_y = group[target].values[-1]
            sequences.append((group_X, group_y))
    return sequences

target_column = 'death'
sequences = create_sequences_grouped_by_hadm_id(X, features, target_column)

# 划分特征和目标
X_sequences = [seq[0] for seq in sequences]
y_sequences = [seq[1] for seq in sequences]

# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X_sequences, y_sequences, test_size=0.2, random_state=42)

# 自定义数据集类
class TimeSeriesDataset(Dataset):
    def __init__(self, X, y):
        self.X = [torch.tensor(seq, dtype=torch.float32) for seq in X]
        self.y = torch.tensor(y, dtype=torch.float32)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

train_dataset = TimeSeriesDataset(X_train, y_train)
test_dataset = TimeSeriesDataset(X_test, y_test)

def collate_fn(batch):
    batch_X, batch_y = zip(*batch)
    lengths = [len(x) for x in batch_X]
    padded_X = pad_sequence(batch_X, batch_first=True)
    batch_y = torch.tensor(batch_y, dtype=torch.float32)
    return padded_X, batch_y, lengths

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)

# 定义LSTM模型
class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(LSTMModel, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, dropout=0.2)
        self.fc = nn.Linear(hidden_size, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x, lengths):
        h0 = torch.zeros(num_layers, x.size(0), hidden_size).to(device)
        c0 = torch.zeros(num_layers, x.size(0), hidden_size).to(device)
        packed_input = pack_padded_sequence(x, lengths, batch_first=True, enforce_sorted=False)
        packed_output, (hn, cn) = self.lstm(packed_input, (h0, c0))
        out, _ = pad_packed_sequence(packed_output, batch_first=True)
        out = self.fc(out[:, -1, :])  # 取最后一个时间步的输出
        out = self.sigmoid(out)
        return out

# 模型参数
input_size = len(features)
hidden_size = 50
num_layers = 2
output_size = 1

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = LSTMModel(input_size, hidden_size, num_layers, output_size).to(device)

criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# 训练模型
num_epochs = 20
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for X_batch, y_batch, lengths in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)

        outputs = model(X_batch, lengths)
        loss = criterion(outputs, y_batch.unsqueeze(1))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss/len(train_loader):.4f}')

# 评估模型
model.eval()
y_true, y_pred = [], []
with torch.no_grad():
    for X_batch, y_batch, lengths in test_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        outputs = model(X_batch, lengths)
        y_true.extend(y_batch.cpu().numpy())
        y_pred.extend(outputs.cpu().numpy())

# 计算AUROC
y_pred = np.array(y_pred)
auroc = roc_auc_score(y_true, y_pred)
print(f'AUROC: {auroc:.4f}')

# 转换预测结果为二值
y_pred_binary = y_pred > 0.5

# 打印评估指标
print(confusion_matrix(y_true, y_pred_binary))
print(classification_report(y_true, y_pred_binary))

Epoch [1/20], Loss: 0.2321
Epoch [2/20], Loss: 0.0236
Epoch [3/20], Loss: 0.0051
Epoch [4/20], Loss: 0.0029
Epoch [5/20], Loss: 0.0027
Epoch [6/20], Loss: 0.0026
Epoch [7/20], Loss: 0.0027
Epoch [8/20], Loss: 0.0026
Epoch [9/20], Loss: 0.0026
Epoch [10/20], Loss: 0.0026
Epoch [11/20], Loss: 0.0026
Epoch [12/20], Loss: 0.0027
Epoch [13/20], Loss: 0.0026
Epoch [14/20], Loss: 0.0026
Epoch [15/20], Loss: 0.0026
Epoch [16/20], Loss: 0.0027
Epoch [17/20], Loss: 0.0026
Epoch [18/20], Loss: 0.0026
Epoch [19/20], Loss: 0.0026
Epoch [20/20], Loss: 0.0026
AUROC: 0.6036
[[36683     0]
 [    8     0]]
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     36683
         1.0       0.00      0.00      0.00         8

    accuracy                           1.00     36691
   macro avg       0.50      0.50      0.50     36691
weighted avg       1.00      1.00      1.00     36691



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


array([0, 0, 0, ..., 0, 0, 0])