In [50]:
import pandas as pd
import tensorflow.keras.backend as K
import tensorflow as tf
from tensorflow import keras 
from tensorflow.keras.layers import Input, Dense, Lambda
from tensorflow.keras.models import Model
from sklearn.preprocessing import MinMaxScaler
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [18]:
# Load the CSV file
#icu_data = pd.read_csv('C:/Users/two_s/OneDrive/Desktop/ICUSTAYS.csv')
icu_data = pd.read_csv('C:/Users/two_s/OneDrive/Desktop/ICUSTAYS.csv')

In [19]:
icu_data.head()

Unnamed: 0,row_id,subject_id,hadm_id,icustay_id,dbsource,first_careunit,last_careunit,first_wardid,last_wardid,intime,outtime,los
0,12742,10006,142345,206504,carevue,MICU,MICU,52,52,2164-10-23 21:10:15,2164-10-25 12:21:07,1.6325
1,12747,10011,105331,232110,carevue,MICU,MICU,15,15,2126-08-14 22:34:00,2126-08-28 18:59:00,13.8507
2,12749,10013,165520,264446,carevue,MICU,MICU,15,15,2125-10-04 23:38:00,2125-10-07 15:13:52,2.6499
3,12754,10017,199207,204881,carevue,CCU,CCU,7,7,2149-05-29 18:52:29,2149-05-31 22:19:17,2.1436
4,12755,10019,177759,228977,carevue,MICU,MICU,15,15,2163-05-14 20:43:56,2163-05-16 03:47:04,1.2938


In [20]:
# Ensure intime and outtime are datetime objects
icu_data['intime'] = pd.to_datetime(icu_data['intime'])
icu_data['outtime'] = pd.to_datetime(icu_data['outtime'])

# Calculate the total ICU stay duration in hours
icu_data['icu_duration_hours'] = (icu_data['outtime'] - icu_data['intime']).dt.total_seconds() / 3600

# Extract useful time-based features from intime
icu_data['admission_hour'] = icu_data['intime'].dt.hour
icu_data['admission_dayofweek'] = icu_data['intime'].dt.dayofweek
icu_data = icu_data.drop(['intime', 'outtime'], axis=1)

icu_data.head()

Unnamed: 0,row_id,subject_id,hadm_id,icustay_id,dbsource,first_careunit,last_careunit,first_wardid,last_wardid,los,icu_duration_hours,admission_hour,admission_dayofweek
0,12742,10006,142345,206504,carevue,MICU,MICU,52,52,1.6325,39.181111,21,1
1,12747,10011,105331,232110,carevue,MICU,MICU,15,15,13.8507,332.416667,22,2
2,12749,10013,165520,264446,carevue,MICU,MICU,15,15,2.6499,63.597778,23,3
3,12754,10017,199207,204881,carevue,CCU,CCU,7,7,2.1436,51.446667,18,3
4,12755,10019,177759,228977,carevue,MICU,MICU,15,15,1.2938,31.052222,20,5


In [21]:

careunit_data=icu_data['first_careunit']

careunit_encoded = tf.keras.utils.to_categorical(
    careunit_data.astype('category').cat.codes
)

# Convert back to DataFrame for better readability
careunit_encoded_df = pd.DataFrame(
    careunit_encoded,
    columns=[f"careunit_{cat}" for cat in careunit_data.unique()]
)

icu_data_encoded = pd.concat([icu_data, careunit_encoded_df], axis=1)
icu_data_encoded.drop('first_careunit', axis=1, inplace=True)

# One-hot encode 'last_careunit'
last_careunit_data = icu_data['last_careunit']

last_careunit_encoded = tf.keras.utils.to_categorical(
    last_careunit_data.astype('category').cat.codes
)
last_careunit_encoded_df = pd.DataFrame(
    last_careunit_encoded,
    columns=[f"lastcareunit_{cat}" for cat in last_careunit_data.unique()]
)
icu_data_encoded = pd.concat([icu_data_encoded, last_careunit_encoded_df], axis=1)
icu_data_encoded.drop('last_careunit', axis=1, inplace=True)

# One-hot encode only 'dbsource' without losing other columns
dbsource_encoded = pd.get_dummies(icu_data['dbsource'], prefix='dbsource')
icu_data_encoded = pd.concat([icu_data_encoded, dbsource_encoded], axis=1)
icu_data_encoded.drop('dbsource', axis=1, inplace=True)


icu_data_encoded.head()


Unnamed: 0,row_id,subject_id,hadm_id,icustay_id,first_wardid,last_wardid,los,icu_duration_hours,admission_hour,admission_dayofweek,...,careunit_SICU,careunit_CSRU,careunit_TSICU,lastcareunit_MICU,lastcareunit_CCU,lastcareunit_SICU,lastcareunit_CSRU,lastcareunit_TSICU,dbsource_carevue,dbsource_metavision
0,12742,10006,142345,206504,52,52,1.6325,39.181111,21,1,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,True,False
1,12747,10011,105331,232110,15,15,13.8507,332.416667,22,2,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,True,False
2,12749,10013,165520,264446,15,15,2.6499,63.597778,23,3,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,True,False
3,12754,10017,199207,204881,7,7,2.1436,51.446667,18,3,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,True,False
4,12755,10019,177759,228977,15,15,1.2938,31.052222,20,5,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,True,False


In [22]:
scaler = MinMaxScaler()
column_names = icu_data_encoded.columns
icu_data_encoded = scaler.fit_transform(icu_data_encoded)


In [25]:
X_train, X_val = train_test_split(icu_data_encoded, test_size=0.2, random_state=42)

input_shape = (X_train.shape[1],)
latent_dim = 10 

# ============================
# Encoder
# ============================

input_data = Input(shape=input_shape, name='encoder_input')

# Fully connected layers instead of Conv2D
x = Dense(64, activation='relu')(input_data)
x = Dense(32, activation='relu')(x)

# Mean and variance layers for latent space
z_mu = Dense(latent_dim, name='latent_mu')(x)
z_sigma = Dense(latent_dim, name='latent_sigma')(x)
z_sigma = Lambda(lambda t: K.clip(t, -10, 10))(z_sigma)


# Sampling function
def sample_z(args):
    z_mu, z_sigma = args
    eps = K.random_normal(shape=(K.shape(z_mu)[0], latent_dim))
    return z_mu + K.exp(z_sigma / 2) * eps


z = Lambda(sample_z, output_shape=(latent_dim,), name='z')([z_mu, z_sigma])

# Define encoder model
encoder = Model(input_data, [z_mu, z_sigma, z], name='encoder')
#encoder.summary()

# ============================
# Decoder
# ============================

# Start from the latent space
decoder_input = Input(shape=(latent_dim,), name='decoder_input')

# Fully connected layers for reconstruction
x = Dense(32, activation='relu')(decoder_input)
x = Dense(64, activation='relu')(x)
outputs = Dense(input_shape[0], activation='sigmoid', name='decoder_output')(x)

# Define decoder model
decoder = Model(decoder_input, outputs, name='decoder')
#decoder.summary()

# Apply the decoder to the sampled latent vector
z_decoded = decoder(z)

# ============================
# Define Custom VAE Loss
# ============================

class CustomLayer(tf.keras.layers.Layer):
    def vae_loss(self, x, z_decoded, z_mu, z_sigma):
        # Reconstruction loss
        recon_loss = tf.reduce_mean(tf.square(x - z_decoded))

        # KL Divergence loss
        kl_loss = -0.001 * tf.reduce_mean(1 + z_sigma - tf.square(z_mu) - K.exp(z_sigma))

        return recon_loss + 0.001 * kl_loss

    def call(self, inputs):
        x = inputs[0]
        z_decoded = inputs[1]
        z_mu = inputs[2]
        z_sigma = inputs[3]

        loss = self.vae_loss(x, z_decoded, z_mu, z_sigma)
        self.add_loss(loss)
        return x


# Apply custom loss layer
y = CustomLayer()([input_data, z_decoded, z_mu, z_sigma])

# ============================
# VAE Model
# ============================

vae = Model(input_data, y, name='vae')

# Compile the model
vae.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0005))
#vae.summary()

# ============================
# Train the VAE
# ============================

vae.fit(
    X_train,
    epochs=50,
    batch_size=16,
    validation_split=0.2,
    validation_data=(X_val, None)
)
vae.save("vae_saved_model.keras")


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [None]:
# Extract z from the encoder
_, _, z_val = encoder.predict(X_val)

# Generate reconstructed data using decoder
reconstructed_val = decoder.predict(z_val)


mse = mean_squared_error(X_val, reconstructed_val)
mae = mean_absolute_error(X_val, reconstructed_val)

print(f"Validation MSE: {mse:.4f}")
print(f"Validation MAE: {mae:.4f}")


Validation MSE: 0.0389
Validation MAE: 0.1154


In [54]:
# Generate new samples from random latent points
num_samples = 100
random_latent_vectors = np.random.normal(size=(num_samples, latent_dim))
generated_data = decoder.predict(random_latent_vectors)

# Convert generated data back to DataFrame
generated_data = scaler.inverse_transform(generated_data)

X_train = pd.DataFrame(X_train, columns=column_names)

generated_df = pd.DataFrame(generated_data, columns=X_train.columns)

id_columns = ['row_id', 'subject_id', 'hadm_id', 'icustay_id', 'first_wardid', 'last_wardid']
generated_df[id_columns] = generated_df[id_columns].round().astype(int)

# Step 2: Decode one-hot encoded columns to original categorical values
def decode_one_hot(row, prefix):
    cols = [col for col in row.index if col.startswith(prefix)]
    if not cols:
        return np.nan
    return cols[np.argmax(row[cols])].replace(f"{prefix}_", "")

# Reconstruct dbsource
generated_df['dbsource'] = generated_df.apply(lambda row: decode_one_hot(row, 'dbsource'), axis=1)

# Reconstruct careunit and lastcareunit
generated_df['first_careunit'] = generated_df.apply(lambda row: decode_one_hot(row, 'careunit'), axis=1)
generated_df['last_careunit'] = generated_df.apply(lambda row: decode_one_hot(row, 'lastcareunit'), axis=1)

# Step 3: Reconstruct intime and outtime
# Use arbitrary base date (adjust if needed)
base_date = pd.Timestamp("2025-01-01")
generated_df['admission_hour'] = generated_df['admission_hour'].round().astype(int)
generated_df['admission_dayofweek'] = generated_df['admission_dayofweek'].round().astype(int)

generated_df['intime'] = base_date + pd.to_timedelta(generated_df['admission_dayofweek'], unit='D') + \
                         pd.to_timedelta(generated_df['admission_hour'], unit='h')

generated_df['icu_duration_hours'] = generated_df['icu_duration_hours'].clip(lower=0)
generated_df['outtime'] = generated_df['intime'] + pd.to_timedelta(generated_df['icu_duration_hours'], unit='h')

# Step 4: Select only the original ICUSTAYS columns (load ICUSTAYS to get columns)
icustays_df = pd.read_csv('C:/Users/two_s/OneDrive/Desktop/ICUSTAYS.csv')  # Load just to get column structure
final_columns = icustays_df.columns.tolist()
generated_df = generated_df[final_columns]

# Show the first few rows of generated data
print(generated_df.head())


   row_id  subject_id  hadm_id  icustay_id    dbsource first_careunit  \
0   26898       26203   148891      247875     carevue           CSRU   
1   31825       30064   149091      252937  metavision           CSRU   
2   17184       16608   151047      246879     carevue           SICU   
3   21673       22355   157249      243874     carevue           CSRU   
4   25665       22674   147311      247935     carevue           SICU   

  last_careunit  first_wardid  last_wardid              intime  \
0          CSRU            38           36 2025-01-03 13:00:00   
1          MICU            34           29 2025-01-04 12:00:00   
2          CSRU            28           27 2025-01-04 13:00:00   
3          MICU            28           27 2025-01-04 13:00:00   
4          CSRU            30           28 2025-01-04 13:00:00   

                        outtime        los  
0 2025-01-18 16:47:30.512695311  13.559687  
1 2025-01-15 10:10:04.687500000  13.558706  
2 2025-01-12 15:02:17.2741699

In [55]:
output_filename = "generated_icu_data.csv"
generated_df.to_csv(output_filename, index=False)

print(f"Generated data saved to {output_filename}")

Generated data saved to generated_icu_data.csv


In [11]:
import os

file_path = "generated_icu_data.csv"

# Check if the file exists first
if os.path.exists(file_path):
    os.remove(file_path)
    print(f"File '{file_path}' has been deleted.")
else:
    print(f"File '{file_path}' not found.")


File 'generated_icu_data.csv' has been deleted.
