In [1]:
import pandas as pd
import tensorflow.keras.backend as K
import tensorflow as tf
from tensorflow import keras 
from tensorflow.keras.layers import Input, Dense, Lambda
from tensorflow.keras.models import Model
from sklearn.preprocessing import MinMaxScaler
import numpy as np


In [2]:
# Load the CSV file
#icu_data = pd.read_csv('C:/Users/two_s/OneDrive/Desktop/ICUSTAYS.csv')
icu_data = pd.read_csv('C:/Users/two_s/OneDrive/Desktop/ICUSTAYS.csv')

In [3]:
icu_data.head()

Unnamed: 0,row_id,subject_id,hadm_id,icustay_id,dbsource,first_careunit,last_careunit,first_wardid,last_wardid,intime,outtime,los
0,12742,10006,142345,206504,carevue,MICU,MICU,52,52,2164-10-23 21:10:15,2164-10-25 12:21:07,1.6325
1,12747,10011,105331,232110,carevue,MICU,MICU,15,15,2126-08-14 22:34:00,2126-08-28 18:59:00,13.8507
2,12749,10013,165520,264446,carevue,MICU,MICU,15,15,2125-10-04 23:38:00,2125-10-07 15:13:52,2.6499
3,12754,10017,199207,204881,carevue,CCU,CCU,7,7,2149-05-29 18:52:29,2149-05-31 22:19:17,2.1436
4,12755,10019,177759,228977,carevue,MICU,MICU,15,15,2163-05-14 20:43:56,2163-05-16 03:47:04,1.2938


In [4]:
# Ensure intime and outtime are datetime objects
icu_data['intime'] = pd.to_datetime(icu_data['intime'])
icu_data['outtime'] = pd.to_datetime(icu_data['outtime'])

# Calculate the total ICU stay duration in hours
icu_data['icu_duration_hours'] = (icu_data['outtime'] - icu_data['intime']).dt.total_seconds() / 3600

# Extract useful time-based features from intime
icu_data['admission_hour'] = icu_data['intime'].dt.hour
icu_data['admission_dayofweek'] = icu_data['intime'].dt.dayofweek
icu_data = icu_data.drop(['intime', 'outtime'], axis=1)

icu_data.head()

Unnamed: 0,row_id,subject_id,hadm_id,icustay_id,dbsource,first_careunit,last_careunit,first_wardid,last_wardid,los,icu_duration_hours,admission_hour,admission_dayofweek
0,12742,10006,142345,206504,carevue,MICU,MICU,52,52,1.6325,39.181111,21,1
1,12747,10011,105331,232110,carevue,MICU,MICU,15,15,13.8507,332.416667,22,2
2,12749,10013,165520,264446,carevue,MICU,MICU,15,15,2.6499,63.597778,23,3
3,12754,10017,199207,204881,carevue,CCU,CCU,7,7,2.1436,51.446667,18,3
4,12755,10019,177759,228977,carevue,MICU,MICU,15,15,1.2938,31.052222,20,5


In [5]:

careunit_data=icu_data['first_careunit']

careunit_encoded = tf.keras.utils.to_categorical(
    careunit_data.astype('category').cat.codes
)

# Convert back to DataFrame for better readability
careunit_encoded_df = pd.DataFrame(
    careunit_encoded,
    columns=[f"careunit_{cat}" for cat in careunit_data.unique()]
)

icu_data_encoded = pd.concat([icu_data, careunit_encoded_df], axis=1)
icu_data_encoded.drop('first_careunit', axis=1, inplace=True)

# One-hot encode 'last_careunit'
last_careunit_data = icu_data['last_careunit']

last_careunit_encoded = tf.keras.utils.to_categorical(
    last_careunit_data.astype('category').cat.codes
)
last_careunit_encoded_df = pd.DataFrame(
    last_careunit_encoded,
    columns=[f"lastcareunit_{cat}" for cat in last_careunit_data.unique()]
)
icu_data_encoded = pd.concat([icu_data_encoded, last_careunit_encoded_df], axis=1)
icu_data_encoded.drop('last_careunit', axis=1, inplace=True)

# One-hot encode only 'dbsource' without losing other columns
dbsource_encoded = pd.get_dummies(icu_data['dbsource'], prefix='dbsource')
icu_data_encoded = pd.concat([icu_data_encoded, dbsource_encoded], axis=1)
icu_data_encoded.drop('dbsource', axis=1, inplace=True)


icu_data_encoded.head()


Unnamed: 0,row_id,subject_id,hadm_id,icustay_id,first_wardid,last_wardid,los,icu_duration_hours,admission_hour,admission_dayofweek,...,careunit_SICU,careunit_CSRU,careunit_TSICU,lastcareunit_MICU,lastcareunit_CCU,lastcareunit_SICU,lastcareunit_CSRU,lastcareunit_TSICU,dbsource_carevue,dbsource_metavision
0,12742,10006,142345,206504,52,52,1.6325,39.181111,21,1,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,True,False
1,12747,10011,105331,232110,15,15,13.8507,332.416667,22,2,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,True,False
2,12749,10013,165520,264446,15,15,2.6499,63.597778,23,3,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,True,False
3,12754,10017,199207,204881,7,7,2.1436,51.446667,18,3,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,True,False
4,12755,10019,177759,228977,15,15,1.2938,31.052222,20,5,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,True,False


In [6]:
scaler = MinMaxScaler()
column_names = icu_data_encoded.columns
icu_data_encoded = scaler.fit_transform(icu_data_encoded)


In [None]:
input_shape = (icu_data_encoded.shape[1],)
latent_dim = 10 

# ============================
# Encoder
# ============================

input_data = Input(shape=input_shape, name='encoder_input')

# Fully connected layers instead of Conv2D
x = Dense(64, activation='relu')(input_data)
x = Dense(32, activation='relu')(x)

# Mean and variance layers for latent space
z_mu = Dense(latent_dim, name='latent_mu')(x)
z_sigma = Dense(latent_dim, name='latent_sigma')(x)
z_sigma = Lambda(lambda t: K.clip(t, -10, 10))(z_sigma)


# Sampling function
def sample_z(args):
    z_mu, z_sigma = args
    eps = K.random_normal(shape=(K.shape(z_mu)[0], latent_dim))
    return z_mu + K.exp(z_sigma / 2) * eps


z = Lambda(sample_z, output_shape=(latent_dim,), name='z')([z_mu, z_sigma])

# Define encoder model
encoder = Model(input_data, [z_mu, z_sigma, z], name='encoder')
#encoder.summary()

# ============================
# Decoder
# ============================

# Start from the latent space
decoder_input = Input(shape=(latent_dim,), name='decoder_input')

# Fully connected layers for reconstruction
x = Dense(32, activation='relu')(decoder_input)
x = Dense(64, activation='relu')(x)
outputs = Dense(input_shape[0], activation='sigmoid', name='decoder_output')(x)

# Define decoder model
decoder = Model(decoder_input, outputs, name='decoder')
#decoder.summary()

# Apply the decoder to the sampled latent vector
z_decoded = decoder(z)

# ============================
# Define Custom VAE Loss
# ============================

class CustomLayer(tf.keras.layers.Layer):
    def vae_loss(self, x, z_decoded, z_mu, z_sigma):
        # Reconstruction loss
        recon_loss = tf.reduce_mean(tf.square(x - z_decoded))

        # KL Divergence loss
        kl_loss = -0.001 * tf.reduce_mean(1 + z_sigma - tf.square(z_mu) - K.exp(z_sigma))

        return recon_loss + 0.001 * kl_loss

    def call(self, inputs):
        x = inputs[0]
        z_decoded = inputs[1]
        z_mu = inputs[2]
        z_sigma = inputs[3]

        loss = self.vae_loss(x, z_decoded, z_mu, z_sigma)
        self.add_loss(loss)
        return x


# Apply custom loss layer
y = CustomLayer()([input_data, z_decoded, z_mu, z_sigma])

# ============================
# VAE Model
# ============================

vae = Model(input_data, y, name='vae')

# Compile the model
vae.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0005))
#vae.summary()

# ============================
# Train the VAE
# ============================

vae.fit(
    icu_data_encoded,
    epochs=50,
    batch_size=16,
    validation_split=0.2
)




Epoch 1/50
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 43ms/step - loss: 0.2042 - val_loss: 0.2107
Epoch 2/50
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - loss: 0.1967 - val_loss: 0.2055
Epoch 3/50
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - loss: 0.1934 - val_loss: 0.2056
Epoch 4/50
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - loss: 0.1862 - val_loss: 0.2012
Epoch 5/50
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - loss: 0.1793 - val_loss: 0.1925
Epoch 6/50
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - loss: 0.1693 - val_loss: 0.1905
Epoch 7/50
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - loss: 0.1623 - val_loss: 0.1921
Epoch 8/50
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - loss: 0.1535 - val_loss: 0.1875
Epoch 9/50
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

<keras.src.callbacks.history.History at 0x2148eaaab50>

In [None]:
# Generate new samples from random latent points
num_samples = 100
random_latent_vectors = np.random.normal(size=(num_samples, latent_dim))
generated_data = decoder.predict(random_latent_vectors)

# Convert generated data back to DataFrame
generated_data = scaler.inverse_transform(generated_data)

icu_data_encoded = pd.DataFrame(icu_data_encoded, columns=column_names)

generated_df = pd.DataFrame(generated_data, columns=icu_data_encoded.columns)

# Convert specific columns to integers — you can customize this list
int_columns = ['row_id', 'subject_id', 'hadm_id', 'icustay_id', 'first_wardid', 'last_wardid', 'admission_dayofweek']
generated_df[int_columns] = generated_df[int_columns].astype(int)
binary_columns = ['dbsource_carevue', 'dbsource_metavision', 'careunit_MICU', 'careunit_CCU',
                  'careunit_SICU', 'careunit_CSRU', 'careunit_TSICU',
                  'lastcareunit_MICU', 'lastcareunit_CCU', 'lastcareunit_SICU',
                  'lastcareunit_CSRU', 'lastcareunit_TSICU']

# Apply the 0.5 threshold for binary columns
generated_df[binary_columns] = (generated_df[binary_columns] > 0.5).astype(int)

# Reverse one-hot encoding function
def one_hot_decode(df, prefix):
    """Decode one-hot encoded columns into original categorical labels."""
    cols = [col for col in df.columns if col.startswith(prefix)]
    decoded_series = df[cols].idxmax(axis=1).str.replace(f'{prefix}_', '')
    return decoded_series

# Decode and restore original categorical columns
generated_df['first_careunit'] = one_hot_decode(generated_df, 'careunit')
generated_df['last_careunit'] = one_hot_decode(generated_df, 'lastcareunit')
generated_df['dbsource'] = one_hot_decode(generated_df, 'dbsource')

# Drop the old one-hot columns now that we’ve restored the original ones
one_hot_columns = [
    col for col in generated_df.columns 
    if col.startswith(('careunit_', 'lastcareunit_', 'dbsource_'))
]
generated_df.drop(columns=one_hot_columns, inplace=True)

# Ensure the columns are back in the original order
original_columns = list(icu_data.columns)
generated_df = generated_df[original_columns]



# Show the first few rows of generated data
print(generated_df.head())


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
Decoded data saved successfully!
   row_id  subject_id  hadm_id  icustay_id    dbsource first_careunit  \
0   23983       22798   150538      259874     carevue           MICU   
1   27039       26627   146301      253259     carevue           SICU   
2   33934       29828   147467      244572  metavision           SICU   
3   27069       27555   144368      245959     carevue           MICU   
4   27036       23581   152398      253252     carevue           SICU   

  last_careunit  first_wardid  last_wardid        los  icu_duration_hours  \
0          MICU            27           26   8.951124          332.634094   
1          SICU            18           19   7.301687          310.853241   
2          SICU            33           35   6.163352          320.366638   
3           CCU            29           30  16.306797          301.266998   
4          SICU            26           28  10.527401          357.6278

In [20]:
output_filename = "generated_icu_data.csv"
generated_df.to_csv(output_filename, index=False)

print(f"Generated data saved to {output_filename}")

Generated data saved to generated_icu_data.csv


In [31]:
import os

file_path = "generated_icu_data.csv"

# Check if the file exists first
if os.path.exists(file_path):
    os.remove(file_path)
    print(f"File '{file_path}' has been deleted.")
else:
    print(f"File '{file_path}' not found.")


File 'generated_icu_data.csv' has been deleted.
