In [12]:
import pandas as pd
import tensorflow.keras.backend as K
import tensorflow as tf
from tensorflow import keras 
from tensorflow.keras.layers import Input, Dense, Lambda
from tensorflow.keras.models import Model
from sklearn.preprocessing import MinMaxScaler
import numpy as np


In [25]:
# Load the CSV file
#icu_data = pd.read_csv('C:/Users/two_s/OneDrive/Desktop/ICUSTAYS.csv')
icu_data = pd.read_csv('C:/Users/two_s/OneDrive/Desktop/ICUSTAYS.csv')

In [23]:
icu_data.head()

Unnamed: 0,row_id,subject_id,hadm_id,icustay_id,dbsource,first_careunit,last_careunit,first_wardid,last_wardid,los,icu_duration_hours,admission_hour,admission_dayofweek
0,12742,10006,142345,206504,carevue,MICU,MICU,52,52,1.6325,39.181111,21,1
1,12747,10011,105331,232110,carevue,MICU,MICU,15,15,13.8507,332.416667,22,2
2,12749,10013,165520,264446,carevue,MICU,MICU,15,15,2.6499,63.597778,23,3
3,12754,10017,199207,204881,carevue,CCU,CCU,7,7,2.1436,51.446667,18,3
4,12755,10019,177759,228977,carevue,MICU,MICU,15,15,1.2938,31.052222,20,5


In [26]:
# Ensure intime and outtime are datetime objects
icu_data['intime'] = pd.to_datetime(icu_data['intime'])
icu_data['outtime'] = pd.to_datetime(icu_data['outtime'])

# Calculate the total ICU stay duration in hours
icu_data['icu_duration_hours'] = (icu_data['outtime'] - icu_data['intime']).dt.total_seconds() / 3600

# Extract useful time-based features from intime
icu_data['admission_hour'] = icu_data['intime'].dt.hour
icu_data['admission_dayofweek'] = icu_data['intime'].dt.dayofweek
icu_data = icu_data.drop(['intime', 'outtime'], axis=1)

icu_data.head()

Unnamed: 0,row_id,subject_id,hadm_id,icustay_id,dbsource,first_careunit,last_careunit,first_wardid,last_wardid,los,icu_duration_hours,admission_hour,admission_dayofweek
0,12742,10006,142345,206504,carevue,MICU,MICU,52,52,1.6325,39.181111,21,1
1,12747,10011,105331,232110,carevue,MICU,MICU,15,15,13.8507,332.416667,22,2
2,12749,10013,165520,264446,carevue,MICU,MICU,15,15,2.6499,63.597778,23,3
3,12754,10017,199207,204881,carevue,CCU,CCU,7,7,2.1436,51.446667,18,3
4,12755,10019,177759,228977,carevue,MICU,MICU,15,15,1.2938,31.052222,20,5


In [27]:

careunit_data=icu_data['first_careunit']

careunit_encoded = tf.keras.utils.to_categorical(
    careunit_data.astype('category').cat.codes
)

# Convert back to DataFrame for better readability
careunit_encoded_df = pd.DataFrame(
    careunit_encoded,
    columns=[f"careunit_{cat}" for cat in careunit_data.unique()]
)

icu_data_encoded = pd.concat([icu_data, careunit_encoded_df], axis=1)
icu_data_encoded.drop('first_careunit', axis=1, inplace=True)

# One-hot encode 'last_careunit'
last_careunit_data = icu_data['last_careunit']

last_careunit_encoded = tf.keras.utils.to_categorical(
    last_careunit_data.astype('category').cat.codes
)
last_careunit_encoded_df = pd.DataFrame(
    last_careunit_encoded,
    columns=[f"lastcareunit_{cat}" for cat in last_careunit_data.unique()]
)
icu_data_encoded = pd.concat([icu_data_encoded, last_careunit_encoded_df], axis=1)
icu_data_encoded.drop('last_careunit', axis=1, inplace=True)

# One-hot encode only 'dbsource' without losing other columns
dbsource_encoded = pd.get_dummies(icu_data['dbsource'], prefix='dbsource')
icu_data_encoded = pd.concat([icu_data_encoded, dbsource_encoded], axis=1)
icu_data_encoded.drop('dbsource', axis=1, inplace=True)


icu_data_encoded.head()


Unnamed: 0,row_id,subject_id,hadm_id,icustay_id,first_wardid,last_wardid,los,icu_duration_hours,admission_hour,admission_dayofweek,...,careunit_SICU,careunit_CSRU,careunit_TSICU,lastcareunit_MICU,lastcareunit_CCU,lastcareunit_SICU,lastcareunit_CSRU,lastcareunit_TSICU,dbsource_carevue,dbsource_metavision
0,12742,10006,142345,206504,52,52,1.6325,39.181111,21,1,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,True,False
1,12747,10011,105331,232110,15,15,13.8507,332.416667,22,2,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,True,False
2,12749,10013,165520,264446,15,15,2.6499,63.597778,23,3,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,True,False
3,12754,10017,199207,204881,7,7,2.1436,51.446667,18,3,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,True,False
4,12755,10019,177759,228977,15,15,1.2938,31.052222,20,5,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,True,False


In [28]:
scaler = MinMaxScaler()
column_names = icu_data_encoded.columns
icu_data_encoded = scaler.fit_transform(icu_data_encoded)


In [None]:
# Assuming icu_data_encoded is your preprocessed DataFrame
input_shape = (icu_data_encoded.shape[1],)
latent_dim = 10  # Keep the same latent space size

# ============================
# Encoder
# ============================

input_data = Input(shape=input_shape, name='encoder_input')

# Fully connected layers instead of Conv2D
x = Dense(64, activation='relu')(input_data)
x = Dense(32, activation='relu')(x)

# Mean and variance layers for latent space
z_mu = Dense(latent_dim, name='latent_mu')(x)
z_sigma = Dense(latent_dim, name='latent_sigma')(x)
z_sigma = Lambda(lambda t: K.clip(t, -10, 10))(z_sigma)


# Sampling function
def sample_z(args):
    z_mu, z_sigma = args
    eps = K.random_normal(shape=(K.shape(z_mu)[0], latent_dim))
    return z_mu + K.exp(z_sigma / 2) * eps


z = Lambda(sample_z, output_shape=(latent_dim,), name='z')([z_mu, z_sigma])

# Define encoder model
encoder = Model(input_data, [z_mu, z_sigma, z], name='encoder')
#encoder.summary()

# ============================
# Decoder
# ============================

# Start from the latent space
decoder_input = Input(shape=(latent_dim,), name='decoder_input')

# Fully connected layers for reconstruction
x = Dense(32, activation='relu')(decoder_input)
x = Dense(64, activation='relu')(x)

# Output layer to reconstruct the original feature shape
outputs = Dense(input_shape[0], activation='sigmoid', name='decoder_output')(x)

# Define decoder model
decoder = Model(decoder_input, outputs, name='decoder')
#decoder.summary()

# Apply the decoder to the sampled latent vector
z_decoded = decoder(z)

# ============================
# Define Custom VAE Loss
# ============================

class CustomLayer(tf.keras.layers.Layer):
    def vae_loss(self, x, z_decoded, z_mu, z_sigma):
        # Reconstruction loss
        recon_loss = tf.reduce_mean(tf.square(x - z_decoded))

        # KL Divergence loss
        kl_loss = -0.001 * tf.reduce_mean(1 + z_sigma - tf.square(z_mu) - K.exp(z_sigma))

        return recon_loss + 0.001 * kl_loss

    def call(self, inputs):
        x = inputs[0]
        z_decoded = inputs[1]
        z_mu = inputs[2]
        z_sigma = inputs[3]

        loss = self.vae_loss(x, z_decoded, z_mu, z_sigma)
        self.add_loss(loss)
        return x


# Apply custom loss layer
y = CustomLayer()([input_data, z_decoded, z_mu, z_sigma])

# ============================
# VAE Model
# ============================

vae = Model(input_data, y, name='vae')

# Compile the model
vae.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0005))
#vae.summary()

# ============================
# Train the VAE
# ============================

vae.fit(
    icu_data_encoded,
    epochs=50,
    batch_size=16,
    validation_split=0.2
)



Epoch 1/50
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 36ms/step - loss: 0.2217 - val_loss: 0.2248
Epoch 2/50
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - loss: 0.2114 - val_loss: 0.2104
Epoch 3/50
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - loss: 0.1940 - val_loss: 0.1950
Epoch 4/50
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - loss: 0.1900 - val_loss: 0.1934
Epoch 5/50
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - loss: 0.1799 - val_loss: 0.1856
Epoch 6/50
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - loss: 0.1707 - val_loss: 0.1764
Epoch 7/50
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - loss: 0.1606 - val_loss: 0.1636
Epoch 8/50
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - loss: 0.1502 - val_loss: 0.1652
Epoch 9/50
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [

<keras.src.callbacks.history.History at 0x1dfa440aed0>

In [30]:
# Generate new samples from random latent points
num_samples = 100
random_latent_vectors = np.random.normal(size=(num_samples, latent_dim))
generated_data = decoder.predict(random_latent_vectors)

# Convert generated data back to DataFrame
generated_data = scaler.inverse_transform(generated_data)

icu_data_encoded = pd.DataFrame(icu_data_encoded, columns=column_names)

generated_df = pd.DataFrame(generated_data, columns=icu_data_encoded.columns)

# Ensure column types match original DataFrame
for col in icu_data_encoded.columns:
    if icu_data_encoded[col].dtype == 'int64':
        generated_df[col] = generated_df[col].round().astype(int)
    elif icu_data_encoded[col].dtype == 'bool':
        generated_df[col] = generated_df[col] > 0.5
    else:
        generated_df[col] = generated_df[col].astype(icu_data_encoded[col].dtype)

# Show the first few rows of generated data
print(generated_df.head())


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
         row_id    subject_id        hadm_id     icustay_id  first_wardid  \
0  28328.708984  25217.808594  148851.281250  246354.062500     30.192980   
1  24299.478516  24891.111328  151190.234375  262197.625000     27.982954   
2  26617.425781  28270.849609  151604.531250  249247.000000     26.711040   
3  30001.880859  28945.236328  150141.296875  253827.703125     31.799772   
4  31352.638672  21605.820312  145220.296875  241829.859375     27.353455   

   last_wardid        los  icu_duration_hours  admission_hour  \
0    30.795717  14.668526          368.714813       12.198595   
1    30.646574   7.307881          422.063232       11.032572   
2    27.716604  11.921608          390.286011       11.907008   
3    32.955509  15.157915          359.528534       11.637275   
4    25.056822   6.739770          233.708542       10.406856   

   admission_dayofweek  ...  careunit_SICU  careunit_CSRU  careunit_TSICU 

In [32]:
output_filename = "generated_icu_data.csv"
generated_df.to_csv(output_filename, index=False)

print(f"Generated data saved to {output_filename}")

Generated data saved to generated_icu_data.csv


In [31]:
import os

file_path = "generated_icu_data.csv"

# Check if the file exists first
if os.path.exists(file_path):
    os.remove(file_path)
    print(f"File '{file_path}' has been deleted.")
else:
    print(f"File '{file_path}' not found.")


File 'generated_icu_data.csv' has been deleted.
