In [2]:
import pandas as pd
import tensorflow.keras.backend as K
import tensorflow as tf
from tensorflow import keras 
from tensorflow.keras.layers import Input, Dense, Lambda
from tensorflow.keras.models import Model
from sklearn.preprocessing import MinMaxScaler
import numpy as np
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from tensorflow.keras import regularizers
from tensorflow.keras.layers import Dropout, LeakyReLU

In [3]:
class PlanarFlow(tf.keras.layers.Layer):
    def __init__(self):
        super(PlanarFlow, self).__init__()

    def build(self, input_shape):
        # Initialize parameters for Planar Flow
        self.u = self.add_weight(name='u', shape=(input_shape[1],), initializer='random_normal', trainable=True)
        self.w = self.add_weight(name='w', shape=(input_shape[1],), initializer='random_normal', trainable=True)
        self.b = self.add_weight(name='b', shape=(1,), initializer='zeros', trainable=True)

    def call(self, z):
        # Planar flow transformation
        activation = tf.tanh(tf.linalg.matmul(z, tf.expand_dims(self.w, -1)) + self.b)
        flow = z + tf.expand_dims(self.u, 0) * activation
        return flow

    def log_det_jacobian(self, z):
        # Compute log-determinant of Jacobian for the flow
        activation = tf.tanh(tf.linalg.matmul(z, tf.expand_dims(self.w, -1)) + self.b)
        dot_product = tf.reduce_sum(self.u * activation, axis=-1, keepdims=True)
        jacobian_det = 1 + tf.reduce_sum(self.u * self.w * (1 - activation**2), axis=-1, keepdims=True)
        return tf.reduce_sum(tf.math.log(tf.abs(jacobian_det)), axis=1)


In [4]:
# Load the CSV file
#icu_data = pd.read_csv('C:/Users/two_s/OneDrive/Desktop/ICUSTAYS.csv')
icu_data = pd.read_csv('C:/Users/hacke/Desktop/dataset/ICUSTAYS.csv')

In [5]:
icu_data.head()

Unnamed: 0,row_id,subject_id,hadm_id,icustay_id,dbsource,first_careunit,last_careunit,first_wardid,last_wardid,intime,outtime,los
0,12742,10006,142345,206504,carevue,MICU,MICU,52,52,2164-10-23 21:10:15,2164-10-25 12:21:07,1.6325
1,12747,10011,105331,232110,carevue,MICU,MICU,15,15,2126-08-14 22:34:00,2126-08-28 18:59:00,13.8507
2,12749,10013,165520,264446,carevue,MICU,MICU,15,15,2125-10-04 23:38:00,2125-10-07 15:13:52,2.6499
3,12754,10017,199207,204881,carevue,CCU,CCU,7,7,2149-05-29 18:52:29,2149-05-31 22:19:17,2.1436
4,12755,10019,177759,228977,carevue,MICU,MICU,15,15,2163-05-14 20:43:56,2163-05-16 03:47:04,1.2938


In [6]:
# Ensure intime and outtime are datetime objects
icu_data['intime'] = pd.to_datetime(icu_data['intime'])
icu_data['outtime'] = pd.to_datetime(icu_data['outtime'])

# Calculate the total ICU stay duration in hours
icu_data['icu_duration_hours'] = (icu_data['outtime'] - icu_data['intime']).dt.total_seconds() / 3600

# Extract useful time-based features from intime
icu_data['admission_hour'] = icu_data['intime'].dt.hour
icu_data['admission_dayofweek'] = icu_data['intime'].dt.dayofweek
icu_data = icu_data.drop(['intime', 'outtime'], axis=1)

icu_data.head()

Unnamed: 0,row_id,subject_id,hadm_id,icustay_id,dbsource,first_careunit,last_careunit,first_wardid,last_wardid,los,icu_duration_hours,admission_hour,admission_dayofweek
0,12742,10006,142345,206504,carevue,MICU,MICU,52,52,1.6325,39.181111,21,1
1,12747,10011,105331,232110,carevue,MICU,MICU,15,15,13.8507,332.416667,22,2
2,12749,10013,165520,264446,carevue,MICU,MICU,15,15,2.6499,63.597778,23,3
3,12754,10017,199207,204881,carevue,CCU,CCU,7,7,2.1436,51.446667,18,3
4,12755,10019,177759,228977,carevue,MICU,MICU,15,15,1.2938,31.052222,20,5


In [7]:

careunit_data=icu_data['first_careunit']

careunit_encoded = tf.keras.utils.to_categorical(
    careunit_data.astype('category').cat.codes
)

# Convert back to DataFrame for better readability
careunit_encoded_df = pd.DataFrame(
    careunit_encoded,
    columns=[f"careunit_{cat}" for cat in careunit_data.unique()]
)

icu_data_encoded = pd.concat([icu_data, careunit_encoded_df], axis=1)
icu_data_encoded.drop('first_careunit', axis=1, inplace=True)

# One-hot encode 'last_careunit'
last_careunit_data = icu_data['last_careunit']

last_careunit_encoded = tf.keras.utils.to_categorical(
    last_careunit_data.astype('category').cat.codes
)
last_careunit_encoded_df = pd.DataFrame(
    last_careunit_encoded,
    columns=[f"lastcareunit_{cat}" for cat in last_careunit_data.unique()]
)
icu_data_encoded = pd.concat([icu_data_encoded, last_careunit_encoded_df], axis=1)
icu_data_encoded.drop('last_careunit', axis=1, inplace=True)

# One-hot encode only 'dbsource' without losing other columns
dbsource_encoded = pd.get_dummies(icu_data['dbsource'], prefix='dbsource')
icu_data_encoded = pd.concat([icu_data_encoded, dbsource_encoded], axis=1)
icu_data_encoded.drop('dbsource', axis=1, inplace=True)


icu_data_encoded.head()


Unnamed: 0,row_id,subject_id,hadm_id,icustay_id,first_wardid,last_wardid,los,icu_duration_hours,admission_hour,admission_dayofweek,...,careunit_SICU,careunit_CSRU,careunit_TSICU,lastcareunit_MICU,lastcareunit_CCU,lastcareunit_SICU,lastcareunit_CSRU,lastcareunit_TSICU,dbsource_carevue,dbsource_metavision
0,12742,10006,142345,206504,52,52,1.6325,39.181111,21,1,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,True,False
1,12747,10011,105331,232110,15,15,13.8507,332.416667,22,2,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,True,False
2,12749,10013,165520,264446,15,15,2.6499,63.597778,23,3,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,True,False
3,12754,10017,199207,204881,7,7,2.1436,51.446667,18,3,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,True,False
4,12755,10019,177759,228977,15,15,1.2938,31.052222,20,5,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,True,False


In [8]:
scaler = MinMaxScaler()
column_names = icu_data_encoded.columns
icu_data_encoded = scaler.fit_transform(icu_data_encoded)


In [9]:
class FlowVAE(tf.keras.layers.Layer):
    def __init__(self, n_flows=5, **kwargs):
        super(FlowVAE, self).__init__(**kwargs)
        self.n_flows = n_flows
        self.flow_layers = [PlanarFlow() for _ in range(self.n_flows)]  # Create Planar Flow layers
    
    def call(self, inputs):
        z_mu, z_sigma = inputs
        eps = K.random_normal(shape=(K.shape(z_mu)[0], latent_dim))
        z = z_mu + K.exp(z_sigma / 2) * eps

        log_det_jacobian_total = 0.0
        # Apply Planar Flows and accumulate log-determinant of Jacobian
        for flow_layer in self.flow_layers:
            z = flow_layer(z)  # Apply Planar Flow transformation
            log_det_jacobian_total += flow_layer.log_det_jacobian(z)
        
        return z, log_det_jacobian_total

    def get_config(self):
        # Return the configuration of the FlowVAE layer
        config = super().get_config()
        config.update({
            "n_flows": self.n_flows,
        })
        return config


In [18]:

# --- Data split, assume icu_data_encoded is already preprocessed ---
X_train, X_val = train_test_split(icu_data_encoded, test_size=0.2, random_state=42)
input_shape = (X_train.shape[1],)
latent_dim = 32

# ============================
# Encoder
# ============================
input_data = Input(shape=input_shape, name='encoder_input')

x = Dense(128)(input_data)
x = LeakyReLU()(x)
x = Dropout(0.3)(x)
x = Dense(64)(x)
x = LeakyReLU()(x)
x = Dropout(0.3)(x)
x = Dense(32)(x)
x = LeakyReLU()(x)

z_mu = Dense(latent_dim, name='latent_mu')(x)
z_sigma = Dense(latent_dim, name='latent_sigma')(x)
z_sigma = Lambda(lambda t: K.clip(t, -10, 10))(z_sigma)

# ----------------------------
# Apply Flow Layers
# ----------------------------
# Assume FlowVAE and its PlanarFlow components are defined as before.
flow_layer = FlowVAE(n_flows=5)
z, log_det_jacobian_total = flow_layer([z_mu, z_sigma])

encoder = Model(input_data, [z_mu, z_sigma, z], name="encoder")

# ============================
# Decoder
# ============================
decoder_input = Input(shape=(latent_dim,), name='decoder_input')
x = Dense(32)(decoder_input)
x = LeakyReLU()(x)
x = Dropout(0.3)(x)
x = Dense(64)(x)
x = LeakyReLU()(x)
x = Dropout(0.3)(x)
x = Dense(128)(x)
x = LeakyReLU()(x)
decoder_output = Dense(input_shape[0], activation='sigmoid')(x)
decoder = Model(decoder_input, decoder_output, name="decoder")

# ============================
# VAE Assembly and Custom Loss
# ============================
z_decoded = decoder(z)
class VAEOutputWithLoss(tf.keras.layers.Layer):
    def __init__(self, l2_reg=1e-4, **kwargs):
        super(VAEOutputWithLoss, self).__init__(**kwargs)
        self.l2_reg = l2_reg

    def get_config(self):
        config = super().get_config()
        config.update({
            "l2_reg": self.l2_reg
        })
        return config

    def call(self, inputs):
        x_in, x_decoded, mu, sigma, ldj = inputs

        # --- Reconstruction ---
        recon = tf.reduce_mean(tf.square(x_in - x_decoded))

        # --- KL Divergence ---
        kl = -0.001 * tf.reduce_mean(1 + sigma - tf.square(mu) - K.exp(sigma))

        # --- Manual L2 regularization ---
        kernel_weights = [w for w in self.trainable_weights if 'kernel' in w.name]
        if kernel_weights:
            reg_losses = tf.add_n([
                tf.reduce_sum(tf.square(w)) for w in kernel_weights
            ]) * self.l2_reg
        else:
            reg_losses = 0.0

        total = recon + kl - ldj + reg_losses
        self.add_loss(total)
        return x_decoded




# Apply the custom loss-wrapping layer
loss_layer = VAEOutputWithLoss()
output_with_loss = loss_layer([input_data, z_decoded, z_mu, z_sigma, log_det_jacobian_total])

vae = Model(input_data, output_with_loss, name="vae")

vae.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001))



vae.fit(
    X_train,
    epochs=50,
    batch_size=32,
    validation_data=(X_val, None)  # X_val is input; no target needed
)


vae.save("flow_vae_model.h5")

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [19]:
# Extract z from the encoder
_, _, z_val = encoder.predict(X_val)

# Generate reconstructed data using decoder
reconstructed_val = decoder.predict(z_val)


mse = mean_squared_error(X_val, reconstructed_val)
mae = mean_absolute_error(X_val, reconstructed_val)

print(f"Validation MSE: {mse:.4f}")
print(f"Validation MAE: {mae:.4f}")

Validation MSE: 0.0574
Validation MAE: 0.1579


In [20]:
# Generate new samples from random latent points
num_samples = 100
random_latent_vectors = np.random.normal(size=(num_samples, latent_dim))
generated_data = decoder.predict(random_latent_vectors)

# Convert generated data back to DataFrame
generated_data = scaler.inverse_transform(generated_data)

X_train = pd.DataFrame(X_train, columns=column_names)

generated_df = pd.DataFrame(generated_data, columns=X_train.columns)

id_columns = ['row_id', 'subject_id', 'hadm_id', 'icustay_id', 'first_wardid', 'last_wardid']
generated_df[id_columns] = generated_df[id_columns].round().astype(int)

# Step 2: Decode one-hot encoded columns to original categorical values
def decode_one_hot(row, prefix):
    cols = [col for col in row.index if col.startswith(prefix)]
    if not cols:
        return np.nan
    return cols[np.argmax(row[cols])].replace(f"{prefix}_", "")

# Reconstruct dbsource
generated_df['dbsource'] = generated_df.apply(lambda row: decode_one_hot(row, 'dbsource'), axis=1)

# Reconstruct careunit and lastcareunit
generated_df['first_careunit'] = generated_df.apply(lambda row: decode_one_hot(row, 'careunit'), axis=1)
generated_df['last_careunit'] = generated_df.apply(lambda row: decode_one_hot(row, 'lastcareunit'), axis=1)

# Step 3: Reconstruct intime and outtime
# Use arbitrary base date (adjust if needed)
base_date = pd.Timestamp("2025-01-01")
generated_df['admission_hour'] = generated_df['admission_hour'].round().astype(int)
generated_df['admission_dayofweek'] = generated_df['admission_dayofweek'].round().astype(int)

generated_df['intime'] = base_date + pd.to_timedelta(generated_df['admission_dayofweek'], unit='D') + \
                         pd.to_timedelta(generated_df['admission_hour'], unit='h')

generated_df['icu_duration_hours'] = generated_df['icu_duration_hours'].clip(lower=0)
generated_df['outtime'] = generated_df['intime'] + pd.to_timedelta(generated_df['icu_duration_hours'], unit='h')

# Step 4: Select only the original ICUSTAYS columns (load ICUSTAYS to get columns)
icustays_df = pd.read_csv('C:/Users/hacke/Desktop/dataset/ICUSTAYS.csv')  # Load just to get column structure
final_columns = icustays_df.columns.tolist()
generated_df = generated_df[final_columns]

# Show the first few rows of generated data
print(generated_df.head())

   row_id  subject_id  hadm_id  icustay_id    dbsource first_careunit   
0   21522       22912   148918      247717     carevue           SICU  \
1   40200       40700   151409      252524  metavision           SICU   
2   38618       35646   153537      256333  metavision           SICU   
3   25785       24343   159384      248010     carevue           SICU   
4   21819       21514   155206      248003     carevue           SICU   

  last_careunit  first_wardid  last_wardid              intime   
0          SICU            32           32 2025-01-04 14:00:00  \
1          SICU            37           34 2025-01-04 13:00:00   
2          SICU            44           40 2025-01-03 15:00:00   
3          SICU            36           32 2025-01-03 14:00:00   
4          SICU            28           26 2025-01-04 12:00:00   

                        outtime       los  
0 2025-01-11 11:01:08.115234376  8.190334  
1 2025-01-10 06:09:56.228027342  5.460143  
2 2025-01-07 14:07:08.356933592 

In [None]:
output_filename = "generated_icu_data.csv"
generated_df.to_csv(output_filename, index=False)

print(f"Generated data saved to {output_filename}")

Generated data saved to generated_icu_data2.csv


In [None]:
import os

file_path = "generated_icu_data.csv"

# Check if the file exists first
if os.path.exists(file_path):
    os.remove(file_path)
    print(f"File '{file_path}' has been deleted.")
else:
    print(f"File '{file_path}' not found.")
