In [None]:
import pandas as pd
import numpy as np

# Single seed shared by every stochastic step in this notebook.
RANDOM_STATE = 404

# Seed NumPy's global RNG as well: the training loop samples mini-batches with
# `np.random.choice`, which would otherwise differ on every kernel restart.
np.random.seed(RANDOM_STATE)

#### Data Preprocessing

In [None]:
# Read the semicolon-separated cardio dataset and discard the row identifier column.
df = pd.read_csv('../../data/cardio_train.csv', sep=';').drop(columns=['id'])
df.head()

In [None]:
# Separate the target label from the feature matrix.
y = df['cardio']
X = df.drop(columns='cardio')

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder

# Select columns to be scaled
numeric_columns = ['age', 'height', 'weight', 'ap_hi', 'ap_lo']
categorical_columns = ['gender', 'cholesterol', 'gluc', 'smoke', 'alco', 'active']
scaled_columns = numeric_columns + categorical_columns

# The generator's output layer is a sigmoid, so it can only emit values in (0, 1).
# Standardised features are unbounded and the raw categorical codes are not confined
# to [0, 1], so neither could ever be matched by generated samples. Min-max scaling
# puts every listed feature into [0, 1]; `scaler.inverse_transform` maps values in
# `scaled_columns` order back to the original units.
# NOTE(review): the scaler is fitted on the full dataset, before the train/test split
# further down - move the fit after the split if held-out statistics must stay unseen.
scaler = MinMaxScaler()
X[scaled_columns] = scaler.fit_transform(X[scaled_columns])

X.head(5)

In [None]:
# Summary statistics of the feature matrix after the scaling step (sanity check).
X.describe()

#### Data splitting

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows; the fixed random_state keeps the partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=RANDOM_STATE,
)
(X_train.shape, X_test.shape)

#### Model definition

In [None]:
import tensorflow as tf

# This script defines the generator and discriminator models for a Generative Adversarial Imputation Network (GAIN)
# using the Keras API in TensorFlow 2.x.

def build_generator(data_dim, hidden_dim):
    """
    Builds the generator network: a fully connected model that maps a
    `data_dim`-wide input vector to a `data_dim`-wide output vector.

    Args:
        data_dim (int): Width of both the input layer and the output layer.
        hidden_dim (int): Number of units in each of the two hidden layers.

    Returns:
        tf.keras.Model: The (uncompiled) generator model. Its sigmoid output
        confines every generated feature to the open interval (0, 1).
    """

    # Define the model architecture
    model = tf.keras.Sequential([
        # `tf.keras.Input(shape=...)` declares the input without InputLayer's
        # `input_shape` keyword, which newer Keras releases deprecate in favour of `shape`.
        tf.keras.Input(shape=(data_dim,)),  # Input layer
        tf.keras.layers.Dense(hidden_dim, activation='relu'),  # Hidden layers
        tf.keras.layers.Dense(hidden_dim, activation='relu'),
        tf.keras.layers.Dense(data_dim, activation='sigmoid')  # Output layer
    ])

    return model


def build_discriminator(data_dim, hidden_dim):
    """
    Builds the discriminator network: a fully connected model that maps a
    `data_dim`-wide input vector to a single probability.

    Args:
        data_dim (int): Width of the input layer.
        hidden_dim (int): Number of units in each of the two hidden layers.

    Returns:
        tf.keras.Model: The (uncompiled) discriminator model. Its single
        sigmoid unit yields one value in (0, 1) per input row.
    """

    # Define the model architecture
    model = tf.keras.Sequential([
        # `tf.keras.Input(shape=...)` declares the input without InputLayer's
        # `input_shape` keyword, which newer Keras releases deprecate in favour of `shape`.
        tf.keras.Input(shape=(data_dim,)),  # Input layer
        tf.keras.layers.Dense(hidden_dim, activation='relu'),  # Hidden layers
        tf.keras.layers.Dense(hidden_dim, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid')  # Output layer
    ])

    return model

In [None]:

# Fix TensorFlow's global seed before any variables are created, so that weight
# initialisation and the random ops used during training derive from RANDOM_STATE.
tf.random.set_seed(RANDOM_STATE)

# Setup models
data_dim = X_train.shape[1]
hidden_dim = 128

generator = build_generator(data_dim, hidden_dim)
discriminator = build_discriminator(data_dim, hidden_dim)

# Print the model summaries. `summary()` prints and returns None, so it is called as
# two statements instead of a tuple expression that would echo `(None, None)`.
generator.summary()
discriminator.summary()

#### Loss functions and optimizers

In [None]:
# Loss function for the discriminator
def discriminator_loss(D_prob, M, X, G_sample):
    """
    Binary cross-entropy between the discriminator's probabilities and a 0/1 mask.

    Args:
        D_prob: Discriminator output probabilities.
        M: Target mask/labels, broadcastable against `D_prob`
            (1 where `D_prob` should be high, 0 where it should be low).
        X: Unused; kept so existing call sites keep working.
        G_sample: Unused; kept so existing call sites keep working.

    Returns:
        Scalar float32 tensor: -mean(M * log(D_prob) + (1 - M) * log(1 - D_prob)).
    """
    # Cast both operands: a float64 mask (e.g. built from NumPy/pandas data) would
    # otherwise fail when multiplied with the float32 log-probabilities.
    D_prob = tf.cast(D_prob, dtype=tf.float32)
    M = tf.cast(M, dtype=tf.float32)

    eps = 1e-8  # keeps log() finite when a probability saturates at 0 or 1
    return -tf.reduce_mean(M * tf.math.log(D_prob + eps) + (1. - M) * tf.math.log(1. - D_prob + eps))

# Loss function for the generator
def generator_loss(D_prob, G_sample, M, X, alpha=0.5):
    """
    Weighted sum of an adversarial term and a masked reconstruction term.

    Args:
        D_prob: Discriminator probabilities for the generated samples.
        G_sample: Samples produced by the generator.
        M: 0/1 mask, broadcastable against `D_prob` and `X`. Entries equal to 0
            contribute to the adversarial term, entries equal to 1 to the
            reconstruction term.
        X: Reference data compared against `G_sample` in the reconstruction term.
        alpha (float): Weight of the adversarial term; the reconstruction term is
            weighted by `1 - alpha`. Defaults to 0.5, the previously hard-coded value.

    Returns:
        Scalar float32 tensor:
        alpha * -mean((1 - M) * log(D_prob)) + (1 - alpha) * mean(M * (X - G_sample)^2).
    """
    # Cast all inputs to float32 to ensure consistent data types for operations
    D_prob = tf.cast(D_prob, dtype=tf.float32)
    G_sample = tf.cast(G_sample, dtype=tf.float32)
    M = tf.cast(M, dtype=tf.float32)
    X = tf.cast(X, dtype=tf.float32)

    # Adversarial (cross-entropy) part; the small constant keeps log() finite at 0
    BCE_loss = -tf.reduce_mean((1 - M) * tf.math.log(D_prob + tf.constant(1e-8, dtype=tf.float32)))

    # Reconstruction (mean squared error) part, restricted to entries where M == 1
    MSE_loss = tf.reduce_mean(M * tf.square(X - G_sample))

    # Combine the losses
    total_loss = alpha * BCE_loss + (1 - alpha) * MSE_loss

    return total_loss

# Each network gets its own Adam optimizer (a stochastic gradient descent variant),
# both using a learning rate of 1e-4.
generator_optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)
discriminator_optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)

#### Training function definition

In [None]:
@tf.function
def train_step(generator, discriminator, data, batch_size):
    """
    Runs one adversarial update of both networks on a single mini-batch.

    The loss helpers take a 0/1 mask `M`. Previously the arguments were passed in
    the wrong positional slots (the fake scores ended up as the discriminator's
    mask, the real data as the generator's mask and the noise as its target).
    Here the mask is used as a real/fake label, which matches the networks as
    built in this notebook (noise -> sample generator, one score per row).

    Args:
        generator: Model mapping noise of width `data.shape[1]` to a generated row.
        discriminator: Model mapping a row to a single probability.
        data: Mini-batch of real rows; cast to float32 here.
        batch_size (int): Number of noise rows to draw. Expected to equal the
            number of rows in `data`.

    Returns:
        Tuple `(gen_loss, disc_loss)` of scalar tensors.
    """
    data = tf.cast(data, tf.float32)
    noise = tf.random.normal([batch_size, data.shape[1]])
    with tf.GradientTape() as gen_tape, tf.GradientTape() as disc_tape:
        generated_data = generator(noise, training=True)
        real_output = discriminator(data, training=True)
        fake_output = discriminator(generated_data, training=True)

        # Generator: an all-zero mask selects the adversarial term for every generated
        # row (push D(G(z)) towards 1) and switches the masked MSE term off.
        gen_loss = generator_loss(fake_output, generated_data, tf.zeros_like(fake_output), data)

        # Discriminator: score real and generated rows together, labelling real rows 1
        # and generated rows 0, so the helper computes the usual binary cross-entropy.
        all_scores = tf.concat([real_output, fake_output], axis=0)
        all_labels = tf.concat([tf.ones_like(real_output), tf.zeros_like(fake_output)], axis=0)
        disc_loss = discriminator_loss(all_scores, all_labels, data, generated_data)
    gradients_of_generator = gen_tape.gradient(gen_loss, generator.trainable_variables)
    gradients_of_discriminator = disc_tape.gradient(disc_loss, discriminator.trainable_variables)
    generator_optimizer.apply_gradients(zip(gradients_of_generator, generator.trainable_variables))
    discriminator_optimizer.apply_gradients(zip(gradients_of_discriminator, discriminator.trainable_variables))
    return gen_loss, disc_loss

def train_gan(generator, discriminator, df, iterations, batch_size, log_every=1000):
    """
    Trains the generator and discriminator on random mini-batches drawn from `df`.

    Args:
        generator: Generator model, forwarded to `train_step`.
        discriminator: Discriminator model, forwarded to `train_step`.
        df (pd.DataFrame): Numeric training rows, sampled without replacement per batch.
        iterations (int): Number of training steps to run.
        batch_size (int): Rows per mini-batch; must not exceed `len(df)`
            (`np.random.choice` raises ValueError otherwise).
        log_every (int): Print the losses every `log_every` iterations.
            Defaults to 1000; 0 or None disables the progress output.
    """
    for iteration in range(iterations):
        idx = np.random.choice(len(df), batch_size, replace=False)
        # Hand `train_step` a float32 tensor rather than a freshly created DataFrame:
        # the tf.function then sees the same tensor signature on every call instead of
        # a new Python object each iteration.
        data_batch = tf.convert_to_tensor(df.iloc[idx].to_numpy(dtype=np.float32))
        gen_loss, disc_loss = train_step(generator, discriminator, data_batch, batch_size)
        if log_every and iteration % log_every == 0:
            print(f"Iteration {iteration}, Generator Loss: {gen_loss}, Discriminator Loss: {disc_loss}")

#### Training loop

In [None]:
# Run the adversarial training loop on the training split
train_gan(
    generator,
    discriminator,
    X_train,
    iterations=10_000,
    batch_size=128,
)

In [None]:
from joblib import dump

# Persist the trained generator with Keras' own serializer so that the `.h5` file is
# an actual HDF5 model (readable via `tf.keras.models.load_model`), not a joblib
# pickle carrying an `.h5` extension.
# NOTE(review): any consumer that read this file with `joblib.load` must switch to
# `tf.keras.models.load_model`.
generator.save('cardio_gain_generator.h5')