In [203]:
import pandas as pd
import numpy as np

RANDOM_STATE = 404

#### Data Preprocessing

In [204]:
df = pd.read_csv('../data/cardio_train.csv', delimiter=';')
df.drop(columns=['id'], inplace=True)
df.head(5)

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [205]:
X = df.drop(columns=['cardio'])
y = df['cardio']

In [206]:
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Select columns to be scaled
numeric_columns = ['age', 'height', 'weight', 'ap_hi', 'ap_lo', 'gender', 'cholesterol']
categorical_columns = ['gluc', 'smoke', 'alco', 'active']

# Fit and transform your data (only for numeric columns)
scaler = StandardScaler()
X[numeric_columns] = scaler.fit_transform(X[numeric_columns])

# Apply one-hot encoding to categorical columns
label_encoder = LabelEncoder()
for col in categorical_columns:
    X[col] = label_encoder.fit_transform(X[col])

X.head(5)

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active
0,-0.436062,1.364055,0.443452,-0.847873,-0.122182,-0.088238,-0.539322,0,0,0,1
1,0.307686,-0.733108,-1.018168,0.749831,0.07261,-0.03518,2.400793,0,0,0,1
2,-0.247997,-0.733108,0.078047,-0.708942,0.007679,-0.141297,2.400793,0,0,0,0
3,-0.748152,1.364055,0.565254,0.541435,0.137541,0.017879,-0.539322,0,0,0,1
4,-0.808543,-0.733108,-1.018168,-1.264666,-0.187113,-0.194356,-0.539322,0,0,0,0


#### Data splitting

In [207]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=RANDOM_STATE)
X_train.shape, X_test.shape

((52500, 11), (17500, 11))

#### Model definition

In [208]:
import tensorflow as tf

# This script defines the generator and discriminator models for a Generative Adversarial Imputation Network (GAIN)
# using the Keras API in TensorFlow 2.x.

def build_generator(data_dim, hidden_dim):
    """
    Builds the generator model for a Generative Adversarial Imputation Network (GAIN).

    Args:
        data_dim (int): The dimensionality of the input data.
        hidden_dim (int): The number of hidden units in the encoder and decoder.

    Returns:
        tf.keras.Model: The generator model.
    """

    # Define the model architecture
    model = tf.keras.Sequential([
        tf.keras.layers.InputLayer(input_shape=(data_dim,)),  # Input layer
        tf.keras.layers.Dense(hidden_dim, activation='relu'),  # Hidden layers
        tf.keras.layers.Dense(hidden_dim, activation='relu'),
        tf.keras.layers.Dense(data_dim, activation='sigmoid')  # Output layer
    ])

    return model


def build_discriminator(data_dim, hidden_dim):
    """
    Builds the discriminator model for a Generative Adversarial Imputation Network (GAIN).

    Args:
        data_dim (int): The dimensionality of the input data.
        hidden_dim (int): The number of hidden units in the encoder and decoder.

    Returns:
        tf.keras.Model: The discriminator model.
    """

    # Define the model architecture
    model = tf.keras.Sequential([
        tf.keras.layers.InputLayer(input_shape=(data_dim,)),  # Input layer
        tf.keras.layers.Dense(hidden_dim, activation='relu'),  # Hidden layers
        tf.keras.layers.Dense(hidden_dim, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid')  # Output layer
    ])

    return model


# Example model structures
generator_example = build_generator(data_dim=100, hidden_dim=128)
discriminator_example = build_discriminator(data_dim=100, hidden_dim=128)

# Print the model summary
generator_example.summary(), discriminator_example.summary()


Model: "sequential_82"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_246 (Dense)           (None, 128)               12928     
                                                                 
 dense_247 (Dense)           (None, 128)               16512     
                                                                 
 dense_248 (Dense)           (None, 100)               12900     
                                                                 
Total params: 42340 (165.39 KB)
Trainable params: 42340 (165.39 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Model: "sequential_83"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_249 (Dense)           (None, 128)               12928     
                                                          

(None, None)

#### Loss functions and optimizers

In [209]:
# This is a loss function that measures the difference between the predicted output and the ground truth output in a binary classification problem
# In the context of GANs, we use it to measure the difference between the real and fake outputs of the discriminator
cross_entropy = tf.keras.losses.BinaryCrossentropy(from_logits=False)

def discriminator_loss(real_output, fake_output):
    """
    Compute the discriminator loss given real and fake outputs.
    
    Args:
        real_output (tf.Tensor): Real outputs from the discriminator.
        fake_output (tf.Tensor): Fake outputs from the discriminator.
    
    Returns:
        tf.Tensor: Total loss of the discriminator.
    """
    real_loss = cross_entropy(tf.ones_like(real_output), real_output)
    fake_loss = cross_entropy(tf.zeros_like(fake_output), fake_output)
    total_loss = real_loss + fake_loss
    return total_loss


def generator_loss(fake_output):
    """
    Compute the generator loss given fake outputs.
    
    Args:
        fake_output (tf.Tensor): Fake outputs from the generator.
    
    Returns:
        tf.Tensor: Loss of the generator.
    """
    return cross_entropy(tf.ones_like(fake_output), fake_output)

# Adam optimizer is a stochastic gradient descent method
generator_optimizer = tf.keras.optimizers.Adam(1e-4)
discriminator_optimizer = tf.keras.optimizers.Adam(1e-4)


#### Training function definition

In [210]:
@tf.function
def train_step(generator, discriminator, data, batch_size):
    """
    Performs a single step of training in a GAN model.

    Args:
        generator: The generator model.
        discriminator: The discriminator model.
        data: The data batch.
        batch_size: The size of the batch.

    Returns:
        The generator and discriminator losses.
    """
    # Generate noise for the batch size
    noise = tf.random.normal([batch_size, data.shape[1]])

    # Perform gradient tape
    with tf.GradientTape() as gen_tape, tf.GradientTape() as disc_tape:
        # Generate data using the generator
        generated_data = generator(noise, training=True)

        # Get the real and fake outputs for the discriminator
        real_output = discriminator(data, training=True)
        fake_output = discriminator(generated_data, training=True)

        # Calculate the generator and discriminator losses
        gen_loss = generator_loss(fake_output)
        disc_loss = discriminator_loss(real_output, fake_output)

    # Calculate the gradients of the generator and discriminator
    gradients_of_generator = gen_tape.gradient(gen_loss, generator.trainable_variables)
    gradients_of_discriminator = disc_tape.gradient(disc_loss, discriminator.trainable_variables)

    # Apply the gradients to update the models
    generator_optimizer.apply_gradients(zip(gradients_of_generator, generator.trainable_variables))
    discriminator_optimizer.apply_gradients(zip(gradients_of_discriminator, discriminator.trainable_variables))

    # Return the losses
    return gen_loss, disc_loss


def train_gan(generator, discriminator, df, iterations, batch_size):
    """
    Trains a GAN model.

    Args:
        generator: The generator model.
        discriminator: The discriminator model.
        df: The data frame.
        iterations: The number of training iterations.
        batch_size: The size of the batch.
    """
    for iteration in range(iterations):
        # Sample data from the dataframe
        idx = np.random.choice(len(df), batch_size, replace=False)
        data_batch = df.iloc[idx]

        # Perform a training step
        gen_loss, disc_loss = train_step(generator, discriminator, data_batch, batch_size)

        # Print the losses every 1000 iterations
        if iteration % 1000 == 0:
            print(f"Iteration {iteration}, Generator Loss: {gen_loss}, Discriminator Loss: {disc_loss}")


#### Training loop

In [211]:
# Setup models
data_dim = X_train.shape[1]
hidden_dim = 128

generator = build_generator(data_dim, hidden_dim)
discriminator = build_discriminator(data_dim, hidden_dim)

# Start training
train_gan(generator, discriminator, X_train, iterations=10000, batch_size=128)

Iteration 0, Generator Loss: 0.7679916620254517, Discriminator Loss: 1.280482292175293
Iteration 1000, Generator Loss: 2.379835605621338, Discriminator Loss: 0.24229392409324646
Iteration 2000, Generator Loss: 4.609932899475098, Discriminator Loss: 0.016189295798540115
Iteration 3000, Generator Loss: 5.847345352172852, Discriminator Loss: 0.004873540252447128
Iteration 4000, Generator Loss: 5.049951553344727, Discriminator Loss: 0.007396744564175606
Iteration 5000, Generator Loss: 6.905609130859375, Discriminator Loss: 0.0011697581503540277
Iteration 6000, Generator Loss: 7.905128479003906, Discriminator Loss: 0.00045545119792222977
Iteration 7000, Generator Loss: 8.692397117614746, Discriminator Loss: 0.0002050155308097601
Iteration 8000, Generator Loss: 9.38206672668457, Discriminator Loss: 9.929617226589471e-05
Iteration 9000, Generator Loss: 10.017717361450195, Discriminator Loss: 5.32857229700312e-05


In [212]:
from joblib import dump

dump(generator, 'gain_generator.h5')

['gain_generator.h5']