In [1]:
import pandas as pd
import numpy as np

RANDOM_STATE = 404

#### Data Preprocessing

In [2]:
df = pd.read_csv('../../data/cardio_train.csv', delimiter=';')
df.drop(columns=['id'], inplace=True)
df.head(5)

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [3]:
X = df.drop(columns=['cardio'])
y = df['cardio']

In [4]:
from sklearn.preprocessing import StandardScaler

# Select columns to be scaled
numeric_columns = ['age', 'height', 'weight', 'ap_hi', 'ap_lo']
categorical_columns = ['gender', 'cholesterol', 'gluc', 'smoke', 'alco', 'active']

# Fit and transform your data (only for numeric columns)
scaler = StandardScaler()
X[numeric_columns] = scaler.fit_transform(X[numeric_columns])

X.head(5)

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active
0,-0.436062,2,0.443452,-0.847873,-0.122182,-0.088238,1,1,0,0,1
1,0.307686,1,-1.018168,0.749831,0.07261,-0.03518,3,1,0,0,1
2,-0.247997,1,0.078047,-0.708942,0.007679,-0.141297,3,1,0,0,0
3,-0.748152,2,0.565254,0.541435,0.137541,0.017879,1,1,0,0,1
4,-0.808543,1,-1.018168,-1.264666,-0.187113,-0.194356,1,1,0,0,0


In [5]:
X.describe()

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active
count,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0
mean,5.272227e-16,1.349571,1.450116e-15,-2.905105e-16,7.623108000000001e-17,1.7459050000000003e-17,1.366871,1.226457,0.088129,0.053771,0.803729
std,1.000007,0.476838,1.000007,1.000007,1.000007,1.000007,0.68025,0.57227,0.283484,0.225568,0.397179
min,-3.514407,1.0,-13.32014,-4.460075,-1.810381,-0.8841161,1.0,1.0,0.0,0.0,0.0
25%,-0.7315341,1.0,-0.652763,-0.639477,-0.05725127,-0.0882385,1.0,1.0,0.0,0.0,1.0
50%,0.09489744,1.0,0.07804703,-0.1532192,-0.05725127,-0.0882385,1.0,1.0,0.0,0.0,1.0
75%,0.7531244,2.0,0.6870554,0.5414349,0.07261016,-0.03517999,2.0,1.0,0.0,0.0,1.0
max,1.720199,2.0,10.43119,8.738353,103.1826,57.85165,3.0,3.0,1.0,1.0,1.0


#### Data splitting

In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)
X_train.shape, X_test.shape

((56000, 11), (14000, 11))

#### Model definition

In [7]:
import tensorflow as tf

# This script defines the generator and discriminator models for a Generative Adversarial Imputation Network (GAIN)
# using the Keras API in TensorFlow 2.x.

def build_generator(data_dim, hidden_dim):
    """
    Builds the generator model for a Generative Adversarial Imputation Network (GAIN).

    Args:
        data_dim (int): The dimensionality of the input data.
        hidden_dim (int): The number of hidden units in the encoder and decoder.

    Returns:
        tf.keras.Model: The generator model.
    """

    # Define the model architecture
    model = tf.keras.Sequential([
        tf.keras.layers.InputLayer(input_shape=(data_dim,)),  # Input layer
        tf.keras.layers.Dense(hidden_dim, activation='relu'),  # Hidden layers
        tf.keras.layers.Dense(hidden_dim, activation='relu'),
        tf.keras.layers.Dense(data_dim, activation='sigmoid')  # Output layer
    ])

    return model


def build_discriminator(data_dim, hidden_dim):
    """
    Builds the discriminator model for a Generative Adversarial Imputation Network (GAIN).

    Args:
        data_dim (int): The dimensionality of the input data.
        hidden_dim (int): The number of hidden units in the encoder and decoder.

    Returns:
        tf.keras.Model: The discriminator model.
    """

    # Define the model architecture
    model = tf.keras.Sequential([
        tf.keras.layers.InputLayer(input_shape=(data_dim,)),  # Input layer
        tf.keras.layers.Dense(hidden_dim, activation='relu'),  # Hidden layers
        tf.keras.layers.Dense(hidden_dim, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid')  # Output layer
    ])

    return model




In [8]:

# Setup models
data_dim = X_train.shape[1]
hidden_dim = 128

generator = build_generator(data_dim, hidden_dim)
discriminator = build_discriminator(data_dim, hidden_dim)

# Print the model summary
generator.summary(), discriminator.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 128)               1536      
                                                                 
 dense_1 (Dense)             (None, 128)               16512     
                                                                 
 dense_2 (Dense)             (None, 11)                1419      
                                                                 
Total params: 19467 (76.04 KB)
Trainable params: 19467 (76.04 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_3 (Dense)             (None, 128)               1536      
                                                               

(None, None)

#### Loss functions and optimizers

In [9]:
# Loss function for the discriminator
def discriminator_loss(D_prob, M, X, G_sample):
    D_prob = tf.cast(D_prob, dtype=tf.float32)  # Cast to float32
    return -tf.reduce_mean(M * tf.math.log(D_prob + 1e-8) + (1 - M) * tf.math.log(1. - D_prob + 1e-8))

# Loss function for the generator
def generator_loss(D_prob, G_sample, M, X):
    # Cast all inputs to float32 to ensure consistent data types for operations
    D_prob = tf.cast(D_prob, dtype=tf.float32)
    G_sample = tf.cast(G_sample, dtype=tf.float32)
    M = tf.cast(M, dtype=tf.float32)
    X = tf.cast(X, dtype=tf.float32)
    
    # Compute the binary cross-entropy loss part
    BCE_loss = -tf.reduce_mean((1 - M) * tf.math.log(D_prob + tf.constant(1e-8, dtype=tf.float32)))

    # Compute the mean squared error loss part
    MSE_loss = tf.reduce_mean(M * tf.square(X - G_sample))

    # Weighting factor for the losses
    alpha = 0.5

    # Combine the losses
    total_loss = alpha * BCE_loss + (1 - alpha) * MSE_loss

    return total_loss

# Adam optimizer is a stochastic gradient descent method
generator_optimizer = tf.keras.optimizers.Adam(1e-4)
discriminator_optimizer = tf.keras.optimizers.Adam(1e-4)

#### Training function definition

In [10]:
@tf.function
def train_step(generator, discriminator, data, batch_size):
    noise = tf.random.normal([batch_size, data.shape[1]])
    with tf.GradientTape() as gen_tape, tf.GradientTape() as disc_tape:
        generated_data = generator(noise, training=True)
        real_output = discriminator(data, training=True)
        fake_output = discriminator(generated_data, training=True)
        gen_loss = generator_loss(fake_output, generated_data, data, noise)
        disc_loss = discriminator_loss(real_output, fake_output, data, noise)
    gradients_of_generator = gen_tape.gradient(gen_loss, generator.trainable_variables)
    gradients_of_discriminator = disc_tape.gradient(disc_loss, discriminator.trainable_variables)
    generator_optimizer.apply_gradients(zip(gradients_of_generator, generator.trainable_variables))
    discriminator_optimizer.apply_gradients(zip(gradients_of_discriminator, discriminator.trainable_variables))
    return gen_loss, disc_loss

def train_gan(generator, discriminator, df, iterations, batch_size):
    for iteration in range(iterations):
        idx = np.random.choice(len(df), batch_size, replace=False)
        data_batch = df.iloc[idx]
        gen_loss, disc_loss = train_step(generator, discriminator, data_batch, batch_size)
        if iteration % 1000 == 0:
            print(f"Iteration {iteration}, Generator Loss: {gen_loss}, Discriminator Loss: {disc_loss}")

#### Training loop

In [11]:
# Start training
train_gan(generator, discriminator, X_train, iterations=10000, batch_size=128)

Iteration 0, Generator Loss: 0.4583289325237274, Discriminator Loss: 0.6950441598892212
Iteration 1000, Generator Loss: 1.9908151626586914, Discriminator Loss: 0.012231605127453804
Iteration 2000, Generator Loss: 2.5536105632781982, Discriminator Loss: 0.001591833308339119
Iteration 3000, Generator Loss: 2.9825124740600586, Discriminator Loss: 0.000516430358402431
Iteration 4000, Generator Loss: 3.225710153579712, Discriminator Loss: 0.00021870125783607364
Iteration 5000, Generator Loss: 3.7362852096557617, Discriminator Loss: 0.00010426851076772436
Iteration 6000, Generator Loss: 3.722047805786133, Discriminator Loss: 5.3154173656366765e-05
Iteration 7000, Generator Loss: 3.9351186752319336, Discriminator Loss: 2.832830068655312e-05
Iteration 8000, Generator Loss: 4.038579940795898, Discriminator Loss: 1.5276573321898468e-05
Iteration 9000, Generator Loss: 4.314601898193359, Discriminator Loss: 8.446997526334599e-06


In [12]:
from joblib import dump

dump(generator, 'cardio_gain_generator.h5')

['cardio_gain_generator.h5']