In [4]:
import tensorflow as tf
from tensorflow.keras import layers
import numpy as np
import matplotlib.pyplot as plt
import time
import os

# Output locations for generated sample grids and saved model weights
SAMPLE_DIR = './MNIST/figs/generated_images_for_testRun3'
MODEL_DIR = './MNIST/models/models_for_testRun3'

# Data pipeline and model dimensions
BUFFER_SIZE = 60000  # shuffle buffer size used when building the dataset
BATCH_SIZE = 512  # Increased batch size for multi-GPU
IMG_SHAPE = (32, 32, 1)  # discriminator input shape
NOISE_DIM = 100  # length of the generator's input noise vector

# Create both output directories up front; existing ones are left untouched
for _output_dir in (SAMPLE_DIR, MODEL_DIR):
    os.makedirs(_output_dir, exist_ok=True)

# Preprocess images (resize to 32x32 and normalize to [-1, 1])
def preprocess_images(image):
    """Resize one image to 32x32 and rescale its pixel values.

    Args:
        image: Image tensor accepted by ``tf.image.resize``.

    Returns:
        The resized image with values mapped through ``(x - 127.5) / 127.5``,
        which sends the range [0, 255] to [-1, 1].
    """
    resized = tf.image.resize(image, [32, 32])
    return (resized - 127.5) / 127.5

# Load the MNIST training images; labels and the test split are discarded
(train_images, _), (_, _) = tf.keras.datasets.mnist.load_data()

# Append a trailing channel axis and switch to float32 before preprocessing
train_images = np.expand_dims(train_images, axis=-1).astype('float32')

# Input pipeline: per-image preprocessing -> shuffle -> batch -> prefetch
train_images = (
    tf.data.Dataset.from_tensor_slices(train_images)
    .map(preprocess_images, num_parallel_calls=tf.data.AUTOTUNE)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)

# Generator model
def build_generator():
    """Assemble the generator network.

    A bias-free dense projection of a ``NOISE_DIM``-long noise vector is
    reshaped to a 4x4x256 feature map and passed through three stride-2
    transposed convolutions; the last one has a single output channel with a
    tanh activation.

    Returns:
        The ``tf.keras.Sequential`` generator model.
    """
    net = tf.keras.Sequential()

    # Project the noise vector and fold it into a 4x4x256 feature map
    net.add(layers.Dense(4 * 4 * 256, use_bias=False, input_shape=(NOISE_DIM,)))
    net.add(layers.BatchNormalization())
    net.add(layers.ReLU())
    net.add(layers.Reshape((4, 4, 256)))

    # Two identical upsampling stages that differ only in filter count
    for filters in (128, 64):
        net.add(layers.Conv2DTranspose(filters, (5, 5), strides=(2, 2), padding='same', use_bias=False))
        net.add(layers.BatchNormalization())
        net.add(layers.ReLU())

    # Final upsampling stage producing the single-channel tanh output
    net.add(layers.Conv2DTranspose(1, (5, 5), strides=(2, 2), padding='same', activation='tanh'))

    return net

# Discriminator model
def build_discriminator():
    """Assemble the discriminator network.

    Two stride-2 convolutions (64 then 128 filters), each followed by
    LeakyReLU and dropout, feed a flattened single-unit dense layer. The
    final layer has no activation, so the model outputs raw logits.

    Returns:
        The ``tf.keras.Sequential`` discriminator model.
    """
    net = tf.keras.Sequential()

    # First downsampling stage; also declares the expected input shape
    net.add(layers.Conv2D(64, (5, 5), strides=(2, 2), padding='same', input_shape=IMG_SHAPE))
    net.add(layers.LeakyReLU(alpha=0.2))
    net.add(layers.Dropout(0.3))

    # Second downsampling stage
    net.add(layers.Conv2D(128, (5, 5), strides=(2, 2), padding='same'))
    net.add(layers.LeakyReLU(alpha=0.2))
    net.add(layers.Dropout(0.3))

    # Classification head: one logit per image
    net.add(layers.Flatten())
    net.add(layers.Dense(1))

    return net

# Loss functions
def discriminator_loss(real_output, fake_output):
    """Return the discriminator's binary cross-entropy loss.

    Args:
        real_output: Discriminator logits for real images (target label 1).
        fake_output: Discriminator logits for generated images (target label 0).

    Returns:
        Sum of the cross-entropy on the real batch and on the fake batch.
    """
    bce = tf.keras.losses.BinaryCrossentropy(from_logits=True)
    loss_on_real = bce(tf.ones_like(real_output), real_output)
    loss_on_fake = bce(tf.zeros_like(fake_output), fake_output)
    return loss_on_real + loss_on_fake

def generator_loss(fake_output):
    """Return the generator's binary cross-entropy loss.

    Args:
        fake_output: Discriminator logits for generated images.

    Returns:
        Cross-entropy of ``fake_output`` against an all-ones target, i.e. the
        loss is low when the discriminator labels the fakes as real.
    """
    bce = tf.keras.losses.BinaryCrossentropy(from_logits=True)
    wanted_labels = tf.ones_like(fake_output)
    return bce(wanted_labels, fake_output)

# Mirrored (multi-GPU) distribution strategy; report how many replicas it sees
strategy = tf.distribute.MirroredStrategy()
print(f"Number of devices: {strategy.num_replicas_in_sync}")

# Create both networks and their Adam optimizers inside the strategy scope
with strategy.scope():
    generator, discriminator = build_generator(), build_discriminator()

    generator_optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)
    discriminator_optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)

# Training step
@tf.function
def train_step(images):
    """Run one simultaneous generator/discriminator update.

    Args:
        images: Batch of real images fed to the discriminator.

    Returns:
        Tuple ``(gen_loss, disc_loss)`` computed on this batch.
    """
    # Size the noise batch from the incoming batch instead of BATCH_SIZE, so
    # the final, smaller batch of an epoch is paired with the same number of
    # generated samples as real ones.
    batch_size = tf.shape(images)[0]
    noise = tf.random.normal([batch_size, NOISE_DIM])

    # Separate tapes so each network's gradients are taken from its own loss.
    with tf.GradientTape() as gen_tape, tf.GradientTape() as disc_tape:
        generated_images = generator(noise, training=True)

        real_output = discriminator(images, training=True)
        fake_output = discriminator(generated_images, training=True)

        gen_loss = generator_loss(fake_output)
        disc_loss = discriminator_loss(real_output, fake_output)

    gradients_of_generator = gen_tape.gradient(gen_loss, generator.trainable_variables)
    gradients_of_discriminator = disc_tape.gradient(disc_loss, discriminator.trainable_variables)

    generator_optimizer.apply_gradients(zip(gradients_of_generator, generator.trainable_variables))
    discriminator_optimizer.apply_gradients(zip(gradients_of_discriminator, discriminator.trainable_variables))

    return gen_loss, disc_loss

# Generate and save images
def generate_and_save_images(model, epoch, test_input):
    """Render a grid of generated samples and save it as a PNG in SAMPLE_DIR.

    Args:
        model: Generator, called as ``model(test_input, training=False)``.
        epoch: Integer epoch number, zero-padded to four digits in the file
            name ``image_at_epoch_XXXX.png``.
        test_input: Noise batch; one image is drawn per sample.
    """
    predictions = model(test_input, training=False)

    num_images = predictions.shape[0]
    # Keep the 6x6 layout for up to 36 samples; grow to the smallest square
    # grid that fits when there are more, instead of failing on subplot 37.
    grid_size = max(6, int(np.ceil(np.sqrt(num_images))))

    fig = plt.figure(figsize=(6, 6))
    for i in range(num_images):
        plt.subplot(grid_size, grid_size, i + 1)
        # Undo the (x - 127.5) / 127.5 scaling before display
        plt.imshow(predictions[i, :, :, 0] * 127.5 + 127.5, cmap='gray')
        plt.axis('off')

    plt.savefig(os.path.join(SAMPLE_DIR, f'image_at_epoch_{epoch:04d}.png'))
    # Close this figure explicitly so figures do not accumulate across epochs
    plt.close(fig)

# Training loop
def train(dataset, epochs):
    """Train the GAN, saving sample images every epoch and weights every 5.

    Args:
        dataset: Iterable of real-image batches; iterated once per epoch.
        epochs: Number of passes over ``dataset``.

    Raises:
        ValueError: If ``dataset`` yields no batches during an epoch.
    """
    # Fixed noise so the saved sample grids are comparable across epochs
    seed = tf.random.normal([36, NOISE_DIM])
    checkpoint_prefix = os.path.join(MODEL_DIR, "ckpt")

    for epoch in range(epochs):
        start = time.time()

        gen_loss = disc_loss = None
        # NOTE(review): train_step is invoked directly here rather than via
        # strategy.run -- confirm this is intended for the multi-GPU setup.
        for image_batch in dataset:
            gen_loss, disc_loss = train_step(image_batch)

        if gen_loss is None:
            # Without this guard the print below fails with an unbound-variable error
            raise ValueError("dataset yielded no batches; cannot train on an empty dataset")

        # The reported losses are those of the last batch of the epoch
        print(f'Epoch {epoch + 1}, Gen Loss: {gen_loss:.4f}, Disc Loss: {disc_loss:.4f}, Time: {time.time() - start:.2f} sec')

        # Generate and save images at the end of each epoch
        generate_and_save_images(generator, epoch + 1, seed)

        # Save the model every 5 epochs
        if (epoch + 1) % 5 == 0:
            generator.save_weights(checkpoint_prefix + f"_generator_epoch_{epoch+1}")
            discriminator.save_weights(checkpoint_prefix + f"_discriminator_epoch_{epoch+1}")

# Start training for the configured number of epochs
EPOCHS = 1000
train(dataset=train_images, epochs=EPOCHS)


INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1')
Number of devices: 2
Epoch 1, Gen Loss: 0.6852, Disc Loss: 1.2113, Time: 8.30 sec
Epoch 2, Gen Loss: 0.5631, Disc Loss: 2.0306, Time: 7.57 sec
Epoch 3, Gen Loss: 0.8722, Disc Loss: 1.1592, Time: 5.45 sec
Epoch 4, Gen Loss: 0.7288, Disc Loss: 1.4194, Time: 5.13 sec
Epoch 5, Gen Loss: 0.9315, Disc Loss: 1.0916, Time: 5.01 sec
Epoch 6, Gen Loss: 0.6382, Disc Loss: 1.6766, Time: 5.21 sec
Epoch 7, Gen Loss: 1.4546, Disc Loss: 0.6035, Time: 4.99 sec
Epoch 8, Gen Loss: 1.0245, Disc Loss: 1.2119, Time: 4.50 sec
Epoch 9, Gen Loss: 0.9206, Disc Loss: 1.2612, Time: 4.46 sec
Epoch 10, Gen Loss: 0.9129, Disc Loss: 1.2145, Time: 6.19 sec
Epoch 11, Gen Loss: 0.8490, Disc Loss: 1.5282, Time: 5.17 sec
Epoch 12, Gen Loss: 0.9902, Disc Loss: 1.3023, Time: 5.60 sec
Epoch 13, Gen Loss: 1.1405, Disc Loss: 0.8477, Time: 4.95 sec
Epoch 14, Gen Loss: 1.7238, Disc 