In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.datasets as datasets
import torchvision.transforms as transforms

from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter

%load_ext tensorboard

# Model Architecture

## Discriminator Class
- A convolutional neural network designed to classify images as real or fake.
- Takes in an image and outputs a probability value indicating whether the image is real or fake.
- Uses convolutional layers with **LeakyReLU** activations and **BatchNorm** layers.
- The final output is a single probability (between 0 and 1) for each image.

## Generator Class
- A convolutional neural network that generates synthetic images from random noise vectors.
- Takes in a latent vector (usually a random noise vector) and outputs an image.
- Uses transposed convolution layers to upsample the latent vector into an image of the desired size.
- Outputs images normalized between [-1, 1] using the **Tanh** activation function.

## Initialize Weights
- A function that initializes the weights of the model layers from a **normal distribution** (mean = 0, std = 0.02).
- The **convolutional layers** (`Conv2d` and `ConvTranspose2d`) and **Batch Normalization layers** (`BatchNorm2d`) are initialized this way to improve training stability.
- For **BatchNorm layers**, the scaling factor (`gamma`) is drawn from a **normal distribution** with mean = 1 and std = 0.02 to allow slight flexibility in activation scaling.
- The **bias** of BatchNorm layers is initialized to 0 to prevent any initial shift in activations.

## Simple Dimension Check
- A function to verify that the input and output dimensions of both the **Discriminator** and **Generator** are correct.
- Ensures that the Discriminator outputs a tensor of shape `(N, 1, 1, 1)` where `N` is the batch size.
- Ensures that the Generator outputs a tensor of shape `(N, channels_img, H, W)` where `H` and `W` are the target image dimensions.

In [2]:
class Discriminator(nn.Module):
    def __init__(self, channels_img, features_d):
        """
        DCGAN discriminator: a stack of strided convolutions that scores an image.

        Every stage halves the spatial resolution while the channel count grows,
        and the last stage collapses the feature map to a single sigmoid
        probability per image. The shapes quoted below assume a 64x64 input.

        Parameters:
        -----------
        channels_img : int
            Channel count of the input image (3 for RGB, 1 for grayscale such as MNIST).
        features_d : int
            Width (number of feature maps) of the first convolution.
            Later stages use 2x, 4x and 8x this value.
        """
        super().__init__()

        # Stage 1: (N, channels_img, 64, 64) -> (N, features_d, 32, 32).
        # This stage deliberately has no batch normalization, following the DCGAN paper.
        stem = [
            nn.Conv2d(
                in_channels=channels_img,
                out_channels=features_d,
                kernel_size=4,
                stride=2,
                padding=1,
            ),
            nn.LeakyReLU(0.2),
        ]

        # Stages 2-4: each doubles the channels and halves height/width
        #   (N, features_d,   32, 32) -> (N, features_d*2, 16, 16)
        #   (N, features_d*2, 16, 16) -> (N, features_d*4,  8,  8)
        #   (N, features_d*4,  8,  8) -> (N, features_d*8,  4,  4)
        downsamplers = [
            self._convolutionBlock(features_d * width, features_d * width * 2, 4, 2, 1)
            for width in (1, 2, 4)
        ]

        # Final stage: (N, features_d*8, 4, 4) -> (N, 1, 1, 1).
        # A 4x4 kernel without padding covers the whole 4x4 map, leaving one value
        # per image that the sigmoid squashes into a probability.
        head = [
            nn.Conv2d(
                in_channels=features_d * 8,
                out_channels=1,
                kernel_size=4,
                stride=2,
                padding=0,
            ),
            nn.Sigmoid(),
        ]

        self.model = nn.Sequential(*stem, *downsamplers, *head)

    def _convolutionBlock(self, in_channels, out_channels, kernel_size, stride, padding):
        """
        Build one Conv2d -> BatchNorm2d -> LeakyReLU(0.2) stage.

        The convolution is created without a bias term because the batch
        normalization that follows it has its own learnable shift.

        Parameters:
        -----------
        in_channels : int
            Channels entering the block.
        out_channels : int
            Feature maps produced by the block.
        kernel_size : int
            Side length of the convolution kernel.
        stride : int
            Convolution stride (2 is used here for downsampling).
        padding : int
            Zero padding added on each side of the input.

        Returns:
        --------
        nn.Sequential : the three layers chained in order.
        """
        conv = nn.Conv2d(
            in_channels,
            out_channels,
            kernel_size,
            stride=stride,
            padding=padding,
            bias=False,
        )
        norm = nn.BatchNorm2d(out_channels)
        activation = nn.LeakyReLU(0.2)
        return nn.Sequential(conv, norm, activation)

    def forward(self, x):
        """
        Run the discriminator on a batch of images.

        Parameters:
        -----------
        x : torch.Tensor
            Image batch of shape (N, channels_img, 64, 64).

        Returns:
        --------
        torch.Tensor
            Tensor of shape (N, 1, 1, 1) holding one sigmoid output per image.
        """
        return self.model(x)


In [3]:
class Generator(nn.Module):
    def __init__(self, z_dim, channels_img, features_g):
        """
        DCGAN generator: maps a latent noise vector to an image.

        A chain of transposed convolutions grows a 1x1 latent "pixel" into a
        64x64 image while the channel count shrinks from features_g*16 down to
        channels_img.

        Parameters:
        -----------
        z_dim : int
            Length of the latent noise vector (100 is the usual DCGAN choice).
        channels_img : int
            Channels of the generated image (3 for RGB, 1 for grayscale).
        features_g : int
            Base width of the network; the blocks use 16x, 8x, 4x and 2x this value.
        """
        super().__init__()

        # Block 1: (N, z_dim, 1, 1) -> (N, features_g*16, 4, 4).
        # With stride=1 and padding=0 a 4x4 kernel turns the 1x1 input into a 4x4 map.
        stem = self._convolutionBlock(z_dim, features_g * 16, kernel_size=4, stride=1, padding=0)

        # Blocks 2-4: each halves the channels and doubles height/width
        #   (N, features_g*16,  4,  4) -> (N, features_g*8,  8,  8)
        #   (N, features_g*8,   8,  8) -> (N, features_g*4, 16, 16)
        #   (N, features_g*4,  16, 16) -> (N, features_g*2, 32, 32)
        upsamplers = [
            self._convolutionBlock(
                features_g * width,
                features_g * width // 2,
                kernel_size=4,
                stride=2,
                padding=1,
            )
            for width in (16, 8, 4)
        ]

        # Output layer: (N, features_g*2, 32, 32) -> (N, channels_img, 64, 64).
        # No batch normalization here (per the DCGAN paper); Tanh bounds the
        # pixels to [-1, 1].
        to_image = nn.ConvTranspose2d(
            in_channels=features_g * 2,
            out_channels=channels_img,
            kernel_size=4,
            stride=2,
            padding=1,
        )

        self.model = nn.Sequential(stem, *upsamplers, to_image, nn.Tanh())

    def _convolutionBlock(self, in_channels, out_channels, kernel_size, stride, padding):
        """
        Build one ConvTranspose2d -> BatchNorm2d -> ReLU stage.

        The transposed convolution is created without a bias term because the
        batch normalization that follows it has its own learnable shift.

        Parameters:
        -----------
        in_channels : int
            Feature maps entering the block.
        out_channels : int
            Feature maps produced by the block.
        kernel_size : int
            Side length of the transposed-convolution kernel.
        stride : int
            Stride of the transposed convolution (2 is used here for upsampling).
        padding : int
            Padding argument of the transposed convolution.

        Returns:
        --------
        nn.Sequential : the three layers chained in order.
        """
        upconv = nn.ConvTranspose2d(
            in_channels,
            out_channels,
            kernel_size,
            stride=stride,
            padding=padding,
            bias=False,
        )
        norm = nn.BatchNorm2d(out_channels)
        activation = nn.ReLU()
        return nn.Sequential(upconv, norm, activation)

    def forward(self, x):
        """
        Generate images from latent noise.

        Parameters:
        -----------
        x : torch.Tensor
            Noise batch of shape (N, z_dim, 1, 1).

        Returns:
        --------
        torch.Tensor
            Image batch of shape (N, channels_img, 64, 64) with values in [-1, 1].
        """
        return self.model(x)


In [4]:
def initialize_weights(model):
    """
    Re-initialize a model in place following the DCGAN weight-initialization scheme.

    - `Conv2d` / `ConvTranspose2d` weights are drawn from N(mean=0, std=0.02).
      Convolution biases, where present, are left untouched.
    - `BatchNorm2d` scale (gamma) is drawn from N(mean=1, std=0.02) and the
      shift (beta) is set to 0. Note that this differs from PyTorch's default
      BatchNorm initialization, where gamma starts at exactly 1.

    Parameters:
    -----------
    model : nn.Module
        Module whose sub-modules (as returned by `model.modules()`) are initialized.

    Returns:
    --------
    None. The parameters are modified in place.
    """
    for m in model.modules():
        if isinstance(m, (nn.Conv2d, nn.ConvTranspose2d)):
            nn.init.normal_(m.weight, mean=0.0, std=0.02)
        elif isinstance(m, nn.BatchNorm2d):
            # BatchNorm2d(affine=False) has weight and bias set to None, so
            # guard each one instead of assuming both parameters exist.
            if m.weight is not None:
                nn.init.normal_(m.weight, mean=1.0, std=0.02)
            if m.bias is not None:
                nn.init.constant_(m.bias, 0.0)


In [5]:
def test():
    """
    Shape smoke test for the Discriminator and the Generator.

    Builds small (8 base feature maps) versions of both networks, applies
    `initialize_weights`, and asserts that:
    - the Discriminator maps (N, 3, 64, 64) images to (N, 1, 1, 1) scores;
    - the Generator maps (N, z_dim, 1, 1) noise to (N, 3, 64, 64) images.

    Prints "Dimension Test Passed" when both assertions hold.
    """
    batch, channels, height, width = 8, 3, 64, 64
    latent_dim = 100  # length of the noise vector fed to the generator

    # Discriminator: a batch of random "images" in, one score per image out
    images = torch.randn((batch, channels, height, width))
    discriminator = Discriminator(channels, 8)
    initialize_weights(discriminator)
    scores = discriminator(images)
    assert scores.shape == (batch, 1, 1, 1)

    # Generator: one latent vector per sample in, a full-size image per sample out
    generator = Generator(latent_dim, channels, 8)
    initialize_weights(generator)
    latent = torch.randn((batch, latent_dim, 1, 1))
    generated = generator(latent)
    assert generated.shape == (batch, channels, height, width)

    print("Dimension Test Passed")


test()

Dimension Test Passed


# Deep Convolutional GAN on MNIST dataset

In [8]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ---- Training hyperparameters ----
lr = 2e-4
batch_size = 128
img_size = 64
img_channels_mnist = 1
z_dim = 100
num_epochs = 5
features_disc = 64
features_gen = 64

# ---- MNIST preprocessing ----
# Resize to img_size x img_size, convert to a tensor, then normalize every
# channel with mean 0.5 and std 0.5 so pixel values land in [-1, 1].
mnist_mean = [0.5] * img_channels_mnist
mnist_std = [0.5] * img_channels_mnist
transformer = transforms.Compose(
    [
        transforms.Resize((img_size, img_size)),
        transforms.ToTensor(),
        transforms.Normalize(mnist_mean, mnist_std),
    ]
)

# ---- Data ----
# The MNIST training split is downloaded into dataset/ if it is not already there
dataset = datasets.MNIST(
    root="dataset/",
    train=True,
    transform=transformer,
    download=True,
)
loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# ---- Models ----
# Build both networks on the selected device, then apply the DCGAN weight initialization
gen_model = Generator(z_dim, img_channels_mnist, features_gen).to(device)
disc_model = Discriminator(img_channels_mnist, features_disc).to(device)
initialize_weights(gen_model)
initialize_weights(disc_model)

# ---- Optimizers and loss ----
# Both networks use Adam with the same learning rate and betas
adam_betas = (0.5, 0.999)
optimizer_gen = optim.Adam(gen_model.parameters(), lr=lr, betas=adam_betas)
optimizer_disc = optim.Adam(disc_model.parameters(), lr=lr, betas=adam_betas)

# Binary cross-entropy on the discriminator's sigmoid outputs
criterion = nn.BCELoss()

# ---- Monitoring ----
# A fixed batch of 32 latent vectors, reused at every logging step so the
# generated samples can be compared over the course of training
fixed_noise = torch.randn(32, z_dim, 1, 1).to(device)

# Separate TensorBoard writers for generated and real image grids
writer_fake = SummaryWriter("runs/DCGAN_MNIST/fake")
writer_real = SummaryWriter("runs/DCGAN_MNIST/real")

In [9]:
%tensorboard --logdir=runs/DCGAN_MNIST --bind_all --port=6006
print("Tensorboard is running on port 6006")

# Set the models to training mode
gen_model.train()
disc_model.train()
step = 0

for epoch in range(num_epochs):
    for batch_idx, (real, _) in enumerate(loader):
        # Move the real images to the device (GPU/CPU)
        real = real.to(device)
        
        # Generate random noise to feed into the generator
        noise = torch.randn((batch_size, z_dim, 1, 1)).to(device)
        
        # Generate fake images using the generator
        fake = gen_model(noise)

        # --- Train the Discriminator ---
        # Compute the discriminator's output on real images
        disc_real = disc_model(real).reshape(-1)
        # Calculate the discriminator's loss on real images (maximize log(D(x)))
        loss_disc_real = criterion(disc_real, torch.ones_like(disc_real))
        
        # Compute the discriminator's output on fake images
        disc_fake = disc_model(fake.detach()).reshape(-1)
        # Calculate the discriminator's loss on fake images (maximize log(1 - D(G(z))))
        loss_disc_fake = criterion(disc_fake, torch.zeros_like(disc_fake))
        
        # Total discriminator loss is the average of the real and fake losses
        loss_disc = (loss_disc_real + loss_disc_fake) / 2
        
        disc_model.zero_grad()
        loss_disc.backward()
        optimizer_disc.step()

        # --- Train the Generator ---
        # Compute the discriminator's output on the fake images
        output = disc_model(fake).reshape(-1)
        
        # Generator's objective is to fool the discriminator, so it minimizes log(1 - D(G(z))) 
        # This is equivalent to maximizing log(D(G(z))) in practice.
        loss_gen = criterion(output, torch.ones_like(output))
        
        gen_model.zero_grad()
        loss_gen.backward()
        optimizer_gen.step()

        # Print losses and log images to TensorBoard every 100 batches
        if batch_idx % 100 == 0:
            print(
                f"Epoch [{epoch}/{num_epochs}] Batch {batch_idx}/{len(loader)} \
                  Loss D: {loss_disc:.4f}, loss G: {loss_gen:.4f}"
            )

            # Log real and fake images to TensorBoard using a fixed noise vector
            with torch.no_grad():  # No gradients are needed for this step
                fake = gen_model(fixed_noise)  # Generate a batch of fake images using fixed noise
                # Create image grids for real and fake images (up to 32 images)
                img_grid_real = torchvision.utils.make_grid(real[:32], normalize=True)
                img_grid_fake = torchvision.utils.make_grid(fake[:32], normalize=True)

                # Add images to TensorBoard (real and fake images for visualization)
                writer_real.add_image("Real", img_grid_real, global_step=step)
                writer_fake.add_image("Fake", img_grid_fake, global_step=step)

            step += 1

Reusing TensorBoard on port 6006 (pid 5463), started 0:00:48 ago. (Use '!kill 5463' to kill it.)

Tensorboard is running on port 6006
Epoch [0/5] Batch 0/469                   Loss D: 1.0604, loss G: 6.9379
Epoch [0/5] Batch 100/469                   Loss D: 0.2769, loss G: 2.8675
Epoch [0/5] Batch 200/469                   Loss D: 0.5650, loss G: 3.9891
Epoch [0/5] Batch 300/469                   Loss D: 0.3816, loss G: 2.2462
Epoch [0/5] Batch 400/469                   Loss D: 0.3516, loss G: 1.4641
Epoch [1/5] Batch 0/469                   Loss D: 0.4180, loss G: 2.2748
Epoch [1/5] Batch 100/469                   Loss D: 0.5102, loss G: 2.3008
Epoch [1/5] Batch 200/469                   Loss D: 0.4867, loss G: 4.2233
Epoch [1/5] Batch 300/469                   Loss D: 0.3670, loss G: 1.0248
Epoch [1/5] Batch 400/469                   Loss D: 0.5637, loss G: 2.1484
Epoch [2/5] Batch 0/469                   Loss D: 0.4234, loss G: 2.5906
Epoch [2/5] Batch 100/469                   Loss D: 0.3628, loss G: 1.6961
Epoch [2/5] Batch 200/469                   Loss D: 0.3553, loss G: 1.

# Deep Convolutional GAN on CelebA Dataset

In [20]:
# CelebA images are RGB, so the networks work with three channels
img_channels_celeb = 3

# ---- CelebA preprocessing ----
# Resize to img_size x img_size, convert to a tensor, then normalize every
# channel with mean 0.5 and std 0.5 so pixel values land in [-1, 1].
celeb_mean = [0.5] * img_channels_celeb
celeb_std = [0.5] * img_channels_celeb
transformer = transforms.Compose(
    [
        transforms.Resize((img_size, img_size)),
        transforms.ToTensor(),
        transforms.Normalize(celeb_mean, celeb_std),
    ]
)

# ---- Data ----
# Images are read from a local folder through ImageFolder
dataset = datasets.ImageFolder(
    root="./dataset/CelebA",
    transform=transformer,
)
loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# ---- Models ----
# Fresh networks for 3-channel images, built on the selected device and
# initialized with the DCGAN weight initialization
gen_model = Generator(z_dim, img_channels_celeb, features_gen).to(device)
disc_model = Discriminator(img_channels_celeb, features_disc).to(device)
initialize_weights(gen_model)
initialize_weights(disc_model)

# ---- Optimizers and loss ----
# Both networks use Adam with the same learning rate and betas
adam_betas = (0.5, 0.999)
optimizer_gen = optim.Adam(gen_model.parameters(), lr=lr, betas=adam_betas)
optimizer_disc = optim.Adam(disc_model.parameters(), lr=lr, betas=adam_betas)

# Binary cross-entropy on the discriminator's sigmoid outputs
criterion = nn.BCELoss()

# ---- Monitoring ----
# A fixed batch of 32 latent vectors, reused at every logging step so the
# generated samples can be compared over the course of training
fixed_noise = torch.randn(32, z_dim, 1, 1).to(device)

# Separate TensorBoard writers for generated and real image grids
writer_fake = SummaryWriter("runs/DCGAN_CELEB/fake")
writer_real = SummaryWriter("runs/DCGAN_CELEB/real")

In [21]:
!pkill -f "tensorboard"
%tensorboard --logdir=runs/DCGAN_CELEB --bind_all --port=6006
print("Tensorboard is running on port 6006")

# Set the models to training mode
gen_model.train()
disc_model.train()
step = 0

for epoch in range(num_epochs):
    for batch_idx, (real, _) in enumerate(loader):
        # Move the real images to the device (GPU/CPU)
        real = real.to(device)
        
        # Generate random noise to feed into the generator
        noise = torch.randn((batch_size, z_dim, 1, 1)).to(device)
        
        # Generate fake images using the generator
        fake = gen_model(noise)

        # --- Train the Discriminator ---
        # Compute the discriminator's output on real images
        disc_real = disc_model(real).reshape(-1)
        # Calculate the discriminator's loss on real images (maximize log(D(x)))
        loss_disc_real = criterion(disc_real, torch.ones_like(disc_real))
        
        # Compute the discriminator's output on fake images
        disc_fake = disc_model(fake.detach()).reshape(-1)
        # Calculate the discriminator's loss on fake images (maximize log(1 - D(G(z))))
        loss_disc_fake = criterion(disc_fake, torch.zeros_like(disc_fake))
        
        # Total discriminator loss is the average of the real and fake losses
        loss_disc = (loss_disc_real + loss_disc_fake) / 2
        
        disc_model.zero_grad()
        loss_disc.backward()
        optimizer_disc.step()

        # --- Train the Generator ---
        # Compute the discriminator's output on the fake images
        output = disc_model(fake).reshape(-1)
        
        # Generator's objective is to fool the discriminator, so it minimizes log(1 - D(G(z))) 
        # This is equivalent to maximizing log(D(G(z))) in practice.
        loss_gen = criterion(output, torch.ones_like(output))
        
        gen_model.zero_grad()
        loss_gen.backward()
        optimizer_gen.step()

        # Print losses and log images to TensorBoard every 100 batches
        if batch_idx % 100 == 0:
            print(
                f"Epoch [{epoch}/{num_epochs}] Batch {batch_idx}/{len(loader)} \
                  Loss D: {loss_disc:.4f}, loss G: {loss_gen:.4f}"
            )

            # Log real and fake images to TensorBoard using a fixed noise vector
            with torch.no_grad():  # No gradients are needed for this step
                fake = gen_model(fixed_noise)  # Generate a batch of fake images using fixed noise
                # Create image grids for real and fake images (up to 32 images)
                img_grid_real = torchvision.utils.make_grid(real[:32], normalize=True)
                img_grid_fake = torchvision.utils.make_grid(fake[:32], normalize=True)

                # Add images to TensorBoard (real and fake images for visualization)
                writer_real.add_image("Real", img_grid_real, global_step=step)
                writer_fake.add_image("Fake", img_grid_fake, global_step=step)

            step += 1

Tensorboard is running on port 6006
Epoch [0/5] Batch 0/1583                   Loss D: 1.0034, loss G: 7.4550
Epoch [0/5] Batch 100/1583                   Loss D: 0.3456, loss G: 4.4890
Epoch [0/5] Batch 200/1583                   Loss D: 0.5005, loss G: 4.0865
Epoch [0/5] Batch 300/1583                   Loss D: 0.5288, loss G: 4.0607
Epoch [0/5] Batch 400/1583                   Loss D: 0.4331, loss G: 2.2625
Epoch [0/5] Batch 500/1583                   Loss D: 0.4269, loss G: 2.6272
Epoch [0/5] Batch 600/1583                   Loss D: 0.4612, loss G: 2.5881
Epoch [0/5] Batch 700/1583                   Loss D: 0.8033, loss G: 3.0244
Epoch [0/5] Batch 800/1583                   Loss D: 0.6337, loss G: 3.0412
Epoch [0/5] Batch 900/1583                   Loss D: 0.4623, loss G: 2.9223
Epoch [0/5] Batch 1000/1583                   Loss D: 0.5819, loss G: 1.6727
Epoch [0/5] Batch 1100/1583                   Loss D: 0.4537, loss G: 2.1151
Epoch [0/5] Batch 1200/1583                   Loss D

KeyboardInterrupt: 