### Goal of project:
This project implements the DCGAN architecture, in which a generator learns to create new MNIST-like images from a given latent vector.

[Paper Link](https://arxiv.org/pdf/1511.06434.pdf)


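Here, the generator draws a random latent vector of the configured size (`sample_size`).

A linear layer transforms it into a 784-dimensional vector, which is then reshaped to 16×7×7 and upsampled to a 1×28×28 image by two transposed convolutions.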


In [1]:
import numpy as np
import matplotlib.pyplot as plt
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
from torchvision.utils import make_grid
from tqdm import tqdm
import os

## General Configuration

In [2]:
# --- Shared settings ---
batch_size = 64      # number of images per training batch

# --- Generator settings ---
sample_size = 100    # length of the latent noise vector
g_alpha = 0.01       # LeakyReLU negative slope used by the generator
g_lr = 1e-3          # Adam learning rate for the generator

# --- Discriminator settings ---
d_alpha = 0.01       # LeakyReLU negative slope used by the discriminator
d_lr = 1e-4          # Adam learning rate for the discriminator

In [3]:

# Generator network with transposed convolutions
class Generator(nn.Module):
    """Generate 1x28x28 images from random latent vectors.

    Pipeline: latent vector (sample_size) -> Linear -> 784 features
    -> reshape to 16x7x7 -> ConvTranspose2d -> 32x14x14
    -> ConvTranspose2d -> 1x28x28 -> sigmoid.

    Args:
        sample_size: Length of the latent noise vector drawn per image.
        alpha: Negative slope of the LeakyReLU activations.
    """

    def __init__(self, sample_size: int, alpha: float):
        super().__init__()

        self.alpha = alpha

        # sample_size => 784
        self.l1 = nn.Linear(sample_size, 784)
        self.bn1 = nn.BatchNorm1d(784)

        # 16x7x7 => 32x14x14
        # out = (in - 1) * stride - 2 * padding + kernel + output_padding
        #     = (7 - 1) * 2 - 4 + 5 + 1 = 14
        self.conv1 = nn.ConvTranspose2d(16, 32,
                               kernel_size=5, stride=2, padding=2,
                               output_padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(32)

        # 32x14x14 => 1x28x28 (same formula: (14 - 1) * 2 - 4 + 5 + 1 = 28)
        self.conv2 = nn.ConvTranspose2d(32, 1,
                               kernel_size=5, stride=2, padding=2,
                               output_padding=1, bias=False)

        # Random value vector size
        self.sample_size = sample_size

    def forward(self, batch_size: int) -> torch.Tensor:
        """Sample `batch_size` latent vectors and decode them into images.

        Args:
            batch_size: Number of images to generate.

        Returns:
            Tensor of shape (batch_size, 1, 28, 28) with values in [0, 1].
        """
        # Draw the noise with the same device and dtype as the weights so the
        # model keeps working after `.to(device)` or `.double()`.
        weight = self.l1.weight
        z = torch.randn(batch_size, self.sample_size,
                        device=weight.device, dtype=weight.dtype)

        # The linear layer expands the latent vector to 784 features.
        x = F.leaky_relu(self.bn1(self.l1(z)), self.alpha)

        # 784 features are viewed as 16 feature maps of size 7x7.
        x = x.reshape(-1, 16, 7, 7)

        # 16x7x7 -> 32x14x14; the spatial doubling comes from stride=2.
        x = F.leaky_relu(self.bn2(self.conv1(x)), self.alpha)

        # 32x14x14 -> 1x28x28; sigmoid squashes the pixels into [0, 1].
        return torch.sigmoid(self.conv2(x))

## Visualize Generator Architecture

In [4]:
# Smoke test: build a small generator (latent size 4, LeakyReLU slope 0.5)
# and print one batch of 5 generated images.
a = Generator(sample_size=4, alpha=0.5)
print(a(5))


tensor([[[[0.5550, 0.3139, 0.5574,  ..., 0.6970, 0.4372, 0.5643],
          [0.7290, 0.4817, 0.4791,  ..., 0.5538, 0.5126, 0.4993],
          [0.6033, 0.3975, 0.5155,  ..., 0.4286, 0.5637, 0.4460],
          ...,
          [0.6212, 0.6079, 0.2619,  ..., 0.5273, 0.5170, 0.4668],
          [0.5250, 0.4222, 0.3725,  ..., 0.5432, 0.4824, 0.4563],
          [0.5132, 0.5646, 0.5117,  ..., 0.5214, 0.5228, 0.5218]]],


        [[[0.6906, 0.4980, 0.4349,  ..., 0.4163, 0.6048, 0.5199],
          [0.5561, 0.5428, 0.4523,  ..., 0.6602, 0.5158, 0.5214],
          [0.5430, 0.4391, 0.5007,  ..., 0.4026, 0.3523, 0.5505],
          ...,
          [0.5953, 0.5025, 0.2096,  ..., 0.5335, 0.5460, 0.4824],
          [0.3004, 0.3751, 0.3450,  ..., 0.5677, 0.4928, 0.4548],
          [0.5130, 0.5716, 0.2769,  ..., 0.4182, 0.5240, 0.5058]]],


        [[[0.4692, 0.4366, 0.2013,  ..., 0.1029, 0.5057, 0.4172],
          [0.5711, 0.5580, 0.6006,  ..., 0.4828, 0.4866, 0.6276],
          [0.7408, 0.9064, 0.3710,  ..

In [5]:
# Discriminator network with convolutions
# The discriminator's goal is to classify whether an image is real or fake; its forward pass returns the loss against the given targets.
class Discriminator(nn.Module):
    """Score 1x28x28 images as real or fake and return the resulting loss.

    Strided convolutions do the downsampling (no max pooling), followed by a
    hidden linear layer and a single-logit output layer.

    Args:
        alpha: Negative slope of the LeakyReLU activations.
    """

    def __init__(self, alpha: float):
        super().__init__()

        # 1x28x28 => 32x14x14
        self.conv1 = nn.Conv2d(1, 32, kernel_size=5, stride=2,
                               padding=2, bias=False)

        # 32x14x14 => 16x7x7
        self.conv2 = nn.Conv2d(32, 16, kernel_size=5, stride=2,
                               padding=2, bias=False)
        self.bn1 = nn.BatchNorm2d(16)

        # 16 * 7 * 7 = 784 flattened features => 784 hidden units
        self.l1 = nn.Linear(784, 784)
        self.bn2 = nn.BatchNorm1d(784)

        # 784 hidden units => one real/fake logit
        self.l2 = nn.Linear(784, 1)
        self.alpha = alpha

    def forward(self, images: torch.Tensor, targets: torch.Tensor):
        """Return the binary cross-entropy loss of `images` against `targets`.

        Args:
            images: Batch of images with one channel of size 28x28.
            targets: Per-image labels with the same shape as the logits,
                i.e. (batch, 1).

        Returns:
            Scalar loss tensor from `binary_cross_entropy_with_logits`.
        """
        # Strided convolution shrinks 28x28 to 14x14 while learning its own
        # downsampling, replacing a fixed pooling step.
        features = F.leaky_relu(self.conv1(images), self.alpha)

        # Second strided convolution: 14x14 -> 7x7 with 16 channels.
        features = F.leaky_relu(self.bn1(self.conv2(features)), self.alpha)

        # Flatten everything except the batch dimension (784 values each).
        flat = torch.flatten(features, start_dim=1)

        hidden = F.leaky_relu(self.bn2(self.l1(flat)), self.alpha)

        # One raw logit per image; the sigmoid is folded into the loss.
        logits = self.l2(hidden)

        return F.binary_cross_entropy_with_logits(logits, targets)

## Visualize Discriminator Architecture

In [6]:
# Smoke test: run two random 1x28x28 images labelled as fake (zeros)
# through a fresh discriminator and print the resulting loss.
des = Discriminator(alpha=0.5)

images = torch.randn(2, 1, 28, 28)
targets = torch.zeros(2, 1)
print(des(images, targets))

tensor(0.7647, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)


## MNIST data loader

In [7]:
# Load the MNIST training split through torchvision (downloaded to ./data on first use).
# ToTensor converts each image to a float tensor scaled to [0, 1].
transform = transforms.ToTensor()
dataset = datasets.MNIST(root='./data', train=True, download=True, transform=transform)

# shuffle=True: reshuffle every epoch so the networks do not see identical batches in a fixed order.
# drop_last=True: every batch has exactly batch_size images, matching the fixed-size target tensors.
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, drop_last=True)

In [8]:
# function to save the image
def save_image_grid(epoch: int, images: torch.Tensor, ncol: int, file_dir: str = "data") -> None:
    """Arrange a batch of images in a grid and save it as a JPEG file.

    Args:
        epoch: Epoch index, used in the file name `generated_<epoch>.jpg`.
        images: Batch of images accepted by `make_grid`.
        ncol: Number of images placed in each row of the grid.
        file_dir: Output directory; created if it does not exist.
    """
    os.makedirs(file_dir, exist_ok=True)

    # Detach first so the NumPy conversion below cannot fail when `images`
    # still carries autograd history (e.g. a generator output).
    image_grid = make_grid(images.detach(), nrow=ncol)  # Images in a grid
    image_grid = image_grid.permute(1, 2, 0)            # Move channel last
    image_grid = image_grid.cpu().numpy()               # To Numpy

    plt.imshow(image_grid)
    plt.xticks([])
    plt.yticks([])
    plt.savefig(f'{file_dir}/generated_{epoch:03d}.jpg')
    plt.close()  # release the figure so figures do not pile up across epochs

- The real targets are all ones, while the fake targets are all zeros.
- Both networks are optimized with the Adam optimizer.
- In each step, the discriminator is trained first; the generator is then trained against the discriminator's output on generated images.
- At the end of each epoch, a batch of random images is generated with the generator and saved to the `data/` folder.

In [9]:
# Constant label tensors reused for every batch: 1 = real, 0 = fake
real_targets = torch.ones((batch_size, 1))
fake_targets = torch.zeros((batch_size, 1))

# The two adversarial networks
generator = Generator(sample_size=sample_size, alpha=g_alpha)
discriminator = Discriminator(alpha=d_alpha)

# One Adam optimizer per network, each with its own learning rate
d_optimizer = torch.optim.Adam(discriminator.parameters(), lr=d_lr)
g_optimizer = torch.optim.Adam(generator.parameters(), lr=g_lr)


def train_descriminator():
    """Run one discriminator update and return its loss as a float.

    Reads the module-level `images` batch assigned by the training loop,
    plus the global networks, targets and optimizer. The loss is the sum of
    the loss on real images (labelled real) and on generated images
    (labelled fake).
    """
    discriminator.train()

    # Real MNIST images should be classified as real.
    real_loss = discriminator(images, real_targets)

    # Sample fakes with the generator in eval mode and without gradients:
    # this step must only update the discriminator.
    generator.eval()
    with torch.no_grad():
        fakes = generator(batch_size)

    # Generated images should be classified as fake.
    fake_loss = discriminator(fakes, fake_targets)

    total_loss = real_loss + fake_loss

    # Gradient step on the discriminator parameters only.
    d_optimizer.zero_grad()
    total_loss.backward()
    d_optimizer.step()

    return total_loss.item()

def train_generator():
    """Run one generator update and return its loss as a float.

    The generator is rewarded when the discriminator labels its images as
    real, so the loss is computed against `real_targets`.
    """
    # Sample a batch with the generator in train mode so gradients flow.
    generator.train()
    fakes = generator(batch_size)

    # The discriminator is deliberately NOT switched to eval mode here:
    # generated images change drastically between epochs, which makes
    # batchnorm unstable in eval mode.
    fool_loss = discriminator(fakes, real_targets)

    # Gradient step on the generator parameters only.
    g_optimizer.zero_grad()
    fool_loss.backward()
    g_optimizer.step()

    return fool_loss.item()

# Number of passes over the training set
num_epochs = 100

for epoch in range(num_epochs):

    # Per-batch losses collected for the epoch average
    d_losses = []
    g_losses = []

    # `images` must keep this name: train_descriminator() reads it as a global.
    # The digit labels are not needed for GAN training.
    for images, labels in tqdm(dataloader):

        # Alternate one discriminator step and one generator step per batch
        d_loss = train_descriminator()
        g_loss = train_generator()

        # Keep losses for logging
        d_losses.append(d_loss)
        g_losses.append(g_loss)

    # Print average losses
    print(epoch, np.mean(d_losses), np.mean(g_losses))

    # Save images. Sampling for a snapshot needs no gradients, so skip
    # building an autograd graph and hand the saver a plain tensor.
    with torch.no_grad():
        samples = generator(batch_size)
    save_image_grid(epoch, samples, ncol=8)

100%|██████████| 937/937 [00:18<00:00, 50.24it/s]


0 0.9215868586473557 1.1129864389384951


100%|██████████| 937/937 [00:18<00:00, 51.74it/s]


1 0.9431092013925982 1.0928135266171703


100%|██████████| 937/937 [00:18<00:00, 49.53it/s]


2 0.8891290809835924 1.1588647991514256


100%|██████████| 937/937 [00:19<00:00, 48.11it/s]


3 0.8256860332148053 1.2514866931461346


100%|██████████| 937/937 [00:19<00:00, 47.96it/s]


4 0.7860277973854937 1.324696316217791


 87%|████████▋ | 813/937 [00:19<00:02, 42.36it/s]


KeyboardInterrupt: 