This model uses a conditional DCGAN architecture: both the generator and the discriminator take a class label alongside their main input.

In [2]:
import numpy as np 
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import torch.nn.functional as F
import torch.optim as optim
import music21
import matplotlib as plt


import torch.backends.cudnn as cudnn

# Hand any cached, currently unused GPU memory back to the driver.
torch.cuda.empty_cache()

# Ask cuDNN to benchmark its convolution algorithms and keep the fastest one.
cudnn.benchmark = True

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


In [3]:
# Run configuration. None of these constants is referenced by the model cells
# visible in this part of the notebook, which set their own local
# `latent_dim` / `num_classes` values.
# presumably the number of samples per training batch — confirm against the training loop
BATCH_SIZE = 32
# presumably the number of passes over the training set — confirm
EPOCHS = 100
# same value as the `latent_dim = 100` used when the generator is instantiated
NOISE_DIM = 100
# NOTE(review): the generator/discriminator cells instantiate with 10 classes,
# not 18 — confirm which value is intended.
NUM_CLASSES = 18



In [72]:


class Generator(nn.Module):
    """Label-conditioned DCGAN generator.

    A latent noise vector is projected to a (128, 8, 16) feature map, a class
    label is embedded and projected to one extra (1, 8, 16) channel, and the
    129-channel result is upsampled by four stride-2 transposed convolutions
    to a single-channel 128x256 map.

    Args:
        latent_dim: Size of the noise vector fed to ``forward``.
        num_classes: Number of distinct class labels the embedding accepts.
    """

    def __init__(self, latent_dim, num_classes):
        super(Generator, self).__init__()

        # Label path: class index -> 50-d embedding -> 8*16 values (one channel).
        self.label_emb = nn.Embedding(num_classes, 50)  # (N, 50)
        self.fc_label = nn.Linear(50, 8 * 16)  # (N, 128)

        # Noise path: latent vector -> 128 channels of 8x16.
        self.fc_noise = nn.Linear(latent_dim, 128 * 8 * 16)  # (N, 128 * 8 * 16)

        # Each transposed conv (kernel 4, stride 2, padding 1) doubles H and W.
        self.conv1 = nn.ConvTranspose2d(129, 128, kernel_size=4, stride=2, padding=1)  # -> 16x32
        self.conv2 = nn.ConvTranspose2d(128, 64, kernel_size=4, stride=2, padding=1)   # -> 32x64
        self.conv3 = nn.ConvTranspose2d(64, 32, kernel_size=4, stride=2, padding=1)    # -> 64x128
        self.conv4 = nn.ConvTranspose2d(32, 1, kernel_size=4, stride=2, padding=1)     # -> 128x256

        self.activation = nn.LeakyReLU(0.2)

    def forward(self, noise, labels):
        """Generate a batch of binary maps.

        Args:
            noise: Float tensor of shape (N, latent_dim).
            labels: Integer tensor of shape (N,) with values in
                ``[0, num_classes)``.

        Returns:
            Float tensor of shape (N, 1, 128, 256) whose values are exactly
            0.0 or 1.0. The tensor stays attached to the autograd graph (see
            the straight-through note below), so losses computed on it can
            update the generator.
        """
        # Label -> one extra 8x16 channel. Viewing straight to (N, 1, 8, 16)
        # yields the same values as view(-1, 8, 16, 1) followed by
        # permute(0, 3, 1, 2).
        label_map = self.fc_label(self.label_emb(labels)).view(-1, 1, 8, 16)

        # Noise -> (N, 128, 8, 16) feature map.
        noise_map = self.activation(self.fc_noise(noise)).view(-1, 128, 8, 16)

        # Stack along the channel axis: (N, 129, 8, 16).
        x = torch.cat((noise_map, label_map), dim=1)

        x = self.activation(self.conv1(x))  # (N, 128, 16, 32)
        x = self.activation(self.conv2(x))  # (N, 64, 32, 64)
        x = self.activation(self.conv3(x))  # (N, 32, 64, 128)

        probs = torch.sigmoid(self.conv4(x))  # (N, 1, 128, 256), values in (0, 1)

        # Hard 0/1 decision. On its own, the comparison cuts the autograd
        # graph, so no gradient could ever reach the generator's parameters.
        hard = (probs > 0.5).float()

        # Straight-through estimator: the forward value equals `hard` exactly
        # (probs + (hard - probs) cancels without rounding error for
        # hard in {0, 1}), while the backward pass uses the gradient of `probs`.
        return probs + (hard - probs).detach()

# Smoke test: build a generator and push a single random sample through it.
latent_dim = 100
num_classes = 10
model = Generator(latent_dim=latent_dim, num_classes=num_classes)

# One noise vector and one randomly drawn class label.
batch_size = 1
noise = torch.randn((batch_size, latent_dim))  # (N, latent_dim)
labels = torch.randint(low=0, high=num_classes, size=(batch_size,))  # (N,)

# Run the generator; the expected shape is (batch_size, 1, 128, 256).
output = model(noise, labels)
print(output.shape)

        


torch.Size([1, 1, 128, 256])


In [73]:


class Discriminator(nn.Module):
    """Label-conditioned DCGAN discriminator.

    The class label is embedded and projected to one extra image-sized
    channel, concatenated with the input image, passed through two stride-2
    convolutions, flattened, and mapped to a single probability.

    Args:
        in_shape: ``(channels, height, width)`` of the images passed to
            ``forward``. Every layer size is derived from it, so shapes other
            than the default also work.
        num_classes: Number of distinct class labels the embedding accepts.
    """

    def __init__(self, in_shape=(1, 128, 256), num_classes=10):
        super(Discriminator, self).__init__()

        channels, height, width = in_shape
        # Kept so forward() can reshape the label plane without hard-coding sizes.
        self.in_shape = (channels, height, width)

        # Label path: class index -> 50-d embedding -> one H*W plane.
        self.label_emb = nn.Embedding(num_classes, 50)  # (N, 50)
        self.fc_label = nn.Linear(50, height * width)  # (N, H * W)

        # Image channels plus the single label channel go into the first conv.
        self.conv1 = nn.Conv2d(channels + 1, 128, kernel_size=3, stride=2, padding=1)
        self.conv2 = nn.Conv2d(128, 128, kernel_size=3, stride=2, padding=1)

        self.dropout = nn.Dropout(0.4)

        # Spatial size after the two stride-2 convolutions:
        # (128, 256) -> (64, 128) -> (32, 64) for the default in_shape.
        feat_h = self._conv_out_size(self._conv_out_size(height))
        feat_w = self._conv_out_size(self._conv_out_size(width))
        self.fc = nn.Linear(128 * feat_h * feat_w, 1)

        self.sigmoid = nn.Sigmoid()

        # Built once here instead of creating a new module on every forward call.
        self.activation = nn.LeakyReLU(0.2)

    @staticmethod
    def _conv_out_size(size):
        """Output length of a kernel-3, stride-2, padding-1 convolution."""
        return (size - 1) // 2 + 1

    def forward(self, img, labels):
        """Score a batch of images conditioned on their labels.

        Args:
            img: Float tensor of shape (N, *in_shape).
            labels: Integer tensor of shape (N,) with values in
                ``[0, num_classes)``.

        Returns:
            Float tensor of shape (N, 1) with values in [0, 1] (sigmoid output).
        """
        _, height, width = self.in_shape

        # Label -> one image-sized channel: (N, 1, H, W).
        label_plane = self.fc_label(self.label_emb(labels)).view(-1, 1, height, width)

        # Stack image and label plane along the channel axis: (N, C + 1, H, W).
        x = torch.cat((img, label_plane), dim=1)

        x = self.activation(self.conv1(x))
        x = self.activation(self.conv2(x))

        # Flatten the feature maps, regularise, then classify.
        x = x.view(x.size(0), -1)
        x = self.dropout(x)
        return self.sigmoid(self.fc(x))

# Smoke test: build a discriminator sized for the generator's 128x256 output.
discriminator = Discriminator(num_classes=10, in_shape=(1, 128, 256))


# A random tensor with the expected input shape (the call below does not use it).
batch_size = 1
image = torch.randn((batch_size, 1, 128, 256))
# `labels` is reused from the generator cell rather than re-sampled here.

# Score the generator's `output`, conditioned on those same labels.
output1 = discriminator(output, labels)

print(output1.shape)  # expected: (batch_size, 1)
print(output1)


torch.Size([1, 1])
tensor([[0.5062]], grad_fn=<SigmoidBackward0>)
