In [None]:
import numpy as np 
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import torch.nn.functional as F
import torch.optim as optim
import music21
import matplotlib as plt
import torchvision.utils as vutils

import torch.backends.cudnn as cudnn
# Drop cached CUDA allocations that an earlier run in this kernel may have left behind.
torch.cuda.empty_cache()
# Let cuDNN benchmark its algorithms and keep the fastest for this hardware.
cudnn.benchmark = True

# Prefer the GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)


In [None]:
# Training configuration: tune values here rather than inside later cells.
BATCH_SIZE = 16      # samples per mini-batch
EPOCHS = 400         # planned number of training epochs
NOISE_DIM = 100      # length of the generator's latent noise vector
NUM_CLASSES = 18     # number of label classes (one per composer)
BETA1 = 0.5          # beta1 hyperparameter for the Adam optimizers
LR = 0.004           # Adam learning rate; might need to adjust
EMBEDDING_DIM = 50   # width of the label embedding vectors

In [None]:


class Generator(nn.Module):
    """Conditional generator: (noise, class label) -> (N, 1, 128, 256) map in [-1, 1].

    The label is embedded and projected to a single 8x16 plane, the noise is
    projected to 128 planes of 8x16, and the 129 stacked planes are upsampled
    by four stride-2 transposed convolutions (8x16 -> 128x256). The final
    activation is Tanh.

    Args:
        noise_dim: Length of the latent noise vector. Defaults to ``NOISE_DIM``.
        num_classes: Number of label classes. Defaults to ``NUM_CLASSES``.
        embedding_dim: Width of the label embedding. Defaults to ``EMBEDDING_DIM``.

    The defaults are looked up when the generator is constructed, so
    ``Generator()`` keeps using the notebook's configuration cell.
    """

    def __init__(self, noise_dim=None, num_classes=None, embedding_dim=None):
        super().__init__()

        # Resolve unset sizes from the notebook-level configuration.
        noise_dim = NOISE_DIM if noise_dim is None else noise_dim
        num_classes = NUM_CLASSES if num_classes is None else num_classes
        embedding_dim = EMBEDDING_DIM if embedding_dim is None else embedding_dim

        # Label branch: class index -> embedding -> one 8x16 plane.
        self.label_emb = nn.Embedding(num_classes, embedding_dim)  # (N, embedding_dim)
        self.fc_label = nn.Linear(embedding_dim, 8 * 16)           # (N, 128)

        # Noise branch: latent vector -> 128 planes of 8x16.
        self.fc_noise = nn.Linear(noise_dim, 128 * 8 * 16)         # (N, 128 * 8 * 16)

        # Each transposed convolution (kernel 4, stride 2, padding 1) doubles H and W.
        self.conv1 = nn.ConvTranspose2d(129, 128, kernel_size=4, stride=2, padding=1)  # -> 16x32
        self.conv2 = nn.ConvTranspose2d(128, 64, kernel_size=4, stride=2, padding=1)   # -> 32x64
        self.conv3 = nn.ConvTranspose2d(64, 32, kernel_size=4, stride=2, padding=1)    # -> 64x128
        self.conv4 = nn.ConvTranspose2d(32, 1, kernel_size=4, stride=2, padding=1)     # -> 128x256

        # BatchNorm2d needs the channel count of the tensor it normalises, so
        # one instance per hidden stage (the argument-less call raised TypeError).
        self.bn1 = nn.BatchNorm2d(128)
        self.bn2 = nn.BatchNorm2d(64)
        self.bn3 = nn.BatchNorm2d(32)

        # LeakyReLU with slope 0.2. nn.ReLU(0.2) would treat 0.2 as its
        # ``inplace`` flag and behave as a plain in-place ReLU.
        self.activation = nn.LeakyReLU(0.2)
        self.tanh_layer = nn.Tanh()

    def forward(self, noise, labels):
        """Generate one map per (noise, label) pair.

        Args:
            noise: Float tensor of shape (N, noise_dim).
            labels: Integer class indices of shape (N,).

        Returns:
            Tensor of shape (N, 1, 128, 256) with values in [-1, 1].
        """
        # Label -> one conditioning plane, (N, 1, 8, 16).
        label_plane = self.fc_label(self.label_emb(labels)).view(-1, 1, 8, 16)

        # Noise -> 128 feature planes, (N, 128, 8, 16).
        noise_planes = self.activation(self.fc_noise(noise)).view(-1, 128, 8, 16)

        # Stack along the channel axis: (N, 129, 8, 16).
        x = torch.cat((noise_planes, label_plane), dim=1)

        x = self.activation(self.bn1(self.conv1(x)))  # (N, 128, 16, 32)
        x = self.activation(self.bn2(self.conv2(x)))  # (N, 64, 32, 64)
        x = self.activation(self.bn3(self.conv3(x)))  # (N, 32, 64, 128)

        # No normalisation on the output stage; Tanh bounds it to [-1, 1].
        return self.tanh_layer(self.conv4(x))         # (N, 1, 128, 256)

# Disabled scratch code, kept as a bare string literal so none of it executes:
# first a shape smoke test for Generator, then an alternative Generator
# definition. NOTE(review): the smoke test draws labels from 10 classes rather
# than NUM_CLASSES; confirm which is intended before re-enabling it.
"""
# Instantiate the model
latent_dim = 100
num_classes = 10
model = Generator()

# Example input (noise and labels)
batch_size = 1
noise = torch.randn(batch_size, latent_dim)  # (N, latent_dim)
labels = torch.randint(0, num_classes, (batch_size,))  # (N,)

# Forward pass
output = model(noise, labels)
print(output.shape)  # Should be (batch_size, 1, 128, 256)


class Generator(nn.Module):
    def __init__(self):
        super(Generator, self).__init__()

        # Define the embedding layer separately
        self.label_conditioned_generator = nn.Embedding(NUM_CLASSES, EMBEDDING_DIM)
        
        self.latent = nn.Sequential(
            nn.Linear(NOISE_DIM, 4 * 8 * 512),
            nn.LeakyReLU(0.2, inplace=True)
        )

        self.model = nn.Sequential(
            nn.ConvTranspose2d(512 + 1, 64 * 8, 4, 2, 1, bias=False),
            nn.BatchNorm2d(64 * 8, momentum=0.1, eps=0.8),
            nn.ReLU(True),
            nn.ConvTranspose2d(64 * 8, 64 * 4, 4, 2, 1, bias=False),
            nn.BatchNorm2d(64 * 4, momentum=0.1, eps=0.8),
            nn.ReLU(True),
            nn.ConvTranspose2d(64 * 4, 64 * 2, 4, 2, 1, bias=False),
            nn.BatchNorm2d(64 * 2, momentum=0.1, eps=0.8),
            nn.ReLU(True),
            nn.ConvTranspose2d(64 * 2, 64 * 1, 4, 2, 1, bias=False),
            nn.BatchNorm2d(64 * 1, momentum=0.1, eps=0.8),
            nn.ReLU(True),
            nn.ConvTranspose2d(64 * 1, 1, 4, 2, 1, bias=False),
            nn.Tanh()
        )

    def forward(self, inputs):
        noise_vector, label = inputs
        label_output = self.label_conditioned_generator(label)
        label_output = label_output.view(-1, 1, 4, 8)
        latent_output = self.latent(noise_vector)
        latent_output = latent_output.view(-1, 512, 4, 8)
        concat = torch.cat((latent_output, label_output), dim=1)
        image = self.model(concat)
        return image"""


In [None]:


class Discriminator(nn.Module):
    """Conditional discriminator: scores (image, class label) pairs as real or fake.

    The label is embedded, projected to one H x W plane and stacked onto the
    image as an extra channel. Two stride-2 convolutions, dropout and a linear
    layer followed by a sigmoid then produce one probability per sample.

    Args:
        in_shape: ``(channels, height, width)`` of the input images. Every
            layer size is derived from it, so shapes other than the default
            (1, 128, 256) work as well.
        num_classes: Number of label classes accepted by the embedding.
    """

    def __init__(self, in_shape=(1, 128, 256), num_classes=NUM_CLASSES):
        super(Discriminator, self).__init__()

        channels, height, width = in_shape
        self.in_shape = (channels, height, width)

        # Label branch: class index -> embedding -> one plane the size of the image.
        self.label_emb = nn.Embedding(num_classes, EMBEDDING_DIM)   # (N, EMBEDDING_DIM)
        self.fc_label = nn.Linear(EMBEDDING_DIM, height * width)    # (N, H * W)

        # The convolutions see the image channels plus the single label plane.
        self.conv1 = nn.Conv2d(channels + 1, 128, kernel_size=3, stride=2, padding=1)
        self.conv2 = nn.Conv2d(128, 128, kernel_size=3, stride=2, padding=1)

        # Built once here instead of on every forward call.
        self.leaky_relu = nn.LeakyReLU(0.2)

        # Dropout layer for regularization
        self.dropout = nn.Dropout(0.4)

        # Size of the flattened feature map after both stride-2 convolutions.
        feat_h = self._conv_out_size(self._conv_out_size(height))
        feat_w = self._conv_out_size(self._conv_out_size(width))
        self.fc = nn.Linear(128 * feat_h * feat_w, 1)

        self.sigmoid = nn.Sigmoid()

    @staticmethod
    def _conv_out_size(size, kernel_size=3, stride=2, padding=1):
        """Output length of one spatial axis after a Conv2d with these settings."""
        return (size + 2 * padding - kernel_size) // stride + 1

    def forward(self, img, labels):
        """Score a batch of images.

        Args:
            img: Tensor of shape (N, *in_shape).
            labels: Integer class indices of shape (N,).

        Returns:
            Tensor of shape (N, 1) with values in (0, 1) (sigmoid output).
        """
        _, height, width = self.in_shape

        # Label -> plane with the image's spatial size, (N, 1, H, W).
        label_plane = self.fc_label(self.label_emb(labels)).view(-1, 1, height, width)

        # Stack the label plane onto the image along the channel axis.
        x = torch.cat((img, label_plane), dim=1)

        x = self.leaky_relu(self.conv1(x))  # spatial size halved
        x = self.leaky_relu(self.conv2(x))  # halved again

        # Flatten, regularise, then map to a single probability.
        x = self.dropout(x.view(x.size(0), -1))
        return self.sigmoid(self.fc(x))


# Smoke test: push one random image/label pair through a fresh discriminator.
discriminator = Discriminator(in_shape=(1, 128, 256), num_classes=10)


# Example inputs: random image and label.
# Note that ``output`` here is the *input* image; ``output1`` is the score.
batch_size = 1
output = torch.randn(batch_size, 1, 128, 256)  # Random image input
# Draw labels from the range this discriminator's embedding actually accepts.
labels = torch.randint(0, discriminator.label_emb.num_embeddings, (batch_size,))


# Forward pass
output1 = discriminator(output, labels)

# Print the shape of the discriminator's result, not of the input image.
print(output1.shape)  # Output should be (batch_size, 1)
print(output1)



In [None]:
from torch.utils.data import DataLoader, TensorDataset

# Load the pre-built input/label tensors onto the CPU; batches are moved to
# ``device`` inside the training loop. ``map_location`` keeps this working on a
# CPU-only machine even if the tensors were saved from a GPU.
# NOTE(review): torch.load unpickles the file, so only load files from a
# trusted source.
inputs_seq = torch.load("Input_tensors.pt", map_location="cpu")
labels_seq = torch.load("Labels_tensors.pt", map_location="cpu")
dataset = TensorDataset(inputs_seq, labels_seq)

# Split into batches. The batch size comes from the configuration constant so
# there is a single value to edit.
batch_size = BATCH_SIZE
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Check that the data is loaded correctly
print("Number of input pianorolls: ", len(dataset))



        

In [None]:
# custom weights initialization called on ``netG`` and ``netD``
def weights_init(m):
    """Custom weight initializer, intended to be passed to ``Module.apply``.

    Dispatches on the class name of ``m``:
      * name contains "Conv": weights are drawn from N(0.0, 0.02);
      * otherwise, name contains "BatchNorm": weights are drawn from
        N(1.0, 0.02) and the bias is set to 0;
      * any other module is left untouched.
    """
    layer_kind = type(m).__name__

    if 'Conv' in layer_kind:
        nn.init.normal_(m.weight.data, mean=0.0, std=0.02)
        return

    if 'BatchNorm' in layer_kind:
        nn.init.normal_(m.weight.data, mean=1.0, std=0.02)
        nn.init.constant_(m.bias.data, val=0)

In [None]:
# Loss function and the target values fed to it.
criterion = nn.BCELoss()
real_label, fake_label = 0.9, 0

# Build both networks and move them to the selected device.
netG = Generator().to(device)
netD = Discriminator().to(device)

# Noise batch that is reused whenever the generator's progress is sampled.
fixed_noise = torch.randn(BATCH_SIZE, NOISE_DIM, device=device)

# One Adam optimizer per network, sharing the same hyperparameters.
optimizerG = optim.Adam(params=netG.parameters(), lr=LR, betas=(BETA1, 0.999))
optimizerD = optim.Adam(params=netD.parameters(), lr=LR, betas=(BETA1, 0.999))

# Re-initialise the weights of both networks with weights_init.
netG.apply(weights_init)
netD.apply(weights_init)

print("Generator model: ", netG)
print("Discriminator model: ", netD)

In [None]:

# Lists to keep track of progress
output_list = []   # generator outputs on fixed_noise, stored as CPU tensors
G_losses = []
D_losses = []
iters = 0          # optimisation steps actually taken (skipped batches excluded)

# NOTE(review): the configuration cell defines EPOCHS, but this loop uses its
# own epoch count; confirm which one is intended.
num_epochs = 10

# Debugging notes:
#   * Data is being loaded correctly, yet training keeps blowing up and does
#     not work.
#   * Expected behaviour: D(x) should start close to 1 (predicting everything
#     as real) before settling around 0.5, and D(G(z)) should start around 0
#     (everything picked as fake) before sitting at 0.5. At that point the
#     discriminator can't determine which is real and which is fake, so the
#     generator has been trained sufficiently.
#   * Suspected cause: errors in the data or in the data normalization process.
#     NOTE(review): the generator ends in Tanh, so its outputs lie in [-1, 1];
#     confirm the real pianorolls are scaled to the same range.

# Labels paired with fixed_noise for every snapshot. They are captured once,
# from the first full batch, so successive snapshots differ only because netG
# has changed, not because a different batch supplied the labels.
fixed_labels = None
last_snapshot_iter = -1


def _snapshot_generator():
    """Return netG's output on (fixed_noise, fixed_labels) as a CPU tensor, without grad."""
    with torch.no_grad():
        return netG(fixed_noise, fixed_labels).detach().cpu()


print("Starting Training Loop...")
# For each epoch
for epoch in range(num_epochs):
    # For each batch in the dataloader
    for i, data in enumerate(dataloader):

        # Format batch: float32, with a channel axis inserted at dim 1.
        real_cpu = data[0].to(device).float().unsqueeze(1)
        b_size = real_cpu.size(0)

        # Only full batches are trained on.
        if b_size != BATCH_SIZE:
            print(f"Skipping batch {i} due to insufficient size: {b_size}")
            continue

        # Generator and discriminator expect integer labels
        real_label_from_data = data[1].to(device).int()
        if fixed_labels is None:
            fixed_labels = real_label_from_data.clone()

        ############################
        # (1) Update D network: maximize log(D(x)) + log(1 - D(G(z)))
        ###########################
        ## Train with all-real batch
        netD.zero_grad()

        # Target tensor filled with the (smoothed) real label for the loss.
        label = torch.full((b_size,), real_label, dtype=torch.float, device=device)

        # Forward pass real batch through D
        output = netD(real_cpu, real_label_from_data).view(-1)

        # Loss and gradients on the all-real batch
        errD_real = criterion(output, label)
        errD_real.backward()
        D_x = output.mean().item()

        ## Train with all-fake batch
        # Generate batch of latent vectors
        noise = torch.randn(b_size, NOISE_DIM, dtype=torch.float, device=device)

        # Generate fake image batch with G, conditioned on the batch's labels
        fake = netG(noise, real_label_from_data)
        label.fill_(fake_label)
        # Classify the fake batch with D; detach so no gradient reaches G here
        output = netD(fake.detach(), real_label_from_data).view(-1)
        # Gradients accumulate (sum) with those of the real batch
        errD_fake = criterion(output, label)
        errD_fake.backward()
        D_G_z1 = output.mean().item()
        # Compute error of D as sum over the fake and the real batches
        errD = errD_real + errD_fake
        # Update D
        optimizerD.step()

        ############################
        # (2) Update G network: maximize log(D(G(z)))
        ###########################
        netG.zero_grad()
        label.fill_(real_label)  # fake labels are real for generator cost
        # Since we just updated D, perform another forward pass of all-fake batch through D
        output = netD(fake, real_label_from_data).view(-1)
        # Calculate G's loss based on this output
        errG = criterion(output, label)
        # Calculate gradients for G
        errG.backward()
        D_G_z2 = output.mean().item()
        # Update G
        optimizerG.step()

        # Output training stats
        if i % 2 == 0:
            print('[%d/%d][%d/%d]\tLoss_D: %.4f\tLoss_G: %.4f\tD(x): %.4f\tD(G(z)): %.4f / %.4f'
                  % (epoch, num_epochs, i, len(dataloader),
                     errD.item(), errG.item(), D_x, D_G_z1, D_G_z2))

        # Save Losses for plotting later
        G_losses.append(errG.item())
        D_losses.append(errD.item())

        # Check how the generator is doing by saving G's output on fixed_noise
        if iters % 8 == 0:
            output_list.append(_snapshot_generator())
            last_snapshot_iter = iters

        iters += 1

# Always record the generator's final state. Doing this after the loops means
# the snapshot is not lost when the last batch of the last epoch is a partial
# batch that the loop skips.
if iters > 0 and last_snapshot_iter != iters - 1:
    output_list.append(_snapshot_generator())

In [None]:
import matplotlib.pyplot as plt
import matplotlib.animation as animation
# Loss curves for both networks, one point per recorded training step.
fig, ax = plt.subplots(figsize=(10, 5))
ax.set_title("Generator and Discriminator Loss During Training")
ax.plot(G_losses, label="G")
ax.plot(D_losses, label="D")
ax.set_xlabel("iterations")
ax.set_ylabel("Loss")
ax.legend()
plt.show()