In [22]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import music21 as m21
import torch.nn.functional as F

import os

# Load the saved PyTorch tensors (input piano rolls)
inputs = torch.load("input_tensors.pt")
labels = torch.load("label_tensors.pt")

# Check the shape of the tensors
print("Input tensor shape:", inputs.shape)
print("Label tensor shape:", labels.shape)

# Set device to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
inputs = inputs.to(device)
labels = labels.to(device)


  inputs = torch.load("input_tensors.pt")


Input tensor shape: torch.Size([263, 128, 512])
Label tensor shape: torch.Size([263])


  labels = torch.load("label_tensors.pt")


In [23]:
# Hyperparameters
lr = 0.0002
batch_size = 64
piano_roll_shape = (128, 512)  # MIDI pitch (128) by time steps (128)
nz = 100  # Size of latent vector (noise)
ngf = 64  # Generator feature map size
ndf = 64  # Discriminator feature map size
num_epochs = 100


In [24]:
from torch.utils.data import DataLoader, TensorDataset

# Create a DataLoader from the input tensors
dataset = TensorDataset(inputs, labels)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

print("DataLoader created with batch size:", batch_size)


DataLoader created with batch size: 64


In [25]:
nc = 1  # Number of channels in the input images

In [48]:
import torch
import torch.nn as nn

class Generator(nn.Module):
    def __init__(self, nz, ngf, nc):
        super(Generator, self).__init__()
        self.main = nn.Sequential(
            # Input: [batch_size, nz, 1, 1], nz is the size of the latent vector

            # First layer: Transpose Conv to 1024 feature maps, 4x4 output
            nn.ConvTranspose2d(nz, ngf * 8, kernel_size=8, stride=1, padding=0, bias=False),
            nn.BatchNorm2d(ngf * 8),
            nn.ReLU(True),  # Output: [batch_size, ngf*8, 4, 4]

            # Second layer: Transpose Conv to 512 feature maps, 8x8 output
            nn.ConvTranspose2d(ngf * 8, ngf * 4, kernel_size=4, stride=2, padding=1, bias=False),
            nn.BatchNorm2d(ngf * 4),
            nn.ReLU(True),  # Output: [batch_size, ngf*4, 8, 8]

            # Third layer: Transpose Conv to 256 feature maps, 16x32 output
            nn.ConvTranspose2d(ngf * 4, ngf * 2, kernel_size=(4), stride=(2), padding=(1), bias=False),
            nn.BatchNorm2d(ngf * 2),
            nn.ReLU(True),  # Output: [batch_size, ngf*2, 16, 32]

            
            # Fourth layer: Transpose Conv to 128 feature maps, 32x128 output
            nn.ConvTranspose2d(ngf * 2, ngf, kernel_size=(4), stride=(2), padding=(1), bias=False),
            nn.BatchNorm2d(ngf),
            nn.ReLU(True),  # Output: [batch_size, ngf, 32, 128]

            # Fifth layer: Transpose Conv to 1 channel, 128x512 output
            nn.ConvTranspose2d(ngf, nc, kernel_size=(4), stride=(2), padding=(1), bias=False),
            nn.Tanh()  # Output: [batch_size, 1, 128, 512]
        )

    def forward(self, input):
        print(f"Input noise shape: {input.shape}")  # Print the initial input shape (latent vector)
        
        x = self.main[0](input)  # First layer
        print(f"After first layer (ConvTranspose2d): {x.shape}")
        
        x = self.main[1](x)  # First BatchNorm
        x = self.main[2](x)  # First ReLU
        print(f"After first ReLU: {x.shape}")
        
        x = self.main[3](x)  # Second layer
        print(f"After second layer (ConvTranspose2d): {x.shape}")
        
        x = self.main[4](x)  # Second BatchNorm
        x = self.main[5](x)  # Second ReLU
        print(f"After second ReLU: {x.shape}")
        
        x = self.main[6](x)  # Third layer
        print(f"After third layer (ConvTranspose2d): {x.shape}")
        
        x = self.main[7](x)  # Third BatchNorm
        x = self.main[8](x)  # Third ReLU
        print(f"After third ReLU: {x.shape}")
        
        x = self.main[9](x)  # Fourth layer
        print(f"After fourth layer (ConvTranspose2d): {x.shape}")
        
        x = self.main[10](x)  # Fourth BatchNorm
        x = self.main[11](x)  # Fourth ReLU
        print(f"After fourth ReLU: {x.shape}")
        
        x = self.main[12](x)  # Fifth layer (final ConvTranspose2d)
        print(f"After fifth layer (ConvTranspose2d): {x.shape}")
        
        x = self.main[13](x)  # Final Tanh activation
        print(f"After final Tanh: {x.shape}")
        
        return x


In [27]:
class PianoRollDiscriminator(nn.Module):
    def __init__(self, y_dim, pitch_range, time_steps=128):
        super(PianoRollDiscriminator, self).__init__()
        self.df_dim = 64
        self.y_dim = y_dim
        self.pitch_range = pitch_range
        self.time_steps = time_steps

        # Define the layers for the discriminator
        self.conv1 = nn.Conv2d(1 + y_dim, self.df_dim, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))
        self.conv2 = nn.Conv2d(self.df_dim, self.df_dim * 2, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))
        self.conv3 = nn.Conv2d(self.df_dim * 2, self.df_dim * 4, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))

        self.fc1 = nn.Linear(self.df_dim * 4 * (time_steps // 8) * (pitch_range // 8), 1024)
        self.fc2 = nn.Linear(1024 + y_dim, 1)

        self.sigmoid = nn.Sigmoid()

    def forward(self, x, y):
        # Concatenate piano roll x and label y
        yb = y.view(y.size(0), self.y_dim, 1, 1)
        yb = yb.repeat(1, 1, x.size(2), x.size(3))
        x = torch.cat([x, yb], dim=1)  # Concatenate along the channel dimension

        # Pass through convolutional layers
        h1 = F.leaky_relu(self.conv1(x), 0.2)
        h2 = F.leaky_relu(self.conv2(h1), 0.2)
        h3 = F.leaky_relu(self.conv3(h2), 0.2)

        # Flatten and pass through fully connected layers
        h3 = h3.view(h3.size(0), -1)  # Flatten
        h4 = F.leaky_relu(self.fc1(h3))
        h4 = torch.cat([h4, y], dim=1)  # Concatenate labels again

        # Output layer
        h5 = self.sigmoid(self.fc2(h4))

        return h5


Generated data shape: torch.Size([64, 1, 64, 64])


In [49]:
# Parameters
nz = 100  # Size of latent vector
ngf = 64  # Generator feature map size
nc = 1    # Number of channels (1 for grayscale, piano roll)

# Create the Generator
netG = Generator(nz=nz, ngf=ngf, nc=nc).to(device)

# Generate random noise
noise = torch.randn(64, nz, 1, 1, device=device)

# Generate fake data and print layer-wise output shapes
generated_data = netG(noise)


Input noise shape: torch.Size([64, 100, 1, 1])
After first layer (ConvTranspose2d): torch.Size([64, 512, 8, 8])
After first ReLU: torch.Size([64, 512, 8, 8])
After second layer (ConvTranspose2d): torch.Size([64, 256, 16, 16])
After second ReLU: torch.Size([64, 256, 16, 16])
After third layer (ConvTranspose2d): torch.Size([64, 128, 32, 32])
After third ReLU: torch.Size([64, 128, 32, 32])
After fourth layer (ConvTranspose2d): torch.Size([64, 64, 64, 64])
After fourth ReLU: torch.Size([64, 64, 64, 64])
After fifth layer (ConvTranspose2d): torch.Size([64, 1, 128, 128])
After final Tanh: torch.Size([64, 1, 128, 128])


In [28]:
# Parameters
nz = 100  # Size of latent vector
ngf = 64  # Generator feature map size
nc = 1    # Number of channels (1 for grayscale, piano roll)

# Create the Generator
netG = Generator(nz, ngf, nc).to(device)

# Generate random noise
noise = torch.randn(64, nz, 1, 1, device=device)

# Generate fake data
generated_data = netG(noise)

# Check the size of the generated data
print("Generated data shape:", generated_data.shape)




TypeError: PianoRollGenerator.forward() missing 1 required positional argument: 'y'

In [41]:
# Labels for real and fake data
real_label = 1.0
fake_label = 0.0
epochs = 100

# Training loop
for epoch in range(epochs):
    for i, data in enumerate(dataloader, 0):
        
        ############################
        # (1) Update Discriminator
        ############################
        netD.zero_grad()

        # Train with real data
        real_data = data[0].unsqueeze(1).to(device).float()  # Add channel dimension for piano roll
        batch_size = real_data.size(0)
        label = torch.full((batch_size,), real_label, device=device).float()
        output = netD(real_data).view(-1)
        lossD_real = criterion(output, label)
        lossD_real.backward()

        # Train with fake data
        noise = torch.randn(batch_size, nz, 1, 1, device=device).float()
        fake_data = netG(noise)
        label.fill_(fake_label)
        output = netD(fake_data.detach()).view(-1)
        lossD_fake = criterion(output, label)
        lossD_fake.backward()

        # Update Discriminator
        optimizerD.step()

        ############################
        # (2) Update Generator
        ############################
        netG.zero_grad()
        label.fill_(real_label)  # Generator wants the discriminator to classify its output as real
        output = netD(fake_data).view(-1)
        lossG = criterion(output, label)
        lossG.backward()

        # Update Generator
        optimizerG.step()

        # Print training stats
        if i % 100 == 0:
            print(f"[{epoch}/{epochs}][{i}/{len(dataloader)}] Loss_D: {lossD_real + lossD_fake:.4f} Loss_G: {lossG:.4f}")


ValueError: Using a target size (torch.Size([64])) that is different to the input size (torch.Size([9280])) is deprecated. Please ensure they have the same size.