In [1]:
pip install mido

Collecting mido
  Downloading mido-1.3.2-py3-none-any.whl.metadata (6.4 kB)
Collecting packaging~=23.1 (from mido)
  Downloading packaging-23.2-py3-none-any.whl.metadata (3.2 kB)
Downloading mido-1.3.2-py3-none-any.whl (54 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.6/54.6 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading packaging-23.2-py3-none-any.whl (53 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.0/53.0 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: packaging, mido
  Attempting uninstall: packaging
    Found existing installation: packaging 24.1
    Uninstalling packaging-24.1:
      Successfully uninstalled packaging-24.1
Successfully installed mido-1.3.2 packaging-23.2


In [2]:
import mido
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import numpy as np
import torchaudio

In [79]:
class Generator(nn.Module):
    def __init__(self, input_size=13, hidden_size=512, output_size=(128, 313)):
        super(Generator, self).__init__()
        self.fc1 = nn.Linear(input_size * 3, hidden_size)  # MIDI features
        self.lstm = nn.LSTM(hidden_size, hidden_size, batch_first=True)
        self.fc2 = nn.Linear(hidden_size, output_size[0] * output_size[1])  # Flattening to output size

    def forward(self, midi_input):
        # midi_input shape: (batch_size, 10, 3)
        x = torch.relu(self.fc1(midi_input.view(midi_input.size(0), -1)))  # Flatten MIDI input
        x = x.unsqueeze(1)  # Reshape to (batch_size, 1, hidden_size) for LSTM
        lstm_out, _ = self.lstm(x)  # LSTM expects (batch_size, seq_length, features)
        x = self.fc2(lstm_out[:, -1, :])  # Get last time step
        mel_output = x.view(midi_input.size(0), 128, 313)  # Reshape to mel-spectrogram size
        return mel_output


In [96]:
class Discriminator(nn.Module):
    def __init__(self):
        super(Discriminator, self).__init__()
        # Input size is set to 128 because we have 128 frequency bins
        self.lstm = nn.LSTM(input_size=128, hidden_size=64, num_layers=1, batch_first=True)
        self.fc1 = nn.Linear(64, 1)

    def forward(self, x):
        # x shape: (batch_size, 128, 313) - mel-spectrogram shape
        # We need to permute it to (batch_size, seq_length, input_size)
        x = x.permute(0, 2, 1)  # Change to (batch_size, 313, 128)
        lstm_out, _ = self.lstm(x)  # LSTM expects (batch_size, seq_length, features)
        lstm_out = lstm_out[:, -1, :]  # Take the output from the last time step
        x = torch.sigmoid(self.fc1(lstm_out))
        return x


In [97]:
import mido
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

class MIDIMelDataset(Dataset):
    def __init__(self, midi_files, mel_files):
        self.midi_files = midi_files
        self.mel_files = mel_files

    def __len__(self):
        return len(self.midi_files)

    def __getitem__(self, idx):
        # Load and parse the MIDI file using mido
        midi_file = self.midi_files[idx]
        midi = mido.MidiFile(midi_file)

        # Extract relevant MIDI data (e.g., note, velocity, time)
        midi_data = []
        for msg in midi:
            if not msg.is_meta and msg.type == 'note_on':  # Only note_on events
                midi_data.append([msg.note, msg.velocity, msg.time])

        # Convert MIDI data to a tensor (this will depend on your exact format)
        midi_tensor = torch.tensor(midi_data, dtype=torch.float32)

        # Load the corresponding mel-spectrogram file
        mel_file = self.mel_files[idx]
        mel_tensor = torch.tensor(np.load(mel_file), dtype=torch.float32)

        return midi_tensor, mel_tensor

def custom_collate_fn(batch):
    # Separate midi and mel-spectrogram data
    midi_batch, mel_batch = zip(*batch)

    # Pad the midi_batch to ensure all sequences in a batch are the same length
    midi_batch_padded = pad_sequence(midi_batch, batch_first=True, padding_value=0)

    # Find the maximum width of mel-spectrograms in the batch (since they can vary in width)
    max_mel_width = max([mel.shape[1] for mel in mel_batch])

    # Pad mel-spectrograms to the maximum width in the batch
    mel_batch_padded = []
    for mel in mel_batch:
        pad_size = max_mel_width - mel.shape[1]
        mel_padded = torch.nn.functional.pad(mel, (0, pad_size), mode='constant', value=0)
        mel_batch_padded.append(mel_padded)

    # Stack the padded mel-spectrograms into a batch
    mel_batch_padded = torch.stack(mel_batch_padded, dim=0)

    return midi_batch_padded, mel_batch_padded

In [98]:
# Function to initialize weights (optional)
def weights_init(m):
    if isinstance(m, (nn.Conv2d, nn.Linear)):
        nn.init.normal_(m.weight, mean=0.0, std=0.02)
        if m.bias is not None:
            nn.init.constant_(m.bias, 0)

In [116]:
import matplotlib.pyplot as plt
import torch.nn as nn
import torch.optim as optim

# Updated train_gan function to track and plot losses
def train_gan(generator, discriminator, data_loader, num_epochs=50, lr=0.0002, beta1=0.5, device='cuda'):
    # print("check1")
    # Loss and optimizers
    criterion = nn.BCELoss()
    optimizer_G = optim.Adam(generator.parameters(), lr=lr, betas=(beta1, 0.999))
    optimizer_D = optim.Adam(discriminator.parameters(), lr=lr, betas=(beta1, 0.999))

    generator.to(device)
    discriminator.to(device)

    real_label = 1.0  # Use float for the label
    fake_label = 0.0  # Use float for the label

    # Lists to store the losses for plotting
    g_losses = []
    d_losses = []

    for epoch in range(num_epochs):
        for i, (midi, real_mel) in enumerate(data_loader):
            # print("check2")
            # Load data to device
            midi = midi.to(device)
            real_mel = real_mel.to(device)

            # Print shapes for debugging
            # print(f"Real mel shape: {real_mel.shape}")

            # Train Discriminator: maximize log(D(x)) + log(1 - D(G(z)))
            discriminator.zero_grad()

            # Real mel-spectrogram
            output = discriminator(real_mel).view(-1)
            loss_D_real = criterion(output, torch.full((output.size(0),), real_label, dtype=torch.float, device=device))  # Change to float
            loss_D_real.backward()

            # Fake mel-spectrogram generated from MIDI
            fake_mel = generator(midi)
            # print(f"Fake mel shape: {fake_mel.shape}")  # Debugging print
            output = discriminator(fake_mel.detach()).view(-1)
            loss_D_fake = criterion(output, torch.full((output.size(0),), fake_label, dtype=torch.float, device=device))  # Change to float
            loss_D_fake.backward()

            optimizer_D.step()

            # Train Generator: maximize log(D(G(z)))
            generator.zero_grad()
            output = discriminator(fake_mel).view(-1)
            loss_G = criterion(output, torch.full((output.size(0),), real_label, dtype=torch.float, device=device))  # Change to float
            loss_G.backward()
            optimizer_G.step()

            # Save losses for plotting
            d_loss = loss_D_real.item() + loss_D_fake.item()
            g_loss = loss_G.item()
            d_losses.append(d_loss)
            g_losses.append(g_loss)

            if i % 100 == 0:
                print(f"Epoch [{epoch+1}/{num_epochs}] Batch {i}/{len(data_loader)} \
                      Loss D: {d_loss:.4f}, Loss G: {g_loss:.4f}")

    print("Training complete!")

    # Plot the losses after training
    plt.figure(figsize=(10, 5))
    plt.plot(g_losses, label="Generator Loss")
    plt.plot(d_losses, label="Discriminator Loss")
    plt.xlabel("Batch iterations")
    plt.ylabel("Loss")
    plt.legend()
    plt.title("Generator and Discriminator Loss Over Time")
    plt.show()


In [107]:
# Function to save models
def save_model(model, filepath):
    torch.save(model.state_dict(), filepath)

In [108]:
# Function to load models
def load_model(model, filepath, device='cuda'):
    model.load_state_dict(torch.load(filepath, map_location=device))
    model.eval()

In [114]:
from re import I
from torch.utils.data import DataLoader

# Paths to MIDI and Mel-spectrogram numpy files
midi_files = []
mel_files = []
for i in range(1000):
  midi_files.append(f'/content/drive/MyDrive/DeBaussyAI/Training Data/5S Midi/file{i}.mid')
  mel_files.append(f'/content/drive/MyDrive/DeBaussyAI/Training Data/5S Mel-Spectrogram/file{i}.npy')

# Create Dataset and DataLoader
dataset = MIDIMelDataset(midi_files, mel_files)
data_loader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=custom_collate_fn)

# for i, (midi, real_mel) in enumerate(data_loader):
#     print("MIDI tensor shape:", midi.shape)
#     print("Mel-spectrogram shape:", real_mel.shape)

In [110]:
generator = Generator()
discriminator = Discriminator()

In [111]:
# Optionally apply weight initialization
generator.apply(weights_init)
discriminator.apply(weights_init)

Discriminator(
  (lstm): LSTM(128, 64, batch_first=True)
  (fc1): Linear(in_features=64, out_features=1, bias=True)
)

In [117]:
# Train the GAN
train_gan(generator, discriminator, data_loader, num_epochs=50)

Epoch [1/50] Batch 0/32                       Loss D: 1.5047, Loss G: 0.5758


RuntimeError: mat1 and mat2 shapes cannot be multiplied (32x42 and 39x512)

In [None]:
# Save the models
save_model(generator, 'generator.pth')
save_model(discriminator, 'discriminator.pth')


In [None]:

# Load the models for inference or further training
load_model(generator, 'generator.pth')
load_model(discriminator, 'discriminator.pth')

NEXT

In [1]:
import torch
import librosa
import librosa.display
import numpy as np

def audio_to_melspectrogram(audio_path, sr=22050, n_fft=2048, hop_length=512, n_mels=128):
    y, sr = librosa.load(audio_path, sr=sr)
    mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=n_fft, hop_length=hop_length, n_mels=n_mels)
    mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
    return mel_spec_db


In [11]:
import torch
import torch.nn as nn

class Generator(nn.Module):
    def __init__(self, midi_max_len=180, hidden_size=512, output_size=(128, 216)):
        super(Generator, self).__init__()
        self.fc1 = nn.Linear(midi_max_len, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size[0] * output_size[1])  # Generate mel-spectrogram (128 x 216)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.fc2(x)
        x = x.view(-1, 128, 216)  # Reshape to mel-spectrogram shape (128, 216)
        return x

In [12]:
class Discriminator(nn.Module):
    def __init__(self, input_size=(128, 216), hidden_size=512):
        super(Discriminator, self).__init__()
        self.conv1 = nn.Conv2d(1, 16, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1)
        self.fc1 = nn.Linear(32 * input_size[0] * input_size[1] // 4, hidden_size)  # 4 comes from 2 pooling layers
        self.fc2 = nn.Linear(hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = x.unsqueeze(1)  # Add channel dimension (batch_size, 1, 128, 216)
        x = nn.ReLU()(self.conv1(x))
        x = nn.MaxPool2d(2)(x)  # Pooling layer
        x = nn.ReLU()(self.conv2(x))
        x = nn.MaxPool2d(2)(x)  # Pooling layer
        x = x.view(x.size(0), -1)  # Flatten
        x = nn.ReLU()(self.fc1(x))
        x = self.sigmoid(self.fc2(x))
        return x

In [13]:
import torch.optim as optim

def train_gan(generator, discriminator, data_loader, num_epochs=50):
    criterion = nn.BCELoss()
    g_optimizer = optim.Adam(generator.parameters(), lr=0.0002)
    d_optimizer = optim.Adam(discriminator.parameters(), lr=0.0002)

    for epoch in range(num_epochs):
        for midi_data, real_mel in data_loader:
            batch_size = real_mel.size(0)

            # Real and fake labels
            real_labels = torch.ones(batch_size, 1)
            fake_labels = torch.zeros(batch_size, 1)

            # Train discriminator
            real_outputs = discriminator(real_mel)
            d_loss_real = criterion(real_outputs, real_labels)

            fake_mel = generator(midi_data)
            fake_outputs = discriminator(fake_mel.detach())
            d_loss_fake = criterion(fake_outputs, fake_labels)

            d_loss = d_loss_real + d_loss_fake
            d_optimizer.zero_grad()
            d_loss.backward()
            d_optimizer.step()

            # Train generator
            fake_outputs = discriminator(fake_mel)
            g_loss = criterion(fake_outputs, real_labels)  # Trick discriminator

            g_optimizer.zero_grad()
            g_loss.backward()
            g_optimizer.step()

        print(f'Epoch [{epoch+1}/{num_epochs}], d_loss: {d_loss.item()}, g_loss: {g_loss.item()}')

In [16]:
# Initialize the generator and discriminator
generator = Generator(midi_max_len=180, hidden_size=512, output_size=(128, 216))
discriminator = Discriminator(input_size=(128, 216), hidden_size=512)

In [15]:
def intonation_accuracy_loss(real_mel, fake_mel, real_labels):
    mse_loss = nn.MSELoss()(fake_mel, real_mel)  # Intonation accuracy
    gan_loss = nn.BCELoss()(fake_mel, real_labels)  # GAN loss (fooling discriminator)

    total_loss = gan_loss + 0.5 * mse_loss  # Adjust weighting as necessary
    return total_loss

# Replace generator loss calculation in the training loop
fake_outputs = discriminator(fake_mel)
g_loss = intonation_accuracy_loss(real_mel, fake_mel, real_labels)

NameError: name 'discriminator' is not defined

In [8]:
import os
import torch
from torch.utils.data import Dataset
import numpy as np
import librosa

class MidiMelDataset(Dataset):
    def __init__(self, midi_dir, mel_dir, midi_max_len=180, mel_shape=(128, 216)):
        self.midi_files = [os.path.join(midi_dir, f) for f in os.listdir(midi_dir)]
        self.mel_files = [os.path.join(mel_dir, f) for f in os.listdir(mel_dir)]
        self.midi_max_len = midi_max_len  # Maximum size of the MIDI file in bytes (for padding/truncation)
        self.mel_shape = mel_shape  # Shape of the mel-spectrogram (e.g., 128 mel bins, 216 time frames)

    def __len__(self):
        return len(self.midi_files)

    def __getitem__(self, idx):
        # Load MIDI data
        with open(self.midi_files[idx], 'rb') as f:
            midi_data = np.frombuffer(f.read(), dtype=np.uint8)

        # Pad or truncate MIDI data to midi_max_len
        midi_data = np.pad(midi_data, (0, max(0, self.midi_max_len - len(midi_data))))[:self.midi_max_len]
        midi_data = torch.FloatTensor(midi_data) / 255.0  # Normalize MIDI data between 0 and 1

        # Load and process mel-spectrogram
        mel_spec = np.load(self.mel_files[idx])  # Assuming you store mel-spectrograms as .npy files
        mel_spec = np.pad(mel_spec, ((0, 0), (0, max(0, self.mel_shape[1] - mel_spec.shape[1]))), mode='constant')[:, :self.mel_shape[1]]  # Pad/truncate time dimension
        mel_spec = torch.FloatTensor(mel_spec)

        return midi_data, mel_spec


In [10]:
from torch.utils.data import DataLoader

midi_dir = '/content/drive/MyDrive/DeBaussyAI/Training Data/5S Midi'  # Directory containing MIDI files
mel_dir = '/content/drive/MyDrive/DeBaussyAI/Training Data/5S Mel-Spectrogram'    # Directory containing mel-spectrogram files

dataset = MidiMelDataset(midi_dir, mel_dir, midi_max_len=180, mel_shape=(128, 216))  # Assuming 128 mel bins and 216 time frames
data_loader = DataLoader(dataset, batch_size=16, shuffle=True)  # Adjust batch_size as needed

In [17]:
def train_gan(generator, discriminator, data_loader, num_epochs=50):
    criterion = nn.BCELoss()
    g_optimizer = optim.Adam(generator.parameters(), lr=0.0002)
    d_optimizer = optim.Adam(discriminator.parameters(), lr=0.0002)

    for epoch in range(num_epochs):
        for real_mel, midi_data in data_loader:
            batch_size = real_mel.size(0)

            # Train discriminator
            real_labels = torch.ones(batch_size, 1).to(real_mel.device)
            fake_labels = torch.zeros(batch_size, 1).to(real_mel.device)

            # Move tensors to the same device as the models
            real_mel = real_mel.to(real_mel.device)  # Assuming real_mel is 2D
            midi_data = midi_data.to(real_mel.device)  # MIDI data should be 3D for LSTM input

            # Real mel-spectrograms
            real_outputs = discriminator(real_mel.unsqueeze(1))  # Add channel dimension if needed
            d_loss_real = criterion(real_outputs, real_labels)

            # Fake mel-spectrograms (from generator)
            fake_mel = generator(midi_data)
            fake_outputs = discriminator(fake_mel.detach().unsqueeze(1))  # Add channel dimension if needed
            d_loss_fake = criterion(fake_outputs, fake_labels)

            d_loss = d_loss_real + d_loss_fake
            d_optimizer.zero_grad()
            d_loss.backward()
            d_optimizer.step()

            # Train generator
            fake_outputs = discriminator(fake_mel.unsqueeze(1))  # Add channel dimension if needed
            g_loss = criterion(fake_outputs, real_labels)  # Goal: trick discriminator

            g_optimizer.zero_grad()
            g_loss.backward()
            g_optimizer.step()

        print(f'Epoch [{epoch+1}/{num_epochs}], d_loss: {d_loss.item()}, g_loss: {g_loss.item()}')

# Assuming you have already initialized your generator and discriminator
train_gan(generator, discriminator, data_loader, num_epochs=50)

RuntimeError: Given input size: (16x1x180). Calculated output size: (16x0x90). Output size is too small