In [3]:
pip install pretty_midi

Collecting pretty_midi
  Using cached pretty_midi-0.2.10.tar.gz (5.6 MB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting mido>=1.1.16 (from pretty_midi)
  Downloading mido-1.3.2-py3-none-any.whl.metadata (6.4 kB)
Collecting packaging~=23.1 (from mido>=1.1.16->pretty_midi)
  Downloading packaging-23.2-py3-none-any.whl.metadata (3.2 kB)
Downloading mido-1.3.2-py3-none-any.whl (54 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.6/54.6 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading packaging-23.2-py3-none-any.whl (53 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.0/53.0 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pretty_midi
  Building wheel for pretty_midi (setup.py) ... [?25l[?25hdone
  Created wheel for pretty_midi: filename=pretty_midi-0.2.10-py3-none-any.whl size=5592287 sha256=c5ef8ed463bc9132eb7ca02363b85660e67467e9347e47c9d0ca6a3984a57429
  Stored in directo

In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchaudio
import librosa
import numpy as np
import pretty_midi

In [5]:
class Generator(nn.Module):
    """LSTM followed by a per-time-step linear projection and a LeakyReLU.

    Args:
        input_size: Feature size of each input step (forwarded to ``nn.LSTM``).
        hidden_size: Size of the LSTM hidden state.
        output_size: Feature size emitted for every time step.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        # batch_first=True: the LSTM takes and returns (batch, seq, feature).
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        self.leaky_relu = nn.LeakyReLU(0.2)

    def forward(self, x):
        """Map an input sequence to an output sequence of the same length.

        Only the LSTM's per-step outputs are used; the final hidden/cell
        state it also returns is discarded.
        """
        step_outputs = self.lstm(x)[0]
        projected = self.fc(step_outputs)
        return self.leaky_relu(projected)


In [6]:
class DiscriminatorAccuracy(nn.Module):
    """Two-layer perceptron that scores its input with a value in (0, 1).

    Args:
        input_size: Size of the last dimension of the tensors passed to
            ``forward``.
    """

    def __init__(self, input_size):
        super().__init__()
        self.fc1 = nn.Linear(input_size, 128)
        self.fc2 = nn.Linear(128, 1)
        self.leaky_relu = nn.LeakyReLU(0.2)

    def forward(self, x):
        """Return the sigmoid score, with a trailing dimension of size 1."""
        hidden = self.fc1(x)
        hidden = self.leaky_relu(hidden)
        logit = self.fc2(hidden)
        return torch.sigmoid(logit)


class DiscriminatorHumanNess(nn.Module):
    """Two-layer perceptron that scores its input with a value in (0, 1).

    Args:
        input_size: Size of the last dimension of the tensors passed to
            ``forward``.
    """

    def __init__(self, input_size):
        super().__init__()
        self.fc1 = nn.Linear(input_size, 128)
        self.fc2 = nn.Linear(128, 1)
        self.leaky_relu = nn.LeakyReLU(0.2)

    def forward(self, x):
        """Return the sigmoid score, with a trailing dimension of size 1."""
        activated = self.leaky_relu(self.fc1(x))
        score = torch.sigmoid(self.fc2(activated))
        return score


In [7]:
def midi_to_mel(midi_path, sr=22050, n_mels=128):
    """Synthesize a MIDI file to audio and return its mel-spectrogram.

    Args:
        midi_path: Path (or file object) accepted by ``pretty_midi.PrettyMIDI``.
        sr: Sample rate in Hz, used both to synthesize the audio and to
            compute the spectrogram.
        n_mels: Number of mel bands requested from librosa.

    Returns:
        The value returned by ``librosa.feature.melspectrogram``.
    """
    midi_data = pretty_midi.PrettyMIDI(midi_path)
    # synthesize() renders at fs=44100 unless told otherwise. The rate must be
    # passed explicitly so the waveform matches the rate the mel filter bank is
    # built for; otherwise every frequency in the spectrogram is mislabelled.
    audio = midi_data.synthesize(fs=sr)
    return librosa.feature.melspectrogram(y=audio, sr=sr, n_mels=n_mels)

def load_human_data(human_audio_files):
    """Convert each file to a mel-spectrogram and stack them into one array.

    Every file is passed to ``midi_to_mel``. Because clips of different
    duration yield a different number of frames, shorter spectrograms are
    zero-padded along their last axis up to the longest one before stacking;
    ``np.array`` on ragged inputs would otherwise raise (or silently build an
    object array on older NumPy).

    Args:
        human_audio_files: Iterable of paths accepted by ``midi_to_mel``.

    Returns:
        ``np.ndarray`` with one leading entry per file. An empty input yields
        an empty array of shape ``(0,)``.

    Raises:
        ValueError: If the spectrograms differ in any axis other than the last.
    """
    mel_data = [midi_to_mel(file) for file in human_audio_files]
    if not mel_data:
        return np.array(mel_data)

    max_frames = max(mel.shape[-1] for mel in mel_data)
    padded = []
    for mel in mel_data:
        # Pad only the trailing (frame) axis; leading axes are left alone.
        pad_width = [(0, 0)] * (mel.ndim - 1) + [(0, max_frames - mel.shape[-1])]
        padded.append(np.pad(mel, pad_width))
    return np.stack(padded)


In [8]:
def train_human_ness_discriminator(discriminator, data_loader, optimizer, criterion):
    """Run one pass over ``data_loader``, pushing the discriminator toward 1.

    Args:
        discriminator: Module called on every batch.
        data_loader: Iterable of batches fed directly to ``discriminator``.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss called as ``criterion(output, target)``.
    """
    discriminator.train()
    for batch in data_loader:
        optimizer.zero_grad()
        prediction = discriminator(batch)
        # Every batch from this loader is labelled with an all-ones target.
        target = torch.ones_like(prediction)
        criterion(prediction, target).backward()
        optimizer.step()


In [9]:
def train_gan(generator, discriminator_accuracy, discriminator_human_ness, data_loader, optimizer_g, optimizer_d, criterion_a, criterion_h):
    """Run one pass over ``data_loader``, alternating discriminator and generator updates.

    For every batch:
      1. the accuracy discriminator is trained to output 1 on the reference
         mel-spectrogram and 0 on the generator output;
      2. the human-ness discriminator is trained to output 0 on the
         generator output;
      3. the generator is trained so that both discriminators output 1 on
         its output.

    Args:
        generator: Module mapping a batch to a generated mel-spectrogram.
        discriminator_accuracy: Module scoring real vs. generated spectrograms.
        discriminator_human_ness: Module scoring generated spectrograms.
        data_loader: Iterable of batches.
        optimizer_g: Optimizer stepped once per batch for the generator.
        optimizer_d: Optimizer stepped after each of the two discriminator
            losses (presumably holds the parameters of both discriminators;
            confirm against the caller).
        criterion_a: Loss used with ``discriminator_accuracy`` outputs.
        criterion_h: Loss used with ``discriminator_human_ness`` outputs.
    """
    generator.train()
    discriminator_accuracy.train()
    discriminator_human_ness.train()

    for midi_data in data_loader:
        # NOTE(review): each batch is handed both to midi_to_mel (which loads a
        # MIDI file from a path) and to the generator (a torch module). Confirm
        # what the loader actually yields.
        mel_fake = generator(midi_data)
        # Detached view: discriminator updates must not backpropagate into the generator.
        mel_fake_detached = mel_fake.detach()

        # midi_to_mel returns the librosa result (a NumPy array), which torch
        # modules cannot consume; convert it to a tensor matching the
        # generator output's dtype and device. Tensors that already match are
        # passed through unchanged.
        mel_real = torch.as_tensor(
            midi_to_mel(midi_data), dtype=mel_fake.dtype, device=mel_fake.device
        )

        # --- Accuracy discriminator: real -> 1, generated -> 0 ---
        optimizer_d.zero_grad()
        output_a_real = discriminator_accuracy(mel_real)
        loss_a_real = criterion_a(output_a_real, torch.ones_like(output_a_real))

        output_a_fake = discriminator_accuracy(mel_fake_detached)
        loss_a_fake = criterion_a(output_a_fake, torch.zeros_like(output_a_fake))

        loss_d_a = loss_a_real + loss_a_fake
        loss_d_a.backward()
        optimizer_d.step()

        # --- Human-ness discriminator: generated output is labelled 0 ---
        optimizer_d.zero_grad()
        output_h = discriminator_human_ness(mel_fake_detached)
        loss_h = criterion_h(output_h, torch.zeros_like(output_h))
        loss_h.backward()
        optimizer_d.step()

        # --- Generator: wants both discriminators to answer 1 on its output ---
        optimizer_g.zero_grad()
        output_a = discriminator_accuracy(mel_fake)
        loss_g_a = criterion_a(output_a, torch.ones_like(output_a))

        output_h = discriminator_human_ness(mel_fake)
        loss_g_h = criterion_h(output_h, torch.ones_like(output_h))

        loss_g = loss_g_a + loss_g_h
        # This backward also deposits gradients on the discriminators; they are
        # cleared by optimizer_d.zero_grad() at the start of the next batch.
        loss_g.backward()
        optimizer_g.step()
