In [None]:
pip install librosa pretty_midi

Collecting pretty_midi
  Downloading pretty_midi-0.2.10.tar.gz (5.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting mido>=1.1.16 (from pretty_midi)
  Downloading mido-1.3.2-py3-none-any.whl.metadata (6.4 kB)
Collecting packaging (from lazy-loader>=0.1->librosa)
  Downloading packaging-23.2-py3-none-any.whl.metadata (3.2 kB)
Downloading mido-1.3.2-py3-none-any.whl (54 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.6/54.6 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading packaging-23.2-py3-none-any.whl (53 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.0/53.0 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pretty_midi
  Building wheel for pretty_midi (setup.py) ... [?25l[?25hdone
  Created wheel for pretty_midi: filename=pretty_midi-0.2.10-py3

In [None]:
import os

import librosa
import numpy as np
import pretty_midi
import torch
import torch.nn as nn
import torch.nn.utils.rnn as rnn_utils
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [None]:
# Use the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print('Using device:', device)

Using device: cpu


In [None]:
# --- Model dimensions ---
# Number of unique MIDI notes (0-127)
midi_vocab_size = 128
# Dimension of MIDI note embeddings
embedding_dim = 256
# Hidden size for LSTMs
hidden_size = 512
# Number of mel-spectrogram frequency bins
mel_bins = 80
# Number of layers in LSTMs
num_layers = 2

# --- Optimisation settings ---
learning_rate = 2e-4
num_epochs = 100
batch_size = 32

# --- Generator loss weights ---
# Weight for Discriminator1 loss
alpha = 0.5
# Weight for Discriminator2 loss
beta = 0.5

In [None]:
class Generator(nn.Module):
    """Maps a padded batch of MIDI note indices to mel-spectrogram frames.

    Pipeline: embedding -> LSTM over the packed sequence -> linear projection
    to ``mel_bins`` -> LeakyReLU(0.2).

    Args:
        midi_vocab_size: Number of distinct note indices the embedding accepts.
        embedding_dim: Size of each note embedding.
        hidden_size: Hidden size of the LSTM.
        mel_bins: Number of output features per time step.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, midi_vocab_size, embedding_dim, hidden_size, mel_bins, num_layers=2):
        super().__init__()
        # NOTE(review): index 0 is the padding token here, but 0 is also a valid
        # MIDI pitch -- confirm pitch 0 never appears in the training data.
        self.embedding = nn.Embedding(midi_vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, mel_bins)
        self.leaky_relu = nn.LeakyReLU(0.2)

    def forward(self, midi_input, midi_lengths):
        """Generate one mel frame per input time step.

        Args:
            midi_input: Long tensor of note indices, shape (batch_size, seq_len).
            midi_lengths: 1-D tensor with the unpadded length of each sequence
                (every length must be at least 1 for packing to succeed).

        Returns:
            Tensor of shape (batch_size, seq_len, mel_bins).
        """
        # Embed MIDI input
        embedded = self.embedding(midi_input)  # (batch_size, seq_len, embedding_dim)

        # Pack so the LSTM skips padded positions; lengths must live on the CPU.
        packed = rnn_utils.pack_padded_sequence(
            embedded, midi_lengths.cpu(), batch_first=True, enforce_sorted=False
        )
        packed_output, _ = self.lstm(packed)

        # total_length keeps the output time axis equal to the input's seq_len
        # even when the longest sequence in the batch is shorter than the padding.
        lstm_out, _ = rnn_utils.pad_packed_sequence(
            packed_output, batch_first=True, total_length=midi_input.size(1)
        )

        out = self.fc(lstm_out)  # (batch_size, seq_len, mel_bins)
        out = self.leaky_relu(out)
        return out

In [None]:
# Discriminator1 Model (Accuracy Discriminator)
class Discriminator1(nn.Module):
    """Bidirectional-LSTM critic that scores a mel-spectrogram sequence.

    Args:
        mel_bins: Number of features per input frame.
        hidden_size: Hidden size of each LSTM direction.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, mel_bins, hidden_size, num_layers=2):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=mel_bins,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True,
        )
        # Both directions are concatenated, hence 2 * hidden_size inputs.
        self.fc = nn.Linear(2 * hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, mel_input, mel_lengths):
        """Return one sigmoid score per sequence, shape (batch, 1).

        Args:
            mel_input: Padded batch, batch-first, with ``mel_bins`` features per frame.
            mel_lengths: 1-D tensor holding the unpadded length of each sequence.
        """
        # Packing requires the lengths on the CPU.
        packed_in = rnn_utils.pack_padded_sequence(
            mel_input, mel_lengths.cpu(), batch_first=True, enforce_sorted=False
        )
        packed_out, _ = self.lstm(packed_in)
        padded_out, _ = rnn_utils.pad_packed_sequence(packed_out, batch_first=True)

        # Select, for every sequence, the LSTM output at its last valid frame.
        # NOTE(review): at that frame the backward direction has only seen a
        # single step -- confirm this is the intended summary.
        row_index = torch.arange(padded_out.size(0))
        final_frames = padded_out[row_index, mel_lengths - 1]

        return self.sigmoid(self.fc(final_frames))

# Discriminator2 Model (Human-ness Discriminator)
class Discriminator2(nn.Module):
    """Bidirectional-LSTM critic that scores a mel-spectrogram sequence.

    Args:
        mel_bins: Number of features per input frame.
        hidden_size: Hidden size of each LSTM direction.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, mel_bins, hidden_size, num_layers=2):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=mel_bins,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True,
        )
        # Both directions are concatenated, hence 2 * hidden_size inputs.
        self.fc = nn.Linear(2 * hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, mel_input, mel_lengths):
        """Return one sigmoid score per sequence, shape (batch, 1).

        Args:
            mel_input: Padded batch, batch-first, with ``mel_bins`` features per frame.
            mel_lengths: 1-D tensor holding the unpadded length of each sequence.
        """
        # Packing requires the lengths on the CPU.
        packed_in = rnn_utils.pack_padded_sequence(
            mel_input, mel_lengths.cpu(), batch_first=True, enforce_sorted=False
        )
        packed_out, _ = self.lstm(packed_in)
        padded_out, _ = rnn_utils.pad_packed_sequence(packed_out, batch_first=True)

        # Select, for every sequence, the LSTM output at its last valid frame.
        # NOTE(review): at that frame the backward direction has only seen a
        # single step -- confirm this is the intended summary.
        row_index = torch.arange(padded_out.size(0))
        final_frames = padded_out[row_index, mel_lengths - 1]

        return self.sigmoid(self.fc(final_frames))

In [None]:
class MIDIDataset(Dataset):
    """Dataset of preprocessed MIDI sequences stored as ``.pt`` files.

    Args:
        midi_dir: Directory scanned (non-recursively) for files ending in ``.pt``.
    """

    def __init__(self, midi_dir):
        # Sorted so that the index -> file mapping is deterministic.
        self.midi_files = sorted(
            os.path.join(midi_dir, f) for f in os.listdir(midi_dir) if f.endswith('.pt')
        )

    def __len__(self):
        """Number of ``.pt`` files found in the directory."""
        return len(self.midi_files)

    def __getitem__(self, idx):
        """Load and return the object saved in the ``idx``-th file."""
        # Load preprocessed MIDI sequence (list or tensor of MIDI note indices)
        # NOTE(review): torch.load unpickles data -- only point this at trusted files.
        midi_sequence = torch.load(self.midi_files[idx])  # Shape: (sequence_length,)
        return midi_sequence

In [None]:
class MelSpectrogramDataset(Dataset):
    """Dataset of precomputed mel-spectrograms stored as ``.pt`` files.

    Args:
        mel_dir: Directory scanned (non-recursively) for files ending in ``.pt``.
    """

    def __init__(self, mel_dir):
        # Sorted so that the index -> file mapping is deterministic.
        self.mel_files = sorted(
            os.path.join(mel_dir, f) for f in os.listdir(mel_dir) if f.endswith('.pt')
        )

    def __len__(self):
        """Number of ``.pt`` files found in the directory."""
        return len(self.mel_files)

    def __getitem__(self, idx):
        """Load and return the tensor saved in the ``idx``-th file."""
        # Load mel-spectrogram tensor
        # NOTE(review): torch.load unpickles data -- only point this at trusted files.
        mel_spectrogram = torch.load(self.mel_files[idx])  # Shape: (sequence_length, mel_bins)
        return mel_spectrogram

In [None]:
def collate_fn_midi(batch):
    """Pad a list of 1-D note sequences into one batch-first tensor.

    The incoming list is reordered in place, longest sequence first.

    Returns:
        (padded_sequences, lengths): the zero-padded batch and a tensor with
        the original length of each row, in the same (sorted) order.
    """
    # Longest sequence first
    batch.sort(key=len, reverse=True)
    seq_lengths = torch.tensor([len(item) for item in batch])
    padded = rnn_utils.pad_sequence(list(batch), batch_first=True, padding_value=0)
    return padded, seq_lengths

In [None]:
def collate_fn_mel(batch):
    """Pad a list of mel-spectrogram tensors along their first axis.

    The incoming list is reordered in place, longest sequence first.

    Returns:
        (padded_sequences, lengths): the zero-padded batch-first tensor and a
        tensor with each item's original first-axis size, in sorted order.
    """
    # Longest sequence first
    batch.sort(key=lambda mel: mel.shape[0], reverse=True)
    frame_counts = torch.tensor([mel.shape[0] for mel in batch])
    padded = rnn_utils.pad_sequence(batch, batch_first=True, padding_value=0)
    return padded, frame_counts

In [None]:
# Directories holding the preprocessed ``.pt`` tensors. Replace with your paths.
(
    midi_data_dir,
    synth_mel_data_dir,
    human_mel_data_dir,
) = (
    '/content/midi_data',
    '/content/synth_mel_data',
    '/content/human_mel_data',
)

In [None]:
# One dataset per data source: MIDI note sequences, synthesized mels, human mels.
midi_dataset = MIDIDataset(midi_dir=midi_data_dir)
synth_mel_dataset = MelSpectrogramDataset(mel_dir=synth_mel_data_dir)
human_mel_dataset = MelSpectrogramDataset(mel_dir=human_mel_data_dir)

TypeError: MIDIDataset() takes no arguments

In [None]:
# One shuffled loader per dataset; each uses the collate function that pads
# its own kind of sequence.
midi_loader = DataLoader(dataset=midi_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn_midi)

synth_mel_loader = DataLoader(dataset=synth_mel_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn_mel)

human_mel_loader = DataLoader(dataset=human_mel_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn_mel)

NameError: name 'midi_dataset' is not defined

In [None]:
# Build the three networks (generator first, then the two discriminators)
# and move each one onto the selected device.
generator = Generator(midi_vocab_size, embedding_dim, hidden_size, mel_bins, num_layers)
generator = generator.to(device)

discriminator1 = Discriminator1(mel_bins, hidden_size, num_layers)
discriminator1 = discriminator1.to(device)

discriminator2 = Discriminator2(mel_bins, hidden_size, num_layers)
discriminator2 = discriminator2.to(device)

TypeError: Generator.__init__() takes 1 positional argument but 6 were given

In [None]:
# Binary cross-entropy on the discriminators' sigmoid outputs, averaged over the batch.
criterion = nn.BCELoss(reduction='mean')

In [None]:
# One Adam optimizer per network, all sharing the same learning rate.
optimizer_G = optim.Adam(params=generator.parameters(), lr=learning_rate)
optimizer_D1 = optim.Adam(params=discriminator1.parameters(), lr=learning_rate)
optimizer_D2 = optim.Adam(params=discriminator2.parameters(), lr=learning_rate)

NameError: name 'generator' is not defined

In [None]:
from itertools import cycle  # NOTE(review): unused in this cell; kept in case another cell relies on it

# Training loop
#
# Each step draws one batch from each of the three loaders. The loaders are
# shuffled independently, so the MIDI, synthesized-mel and human-mel batches
# are not paired with each other.
for epoch in range(num_epochs):
    # zip() stops at the shortest loader, which is exactly num_batches steps.
    num_batches = min(len(midi_loader), len(synth_mel_loader), len(human_mel_loader))
    batch_triples = zip(midi_loader, synth_mel_loader, human_mel_loader)

    for i, (midi_batch, synth_mel_batch, human_mel_batch) in enumerate(batch_triples):
        midi_input, midi_lengths = midi_batch
        synth_mel_input, synth_mel_lengths = synth_mel_batch
        human_mel_input, human_mel_lengths = human_mel_batch

        # Move data to device
        midi_input = midi_input.to(device)
        midi_lengths = midi_lengths.to(device)
        synth_mel_input = synth_mel_input.to(device)
        synth_mel_lengths = synth_mel_lengths.to(device)
        human_mel_input = human_mel_input.to(device)
        human_mel_lengths = human_mel_lengths.to(device)

        # Labels are built with ones_like / zeros_like on each discriminator
        # output: the three loaders can yield different batch sizes (e.g. the
        # last batch of the shortest dataset), and this also avoids overwriting
        # the global `batch_size` hyperparameter.

        # ---------------------
        #  Train Discriminator1
        # ---------------------
        optimizer_D1.zero_grad()

        # Loss for real synthesized mel-spectrograms
        real_output_D1 = discriminator1(synth_mel_input, synth_mel_lengths)
        loss_real_D1 = criterion(real_output_D1, torch.ones_like(real_output_D1))

        # Generate mel-spectrograms
        gen_mel = generator(midi_input, midi_lengths)
        gen_mel_lengths = midi_lengths  # Assuming output lengths match input lengths

        # Loss for fake generated mel-spectrograms
        # (detach so this step does not backpropagate into the generator)
        fake_output_D1 = discriminator1(gen_mel.detach(), gen_mel_lengths)
        loss_fake_D1 = criterion(fake_output_D1, torch.zeros_like(fake_output_D1))

        # Total Discriminator1 loss
        loss_D1 = (loss_real_D1 + loss_fake_D1) / 2
        loss_D1.backward()
        optimizer_D1.step()

        # ---------------------
        #  Train Discriminator2
        # ---------------------
        optimizer_D2.zero_grad()

        # Loss for real human mel-spectrograms
        real_output_D2 = discriminator2(human_mel_input, human_mel_lengths)
        loss_real_D2 = criterion(real_output_D2, torch.ones_like(real_output_D2))

        # Loss for fake generated mel-spectrograms
        fake_output_D2 = discriminator2(gen_mel.detach(), gen_mel_lengths)
        loss_fake_D2 = criterion(fake_output_D2, torch.zeros_like(fake_output_D2))

        # Total Discriminator2 loss
        loss_D2 = (loss_real_D2 + loss_fake_D2) / 2
        loss_D2.backward()
        optimizer_D2.step()

        # -----------------
        #  Train Generator
        # -----------------
        optimizer_G.zero_grad()

        # Generator tries to fool both discriminators, so its targets are "real".
        output_D1 = discriminator1(gen_mel, gen_mel_lengths)
        loss_G_D1 = criterion(output_D1, torch.ones_like(output_D1))

        output_D2 = discriminator2(gen_mel, gen_mel_lengths)
        loss_G_D2 = criterion(output_D2, torch.ones_like(output_D2))

        # Total Generator loss
        loss_G = alpha * loss_G_D1 + beta * loss_G_D2
        loss_G.backward()
        optimizer_G.step()

        # Print progress
        print(f"[Epoch {epoch+1}/{num_epochs}] [Batch {i+1}/{num_batches}] [D1 loss: {loss_D1.item():.4f}] [D2 loss: {loss_D2.item():.4f}] [G loss: {loss_G.item():.4f}]")

NameError: name 'midi_loader' is not defined

In [None]:
def midi_to_sequence(midi_file):
    """Read a MIDI file and return its note pitches as a 1-D long tensor.

    Pitches are collected instrument by instrument, in the order the
    instruments and their notes appear in the parsed file.
    NOTE(review): notes of different instruments are therefore concatenated
    rather than interleaved by time -- confirm this is intended for
    multi-instrument files.
    """
    parsed = pretty_midi.PrettyMIDI(midi_file)
    pitches = [note.pitch for track in parsed.instruments for note in track.notes]
    return torch.tensor(pitches, dtype=torch.long)

In [None]:
# Convert every raw .mid file into a saved tensor of note pitches.
# The output directory is the one the MIDI dataset reads from, so this cell
# has to be run before the datasets are built.
raw_midi_dir = '/content/raw_midi'  # Replace with your directory
midi_save_dir = '/content/midi_data'
os.makedirs(midi_save_dir, exist_ok=True)

# Sorted so the files are processed in a deterministic order.
midi_files = sorted(
    os.path.join(raw_midi_dir, f) for f in os.listdir(raw_midi_dir) if f.endswith('.mid')
)

for midi_file in midi_files:
    sequence = midi_to_sequence(midi_file)
    if sequence.numel() == 0:
        # A zero-length sequence cannot be packed for the LSTM later on.
        print(f"Skipping {midi_file}: no notes found")
        continue
    # splitext swaps only the extension; str.replace would also rewrite any
    # '.mid' occurring earlier in the file name.
    file_name = os.path.splitext(os.path.basename(midi_file))[0] + '.pt'
    save_path = os.path.join(midi_save_dir, file_name)
    torch.save(sequence, save_path)

NameError: name 'os' is not defined

In [None]:
def audio_to_mel_spectrogram(audio_file):
    """Load an audio file and return its log-mel spectrogram as a float tensor.

    The audio is loaded at its native sampling rate (``sr=None``), converted to
    a mel power spectrogram with ``mel_bins`` bands, and expressed in dB
    relative to the spectrogram's maximum.

    Returns:
        Float32 tensor of shape (sequence_length, mel_bins).
    """
    y, sr = librosa.load(audio_file, sr=None)
    mel_spectrogram = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=mel_bins)
    mel_spectrogram = librosa.power_to_db(mel_spectrogram, ref=np.max)
    # Transpose so time is the first axis.
    mel_spectrogram = torch.tensor(mel_spectrogram.T, dtype=torch.float32)  # Shape: (sequence_length, mel_bins)
    return mel_spectrogram


# Convert every raw .wav file into a saved mel-spectrogram tensor.
raw_audio_dir = '/content/raw_audio'  # Replace with your directory
mel_save_dir = '/content/synth_mel_data'  # Or '/content/human_mel_data'
os.makedirs(mel_save_dir, exist_ok=True)

# Sorted so the files are processed in a deterministic order.
audio_files = sorted(
    os.path.join(raw_audio_dir, f) for f in os.listdir(raw_audio_dir) if f.endswith('.wav')
)

for audio_file in audio_files:
    mel_spectrogram = audio_to_mel_spectrogram(audio_file)
    # splitext swaps only the extension; str.replace would also rewrite any
    # '.wav' occurring earlier in the file name.
    file_name = os.path.splitext(os.path.basename(audio_file))[0] + '.pt'
    save_path = os.path.join(mel_save_dir, file_name)
    torch.save(mel_spectrogram, save_path)