In [1]:
import pretty_midi
from IPython.display import Audio

# Transform MIDI to a NumPy array and back

In [2]:
def midi_to_waveform(midi_file_path, sampling_rate=44100):
    """
    Render a MIDI file to an audio waveform.

    Parameters:
    - midi_file_path (str): Path to the MIDI file to be rendered.
    - sampling_rate (int): Sampling rate passed to pretty_midi's synthesizer.
      Defaults to 44100.

    Returns:
    - waveform: Whatever ``PrettyMIDI.synthesize`` returns for the file
      (per the pretty_midi docs, a numpy.ndarray of audio samples).
    """
    # Parse the file and synthesize it in a single expression.
    return pretty_midi.PrettyMIDI(midi_file_path).synthesize(sampling_rate)

In [3]:
# Use one constant for both synthesis and playback: if the two rates ever
# differ, the clip plays back at the wrong speed and pitch.
SAMPLE_RATE = 44100

test = midi_to_waveform('Dont_Forget_to_Remember.mid', sampling_rate=SAMPLE_RATE)

# Last expression of the cell, so the notebook renders the audio player.
Audio(test, rate=SAMPLE_RATE)

In [None]:
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F

class LSTM(nn.Module):
    """
    Stacked LSTM followed by a linear projection applied to every timestep.

    Parameters:
    - input_size (int): Number of features per timestep (the vocabulary size
      when the model is fed token indices).
    - hidden_size (int): Size of the LSTM hidden state.
    - num_layers (int): Number of stacked LSTM layers.
    - output_size (int): Size of the per-timestep output (number of classes).
    """

    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(LSTM, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """
        Run the network over a sequence.

        Parameters:
        - x (torch.Tensor): Either a floating-point tensor whose last
          dimension is ``input_size``, or an integer tensor of token indices
          in ``[0, input_size)``, which is one-hot encoded internally.

        Returns:
        - torch.Tensor: One ``output_size`` vector per timestep.
        """
        # nn.LSTM only accepts float features of width input_size. Token
        # indices (what the data loader in this notebook produces) are
        # expanded to one-hot vectors here; float inputs pass through as-is.
        if not torch.is_floating_point(x):
            x = F.one_hot(x.long(), num_classes=self.lstm.input_size)
            x = x.to(self.linear.weight.dtype)
        out, _ = self.lstm(x)
        return self.linear(out)
    
class GRU(nn.Module):
    """
    Stacked GRU followed by a linear projection applied to every timestep.

    Parameters:
    - input_size (int): Number of features per timestep (the vocabulary size
      when the model is fed token indices).
    - hidden_size (int): Size of the GRU hidden state.
    - num_layers (int): Number of stacked GRU layers.
    - output_size (int): Size of the per-timestep output (number of classes).
    """

    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(GRU, self).__init__()
        self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """
        Run the network over a sequence.

        Parameters:
        - x (torch.Tensor): Either a floating-point tensor whose last
          dimension is ``input_size``, or an integer tensor of token indices
          in ``[0, input_size)``, which is one-hot encoded internally.

        Returns:
        - torch.Tensor: One ``output_size`` vector per timestep.
        """
        # nn.GRU only accepts float features of width input_size. Token
        # indices are expanded to one-hot vectors here; float inputs pass
        # through unchanged.
        if not torch.is_floating_point(x):
            x = F.one_hot(x.long(), num_classes=self.gru.input_size)
            x = x.to(self.linear.weight.dtype)
        out, _ = self.gru(x)
        return self.linear(out)

In [None]:
import glob

def get_data():
    """
    Load the note lists of every non-drum instrument in the MIDI archive.

    Files matching ``archive/*/*.mid`` are read in sorted path order so the
    resulting list has the same order on every run (``glob.glob`` itself
    returns matches in arbitrary order).

    Returns:
    - data (list): One entry per non-drum instrument; each entry is that
      instrument's ``notes`` list as provided by pretty_midi.
    """
    data = []
    for midi_file in sorted(glob.glob('archive/*/*.mid')):
        midi_data = pretty_midi.PrettyMIDI(midi_file)
        # Drum tracks are skipped; only pitched instruments are kept.
        data.extend(
            instrument.notes
            for instrument in midi_data.instruments
            if not instrument.is_drum
        )
    return data

def get_vocab(data):
    """
    Collect the distinct pitches that occur anywhere in the data.

    Parameters:
    - data (iterable): Iterable of note sequences; every note must expose a
      ``pitch`` attribute.

    Returns:
    - vocab (set): The set of all ``pitch`` values seen.
    """
    return {note.pitch for track in data for note in track}

def get_note_to_idx(vocab):
    """
    Map every note in the vocabulary to a class index.

    Indices are assigned in ascending note order rather than in the
    iteration order of ``vocab``, so the mapping depends only on the
    vocabulary's contents and is the exact inverse of ``get_idx_to_note``.

    Parameters:
    - vocab (iterable): Distinct, mutually comparable notes (e.g. pitches).

    Returns:
    - note_to_idx (dict): ``{note: index}`` with indices ``0..len(vocab)-1``.
    """
    return {note: i for i, note in enumerate(sorted(vocab))}

def get_idx_to_note(vocab):
    """
    Map every class index back to its note.

    Uses the same ascending-note ordering as ``get_note_to_idx``, so
    ``get_idx_to_note(v)[get_note_to_idx(v)[n]] == n`` for every note ``n``.

    Parameters:
    - vocab (iterable): Distinct, mutually comparable notes (e.g. pitches).

    Returns:
    - idx_to_note (dict): ``{index: note}`` with indices ``0..len(vocab)-1``.
    """
    return dict(enumerate(sorted(vocab)))

def get_sequences(data, note_to_idx, seq_len=100):
    """
    Cut every track into all overlapping windows of ``seq_len`` note indices.

    A track with ``n >= seq_len`` notes yields ``n - seq_len + 1`` windows
    (stride 1), including the final window that ends on the last note.
    Tracks shorter than ``seq_len`` contribute nothing.

    Parameters:
    - data (iterable): Note sequences; every note must expose ``pitch``.
    - note_to_idx (dict): Mapping from pitch to class index.
    - seq_len (int): Number of notes per window. Defaults to 100.

    Returns:
    - sequences (list[list[int]]): The index windows, in track order.

    Raises:
    - KeyError: If a pitch in a long-enough track is missing from
      ``note_to_idx``.
    """
    sequences = []
    for notes in data:
        if len(notes) < seq_len:
            continue
        # Translate the track once instead of once per window.
        indices = [note_to_idx[note.pitch] for note in notes]
        # "+ 1" keeps the last full window; without it the final note of
        # every track would never appear in the training data.
        for start in range(len(indices) - seq_len + 1):
            sequences.append(indices[start:start + seq_len])
    return sequences


def get_dataloader(sequences, batch_size=64):
    """
    Wrap index sequences in a shuffled DataLoader for next-token training.

    Each sequence is split into an input (all but the last element) and a
    label (all but the first element), so the label at each position is the
    element that follows the corresponding input.

    Parameters:
    - sequences (list[list[int]]): Equal-length index sequences.
    - batch_size (int): Batch size for the loader. Defaults to 64.

    Returns:
    - torch.utils.data.DataLoader: Shuffled loader yielding
      ``(inputs, labels)`` tensor pairs.
    """
    inputs = torch.tensor([seq[:-1] for seq in sequences])
    labels = torch.tensor([seq[1:] for seq in sequences])
    dataset = torch.utils.data.TensorDataset(inputs, labels)
    loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)
    return loader

In [None]:
def train(model, train_loader, optimizer, criterion, device):
    """
    Train the model for one pass over ``train_loader``.

    Parameters:
    - model (nn.Module): Model to optimize; put into training mode.
    - train_loader (iterable): Sized iterable of ``(inputs, labels)`` batches.
    - optimizer (torch.optim.Optimizer): Optimizer over the model parameters.
    - criterion (callable): Loss function called as ``criterion(outputs, labels)``.
    - device (torch.device): Device each batch is moved to.

    Returns:
    - float: Mean of the per-batch losses.

    Raises:
    - ZeroDivisionError: If ``train_loader`` has no batches.
    """
    model.train()
    running_loss = 0.0
    for inputs, labels in train_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        # Sequence models here emit logits with the class axis last, e.g.
        # (batch, seq, classes), with labels shaped (batch, seq). Losses such
        # as nn.CrossEntropyLoss expect the class axis second, so collapse
        # every leading axis into one before computing the loss. Outputs
        # already in another layout (e.g. (batch, classes)) are left alone.
        if outputs.dim() > 2 and outputs.shape[:-1] == labels.shape:
            outputs = outputs.reshape(-1, outputs.size(-1))
            labels = labels.reshape(-1)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    return running_loss / len(train_loader)


def generate(model, start_sequence, length, device):
    """
    Greedily extend a sequence of token indices with the model.

    At every step the whole sequence so far is fed to the model and the
    highest-scoring class at the last position is appended.

    Parameters:
    - model (nn.Module): Model returning per-timestep scores with the class
      axis last; put into eval mode.
    - start_sequence (list[int]): Seed token indices. Not modified.
    - length (int): Number of tokens to generate.
    - device (torch.device): Device the input tensor is moved to.

    Returns:
    - list[int]: A new list holding the seed followed by ``length``
      generated indices.
    """
    model.eval()
    # Work on a copy: appending to the caller's list would silently grow
    # whatever sequence (e.g. a training window) was passed in.
    sequence = list(start_sequence)
    with torch.no_grad():
        for _ in range(length):
            sequence_tensor = torch.tensor(sequence).unsqueeze(0).to(device)
            outputs = model(sequence_tensor)
            # Scores for the final timestep only -> index of the best class.
            predicted = outputs[:, -1].argmax(dim=1)
            sequence.append(predicted.item())
    return sequence

def generate_midi(sequence, idx_to_note, file_path):
    """
    Write a sequence of token indices to a MIDI file as a single track.

    Every index becomes one note on a program-0 instrument, with velocity
    100, starting at its position in the sequence and ending one time unit
    later, so the notes play back to back.

    Parameters:
    - sequence (iterable[int]): Token indices to render.
    - idx_to_note (dict): Mapping from token index to MIDI pitch.
    - file_path (str): Destination path. Missing parent directories are
      created when a string path is given.

    Raises:
    - KeyError: If an index in ``sequence`` is missing from ``idx_to_note``.
    """
    import os

    midi = pretty_midi.PrettyMIDI()
    piano = pretty_midi.Instrument(program=0)
    for step, idx in enumerate(sequence):
        piano.notes.append(
            pretty_midi.Note(
                velocity=100,
                pitch=idx_to_note[idx],
                start=step,
                end=step + 1,
            )
        )
    midi.instruments.append(piano)

    # Writing into a directory that does not exist yet would fail only at
    # this point, i.e. after all the generation work has been done.
    if isinstance(file_path, str):
        parent = os.path.dirname(file_path)
        if parent:
            os.makedirs(parent, exist_ok=True)
    midi.write(file_path)

In [None]:
# Pick the best available backend: CUDA, then MPS, then CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")

# Data pipeline: note lists -> pitch vocabulary -> index maps -> training windows.
data = get_data()
vocab = get_vocab(data)
note_to_idx = get_note_to_idx(vocab)
idx_to_note = get_idx_to_note(vocab)
sequences = get_sequences(data, note_to_idx)
train_loader = get_dataloader(sequences)

# Input and output sizes both equal the vocabulary size: one class per pitch.
model = LSTM(len(vocab), 512, 2, len(vocab)).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()
epochs = 100
for epoch in range(epochs):
    loss = train(model, train_loader, optimizer, criterion, device)
    print(f'Epoch: {epoch+1}/{epochs}, Loss: {loss}')
    # Every 10th epoch, write a sample continuation of a random training window.
    if (epoch+1) % 10 == 0:
        # Pass a copy so the sampled entry of `sequences` can never be
        # modified by the generation step.
        start_sequence = list(sequences[np.random.randint(0, len(sequences))])
        sequence = generate(model, start_sequence, 100, device)
        generate_midi(sequence, idx_to_note, f'generated_midi/epoch_{epoch+1}.mid')

torch.save(model.state_dict(), 'model.pth')


In [None]:
# Rebuild the architecture used for training, then restore its weights.
model = LSTM(len(vocab), 512, 2, len(vocab)).to(device)
# map_location lets a checkpoint saved on one backend (e.g. CUDA) be loaded
# when that backend is not available on the current machine.
model.load_state_dict(torch.load('model.pth', map_location=device))
model.eval()

# Pass a copy so the sampled entry of `sequences` can never be modified by
# the generation step.
start_sequence = list(sequences[np.random.randint(0, len(sequences))])
sequence = generate(model, start_sequence, 100, device)
generate_midi(sequence, idx_to_note, 'generated_midi/test.mid')