In [3]:
# TODO: Data loader module -- see:
# https://towardsdatascience.com/how-to-use-datasets-and-dataloader-in-pytorch-for-custom-text-data-270eed7f7c00

%load_ext autoreload
%autoreload 2

In [65]:
from torch.utils.data import Dataset, DataLoader
import torch
import mido
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt

In [40]:
# Sentinel values written into the note lists built by create_lists.
# NOTE(review): both appear chosen to sit outside the 0-127 MIDI note range so they
# cannot collide with a real note number -- confirm before changing either value.

# Appended to a track's note list after every bundle of simultaneous events.
BUNDLE_SEPARATOR = -1
# Appended once, as the last entry of each track's note list.
EOS = 1024

In [36]:
def _finalize_bundle(bundle):
    """Return the bundle sorted by ascending note, keeping the delta only on the first entry."""
    # The first event collected is the one that carried the bundle's delta time.
    delta = bundle[0]['time']
    ordered = sorted(bundle, key=lambda event: int(event['note']))

    # Only the first entry of the sorted bundle keeps the time stamp; the rest are zeroed.
    for event in ordered:
        event['time'] = 0
    ordered[0]['time'] = delta
    return ordered


def bundle_events(midi_file: mido.MidiFile):
    """Group the note events of every track into bundles of simultaneous events.

    A bundle is a run of consecutive `note_on` / `note_off` messages in which every
    message after the first has a delta time of 0. Each bundle is sorted by ascending
    note number; its first entry carries the bundle's delta time and all other
    entries have `time == 0`.

    Messages of any other type are not bundled. Their delta time is carried over to
    the next note event, because a message's `time` inside a track is relative to
    the previous message of any type. Dropping it would shift every later note
    earlier and could merge notes that are not actually simultaneous.

    Args:
        midi_file: Object exposing `tracks`, an iterable of tracks whose messages
            provide `dict()` (as `mido.MidiFile` does).

    Returns:
        A list with one entry per track; each entry is the list of that track's
        bundles, and each bundle is a list of message attribute dicts.
        Dimensions: (tracks, bundles per track).
    """
    track_bundles = []

    for track in midi_file.tracks:
        bundles = []
        bundle = []
        # Ticks carried by skipped (non-note) messages since the last note event.
        skipped_ticks = 0

        for msg in track:
            attrs = msg.dict()

            if attrs["type"] not in ('note_on', 'note_off'):
                skipped_ticks += attrs.get('time', 0)
                continue

            # `attrs` is this function's own copy, so adjusting it leaves `msg` untouched.
            attrs['time'] += skipped_ticks
            skipped_ticks = 0

            # A non-zero delta means this event starts a new bundle.
            if bundle and attrs['time'] != 0:
                bundles.append(_finalize_bundle(bundle))
                bundle = []
            bundle.append(attrs)

        # Flush the bundle still being collected when the track ends; otherwise the
        # final chord/note of every track would be lost.
        if bundle:
            bundles.append(_finalize_bundle(bundle))

        track_bundles.append(bundles)
    return track_bundles

In [57]:
def create_lists(track_bundles):
    """Flatten per-track bundles into note, delta and velocity lists.

    For every track in `track_bundles` three lists are built:

    * notes      -- the note of every event, with BUNDLE_SEPARATOR after each
                    bundle and EOS as the final entry.
    * velocities -- the velocity of every event divided by 127 (so a 0-127 value
                    lands between 0 and 1), with a 0 lined up against each
                    separator and against EOS.
    * deltas     -- one entry per bundle (the `time` of the bundle's first event)
                    followed by a closing 0. Deltas therefore correspond to the
                    bundle separators, not to the individual notes.

    Returns:
        (track_notes, track_deltas, track_velocities), each a list with one
        inner list per track.
    """
    all_notes, all_deltas, all_velocities = [], [], []

    for bundles in track_bundles:
        note_seq, delta_seq, velocity_seq = [], [], []

        for bundle in bundles:
            note_seq += [event['note'] for event in bundle]
            velocity_seq += [event['velocity'] / 127 for event in bundle]

            # Close the bundle: separator in the notes, its delta, and a padding velocity.
            note_seq.append(BUNDLE_SEPARATOR)
            delta_seq.append(bundle[0]['time'])
            velocity_seq.append(0)

        # Terminate the track.
        note_seq.append(EOS)
        delta_seq.append(0)
        velocity_seq.append(0)

        all_notes.append(note_seq)
        all_deltas.append(delta_seq)
        all_velocities.append(velocity_seq)

    return all_notes, all_deltas, all_velocities

In [59]:
def process_midi(path, logs = False, track = 0):
    """Load a MIDI file and return the note, delta and velocity lists of one track.

    Args:
        path: Location of the MIDI file; passed straight to `mido.MidiFile`.
        logs: When True, print the path and the number of messages in the
            selected track.
        track: Index of the track to return. Defaults to 0, which reproduces the
            previous hard-coded behaviour. In multi-track files the notes may
            live in a track other than the first, so callers can now pick it.

    Returns:
        A `(notes, deltas, velocities)` tuple for the selected track, in the
        format produced by `create_lists`.

    Raises:
        IndexError: If the file has no track at index `track`.
    """
    midi_file = mido.MidiFile(path)

    if logs:
        print(path)
        print("Samples: ", len(midi_file.tracks[track]))

    track_bundles = bundle_events(midi_file)
    track_notes, track_deltas, track_velocities = create_lists(track_bundles)

    return track_notes[track], track_deltas[track], track_velocities[track]

# Dataset Creation

In [68]:
def make_dataset(midis) -> pd.DataFrame:
    """Build a DataFrame with one row per MIDI file.

    Args:
        midis: Iterable of MIDI file paths, each handed to `process_midi`.

    Returns:
        A DataFrame with the columns "notes", "deltas" and "velocities". Every
        cell holds the corresponding list returned by `process_midi` for that
        file. An empty `midis` yields an empty frame with the same columns.
    """
    columns = ["notes", "deltas", "velocities"]

    # Collect all rows first and build the frame once: enlarging a DataFrame one
    # `.loc` row at a time re-allocates on every insert.
    records = [dict(zip(columns, process_midi(mid))) for mid in midis]

    return pd.DataFrame(records, columns=columns)

In [64]:
class MidiDataset(Dataset):
    """Dataset whose items are the note, delta and velocity entries at one position.

    Args:
        notes: Sized, indexable collection with one entry per sample.
        deltas: Indexable collection aligned with `notes`.
        velocities: Indexable collection aligned with `notes`.

    The collections are stored as given. Items are always looked up by
    position, so pandas Series work regardless of their index labels.
    """

    def __init__(self, notes, deltas, velocities):
        self.notes = notes
        self.deltas = deltas
        self.velocities = velocities

    def __len__(self):
        """Number of samples, taken from `notes`."""
        return len(self.notes)

    @staticmethod
    def _item_at(sequence, idx):
        """Return the element at position `idx` of `sequence`."""
        # Plain `series[idx]` is a label lookup on a pandas Series, which breaks
        # (or returns the wrong row) once the index is not 0..n-1, e.g. after
        # filtering or a train/test split. `.iloc` is purely positional.
        positional = getattr(sequence, "iloc", None)
        return sequence[idx] if positional is None else positional[idx]

    def __getitem__(self, idx):
        """Return `{"notes", "deltas", "velocities"}` for the sample at position `idx`."""
        return {
            "notes": self._item_at(self.notes, idx),
            "deltas": self._item_at(self.deltas, idx),
            "velocities": self._item_at(self.velocities, idx),
        }

In [71]:
# Example input: a single MIDI file, given as a path relative to the working directory.
MIDI_PATH = r"data/Ludwig van Beethoven/Ode to Joy from the 9th Symphony.mid"
# One-row DataFrame (columns: notes, deltas, velocities) built from that file.
df = make_dataset([MIDI_PATH])

                                               notes  \
0  [57, 62, 65, 69, 70, 74, 77, 82, -1, 57, 62, 6...   

                                              deltas  \
0  [480, 239, 1, 59, 1, 59, 1, 59, 1, 59, 1, 59, ...   

                                          velocities  
0  [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0, 0....  


In [74]:
# Wrap the three DataFrame columns in a Dataset; item i holds the lists of row i (one MIDI file).
midi = MidiDataset(df['notes'], df['deltas'], df['velocities'])