In [1]:
import glob
import os
import pandas as pd

import pretty_midi

from IPython.display import clear_output

In [2]:
DATA_PATH = '/tmp/data/'
SEARCH_CRITERION = '**/*.mid'

midi_files = glob.glob(os.path.join(DATA_PATH, SEARCH_CRITERION), recursive=True)

# Simplifying the dataset

To begin with, let's list all the instruments present in the midi files by count, so we can take a decision on where to start.
We will want to only focus on training melodies for one set for now, so it makes sense to pick an instrument (program code), that is present in large quantities, and is likely to have long melodies to train on.

Initial guesses would be a Piano or Guitar program.

Some songs might have duplicates - we'll find a way to deal with this later if necessary.

[See here](https://soundprogramming.net/file-formats/general-midi-instrument-list/) for a list of instrument programs and their names

In [3]:
# Save the filepath for a second pass loading the piano rolls for each instrument we will use

limit = -1

instrument_ary = [[]]
instrument_ary.append(['program', 'is_drum', 'name', 'filepath'])
# This might take a while...
for index, file in enumerate(midi_files[:limit]):
    clear_output(wait=True)
    print("{}/{}: Loading and parsing {}.".format(index, len(midi_files), os.path.basename(file)))
    try:
        pm = pretty_midi.PrettyMIDI(file)
        instruments = pm.instruments

        for instrument in instruments:
            instrument_ary.append([instrument.program, instrument.is_drum, instrument.name.replace(';',''), file])
    except:
        # For now, just ignore files we can't load.
        continue

561/563: Loading and parsing 135.mid.


In [4]:
df = pd.DataFrame(data=instrument_ary, columns=["program", "is_drum", "name", "filepath"])
df = df.dropna()
df.head()

Unnamed: 0,program,is_drum,name,filepath
1,program,is_drum,name,filepath
2,19,False,Soprano,/tmp/data/66.mid
3,19,False,Soprano,/tmp/data/66.mid
4,19,False,Soprano,/tmp/data/66.mid
5,19,False,Soprano,/tmp/data/66.mid


# Save the loaded instruments for further processing

In [5]:
file_name = 'instruments.csv'
df.to_csv(file_name, sep=';', encoding='utf-8')

## Show the instruments that occur most often in different files

We want to get an intuition of which songs might have the most attractive instruments to work on

In [6]:
# Show the instrument with the most unique filepath values
df.groupby('program').nunique().sort_values('filepath', ascending=False)

Unnamed: 0_level_0,program,is_drum,name,filepath
program,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
19,1,1,5,523
40,1,1,1,17
42,1,1,1,15
0,1,1,1,7
6,1,1,1,5
73,1,1,1,4
41,1,1,1,1
program,1,1,1,1
