In [None]:
import glob
import os
import pandas as pd

import pretty_midi

from IPython.display import clear_output

# Simplifying the dataset

[We've listed all the instruments present by count](/notebooks/notebooks/Load%20Instruments.ipynb) so we'll build on the work in that notebook, by loading the csv file it has produced.

We want to focus on training melodies for only one set for now, so it makes sense to pick an instrument (program code) that is present in large quantities and is likely to have long melodies to train on.

[See here](https://soundprogramming.net/file-formats/general-midi-instrument-list/) for a list of instrument programs and their names.

In [None]:
# Instrument listing produced by the "Load Instruments" notebook
# (semicolon-separated, first column is the row index).
instruments = pd.read_csv(
    'instruments.csv',
    index_col=0,
    sep=';',
)
instruments.head(n=5)

## We'll focus on one of the most popular instruments

Let's list the top used instruments below:

You'll notice the name column has a large count. The name appears to be a free text field for description of the intention of this track in the MIDI file, the program is the instrument used.

Because these midi files are created by humans, there's a risk that people will intend for a track to be a certain type, but have used a program they liked the sound of in their midi engine when creating the track. It is common for vocals to be coded as Strings, (52 Choir Aahs, 53 Voice Oohs), for instance. We're going to disregard this problem for now, and assume the program codes are accurate.

In [None]:
# Count distinct values per program code and rank the programs by how many
# different files (unique filepath values) they appear in.
grouped = (
    instruments
    .groupby('program')
    .nunique()
    .sort_values(by='filepath', ascending=False)
)
grouped.head(10)

In [None]:
# Print the instrument name behind each of the ten top-ranked program codes
for program in grouped.index.values[:10]:
    print(pretty_midi.program_to_instrument_name(int(program)))

The top instrument used is 0: Acoustic Grand Piano.

It doesn't seem to be the most reliable program of them all (7281 different plaintext names, compared to 1588 for the next in line), which suggests that it is being used for a bunch of different things.
We'll continue inspecting this.

In [None]:
# Peek at the tracks that use program 0
instruments.loc[instruments['program'] == 0].head(10)

Not exactly all pianos.
Let's try to filter for names that contain the word "piano"

In [None]:
# Keep program-0 tracks whose free-text name mentions "piano" (case-insensitive)
pianos = instruments.loc[
    instruments['program'].eq(0)
    & instruments['name'].str.contains('piano', case=False)
]
print(pianos.shape[0])
pianos.head(10)

Much better.

But now we're reduced to 3183 files. It'll have to do for now.

# Loading all files for the instrument

We'll iterate over our piano list and load all the midi files, get the piano rolls and save the result as a CSV for later consumption.

This will take a few minutes, since parsing takes long in itself.

In [None]:
# Column labels for a full piano roll: the note name of every MIDI note number 0..127
note_columns = list(map(pretty_midi.note_number_to_name, range(128)))
def encode_dummies(instrument, sampling_freq):
    """Return the instrument's piano roll as a DataFrame, one column per note.

    The piano roll is sampled at ``sampling_freq``, cast to integers and
    transposed before being wrapped in a DataFrame whose columns are labelled
    with ``note_columns``. Nothing is trimmed here, so the frame can contain
    a lot of blank space.
    """
    piano_roll = instrument.get_piano_roll(fs=sampling_freq)
    transposed = piano_roll.astype('int').T
    return pd.DataFrame(transposed, columns=note_columns)

def trim_blanks(df):
    """Drop the leading rows in which nothing is played.

    Fast-forwards to the first row that contains at least one non-zero value,
    i.e. to where the first note begins for this instrument.

    Parameters
    ----------
    df : pandas.DataFrame
        Piano roll with one row per time step and one column per note.

    Returns
    -------
    pandas.DataFrame or None
        ``df`` from the first active row onwards (original index labels are
        kept), or ``None`` when the frame has no non-zero value at all.
    """
    # A cell counts as activity only when it holds a real, non-zero value;
    # missing values are treated as blank.
    active_rows = (df.ne(0) & df.notna()).any(axis=1).values
    if not active_rows.any():
        return None
    # argmax on a boolean array gives the position of the first True. Working
    # with positions (not index labels) keeps the iloc slice correct for any index.
    first_active = int(active_rows.argmax())
    return df.iloc[first_active:]

# Create an empty DataFrame to store the piano rolls in.
# Rows are keyed by (piano_roll_name, timestep) and there is one column per
# kept note (the names of MIDI notes 48..108). It'll look a bit like this:
#
#     |   Index     | t | first note |...| last note |
#     |Song_name_3:0| 0 |     40     |...|     0     |
#     |             | 1 |     40     |...|     0     |
#     |             |...|    ...     |...|    ...    |
#     |             | N |     40     |...|     0     |
#     |Song_name_3:1| 0 |     40     |...|     0     |
#     |             |...|    ...     |...|    ...    |
#     |             | N |     40     |...|     0     |
#     |Song_name_4:0| 0 |     40     |...|     0     |
#     |             |...|    ...     |...|    ...    |
#     |             | N |     40     |...|     0     |
saved_columns = list(map(pretty_midi.note_number_to_name, range(48, 109)))
piano_rolls = pd.DataFrame(
    columns=['piano_roll_name', 'timestep'] + saved_columns
).set_index(['piano_roll_name', 'timestep'])
piano_rolls.head()

# Parsing the data to csv

The following step takes too long and is projected to take up roughly 30 GB of space, so let's do some more splitting work on it (to reduce breaks between segments). We could also consider cutting away large parts of the tonal range, for example by only using songs where the instruments stay between C4 and C8.

Alternatively, we can load in the files directly during training in a data loader, but I'd like to clean and split the songs beforehand so they're easier to train on.

We could split all the songs into x-bar segments and save them as .mid files in a processed folder. Should take up less space and be easier.

In [None]:
# Only the first `limit` files are parsed for now.
limit = 100

file_name = 'piano_rolls.csv'
# Start the output from scratch with the empty `piano_rolls` frame, so the file
# begins with the header row; every track is appended below without a header.
piano_rolls.to_csv(file_name, sep=';', encoding='utf-8')

# NOTE(review): `pianos` looks like it holds one row per matching track, so a
# file with several piano tracks would be listed (and parsed) more than once.
# The inner loop already handles every matching track of a file, so each path
# is visited a single time. `iloc` makes the "first `limit` files" positional.
midi_paths = pianos['filepath'].drop_duplicates().iloc[:limit]
# (song name, reason) for every file that could not be used.
skipped = []

# This might take a while...
for i, file in enumerate(midi_paths):
    clear_output(wait=True)
    song_name = os.path.basename(file)
    # Report 1-based progress against the number of files actually processed.
    print("{}/{}: Loading and parsing {}.".format(i + 1, len(midi_paths), song_name))
    try:
        pm = pretty_midi.PrettyMIDI(file)
        beats = pm.get_beats()
    except Exception as error:
        # For now, just skip files we can't load. `Exception` (rather than a
        # bare `except`) lets a kernel interrupt still stop this long loop.
        skipped.append((song_name, repr(error)))
        continue

    if len(beats) < 2 or beats[1] <= 0:
        # Without a second beat there is no beat length to derive a rate from.
        skipped.append((song_name, 'fewer than two beats'))
        continue

    # Seconds per sixteenth note: the time of the second beat (which is the same
    # as the difference in seconds between the first and second beat) divided by
    # four, converted to the sampling frequency format that pretty_midi expects.
    sampling_freq = 1 / (beats[1] / 4)

    for j, instrument in enumerate(pm.instruments):
        # Hardcoded for now: the same program/name filter that built `pianos`.
        if instrument.program != 0 or 'piano' not in instrument.name.lower():
            continue

        # Unique top level index per song and per instrument in this song,
        # in case it has multiples of the same kind.
        top_level_index = "{}_{}:{}".format(song_name, i, j)

        df = encode_dummies(instrument, sampling_freq).fillna(value=0)  # Fill invalid values
        df = trim_blanks(df)
        if df is None:
            # We've got an empty track
            continue

        # `assign` builds a new frame instead of writing into the slice returned
        # by `trim_blanks`. Selecting `piano_rolls.columns` keeps exactly the
        # note columns announced in the CSV header, in the same order.
        df = (
            df.assign(timestep=df.index, piano_roll_name=top_level_index)
              .set_index(['piano_roll_name', 'timestep'])
              [list(piano_rolls.columns)]
        )
        df.to_csv(file_name, sep=';', mode='a', encoding='utf-8', header=False)

if skipped:
    print("Skipped {} file(s) that could not be used:".format(len(skipped)))
    for skipped_name, reason in skipped:
        print("  {}: {}".format(skipped_name, reason))

In [None]:
import librosa.display as display
import matplotlib.pyplot as plt
%matplotlib inline

# Read the exported piano rolls back, keyed by (piano_roll_name, timestep).
rolls = pd.read_csv(file_name, sep=';', index_col=['piano_roll_name', 'timestep'])
# Pick one track by its top-level index; transpose so notes run along the rows
# and time along the columns, which is the layout specshow draws.
first = rolls.loc['The Only Way Is Up.mid_0:4'].T
# The lowest stored note is MIDI note 48, so tell specshow where the note axis
# starts. Without `fmin` the 'cqt_note' axis is labelled from librosa's default
# lowest bin instead of the actual first row.
display.specshow(
    first.values,
    y_axis='cqt_note',
    fmin=pretty_midi.note_number_to_hz(48),
    cmap=plt.cm.hot,
)

In [None]:
# Drop every note column that is zero in all rows, then show the last 16 rows
rolls = rolls.loc[:, rolls.ne(0).any(axis='index')]
rolls.tail(16)