The music data in this repository are combined from ACPAS, A-MAPS, Classical Piano Midi, and ASAP music datasets.
See the [PM2S repository](https://github.com/cheriell/PM2S/tree/main/dev) for details.

This notebook shows how we processed these music datasets.
You don't need to run this notebook again if you just want to use the dataset;
the data contained in this repo is already processed.

- `pm2s_metadata_r.csv` and `pm2s_metadata_s.csv` come from `dev/metadata/ACPAS/metadata_R.csv` and `dev/metadata/ACPAS/metadata_S.csv` in the PM2S repo, respectively.

In [10]:
import numpy as np
import pandas as pd
from pathlib import Path

# Load the two ACPAS metadata tables. Within each table every performance
# MIDI path must be unique: a set collapses repeats, so a set as large as the
# table means there are none.
meta_r = pd.read_csv("./pm2s_metadata_r.csv")
assert len(set(meta_r["performance_MIDI_external"])) == meta_r.shape[0]
meta_s = pd.read_csv("./pm2s_metadata_s.csv")
assert len(set(meta_s["performance_MIDI_external"])) == meta_s.shape[0]

# Duplicates do exist *across* the two tables, though.
# The "S" (synthetic) table refers to Kontakt instrument-synthesized audio,
# which we don't have access to and don't need. Where a performance appears in
# both tables the MIDI file is the same, so keeping the first occurrence
# (the row from the "S" table, which is concatenated first) loses nothing.
meta = pd.concat([meta_s, meta_r], ignore_index=True).drop_duplicates(
    subset="performance_MIDI_external"
)
print(f"Total {meta.shape[0]} MIDI files")

# Keep only the three path columns we need under shorter names, and discard
# the audio-related and repo-internal columns.
meta = meta.rename(
    columns={
        "performance_MIDI_external": "perf_midi_file",
        "MIDI_score_external": "orig_midi_file",
        "performance_annotation_external": "annot_file",
    }
).drop(
    columns=[
        "performance_id",
        "performance_audio_external",
        "score_annotation_external",
        "folder",
        "performance_audio",
        "performance_MIDI",
        "MIDI_score",
        "performance_annotation",
        "score_annotation",
    ]
)

# A handful of CPM paths differ from the files on disk only by letter case
# ("D850" in the metadata vs. "d850" on disk); map the former to the latter.
remap = {
    "cpm/Schubert/schubert_D850_1.mid": "cpm/Schubert/schubert_d850_1.mid",
    "cpm/Schubert/schubert_D850_2.mid": "cpm/Schubert/schubert_d850_2.mid",
    "cpm/Schubert/schubert_D850_3.mid": "cpm/Schubert/schubert_d850_3.mid",
    "cpm/Schubert/schubert_D850_4.mid": "cpm/Schubert/schubert_d850_4.mid",
}


def map_annot_path(s: "str | float") -> "str | None":
    """Translate an ASAP annotation placeholder path into a repo-relative path.

    Args:
        s: Either a path string containing the ``{ASAP}`` placeholder, or a
            float NaN standing for a missing annotation.

    Returns:
        The path with ``{ASAP}/`` replaced by ``asap/``, or ``None`` when
        ``s`` is NaN.

    Raises:
        ValueError: If ``s`` is neither an ``{ASAP}`` path nor NaN.
    """
    if isinstance(s, str) and "{ASAP}" in s:
        return s.replace("{ASAP}/", "asap/")
    # NaN is the only float that compares unequal to itself.
    if isinstance(s, float) and s != s:
        return None
    # Raise explicitly instead of using `assert`: assertions are stripped
    # under `python -O`, which would silently turn unknown values into None.
    raise ValueError(f"Unknown path: {s}")


def map_midi_path(s: str):
    """Translate a dataset placeholder MIDI path into a repo-relative path.

    The first placeholder found in ``s`` (checked in the order ``{A_MAPS}``,
    ``{ASAP}``, ``{CPM}``) decides which prefix is rewritten. CPM results are
    additionally passed through the module-level ``remap`` table.

    Raises:
        ValueError: If ``s`` contains none of the known placeholders.
    """
    # (placeholder that selects the rule, text to replace, replacement)
    rules = (
        ("{A_MAPS}", "{A_MAPS}/MAPS_MUS-", "amaps/"),
        ("{ASAP}", "{ASAP}/", "asap/"),
        ("{CPM}", "{CPM}/midis/", "cpm/"),
    )
    for marker, old_prefix, new_prefix in rules:
        if marker not in s:
            continue
        mapped = s.replace(old_prefix, new_prefix)
        if marker == "{CPM}":
            # Only CPM paths have entries in the correction table.
            mapped = remap.get(mapped, mapped)
        return mapped
    raise ValueError(f"Unknown path: {s}")


def read_midi_info(midi_file):
    """Load a MIDI file and report its length and total note count.

    Args:
        midi_file: Location of the MIDI file; passed unchanged to
            ``pretty_midi.PrettyMIDI``.

    Returns:
        A ``(duration, n_notes)`` tuple: the value of ``get_end_time()`` for
        the loaded file, and the number of notes summed over all of its
        instruments.
    """
    # Imported here so that pretty_midi is only required when MIDI files are
    # actually read.
    import pretty_midi as pm

    midi = pm.PrettyMIDI(midi_file)
    duration = midi.get_end_time()
    # Built-in sum keeps the count an exact int; np.sum over an empty list
    # (a file with no instruments) would return the float 0.0 instead.
    n_notes = sum(len(inst.notes) for inst in midi.instruments)
    return duration, n_notes


def check_all_files_exist(files):
    """Verify that every non-missing entry of ``files`` is an existing file.

    Args:
        files: Iterable of paths. Entries that are ``None`` or float NaN
            (how pandas represents a missing value) are skipped.

    Raises:
        ValueError: On the first entry that does not point to a regular file.
    """
    for f in files:
        # Skip missing entries. NaN is the only float unequal to itself;
        # passing it on to Path() would raise a TypeError.
        if f is None or (isinstance(f, float) and f != f):
            continue
        path = Path(f)
        if not path.is_file():
            raise ValueError(f"File not found: {path}")


# Rewrite every dataset placeholder path as a path relative to the repo root.
for column in ("perf_midi_file", "orig_midi_file"):
    meta[column] = meta[column].map(map_midi_path)
meta["annot_file"] = meta["annot_file"].map(map_annot_path)

# Recompute duration and note count for every entry from the MIDI file itself.
# This overwrites the existing values, which were wrong for exactly 2 pieces:
# asap/Beethoven/Piano_Sonatas/11-1/MaximovI02M.mid
# asap/Beethoven/Piano_Sonatas/9-1/Tysman05M.mid
midi_info = meta["perf_midi_file"].map(read_midi_info)
durations = midi_info.map(lambda info: info[0])
note_counts = midi_info.map(lambda info: info[1])
meta["duration"] = durations
meta["n_notes"] = note_counts

# "perf_midi_file" needs no existence check: those MIDI files were all loaded
# just above. Only the score MIDI and annotation paths remain to be verified.
paths_to_check = pd.concat([meta["orig_midi_file"], meta["annot_file"]])
check_all_files_exist(paths_to_check.to_numpy())

meta.to_csv("metadata.csv", index=False)

Total 1670 MIDI files


Output format explained:
- `perf_midi_file` is the MIDI file of an entry (not necessarily a human performance MIDI; it depends on the dataset)

- `orig_midi_file` is the score MIDI file of an entry.
  If a piece has multiple performance versions (typically in the `ASAP` dataset), then this points to the non-performance, score-version MIDI of that piece.

- `annot_file` points to the annotation file of an entry. This only exists for `ASAP`;
  for the other 2 datasets, the annotations are built into the MIDI file, and this column's value is `None`.

- `aligned` is a boolean and exists in the original metadata. It's not clear what this means, and it does not seem to do anything in PM2S source code.

- All file paths are relative to the root of this repository.

In [13]:
import pandas as pd


def get_stats(dataset):
    """Summarise one subset of the metadata table.

    Args:
        dataset: DataFrame with ``piece_id``, ``duration`` and ``n_notes``
            columns, one row per performance.

    Returns:
        A dict with the number of distinct ``piece_id`` values ("pieces"),
        the number of rows ("performances"), and the column totals of
        ``duration`` and ``n_notes``.
    """
    # dropna=False counts a missing piece_id as its own value, matching the
    # size of the array returned by .unique().
    piece_count = dataset["piece_id"].nunique(dropna=False)
    performance_count = len(dataset)
    total_duration = dataset["duration"].sum()
    total_notes = dataset["n_notes"].sum()
    return {
        "pieces": piece_count,
        "performances": performance_count,
        "duration": total_duration,
        "n_notes": total_notes,
    }


# Reload the processed metadata and tabulate the statistics of each split,
# one row per split in train / validation / test order.
meta = pd.read_csv("./metadata.csv")
datasets = ["train", "validation", "test"]
split_stats = [get_stats(meta[meta["split"] == name]) for name in datasets]
pd.DataFrame(split_stats, index=datasets)

Unnamed: 0,pieces,performances,duration,n_notes
train,359,1155,340844.519229,3456673
validation,49,135,31208.688997,280316
test,89,380,113079.25801,1255614
