In [2]:
import json

import fortepyan as ff
import numpy as np
from tqdm import tqdm
from datasets import Dataset, load_dataset

from data_masked.masking import AwesomeMasks

In [3]:
def process_dataset(dataset: Dataset, sequence_len: int, sequence_step: int, masks: AwesomeMasks) -> list[dict]:
    """Window every record of `dataset` and gather the resulting sequences.

    Each record is converted with `ff.MidiPiece.from_huggingface` and handed
    to `process_record`; the per-piece lists are concatenated in dataset order.

    Args:
        dataset: Records accepted by `ff.MidiPiece.from_huggingface`; its
            `num_rows` attribute sizes the progress bar.
        sequence_len: Number of notes in each extracted sequence.
        sequence_step: Offset, in notes, between the starts of consecutive sequences.
        masks: Mask collection forwarded unchanged to `process_record`.

    Returns:
        One flat list holding the sequence dicts of all records.
    """
    all_sequences = []

    progress = tqdm(dataset, total=dataset.num_rows)
    for raw_record in progress:
        midi_piece = ff.MidiPiece.from_huggingface(raw_record)
        all_sequences.extend(process_record(midi_piece, sequence_len, sequence_step, masks))

    return all_sequences


def process_record(piece: ff.MidiPiece, sequence_len: int, sequence_step: int, masks: AwesomeMasks) -> list[dict]:
    """Cut one piece into fixed-length note windows and attach masking spaces.

    Side effect: adds the columns `next_start` and `dstart` to `piece.df`.
    `dstart` is the difference between the next row's `start` and the current
    row's `start`, with 0 for the last row of the piece.

    Args:
        piece: Piece to split; `piece.source` must contain `"midi_filename"`.
        sequence_len: Number of notes per window.
        sequence_step: Offset, in notes, between the starts of consecutive windows.
        masks: Object whose `masks` attribute is iterated; every entry supplies
            a `token` and a `masking_space(sequence)` method.

    Returns:
        One dict per window with the keys `midi_filename`, `source`
        (JSON string), `pitch`, `dstart`, `duration`, `velocity` (arrays) and
        `masking_spaces` (mask token -> result of `masking_space`). The list
        is empty when the piece holds fewer than `sequence_len` notes.
    """
    # Onset-to-onset gaps are computed on the whole piece before slicing.
    piece.df["next_start"] = piece.df["start"].shift(-1)
    piece.df["dstart"] = (piece.df["next_start"] - piece.df["start"]).fillna(0)

    midi_filename = piece.source["midi_filename"]

    # Floor division makes this <= 0 for pieces shorter than one window.
    n_windows = 1 + (piece.size - sequence_len) // sequence_step
    window_starts = [idx * sequence_step for idx in range(n_windows)]

    sequences = []
    for begin in window_starts:
        part = piece[begin : begin + sequence_len]
        notes = part.df

        sequence = {
            "midi_filename": midi_filename,
            "source": json.dumps(part.source),
            "pitch": notes["pitch"].astype("int16").values,
            "dstart": notes["dstart"].astype("float32").values,
            "duration": notes["duration"].astype("float32").values,
            "velocity": notes["velocity"].astype("int16").values,
        }

        # Each mask sees the sequence before the "masking_spaces" key exists.
        sequence["masking_spaces"] = {mask.token: mask.masking_space(sequence) for mask in masks.masks}

        sequences.append(sequence)

    return sequences

def average_record_length(
    dataset: "Dataset",
) -> tuple[np.floating, np.floating, np.integer, np.integer, np.floating]:
    """Summarise how many notes each record of `dataset` contains.

    Despite the name, five statistics are returned, not only the average.

    Args:
        dataset: Iterable of records in which `record["notes"]["pitch"]` is a
            sized sequence; its length is taken as the record's note count.

    Returns:
        `(mean, std, max, min, median)` of the per-record note counts, as
        NumPy scalars. `std` uses the `np.std` default (`ddof=0`).

    Raises:
        ValueError: If `dataset` yields no records.
    """
    # Convert once so the five reductions do not each rebuild an array from the list.
    lengths = np.asarray([len(record["notes"]["pitch"]) for record in dataset])
    if lengths.size == 0:
        # Without this guard NumPy emits RuntimeWarnings for mean/std and then
        # fails inside np.max with a message that does not mention the dataset.
        raise ValueError("Cannot compute record length statistics for an empty dataset")
    return np.mean(lengths), np.std(lengths), np.max(lengths), np.min(lengths), np.median(lengths)

In [4]:
# Fetch the dataset and bind each of its three splits to its own name.
dataset = load_dataset("roszcz/maestro-v1-sustain")
train_dataset, val_dataset, test_dataset = (dataset[split] for split in ("train", "validation", "test"))

Downloading readme:   0%|          | 0.00/842 [00:00<?, ?B/s]

Downloading and preparing dataset None/None to C:/Users/samue/.cache/huggingface/datasets/roszcz___parquet/roszcz--maestro-v1-sustain-5350ada51983a2ef/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/69.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/8.38M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/9.57M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/962 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/137 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/177 [00:00<?, ? examples/s]

Dataset parquet downloaded and prepared to C:/Users/samue/.cache/huggingface/datasets/roszcz___parquet/roszcz--maestro-v1-sustain-5350ada51983a2ef/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [5]:
# Mask collection built with default arguments; `process_record` iterates over
# `masks.masks` and calls `masking_space` on every entry.
masks = AwesomeMasks()

Time the processing of the test split with minimal coverage, i.e. non-overlapping windows (`sequence_step` equal to `sequence_len`).

In [6]:
%%timeit -n 10 -r 1
# Times a full pass over the test split, 10 loops in a single repeat, using
# windows of 60 notes that start every 60 notes (no overlap between windows).
# NOTE(review): names assigned inside a %%timeit cell are not kept in the
# notebook namespace, so `test_records` is not available to later cells.
test_records = process_dataset(test_dataset, sequence_len=60, sequence_step=60, masks=masks)

100%|██████████| 177/177 [00:18<00:00,  9.67it/s]
100%|██████████| 177/177 [00:17<00:00,  9.86it/s]
100%|██████████| 177/177 [00:18<00:00,  9.66it/s]
100%|██████████| 177/177 [00:17<00:00,  9.87it/s]
100%|██████████| 177/177 [00:17<00:00,  9.84it/s]
100%|██████████| 177/177 [00:18<00:00,  9.71it/s]
100%|██████████| 177/177 [00:18<00:00,  9.75it/s]
100%|██████████| 177/177 [00:17<00:00,  9.89it/s]
100%|██████████| 177/177 [00:18<00:00,  9.35it/s]
100%|██████████| 177/177 [00:18<00:00,  9.63it/s]

18.2 s ± 0 ns per loop (mean ± std. dev. of 1 run, 10 loops each)





In [7]:
# Note-count statistics over the records of the test split.
length_stats = average_record_length(test_dataset)
mean, std, max_len, min_len, median = length_stats
print(f"Mean: {mean}, std: {std}, max: {max_len}, min: {min_len}, median: {median}")

Mean: 4188.757062146893, std: 3130.797449499849, max: 16966, min: 366, median: 3264.0
