In [None]:
'''
Welcome to Music SketchNet! 
--------------------------
This is the first file you should open in this folder to get started.
We will guide you in each process of the Music SketchNet, including:
    1. data processing
    2. model construction
    3. model training/inference
    4. evaluation.
--------------------------
This file processes the dataset that is used in Music SketchNet.

Before going into the process, we would like you to know that we use a self-defined MIDI_Loader to process the Irish MIDI files.
One problem you should be aware of is that there is a timing bias (~1/960 sec) in the Irish MIDI files.
Therefore, we offset this bias in the MIDI_Loader implementation (c_bias = 1.0 / 960).
With our code, you can process the Irish dataset.

If you want to process other datasets, you have two choices:
1) replace c_bias = 1.0/960 with c_bias = 0.0; you should also check how your MIDI files differ from the Irish MIDI files.
2) check our code, and write your own processing script (we recommend this)

At the very least, with our code you can use the Irish dataset to go through the whole process, which will give you a solid example of how to use it.

Please ignore the "Nottingham" we define in the MIDI_Loader. 
Most of this code can also help you process Nottingham, another folk song dataset, but there might be some problems with it.
'''
import os
import copy
import random
import numpy as np
import pretty_midi as pyd
from loader.dataloader import MIDI_Loader

# Location of the raw MIDI files. The two values are joined with os.path.join
# when the dataset is loaded, so an empty s_dir means dataset_path is used as given.
s_dir = "" # base folder address
dataset_path = "data/IrishFolkSong/session/" # dataset path, joined onto s_dir


In [None]:
# Load data from the MIDI files. Because bpm = 120, one beat lasts 60 / 120 = 0.5 sec.
# In 4/4 we divide the 4 beats into 24 steps/frames, so each step lasts 0.5 * 4 / 24 = 0.5 / 6 sec.
# This takes a fairly long time.
ml = MIDI_Loader("Irish",minStep = 0.5 / 6)
ml.load(os.path.join(s_dir, dataset_path))

In [None]:
# Process all loaded files into measure data for VAE training
s = ml.processed_all()

In [None]:
# Now you should be able to see what a processed file looks like
print(s[0])

In [None]:
# Only the notes are needed downstream, so to save space the "raw" entry of
# every processed dict is overwritten with an empty string (the key itself stays).
for entry in s:
    entry["raw"] = ""

In [None]:
# Split the dataset into train / validation / test = 70% / 20% / 10%.
#
# The shuffle uses its own seeded generator so the split is repeatable and the
# global `random` state is left untouched. Without a seed, every run produced a
# different split, so the saved train/validate/test sets could not be reproduced.
# NOTE: the shuffle is in place, so re-running this cell alone shuffles the
# already-shuffled list again; restart and run all cells to get the canonical split.
# NOTE(review): reproducibility also assumes processed_all() returns the files
# in a stable order -- confirm in MIDI_Loader.
split_seed = 42
ratio = [int(len(s) * 0.7), int(len(s) * 0.9)]  # cut points at 70% and 90%
random.Random(split_seed).shuffle(s)
train_s = s[:ratio[0]]
validate_s = s[ratio[0]:ratio[1]]
test_s = s[ratio[1]:]
print(len(train_s),len(validate_s),len(test_s))

In [None]:
# Write each split to its own .npy file under data/ (train, validate, test).
split_files = (
    ("data/irish_train.npy", train_s),
    ("data/irish_validate.npy", validate_s),
    ("data/irish_test.npy", test_s),
)
for npy_path, split_items in split_files:
    np.save(npy_path, split_items)