# Processing All Data

#### Giant Piano MIDI, ASAP, Maestro, and ATEPP

## Library Import

In [1]:
import miditok
import os
import pickle
import json
import random
from tqdm import tqdm
import copy
from collections import Counter

import random
import shutil

from collections import defaultdict

## Get file paths

In [58]:
filez_path = 'norm_midi'
filez = list()
for (dirpath, dirnames, filenames) in os.walk(filez_path):
    filez += [os.path.join(dirpath, file) for file in filenames if file.lower().endswith((".mid", ".midi"))]

In [57]:
filez_path2 = 'transposed_midi'
filez2 = list()
for (dirpath, dirnames, filenames) in os.walk(filez_path2):
    filez2 += [os.path.join(dirpath, file) for file in filenames if file.lower().endswith((".mid", ".midi"))]

In [3]:
len(filez)

38548

In [22]:
len(filez2)

3348

## Tokenize

#### Define a validation function to not process short (<256 notes) MIDIs

In [4]:
def midi_valid(midi) -> bool:
    if len(midi.instruments[0].notes) < 300:
        return False  # this MIDI is too short
    return True

In [7]:
# Our parameters
CHORD_MAPS = {
    "min": (0, 3, 7),
    "maj": (0, 4, 7),
    "dim": (0, 3, 6),
    "aug": (0, 4, 8),
    "sus2": (0, 2, 7),
    "sus4": (0, 5, 7),
    "7dom": (0, 4, 7, 10),
    "7min": (0, 3, 7, 10),
    "7maj": (0, 4, 7, 11),
    "7halfdim": (0, 3, 6, 10),
    "7dim": (0, 3, 6, 9),
    "7aug": (0, 4, 8, 11),
    "9maj": (0, 4, 7, 10, 14),
    "9min": (0, 4, 7, 10, 13),
}

TS_MAPS = {16: [15, 11, 7, 4, 3, 12, 6], 8: [3, 12, 6, 7, 5, 9], 4: [5, 6, 3, 2, 1, 4, 9], 2: [3, 2, 4, 1]}


TOKENIZER_PARAMS2 = {
    "pitch_range": (21, 109),
    "beat_res": {(0, 4): 8, (4, 12): 4},
    "nb_velocities": 32,
    "special_tokens": ["PAD", "EOS"],
    "use_chords": True,
    "chord_maps": CHORD_MAPS,
    "use_rests": True,
    "use_tempos": True,
    "use_time_signatures": True,
    "time_signature_range": TS_MAPS,
    "use_programs": False,
    "use_sustain_pedals": False,
    "nb_tempos": 32,  # nb of tempo bins
    "tempo_range": (40, 250),  # (min, max)
}
config = miditok.TokenizerConfig(**TOKENIZER_PARAMS2)

# Creates the tokenizer
tokenizer = miditok.REMI(config)

In [8]:
len(tokenizer.vocab.keys())

361

In [54]:
pickle.dump(tokenizer, open("tokenizer_REMI_ext_np.p", "wb"))

In [11]:
tokenizer.vocab

{'PAD_None': 0,
 'EOS_None': 1,
 'Bar_None': 2,
 'Pitch_21': 3,
 'Pitch_22': 4,
 'Pitch_23': 5,
 'Pitch_24': 6,
 'Pitch_25': 7,
 'Pitch_26': 8,
 'Pitch_27': 9,
 'Pitch_28': 10,
 'Pitch_29': 11,
 'Pitch_30': 12,
 'Pitch_31': 13,
 'Pitch_32': 14,
 'Pitch_33': 15,
 'Pitch_34': 16,
 'Pitch_35': 17,
 'Pitch_36': 18,
 'Pitch_37': 19,
 'Pitch_38': 20,
 'Pitch_39': 21,
 'Pitch_40': 22,
 'Pitch_41': 23,
 'Pitch_42': 24,
 'Pitch_43': 25,
 'Pitch_44': 26,
 'Pitch_45': 27,
 'Pitch_46': 28,
 'Pitch_47': 29,
 'Pitch_48': 30,
 'Pitch_49': 31,
 'Pitch_50': 32,
 'Pitch_51': 33,
 'Pitch_52': 34,
 'Pitch_53': 35,
 'Pitch_54': 36,
 'Pitch_55': 37,
 'Pitch_56': 38,
 'Pitch_57': 39,
 'Pitch_58': 40,
 'Pitch_59': 41,
 'Pitch_60': 42,
 'Pitch_61': 43,
 'Pitch_62': 44,
 'Pitch_63': 45,
 'Pitch_64': 46,
 'Pitch_65': 47,
 'Pitch_66': 48,
 'Pitch_67': 49,
 'Pitch_68': 50,
 'Pitch_69': 51,
 'Pitch_70': 52,
 'Pitch_71': 53,
 'Pitch_72': 54,
 'Pitch_73': 55,
 'Pitch_74': 56,
 'Pitch_75': 57,
 'Pitch_76': 58,
 'Pitch

In [59]:
tokenizer.tokenize_midi_dataset(midi_paths = filez, out_dir = "REMI_tokens_norm2", validation_fn = midi_valid) #error at 2601+3807

Tokenizing MIDIs (REMI_tokens_norm2): 100%|██████████| 38548/38548 [1:04:19<00:00,  9.99it/s]


In [60]:
tokenizer.tokenize_midi_dataset(midi_paths = filez2, out_dir = "REMI_tokens_norm2", validation_fn = midi_valid) #error at 2601+3807

Tokenizing MIDIs (REMI_tokens_norm2): 100%|██████████| 3348/3348 [08:25<00:00,  6.63it/s]


### Ensure all files are over 1024 tokens length

In [61]:
# Define the maximum size for JSON files in kilobytes
max_file_size_kb = 80

# Define the minimum length for the list
minimum_list_length = 1048

# Function to check and move JSON files
def process_directory(directory, destination_directory):
    c = 0
    for root, _, files in os.walk(directory):
        for filename in files:
            if filename.endswith('.json'):
                file_path = os.path.join(root, filename)

                # Check file size before processing
                file_size_kb = os.path.getsize(file_path) / 1024  # Convert to kilobytes
                if file_size_kb <= max_file_size_kb:
                    with open(file_path, 'r') as json_file:
                        data = json.load(json_file)
                    if "ids" in data and len(data["ids"]) > 0 and len(data["ids"][0]) < minimum_list_length:
                        destination_path = os.path.join(destination_directory, filename)
                        shutil.move(file_path, destination_path)
                        print(f"Moved: {file_path} to {destination_path}")
                        c+=1
                else:
                    pass
    return c

In [62]:
source_directory = 'REMI_tokens_norm'
destination_directory = 'data_too_short'

process_directory(source_directory, destination_directory)

0

## Split into train/val/test

In [63]:
# Set the main directory path
main_directory = "REMI_tokens_norm2"

# Define the train, validation, and test split percentages
train_percent = 0.945  # 94.5% of the data for training
validation_percent = 0.05  # 5% for validation - 0.5% left for testing


# Create destination directories for each split
train_dir = 'data_remi_norm2/train'
validation_dir = 'data_remi_norm2/val'
test_dir = 'data_remi_norm2/test'


files = list()
for (dirpath, dirnames, filenames) in os.walk(main_directory):
    files += [os.path.join(dirpath, file) for file in filenames]

random.shuffle(files)
    
# Calculate the number of files for each split
num_files = len(files)
num_train = int(train_percent * num_files)
num_validation = int(validation_percent * num_files)
num_test = num_files - num_train - num_validation

# Distribute the files into train, validation, and test splits
train_files = files[:num_train]
validation_files = files[num_train:num_train + num_validation]
test_files = files[num_train + num_validation:]

# Copy or move the files into the respective split directories
for file in tqdm(train_files):
    shutil.copy(file, train_dir)

for file in validation_files:
    shutil.copy(file, validation_dir)

for file in test_files:
    shutil.copy(file, test_dir)



100%|██████████| 22268/22268 [00:03<00:00, 5690.99it/s]


### Perform Data Augmentation

In [2]:
tokenizer = pickle.load(open("tokenizer_REMI_ext.p", "rb"))

In [3]:
tokenizer

363 tokens with ('I', 'T') io format, without BPE

In [4]:
miditok.data_augmentation.data_augmentation_dataset(data_path = 'data_remi_norm/train',
                                                    tokenizer=tokenizer,
                                                    nb_octave_offset=2,
                                                    nb_vel_offset=2,
                                                    out_path = 'data_remi_norm_aug/train',
                                                    copy_original_in_new_location=True)

Performing data augmentation: 100%|██████████| 21919/21919 [07:52<00:00, 46.37it/s]


TypeError: unsupported operand type(s) for /: 'str' and 'str'

### Perform BPE

In [28]:
tokenizer_bpe = copy.deepcopy(tokenizer)

In [43]:
tokens = list()
filez_path = "data_remi_norm"
for (dirpath, dirnames, filenames) in os.walk(filez_path):
    # Filter files with only .mid or .midi extensions
    json_files = [os.path.join(dirpath, file) for file in filenames if file.lower().endswith(".json")]
    tokens += json_files

In [45]:
tokens[0]

'data_remi_norm/train/27a4db205dd446f70f32abc7e6dbcfcd.json'

In [33]:
len(tokens)

23195

In [50]:
with open(tokens[0]) as f:
    data = json.load(f)
len(data['ids'][0])

12047

In [34]:
tokenizer_bpe.learn_bpe(
    vocab_size=2000,
    tokens_paths=tokens,
)

Loading token files: 100%|██████████| 23195/23195 [00:16<00:00, 1440.15it/s]







In [35]:
pickle.dump(tokenizer_bpe, open("tokenizer_bpe.p", "wb"))

In [6]:
tokenizer_bpe = pickle.load(open("tokenizer_bpe.p", "rb"))

In [36]:
tokenizer_bpe.apply_bpe_to_dataset("data_remi_norm", "data_remi_norm_bpe")

Applying BPE to dataset: 100%|██████████| 23195/23195 [02:51<00:00, 135.43it/s]


In [11]:
tokens = list()
val_path = "data_remi_norm_aug/train"
for (dirpath, dirnames, filenames) in os.walk(val_path):
    # Filter files with only .json
    tokens += [os.path.join(dirpath, file) for file in filenames if file.endswith(".json")]

toks = []
for f in tqdm(tokens[:4000]):
    a = json.load(open(f))
    toks.extend(a)

100%|██████████| 4000/4000 [00:01<00:00, 2126.62it/s]


In [22]:
import os
import shutil

directory_path = 'data_remi_norm_aug/train'

# Get a list of all items (files and directories) in the given directory
all_items = os.listdir(directory_path)

cc = 0
cc2 = 0
# Iterate through each item
for item in all_items:
    # Form the full path to the item
    full_path = os.path.join(directory_path, item)

    # Check if the item is a directory
    if os.path.isdir(full_path):
        print(f"Deleting directory: {item}")
        shutil.rmtree(full_path)
        cc +=1
    else:
        cc2 += 1

print(cc2)

Deleting directory: chpn-p21_format0.json
Deleting directory: ty_februar_format0.json
Deleting directory: Granada - Fandango.json
Deleting directory: mendel_op53_5_format0.json
Deleting directory: 27a4db205dd446f70f32abc7e6dbcfcd.json
Deleting directory: 710d458a47a372c94452901eab2bad65.json
Deleting directory: 00d7e300d5f6dd658a6f7d3dcb3eb030.json
Deleting directory: 49527bdc3939b863a90e131b83d95a59.json
Deleting directory: 6b8acedd1051b55ed172b8303ef6bdcc.json
Deleting directory: 66c2e241e6492dba1899a1db2d6c2832.json
Deleting directory: 3c62f56b8aa9ea4fe12db1d23dbd2939.json
Deleting directory: d4c1a0858a9764f1d03615a3ff2cd36f.json
Deleting directory: swing-low-sweet-chariot.json
Deleting directory: a1cc9028a1d88a390e78420b21384800.json
Deleting directory: 259e07013d09e3e9ce04cbcf4a4d563a.json
Deleting directory: c9151545bb8c7fac48c06553f555bc4f.json
Deleting directory: d3af80309f156e469c5f041a5c90c97b.json
Deleting directory: 8e63841501d865be6b90b2a93e9f61b4.json
Deleting directory: 

In [23]:
tokenizer_bpe.apply_bpe_to_dataset("data_remi_norm_aug/train", "data_remi_norm_bpe_aug/train")

Applying BPE to dataset: 100%|██████████| 136653/136653 [18:48<00:00, 121.14it/s]


In [24]:
tokens_bpe = list()
filez_path = "data_remi_norm_bpe_aug"
for (dirpath, dirnames, filenames) in os.walk(filez_path):
    # Filter files with only .mid or .midi extensions
    json_files = [os.path.join(dirpath, file) for file in filenames if file.lower().endswith(".json")]
    tokens_bpe += json_files

In [26]:
tokens_bpe[0:10]

['data_remi_norm_bpe_aug/train/27a4db205dd446f70f32abc7e6dbcfcd§p12.json',
 'data_remi_norm_bpe_aug/train/27a4db205dd446f70f32abc7e6dbcfcd§p-12.json',
 'data_remi_norm_bpe_aug/train/27a4db205dd446f70f32abc7e6dbcfcd§v1.json',
 'data_remi_norm_bpe_aug/train/27a4db205dd446f70f32abc7e6dbcfcd§v2.json',
 'data_remi_norm_bpe_aug/train/27a4db205dd446f70f32abc7e6dbcfcd§v-2.json',
 'data_remi_norm_bpe_aug/train/27a4db205dd446f70f32abc7e6dbcfcd§v-1.json',
 'data_remi_norm_bpe_aug/train/710d458a47a372c94452901eab2bad65§p12.json',
 'data_remi_norm_bpe_aug/train/710d458a47a372c94452901eab2bad65§p24.json',
 'data_remi_norm_bpe_aug/train/710d458a47a372c94452901eab2bad65§p-12.json',
 'data_remi_norm_bpe_aug/train/710d458a47a372c94452901eab2bad65§v1.json']

In [52]:
ratios = []
for i in range(100):
    with open(tokens_bpe[i]) as f:
        data_bpe = json.load(f)
    with open(tokens[i]) as f:
        data = json.load(f)
    ratio = len(data_bpe['ids'][0])/len(data['ids'][0])
    ratios.append(ratio)
print(f"Average compression: {sum(ratios)/len(ratios)}")

Average compression: 0.5202478647877393


In [49]:
len(data['ids'][0])

7062