In [None]:
!pip install -U joblib
!pip install -U ipywidgets
!pip install -U tqdm

In [None]:
from midi_tokenizer import MIDITokenizer

import MIDI 
# Note: this is a modified MIDI.py. The original is slow because list.pop(0) performs poorly on some Python versions or machines, so it was changed to list = list[1:].

import os
import glob
import re
from tqdm import tqdm
from pathlib import Path
import shutil
import random

from joblib import Parallel, delayed, parallel_config

In [None]:
# Tokenizer instance used by process_midi_file() to tokenize each score and
# to run the quality check on the resulting tokens.
tokenizer = MIDITokenizer()

#======================================================================================

# True  -> files are moved out of the source dataset (shutil.move).
# False -> files are copied and the source dataset is left untouched (shutil.copy2).
move_files = False # Move processed and bad MIDIs or only copy them

#======================================================================================

def process_midi_file(midi_file):
    """Classify one MIDI file and copy/move it into the matching output tree.

    The file is read as bytes and sorted into one of two destinations, keeping
    its path relative to ``dataset_dir``:

    * ``processed_dir/<relative path>`` when the quality check passes;
    * ``rm_dir/<reasons>/<relative path>`` otherwise, where ``<reasons>`` is the
      list of rejection reasons joined with ``_`` (``large``, ``small``, ``unk``
      or whatever ``tokenizer.check_quality`` reports).

    Relies on the notebook-level names ``dataset_dir``, ``processed_dir``,
    ``rm_dir``, ``move_files``, ``tokenizer`` and ``MIDI``.

    Args:
        midi_file: Path (str) of the MIDI file to process.

    Returns:
        None. If the file cannot be read, its path is printed and nothing is
        copied or moved.
    """
    try:
        with open(midi_file, 'rb') as f:
            datas = f.read()
    except Exception:
        # Unreadable file: report it and leave it where it is.
        print(midi_file)
        return

    quality = False
    if len(datas) > 384000:  # file too large that can not load by `MIDI.midi2score`
        res = ["large"]
    elif len(datas) < 3000:
        res = ["small"]
    else:
        try:
            mid = MIDI.midi2score(datas)
            mid = tokenizer.tokenize(mid)
            quality, res = tokenizer.check_quality(mid)
        except Exception:
            quality = False
            res = ["unk"] # broken midi file or Exception in the code

    # Compute the path relative to the dataset root. A plain str.replace() would
    # rewrite *every* occurrence of dataset_dir inside the path (e.g. "data" in
    # "data/metadata/x.mid"), producing a mangled destination.
    rel_path = os.path.relpath(midi_file, dataset_dir)
    if rel_path == os.pardir or rel_path.startswith(os.pardir + os.sep):
        # File is not located under dataset_dir: never write outside the
        # output directories, fall back to the bare file name.
        rel_path = os.path.basename(midi_file)

    if quality:
        path = Path(processed_dir) / rel_path
    else:
        # An empty reason list joins to "", which pathlib drops from the path.
        path = Path(rm_dir) / "_".join(res) / rel_path
    path.parent.mkdir(parents=True, exist_ok=True)

    if move_files:
        shutil.move(midi_file, path)
    else:
        shutil.copy2(midi_file, path)

In [None]:
dataset_dir = r"/home/ubuntu/SOURCE/MIDIs" # Source MIDI dataset directory (searched recursively)
processed_dir = r"/home/ubuntu/OUTPUT/processed_midis" # MIDIs that pass the quality check are copied here (moved if move_files is True). The folder will be created automatically.
rm_dir = r"/home/ubuntu/OUTPUT/bad_midis" # Rejected MIDIs are copied here (moved if move_files is True), in one subfolder per rejection reason. The folder will be created automatically.

In [None]:
# Collect every path under dataset_dir whose name ends in .mid / .midi
# (case-insensitive), in sorted order.
midi_files = sorted(
    filter(
        lambda candidate: re.search(r'\.midi?$', candidate, re.IGNORECASE),
        glob.glob(f"{dataset_dir}/**/*", recursive=True),
    )
)

In [None]:
# check if dataset_dir is correct: show how many MIDI files were found and one
# random example path
print(len(midi_files))
if midi_files:
    print(random.choice(midi_files))
else:
    # random.choice() raises IndexError on an empty list, which would hide the
    # real problem (wrong or empty dataset_dir) behind an unrelated traceback.
    print(f"No MIDI files found under {dataset_dir!r}")

In [None]:
# start processing...
# The file list is handled in fixed-size batches; each batch is dispatched to a
# pool of loky worker processes and the progress bar advances once per batch.

NUMBER_OF_PARALLEL_JOBS = 128 # Number of parallel jobs
NUMBER_OF_FILES_PER_ITERATION = 256 # Number of files to queue for each parallel iteration

print('=' * 70, 'Processing MIDI files. Please wait...', '=' * 70, sep='\n')

for batch_start in tqdm(range(0, len(midi_files), NUMBER_OF_FILES_PER_ITERATION)):

    batch = midi_files[batch_start:batch_start + NUMBER_OF_FILES_PER_ITERATION]

    with parallel_config(n_jobs=NUMBER_OF_PARALLEL_JOBS, verbose=0):

        run_batch = Parallel(backend='loky', n_jobs=NUMBER_OF_PARALLEL_JOBS, verbose=0)
        run_batch(delayed(process_midi_file)(midi_path) for midi_path in batch)

print('=' * 70, 'Done!', '=' * 70, sep='\n')

In [None]:
# Calculate the number of processed and bad MIDIs

def _list_files(root_dir):
    """Return the paths of all files found recursively under ``root_dir``.

    Every file is included regardless of its extension. A directory that does
    not exist yields an empty list, because ``os.walk`` produces nothing for it.
    """
    return [
        os.path.join(dirpath, file)
        for dirpath, _dirnames, filenames in os.walk(root_dir)
        for file in filenames
    ]

print('=' * 70)
print('Scanning processed MIDIs dir...')

processed_midis = _list_files(processed_dir)

print('=' * 70)
print('Scanning bad MIDIs dir...')

bad_midis = _list_files(rm_dir)

print('=' * 70)
print('Number of good MIDIs:', len(processed_midis))
print('Number of bad MIDIs:', len(bad_midis))
print('=' * 70)