# combining classicalmidi and musicnet datasets

## this uses midi2abc "sudo apt install abcmidi" for abc2midi and midi2abc programs we call

In [1]:
import os
import csv
import pandas as pd
import numpy as np
from tqdm import tqdm
import pretty_midi
import IPython.display
import subprocess
import copy
import shutil

from helpers import list_files

In [2]:
FS = 44100

TRANSPOSE_RANGE = 3
TEMPO_RANGE = 0.15
#VELOCITY_RANGE = 5
NUM_AUGMENTS = 27

In [3]:
INPUT_DIR = './inputs'
OUTPUT_PATH = os.path.join(INPUT_DIR, 'final/')
input_folders = ['classicalmidi', 'musicnet_midis'] #[folder for folder in os.listdir(INPUT_DIR) if '_og' not in folder or 'final' not in folder]
if False:
    for folder in input_folders:
            #path_to_folder = os.getcwd()
            path_to_folder = os.path.join(INPUT_DIR, folder)
            print(path_to_folder)
            list_files(path_to_folder, 2) # 2 files listed per dir

In [4]:
#OUTPUT_PATH = os.path.join(INPUT_DIR, 'final') # bad name

def midi_data_extraction(file_path, output_path):
    """
    Takes in a path to a folder of midi files. It goes through each midi, loads it, saves it as abc, and calls
    another function to handle augmentations.
    """
    if os.path.isdir(file_path):
        midi_files = [fn for fn in os.listdir(file_path) if fn.lower().endswith('.mid')]
        for file_name in tqdm(midi_files, desc=file_path):

            full_path = os.path.join(file_path, file_name)
            try:
                midi_data = pretty_midi.PrettyMIDI(full_path)
            except:
                print(f"FAILURE LOADING: {full_path}")
                continue

            
            #handle_augments(midi_data, output_path, file_name[:-4])            
            
            abc_name = file_name[:-4] + '.abc' # change extension
            abc_path = os.path.join(output_path, abc_name)
            midi_to_abc(full_path, abc_path)
            abcSetTitle(abc_path, f"{os.path.basename(file_path)} {file_name[:-4]}")

            handleTransposes(abc_path, output_path)
            #print(f"{full_path} loaded, sent to output path {output_path} with name {file_name[:-4] + '.abc'}")


def handleTransposes(abc_path, output_path, abc_name_no_extension = None):
    if not abc_name_no_extension:
        _, abc_name_no_extension = os.path.split(abc_path)

    for trans in range(-TRANSPOSE_RANGE, TRANSPOSE_RANGE + 1):
        if trans == 0:
            continue
        transposeABC(abc_path, trans)

        
def handle_augments(pm, output_path, abc_name_no_extension):
    temp_midi_path = os.path.join(output_path, f'{abc_name_no_extension}_temp.mid') # we need to save our pretty_midi to send to the external program
    
    for index in range(NUM_AUGMENTS):
        trans = np.random.randint(-TRANSPOSE_RANGE, TRANSPOSE_RANGE)
        tempo = np.random.uniform(1 - TEMPO_RANGE, 1 + TEMPO_RANGE)
        
        modded_midi = augment_data(pm, transpose_amount = trans, tempo_amount = tempo)        
        new_output_path = os.path.join(output_path, f'{abc_name_no_extension}_m{index}.abc')

        modded_midi.write(temp_midi_path) # prepare for external program
        midi_to_abc(temp_midi_path, new_output_path)
        
    try:
        os.remove(temp_midi_path)
        #print(f"File '{temp_midi_path}' deleted successfully.")
    except FileNotFoundError:
        print(f"Error: File '{temp_midi_path}' not found.")
    except Exception as e:
        print(f"An error occurred removing temp file: {e}")

In [5]:
def midi_to_abc(midi_path, abc_path):
    try:
        subprocess.run(["midi2abc", midi_path, "-NCOM", "-o", abc_path], check=True) # -NCOM = no comments
        #print(f"[OK] Saved ABC to {abc_path}")
    except subprocess.CalledProcessError as e:
        print(f"[ERROR] MIDI to ABC conversion failed: {e}")

def abcToMidi(abc_path, midi_path): # for testing
    try:
        subprocess.run(["abc2midi", abc_path, "-o", midi_path], check=True)
        #print(f"[OK] Saved MIDI to {midi_path}")
    except subprocess.CalledProcessError as e:
        print(f"[ERROR] ABC to MIDI conversion failed: {e}")

let's set up a cell to test midi data extraction

that's working, let's start converting everything into one folder

In [6]:
def transposeMidi(midi_data, shift):
    transposed = copy.deepcopy(midi_data) #pretty_midi.PrettyMIDI()

    for ks in transposed.key_signature_changes:
        new_key = ks.key_number
        is_major = new_key in range(0,12)
        if is_major:
            new_key = (new_key + shift) % 12
        else:
            new_key = (new_key + shift) % 12 + 12
        #print(f"changing old key {ks.key_number} by shift {shift} to {new_key}")
        ks.key_number = new_key
        
    for inst in transposed.instruments:

        # Don't transpose drum tracks
        if not inst.is_drum:
            for note in inst.notes:
                note.pitch += shift

    return transposed

def transposeABC(abc_path, shift, new_name = None):
    directory, filename = os.path.split(abc_path)
    if not new_name:
        new_name = filename[:-4] + f"_t{shift}.abc"
    new_abc_path = os.path.join(directory, new_name)
    shutil.copyfile(abc_path, new_abc_path)
    #print("fart", abc_path, new_abc_path)
    try:
        proc = subprocess.run(["abc2abc", abc_path, "-t", str(shift)], check=True, encoding='utf-8', stdout=subprocess.PIPE)
        with open(new_abc_path, "w") as fout:
            fout.writelines(proc.stdout)
        #print(f"[OK] Transposed ABC {shift} semitones to {new_abc_path}")
    except subprocess.CalledProcessError as e:
        print(f"[ERROR] ABC to MIDI conversion failed: {e}")
        
def tempoShiftMidi(midi_data, tempo_factor = 1.01):
    new_midi = copy.deepcopy(midi_data)
    
    # Scale note timings
    for instrument in new_midi.instruments:
        for note in instrument.notes:
            note.start *= tempo_factor
            note.end *= tempo_factor

    # Scale time signature changes
    for ts in new_midi.time_signature_changes:
        ts.time *= tempo_factor

    return new_midi

In [7]:
def augment_data(pm, transpose_amount = 0, tempo_amount = 1.0):
    if transpose_amount == 0 and tempo_amount == 1.0:
        return pm
        
    modded = copy.deepcopy(pm)

    if transpose_amount:
        modded = transposeMidi(modded, transpose_amount)

    if tempo_amount != 1.0 and False: # not working for now, so ignore
        modded = tempoShiftMidi(modded, tempo_amount)

    return modded

In [8]:
"""
X: 1
T: from ./inputs/test_tempo.mid
M: 3/4
L: 1/8
Q:1/4=72
K:C % 0 sharps
V:1
"""
def abcSetTitle(abc_path, new_title):
    with open(abc_path, "r") as f:
        lines = f.readlines()
        i = 0
        for line in lines:
            if line.startswith("T:"): # tempo "Q: 1/4 = 60" with or without spaces
                new_line = f"T:{new_title}\n"
                lines[i] = new_line
            i += 1
    #print(lines)
    with open(abc_path, "w") as f:
        f.writelines(lines)
def abcAugmentTest(abc_path, tempo_change_bpm):
    with open(abc_path, "r") as f:
        lines = f.readlines()
        for line in lines:
            if line.startswith("Q:"): # tempo "Q: 1/4 = 60" with or without spaces
                parts = line.split("=")
                old_tempo = parts[-1]
                new_tempo = old_tempo + tempo_change_bpm
                ...
#abcSetTitle("test_raw_t5.abc", "NEW TITLE HERE")

## this does the main logic

In [None]:
test_filename = "test_deb_menu.mid"
test_path = os.path.join(INPUT_DIR, test_filename)
midi_to_abc(test_path, "test_raw.abc")

test_file = pretty_midi.PrettyMIDI(test_path)

transposeABC("test_raw.abc", 5, "test_trans.abc")
abcToMidi("test_raw.abc", "test_trans.mid")
trans_file = pretty_midi.PrettyMIDI("test_trans.mid")
trans_synth = trans_file.synthesize()
#IPython.display.Audio(trans_synth, rate=FS)


"""
trans_test = transposeMidi(test_file, 8)
trans_test_path = os.path.join(INPUT_DIR, "test_trans.mid")
trans_test.write(trans_test_path)
midi_to_abc(trans_test_path, "test_trans.abc")

# not really working, at the moment
tempo_test = tempoShiftMidi(test_file, 1.10)
tempo_test_path = os.path.join(INPUT_DIR, "test_tempo.mid")
tempo_test.write(tempo_test_path)
midi_to_abc(tempo_test_path, "test_tempo.abc")

final_test = abcToMidi("test_trans.abc", "test_trans.mid")
pm_final_test = pretty_midi.PrettyMIDI("test_trans.mid")

synth = test_file.synthesize()
IPython.display.Audio(synth, rate=FS)

trans_synth = pm_final_test.synthesize()
IPython.display.Audio(trans_synth, rate=FS)
"""


In [9]:
def convertAllToABC():
    for input_folder in input_folders:
        composers_path = os.path.join(INPUT_DIR, input_folder)
        
        for composer_folder in os.listdir(composers_path):
            input_path = os.path.join(composers_path, composer_folder)
            if not os.path.isdir(input_path):
                continue
                
            folder_name = composer_folder.capitalize()
            folder_output_path = os.path.join(OUTPUT_PATH, folder_name)
            os.makedirs(folder_output_path, exist_ok=True)
            
            midi_data_extraction(input_path, folder_output_path)

convertAllToABC()

./inputs/classicalmidi/balakir: 100%|█████████████| 1/1 [00:00<00:00, 10.43it/s]
./inputs/classicalmidi/tschai: 100%|████████████| 12/12 [00:00<00:00, 41.30it/s]
./inputs/classicalmidi/grieg: 100%|█████████████| 16/16 [00:00<00:00, 48.22it/s]
./inputs/classicalmidi/mendelssohn: 100%|███████| 15/15 [00:00<00:00, 50.76it/s]
./inputs/classicalmidi/granados: 100%|████████████| 3/3 [00:00<00:00, 31.61it/s]
./inputs/classicalmidi/haydn: 100%|█████████████| 21/21 [00:00<00:00, 44.61it/s]
./inputs/classicalmidi/beeth: 100%|█████████████| 29/29 [00:01<00:00, 21.02it/s]
./inputs/classicalmidi/mozart: 100%|████████████| 21/21 [00:00<00:00, 24.05it/s]
./inputs/classicalmidi/schumann: 100%|██████████| 24/24 [00:00<00:00, 53.08it/s]
./inputs/classicalmidi/burgm: 100%|███████████████| 9/9 [00:00<00:00, 55.27it/s]
./inputs/classicalmidi/brahms: 100%|████████████| 10/10 [00:00<00:00, 30.47it/s]
./inputs/classicalmidi/liszt: 100%|█████████████| 16/16 [00:01<00:00, 15.09it/s]
./inputs/classicalmidi/schub

Error: Time=9600 Track=3 Note terminated when not on - pitch 40
Error: Time=10080 Track=3 Note terminated when not on - pitch 40


./inputs/classicalmidi/chopin: 100%|████████████| 48/48 [00:01<00:00, 39.31it/s]
./inputs/classicalmidi/debussy: 100%|█████████████| 9/9 [00:00<00:00, 46.82it/s]
./inputs/classicalmidi/muss: 100%|████████████████| 8/8 [00:00<00:00, 31.26it/s]
./inputs/classicalmidi/bach: 100%|████████████████| 3/3 [00:00<00:00, 50.55it/s]
./inputs/musicnet_midis/Faure:  75%|██████████▌   | 3/4 [00:00<00:00, 10.54it/s]

Error: Time=22048 Track=4 Note terminated when not on - pitch 63
Error: Time=22048 Track=4 Note terminated when not on - pitch 74
Error: Time=22048 Track=4 Note terminated when not on - pitch 75
Error in MIDI file - notes still on at end of track!
Error in MIDI file - notes still on at end of track!
Error in MIDI file - notes still on at end of track!
Error in MIDI file - notes still on at end of track!
Error in MIDI file - notes still on at end of track!
Error in MIDI file - notes still on at end of track!
Error in MIDI file - notes still on at end of track!
Error in MIDI file - notes still on at end of track!
Error: Time=125662 Track=5 Note terminated when not on - pitch 55
Error: Time=125662 Track=5 Note terminated when not on - pitch 67
Error in MIDI file - notes still on at end of track!
Error in MIDI file - notes still on at end of track!
Error in MIDI file - notes still on at end of track!
Error in MIDI file - notes still on at end of track!
Error in MIDI file - notes still on a

./inputs/musicnet_midis/Faure: 100%|██████████████| 4/4 [00:00<00:00,  8.54it/s]


Error: Time=50464 Track=4 Note terminated when not on - pitch 63
Error in MIDI file - notes still on at end of track!
Error in MIDI file - notes still on at end of track!
Error in MIDI file - notes still on at end of track!
Error in MIDI file - notes still on at end of track!
Error in MIDI file - notes still on at end of track!
Error in MIDI file - notes still on at end of track!
Error in MIDI file - notes still on at end of track!
Error in MIDI file - notes still on at end of track!




FAILURE LOADING: ./inputs/musicnet_midis/Bach/2211_fugue12.mid
FAILURE LOADING: ./inputs/musicnet_midis/Bach/2305_prelude14.mid
FAILURE LOADING: ./inputs/musicnet_midis/Bach/2194_prelude13.mid
FAILURE LOADING: ./inputs/musicnet_midis/Bach/2227_fugue6.mid


./inputs/musicnet_midis/Bach:  84%|██████████▊  | 56/67 [00:00<00:00, 76.37it/s]

FAILURE LOADING: ./inputs/musicnet_midis/Bach/2230_prelude20.mid
FAILURE LOADING: ./inputs/musicnet_midis/Bach/2292_prelude19.mid


./inputs/musicnet_midis/Bach: 100%|█████████████| 67/67 [00:00<00:00, 78.54it/s]


FAILURE LOADING: ./inputs/musicnet_midis/Bach/2310_prelude15.mid
Error: Time=5763 Track=2 Note terminated when not on - pitch 53


./inputs/musicnet_midis/Dvorak: 100%|█████████████| 8/8 [00:00<00:00, 21.32it/s]
./inputs/musicnet_midis/Cambini: 100%|████████████| 9/9 [00:00<00:00, 33.03it/s]
./inputs/musicnet_midis/Haydn: 100%|██████████████| 3/3 [00:00<00:00, 31.86it/s]
./inputs/musicnet_midis/Brahms: 100%|███████████| 24/24 [00:01<00:00, 16.40it/s]


Error: Time=444864 Track=3 Note terminated when not on - pitch 62
Error: Time=444992 Track=3 Note terminated when not on - pitch 62


./inputs/musicnet_midis/Mozart:  46%|█████      | 11/24 [00:00<00:00, 30.41it/s]

Error in MIDI file - notes still on at end of track!
Error in MIDI file - notes still on at end of track!
Error in MIDI file - notes still on at end of track!
Error in MIDI file - notes still on at end of track!
Error in MIDI file - notes still on at end of track!
Error in MIDI file - notes still on at end of track!


./inputs/musicnet_midis/Mozart: 100%|███████████| 24/24 [00:00<00:00, 28.08it/s]
Advancing by 0 in printtrack!


Error: Time=59650 Track=2 Note terminated when not on - pitch 60
Error in MIDI file - notes still on at end of track!
Error in MIDI file - notes still on at end of track!
Error: Time=51394 Track=4 Note terminated when not on - pitch 41
Error in MIDI file - notes still on at end of track!
Error in MIDI file - notes still on at end of track!
Error in MIDI file - notes still on at end of track!
Error in MIDI file - notes still on at end of track!
Error in MIDI file - notes still on at end of track!
Error in MIDI file - notes still on at end of track!
[ERROR] MIDI to ABC conversion failed: Command '['midi2abc', './inputs/musicnet_midis/Ravel/2179_gr_rqtf3.mid', '-NCOM', '-o', './inputs/final/Ravel/2179_gr_rqtf3.abc']' returned non-zero exit status 1.


./inputs/musicnet_midis/Ravel: 100%|██████████████| 4/4 [00:00<00:00,  9.60it/s]


Error: Time=93120 Track=4 Note terminated when not on - pitch 41
Error in MIDI file - notes still on at end of track!
Error in MIDI file - notes still on at end of track!
Error in MIDI file - notes still on at end of track!
Error in MIDI file - notes still on at end of track!
Error in MIDI file - notes still on at end of track!
Error in MIDI file - notes still on at end of track!




Error: Time=93452 Track=1 Note terminated when not on - pitch 77
Error: Time=117056 Track=1 Note terminated when not on - pitch 77


./inputs/musicnet_midis/Beethoven: 100%|██████| 157/157 [00:06<00:00, 24.59it/s]
./inputs/musicnet_midis/Schubert: 100%|█████████| 30/30 [00:01<00:00, 17.08it/s]


In [10]:
def prepareForGPT():
    with open("all_abcs.txt", "w") as f:
        f.write("<|startoftext|>\n")
        for final_folder in os.listdir(OUTPUT_PATH):
            input_path = os.path.join(OUTPUT_PATH, final_folder)
            if not os.path.isdir(input_path):
                continue
                
            abc_files = [fn for fn in os.listdir(input_path) if fn.lower().endswith('.abc')]
            for file_name in tqdm(abc_files, desc=f"Processing {final_folder:12}"):
        
                full_path = os.path.join(input_path, file_name)   
                with open(full_path, "r") as abc:
                    f.write('<|startofpiece|>\n')
                    f.writelines(abc.readlines())
                    f.write('<|endofpiece|>\n')
            f.write("<|endoftext|>")
prepareForGPT()

Processing Debussy     : 100%|███████████████| 63/63 [00:00<00:00, 32968.33it/s]
Processing Tschai      : 100%|███████████████| 84/84 [00:00<00:00, 30881.02it/s]
Processing Mendelssohn : 100%|█████████████| 105/105 [00:00<00:00, 38939.16it/s]
Processing Chopin      : 100%|█████████████| 336/336 [00:00<00:00, 33591.22it/s]
Processing Faure       : 100%|████████████████| 28/28 [00:00<00:00, 7516.67it/s]
Processing Liszt       : 100%|█████████████| 112/112 [00:00<00:00, 13585.18it/s]
Processing Balakir     : 100%|██████████████████| 7/7 [00:00<00:00, 8192.00it/s]
Processing Burgm       : 100%|███████████████| 63/63 [00:00<00:00, 39562.98it/s]
Processing Beeth       : 100%|█████████████| 203/203 [00:00<00:00, 18459.88it/s]
Processing Borodin     : 100%|███████████████| 49/49 [00:00<00:00, 36621.68it/s]
Processing Bach        : 100%|█████████████| 441/441 [00:00<00:00, 40472.80it/s]
Processing Albeniz     : 100%|███████████████| 98/98 [00:00<00:00, 28780.41it/s]
Processing Dvorak      : 100