# combining classicalmidi and musicnet datasets

In [1]:
import os
import csv
import pandas as pd
import numpy as np
from tqdm import tqdm
import pretty_midi
import IPython.display

from helpers import list_files

In [2]:
# Sample rate in Hz -- presumably meant for audio synthesis/playback; no cell
# visible here reads it (TODO confirm before removing).
FS = 44100

# Augmentation settings consumed by handle_transposes.
# Half-width, in semitones, of the random pitch-shift window.
TRANSPOSE_RANGE = 3
# Half-width of the random time-scaling window centred on 1.0.
TEMPO_RANGE = 0.15
# Half-width of the random velocity-offset window.
VELOCITY_RANGE = 5
# Number of augmented CSV variants written for every source file.
NUM_AUGMENTS = 27

In [3]:
# Root directory that holds one sub-folder per source dataset.
INPUT_DIR = './inputs'

# Datasets to merge. They are listed explicitly rather than discovered with
# os.listdir so that '_og' backups and the generated 'final' folder are never
# picked up as inputs.
input_folders = ['classicalmidi', 'musicnet_midis']

# Show a short preview (2 files per directory) of each dataset's layout.
for folder in input_folders:
    path_to_folder = os.path.join(INPUT_DIR, folder)
    print(path_to_folder)
    list_files(path_to_folder, 2)

./inputs/classicalmidi
classicalmidi/
    balakir/
        islamei.mid
    tschai/
        ty_april.mid
        ty_dezember.mid
    grieg/
        grieg_march.mid
        grieg_wanderer.mid
    mendelssohn/
        mendel_op19_1.mid
        mendel_op19_6.mid
    granados/
        gra_esp_3.mid
        gra_esp_2.mid
    haydn/
        hay_40_2.mid
        haydn_8_4.mid
    beeth/
        mond_2.mid
        pathetique_2.mid
    mozart/
        mz_332_1.mid
        mz_570_1.mid
    schumann/
        scn16_7.mid
        scn15_4.mid
    burgm/
        burg_agitato.mid
        burg_perlen.mid
    brahms/
        br_rhap.mid
        br_im2.mid
    liszt/
        liz_et_trans4.mid
        liz_rhap12.mid
    schubert/
        schu_143_2.mid
        schubert_D935_4.mid
    albeniz/
        alb_se5.mid
        alb_se1.mid
    borodin/
        bor_ps6.mid
        bor_ps1.mid
    chopin/
        chpn-p24.mid
        chp_op18.mid
    debussy/
        deb_prel.mid
        deb_menu.mid
    muss/
     

In [1]:
# Not referenced by any code visible in this notebook -- presumably a leftover
# accumulator (TODO confirm before removing).
entire_data = []
# Collects (track name, program number, General MIDI instrument name) tuples
# for every instrument that midi_data_extraction reads notes from.
instrument_map = set()
# Root folder for the generated CSVs; the driver loop creates one sub-folder
# per composer underneath it.
OUTPUT_PATH = os.path.join(INPUT_DIR, 'final')

def midi_data_extraction(file_path, output_path):
    """
    Convert every '.mid' file in a folder to a note-level CSV plus augmented variants.

    For each MIDI file one row per note is written to '<stem>.csv' inside
    `output_path` with the columns start_time, end_time, note_value, velocity
    and instrument_program_number. The same table is then passed to
    `handle_transposes`, which writes the augmented copies.

    Files that cannot be parsed are reported and skipped. Previously the loop
    carried on after a failed load and re-used the `midi_data` of the previous
    file (or crashed with a NameError on the first file).

    Args:
        file_path: Directory to scan (not recursive). Anything that is not a
            directory is silently ignored.
        output_path: Existing directory that receives the CSV files.

    Side effects:
        Adds (track name, program, General MIDI name) tuples to the
        module-level `instrument_map` set.
    """
    # float16 looked too coarse for note timings; float64 (the default) is
    # larger but may be worth switching to.
    FLOAT_TYPE = np.float32
    cols = ['start_time', 'end_time', 'note_value', 'velocity', 'instrument_program_number']

    if not os.path.isdir(file_path):
        return

    midi_files = [fn for fn in os.listdir(file_path) if fn.endswith('.mid')]
    for file_name in tqdm(midi_files, desc=file_path):
        full_path = os.path.join(file_path, file_name)
        try:
            midi_data = pretty_midi.PrettyMIDI(full_path)
        except Exception:
            # The parser can fail in many different ways on malformed files;
            # report the file and move on instead of processing stale data.
            print(f"FAILURE LOADING: {full_path}")
            continue

        # Extract data for each note
        rows = []
        for instrument in midi_data.instruments:
            program = instrument.program
            if instrument.notes:
                instrument_map.add(
                    (instrument.name, program, pretty_midi.program_to_instrument_name(program))
                )
            for note in instrument.notes:
                rows.append([
                    # Calling the dtype works for both Python floats and numpy
                    # scalars, unlike `.astype`.
                    FLOAT_TYPE(note.start),
                    FLOAT_TYPE(note.end),
                    note.pitch,
                    note.velocity,
                    program,
                ])

        df = pd.DataFrame(rows, columns=cols)

        stem = os.path.splitext(file_name)[0]
        df.to_csv(os.path.join(output_path, stem + '.csv'), index=None)  # the original

        handle_transposes(df, output_path, stem)

def handle_transposes(df, output_path, csv_name):
    """
    Write NUM_AUGMENTS randomly augmented variants of a note table.

    Every variant gets an independent random transposition (whole semitones in
    [-TRANSPOSE_RANGE, TRANSPOSE_RANGE]), time-scaling factor (uniform in
    [1 - TEMPO_RANGE, 1 + TEMPO_RANGE)) and velocity offset (whole steps in
    [-VELOCITY_RANGE, VELOCITY_RANGE]) and is saved as
    '<csv_name>_m<index>.csv' inside `output_path`.

    Args:
        df: Note table of the original piece; it is left untouched.
        output_path: Existing directory that receives the CSV files.
        csv_name: File stem of the original piece (no extension).
    """
    for index in range(NUM_AUGMENTS):
        # np.random.randint excludes its upper bound, hence the +1 to keep the
        # window symmetric around zero.
        trans = np.random.randint(-TRANSPOSE_RANGE, TRANSPOSE_RANGE + 1)
        tempo = np.random.uniform(1 - TEMPO_RANGE, 1 + TEMPO_RANGE)
        velo = np.random.randint(-VELOCITY_RANGE, VELOCITY_RANGE + 1)

        # Always start from a fresh copy so each variant is derived from the
        # original notes and not from the previous variant, whether or not
        # augment_data modifies the frame it receives.
        modded_df = augment_data(
            df.copy(),
            transpose_amount=trans,
            tempo_amount=tempo,
            velocity_amount=velo,
        )

        new_output_path = os.path.join(output_path, f'{csv_name}_m{index}.csv')
        modded_df.to_csv(new_output_path, index=None)

NameError: name 'os' is not defined

Let's set up a cell to test MIDI data extraction.

That's working, so let's start converting everything into one folder.

In [6]:
# Walk every dataset, and inside it every composer directory, converting the
# MIDI files into CSVs under OUTPUT_PATH/<Composer>/.
for input_folder in input_folders:
    composer_path = os.path.join(INPUT_DIR, input_folder)

    for composer_folder in os.listdir(composer_path):
        input_path = os.path.join(composer_path, composer_folder)

        # Only directories hold MIDI files; stray files are ignored.
        if os.path.isdir(input_path):
            # Capitalising the name lets both datasets share an output folder
            # when they spell a composer the same way (e.g. 'mozart'/'Mozart').
            folder_name = composer_folder.capitalize()
            folder_output_path = os.path.join(OUTPUT_PATH, folder_name)
            os.makedirs(folder_output_path, exist_ok=True)

            midi_data_extraction(input_path, folder_output_path)

./inputs/classicalmidi/balakir: 100%|█████████████| 1/1 [00:00<00:00,  3.59it/s]
./inputs/classicalmidi/tschai: 100%|████████████| 12/12 [00:00<00:00, 15.27it/s]
./inputs/classicalmidi/grieg: 100%|█████████████| 16/16 [00:00<00:00, 18.58it/s]
./inputs/classicalmidi/mendelssohn: 100%|███████| 15/15 [00:00<00:00, 17.46it/s]
./inputs/classicalmidi/granados: 100%|████████████| 3/3 [00:00<00:00, 13.67it/s]
./inputs/classicalmidi/haydn: 100%|█████████████| 21/21 [00:01<00:00, 16.35it/s]
./inputs/classicalmidi/beeth: 100%|█████████████| 29/29 [00:04<00:00,  7.18it/s]
./inputs/classicalmidi/mozart: 100%|████████████| 21/21 [00:02<00:00,  8.31it/s]
./inputs/classicalmidi/schumann: 100%|██████████| 24/24 [00:01<00:00, 20.29it/s]
./inputs/classicalmidi/burgm: 100%|███████████████| 9/9 [00:00<00:00, 21.55it/s]
./inputs/classicalmidi/brahms: 100%|██████████████| 9/9 [00:00<00:00,  9.55it/s]
./inputs/classicalmidi/liszt: 100%|█████████████| 16/16 [00:03<00:00,  5.10it/s]
./inputs/classicalmidi/schub

FAILURE LOADING: ./inputs/musicnet_midis/Bach/2211_fugue12.mid
FAILURE LOADING: ./inputs/musicnet_midis/Bach/2305_prelude14.mid
FAILURE LOADING: ./inputs/musicnet_midis/Bach/2194_prelude13.mid




FAILURE LOADING: ./inputs/musicnet_midis/Bach/2227_fugue6.mid


./inputs/musicnet_midis/Bach:  67%|████████▋    | 45/67 [00:01<00:01, 19.52it/s]

FAILURE LOADING: ./inputs/musicnet_midis/Bach/2230_prelude20.mid


./inputs/musicnet_midis/Bach:  82%|██████████▋  | 55/67 [00:02<00:00, 25.44it/s]

FAILURE LOADING: ./inputs/musicnet_midis/Bach/2292_prelude19.mid


./inputs/musicnet_midis/Bach:  97%|████████████▌| 65/67 [00:02<00:00, 27.56it/s]

FAILURE LOADING: ./inputs/musicnet_midis/Bach/2310_prelude15.mid


./inputs/musicnet_midis/Bach: 100%|█████████████| 67/67 [00:02<00:00, 25.07it/s]
./inputs/musicnet_midis/Dvorak: 100%|█████████████| 8/8 [00:01<00:00,  7.21it/s]
./inputs/musicnet_midis/Cambini: 100%|████████████| 9/9 [00:00<00:00, 10.14it/s]
./inputs/musicnet_midis/Haydn: 100%|██████████████| 3/3 [00:00<00:00, 11.61it/s]
./inputs/musicnet_midis/Brahms: 100%|███████████| 24/24 [00:04<00:00,  5.17it/s]
./inputs/musicnet_midis/Mozart: 100%|███████████| 24/24 [00:02<00:00,  8.84it/s]
./inputs/musicnet_midis/Ravel: 100%|██████████████| 4/4 [00:01<00:00,  3.75it/s]
./inputs/musicnet_midis/Beethoven: 100%|██████| 157/157 [00:19<00:00,  7.87it/s]
./inputs/musicnet_midis/Schubert: 100%|█████████| 30/30 [00:05<00:00,  5.74it/s]


In [None]:
# CLAUDE 3.7 Sonnet program number stuff
# NOT NEEDED
import difflib

def match_program_number(instrument_name):
    """
    Resolve a (possibly non-standard) instrument name to a MIDI program number.

    The name is first looked up as-is; if pretty_midi does not recognise it,
    the closest General MIDI name chosen by `match_instrument_name` is looked
    up instead.

    Args:
        instrument_name: Instrument/track name as found in a MIDI file.

    Returns:
        The program number reported by pretty_midi.instrument_name_to_program.
    """
    try:
        # Try direct mapping first
        return pretty_midi.instrument_name_to_program(instrument_name)
    except ValueError:
        # pretty_midi signals an unknown name with ValueError; anything else
        # is a genuine error and should not be hidden by the fallback.
        standard_name = match_instrument_name(instrument_name)
        return pretty_midi.instrument_name_to_program(standard_name)

def match_instrument_name(custom_name):
    """
    Find the closest matching General MIDI instrument name.

    Args:
        custom_name: Free-form instrument/track name, e.g. 'Piano (right hand)'.

    Returns:
        A General MIDI instrument name. Piano and string names are handled by
        explicit rules, everything else by fuzzy matching; when nothing is
        close enough 'Acoustic Grand Piano' is returned.
    """
    # Remove any parenthetical additions and anything after the first period
    clean_name = custom_name.split('(')[0].strip()
    clean_name = clean_name.split('.')[0].strip()

    # Special case handling
    if 'Piano' in clean_name:
        return 'Acoustic Grand Piano'  # Default piano
    if 'Violin' in clean_name:  # also covers 'Violino'
        return 'Violin'
    if 'Viola' in clean_name:
        return 'Viola'
    # 'Violoncello' has a lower-case 'c', so it needs its own entry.
    if any(s in clean_name for s in ('Cello', 'Violoncello')):
        return 'Cello'

    # Fuzzy match against the 128 standard names. The list is only built when
    # the shortcuts above did not apply. The helper lives in pretty_midi; the
    # previous unqualified call raised NameError.
    standard_names = [pretty_midi.program_to_instrument_name(i) for i in range(128)]
    matches = difflib.get_close_matches(clean_name, standard_names, n=1)
    if matches:
        return matches[0]

    # Nothing close enough: fall back to a default instrument.
    return 'Acoustic Grand Piano'

In [None]:
# CLAUDE 3.7 Sonnet
def create_transposed_versions(midi_file, semitone_shifts=None):
    """
    Create multiple transposed versions of a MIDI file.

    Every returned object is fully independent: notes, time signatures,
    controller events (e.g. sustain pedal) and pitch bends are copied, so
    editing one version never affects another or the source data.

    Args:
        midi_file: Path to MIDI file
        semitone_shifts: Iterable of semitone shifts to apply; defaults to
            range(1, 12), i.e. the 11 upward transpositions within an octave.

    Returns:
        List of PrettyMIDI objects, one per shift, in the order of
        `semitone_shifts`. Drum tracks keep their original pitches. Notes
        whose transposed pitch would leave the valid MIDI range 0-127 are
        dropped from that version.

    Note:
        Tempo changes are not carried over: PrettyMIDI only exposes them via
        get_tempo_changes(), not as a list that can be appended to.
    """
    if semitone_shifts is None:
        semitone_shifts = range(1, 12)  # All possible transpositions

    midi_data = pretty_midi.PrettyMIDI(midi_file)
    transpositions = []

    for shift in semitone_shifts:
        transposed = pretty_midi.PrettyMIDI()

        # Copy time signature info (fresh objects, not shared references)
        for timing in midi_data.time_signature_changes:
            transposed.time_signature_changes.append(
                pretty_midi.TimeSignature(timing.numerator, timing.denominator, timing.time)
            )

        # Create transposed instruments
        for inst in midi_data.instruments:
            new_inst = pretty_midi.Instrument(
                program=inst.program,
                is_drum=inst.is_drum,
                name=inst.name,
            )

            # Drum "pitches" select the drum sound, so they are never shifted.
            pitch_offset = 0 if inst.is_drum else shift
            for note in inst.notes:
                new_pitch = note.pitch + pitch_offset
                if not 0 <= new_pitch <= 127:
                    continue  # outside the MIDI pitch range
                new_inst.notes.append(
                    pretty_midi.Note(
                        velocity=note.velocity,
                        pitch=new_pitch,
                        start=note.start,
                        end=note.end,
                    )
                )

            # Keep pedal/controller data and pitch bends with the notes.
            new_inst.control_changes = [
                pretty_midi.ControlChange(cc.number, cc.value, cc.time)
                for cc in inst.control_changes
            ]
            new_inst.pitch_bends = [
                pretty_midi.PitchBend(pb.pitch, pb.time)
                for pb in inst.pitch_bends
            ]

            transposed.instruments.append(new_inst)

        transpositions.append(transposed)

    return transpositions

In [None]:
# CLAUDE 3.7 Sonnet
def change_tempo(midi_data, tempo_factor):
    """
    Change the tempo of a MIDI file by scaling all time values in place.

    Besides the note start/end times, every other timed event that is present
    is scaled too (controller changes such as the sustain pedal, pitch bends,
    time/key signature changes, lyrics and text events), so they stay aligned
    with the notes.

    Args:
        midi_data: PrettyMIDI object; it is modified in place.
        tempo_factor: Positive factor to multiply timing by
            (0.5 = twice as fast, 2.0 = half speed)

    Returns:
        The same PrettyMIDI object that was passed in.

    Raises:
        ValueError: If `tempo_factor` is zero or negative.
    """
    if tempo_factor <= 0:
        raise ValueError(f"tempo_factor must be positive, got {tempo_factor}")

    for instrument in midi_data.instruments:
        # Scale note timings
        for note in instrument.notes:
            note.start *= tempo_factor
            note.end *= tempo_factor

        # Scale per-instrument events that carry a single timestamp
        for event_list in ('control_changes', 'pitch_bends'):
            for event in getattr(instrument, event_list, ()):
                event.time *= tempo_factor

    # Scale song-level events (time signatures and, when present, the rest)
    for event_list in ('time_signature_changes', 'key_signature_changes', 'lyrics', 'text_events'):
        for event in getattr(midi_data, event_list, ()):
            event.time *= tempo_factor

    return midi_data

# Example usage: speed a piece up, round-trip it through a temporary MIDI file
# and listen to the result.
print(os.getcwd())
midi_data = pretty_midi.PrettyMIDI('./inputs/classicalmidi/debussy/debussy_cc_1.mid')
# To hear the unmodified piece first:
#audio_data = midi_data.synthesize()
#IPython.display.Audio(audio_data, rate=44100)

# change_tempo multiplies every timestamp, so a factor below 1 speeds the piece
# up (the previous factor of 2 halved the speed instead).
faster_midi = change_tempo(midi_data, 1 / 1.2)  # 20% faster

test_faster_out_midi_name = 'faster_song_pls_delete.mid'
faster_midi.write(test_faster_out_midi_name)
faster_midi = pretty_midi.PrettyMIDI(test_faster_out_midi_name)
# Keep the waveform in its own variable instead of overwriting the MIDI object.
faster_audio = faster_midi.synthesize()
IPython.display.Audio(faster_audio, rate=44100)

In [5]:
# Retired draft kept for reference only: the triple-quoted string below is a
# bare expression, so none of it runs. Note that the draft reads and writes a
# name `df` although its parameter is called `midi_df`.
"""
def augment_data_randomly(midi_df):
    TRANSPOSE_RANGE = 3
    TEMPO_RANGE = 0.15
    VELOCITY_RANGE = 5

    transpose = np.random.randint(-TRANSPOSE_RANGE, TRANSPOSE_RANGE)
    tempo = np.random.uniform(1 - TEMPO_RANGE, 1 + TEMPO_RANGE)
    velocity = np.random.randint(-VELOCITY_RANGE, VELOCITY_RANGE)
    #print(f"trans: {transpose}, tempo: {tempo}")

    rand_trans = np.random.random()
    rand_tempo = np.random.random()
    rand_veloc = np.random.random()

    if rand_trans > 0.25:
        print(f"HRT time: transing {transpose} steps")
        df['note_value'] = df['note_value'] + transpose
        
    if rand_tempo > 0.25:
        print(f"codeine time: tempo change {tempo}")
        df['start_time'] = df['start_time'] * tempo
        df['end_time'] = df['end_time'] * tempo

    if rand_veloc > 0.25:
        print(f"bad musician: velocity {velocity}")
        df['velocity'] = df['velocity'] + velocity

    return df # but isnt this already doing it in place idk honestly what best practice here is
"""

def augment_data(df, transpose_amount = 0, tempo_amount = 1.0, velocity_amount = 0):
    """
    Return an augmented copy of a note table.

    Args:
        df: DataFrame with 'note_value', 'start_time', 'end_time' and
            'velocity' columns. It is never modified.
        transpose_amount: Semitones added to every 'note_value'.
        tempo_amount: Factor multiplied into 'start_time' and 'end_time'
            (values below 1 shorten the piece, values above 1 stretch it).
        velocity_amount: Offset added to every 'velocity'.

    Returns:
        A new DataFrame. With the default arguments it is an unchanged copy.
        Otherwise pitches are kept inside the MIDI range 0-127 and velocities
        inside 1-127 (a velocity of 0 would mean "note off").
    """
    # Work on a copy: mutating the caller's frame made repeated calls stack
    # their changes on top of each other.
    augmented = df.copy()

    if transpose_amount == 0 and tempo_amount == 1.0 and velocity_amount == 0:
        return augmented

    augmented['note_value'] = (augmented['note_value'] + transpose_amount).clip(0, 127)

    augmented['start_time'] = augmented['start_time'] * tempo_amount
    augmented['end_time'] = augmented['end_time'] * tempo_amount

    augmented['velocity'] = (augmented['velocity'] + velocity_amount).clip(1, 127)

    return augmented