# combining classicalmidi and musicnet datasets

In [1]:
import os
import csv
import pandas as pd
import numpy as np
from tqdm import tqdm
import pretty_midi
import IPython.display

from helpers import list_files

In [2]:
FS = 44100

In [3]:
INPUT_DIR = './inputs'
input_folders = [folder for folder in os.listdir(INPUT_DIR) if '_og' not in folder or 'final' not in folder]
for folder in input_folders:
        #path_to_folder = os.getcwd()
        path_to_folder = os.path.join(INPUT_DIR, folder)
        print(path_to_folder)
        list_files(path_to_folder, 2) # 2 files listed per dir

./inputs/musicnet_midis
musicnet_midis/
    musicnet_metadata.csv
    Faure/
        2168_gr_f45m3.csv
        2166_gr_f45m1.mid
    Bach/
        2243_vs1_3.mid
        2659_vs2_6.mid
    Dvorak/
        1932_dv96_3.mid
        1919_dvqt10m4.mid
    Cambini/
        2082_quint3f2.mid
        2077_quintBb3.mid
    Haydn/
        2105_op64n5_2.mid
        2104_op64n5_1.mid
    Brahms/
        2151_br25m4.mid
        2148_br25m1.mid
    Mozart/
        1819_k3754a.mid
        1811_kv581b.mid
    Ravel/
        2180_gr_rqtf4.mid
        2179_gr_rqtf3.mid
    Beethoven/
        2510_ps02_02.mid
        2322_ps23_01.mid
    Schubert/
        1757_d958-1.mid
        1735_sy_sps94.mid
./inputs/classicalmidi
classicalmidi/
    balakir/
        islamei.mid
    tschai/
        ty_april.mid
        ty_dezember.mid
    grieg/
        grieg_march.mid
        grieg_wanderer.mid
    mendelssohn/
        mendel_op19_1.mid
        mendel_op19_6.mid
    granados/
        gra_esp_3.mid
        gra_esp_2.

In [35]:
entire_data = []
instrument_map = set()
OUTPUT_PATH = os.path.join(INPUT_DIR, 'final')

def midi_data_extraction(file_path, output_path):
    FLOAT_TYPE = np.float32 # float 16 seems a bit too small, 64 is big but might be worthwhile. is the default
    if os.path.isdir(file_path):
        midi_files = [fn for fn in os.listdir(file_path) if fn.endswith('.mid')]
        for file_name in tqdm(midi_files, desc=file_path):
                
            file_data = [['start_time', 'end_time', 'note_value', 'velocity', 'instrument_program_number']]
            full_path = os.path.join(file_path, file_name)
            try:
                midi_data = pretty_midi.PrettyMIDI(full_path)
            except:
                print(f"FAILURE LOADING: {full_path}")
            # Extract data for each note
            for instrument in midi_data.instruments:
                for note in instrument.notes:
                    start_time = note.start.astype(FLOAT_TYPE)
                    end_time = note.end.astype(FLOAT_TYPE)

                    """
                    instrument_name = instrument.name if instrument.name else 'Unnamed'
                    unique_instrument_names.add(instrument_name)
                    print()
                    try:
                        standardized_instrument_name = pretty_midi.instrument_name_to_program(instrument_name)
                        #print(f'{instrument_name} -> {standardized_instrument_name}')
                    except:
                        standardized_instrument_name = instrument_name
                    """
                    instrument_program_number = instrument.program
                    """
                    if not instrument_program_number:
                        instrument_program_number = match_program_number(instrument.name)
                        failed_instrument_names.add((instrument.name, instrument_program_number))
                    """
                    instrument_map.add((instrument.name, instrument_program_number, pretty_midi.program_to_instrument_name(instrument_program_number)))
                    #note_name = pretty_midi.note_number_to_name(note.pitch)
                    note_value = note.pitch
                    #duration = end_time - start_time
                    velocity = note.velocity

                    note_info = [start_time, end_time, note_value, velocity, instrument_program_number]
                    file_data.append(note_info)
                    #print(note_info)

            if output_path:
                csv_name = file_name[:-4] + '.csv' # change extension
                csv_path = os.path.join(output_path, csv_name)
                #print(f'writing {file_name} to {csv_path}')
                with open(csv_path, "w", newline="") as f:
                   writer = csv.writer(f)
                   writer.writerows(file_data)

let's set up a cell to test midi data extraction

In [36]:
"""
path_test_folder = os.path.join(INPUT_DIR, input_folders[0])
composers_list = os.listdir(path_test_folder)
path_test_composer = os.path.join(path_test_folder, composers_list[0]) 
#print(f'{path_test_composer} contains these files: {os.listdir(path_test_composer)}')
midi_data_extraction(path_test_composer, None)
if True:
    for entry in instrument_map:
        i_name, program, std_name = entry
        print(f"{i_name, int(program), std_name}")
"""

'\npath_test_folder = os.path.join(INPUT_DIR, input_folders[0])\ncomposers_list = os.listdir(path_test_folder)\npath_test_composer = os.path.join(path_test_folder, composers_list[0]) \n#print(f\'{path_test_composer} contains these files: {os.listdir(path_test_composer)}\')\nmidi_data_extraction(path_test_composer, None)\nif True:\n    for entry in instrument_map:\n        i_name, program, std_name = entry\n        print(f"{i_name, int(program), std_name}")\n'

that's working, let's start converting everything into one folder

In [37]:
for input_folder in ['classicalmidi', 'musicnet_midis']:
    composer_path = os.path.join(INPUT_DIR, input_folder)
    for composer_folder in os.listdir(composer_path):
        input_path = os.path.join(composer_path, composer_folder)
        if not os.path.isdir(input_path):
            continue
        folder_name = composer_folder.capitalize()
        folder_output_path = os.path.join(OUTPUT_PATH, folder_name)
        os.makedirs(folder_output_path, exist_ok=True)
        midi_data_extraction(input_path, folder_output_path)
        #print(folder_output_path)

./inputs/classicalmidi/balakir: 100%|█████████████| 1/1 [00:00<00:00, 11.66it/s]
./inputs/classicalmidi/tschai: 100%|████████████| 12/12 [00:00<00:00, 47.52it/s]
./inputs/classicalmidi/grieg: 100%|█████████████| 16/16 [00:00<00:00, 66.06it/s]
./inputs/classicalmidi/mendelssohn: 100%|███████| 15/15 [00:00<00:00, 64.65it/s]
./inputs/classicalmidi/granados: 100%|████████████| 3/3 [00:00<00:00, 34.61it/s]
./inputs/classicalmidi/haydn: 100%|█████████████| 21/21 [00:00<00:00, 54.95it/s]
./inputs/classicalmidi/beeth: 100%|█████████████| 29/29 [00:01<00:00, 21.54it/s]
./inputs/classicalmidi/mozart: 100%|████████████| 21/21 [00:00<00:00, 23.85it/s]
./inputs/classicalmidi/schumann: 100%|██████████| 24/24 [00:00<00:00, 71.72it/s]
./inputs/classicalmidi/burgm: 100%|███████████████| 9/9 [00:00<00:00, 68.18it/s]
./inputs/classicalmidi/brahms: 100%|██████████████| 9/9 [00:00<00:00, 28.71it/s]
./inputs/classicalmidi/liszt: 100%|█████████████| 16/16 [00:01<00:00, 15.07it/s]
./inputs/classicalmidi/schub

FAILURE LOADING: ./inputs/musicnet_midis/Bach/2211_fugue12.mid
FAILURE LOADING: ./inputs/musicnet_midis/Bach/2305_prelude14.mid
FAILURE LOADING: ./inputs/musicnet_midis/Bach/2194_prelude13.mid
FAILURE LOADING: ./inputs/musicnet_midis/Bach/2227_fugue6.mid


./inputs/musicnet_midis/Bach: 100%|████████████| 67/67 [00:00<00:00, 112.94it/s]


FAILURE LOADING: ./inputs/musicnet_midis/Bach/2230_prelude20.mid
FAILURE LOADING: ./inputs/musicnet_midis/Bach/2292_prelude19.mid
FAILURE LOADING: ./inputs/musicnet_midis/Bach/2310_prelude15.mid


./inputs/musicnet_midis/Dvorak: 100%|█████████████| 8/8 [00:00<00:00, 25.02it/s]
./inputs/musicnet_midis/Cambini: 100%|████████████| 9/9 [00:00<00:00, 39.18it/s]
./inputs/musicnet_midis/Haydn: 100%|██████████████| 3/3 [00:00<00:00, 32.59it/s]
./inputs/musicnet_midis/Brahms: 100%|███████████| 24/24 [00:01<00:00, 17.31it/s]
./inputs/musicnet_midis/Mozart: 100%|███████████| 24/24 [00:00<00:00, 33.39it/s]
./inputs/musicnet_midis/Ravel: 100%|██████████████| 4/4 [00:00<00:00,  8.56it/s]
./inputs/musicnet_midis/Beethoven: 100%|██████| 157/157 [00:05<00:00, 26.38it/s]
./inputs/musicnet_midis/Schubert: 100%|█████████| 30/30 [00:01<00:00, 16.85it/s]


In [7]:
# CLAUDE 3.7 Sonnet
# NOT NEEDED
import difflib

def match_program_number(instrument_name):
    try:
        # Try direct mapping first
        program = pretty_midi.instrument_name_to_program(instrument_name)
    except:
        # Fall back to fuzzy matching
        standard_name = match_instrument_name(instrument_name)
        program = pretty_midi.instrument_name_to_program(standard_name)
    return program

def match_instrument_name(custom_name):
    """Find closest matching General MIDI instrument name"""
    # Get all standard instrument names
    standard_names = [program_to_instrument_name(i) for i in range(128)]
    
    # Remove any parenthetical additions and extra words
    clean_name = custom_name.split('(')[0].strip()
    clean_name = clean_name.split('.')[0].strip()
    
    # Special case handling
    if 'Piano' in clean_name:
        return 'Acoustic Grand Piano'  # Default piano
    if any(s in clean_name for s in ['Violin', 'Violino']):
        return 'Violin'
    if 'Viola' in clean_name:
        return 'Viola'
    if any(s in clean_name for s in ['Cello', 'Violoncello']):
        return 'Cello'
    
    # Fuzzy match against standard names
    matches = difflib.get_close_matches(clean_name, standard_names, n=1)
    if matches:
        return matches[0]
    
    # Default case - return a general instrument based on MIDI program
    # You could expand this logic based on your dataset
    return 'Acoustic Grand Piano'  # Default fallback

In [8]:
# CLAUDE 3.7 Sonnet
def create_transposed_versions(midi_file, semitone_shifts=None):
    """
    Create multiple transposed versions of a MIDI file.

    Args:
        midi_file: Path to MIDI file
        semitone_shifts: List of semitone shifts to apply, default is all 11 possible transpositions

    Returns:
        List of PrettyMIDI objects with different transpositions
    """
    if semitone_shifts is None:
        semitone_shifts = range(1, 12)  # All possible transpositions

    midi_data = pretty_midi.PrettyMIDI(midi_file)
    transpositions = []

    for shift in semitone_shifts:
        transposed = pretty_midi.PrettyMIDI()

        # Copy tempo and time signature info
        for timing in midi_data.time_signature_changes:
            transposed.time_signature_changes.append(timing)
        """
        not an existing field, claude being a silly goose
        if midi_data.get_tempo_changes():
            for tempo in midi_data.get_tempo_changes():
                transposed.tempo_changes.append(tempo)
        """
        # Create transposed instruments
        for inst in midi_data.instruments:
            new_inst = pretty_midi.Instrument(program=inst.program)
            new_inst.is_drum = inst.is_drum

            # Don't transpose drum tracks
            if not inst.is_drum:
                for note in inst.notes:
                    new_note = pretty_midi.Note(
                        velocity=note.velocity,
                        pitch=note.pitch + shift,
                        start=note.start,
                        end=note.end
                    )
                    new_inst.notes.append(new_note)
            else:
                new_inst.notes = inst.notes

            transposed.instruments.append(new_inst)

        transpositions.append(transposed)

    return transpositions

In [9]:
# CLAUDE 3.7 Sonnet
def change_tempo(midi_data, tempo_factor):
    """
    Change the tempo of a MIDI file by scaling all time values.

    Args:
        midi_data: PrettyMIDI object
        tempo_factor: Factor to multiply timing by (0.5 = twice as fast, 2.0 = half speed)

    Returns:
        Modified PrettyMIDI object
    """
    # Scale note timings
    for instrument in midi_data.instruments:
        for note in instrument.notes:
            note.start *= tempo_factor
            note.end *= tempo_factor

    """
    # Scale tempo changes
    for tempo_change in midi_data.tempo_changes:
        tempo_change.time *= tempo_factor
        # Note: The actual tempo (bpm) remains unchanged in this approach
    """

    # Scale time signature changes
    for ts in midi_data.time_signature_changes:
        ts.time *= tempo_factor

    return midi_data

# Example usage
midi_data = pretty_midi.PrettyMIDI('/content/Untitled.mid')
faster_midi = change_tempo(midi_data, 2)  # 20% faster
#faster_midi.write('faster_song.mid')
audio_data = faster_midi.synthesize()
IPython.display.Audio(audio_data, rate=44100)

FileNotFoundError: [Errno 2] No such file or directory: '/content/Untitled.mid'