In [None]:
import os
import shutil
from pathlib import Path
from fractions import Fraction

Walk the cloned GitHub repository's directory tree and copy every MusicXML file into a single folder

In [None]:
def collect_musicxml_files(repo_dir, output_dir):  # ChatGPT generated
    """Copy every ``*.musicxml`` file found under ``repo_dir`` into ``output_dir``.

    The search is recursive. Each copy is renamed ``<index>_<original name>``,
    where ``index`` counts the files in the order they are discovered, so files
    that share a name in different sub-folders do not overwrite each other.

    Args:
        repo_dir: Root directory to search (string or path-like).
        output_dir: Destination directory; created, with parents, if missing.

    Returns:
        None. One "Copied ..." line is printed per file.
    """
    source_root = Path(repo_dir)
    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    for index, src in enumerate(source_root.rglob('*.musicxml')):
        dst = target_dir / f"{index}_{src.name}"
        shutil.copy(src, dst)
        print(f"Copied {src} -> {dst}")

# Gather every score found under '../asap-dataset' into the local 'xmls' folder.
collect_musicxml_files(repo_dir='../asap-dataset', output_dir='xmls')


Parse the MusicXML files and extract only the parts we need, as tokens

In [None]:
from music21 import converter, note, chord, stream, pitch as m21pitch
# Which duration values do we allow? Listed in ascending order; note_to_token
# snaps each element's music21 quarterLength to the nearest entry.
STANDARD_DURATIONS = [
    Fraction(text)
    for text in (
        "1/16", "1/8", "1/6", "1/4",
        "1/3", "3/8", "1/2", "2/3",
        "3/4", "1", "3/2", "2",
        "3", "4",
    )
]

def quantize_duration(dur_str: str):
    """Snap a duration string to the nearest entry of ``STANDARD_DURATIONS``.

    Args:
        dur_str: Anything ``Fraction`` can parse, e.g. ``"3/8"``, ``"0.5"`` or ``"2"``.

    Returns:
        The closest standard duration as a ``float``. When ``dur_str`` cannot
        be parsed, the fallback value ``0.25`` is returned. On a tie the
        earlier (smaller) entry of ``STANDARD_DURATIONS`` wins.
    """
    try:
        raw = Fraction(dur_str)
    except (TypeError, ValueError, ZeroDivisionError, OverflowError):
        # Only parse failures get the fallback. A blanket ``except Exception``
        # here would also hide genuine programming errors (for example a
        # missing STANDARD_DURATIONS) behind a silent 0.25.
        return 0.25

    closest = min(STANDARD_DURATIONS, key=lambda x: abs(x - raw))
    return float(closest)
    
def note_to_token(n):  # checks what type of note it is and outputs accordingly
    """Turn one music21 element into its token representation.

    The element's ``duration.quarterLength`` is first snapped to the nearest
    value in ``STANDARD_DURATIONS``; that ``Fraction`` is what appears after
    the underscore in every token (e.g. ``1/2`` or ``3``).

    Args:
        n: A music21 element exposing ``duration.quarterLength``.

    Returns:
        * ``"note<Pitch>_<dur>"`` for a ``note.Note``; the pitch is rebuilt
          from its MIDI number, so the spelling is whatever music21 picks for
          that number.
        * ``"rest_<dur>"`` for a ``note.Rest``.
        * a list of note tokens, one per pitch, for a ``chord.Chord`` (the
          notes are meant to be played simultaneously).
        * ``None`` for anything else.
    """
    raw_length = Fraction(n.duration.quarterLength)
    dur = min(STANDARD_DURATIONS, key=lambda candidate: abs(candidate - raw_length))

    if isinstance(n, note.Note):
        pitch_name = m21pitch.Pitch(n.pitch.midi).nameWithOctave
        return f"note{pitch_name}_{dur}"

    if isinstance(n, note.Rest):
        return f"rest_{dur}"

    if isinstance(n, chord.Chord):
        chord_tokens = []
        for p in n.pitches:
            pitch_name = m21pitch.Pitch(p.midi).nameWithOctave
            chord_tokens.append(f"note{pitch_name}_{dur}")
        return chord_tokens

    return None


def tokenize_musicxml(file_path): #partially ChatGPT generated
    """Parse a MusicXML file and return its notes and rests as a flat token list.

    All notes, chords and rests of the score are grouped by their start offset
    in the flattened score. Offsets are visited in ascending order; when more
    than one token starts at the same offset, the group is wrapped in
    ``<simul>`` ... ``</simul>`` markers, otherwise the single token is emitted
    on its own.

    Args:
        file_path: Path of the file handed to ``music21.converter.parse``.

    Returns:
        list[str]: Tokens as produced by ``note_to_token`` plus the
        ``<simul>`` / ``</simul>`` markers.
    """
    score = converter.parse(file_path)

    # Flatten so every element's offset is measured from the start of the score.
    # ``Stream.flat`` is deprecated in newer music21 releases in favour of
    # ``flatten()``; keep the property as a fallback for older installs.
    flat_stream = score.flatten() if hasattr(score, "flatten") else score.flat

    # Group elements by start offset.
    offset_dict = {}
    for elem in flat_stream.notesAndRests:
        offset = round(elem.offset, 5)  # rounding to avoid floating point artifacts
        offset_dict.setdefault(offset, []).append(elem)

    all_tokens = []
    for offset in sorted(offset_dict):
        simul_tokens = []
        for e in offset_dict[offset]:
            tokens = note_to_token(e)
            if isinstance(tokens, list):  # a chord expands to several note tokens
                simul_tokens.extend(tokens)
            elif tokens is not None:
                simul_tokens.append(tokens)

        if len(simul_tokens) > 1:
            all_tokens.append("<simul>")
            all_tokens.extend(simul_tokens)
            all_tokens.append("</simul>")
        else:
            all_tokens.extend(simul_tokens)

    return all_tokens


#test
# file_path = "score.musicxml"
# tokens = tokenize_musicxml(file_path)
# print(" ".join(tokens))


In [7]:
from pathlib import Path
import warnings
# Silence UserWarning noise (presumably emitted by music21 while parsing the scores).
warnings.filterwarnings("ignore", category=UserWarning)

input_dir = Path('xmls')
output_file = Path('out.txt')

# rglob yields files in filesystem-dependent order; sort so that out.txt is
# identical on every run and on every machine.
musicxml_files = sorted(input_dir.rglob('*.musicxml'))

with open(output_file, 'w', encoding='utf-8') as f:
    for i, musicxml_file in enumerate(musicxml_files):
        if i % 20 == 0:
            print(f"{i} completed")

        tokens = tokenize_musicxml(str(musicxml_file))
        # One token per line. The trailing newline on the end marker makes the
        # next song start on a fresh line.
        tokens.append("<end_song>\n")
        f.write("\n".join(tokens))

print(f"Wrote all tokens to {output_file}")


0 completed
20 completed
40 completed
60 completed
80 completed
100 completed
120 completed
140 completed
160 completed
180 completed
200 completed
220 completed
Wrote all tokens to out.txt


In [15]:
from collections import Counter

# Read the token file back with the same encoding it was written with
# (the writing cell uses utf-8), instead of the platform default.
with open('out.txt', 'r', encoding='utf-8') as f:
    tokens = [line.strip() for line in f if line.strip()]  # remove blank lines

counts = Counter(tokens)
# Vocabulary size: number of distinct tokens.
len(counts)



1151

In [16]:
import pandas as pd 
# Distribution of how often each distinct token occurs.
x = pd.Series(list(counts.values()))
x.describe()

count      1151.000000
mean        961.350130
std        7926.374668
min           1.000000
25%          34.000000
50%         140.000000
75%         482.500000
max      187507.000000
dtype: float64

## Play Tokens

In [None]:
import pygame.midi
from music21 import pitch
import time
import re
from fractions import Fraction



def pitchstr_to_num(note_name):
    """Return the MIDI number music21 assigns to the pitch named ``note_name``."""
    parsed = pitch.Pitch(note_name)
    return parsed.midi


# Playback tempo setting, in beats per minute.
BPM = 480
# 60 s / BPM; play_tokens multiplies every value returned by get_duration by this.
# NOTE(review): get_duration already scales durations by 4, so whether this is
# really "seconds per quarter note" depends on the unit of the token durations — confirm.
second_per_quarter_note = 60 / BPM


def pitch_to_midi(pitch):
    """Convert a pitch string such as ``"C4"``, ``"F#3"`` or ``"E-5"`` to a MIDI number.

    The string must start with a letter A-G (either case), an optional
    accidental (``-`` or ``b`` for flat, ``#`` for sharp) and a single octave
    digit; anything after that is ignored.

    Args:
        pitch: Pitch text to convert.

    Returns:
        The MIDI note number, or ``None`` when the text does not match the
        expected shape or the result falls outside 0-127.
    """
    parsed = re.match(r"([A-Ga-g])([-#b]?)(\d)", pitch)
    if parsed is None:
        return None

    letter, accidental, octave = parsed.groups()
    # Both flat spellings collapse to 'b'; a sharp stays '#'; no accidental adds nothing.
    suffix = {'-': 'b', 'b': 'b', '#': '#'}.get(accidental, '')
    name = letter.upper() + suffix

    # The name is looked up without an octave. Per the original author's note the
    # lookup then assumes the middle-C octave, so subtracting 60 leaves the
    # offset within the octave; the real octave is added back below.
    semitone = pitchstr_to_num(name) - 60
    midi_number = semitone + (int(octave) + 1) * 12

    return midi_number if 0 <= midi_number <= 127 else None

def play_note(midi_out, midi_note, duration, velocity=127): #velocity means volume. 127 is the max
    """Sound one note for ``duration`` seconds, blocking while it plays.

    Args:
        midi_out: Object exposing ``note_on(note, velocity)`` and
            ``note_off(note, velocity)``.
        midi_note: MIDI note number to play.
        duration: Time in seconds to hold the note (passed to ``time.sleep``).
        velocity: Note velocity used for both the on and the off message.
    """
    midi_out.note_on(midi_note, velocity)
    try:
        time.sleep(duration)
    finally:
        # Always release the note, even when the sleep is interrupted
        # (e.g. the kernel is stopped mid-playback); otherwise it keeps sounding.
        midi_out.note_off(midi_note, velocity)

def quantize_duration(dur_str: str):
    """Snap a duration string to the nearest allowed duration.

    Self-contained variant for the playback cell: the table of allowed
    durations is built locally on every call. (An earlier cell defines a
    function of the same name; this definition replaces it once the cell runs.)

    Args:
        dur_str: Text that ``Fraction`` can parse, such as ``"3/8"`` or ``"2"``.

    Returns:
        The nearest allowed duration as a ``float``, or ``0.25`` when anything
        goes wrong while parsing or comparing. On a tie the smaller duration wins.
    """
    # What length of fractions do we allow? Kept in ascending order.
    allowed = [
        Fraction(text)
        for text in (
            "1/16", "1/8", "1/6", "1/4",
            "1/3", "3/8", "1/2", "2/3",
            "3/4", "1", "3/2", "2",
            "3", "4",
        )
    ]
    try:
        target = Fraction(dur_str)
        nearest = min(allowed, key=lambda candidate: abs(candidate - target))
        return float(nearest)
    except Exception:
        return 0.25

def get_duration(dur:str):
    """Convert a token's duration text into the length used for playback timing.

    The text is quantized with ``quantize_duration``, multiplied by 4 (the
    original author's note: "turn quarter notes into 1s") and floored at 0.125.

    Args:
        dur: Duration text taken from a token, e.g. ``"1/2"``.

    Returns:
        The scaled duration as a number; ``1`` if the computation fails.
    """
    try:
        dur = max(quantize_duration(dur) * 4, 0.125) #turn quarter notes into 1s, minimum time is 1/8 of a quarter note (ie 1/32)
    except Exception:
        # Was a bare ``except:``, which also swallowed KeyboardInterrupt and
        # SystemExit and made playback impossible to stop at this point.
        dur = 1 #default quarter note
    return dur

def play_tokens(tokens, instrument=0):
    """Play a token sequence on pygame MIDI output device 0, blocking until done.

    Recognised tokens:
        * ``note<Pitch>_<dur>`` - one note, played by ``play_note``.
        * ``rest_<dur>`` - silence for the given duration.
        * ``<simul>`` ... ``</simul>`` - the enclosed notes start together and
          are held for the longest duration among them (0.3 s if the group
          contains no playable note).
    Any other token is skipped.

    Args:
        tokens: Sequence of token strings.
        instrument: MIDI program number passed to ``set_instrument``. Defaults
            to 0 so callers that do not care about the sound can omit it.
    """
    # Cannot be more than 7 tokens read from one <simul> group; the cap also
    # stops a missing </simul> from swallowing the rest of the sequence.
    max_simul = 7
    # Durations are fractions such as "3/8". The duration group must therefore
    # accept '/', which a plain \w+ does not: it would read "1/2" as "1".
    note_pattern = re.compile(r"note(.+)_([\w/.]+)")
    rest_pattern = re.compile(r"rest_([\w/.]+)")

    pygame.midi.init()
    try:
        player = pygame.midi.Output(0)
        try:
            player.set_instrument(instrument)

            i = 0
            while i < len(tokens):

                #SIMULTANEOUS HANDLING
                if tokens[i] == "<simul>":
                    i += 1

                    simul_notes = []
                    scanned = 0
                    while i < len(tokens) and tokens[i] != "</simul>" and scanned < max_simul:
                        match = note_pattern.match(tokens[i])
                        if match:
                            pitch_name, dur = match.groups()
                            midi = pitch_to_midi(pitch_name)
                            length = get_duration(dur) * second_per_quarter_note
                            if midi is not None:
                                simul_notes.append((midi, length))
                        i += 1
                        scanned += 1

                    # Play all simultaneously
                    for midi_note, _ in simul_notes:
                        player.note_on(midi_note, 100)

                    time.sleep(max((d for _, d in simul_notes), default=0.3))

                    for midi_note, _ in simul_notes:
                        player.note_off(midi_note, 100)

                    # Skip the closing marker only if that is really what comes
                    # next; after hitting the cap the next token may be a note
                    # that must not be dropped.
                    if i < len(tokens) and tokens[i] == "</simul>":
                        i += 1

                #NONSIMULTANEOUS HANDLING
                else:
                    token = tokens[i]

                    #NOTES
                    if token.startswith("note"):
                        match = note_pattern.match(token)
                        if match:
                            pitch_name, dur = match.groups()
                            midi = pitch_to_midi(pitch_name)
                            length = get_duration(dur) * second_per_quarter_note
                            if midi is not None:
                                play_note(player, midi, length)

                    #RESTS
                    elif token.startswith("rest_"):
                        match = rest_pattern.match(token)
                        if match:
                            dur = match.group(1)
                            time.sleep(get_duration(dur) * second_per_quarter_note)

                    i += 1
        finally:
            # Release the output device even when playback fails or is interrupted.
            del player
    finally:
        pygame.midi.quit()

#------------------------------------------------------------------------------------------------------------------------
#------------------------------------------------------------------------------------------------------------------------
#                                     PLAY TOKENS
#------------------------------------------------------------------------------------------------------------------------
#------------------------------------------------------------------------------------------------------------------------



try:
    play_tokens(["noteC4_1/2", "noteG4_1/2"], instrument=101) #0: Piano, 101: Synth
except KeyboardInterrupt:
    # Interrupting the kernel is how playback gets stopped; just shut MIDI down.
    pygame.midi.quit()
except Exception as exc:
    # Still clean up, but say what went wrong instead of failing silently.
    pygame.midi.quit()
    print(f"Playback failed: {exc!r}")



In [None]:
#Play a MusicXML File
tokens = tokenize_musicxml("score.musicxml")
# play_tokens needs an instrument; omitting it raised a TypeError. 0 is the piano.
play_tokens(tokens, instrument=0)