In [1]:
import os
import shutil
import json
import base64
import zipfile
import xml.etree.ElementTree as ET

In [2]:
data_dir = os.path.abspath(os.path.join('..', 'data'))
data_dir

'/Users/sejmou/Repos/misc/choir-music-repertoire/data'

In [3]:
for file in os.listdir(data_dir):
    print(file)

Music
Winter Wonderland
Have Yourself A Merry Little Christmas
.DS_Store
Glimpse of Us
Die Nacht
Immortal Bach
I Heard the Bells on Christmas Day
Santa's Coming to Town
Indodana
Viva la Vida
Leise rieselt der Schnee
Rockin' Around the Christmas Tree
Raindrops Keep Fallin on My Head


In [4]:
def get_musescore_files(folder_path):
    """
    Recursively search for all files in the folder_path
    """
    musescore_files = []
    for root, dirs, files in os.walk(folder_path):
        for file in files:
            if file.endswith(".mscz"):
                musescore_files.append(os.path.join(root, file))
    return musescore_files

In [5]:
def extract_musescore_version(file_path):
    """Extracts the major version of any musescore file (.mscz)"""

    file_name = os.path.basename(file_path)
    file_name_without_extension = os.path.splitext(file_name)[0]
    temp_folder_name = f"{file_name_without_extension}_temp"
    try:
        # Every .mscz file is essentially a zip file
        with zipfile.ZipFile(file_path, "r") as zip_ref:
            zip_ref.extractall(temp_folder_name)

            # A .mscx file (essentially an XML document) should be in the extracted content of the musescore file
            score_file_name = next(
                (f for f in os.listdir(temp_folder_name) if f.endswith(".mscx")), None
            )
            if score_file_name is None:
                raise FileNotFoundError(f"No .mscx file in {file_path}")

            score_file_path = os.path.join(temp_folder_name, score_file_name)
            with open(score_file_path, "r", encoding="utf-8") as score_file:
                # Parse the XML content and find the programVersion element
                tree = ET.parse(score_file)
                root = tree.getroot()
                version_element = root.find(".//programVersion")
                if version_element is not None:
                    return int(version_element.text.split(".")[0])
                else:
                    raise ValueError(f"No version information found in {file_path}")

    finally:
        shutil.rmtree(temp_folder_name, ignore_errors=True)

In [6]:
files = get_musescore_files(data_dir)

In [7]:
musescore3_path = '/Applications/MuseScore\ 3.app/Contents/MacOS/mscore'
musescore4_path = '/Applications/MuseScore\ 4.app/Contents/MacOS/mscore'

In [8]:
musescore3_files = [file for file in files if extract_musescore_version(file) == 3]
musescore4_files = [file for file in files if extract_musescore_version(file) == 4]

In [9]:
musescore3_files[0]

'/Users/sejmou/Repos/misc/choir-music-repertoire/data/Have Yourself A Merry Little Christmas/MuseScore/Have Yourself a Merry Little Christmas 2021-11-26 mit Synchronisierung.mscz'

In [10]:
mp3_test_file = 'test.mp3'

In [11]:
!{musescore4_path} "../original_part_mp3s/Tenor.mscz" -o "Tenor.mp3"

In [12]:
# write whole score to a test mp3 file
# !{musescore4_path} "{musescore4_files[0]}" -o "{mp3_test_file}"

In [13]:
parts_output_lines = !{musescore4_path} "{musescore4_files[0]}"  --score-parts

In [14]:
parts = json.loads(parts_output_lines[0])

In [15]:
parts

{'parts': ['Klavier 1',
  'Piano',
  'Klavier 1',
  'Klavier 1',
  'Klavier',
  'Drumset'],
 'partsBin': ['UEsDBBQAAAgIAM59g1f1yRiqBCkAAFoSAQAPAAAAc2NvcmVfc3R5bGUubXNzpV1bc9w4dn7fX+FypVJJVVbuu6SKRym3ZzyeWns8ZWlnZ55StJqWGHc3O2y2beXXhyBI4Fy+A5HaF1v9XUAQPACJC8GX//V9t332Na+ORbn/4fn0bPL8Wb6/LTfF/u6H53+/efPXi+f/dfWXl7vTMb++Las8ahdn08nzq788e/byun7Y5u6v5u9Ddpf/o9jU91cXZ8uXL+LPSL/Ni7v7+mo69XT3M/K/VcW+zj5tO+P52Xx6cem1goqen77m+3f55/p9Vt0V+6vJ2fJyslx2RxBkdH3YbEwT5/iRbsqDeaDIcc+6rOtyZ9oYzXJoHYxRzJE4lGSj7+ZbeSw2+eaqU4bfXnKss8+f/3445NW6rDZ5dXX+8oXCiPRd+U1JKUakb/OsQd6UZZ1Xv2UbF3ouEyZHrD8WzX/72/xq5YKNQ15W7I91ddrl+/rXbJd/+Pz5mNcudYh7S/blS7ltDssTV6gX74r99cOxzncBb+New508+y7wqVMrtMvJtrjr0rkpuyvW5B3B3pDvXeX4Pa/q4jbbXh+qpvicA+JdObY/fGpXs7YYKcJE/3vKqnxdZbdfXCGezYKWEdTy+lRtH6Jj2jsYLkqyyx4vR5bnUF4dOp+RMuTKxu+z2Cn7RCkYE6XobOIThcpXMEYQESy/NZXqTbHtS3/VygXoxduHqrg9/rbNbnMXoO4CSogJy+OrT+XX/Nn3H543zffDD8//Onv+QkrW+bb8FiVzoXhfuJYrRmV/SIELi29LossVgkUKa8AnZzPqgvp3xT7cNnotwaj0x+x43yTzLt/fNXeIydmi13NCWbLvwXLBLIEAlnjmK+GBp

At first glance, this output looks promising (especially the `partsBin` part). We can write it to a `.mscz` file like this:

In [16]:
def write_part_base64_to_file(part, file_path):
    import base64

    """Write the base64 encoded part to a file"""
    with open(file_path, "wb") as file:
        decoded = base64.b64decode(part)
        file.write(decoded)

# Example usage
partsBinary = parts["partsBin"][0]
write_part_base64_to_file(partsBinary, "test.mscz")

However this is [broken in MuseScore 4](https://github.com/musescore/MuseScore/issues/15582#issuecomment-1665863763) 🙃

In [17]:
def get_content_tree_from_musescore_file(file_path):
    """
    Extract the <whatever_name>.mscx file from a musescore file (.mscz),
    parse it as an XML document and return the element tree
    """
    file_name = os.path.basename(file_path)
    file_name_without_extension = os.path.splitext(file_name)[0]
    temp_folder_name = f"{file_name_without_extension}_temp"

    try:
        # Every .mscz file is essentially a zip file - unzip to a temporary folder
        with zipfile.ZipFile(file_path, "r") as zip_ref:
            zip_ref.extractall(temp_folder_name)

        # mscx file in the mscz file is essentially an XML document describing most of the content of the score
        mscx_file_name = next(
            (f for f in os.listdir(temp_folder_name) if f.endswith(".mscx")), None
        )
        if mscx_file_name is None:
            raise FileNotFoundError(f"No .mscx file in {file_path}")

        score_file_path = os.path.join(temp_folder_name, mscx_file_name)
        with open(score_file_path, "r", encoding="utf-8") as score_file:
            # Parse the XML content and find the programVersion element
            tree = ET.parse(score_file)
            if tree is None:
                raise ValueError(f"No root element found in {file_path}")
            return tree

    finally:
        shutil.rmtree(temp_folder_name, ignore_errors=True)

In [18]:
tree = get_content_tree_from_musescore_file(musescore4_files[0])
root = tree.getroot()

In [19]:
root

<Element 'museScore' at 0x10b0c55d0>

In [20]:
# find all 'Staff' elements with a 'Measure' child
# these are the staffs of the parts of the score
staff_elements = root.findall(".//Staff[Measure]")
staff_elements

[<Element 'Staff' at 0x10b24fdd0>,
 <Element 'Staff' at 0x10b2286d0>,
 <Element 'Staff' at 0x10b265da0>,
 <Element 'Staff' at 0x10b5d63e0>,
 <Element 'Staff' at 0x10b71ba10>,
 <Element 'Staff' at 0x10b447a60>]

In [21]:
len(staff_elements[0].findall(".//Measure"))

106

In [22]:
measure_counts = [len(staff.findall(".//Measure")) for staff in staff_elements]

# all parts should have the same number of measures
assert len(set(measure_counts)) == 1

In [23]:
selected = staff_elements[0]
measures = selected.findall(".//Measure")

for measure in measures:
  # verify that every measure has a single 'voice' element as its direct child
  children = measure.findall("./*")
  assert len(children) == 1 and children[0].tag == 'voice'



In [24]:
part_ids = [staff.get('id') for staff in staff_elements]
part_ids

['1', '2', '3', '4', '5', '6']

In [25]:
def get_part_element(staff_id, root):
    # find the Part element with a Staff child element with the given id
    # unfortunately, this doesn't work directly
    # return root.find(f".//Part[Staff[@id='{staff_id}']]")
    # but we can do it in two steps

    # Find all Part elements
    part_elements = root.findall(".//Part")

    # Filter Part elements based on the presence of a Staff child element with the given id
    for part_element in part_elements:
        staff_element = part_element.find(f"Staff[@id='{staff_id}']")
        if staff_element is not None:
            return part_element

    # Return None if no matching Part element is found
    return None

In [26]:
part_elements = [get_part_element(staff_id, root) for staff_id in part_ids]

In [27]:
part_elements

[<Element 'Part' at 0x10b0c4770>,
 <Element 'Part' at 0x10b0c6110>,
 <Element 'Part' at 0x10b0c5990>,
 <Element 'Part' at 0x10b0c6250>,
 <Element 'Part' at 0x10b0c74c0>,
 <Element 'Part' at 0x10b24c590>]

In [28]:
def get_parts(root):
    return root.findall(".//Part")

def remove_part(part_element, root):
    # A <Path> element stores metadata about a part. This includes (among other things):
    # - the track name
    # - the track's short name and, most importantly
    # - the ID of the <Staff> element that contains the notes of the part

    # hence, if we want to remove a part, we need to not only remove the Part Element, but also remove the associated Staff element

    staff_id = part_element.find("Staff").get("id")
    part_id = part_element.get("id")
    # get parent of part element (i.e. element with Part with given part id as child)
    parent = root.find(f".//Part[@id='{part_id}']/..")
    # remove part element
    parent.remove(part_element)

    staff_element = root.find(f".//Staff[@id='{staff_id}']")
    # get parent of staff element (i.e. element with Staff with given staff id as child)
    staff_parent = root.find(f".//Staff[@id='{staff_id}']/..")
    staff_parent.remove(staff_element)

In [29]:
parts = get_parts(root)
parts

[<Element 'Part' at 0x10b0c4770>,
 <Element 'Part' at 0x10b0c6110>,
 <Element 'Part' at 0x10b0c5990>,
 <Element 'Part' at 0x10b0c6250>,
 <Element 'Part' at 0x10b0c74c0>,
 <Element 'Part' at 0x10b24c590>]

In [42]:
def get_staff_element(part_element):
    staff_id = part_element.find("Staff").get("id")
    return root.find(f"./Score/Staff[@id='{staff_id}']")

example_staff = get_staff_element(parts[0])

In [47]:
measures = example_staff.findall("Measure")
len(measures)

106

In [56]:
def pretty_print(element):
    print(ET.tostring(element, encoding="unicode", method="xml"))

In [60]:
voice = measures[0].find("voice")
pretty_print(voice)

<voice>
          <KeySig>
            <concertKey>1</concertKey>
            </KeySig>
          <TimeSig>
            <sigN>4</sigN>
            <sigD>4</sigD>
            </TimeSig>
          <RehearsalMark>
            <text>A</text>
            </RehearsalMark>
          <Tempo>
            <tempo>1.366667</tempo>
            <followText>1</followText>
            <offset x="4.24518" y="-2.44686" />
            <text><sym>metNoteQuarterUp</sym><font face="Edwin" /> = 82</text>
            </Tempo>
          <Rest>
            <durationType>quarter</durationType>
            </Rest>
          <Chord>
            <durationType>half</durationType>
            <Note>
              <pitch>67</pitch>
              <tpc>15</tpc>
              </Note>
            </Chord>
          <Chord>
            <durationType>quarter</durationType>
            <Note>
              <pitch>67</pitch>
              <tpc>15</tpc>
              </Note>
            </Chord>
          </voice>
        


In [62]:
# find all elements with a 'durationType' child
elements_with_duration = voice.findall(".//*[durationType]")
elements_with_duration

[<Element 'Rest' at 0x10b24b650>,
 <Element 'Chord' at 0x10b24b790>,
 <Element 'Chord' at 0x10b24b510>]

In [63]:
staff_elements_with_duration = []

for measure in measures:
    voice = measure.find("voice")
    elements_with_duration = voice.findall(".//*[durationType]")
    staff_elements_with_duration.extend(elements_with_duration)
  
len(staff_elements_with_duration)

470

In [64]:
# collect counts for tag names of elements with a durationType child
tag_counts = {}
for element in staff_elements_with_duration:
    tag = element.tag
    if tag in tag_counts:
        tag_counts[tag] += 1
    else:
        tag_counts[tag] = 1

In [65]:
tag_counts

{'Rest': 74, 'Chord': 396}

In [45]:
# remove all parts except the first one
for part in parts[1:]:
    remove_part(part, root)

AttributeError: 'NoneType' object has no attribute 'remove'

In [31]:
# write the modified score to a file
tree.write("test.mscx", encoding='utf-8', xml_declaration=True)