In [32]:
import os
import shutil
import xml.etree.ElementTree as ET
from pathlib import Path

import numpy as np
import pandas as pd

import mir_eval
import librosa
from basic_pitch.inference import Model, predict
from basic_pitch import ICASSP_2022_MODEL_PATH

# Preload the Basic Pitch model
MODEL = Model(ICASSP_2022_MODEL_PATH)

In [33]:
AUDIO_DIR = Path("./audio")
ANNOTATION_DIR = Path("./annotation")

# Shuffle the data for random split
ITEMS = np.stack(
    (os.listdir(AUDIO_DIR), os.listdir(ANNOTATION_DIR)),
    axis=1
)
np.random.shuffle(ITEMS)

DATASET_SIZE = len(os.listdir(AUDIO_DIR))
TEST_SIZE = int(DATASET_SIZE * 0.2)

# Move random 20% of data to test set
for files in ITEMS[:TEST_SIZE]:
    shutil.move(AUDIO_DIR / files[0], "./test/audio")
    shutil.move(ANNOTATION_DIR / files[1], "./test/annotation")

# Move the remaining 80% to train set
for files in ITEMS[TEST_SIZE:]:
    shutil.move(AUDIO_DIR / files[0], "./train/audio")
    shutil.move(ANNOTATION_DIR / files[1], "./train/annotation")

In [None]:
def preprocess_note_events(note_events: list) -> tuple[np.ndarray, np.ndarray]:
    events = []
    for e in note_events:
        events.append([e[0], e[0]+1e-6, e[2]])
    
    df = pd.DataFrame(
        events, columns=["note_on", "note_off", "midi_pitch"]
    ).sort_values("note_on")

    # Prepare ndarrays for mir_eval
    intervals = df[["note_on", "note_off"]].to_numpy()
    pitches = librosa.midi_to_hz(df["midi_pitch"].to_numpy())

    return intervals, pitches

Note intervals need to be in the format of a rank-2 ndarray, and need to be positive (offset > onset) for mir_eval.  

Therefore, offsets are set to an arbitrary value (offset + 1e-6) as they will be discounted during evaluation.

In [None]:
def load_test_note_events(xml_path: str | Path) -> tuple[np.ndarray, np.ndarray]:
    tree = ET.parse(xml_path)
    root = tree.getroot()

    intervals = []
    pitches = []
    for event in root.findall("./transcription/event"):
        for child in event:
            if child.tag == "onsetSec":
                onset = float(child.text)
                intervals.append([onset, onset+1e-6]) # Onset and placeholder offset
            elif child.tag == "pitch":
                pitches.append(int(child.text)) # MIDI note

    # Prepare ndarrays for mir_eval
    intervals = np.array(intervals)
    pitches = librosa.midi_to_hz(np.array(pitches))

    return intervals, pitches

In [36]:
def classification_scores(
    true_intervals: np.ndarray, 
    true_pitches: np.ndarray, 
    pred_intervals: np.ndarray, 
    pred_pitches: np.ndarray
) -> tuple[float, float, float, float]:
    """
    Get the performance scores (Precision, Recall, F1, Avg. Overlap Ratio)
    of the model's note predictions against the true values.
    """
    scores = mir_eval.transcription.precision_recall_f1_overlap(
        true_intervals, true_pitches,
        pred_intervals, pred_pitches,
        offset_ratio=None # Ignore note offsets
    )
    return scores

In [None]:
def random_search(params: dict[str], iterations: int = 10) -> list:
    """
    Get model note prediction scores on all training data using a random
    hyperparameter setup chosen from a provided parameter distributions
    dictionary for a given number of iterations.
    """
    results = []

    # Get mean performance scores for 10 random hyperparameter setups
    for _ in range(iterations):
        f1_scores = []
        recall_scores = []
        precision_scores = []

        param_setup = { # Randomly select from param distributions
            "onset_threshold": np.random.choice(params["onset_threshold"]),
            "frame_threshold": np.random.choice(params["frame_threshold"]),
            "minimum_note_length": np.random.choice(params["minimum_note_length"]),
            "minimum_frequency": np.random.choice(params["minimum_frequency"]),
            "maximum_frequency": np.random.choice(params["maximum_frequency"]),
            "multiple_pitch_bends": np.random.choice(params["multiple_pitch_bends"]),
            "melodia_trick": np.random.choice(params["melodia_trick"])
        }

        # Make predictions on all training data and save scores
        for i, audio_file in enumerate(os.listdir("./train/audio")):
            note_events = predict(
                audio_path=f"./train/audio/{audio_file}",
                model_or_model_path=MODEL,
                onset_threshold=param_setup["onset_threshold"],
                frame_threshold=param_setup["frame_threshold"],
                minimum_note_length=param_setup["minimum_note_length"],
                minimum_frequency=param_setup["minimum_frequency"],
                maximum_frequency=param_setup["maximum_frequency"],
                multiple_pitch_bends=param_setup["multiple_pitch_bends"],
                melodia_trick=param_setup["melodia_trick"]
            )[2]
            pred_intervals, pred_pitches = preprocess_note_events(note_events)

            annotation_files = os.listdir("./train/annotation")
            true_intervals, true_pitches = load_test_note_events(
                f"./train/annotation/{annotation_files[i]}"
            )

            scores = classification_scores(
                true_intervals, true_pitches,
                pred_intervals, pred_pitches
            )
            precision_scores.append(scores[0])
            recall_scores.append(scores[1])
            f1_scores.append(scores[2])

        # Add hyperparameter setup and mean scores to results list
        results.append({
            "param_setup": param_setup,
            "mean_f1_score": np.mean(f1_scores),
            "mean_recall_score": np.mean(recall_scores),
            "mean_precision_score": np.mean(precision_scores),
        })

    return results

In [None]:
param_distributions = {
    "onset_threshold": np.linspace(0.1, 0.9),
    "frame_threshold": np.linspace(0.1, 0.9),
    "minimum_note_length": np.linspace(70, 140),
    "minimum_frequency": [None],
    "maximum_frequency": [None],
    "multiple_pitch_bends": [True, False],
    "melodia_trick": [True, False]
}

random_search(param_distributions, iterations=5)

### Preliminary Best Hyperparameter Setup:

- onset_threshold: **0.753061224489796**

- frame_threshold: **0.42653061224489797**

- minimum_note_length: **118.57142857142857**

- minimum_frequency: **None**

- maximum_frequency: **None**

- multiple_pitch_bends: **True**

- melodia_trick: **False**

### Best Model Scores:

- mean_f1_score: **0.770**

- mean_recall_score: **0.778**

- mean_precision_score: **0.798**