# Basic Pitch Parameter Tuning for Optimal Guitar Performance

## Setup and Preloading

Necessary libraries are imported and the default Basic Pitch model is preloaded.

In [22]:
import os
import shutil
import xml.etree.ElementTree as ET
from pathlib import Path

import numpy as np
import pandas as pd

import mir_eval
import librosa
from basic_pitch.inference import Model, predict
from basic_pitch import ICASSP_2022_MODEL_PATH
import demucs.separate

# Preload the Basic Pitch model once so that every predict() call in this
# notebook can be handed the same instance through `model_or_model_path`.
MODEL = Model(ICASSP_2022_MODEL_PATH)

## Train-Test Split

The IDMT-SMT-Guitar Dataset's "Dataset 2", containing various monophonic and polyphonic guitar recordings, is split into a train and a test set at an 80:20 ratio.

In [33]:
AUDIO_DIR = Path("./audio")
ANNOTATION_DIR = Path("./annotation")

# Pair every recording with its annotation, then shuffle the pairs.
# os.listdir() returns entries in arbitrary order, so both listings are sorted
# before being stacked side by side; this assumes audio and annotation files
# share a naming scheme and therefore sort into the same order.
# np.stack raises if the two directories hold a different number of files.
ITEMS = np.stack(
    (sorted(os.listdir(AUDIO_DIR)), sorted(os.listdir(ANNOTATION_DIR))),
    axis=1
)
np.random.shuffle(ITEMS)  # Shuffles rows in place; each pair stays together

DATASET_SIZE = len(ITEMS)
TEST_SIZE = int(DATASET_SIZE * 0.2)

# Make sure the destinations exist as directories. shutil.move() onto a
# missing path renames the file to that path instead of moving it inside,
# so later moves could overwrite earlier ones.
for split in ("test", "train"):
    for kind in ("audio", "annotation"):
        Path(split, kind).mkdir(parents=True, exist_ok=True)

# Move random 20% of data to test set
for files in ITEMS[:TEST_SIZE]:
    shutil.move(AUDIO_DIR / files[0], "./test/audio")
    shutil.move(ANNOTATION_DIR / files[1], "./test/annotation")

# Move the remaining 80% to train set
for files in ITEMS[TEST_SIZE:]:
    shutil.move(AUDIO_DIR / files[0], "./train/audio")
    shutil.move(ANNOTATION_DIR / files[1], "./train/annotation")

## Data Formatting Functions

Functions are defined below that perform necessary postprocessing steps to correctly format the note events for use with the `mir_eval` library.

##### **pred_note_events()**

- Takes a resultant `note_events` list from the Basic Pitch model's `predict()` method and returns predicted intervals and pitches ndarrays.

##### **true_note_events()**

- Parses an "annotation" XML file from a provided path, returning true intervals and pitches ndarrays.

Note intervals need to be in the format of a 2D ndarray, and each interval needs a positive duration (offset > onset) for `mir_eval`. Therefore, offsets are set to an arbitrary placeholder value (onset + 1e-6), as they are ignored during evaluation (`offset_ratio=None`).

Pitches also need to be in Hz, so Librosa's `midi_to_hz()` method will be used to convert from MIDI notes to frequencies.

In [2]:
def pred_note_events(note_events: list) -> tuple[np.ndarray, np.ndarray]:
    """
    Turn a Basic Pitch ``note_events`` list into ``mir_eval`` inputs.

    Only element 0 (taken as the onset) and element 2 (taken as the MIDI
    pitch) of each event are read. The offset is a placeholder set to
    ``onset + 1e-6`` so that every interval has a positive duration.

    Returns a tuple ``(intervals, pitches)`` ordered by onset: ``intervals``
    holds one ``[onset, offset]`` row per note and ``pitches`` holds the
    matching frequencies in Hz.
    """
    rows = [[event[0], event[0] + 1e-6, event[2]] for event in note_events]

    table = pd.DataFrame(rows, columns=["note_on", "note_off", "midi_pitch"])
    table = table.sort_values("note_on")

    # Split the sorted table into the two arrays mir_eval expects
    onset_offset = table[["note_on", "note_off"]].to_numpy()
    midi_numbers = table["midi_pitch"].to_numpy()

    return onset_offset, librosa.midi_to_hz(midi_numbers)

In [3]:
def true_note_events(xml_path: str | Path) -> tuple[np.ndarray, np.ndarray]:
    """
    Parse an annotation XML file and return ground-truth intervals and
    pitches ndarrays for ``mir_eval``.

    Each ``./transcription/event`` element contributes one note, read from
    its ``onsetSec`` child (onset time) and its ``pitch`` child (MIDI note).
    Both children are read from the same event, so row ``k`` of the
    intervals always describes the same note as entry ``k`` of the pitches.

    Parameters:
        xml_path: Path of the annotation XML file.

    Returns:
        intervals: ndarray of shape ``(n, 2)`` holding ``[onset, onset+1e-6]``
            rows; the offset is only a placeholder. The array is 2-D even
            when the file contains no events.
        pitches: ndarray of ``n`` frequencies in Hz.

    Raises:
        ValueError: If an event has no ``onsetSec`` or no ``pitch`` child,
            or if either value cannot be parsed as a number.
    """
    root = ET.parse(xml_path).getroot()

    onsets = []
    midi_pitches = []
    for event in root.findall("./transcription/event"):
        onset_text = event.findtext("onsetSec")
        pitch_text = event.findtext("pitch")
        if onset_text is None or pitch_text is None:
            # Fail loudly rather than let the two arrays drift out of step
            raise ValueError(
                f"Event without onsetSec or pitch in annotation: {xml_path}"
            )
        onsets.append(float(onset_text))
        midi_pitches.append(int(pitch_text))  # MIDI note

    # Prepare ndarrays for mir_eval; reshape keeps the (n, 2) layout for n == 0
    intervals = np.array(
        [[onset, onset + 1e-6] for onset in onsets], dtype=float
    ).reshape(-1, 2)
    pitches = librosa.midi_to_hz(np.array(midi_pitches, dtype=float))

    return intervals, pitches

## Random Search Function

- Computes the mean F1, Recall and Precision scores over the training set for each of `iterations` random parameter setups, sampled from a given dictionary of parameter distributions, `params`.

- A results list containing scores and parameter configurations for each iteration is returned.

In [8]:
def random_search(
    params: dict[str, list | np.ndarray], iterations: int = 10
) -> list[dict]:
    """
    Score randomly sampled Basic Pitch parameter setups on the training data.

    For each iteration one value per parameter is drawn independently with
    ``np.random.choice``, notes are predicted for every file in
    ``./train/audio`` and compared with the matching file in
    ``./train/annotation``. Note offsets are ignored when scoring.

    Parameters:
        params: Maps each tuned ``predict()`` keyword (``onset_threshold``,
            ``frame_threshold``, ``minimum_note_length``,
            ``minimum_frequency``, ``maximum_frequency``,
            ``multiple_pitch_bends``, ``melodia_trick``) to the candidate
            values to sample from.
        iterations: Number of random setups to evaluate.

    Returns:
        One dict per iteration with the keys ``param_setup``,
        ``mean_f1_score``, ``mean_recall_score`` and
        ``mean_precision_score``.

    Raises:
        ValueError: If the audio and annotation directories hold a different
            number of files, since they could not be paired one-to-one.
    """
    # predict() keywords being tuned, in the order they are sampled
    tunable = (
        "onset_threshold",
        "frame_threshold",
        "minimum_note_length",
        "minimum_frequency",
        "maximum_frequency",
        "multiple_pitch_bends",
        "melodia_trick",
    )

    # os.listdir() order is arbitrary, so sort both listings before pairing
    # them by position; this assumes audio and annotation files share a
    # naming scheme and therefore sort into the same order. The listings do
    # not change between iterations, so they are read once.
    audio_files = sorted(os.listdir("./train/audio"))
    annotation_files = sorted(os.listdir("./train/annotation"))
    if len(audio_files) != len(annotation_files):
        raise ValueError(
            f"Found {len(audio_files)} audio files but "
            f"{len(annotation_files)} annotation files in ./train"
        )

    results = []

    # Get mean performance scores for `iterations` random parameter setups
    for _ in range(iterations):
        f1_scores = []
        recall_scores = []
        precision_scores = []

        # Randomly select from param distributions
        param_setup = {
            name: np.random.choice(params[name]) for name in tunable
        }

        # Make predictions on all training data and save scores
        for audio_file, annotation_file in zip(audio_files, annotation_files):
            note_events = predict(
                audio_path=f"./train/audio/{audio_file}",
                model_or_model_path=MODEL,
                **param_setup,
            )[2]
            pred_intervals, pred_pitches = pred_note_events(note_events)

            true_intervals, true_pitches = true_note_events(
                f"./train/annotation/{annotation_file}"
            )

            scores = mir_eval.transcription.precision_recall_f1_overlap(
                true_intervals, true_pitches,
                pred_intervals, pred_pitches,
                offset_ratio=None # Ignore note offsets
            )
            precision_scores.append(scores[0])
            recall_scores.append(scores[1])
            f1_scores.append(scores[2])

        # Add parameter setup and mean scores to results list
        results.append({
            "param_setup": param_setup,
            "mean_f1_score": np.mean(f1_scores),
            "mean_recall_score": np.mean(recall_scores),
            "mean_precision_score": np.mean(precision_scores),
        })

    return results

## Display Results Function

In [12]:
def display_results(results: list) -> None:
    """
    Print every parameter setup with its mean scores, then the best setup.

    Parameters:
        results: List of dicts, each holding the keys ``param_setup``,
            ``mean_f1_score``, ``mean_recall_score`` and
            ``mean_precision_score``.

    The "best setup" is reported as the 1-based number of the setup with the
    highest mean F1 score. An empty ``results`` list prints a short notice
    instead of raising.
    """
    if not results:
        # np.argmax raises on an empty sequence, so stop early
        print("No results to display.")
        return

    for number, setup in enumerate(results, start=1):
        print(f"Setup {number}.")
        print(f"Params: {setup['param_setup']}")
        print(f"Mean F1 Score: {setup['mean_f1_score']:.3f}")
        print(f"Mean Recall: {setup['mean_recall_score']:.3f}")
        print(f"Mean Precision: {setup['mean_precision_score']:.3f}\n")

    best_setup_idx = np.argmax([setup["mean_f1_score"] for setup in results])
    print("Best setup:")
    print(f"    {best_setup_idx+1}")

## Parameter Tuning

### Default Setup

In [None]:
# Baseline: every distribution holds exactly one value, so the sampled setup
# is fully determined. This is the "default" configuration reported below.
param_distributions = {
    "onset_threshold": [0.5],
    "frame_threshold": [0.3],
    "minimum_note_length": [127.7],
    "minimum_frequency": [None],
    "maximum_frequency": [None],
    "multiple_pitch_bends": [False],
    "melodia_trick": [True]
}

# A single iteration is enough because there is nothing to randomise
results = random_search(param_distributions, iterations=1)

In [None]:
# Print the scores of the default setup
display_results(results)

A model with default parameters achieved:

- mean_f1_score: **0.594**

- mean_recall_score: **0.811**

- mean_precision_score: **0.505**

### Round 1

In [None]:
# Round 1: broad sweep over both thresholds, the minimum note length and the
# two boolean switches; the frequency limits stay unset (None).
# np.linspace() is called without `num`, so each range contributes NumPy's
# default of 50 evenly spaced candidate values.
param_distributions = {
    "onset_threshold": np.linspace(0.5, 0.9),
    "frame_threshold": np.linspace(0.1, 0.5),
    "minimum_note_length": np.linspace(70, 140),
    "minimum_frequency": [None],
    "maximum_frequency": [None],
    "multiple_pitch_bends": [True, False],
    "melodia_trick": [True, False]
}

# Uses random_search's default number of iterations
results = random_search(param_distributions)

In [None]:
# Print every Round 1 setup with its mean scores and the best setup number
display_results(results)

#### Best Found Setup:

- onset_threshold: **0.753**

- frame_threshold: **0.427**

- minimum_note_length: **118.6**

- minimum_frequency: **None**

- maximum_frequency: **None**

- multiple_pitch_bends: **True**

- melodia_trick: **False**

#### Best Scores:

- mean_f1_score: **0.770**

- mean_recall_score: **0.778**

- mean_precision_score: **0.798**

### Round 2

In [None]:
# Round 2: the numeric ranges are narrowed around the best setup found in
# Round 1; both boolean switches are still sampled.
param_distributions = {
    "onset_threshold": np.linspace(0.65, 0.85),
    "frame_threshold": np.linspace(0.32, 0.52),
    "minimum_note_length": np.linspace(109, 129),
    "minimum_frequency": [None],
    "maximum_frequency": [None],
    "multiple_pitch_bends": [True, False],
    "melodia_trick": [True, False]
}

# Uses random_search's default number of iterations
results = random_search(param_distributions)

In [None]:
# Print every Round 2 setup with its mean scores and the best setup number
display_results(results)

#### Best Found Setup:

- onset_threshold: **0.789**

- frame_threshold: **0.414**

- minimum_note_length: **113.5**

- minimum_frequency: **None**

- maximum_frequency: **None**

- multiple_pitch_bends: **False**

- melodia_trick: **False**

#### Best Scores:

- mean_f1_score: **0.774**

- mean_recall_score: **0.776**

- mean_precision_score: **0.808**

### Round 3

In [None]:
# Round 3: only the two thresholds are still searched, over tighter ranges.
# The minimum note length and both boolean switches are pinned to the values
# of the best setup found in Round 2.
param_distributions = {
    "onset_threshold": np.linspace(0.75, 0.85),
    "frame_threshold": np.linspace(0.41, 0.43),
    "minimum_note_length": [113.5],
    "minimum_frequency": [None],
    "maximum_frequency": [None],
    "multiple_pitch_bends": [False],
    "melodia_trick": [False]
}

# Uses random_search's default number of iterations
results = random_search(param_distributions)

In [None]:
# Print every Round 3 setup with its mean scores and the best setup number
display_results(results)

#### Best Found Setup:

- onset_threshold: **0.805**

- frame_threshold: **0.417**

- minimum_note_length: **113.5**

- minimum_frequency: **None**

- maximum_frequency: **None**

- multiple_pitch_bends: **False**

- melodia_trick: **False**

#### Best Scores:

- mean_f1_score: **0.776**

- mean_recall_score: **0.771**

- mean_precision_score: **0.818**

### Round 4

In [None]:
# Round 4: thresholds, note length and boolean switches are pinned to the
# best setup found in Round 3; only the frequency limits are searched.
param_distributions = {
    "onset_threshold": [0.805],
    "frame_threshold": [0.417],
    "minimum_note_length": [113.5],
    "minimum_frequency": np.linspace(1, 80),
    "maximum_frequency": np.linspace(1600, 5000),
    "multiple_pitch_bends": [False],
    "melodia_trick": [False]
}

# Uses random_search's default number of iterations
results = random_search(param_distributions)

In [None]:
# Print every Round 4 setup with its mean scores and the best setup number
display_results(results)

#### Best Found Setup:

- onset_threshold: **0.805**

- frame_threshold: **0.417**

- minimum_note_length: **113.5**

- minimum_frequency: **52.6**

- maximum_frequency: **4722.4**

- multiple_pitch_bends: **False**

- melodia_trick: **False**

#### Best Scores:

- mean_f1_score: **0.779**

- mean_recall_score: **0.768**

- mean_precision_score: **0.827**

### Evaluation on Test Set

The optimal parameter setup found from **Round 4** achieved a mean F1 score on the train set of **0.779**. This outperformed the default setup's F1 of **0.594**.

In [None]:
test_f1_scores = []
test_recall_scores = []
test_precision_scores = []

# os.listdir() order is arbitrary, so sort both listings before pairing them
# by position; this assumes audio and annotation files share a naming scheme
# and therefore sort into the same order. The listings are read once, outside
# the loop.
test_audio_files = sorted(os.listdir("./test/audio"))
annotation_files = sorted(os.listdir("./test/annotation"))
if len(test_audio_files) != len(annotation_files):
    raise ValueError(
        f"Found {len(test_audio_files)} audio files but "
        f"{len(annotation_files)} annotation files in ./test"
    )

# Get scores of optimal model on test data
for audio_file, annotation_file in zip(test_audio_files, annotation_files):
    note_events = predict(
        audio_path=f"./test/audio/{audio_file}",
        model_or_model_path=MODEL,
        onset_threshold=0.805,
        frame_threshold=0.417,
        minimum_note_length=113.5,
        minimum_frequency=52.6,
        maximum_frequency=4722.4,
        multiple_pitch_bends=False,
        melodia_trick=False
    )[2]
    pred_intervals, pred_pitches = pred_note_events(note_events)

    true_intervals, true_pitches = true_note_events(
        f"./test/annotation/{annotation_file}"
    )

    scores = mir_eval.transcription.precision_recall_f1_overlap(
        true_intervals, true_pitches,
        pred_intervals, pred_pitches,
        offset_ratio=None # Ignore note offsets
    )
    test_precision_scores.append(scores[0])
    test_recall_scores.append(scores[1])
    test_f1_scores.append(scores[2])

In [21]:
# Report the per-file test scores averaged over the whole test set
print(f"""Mean Test F1 Score: {np.mean(test_f1_scores):.3f}
Mean Test Recall: {np.mean(test_recall_scores):.3f}
Mean Test Precision: {np.mean(test_precision_scores):.3f}
""")

Mean Test F1 Score: 0.800
Mean Test Recall: 0.805
Mean Test Precision: 0.841



The optimal parameter setup achieved a final mean F1 score of **0.8** on the test set.