In [None]:
from basic_pitch.inference import Model, predict
from basic_pitch import ICASSP_2022_MODEL_PATH
import numpy as np
import py_midicsv as pm
import mir_eval
import librosa
import pandas as pd

# Preload the Basic Pitch model
model = Model(ICASSP_2022_MODEL_PATH)




In [None]:
import demucs.separate

# Perform demucs guitar separation on test song
separation_args = [
    "--two-stems", "guitar"
    "-n", "htdemucs_6s"
    "-o", "./train"
    "-d", "cuda"
    "--float32",
    "./tracks/test-song.wav"
]
demucs.separate.main(separation_args)

In [None]:
def preprocess_note_events(note_events: list) -> tuple[np.ndarray, np.ndarray]:
    events = []
    for e in note_events:
        # Intervals need to positive so set offset to arbitrary
        # value higher than onset as offsets will be ignored regardless
        events.append([e[0], e[0]+1e-6, e[2]])
    
    df = pd.DataFrame(
        events, columns=["note_on", "note_off", "midi_pitch"]
    ).sort_values("note_on")

    intervals = df[["note_on", "note_off"]].to_numpy()
    
    # mir_eval needs pitches in Hz
    pitches = librosa.midi_to_hz(df["midi_pitch"].to_numpy())

    return intervals, pitches

In [None]:
def load_test_note_events(csv_path: str) -> tuple[np.ndarray, np.ndarray]:
    # Read saved CSV as a DataFrane
    df = pd.read_csv(
        csv_path,
        sep=None,
        engine="python",
        index_col=False
    )

    # Find time and tempo info to get song note_on times in seconds
    # to allow comparison of predicted notes and "true" notes
    PPQ = int(df.columns[5])
    TEMPO = df.loc[df[" Header"] == " Tempo"][df.columns[3]].iloc[0]
    BPM = 60_000_000 / int(TEMPO)
    TICK_IN_S = 60 / (BPM * PPQ)

    df.drop(df.columns[[0,3,5]], axis=1, inplace=True) # Drop unnecessary cols

    note_on_events = df.loc[df[" Header"] == " Note_on_c"].copy().sort_values(" 0")
    note_on_events[" 0"] *= TICK_IN_S # Convert times to seconds

    ons = note_on_events[" 0"].to_numpy()
    offs = (note_on_events[" 0"] + 1e-6).to_numpy() # Fake offsets (will not be used)
    intervals = np.stack((ons, offs), axis=1) # Create intervals ndarray

    # mir_eval needs pitches in Hz
    pitches = librosa.midi_to_hz(note_on_events[" 3"].to_numpy().astype(int))

    return intervals, pitches

In [None]:
note_events = predict( # Get note events from Basic Pitch prediction
    "./train/htdemucs_6s/test-song/guitar.wav",
    model_or_model_path=model,
)[2]

pred_intervals, pred_pitches = preprocess_note_events(note_events)

In [None]:
# Parse the tab-converted midi file as a CSV file
csv_string_list = pm.midi_to_csv("./test/test-song.mid")

# Write the CSV file
with open("test-song.csv", "w") as f:
    f.writelines(csv_string_list)

test_intervals, test_pitches = load_test_note_events("test-song.csv")

In [None]:
# Validate the annotations before evaluating predictions
mir_eval.transcription.validate(
    test_intervals, test_pitches,
    pred_intervals, pred_pitches
)

In [26]:
# Get the classification metrics for the predicted notes against the ground truth tabbed notes
precision, recall, f1, avg_overlap_ratio = mir_eval.transcription.precision_recall_f1_overlap(
    test_intervals, test_pitches,
    pred_intervals, pred_pitches,
    offset_ratio=None # Ignore note offsets
)

print(f"""
F1 Score: {f1}
Precision: {precision}
Recall: {recall}
""")


F1 Score: 0.10174855605665005
Precision: 0.15269532177630016
Recall: 0.07629330802088277



The default model setup achieved an F1 score of **~0.10**.

The model's poor performance on the test song could be caused by factors such as:

- Bleed and noise from guitar separation negatively impacting predictions
- The ground truth (guitar tab converted to MIDI file) is inaccurate
- The model's training data was not representative enough of certain genres