# Importing packages

In [4]:
import numpy as np
import mirdata
from pesto import load_model
import torch
import mir_eval
import pesto
import time
from tqdm import tqdm

In [1]:
# for WSL / crepe
import mirdata
import mir_eval
import time
from tqdm import tqdm
import crepe
import numpy as np

# Benchmarking on MDB-stem-synth with PESTO


In [2]:
dataset = mirdata.initialize("mdb_stem_synth")
# dataset.download() #this line has to be commented and re-run if the dataset is not already installed
# dataset.validate()

### Single track (test) benchmarking

In [3]:


# ---- dataset / track ----
track_id = "AClassicEducation_NightOwl_STEM_01"
track = dataset.track(track_id)  # assumes `dataset` already exists
audio, sr = track.audio  # mirdata returns (samples, channels) usually

# ---- mono + torch tensor (PESTO guideline) ----
audio_mono = audio.mean(axis=-1) if audio.ndim > 1 else audio
x = torch.from_numpy(audio_mono).float()  # (num_samples,)

In [None]:

# ---- device + model (load once) ----
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
step_size_ms = 20.0

# Important: pass sampling_rate to match your data (repo shows this pattern in advanced usage)
pesto_model = load_model("mir-1k_g7", step_size=step_size_ms, sampling_rate=sr).to(device)
pesto_model.eval()

# ---- inference ----
with torch.no_grad():
    # Repo example uses: f0, conf, amp = f0_estimator(x, convert_to_freq=True, return_activations=False)
    f0, conf, amp = pesto_model(
        x.to(device),
        convert_to_freq=True,
        return_activations=False,
    )

In [19]:
# ---- move to numpy and build timestamps (repo: step_size in ms; hop is derived from it) ----
f0_pred = f0.detach().cpu().numpy().squeeze()
times_pred = np.arange(f0_pred.shape[-1]) * (step_size_ms / 1000.0)

# mir_eval expects unvoiced = 0 Hz (not NaN)
f0_pred = np.nan_to_num(f0_pred, nan=0.0)

# ---- reference ----
ref_times = track.f0.times
ref_freqs = track.f0.frequencies

# ---- metrics ----
scores = mir_eval.melody.evaluate(ref_times, ref_freqs, times_pred, f0_pred)

print(f"--- Results for Track: {track_id} ---")
print(f"Raw Pitch Accuracy (RPA): {scores['Raw Pitch Accuracy']:.4f}")
print(f"Raw Chroma Accuracy (RCA): {scores['Raw Chroma Accuracy']:.4f}")

--- Results for Track: AClassicEducation_NightOwl_STEM_01 ---
Raw Pitch Accuracy (RPA): 0.8691
Raw Chroma Accuracy (RCA): 0.9100


### Full dataset bechmarking

In [5]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
step_size_ms = 20.0

metrics = [
    "Overall Accuracy",
    "Raw Pitch Accuracy",
    "Raw Chroma Accuracy",
    "Voicing Recall",
    "Voicing False Alarm",
]

# load once (reload only if sr changes)
track0 = dataset.track(dataset.track_ids[0])
audio0, sr0 = track0.audio

pesto_model = load_model("mir-1k_g7", step_size=step_size_ms, sampling_rate=sr0).to(device).eval()

sum_w = 0.0
sum_scores = {m: 0.0 for m in metrics}

with torch.no_grad():
    for track_id in tqdm(dataset.track_ids, desc="Evaluating tracks"):
        track = dataset.track(track_id)

        audio, sr = track.audio
        if sr != sr0:
            sr0 = sr
            pesto_model = load_model("mir-1k_g7", step_size=step_size_ms, sampling_rate=sr0).to(device).eval()

        audio_mono = audio.mean(axis=-1) if audio.ndim > 1 else audio
        x = torch.from_numpy(audio_mono).float().to(device)

        f0, conf, amp = pesto_model(x, convert_to_freq=True, return_activations=False)

        f0_pred = f0.detach().cpu().numpy().squeeze()
        f0_pred = np.nan_to_num(f0_pred, nan=0.0)
        times_pred = np.arange(f0_pred.shape[-1]) * (step_size_ms / 1000.0)

        ref_times = track.f0.times
        ref_freqs = track.f0.frequencies

        scores = mir_eval.melody.evaluate(ref_times, ref_freqs, times_pred, f0_pred)

        w = float(ref_times.shape[0])
        sum_w += w
        for m in metrics:
            sum_scores[m] += scores[m] * w

final_scores = {m: (sum_scores[m] / sum_w) for m in metrics}
print("Scores on MDB with PESTO: ",final_scores)


Evaluating tracks: 100%|██████████| 230/230 [17:18<00:00,  4.52s/it]

Scores on MDB with PESTO:  {'Overall Accuracy': np.float64(0.39504905628006476), 'Raw Pitch Accuracy': np.float64(0.9098419948224401), 'Raw Chroma Accuracy': np.float64(0.9358192913900837), 'Voicing Recall': np.float64(0.9999996735270973), 'Voicing False Alarm': np.float64(0.9998373050855409)}





# Benchmarking on Orchset with PESTO

In [None]:
dataset = mirdata.initialize("orchset")
#dataset.download()
#dataset.validate()

100%|██████████| 1/1 [00:00<00:00, 59.39it/s]
100%|██████████| 64/64 [00:02<00:00, 31.10it/s]


({'metadata': {}, 'tracks': {}}, {'metadata': {}, 'tracks': {}})

### Single track (test) benchmarking

In [None]:
# orchset
track_id = dataset.track_ids[0]
track = dataset.track(track_id)

# --- audio (orchset-specific) ---
audio, sr = track.audio_mono  # <-- correct for Orchset :contentReference[oaicite:1]{index=1}
x = torch.from_numpy(audio).float()

# --- pesto ---
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
step_size_ms = 20.0
model = load_model("mir-1k_g7", step_size=step_size_ms, sampling_rate=sr).to(device).eval()

with torch.no_grad():
    f0, conf, amp = model(x.to(device), convert_to_freq=True, return_activations=False)

f0_pred = f0.detach().cpu().numpy().squeeze()
times_pred = np.arange(f0_pred.shape[-1]) * (step_size_ms / 1000.0)
f0_pred = np.nan_to_num(f0_pred, nan=0.0)

# --- reference (orchset-specific) ---
ref_times = track.melody.times
ref_freqs = track.melody.frequencies

scores = mir_eval.melody.evaluate(ref_times, ref_freqs, times_pred, f0_pred)
print(f"--- orchset / {track_id} ---")
print(f"RPA: {scores['Raw Pitch Accuracy']:.4f}")
print(f"RCA: {scores['Raw Chroma Accuracy']:.4f}")


--- orchset / Beethoven-S3-I-ex1 ---
RPA: 0.0368
RCA: 0.6468


### Full dataset Benchmarking

In [15]:
dataset = mirdata.initialize("orchset")

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
step_size_ms = 20.0

metrics = [
    "Overall Accuracy",
    "Raw Pitch Accuracy",
    "Raw Chroma Accuracy",
    "Voicing Recall",
    "Voicing False Alarm",
]

# load once (orchset has a fixed sr in practice, but keep this robust)
track0 = dataset.track(dataset.track_ids[0])
audio0, sr0 = track0.audio_mono
model = load_model("mir-1k_g7", step_size=step_size_ms, sampling_rate=sr0).to(device).eval()

sum_w = 0.0
sum_scores = {m: 0.0 for m in metrics}

with torch.no_grad():
    for track_id in dataset.track_ids:
        track = dataset.track(track_id)

        audio, sr = track.audio_mono
        if sr != sr0:
            sr0 = sr
            model = load_model("mir-1k_g7", step_size=step_size_ms, sampling_rate=sr0).to(device).eval()

        x = torch.from_numpy(audio).float().to(device)

        f0, conf, amp = model(x, convert_to_freq=True, return_activations=False)
        f0_pred = f0.detach().cpu().numpy().squeeze()
        f0_pred = np.nan_to_num(f0_pred, nan=0.0)
        times_pred = np.arange(f0_pred.shape[-1]) * (step_size_ms / 1000.0)

        ref_times = track.melody.times
        ref_freqs = track.melody.frequencies

        scores = mir_eval.melody.evaluate(ref_times, ref_freqs, times_pred, f0_pred)

        w = float(ref_times.shape[0])  # frame-weight by GT frames
        sum_w += w
        for m in metrics:
            sum_scores[m] += scores[m] * w

final_scores = {m: (sum_scores[m] / sum_w) for m in metrics}
print("Overall score for PESTO on orchestra: ",final_scores)


Overall score for PESTO on orchestra:  {'Overall Accuracy': np.float64(0.17283344364498154), 'Raw Pitch Accuracy': np.float64(0.18264329344654404), 'Raw Chroma Accuracy': np.float64(0.482965021786439), 'Voicing Recall': np.float64(0.9999686196039806), 'Voicing False Alarm': np.float64(0.9999546722490121)}


# Benchmarking on vocadito with PESTO

In [9]:
dataset = mirdata.initialize("vocadito")
# dataset.download()  # vocadito is downloadable from Zenodo :contentReference[oaicite:4]{index=4}
# dataset.validate()

### Single track (test) benchmarking

In [10]:
# ---- pick a track ----
track_id = dataset.track_ids[0]
track = dataset.track(track_id)

audio, sr = track.audio

# ---- mono ----
audio_mono = audio

x = torch.from_numpy(audio_mono).float()

# ---- model ----
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
step_size_ms = 20.0
pesto_model = load_model("mir-1k_g7", step_size=step_size_ms, sampling_rate=sr).to(device)
pesto_model.eval()

with torch.no_grad():
    f0, conf, amp = pesto_model(x.to(device), convert_to_freq=True, return_activations=False)

f0_pred = f0.detach().cpu().numpy().squeeze()
times_pred = np.arange(f0_pred.shape[-1]) * (step_size_ms / 1000.0)
f0_pred = np.nan_to_num(f0_pred, nan=0.0)

# ---- reference ----
ref_times = track.f0.times
ref_freqs = track.f0.frequencies

scores = mir_eval.melody.evaluate(ref_times, ref_freqs, times_pred, f0_pred)
print(track_id, scores["Raw Pitch Accuracy"], scores["Raw Chroma Accuracy"])


1 0.9884678747940692 0.9884678747940692


### Full-data set benchmarking

In [12]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
step_size_ms = 20.0

# model loaded once
track0 = dataset.track(dataset.track_ids[0])
_, sr0 = track0.audio
pesto_model = load_model("mir-1k_g7", step_size=step_size_ms, sampling_rate=sr0).to(device)
pesto_model.eval()

metrics = [
    "Overall Accuracy",
    "Raw Pitch Accuracy",
    "Raw Chroma Accuracy",
    "Voicing Recall",
    "Voicing False Alarm",
]

sum_w = 0.0
sum_scores = {m: 0.0 for m in metrics}

with torch.no_grad():
    for track_id in dataset.track_ids:
        track = dataset.track(track_id)
        audio, sr = track.audio

        # if sampling rates differ across tracks, reload model to match sr
        if sr != sr0:
            sr0 = sr
            pesto_model = load_model("mir-1k_g7", step_size=step_size_ms, sampling_rate=sr0).to(device)
            pesto_model.eval()

        audio_mono = audio.mean(axis=-1) if getattr(audio, "ndim", 1) > 1 else audio
        x = torch.from_numpy(audio_mono).float().to(device)

        f0, conf, amp = pesto_model(x, convert_to_freq=True, return_activations=False)
        f0_pred = f0.detach().cpu().numpy().squeeze()
        f0_pred = np.nan_to_num(f0_pred, nan=0.0)

        times_pred = np.arange(f0_pred.shape[-1]) * (step_size_ms / 1000.0)

        ref_times = track.f0.times
        ref_freqs = track.f0.frequencies

        scores = mir_eval.melody.evaluate(ref_times, ref_freqs, times_pred, f0_pred)

        w = float(ref_times.shape[0])  # frame-weighting by number of GT frames
        sum_w += w
        for m in metrics:
            sum_scores[m] += scores[m] * w

final_scores = {m: (sum_scores[m] / sum_w) for m in metrics}
print("Scores on Vocadito with PESTO: ",final_scores)


Scores on Vocadito with PESTO:  {'Overall Accuracy': np.float64(0.647838502081232), 'Raw Pitch Accuracy': np.float64(0.9811545604137208), 'Raw Chroma Accuracy': np.float64(0.9848056886964734), 'Voicing Recall': np.float64(0.9999869620011912), 'Voicing False Alarm': np.float64(0.9993370197333225)}


# Benchmarking MDB with CREPE

In [2]:
data_path = "/mnt/c/tmp/mir_datasets/mdb_stem_synth" 
dataset = mirdata.initialize("mdb_stem_synth", data_home=data_path)
# dataset.download()
# dataset.validate()

In [None]:

step_size_ms = 20.0
voicing_thresh = 0.5

metrics = ["Overall Accuracy", "Raw Pitch Accuracy", "Raw Chroma Accuracy", "Voicing Recall", "Voicing False Alarm"]
sum_w = 0.0
sum_scores = {m: 0.0 for m in metrics}

for track_id in tqdm(dataset.track_ids, desc="Evaluating tracks"):
    track = dataset.track(track_id)

    audio, sr = track.audio
    audio_mono = audio.mean(axis=-1) if audio.ndim > 1 else audio

    times_pred, f0_pred, conf, _ = crepe.predict(
        audio_mono,
        sr,
        step_size=step_size_ms,
        viterbi=False,
        verbose=0,
    )

    f0_pred = np.nan_to_num(f0_pred, nan=0.0)
    f0_pred = np.where(conf >= voicing_thresh, f0_pred, 0.0)

    ref_times = track.f0.times
    ref_freqs = track.f0.frequencies

    scores = mir_eval.melody.evaluate(ref_times, ref_freqs, times_pred, f0_pred)

    w = float(ref_times.shape[0])
    sum_w += w
    for m in metrics:
        sum_scores[m] += scores[m] * w

final_scores = {m: (sum_scores[m] / sum_w) for m in metrics}
print(final_scores)


Evaluating tracks:   0%|          | 0/230 [00:00<?, ?it/s]2026-01-04 01:29:42.934421: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  if not hasattr(np, "object"):
I0000 00:00:1767486585.557484   23439 gpu_device.cc:2020] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 5563 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 4060, pci bus id: 0000:01:00.0, compute capability: 8.9
2026-01-04 01:29:51.705775: I external/local_xla/xla/service/service.cc:163] XLA service 0x7b470c00c0b0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2026-01-04 01:29:51.705815: I external/local_xla/xla/service/service.cc:171]   StreamExecutor device (0): NVIDIA GeForce RTX 4060, Compute Capability 8.9
2026-01-04 01:29:

# Benchmarking Orhcest with CREPE

In [None]:
data_path = "/mnt/c/tmp/mir_datasets/orchset" 
dataset = mirdata.initialize("orchset", data_home=data_path)
# dataset.download()
# dataset.validate()

32.0kB [00:00, 143kB/s]                             
100%|██████████| 1/1 [00:00<00:00, 46.54it/s]
100%|██████████| 64/64 [00:19<00:00,  3.25it/s]


({'metadata': {}, 'tracks': {}}, {'metadata': {}, 'tracks': {}})

In [9]:

step_size_ms = 20.0
voicing_thresh = 0.5

metrics = ["Overall Accuracy", "Raw Pitch Accuracy", "Raw Chroma Accuracy", "Voicing Recall", "Voicing False Alarm"]
sum_w = 0.0
sum_scores = {m: 0.0 for m in metrics}

for track_id in tqdm(dataset.track_ids, desc="Evaluating tracks"):
    track = dataset.track(track_id)

    audio, sr = track.audio_mono

    times_pred, f0_pred, conf, _ = crepe.predict(
        audio,
        sr,
        step_size=step_size_ms,
        viterbi=False,
        verbose=0,
    )

    f0_pred = np.nan_to_num(f0_pred, nan=0.0)
    f0_pred = np.where(conf >= voicing_thresh, f0_pred, 0.0)

    ref_times = track.melody.times
    ref_freqs = track.melody.frequencies

    scores = mir_eval.melody.evaluate(ref_times, ref_freqs, times_pred, f0_pred)

    w = float(ref_times.shape[0])
    sum_w += w
    for m in metrics:
        sum_scores[m] += scores[m] * w

final_scores = {m: (sum_scores[m] / sum_w) for m in metrics}
print("Scores on Orchestra with CREPE: ",final_scores)


Evaluating tracks: 100%|██████████| 64/64 [00:59<00:00,  1.08it/s]

Scores on Orchestra with CREPE:  {'Overall Accuracy': np.float64(0.15466404765463201), 'Raw Pitch Accuracy': np.float64(0.13194821346421762), 'Raw Chroma Accuracy': np.float64(0.365956993830301), 'Voicing Recall': np.float64(0.5789966514288181), 'Voicing False Alarm': np.float64(0.5298053465740868)}





# Benchmarking vocadito with CREPE

In [2]:
dataset = mirdata.initialize("vocadito")
# dataset.download()
# dataset.validate()


In [3]:
step_size_ms = 20.0
voicing_thresh = 0.5  # common default; tune if you want

metrics = ["Overall Accuracy", "Raw Pitch Accuracy", "Raw Chroma Accuracy", "Voicing Recall", "Voicing False Alarm"]
sum_w = 0.0
sum_scores = {m: 0.0 for m in metrics}

for track_id in tqdm(dataset.track_ids, desc="Evaluating tracks"):
    track = dataset.track(track_id)

    audio, sr = track.audio
    audio_mono = audio.mean(axis=-1) if audio.ndim > 1 else audio

    times_pred, f0_pred, conf, _ = crepe.predict(
        audio_mono,
        sr,
        step_size=step_size_ms,
        viterbi=False,
        verbose=0,
    )

    f0_pred = np.nan_to_num(f0_pred, nan=0.0)
    f0_pred = np.where(conf >= voicing_thresh, f0_pred, 0.0)

    ref_times = track.f0.times
    ref_freqs = track.f0.frequencies

    scores = mir_eval.melody.evaluate(ref_times, ref_freqs, times_pred, f0_pred)

    w = float(ref_times.shape[0])
    sum_w += w
    for m in metrics:
        sum_scores[m] += scores[m] * w

final_scores = {m: (sum_scores[m] / sum_w) for m in metrics}
print("Final scores on vocadito with CREPE: ", final_scores)


Evaluating tracks:   0%|          | 0/40 [00:00<?, ?it/s]2026-01-04 00:59:53.956767: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  if not hasattr(np, "object"):
I0000 00:00:1767484796.350275   10511 gpu_device.cc:2020] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 5563 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 4060, pci bus id: 0000:01:00.0, compute capability: 8.9
2026-01-04 00:59:58.711739: I external/local_xla/xla/service/service.cc:163] XLA service 0x77b27800bc60 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2026-01-04 00:59:58.711779: I external/local_xla/xla/service/service.cc:171]   StreamExecutor device (0): NVIDIA GeForce RTX 4060, Compute Capability 8.9
2026-01-04 00:59:5

Final scores on vocadito with CREPE:  {'Overall Accuracy': np.float64(0.891207682800358), 'Raw Pitch Accuracy': np.float64(0.9686287550960121), 'Raw Chroma Accuracy': np.float64(0.9718860625183027), 'Voicing Recall': np.float64(0.98306390568432), 'Voicing False Alarm': np.float64(0.2670181517978474)}



