In [None]:
import librosa
import crepe
import mir_eval
import numpy as np
from IPython.display import Audio, display

In [None]:
# Package-level import so that `IPython.display.*` can be referenced by its full path.
import IPython

# Plotting setup: pyplot interface plus the notebook-wide default figure size.
import matplotlib.pyplot as plt
# NOTE(review): none of these pylab names are referenced in the visible cells — confirm they are still needed.
from pylab import plot, show, figure, imshow
# Default size for figures that do not pass their own `figsize`.
plt.rcParams['figure.figsize'] = (15, 6)

# NumPy bound under its full module name (the first cell also binds it as `np`).
import numpy

In [None]:
import essentia.standard as es

# Input recording. The path is machine-specific: point it at a local audio file.
# A previously used alternative input (kept for reference, not loaded):
#   '/Users/fernando/dev/upf/mir/motif-detection/John Williams & London Symphony Orchestra - Star Wars - The Ultimate Digital Collection (2016 - Soundtracks) [Flac 24-44_192]/46. John Williams & London Symphony Orchestra - Episode IV - Main Title.flac'
audiofile = "/Users/fernando/Downloads/other.flac"
sr = 44100

# Melodia analysis settings, named so the time axis computed below stays tied
# to the hop size that the extractor actually uses.
MELODIA_FRAME_SIZE = 2048
MELODIA_HOP_SIZE = 128

# Load audio file.
# It is recommended to apply equal-loudness filter for PredominantPitchMelodia.
loader = es.EqloudLoader(filename=audiofile, sampleRate=sr)
audio = loader()
print("Duration of the audio sample [sec]:")
print(len(audio)/sr)

# Extract the pitch curve.
# PredominantPitchMelodia takes the entire audio signal as input (no frame-wise
# processing is required). The sample rate is passed explicitly so the extractor
# and the loader cannot drift apart if `sr` is edited.
pitch_extractor = es.PredominantPitchMelodia(
    frameSize=MELODIA_FRAME_SIZE,
    hopSize=MELODIA_HOP_SIZE,
    sampleRate=sr,
)
pitch_values, pitch_confidence = pitch_extractor(audio)

# Pitch is estimated once per hop, so frame i sits at i * hop / sr seconds.
# (Stretching a linspace over the clip duration only approximates this spacing.)
pitch_times = numpy.arange(len(pitch_values)) * MELODIA_HOP_SIZE / sr

# Plot the estimated pitch contour and confidence over time.
f, axarr = plt.subplots(2, sharex=True)
axarr[0].plot(pitch_times, pitch_values)
axarr[0].set_title('estimated pitch [Hz]')
axarr[1].plot(pitch_times, pitch_confidence)
axarr[1].set_title('pitch confidence')
axarr[1].set_xlabel('time [s]')
plt.show()

In [None]:
# Play the equal-loudness-filtered signal loaded above.
# The rate comes from `sr` (the rate the loader was given) instead of a literal,
# so playback speed stays correct if the loading rate is ever changed.
IPython.display.Audio(audio, rate=sr)

In [None]:
# Time stamp for every pitch frame, spread evenly across the clip's duration.
duration_seconds = len(audio) / float(sr)
frame_count = len(pitch_values)
pitch_times = np.linspace(0.0, duration_seconds, frame_count)

# Synthesize a sine tone that follows the detected pitch contour.
# No confidence gating is applied: every frame's pitch value is used as-is.
sonification = mir_eval.sonify.pitch_contour(pitch_times, pitch_values, fs=sr)

# Leaving the player as the cell's last expression makes the notebook render it.
Audio(sonification, rate=sr)

In [None]:
def estimate_pitch(
    audio_path: str,
    use_viterbi: bool = False,
    model_capacity="full",
    crepe_verbose_level=1,
    seconds_to_analyze=None,
    resample_sr=44100,
):
    """Track the fundamental frequency of an audio file with CREPE.

    The file is decoded by librosa directly at ``resample_sr``. Calling
    ``librosa.load`` without ``sr`` would first resample to librosa's 22050 Hz
    default, so a later upsample to ``resample_sr`` could not restore what
    that first step discarded and would only cost time.

    Parameters
    ----------
    audio_path : str
        Path of the audio file to analyze.
    use_viterbi : bool
        Forwarded to ``crepe.predict`` as ``viterbi``.
    model_capacity : str
        Forwarded to ``crepe.predict`` as ``model_capacity``.
    crepe_verbose_level : int
        Forwarded to ``crepe.predict`` as ``verbose``.
    seconds_to_analyze : float or None
        When truthy, only the first ``seconds_to_analyze`` seconds are
        analyzed; ``None`` (or 0) analyzes the whole file.
    resample_sr : int or None
        Sample rate the audio is loaded at and reported to CREPE. ``None`` is
        passed through to ``librosa.load``, which then keeps the file's
        native rate.

    Returns
    -------
    time, frequency, confidence, activation
        The four values returned by ``crepe.predict``, unchanged. No voicing
        threshold is applied; callers can mask ``frequency`` with
        ``confidence`` themselves.
    """
    # Ask librosa for the target rate up front: one resample instead of
    # "down to 22050 Hz, then back up".
    y, sr = librosa.load(audio_path, sr=resample_sr)

    if seconds_to_analyze:
        # Keep only the leading excerpt, measured in samples at the loaded rate.
        y = y[: int(sr * seconds_to_analyze)]

    time, frequency, confidence, activation = crepe.predict(
        audio=y,
        sr=sr,
        viterbi=use_viterbi,
        model_capacity=model_capacity,
        verbose=crepe_verbose_level,
    )

    return time, frequency, confidence, activation

In [None]:
def plot_pitch(time, frequency, confidence, activation):
    """
    Draw a three-panel overview of a pitch-tracking result and show it.

    Parameters
    ----------
    time : array_like
        Time stamps of the pitch frames; used as the x values of the first two panels.
    frequency : array_like
        Estimated fundamental frequency (F0) in Hz, one value per time stamp.
    confidence : array_like
        Confidence value attached to each F0 estimate.
    activation : array_like
        2D array with one row per frame and one column per pitch bin. It is
        transposed for display, so bins run vertically and frames horizontally.

    Notes
    -----
    Panel 1 plots F0 against time, panel 2 plots confidence against time, and
    panel 3 shows the activation matrix as an image. The image's x ticks are
    placed every 500 frames and labelled with ``frame / 100`` (i.e. 100 frames
    are treated as one second); its y ticks are placed every 35 bins and
    labelled with the bin frequency in Hz, computed from 360 bins spaced
    20 cents apart starting at 32.7 Hz. That starting value is the workaround
    for a known CREPE issue [1]_.

    Nothing is returned; the figure is rendered directly.

    References
    ----------
    .. [1] https://github.com/marl/crepe/issues/2
    """
    fig, axes = plt.subplots(ncols=1, nrows=3, figsize=(12, 8), sharex=False)

    # The two line panels differ only in the series drawn and their captions.
    line_panels = (
        (axes[0], frequency, "Estimated F0 (Hz)", "F0 Estimate Over Time"),
        (axes[1], confidence, "Confidence", "Estimate Confidence Over Time"),
    )
    for panel, series, y_caption, heading in line_panels:
        panel.plot(time, series)
        panel.set_xlabel("Time (s)")
        panel.set_ylabel(y_caption)
        panel.set_title(heading)

    # Frequency (Hz) represented by each activation bin.
    lowest_hz = 32.7  # Hz, fix for a known issue in CREPE
    lowest_cents = mir_eval.melody.hz2cents(np.array([lowest_hz]))[0]
    bin_cents = lowest_cents + 20 * np.arange(0, 360)
    bin_hz = 10 * 2 ** (bin_cents / 1200)

    # Tick positions and labels for the activation image.
    frame_ticks = np.arange(0, len(activation), 500)
    second_labels = (frame_ticks / 100).astype(int)
    bin_ticks = np.arange(0, len(bin_hz), 35)
    hz_labels = bin_hz[::35].astype(int).tolist()

    heatmap = axes[2]
    heatmap.imshow(activation.T, origin="lower", aspect="auto")
    heatmap.set_xticks(frame_ticks)
    # Ticks are set before the limits so the explicit y-range below has the final say.
    heatmap.set_yticks(bin_ticks)
    heatmap.set_yticklabels(hz_labels)
    heatmap.set_ylim([0, 300])
    heatmap.set_xticklabels(second_labels)
    heatmap.set_xlabel("Time (s)")
    heatmap.set_ylabel("Frequency")
    heatmap.set_title("Activation Matrix: 20 Cent Bins Over Time")

    plt.tight_layout()
    plt.show()


In [None]:
# CREPE analysis of the same recording: track the pitch, plot it, then listen to it.
sr = 44100
time, frequency, confidence, activation = estimate_pitch(
    audio_path=audiofile,
    resample_sr=sr,
    use_viterbi=True,
)
plot_pitch(time, frequency, confidence, activation)

# Turn the CREPE contour into an audible tone and embed a player for it.
sonification = mir_eval.sonify.pitch_contour(time, frequency, fs=sr)
display(Audio(sonification, rate=sr))

In [None]:
# Rebuild and replay the sonification from the `time` / `frequency` arrays
# already in the kernel, without running the pitch tracker again.
sr = 44100
sonification = mir_eval.sonify.pitch_contour(time, frequency, fs=sr)
crepe_player = Audio(sonification, rate=sr)
display(crepe_player)