In [1]:
import numpy as np
import librosa

In [18]:
def power_to_db(S):
    """Convert power values to decibels relative to a reference of 1.0.

    Every element is floored at ``1e-10`` before the logarithm is taken, so
    zero-valued input maps to -100 dB instead of ``-inf``.

    Parameters
    ----------
    S : array-like
        Input values; converted with ``np.asarray``.

    Returns
    -------
    np.ndarray
        ``10 * log10(max(S, 1e-10)) - 10 * log10(max(1.0, 1e-10))``,
        element-wise, with the same shape as ``S``.
    """
    floor = 1e-10
    reference = np.abs(1.0)

    db = 10.0 * np.log10(np.maximum(floor, np.asarray(S)))
    # The reference offset is 0 dB for a reference of 1.0; it is subtracted
    # in place so the result keeps the dtype of `db`.
    db -= 10.0 * np.log10(np.maximum(floor, reference))
    return db

In [19]:
def spectral_contrast(
    y=None,
    sr=22050,
    S=None,
    n_fft=2048,
    hop_length=512,
    win_length=None,
    window="hann",
    center=True,
    pad_mode="constant",
    fmin=200.0,
    n_bands=6,
    quantile=0.02,
    linear=False,
):
    """Compute the spectral contrast of a signal or of a magnitude spectrogram.

    The spectrum is split into ``n_bands + 1`` bands with edges
    ``0, fmin, 2*fmin, 4*fmin, ...``.  For every frame and band, the mean of
    the largest ``quantile`` fraction of magnitudes (peak) is compared with
    the mean of the smallest ``quantile`` fraction (valley).

    Parameters
    ----------
    y : np.ndarray, optional
        Audio time series.  Only used when ``S`` is not given.
    sr : number
        Sampling rate of ``y`` (or of the signal ``S`` was computed from).
    S : array-like, optional
        Precomputed magnitude spectrogram with frequency on axis ``-2`` and
        frames on axis ``-1``.  Takes precedence over ``y`` when both are
        given.
    n_fft : int
        FFT size.  When ``S`` is given and its number of frequency bins does
        not equal ``n_fft // 2 + 1``, ``n_fft`` is inferred as
        ``2 * (S.shape[-2] - 1)``.
    hop_length, win_length, window, center, pad_mode
        Forwarded unchanged to ``librosa.stft`` when the spectrogram is
        computed from ``y``.
    fmin : float
        Upper edge of the first band; must be positive.
    n_bands : int
        Number of octave bands above the first one.
    quantile : float
        Fraction of each band's bins used for the peak and valley estimates
        (at least one bin is always used).
    linear : bool
        If True return ``peak - valley``; otherwise return the difference of
        their ``power_to_db`` values.

    Returns
    -------
    np.ndarray
        Array of shape ``(..., n_bands + 1, n_frames)``.

    Raises
    ------
    ValueError
        If neither ``y`` nor ``S`` is given, if ``S`` has fewer than two
        dimensions, if ``fmin`` is not positive, or if a band contains no
        frequency bin (band above Nyquist or FFT resolution too coarse).
    """
    if S is not None:
        # Honour a caller-supplied spectrogram instead of recomputing (and
        # crashing when no audio was passed).
        S = np.asarray(S)
        if S.ndim < 2:
            raise ValueError("S must have at least two dimensions (frequency, frames)")
        if n_fft is None or n_fft // 2 + 1 != S.shape[-2]:
            # Keep the frequency grid consistent with the supplied bins.
            n_fft = 2 * (S.shape[-2] - 1)
    elif y is not None:
        S = np.abs(
            librosa.stft(
                y,
                n_fft=n_fft,
                hop_length=hop_length,
                win_length=win_length,
                center=center,
                window=window,
                pad_mode=pad_mode,
            )
        )
    else:
        raise ValueError("Either the audio signal `y` or a spectrogram `S` must be provided")

    if fmin <= 0:
        # A non-positive fmin would make `idx[0] - 1` below wrap around to
        # the last frequency bin and silently corrupt the result.
        raise ValueError("fmin must be a positive number")

    freq = np.fft.rfftfreq(n=n_fft, d=1.0 / sr)

    # Band edges: 0, fmin, 2*fmin, 4*fmin, ...
    octa = np.zeros(n_bands + 2)
    octa[1:] = fmin * (2.0 ** np.arange(0, n_bands + 1))

    shape = list(S.shape)
    shape[-2] = n_bands + 1

    valley = np.zeros(shape)
    peak = np.zeros_like(valley)

    for k, (f_low, f_high) in enumerate(zip(octa[:-1], octa[1:])):
        current_band = np.logical_and(freq >= f_low, freq <= f_high)
        idx = np.flatnonzero(current_band)

        if idx.size == 0:
            raise ValueError(
                "Frequency band [{}, {}] Hz contains no FFT bin; it exceeds "
                "Nyquist or n_fft is too small".format(f_low, f_high)
            )

        if k > 0:
            # Include the bin just below the band's lower edge.
            current_band[idx[0] - 1] = True

        if k == n_bands:
            # The last band absorbs every remaining bin up to Nyquist.
            current_band[idx[-1] + 1 :] = True

        sub_band = S[..., current_band, :]

        if k < n_bands:
            # Drop the top bin so it is not counted again by the next band.
            sub_band = sub_band[..., :-1, :]

        # Always take at least one bin from each side
        n_extreme = int(np.maximum(np.rint(quantile * np.sum(current_band)), 1))

        sorted_band = np.sort(sub_band, axis=-2)

        valley[..., k, :] = np.mean(sorted_band[..., :n_extreme, :], axis=-2)
        peak[..., k, :] = np.mean(sorted_band[..., -n_extreme:, :], axis=-2)

    if linear:
        return peak - valley
    return power_to_db(peak) - power_to_db(valley)


In [20]:
# Load the recording (samples plus sampling rate) and compute its spectral
# contrast with the function defined above.
# NOTE(review): the path is absolute and machine-specific; point it at a
# recording available on your machine before running this cell.
signal, sr = librosa.load(r"D:\Projects\Voice recognition\recording0.wav")
spectral_contrast(y=signal, sr=sr)

array([[ 6.88642393,  9.71209797, 12.59873334, 13.79632237, 18.22676486,
        13.44481881, 15.34101783, 15.84652123, 13.88513334,  9.25873392,
        15.77110586, 16.9565739 , 17.66981609,  8.93748794, 13.92299494,
        12.41159432, 12.65827335, 13.82421512, 14.57311675, 12.78438101,
        10.4298553 , 13.06622076,  9.79057847, 16.93565618,  9.85453673,
        13.66466786, 12.30447593, 13.71859718, 11.10882543, 11.31060864,
        16.82222406, 25.58817329, 25.9810368 , 27.85634187, 31.52234404,
        41.90309585, 30.06514327, 28.1719877 , 31.94585605, 29.42837327,
        31.01001082, 30.21189051, 28.81646869, 30.83547642, 28.63747527,
        36.55320356, 27.68688043, 22.37311574, 21.07928417, 22.70056777,
        16.71412906, 16.19775741, 15.19426854, 22.25144906, 20.35873861,
        22.57023056, 24.11939109, 32.03474592, 33.72150235, 30.07132573,
        26.76404061, 36.53684712, 27.90670384, 38.31478847, 31.11983788,
        32.07583132, 47.95950975, 33.18564226, 33.0

In [24]:
def normalize(S, norm=np.inf, axis=0):
    """Scale ``S`` so that every slice along ``axis`` has unit norm.

    Parameters
    ----------
    S : array-like
        Values to normalize.
    norm : number
        Which norm to use:

        - ``np.inf``  : maximum absolute value (default)
        - ``-np.inf`` : minimum absolute value
        - ``0``       : number of non-zero entries
        - otherwise   : ``sum(|S| ** norm) ** (1 / norm)``
    axis : int
        Axis along which the norm is computed.

    Returns
    -------
    np.ndarray
        Array with the shape of ``S``.  Floating-point and complex inputs
        keep their dtype; any other dtype (e.g. integers) yields ``float``
        so the normalized values are not truncated.

    Notes
    -----
    Slices whose norm is below the smallest normal ``float32`` value are
    returned unscaled to avoid dividing by (almost) zero.
    """
    S = np.asanyarray(S)
    threshold = np.finfo(np.float32).tiny

    mag = np.abs(S).astype(float)

    # The generic p-norm formula degenerates to 1 for +/-inf and divides by
    # zero for 0, so those cases need explicit handling.
    if norm == np.inf:
        length = np.max(mag, axis=axis, keepdims=True)
    elif norm == -np.inf:
        length = np.min(mag, axis=axis, keepdims=True)
    elif norm == 0:
        length = np.sum(mag > 0, axis=axis, keepdims=True, dtype=mag.dtype)
    else:
        length = np.sum(mag**norm, axis=axis, keepdims=True) ** (1.0 / norm)

    # Leave (near-)silent slices untouched instead of amplifying noise.
    length[length < threshold] = 1.0

    out_dtype = S.dtype if np.issubdtype(S.dtype, np.inexact) else float
    return (S / length).astype(out_dtype, copy=False)

In [25]:
def tonnetz(y=None, sr=22050, chroma=None):
    """Project a chromagram onto a 6-dimensional tonal-centroid basis.

    Parameters
    ----------
    y : np.ndarray, optional
        Audio time series.  Only used when ``chroma`` is not given, in which
        case the chromagram is computed with ``librosa.feature.chroma_cqt``.
    sr : number
        Sampling rate of ``y``.
    chroma : np.ndarray, optional
        Chromagram with pitch classes on axis ``-2`` and frames on axis
        ``-1``.  Each frame is L1-normalized before projection.

    Returns
    -------
    np.ndarray
        Array of shape ``(..., 6, n_frames)``.

    Raises
    ------
    ValueError
        If neither ``y`` nor ``chroma`` is provided.
    """
    if chroma is None:
        if y is None:
            raise ValueError(
                "Either the audio signal `y` or a chromagram `chroma` must be provided"
            )
        chroma = librosa.feature.chroma_cqt(y=y, sr=sr)

    # Position of every chroma bin on a 12-step pitch-class circle.
    dim_map = np.linspace(0, 12, num=chroma.shape[-2], endpoint=False)

    # Angular rate of each of the six basis rows (three pairs).
    scale = np.asarray([7.0 / 6, 7.0 / 6, 3.0 / 2, 3.0 / 2, 2.0 / 3, 2.0 / 3])
    V = np.multiply.outer(scale, dim_map)

    # Shifting every other row by a quarter period turns its cosine into a
    # sine, so each pair of rows forms (sin, cos) coordinates.
    V[::2] -= 0.5

    # Radii of the three pairs: fifths, minor, major.
    R = np.array([1, 1, 1, 1, 0.5, 0.5])
    phi = R[:, np.newaxis] * np.cos(np.pi * V)

    # `normalize` is the helper defined in this notebook.
    return np.einsum(
        "pc,...ci->...pi", phi, normalize(chroma, norm=1, axis=-2), optimize=True
    )