$f_{0}$ extraction using the Pitch Tracking Database from Graz University of Technology (PTDB-TUG)

First, import the packages used in the experiments

In [None]:
import os
import glob
import numpy as np
from tqdm import tqdm
import time

from sklearn.base import clone
from sklearn.pipeline import FeatureUnion
from sklearn.model_selection import train_test_split, ParameterGrid
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error, zero_one_loss
from sklearn.cluster import MiniBatchKMeans, KMeans
# from sklearn_extra.cluster import KMedoids
from sklearn.manifold import TSNE
from joblib import Parallel, delayed, dump, load
from pyrcn.echo_state_network import ESNRegressor
from pyrcn.base import InputToNode, PredefinedWeightsInputToNode, NodeToNode
from pyrcn.linear_model import IncrementalRegression
import matplotlib
from matplotlib import pyplot as plt
# Plot options shared by every figure in this notebook.
params = {
    'image.cmap': 'jet',
    'text.usetex': False,
    'font.size': 11,
    'axes.titlesize': 24,
    'axes.labelsize': 20,
    'lines.linewidth': 3,
    'lines.markersize': 10,
    'xtick.labelsize': 16,
    'ytick.labelsize': 16,
}
plt.rcParams.update(params)
# plt.rcParams['pdf.fonttype'] = 42
# plt.rcParams['ps.fonttype'] = 42
# Set once more on the matplotlib module itself so LaTeX rendering stays off.
matplotlib.rcParams['text.usetex'] = False

# Ask the notebook front end to render inline figures as both PNG and PDF.
from IPython.display import set_matplotlib_formats
set_matplotlib_formats('png', 'pdf')

import librosa
import librosa.display

Load the training, validation and test file lists and print the number of files in each split

In [None]:
# Read the three split lists (entries are later passed to librosa as audio
# file names) and print how many entries each list holds.
training_files, validation_files, test_files = (
    np.loadtxt(split_list, dtype=str)
    for split_list in (
        r"/projects/p_transcriber/SPEECH_DATA/SPEECH_DATAsplit/training.txt",
        r"/projects/p_transcriber/SPEECH_DATA/SPEECH_DATAsplit/validation.txt",
        r"/projects/p_transcriber/SPEECH_DATA/SPEECH_DATAsplit/test.txt",
    )
)
print("{0}\t{1}\t{2}".format(*map(len, (training_files, validation_files, test_files))))

Feature visualization

In [None]:
# Load the first training file at its native sampling rate. No resampling and
# no amplitude normalization is applied.
print(training_files[0])
x, sr = librosa.core.load(training_files[0], sr=None, mono=False)
# 10 ms hop and 40 ms analysis window, converted to samples.
hop_length = int(0.01*sr)
win_length = int(0.04*sr)
stft_frames = np.abs(librosa.stft(x, n_fft=2048, hop_length=hop_length, win_length=win_length))**2
S = librosa.power_to_db(stft_frames, ref=np.max)
X = S.T
print(sr)
# NOTE(review): this prints the shape of the single column 65 (one value per
# frame); extract_features() uses the slice [:, :65] instead - confirm intent.
print(X[:, 65].shape)
# Define time axis in seconds
t = np.arange(len(x)) / sr
plt.subplot(4, 1, 1)
plt.plot(t, x, color='gray')
# plt.xlabel('Time (seconds)')
# plt.ylabel('Amplitude')
plt.ylim([-0.3, 0.3])
plt.title("Speech signal")
# hop_length must be handed to specshow: otherwise it assumes its default hop
# of 512 samples and the time axis no longer matches the 10 ms hop used above.
plt.subplot(4, 1, 2)
librosa.display.specshow(stft_frames, sr=sr, hop_length=hop_length, y_axis='log', x_axis='time')
plt.title('Power spectrogram')
plt.colorbar()
plt.subplot(4, 1, 3)
librosa.display.specshow(S, sr=sr, hop_length=hop_length, y_axis='log', x_axis='time')
plt.title('Log-Power spectrogram')
plt.colorbar()
# Same compression as used in extract_features(): log(1 + power).
plt.subplot(4, 1, 4)
librosa.display.specshow(np.log(stft_frames + 1.0), sr=sr, hop_length=hop_length, y_axis='log', x_axis='time')
plt.title('Log(1 + power) spectrogram')
plt.colorbar()

Helper function for feature extraction

In [None]:
def extract_features(file_name):
    """
    Compute the input features and the reference targets of one audio file.

    The audio is loaded at its native sampling rate and converted to a
    log(1 + power) spectrogram (n_fft 2048, 10 ms hop, 40 ms window). The
    first 65 frequency bins of each frame are kept and stacked with the two
    preceding and the two following frames (edge frames are repeated), which
    gives 5 * 65 values per frame.

    The targets are read from the first two columns of the reference file
    whose name is derived from ``file_name`` by replacing "MIC"/"mic" with
    "REF"/"ref" and ".wav" with ".f0". They are written into a zero array
    starting at frame 2. Elsewhere in this notebook column 0 is used as the
    f0 value and column 1 as the voicing flag.

    Parameters
    ----------
    file_name : str
        Path of the audio file.

    Returns
    -------
    U : numpy.ndarray
        Stacked features, one row per STFT frame.
    y : numpy.ndarray
        Targets, one row per STFT frame and two columns.
    """
    signal, rate = librosa.core.load(file_name, sr=None, mono=False)
    hop = int(0.01*rate)
    window = int(0.04*rate)
    power = np.abs(librosa.stft(signal, n_fft=2048, hop_length=hop, win_length=window))**2
    log_power = np.log(power + 1.0)
    n_frames = log_power.T.shape[0]

    # Two repeated frames on either side let every frame see a +-2 context.
    padded = np.pad(log_power.T[:, :65], ((2, 2), (0, 0)), 'edge')
    U = np.concatenate([padded[shift:shift + n_frames, :] for shift in range(5)], axis=1)

    reference_name = file_name.replace("MIC", "REF").replace("mic", "ref").replace(".wav", ".f0")
    txt_data = np.loadtxt(reference_name, usecols=(0, 1))
    y = np.zeros(shape=(n_frames, 2))
    y[2:2+len(txt_data), :] = txt_data
    return U, y

Feature visualization

In [None]:
# Load the first training file at its native sampling rate. No resampling and
# no amplitude normalization is applied.
print(training_files[0])
x, sr = librosa.core.load(training_files[0], sr=None, mono=False)
# 10 ms hop and 40 ms analysis window, converted to samples.
hop_length = int(0.01*sr)
win_length = int(0.04*sr)
stft_frames = np.abs(librosa.stft(x, n_fft=2048, hop_length=hop_length, win_length=win_length))**2
S = librosa.power_to_db(stft_frames, ref=np.max)
# X holds the stacked context features that are actually fed to the ESN.
X, y = extract_features(training_files[0])
print(sr)
print(X.shape)
# Define time axis in seconds
t = np.arange(len(x)) / sr
plt.subplot(4, 1, 1)
plt.plot(t, x, color='gray')
# plt.xlabel('Time (seconds)')
# plt.ylabel('Amplitude')
plt.ylim([-0.3, 0.3])
plt.title("Speech signal")
# hop_length must be handed to specshow: otherwise it assumes its default hop
# of 512 samples and the time axis no longer matches the 10 ms hop used above.
plt.subplot(4, 1, 2)
librosa.display.specshow(stft_frames, sr=sr, hop_length=hop_length, y_axis='log', x_axis='time')
plt.title('Power spectrogram')
plt.colorbar()
plt.subplot(4, 1, 3)
librosa.display.specshow(S, sr=sr, hop_length=hop_length, y_axis='log', x_axis='time')
plt.title('Log-Power spectrogram')
plt.colorbar()
# NOTE(review): the rows of X.T are five stacked 65-bin frames, so the
# logarithmic frequency axis of this panel is only indicative.
plt.subplot(4, 1, 4)
librosa.display.specshow(X.T, sr=sr, hop_length=hop_length, y_axis='log', x_axis='time')
plt.title('Stacked log(1 + power) features')
plt.colorbar()

Define several error functions for $f_{0}$ extraction

In [None]:
def gpe(y_true, y_pred):
    r"""
    Gross pitch error.

    Proportion of the frames that are voiced (non-zero) in both the ground
    truth and the prediction for which the absolute pitch deviation exceeds
    20 % of the true pitch.

    Parameters
    ----------
    y_true : array-like, 1-D
        Reference pitch per frame, 0 for unvoiced frames.
    y_pred : array-like, 1-D
        Predicted pitch per frame, 0 for unvoiced frames.

    Returns
    -------
    float
        Gross pitch error in [0, 1]. It is 0.0 when no frame is voiced in
        both tracks, matching the convention used by ``fpe``, so that
        averaging over files is not poisoned by NaN.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    # The product is non-zero only where both tracks are voiced.
    idx = np.nonzero(y_true * y_pred)[0]
    if idx.size == 0:
        return 0.0
    return np.mean(np.abs(y_true[idx] - y_pred[idx]) > 0.2 * y_true[idx])


def vde(y_true, y_pred):
    """
    Voicing decision error.

    Proportion of frames for which an incorrect voiced/unvoiced decision is
    made. A frame counts as voiced when its value is non-zero, which is the
    same criterion the other metrics in this notebook use. Boolean flags,
    0/1 flags and pitch tracks (0 = unvoiced) are therefore all accepted.

    Parameters
    ----------
    y_true : array-like
        Reference voicing per frame.
    y_pred : array-like
        Predicted voicing per frame.

    Returns
    -------
    float
        Fraction of frames with a wrong voicing decision; 0.0 for empty input.

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_pred`` do not have the same shape.
    """
    voiced_true = np.asarray(y_true) != 0
    voiced_pred = np.asarray(y_pred) != 0
    if voiced_true.shape != voiced_pred.shape:
        # Refuse silent broadcasting of mismatching tracks.
        raise ValueError(
            "y_true and y_pred must have the same shape, got {0} and {1}".format(
                voiced_true.shape, voiced_pred.shape))
    if voiced_true.size == 0:
        return 0.0
    return float(np.mean(voiced_true != voiced_pred))


def fpe(y_true, y_pred):
    """
    Fine Pitch Error:

    Spread of the pitch deviation over the frames that are voiced in both
    tracks and whose absolute deviation is at most 20 % of the true pitch
    (i.e. frames without a gross pitch error).

    The value returned is 100 times the standard deviation of
    log2(y_pred / y_true) over those frames, or 0 if there is no such frame.

    NOTE(review): the previous description called the unit "cents"; with the
    factor 100 the result is in hundredths of an octave (cents would need a
    factor of 1200) - confirm which unit is intended.
    """
    both_voiced = (y_true * y_pred) != 0
    within_tolerance = np.abs(y_true - y_pred) <= 0.2 * y_true
    keep = np.flatnonzero(both_voiced & within_tolerance)
    if keep.size > 0:
        return 100 * np.std(np.log2(y_pred[keep] / y_true[keep]))
    return 0


def ffe(y_true, y_pred):
    """
    $f_{0}$ Frame Error:

    Share of frames that are wrong in any way. A frame counts as correct when
    the absolute deviation between prediction and ground truth is at most
    20 % of the true value. With 0 encoding unvoiced frames, this marks
    voicing mistakes as well as gross pitch errors as wrong.
    The result is one minus the fraction of correct frames.
    """
    acceptable = np.abs(y_true - y_pred) <= 0.2 * y_true
    n_correct = np.count_nonzero(acceptable)
    return 1 - n_correct / len(y_true)

Initialize an Echo State Network

In [None]:
# Building blocks of the baseline ESN: input weights, a 500-unit reservoir
# and an incremental regression as readout.
base_input_to_node = InputToNode(hidden_layer_size=500, activation='identity', k_in=10, input_scaling=0.1, bias_scaling=0.0, random_state=10)
base_node_to_node = NodeToNode(hidden_layer_size=500, spectral_radius=0.6, leakage=1.0, bias_scaling=1.0, k_rec=90, random_state=10)
base_reg = IncrementalRegression(alpha=1e-3)

# The reservoir object is called base_node_to_node; the former reference to
# "base_nodes_to_node" was an undefined name and raised a NameError.
base_esn = ESNRegressor(input_to_node=base_input_to_node,
                        node_to_node=base_node_to_node,
                        regressor=base_reg,
                        random_state=0)

Try to load a pre-trained ESN

In [None]:
# One path for loading and saving. Previously the model was loaded from this
# location but dumped into the working directory, so a freshly trained model
# was never found again on the next run.
sparse_esn_path = "dataset/f0_extraction/models/sparse_esn_500.joblib"
try:
    esn = load(sparse_esn_path)
except FileNotFoundError:
    if len(training_files) == 0:
        raise ValueError("Cannot fit the ESN: the training file list is empty.")
    print("Fitting ESN with features from the training set...")
    # NOTE: this trains base_esn in place; esn and base_esn are the same object.
    esn = base_esn
    last = len(training_files) - 1
    for k, file_name in enumerate(tqdm(training_files)):
        X, y = extract_features(file_name)
        # postpone_inverse is True for every file except the last one.
        esn.partial_fit(X=X, y=y, postpone_inverse=k < last)
    print("done!")
    os.makedirs(os.path.dirname(sparse_esn_path), exist_ok=True)
    dump(esn, sparse_esn_path)

Compute errors on the training, validation and test set

In [None]:
def _evaluate_files(model, file_names):
    """
    Compute GPE, VDE, FPE and FFE of ``model`` for every file in ``file_names``.

    Returns four lists (GPE, VDE, FPE, FFE) with one entry per file, in the
    order of ``file_names``.
    """
    gpe_scores, vde_scores, fpe_scores, ffe_scores = [], [], [], []
    for file_name in tqdm(file_names):
        X, y = extract_features(file_name)
        y_pred = model.predict(X=X)
        # Column 1 is the voicing output; frames below 0.5 count as unvoiced
        # and get a pitch of 0.
        voiced_pred = y_pred[:, 1] >= .5
        f0_true = y[:, 0]*y[:, 1]
        f0_pred = y_pred[:, 0]*voiced_pred
        gpe_scores.append(gpe(y_true=f0_true, y_pred=f0_pred))
        vde_scores.append(vde(y_true=y[:, 1], y_pred=voiced_pred))
        fpe_scores.append(fpe(y_true=f0_true, y_pred=f0_pred))
        ffe_scores.append(ffe(y_true=f0_true, y_pred=f0_pred))
    return gpe_scores, vde_scores, fpe_scores, ffe_scores


gpe_training, vde_training, fpe_training, ffe_training = _evaluate_files(esn, training_files)
gpe_validation, vde_validation, fpe_validation, ffe_validation = _evaluate_files(esn, validation_files)
gpe_test, vde_test, fpe_test, ffe_test = _evaluate_files(esn, test_files)

# Report the mean of every metric per split.
for split_name, split_scores in (
        ("Training", (gpe_training, vde_training, fpe_training, ffe_training)),
        ("Validation", (gpe_validation, vde_validation, fpe_validation, ffe_validation)),
        ("Test", (gpe_test, vde_test, fpe_test, ffe_test))):
    print("{0}: GPE\t VDE\t FPE\t FFE".format(split_name))
    print("{0}: {1}\t {2}\t {3}\t {4}".format(split_name, *[np.mean(scores) for scores in split_scores]))

Find the worst example (highest GPE) in each of the training, validation and test sets

In [None]:
# Index of the file with the highest gross pitch error in each split.
tuple(np.argmax(split_gpe) for split_gpe in (gpe_training, gpe_validation, gpe_test))

Find the best example (lowest GPE) in each of the training, validation and test sets

In [None]:
# Index of the file with the lowest gross pitch error in each split.
tuple(np.argmin(split_gpe) for split_gpe in (gpe_training, gpe_validation, gpe_test))

Visualize worst and best training example

In [None]:
# One panel per hard-coded file index: reference pitch track (column 0 of y)
# and predicted pitch, set to 0 where the voicing output is below 0.5.
# NOTE(review): the indices are presumably the argmax/argmin results of an
# earlier run - re-check them after retraining.
for panel, file_index in enumerate((323, 303), start=1):
    X, y = extract_features(training_files[file_index])
    y_pred = esn.predict(X=X)
    plt.subplot(2, 1, panel)
    plt.plot(y[:, 0])
    plt.plot(y_pred[:, 0]*(y_pred[:, 1] >= .5))

Visualize worst and best validation example

In [None]:
# One panel per hard-coded file index: reference pitch track (column 0 of y)
# and predicted pitch, set to 0 where the voicing output is below 0.5.
# NOTE(review): the indices are presumably the argmax/argmin results of an
# earlier run - re-check them after retraining.
for panel, file_index in enumerate((33, 243), start=1):
    X, y = extract_features(validation_files[file_index])
    y_pred = esn.predict(X=X)
    plt.subplot(2, 1, panel)
    plt.plot(y[:, 0])
    plt.plot(y_pred[:, 0]*(y_pred[:, 1] >= .5))

Visualize worst and best test example

In [None]:
# One panel per hard-coded file index: reference pitch track (column 0 of y)
# and predicted pitch, set to 0 where the voicing output is below 0.5.
# NOTE(review): the indices are presumably the argmax/argmin results of an
# earlier run - re-check them after retraining.
for panel, file_index in enumerate((21, 368), start=1):
    X, y = extract_features(test_files[file_index])
    y_pred = esn.predict(X=X)
    plt.subplot(2, 1, panel)
    plt.plot(y[:, 0])
    plt.plot(y_pred[:, 0]*(y_pred[:, 1] >= .5))

$K$-Means initialization

In [None]:
t1 = time.time()
kmeans = MiniBatchKMeans(n_clusters=500, n_init=20, reassignment_ratio=0, max_no_improvement=50, init='k-means++', verbose=1, random_state=0)
print("Fitting kmeans with features from the training set...")
# Clustering only needs the features, so the targets returned by
# extract_features() are dropped right away instead of being stored.
X = [extract_features(file_name)[0] for file_name in tqdm(training_files)]
# Stack once and release the per-file list before fitting, so the data is not
# held in memory twice while K-Means runs.
stacked_features = np.vstack(X)
del X
kmeans.fit(X=stacked_features)
print("done in {0}!".format(time.time() - t1))
del stacked_features

Initialize an Echo State Network whose input weights are the normalized $K$-Means cluster centers

In [None]:
# Use the cluster centres, scaled to unit Euclidean length, as input weights
# (one row per cluster).
w_in = np.divide(kmeans.cluster_centers_, np.linalg.norm(kmeans.cluster_centers_, axis=1)[:, None])
# If the previously defined input layer has more units than there are
# clusters, the remaining units get all-zero input weights.
# (The former name "base_input_to_nodes" was undefined and raised a NameError.)
n_missing = base_input_to_node.hidden_layer_size - w_in.shape[0]
if n_missing > 0:
    w_in = np.pad(w_in, ((0, n_missing), (0, 0)), mode='constant', constant_values=0)

base_input_to_node = PredefinedWeightsInputToNode(predefined_input_weights=w_in.T, activation='identity', input_scaling=0.1)
base_node_to_node = NodeToNode(hidden_layer_size=500, spectral_radius=0.1, leakage=1.0, bias_scaling=2.1, k_rec=10, random_state=10)
# FastIncrementalRegression is not imported anywhere in this notebook and
# raised a NameError; use the imported IncrementalRegression, as the first
# ESN does.
# NOTE(review): confirm this is the intended readout.
base_reg = IncrementalRegression(alpha=1e-3)

base_esn = ESNRegressor(input_to_node=base_input_to_node,
                        node_to_node=base_node_to_node,
                        regressor=base_reg)

Try to load a pre-trained ESN

In [None]:
# One path for loading and saving. Previously the model was loaded from this
# location but dumped into the working directory, so a freshly trained model
# was never found again on the next run.
kmeans_esn_path = "dataset/f0_extraction/models/kmeans_esn_500.joblib"
try:
    esn = load(kmeans_esn_path)
except FileNotFoundError:
    if len(training_files) == 0:
        raise ValueError("Cannot fit the ESN: the training file list is empty.")
    print("Fitting ESN with features from the training set...")
    # NOTE: this trains base_esn in place; esn and base_esn are the same object.
    esn = base_esn
    last = len(training_files) - 1
    for k, file_name in enumerate(tqdm(training_files)):
        X, y = extract_features(file_name)
        # postpone_inverse is True for every file except the last one.
        esn.partial_fit(X=X, y=y, postpone_inverse=k < last)
    print("done!")
    os.makedirs(os.path.dirname(kmeans_esn_path), exist_ok=True)
    dump(esn, kmeans_esn_path)

Compute errors on the training, validation and test set

In [None]:
def _evaluate_files(model, file_names):
    """
    Compute GPE, VDE, FPE and FFE of ``model`` for every file in ``file_names``.

    Returns four lists (GPE, VDE, FPE, FFE) with one entry per file, in the
    order of ``file_names``.
    """
    gpe_scores, vde_scores, fpe_scores, ffe_scores = [], [], [], []
    for file_name in tqdm(file_names):
        X, y = extract_features(file_name)
        y_pred = model.predict(X=X)
        # Column 1 is the voicing output; frames below 0.5 count as unvoiced
        # and get a pitch of 0.
        voiced_pred = y_pred[:, 1] >= .5
        f0_true = y[:, 0]*y[:, 1]
        f0_pred = y_pred[:, 0]*voiced_pred
        gpe_scores.append(gpe(y_true=f0_true, y_pred=f0_pred))
        vde_scores.append(vde(y_true=y[:, 1], y_pred=voiced_pred))
        fpe_scores.append(fpe(y_true=f0_true, y_pred=f0_pred))
        ffe_scores.append(ffe(y_true=f0_true, y_pred=f0_pred))
    return gpe_scores, vde_scores, fpe_scores, ffe_scores


gpe_training, vde_training, fpe_training, ffe_training = _evaluate_files(esn, training_files)
gpe_validation, vde_validation, fpe_validation, ffe_validation = _evaluate_files(esn, validation_files)
gpe_test, vde_test, fpe_test, ffe_test = _evaluate_files(esn, test_files)

# Report the mean of every metric per split.
for split_name, split_scores in (
        ("Training", (gpe_training, vde_training, fpe_training, ffe_training)),
        ("Validation", (gpe_validation, vde_validation, fpe_validation, ffe_validation)),
        ("Test", (gpe_test, vde_test, fpe_test, ffe_test))):
    print("{0}: GPE\t VDE\t FPE\t FFE".format(split_name))
    print("{0}: {1}\t {2}\t {3}\t {4}".format(split_name, *[np.mean(scores) for scores in split_scores]))

Find the worst example (highest GPE) in each of the training, validation and test sets

In [None]:
# Index of the file with the highest gross pitch error in each split.
tuple(np.argmax(split_gpe) for split_gpe in (gpe_training, gpe_validation, gpe_test))

Find the best example (lowest GPE) in each of the training, validation and test sets

In [None]:
# Index of the file with the lowest gross pitch error in each split.
tuple(np.argmin(split_gpe) for split_gpe in (gpe_training, gpe_validation, gpe_test))

Visualize worst and best training example

In [None]:
# One panel per hard-coded file index: reference pitch track (column 0 of y)
# and predicted pitch, set to 0 where the voicing output is below 0.5.
# NOTE(review): the indices are presumably the argmax/argmin results of an
# earlier run - re-check them after retraining.
for panel, file_index in enumerate((1547, 7), start=1):
    X, y = extract_features(training_files[file_index])
    y_pred = esn.predict(X=X)
    plt.subplot(2, 1, panel)
    plt.plot(y[:, 0])
    plt.plot(y_pred[:, 0]*(y_pred[:, 1] >= .5))

Visualize worst and best validation example

In [None]:
# One panel per hard-coded file index: reference pitch track (column 0 of y)
# and predicted pitch, set to 0 where the voicing output is below 0.5.
# NOTE(review): the indices are presumably the argmax/argmin results of an
# earlier run - re-check them after retraining.
for panel, file_index in enumerate((277, 499), start=1):
    X, y = extract_features(validation_files[file_index])
    y_pred = esn.predict(X=X)
    plt.subplot(2, 1, panel)
    plt.plot(y[:, 0])
    plt.plot(y_pred[:, 0]*(y_pred[:, 1] >= .5))

Visualize worst and best test example

In [None]:
# One panel per hard-coded file index: reference pitch track (column 0 of y)
# and predicted pitch, set to 0 where the voicing output is below 0.5.
# NOTE(review): the indices are presumably the argmax/argmin results of an
# earlier run - re-check them after retraining.
for panel, file_index in enumerate((21, 276), start=1):
    X, y = extract_features(test_files[file_index])
    y_pred = esn.predict(X=X)
    plt.subplot(2, 1, panel)
    plt.plot(y[:, 0])
    plt.plot(y_pred[:, 0]*(y_pred[:, 1] >= .5))