In [7]:
import os
import pickle
import numpy as np
import tensorflow as tf
import madmom
import mir_eval
from sklearn.model_selection import KFold

from modules.labels import get_label_vector
from modules.madmom_cnn_prep import cnn_preprocessor
from datasets import Dataset
from modules.analysis_funcs import get_idx_to_fold, get_segmented_data, get_test_peaks, aubio_peakpicker_do, aubio_postprocessing
from analyze_detection import evaluate
from modules.energy_based import legato_mg, onsets_threshold_gate

%load_ext autoreload
%autoreload 2

FPS = 100
CONTEXT = 7

# Load Madmom normalization
def cnn_normalize(frames):
    inv_std = np.load("models/bock2013pret_inv_std.npy")
    mean = np.load("models/bock2013pret_mean.npy")
    frames_normalized = (frames - np.reshape(mean, (1,80,3)))*np.reshape(inv_std, (1,80,3))
    return frames_normalized

os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
frame = madmom.audio.signal.FramedSignalProcessor(frame_size=2048, hop_size=441)
stft = madmom.audio.stft.STFTProcessor()
spect = madmom.audio.spectrogram.SpectrogramProcessor()
proc = madmom.processors.SequentialProcessor([frame, stft, spect])

In [3]:
ds0 = Dataset("initslurtest")
ds1 = Dataset("slurtest_add_1")

audio_fnames = ds0.get_audio_paths() + ds1.get_audio_paths()
label_fnames = ds0.get_annotation_paths() + ds1.get_annotation_paths()

audios = [madmom.audio.signal.load_wave_file(filename)[0] for filename in audio_fnames]
sample_rates = [madmom.audio.signal.load_wave_file(filename)[1] for filename in audio_fnames]
onset_schedules = [np.loadtxt(label_fname, usecols=0) for label_fname in label_fnames]

  file_sample_rate, signal = wavfile.read(filename, mmap=True)


In [4]:
# Dataset info
audio_lengths_sec = [len(audio)/sr for audio,sr in zip(audios, sample_rates)]
iois = [np.ediff1d(onsets) for onsets in onset_schedules]
ioi_spreads = [np.std(np.ediff1d(onsets)) for onsets in onset_schedules]
onset_number = [len(onsets) for onsets in onset_schedules]
mean_ioi = np.sum([np.sum(ioi) for ioi in iois])/np.sum(onset_number)
print(mean_ioi)

0.3556419933217189


In [4]:
random_seed = 119
n_splits =  5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=True)
kf_gen = list(kf.split(np.arange(len(audio_fnames))))

In [6]:
base_path = "results/cnn-training-220409/"
folds_path = base_path + "folds.pkl"

model_name = "added-sample-gen-nostandard"

folds = kf_gen
#with open(folds_path, "rb") as f:
    #folds = pickle.load(f)

itf = get_idx_to_fold(folds)

neural = False
TOL = 0.025


In [7]:
CD_list = []
FN_list = []
FP_list = []
for r in range(len(itf.keys())):
    fold = itf[r]
    rec_name = os.path.basename(audio_fnames[r])
    x = get_segmented_data(audio_fnames[r])
    if neural:
        model = tf.keras.models.load_model(base_path + "fold_" + str(fold) + "_" + model_name + "_model")
        out = model.predict(x)
        peaks = get_test_peaks(out, 1./FPS)
    else:
        leg_on, leg_val = legato_mg(audio_fnames[r], rel_delta=0.2)
        peaks = onsets_threshold_gate(leg_on, leg_val, 1.0)
    
    [CD,FN,FP,doubles,merged] = evaluate(onset_schedules[r], peaks, tol_sec=TOL)
    CD_list.append(CD)
    FN_list.append(FN)
    FP_list.append(FP)

    scores = mir_eval.onset.evaluate(onset_schedules[r], peaks, window=TOL)
    print(rec_name + "\t" + "F-score: {:.2f}".format(100*scores["F-measure"]))



In [23]:
# Only first fold
#model = tf.keras.models.load_model("results/cnn-training-220409/fold_0_added-seq-gen-nostandard_model")
av_scores = []
for r in folds[0][1]:
    rec_name = os.path.basename(audio_fnames[r])
    sig = madmom.audio.Signal(audio_fnames[r])
    spect = madmom.audio.spectrogram.Spectrogram(audio_fnames[r])
    hfc = madmom.features.onsets.high_frequency_content(spect)
    onehot, out = aubio_peakpicker_do(hfc, threshold=0.15)
    peaks = aubio_postprocessing(onehot, sig, db_thres=-90, min_ioi_frames=6)/FPS
    
    #x = get_segmented_data(audio_fnames[r])
    #out = model.predict(x)
    #peaks = get_test_peaks(out, 1./FPS)
    scores = mir_eval.onset.evaluate(onset_schedules[r], peaks, window=TOL)
    print(scores["F-measure"]*100)
    av_scores.append(scores["F-measure"])
print(np.mean(av_scores))

80.26315789473685
19.88950276243094
63.128491620111724
56.92307692307692
43.39622641509434
6.862745098039215
76.30057803468206
66.19718309859154
0.5162012023084546


In [25]:
CD_list = []
FN_list = []
FP_list = []
for fold, (train_idx, test_idx) in enumerate(folds):
    print(fold)
    fold_average = []
    for r in test_idx:
        rec_name = os.path.basename(audio_fnames[r])
        sig = madmom.audio.Signal(audio_fnames[r])
        x = get_segmented_data(audio_fnames[r])
        if neural:
            model = tf.keras.models.load_model(base_path + "fold_" + str(fold) + "_" + model_name + "_model")
            out = model.predict(x)
            peaks = get_test_peaks(out, 1./FPS)
        else:
            spect = madmom.audio.spectrogram.Spectrogram(audio_fnames[r])
            hfc = madmom.features.onsets.high_frequency_content(spect)
            onehot, out = aubio_peakpicker_do(hfc, threshold=0.15)
            peaks = aubio_postprocessing(onehot, sig, db_thres=-90, min_ioi_frames=6)/FPS
            
            #leg_on, leg_val = legato_mg(audio_fnames[r], rel_delta=0.2)
            #peaks = onsets_threshold_gate(leg_on, leg_val, 1.0)
        
        [CD,FN,FP,doubles,merged] = evaluate(onset_schedules[r], peaks, tol_sec=TOL)
        CD_list.append(CD)
        FN_list.append(FN)
        FP_list.append(FP)

        scores = mir_eval.onset.evaluate(onset_schedules[r], peaks, window=TOL)
        print(rec_name + "\t" + "F-score: {:.2f}".format(100*scores["F-measure"]))
        fold_average.append(scores["F-measure"])
    print("Fold {}    average F-score {:.2f}".format(fold, 100*np.mean(fold_average)))
    print()

0
slurtest04.wav	F-score: 80.26
slurtest18.wav	F-score: 19.89
stormhatten_IR2.wav	F-score: 63.13
slurtest01_IR2.wav	F-score: 56.92
6xtpsg_220319.wav	F-score: 43.40


  file_sample_rate, signal = wavfile.read(filename, mmap=True)


6xtpsg_220306.wav	F-score: 6.86
slurtest04_FK1.wav	F-score: 76.30
slurtest03_IR1.wav	F-score: 66.20
Fold 0    average F-score 51.62

1
slurtest05.wav	F-score: 68.27
slurtest04_IR2.wav	F-score: 78.48
melodyvib_220319.wav	F-score: 24.74
slurtest09_IR2.wav	F-score: 55.95
janissa_IR2.wav	F-score: 45.02
slurtest01_FK1.wav	F-score: 59.31
slurtest08_FK1.wav	F-score: 71.93
Fold 1    average F-score 57.67

2
slurtest03.wav	F-score: 74.17
slurtest11.wav	F-score: 29.76
slurtest15.wav	F-score: 57.94
slurtest02_IR1.wav	F-score: 50.75
slurtest03_FK1.wav	F-score: 62.32
slurtest01_IR1.wav	F-score: 60.47


  file_sample_rate, signal = wavfile.read(filename, mmap=True)


63an_start_220306.wav	F-score: 10.34
Fold 2    average F-score 49.39

3
slurtest02.wav	F-score: 71.52
slurtest07.wav	F-score: 70.88
slurtest08.wav	F-score: 72.99
slurtest14.wav	F-score: 56.94
slurtest17.wav	F-score: 39.65


ValueError: frames must be a 2D array or iterable, got <class 'madmom.audio.signal.FramedSignal'> with shape (5663, 2048, 2).

In [40]:
np.sum(CD_list)/(np.sum(CD_list)+.5*(np.sum(FP_list) + np.sum(FN_list)))

0.8505654281098546