In [2]:
import os
import pickle
import numpy as np
import tensorflow as tf
import madmom
import mir_eval
from sklearn.model_selection import KFold

from modules.labels import get_label_vector
from modules.madmom_cnn_prep import cnn_preprocessor
from datasets import Dataset
from modules.analysis_funcs import get_idx_to_fold, get_segmented_data, get_test_peaks, aubio_peakpicker_do, aubio_postprocessing
from analyze_detection import evaluate
from modules.energy_based import legato_mg, onsets_threshold_gate

%load_ext autoreload
%autoreload 2

FPS = 100
CONTEXT = 7

# Load Madmom normalization
def cnn_normalize(frames):
    inv_std = np.load("models/bock2013pret_inv_std.npy")
    mean = np.load("models/bock2013pret_mean.npy")
    frames_normalized = (frames - np.reshape(mean, (1,80,3)))*np.reshape(inv_std, (1,80,3))
    return frames_normalized

os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

In [3]:
frame = madmom.audio.signal.FramedSignalProcessor(frame_size=2048, hop_size=441)
stft = madmom.audio.stft.STFTProcessor()
spect = madmom.audio.spectrogram.SpectrogramProcessor()
proc = madmom.processors.SequentialProcessor([frame, stft, spect])

In [4]:
ds0 = Dataset("initslurtest")
ds1 = Dataset("slurtest_add_1")

audio_fnames = ds0.get_audio_paths() + ds1.get_audio_paths()
label_fnames = ds0.get_annotation_paths() + ds1.get_annotation_paths()

audios = [madmom.audio.signal.load_wave_file(filename)[0] for filename in audio_fnames]
sample_rates = [madmom.audio.signal.load_wave_file(filename)[1] for filename in audio_fnames]
onset_schedules = [np.loadtxt(label_fname, usecols=0) for label_fname in label_fnames]

  file_sample_rate, signal = wavfile.read(filename, mmap=True)


In [5]:
# Dataset info
audio_lengths_sec = [len(audio)/sr for audio,sr in zip(audios, sample_rates)]
iois = [np.ediff1d(onsets) for onsets in onset_schedules]
ioi_spreads = [np.std(np.ediff1d(onsets)) for onsets in onset_schedules]
onset_number = [len(onsets) for onsets in onset_schedules]
mean_ioi = np.sum([np.sum(ioi) for ioi in iois])/np.sum(onset_number)
print(mean_ioi)

0.3556419933217189


In [6]:
random_seed = 119
n_splits =  5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=True)
kf_gen = list(kf.split(np.arange(len(audio_fnames))))

In [12]:
base_path = "results/cnn-training-220420/"
folds_path = base_path + "folds.pkl"

model_name = "added-seq-gen-nostandard"

folds = kf_gen
#with open(folds_path, "rb") as f:
    #folds = pickle.load(f)

itf = get_idx_to_fold(folds)

neural = False
TOL = 0.025


In [7]:
CD_list = []
FN_list = []
FP_list = []
for r in range(len(itf.keys())):
    fold = itf[r]
    rec_name = os.path.basename(audio_fnames[r])
    x = get_segmented_data(audio_fnames[r])

    model = tf.keras.models.load_model(base_path + "fold_" + str(fold) + "_" + model_name + "_model")
    out = model.predict(x)
    peaks = get_test_peaks(out, 1./FPS)
    
    [CD,FN,FP,doubles,merged] = evaluate(onset_schedules[r], peaks, tol_sec=TOL)
    CD_list.append(CD)
    FN_list.append(FN)
    FP_list.append(FP)

    scores = mir_eval.onset.evaluate(onset_schedules[r], peaks, window=TOL)
    print(rec_name + "\t" + "F-score: {:.2f}".format(100*scores["F-measure"]))



In [23]:
# Only first fold
#model = tf.keras.models.load_model("results/cnn-training-220409/fold_0_added-seq-gen-nostandard_model")
av_scores = []
for r in folds[0][1]:
    rec_name = os.path.basename(audio_fnames[r])
    sig = madmom.audio.Signal(audio_fnames[r])
    spect = madmom.audio.spectrogram.Spectrogram(audio_fnames[r])
    hfc = madmom.features.onsets.high_frequency_content(spect)
    onehot, out = aubio_peakpicker_do(hfc, threshold=0.15)
    peaks = aubio_postprocessing(onehot, sig, db_thres=-90, min_ioi_frames=6)/FPS
    
    #x = get_segmented_data(audio_fnames[r])
    #out = model.predict(x)
    #peaks = get_test_peaks(out, 1./FPS)
    scores = mir_eval.onset.evaluate(onset_schedules[r], peaks, window=TOL)
    print(scores["F-measure"]*100)
    av_scores.append(scores["F-measure"])
print(np.mean(av_scores))

80.26315789473685
19.88950276243094
63.128491620111724
56.92307692307692
43.39622641509434
6.862745098039215
76.30057803468206
66.19718309859154
0.5162012023084546


In [13]:
CD_list = []
FN_list = []
FP_list = []
for fold, (train_idx, test_idx) in enumerate(folds):
    print(fold)
    fold_average = []
    for r in test_idx:
        rec_name = os.path.basename(audio_fnames[r])
        sig = madmom.audio.Signal(audio_fnames[r])
        x = get_segmented_data(audio_fnames[r])
        model = tf.keras.models.load_model(base_path + "fold_" + str(fold) + "_" + model_name + "_model")
        out = model.predict(x)
        peaks = get_test_peaks(out, 1./FPS)
        
        [CD,FN,FP,doubles,merged] = evaluate(onset_schedules[r], peaks, tol_sec=TOL)
        CD_list.append(CD)
        FN_list.append(FN)
        FP_list.append(FP)

        scores = mir_eval.onset.evaluate(onset_schedules[r], peaks, window=TOL)
        print(rec_name + "\t" + "F-score: {:.2f}".format(100*scores["F-measure"]))
        fold_average.append(scores["F-measure"])
    print("Fold {}    average F-score {:.2f}".format(fold, 100*np.mean(fold_average)))
    print()

0
slurtest04.wav	F-score: 94.19
slurtest18.wav	F-score: 55.26
stormhatten_IR2.wav	F-score: 83.33
slurtest01_IR2.wav	F-score: 93.02
6xtpsg_220319.wav	F-score: 82.68
6xtpsg_220306.wav	F-score: 79.33


  file_sample_rate, signal = wavfile.read(filename, mmap=True)


slurtest04_FK1.wav	F-score: 95.36
slurtest03_IR1.wav	F-score: 90.91
Fold 0    average F-score 84.26

1
slurtest05.wav	F-score: 81.01
slurtest04_IR2.wav	F-score: 95.36
melodyvib_220319.wav	F-score: 62.44
slurtest09_IR2.wav	F-score: 87.67
janissa_IR2.wav	F-score: 73.26
slurtest01_FK1.wav	F-score: 88.37
slurtest08_FK1.wav	F-score: 97.92
Fold 1    average F-score 83.72

2
slurtest03.wav	F-score: 91.47
slurtest11.wav	F-score: 61.54
slurtest15.wav	F-score: 87.00
slurtest02_IR1.wav	F-score: 95.31
slurtest03_FK1.wav	F-score: 95.24
slurtest01_IR1.wav	F-score: 87.50
63an_start_220306.wav	F-score: 84.02
Fold 2    average F-score 86.01

3
slurtest02.wav	F-score: 93.85
slurtest07.wav	F-score: 93.93
slurtest08.wav	F-score: 95.14


  file_sample_rate, signal = wavfile.read(filename, mmap=True)


slurtest14.wav	F-score: 86.24
slurtest17.wav	F-score: 61.16
slurtest19.wav	F-score: 93.89
stormhatten_IR1.wav	F-score: 89.80
Fold 3    average F-score 87.71

4
slurtest01.wav	F-score: 88.06
slurtest06.wav	F-score: 95.28
slurtest09.wav	F-score: 86.09
slurtest10.wav	F-score: 44.44
slurtest12.wav	F-score: 69.88
slurtest13.wav	F-score: 88.17
slurtest16.wav	F-score: 85.38
Fold 4    average F-score 79.62



In [14]:
np.sum(CD_list)/(np.sum(CD_list)+.5*(np.sum(FP_list) + np.sum(FN_list)))

0.8609198567887634