In [1]:
# Perform SUPERFLUX Onset detection on all folds
import numpy as np
import os
import re
import librosa
import madmom
from tqdm import tqdm
from madmom.audio.signal import SignalProcessor, FramedSignalProcessor
from madmom.audio.stft import ShortTimeFourierTransformProcessor
from madmom.audio.spectrogram import (FilteredSpectrogramProcessor,
                                      LogarithmicSpectrogramProcessor,
                                      SpectrogramDifferenceProcessor)
from madmom.features.onsets import OnsetPeakPickingProcessor
from madmom.evaluation.onsets import OnsetEvaluation, OnsetSumEvaluation
from madmom.processors import SequentialProcessor
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd


# --- Feature extraction settings -------------------------------------------
sr = 44100            # sampling rate
fs = 2048             # frame size
fps = 200             # frames per second
window = np.hanning   # window function handed to the STFT processor (Hann)
num_bands = 24        # number of bands per octave
f_min = 27.5          # minimum frequency
f_max = 16000         # maximum frequency

# --- Peak picking and evaluation settings ----------------------------------
pre_max = 0.03        # seconds of past information for the moving maximum
post_max = 0.03       # seconds of future information for the moving maximum
pre_avg = 0.1         # seconds of past information for the moving average
post_avg = 0.07       # seconds of future information for the moving average
eval_window = 0.05    # time window around a reference onset
combine = 0.03        # report only one onset within this many seconds
delay = 0             # report detected onsets delayed by this many seconds
diff_max_bins = 3     # forwarded to SpectrogramDifferenceProcessor
threshold = 1.3       # forwarded to OnsetPeakPickingProcessor

# --- Processors built from the settings above ------------------------------
signal_proc = SignalProcessor(
    sample_rate=sr,
    num_channels=1,
    norm=True,
)
frames_proc = FramedSignalProcessor(
    frame_size=fs,
    fps=fps,
)
stft_proc = ShortTimeFourierTransformProcessor(window=window)
fil_spec_proc = FilteredSpectrogramProcessor(
    num_bands=num_bands,
    fmin=f_min,
    fmax=f_max,
    norm_filters=False,
)
logfil_spec_proc = LogarithmicSpectrogramProcessor()
specDiffProc = SpectrogramDifferenceProcessor(
    diff_max_bins=diff_max_bins,
    positive_diffs=True,
)

# Chain that runs signal -> frames -> STFT -> filtered -> logarithmic
# spectrogram; the difference processor is applied separately afterwards.
preprocessor = SequentialProcessor([
    signal_proc,
    frames_proc,
    stft_proc,
    fil_spec_proc,
    logfil_spec_proc,
])

peak_proc = OnsetPeakPickingProcessor(
    threshold=threshold,
    pre_max=pre_max, post_max=post_max,
    pre_avg=pre_avg, post_avg=post_avg,
    combine=combine,
    delay=delay,
    fps=fps,
)




def preprocessing(audiofile, sr, preprocessor):
    """Load an audio file and run it through the given preprocessor.

    Args:
        audiofile: Path of the audio file, passed to ``librosa.load``.
        sr: Sample rate requested from ``librosa.load``.
        preprocessor: Object with a ``process`` method that receives the
            loaded mono samples.

    Returns:
        Whatever ``preprocessor.process`` returns for the loaded samples.
    """
    # The sample rate reported back by librosa is not needed afterwards.
    samples, _ = librosa.load(audiofile, sr=sr, mono=True)
    return preprocessor.process(samples)

def reduction_filtering(log_filtered_spec):
    """Turn a preprocessed spectrogram into an onset detection function.

    The spectrogram is passed through the module-level ``specDiffProc`` and
    the result is summed along axis 1.

    Args:
        log_filtered_spec: Spectrogram accepted by ``specDiffProc.process``.

    Returns:
        The axis-1 sum of the processed spectrogram.
    """
    return specDiffProc.process(log_filtered_spec).sum(axis=1)

def peak_picking(odf, peakprocessor):
    """Run peak picking on an onset detection function.

    Args:
        odf: Onset detection function handed to the processor.
        peakprocessor: Object whose ``process`` method performs the picking.

    Returns:
        The result of ``peakprocessor.process(odf)``.
    """
    return peakprocessor.process(odf)

def evaluation(onsets, annotationsfile):
    """Evaluate detected onsets against the annotations stored in a file.

    Uses the module-level ``eval_window``, ``combine`` and ``delay`` settings.

    Args:
        onsets: Detected onsets, passed as ``detections``.
        annotationsfile: Path read with ``madmom.io.load_events``.

    Returns:
        The ``OnsetEvaluation`` object built from detections and annotations.
    """
    reference = madmom.io.load_events(annotationsfile)
    return OnsetEvaluation(
        detections=onsets,
        annotations=reference,
        window=eval_window,
        combine=combine,
        delay=delay,
    )


def get_filenames_from_fold(fold, path_to_folds="data/splits"):
    """Return the track names listed in a fold file.

    Each line of the file is one track name. Trailing whitespace (including
    the newline) is stripped, and lines that are empty after stripping are
    skipped so that a blank or trailing empty line cannot yield an empty
    track name.

    Args:
        fold: File name of the fold inside ``path_to_folds``.
        path_to_folds: Directory containing the fold files.

    Returns:
        List of track names in file order.

    Raises:
        OSError: If the fold file cannot be opened.
    """
    # The ``with`` statement closes the file; no explicit close() is needed.
    with open(os.path.join(path_to_folds, fold)) as fd:
        stripped = (line.rstrip() for line in fd)
        return [name for name in stripped if name]


# Load the track names of all eight cross-validation folds.
fold0, fold1, fold2, fold3, fold4, fold5, fold6, fold7 = [
    get_filenames_from_fold('8-fold_cv_random_{}.fold'.format(k))
    for k in range(8)
]

# Order in which the folds are evaluated; the order is deliberate and must
# not be changed.
test_folds = [fold7, fold1, fold4, fold3, fold6, fold0, fold5, fold2]

# Input locations; these do not change between tracks, so they are defined
# once instead of on every loop iteration.
str_audiodir = 'data/audio'
str_annodir = 'data/annotations/onsets'
audiodir = os.fsencode(str_audiodir)  # bytes form; not used in this loop
refdir = os.fsencode(str_annodir)     # bytes form; not used in this loop

df_list = []          # one sorted F-score DataFrame per fold
dict_fscores = {}     # per-track F-scores of the fold currently processed
total_sum_evals = []  # one OnsetSumEvaluation per fold

# Iterate over all test folds
for track_names in test_folds:
    list_evals = []
    dict_fscores = {}

    # Run onset detection and evaluation for every track of this fold
    for file in tqdm(track_names, desc="Processing..."):
        audiofile = os.path.join(str_audiodir, file + ".flac")
        annotationsfile = os.path.join(str_annodir, file + ".onsets")

        # Pipeline:
        spec = preprocessing(audiofile=audiofile, sr=sr, preprocessor=preprocessor)
        odf = reduction_filtering(log_filtered_spec=spec)
        onsets = peak_picking(odf=odf, peakprocessor=peak_proc)
        eval_obj = evaluation(onsets=onsets, annotationsfile=annotationsfile)

        # Remember the rounded F-measure per track; a trailing ".onsets"
        # is removed from the key if present (raw string avoids the invalid
        # "\." escape sequence).
        dict_fscores[re.sub(r'\.onsets$', '', file)] = round(eval_obj.fmeasure, 2)
        list_evals.append(eval_obj)

    # Summed evaluation over all tracks of this fold
    sum_eval = OnsetSumEvaluation(list_evals)
    print(sum_eval)
    total_sum_evals.append(sum_eval)

    # Sort tracks by F-score, best first, and keep them as a DataFrame
    results_dict = dict(sorted(dict_fscores.items(), key=lambda x: x[1], reverse=True))
    df = pd.DataFrame.from_dict(data=results_dict, orient='index', columns=['F-Score'])
    df_list.append(df)

Processing...: 100%|███████████████████████████████████████████████████████████████████| 40/40 [00:10<00:00,  3.70it/s]


sum for 40 files
  Onsets:  3559 TP:  3081 FP:   485 FN:   478 Precision: 0.864 Recall: 0.866 F-measure: 0.865 mean:  -7.4 ms std:   9.5 ms


Processing...: 100%|███████████████████████████████████████████████████████████████████| 41/41 [00:10<00:00,  4.03it/s]


sum for 41 files
  Onsets:  3220 TP:  2586 FP:   367 FN:   634 Precision: 0.876 Recall: 0.803 F-measure: 0.838 mean:  -5.4 ms std:  11.4 ms


Processing...: 100%|███████████████████████████████████████████████████████████████████| 40/40 [00:09<00:00,  4.24it/s]


sum for 40 files
  Onsets:  2542 TP:  2089 FP:   451 FN:   453 Precision: 0.822 Recall: 0.822 F-measure: 0.822 mean:  -5.3 ms std:  11.9 ms


Processing...: 100%|███████████████████████████████████████████████████████████████████| 40/40 [00:11<00:00,  3.60it/s]


sum for 40 files
  Onsets:  2995 TP:  2581 FP:   387 FN:   414 Precision: 0.870 Recall: 0.862 F-measure: 0.866 mean:  -7.0 ms std:   8.8 ms


Processing...: 100%|███████████████████████████████████████████████████████████████████| 40/40 [00:09<00:00,  4.01it/s]


sum for 40 files
  Onsets:  3915 TP:  3362 FP:   361 FN:   553 Precision: 0.903 Recall: 0.859 F-measure: 0.880 mean:  -6.0 ms std:  10.1 ms


Processing...: 100%|███████████████████████████████████████████████████████████████████| 41/41 [00:09<00:00,  4.19it/s]


sum for 41 files
  Onsets:  2696 TP:  2330 FP:   275 FN:   366 Precision: 0.894 Recall: 0.864 F-measure: 0.879 mean:  -6.9 ms std:  10.0 ms


Processing...: 100%|███████████████████████████████████████████████████████████████████| 40/40 [00:11<00:00,  3.63it/s]


sum for 40 files
  Onsets:  2920 TP:  2585 FP:   299 FN:   335 Precision: 0.896 Recall: 0.885 F-measure: 0.891 mean:  -6.9 ms std:  10.1 ms


Processing...: 100%|███████████████████████████████████████████████████████████████████| 39/39 [00:11<00:00,  3.48it/s]

sum for 39 files
  Onsets:  3980 TP:  2865 FP:   286 FN:  1115 Precision: 0.909 Recall: 0.720 F-measure: 0.804 mean:  -5.9 ms std:  10.7 ms



