In [29]:
from pathlib import Path

import librosa
import numpy as np
import pandas as pd
import scipy

In [97]:
def columns():
    """Return the ordered column names of the extracted feature table.

    Every frame-level feature contributes one name per (row, statistic)
    pair in the form ``<feature>_<row:02d>_<statistic>``. Those names are
    sorted lexicographically, then followed by the scalar per-track columns
    and finally the ``'Genre'`` label column.

    Returns:
        numpy.ndarray: 1-D array of column-name strings.
    """
    # Number of rows each frame-level feature is expected to have.
    rows_per_feature = {
        'chroma_stft': 12, 'chroma_cqt': 12, 'chroma_cens': 12,
        'mfcc': 12, 'rms': 1, 'spectral_centroid': 1, 'spectral_bandwidth': 1,
        'spectral_contrast': 7, 'spectral_flatness': 1, 'spectral_rolloff': 1,
        'poly_features': 3, 'tonnetz': 6, 'zcr': 1, 'dtempo': 1,
        'onset_strength': 1, 'tempogram_ratio': 13, 'plp': 1,
    }
    statistics = ('mean', 'std', 'median', 'min', 'max')
    scalar_columns = ['onset_num', 'beats', 'tempo', 'dtempo_changes']

    aggregated = [
        f"{feature}_{row:02d}_{stat}"
        for feature, n_rows in rows_per_feature.items()
        for stat in statistics
        for row in range(n_rows)
    ]

    # Only the aggregated names are sorted; the scalar columns and the label
    # keep their declared order at the end of the array.
    ordered = np.sort(np.array(aggregated))
    return np.concatenate([ordered, scalar_columns, ['Genre']])

In [100]:
# Sanity check: total number of column names (aggregated statistics,
# scalar columns and the 'Genre' label).
len(columns())

435

In [101]:
# Display the generated column names to inspect their ordering.
columns()

array(['chroma_cens_00_max', 'chroma_cens_00_mean',
       'chroma_cens_00_median', 'chroma_cens_00_min',
       'chroma_cens_00_std', 'chroma_cens_01_max', 'chroma_cens_01_mean',
       'chroma_cens_01_median', 'chroma_cens_01_min',
       'chroma_cens_01_std', 'chroma_cens_02_max', 'chroma_cens_02_mean',
       'chroma_cens_02_median', 'chroma_cens_02_min',
       'chroma_cens_02_std', 'chroma_cens_03_max', 'chroma_cens_03_mean',
       'chroma_cens_03_median', 'chroma_cens_03_min',
       'chroma_cens_03_std', 'chroma_cens_04_max', 'chroma_cens_04_mean',
       'chroma_cens_04_median', 'chroma_cens_04_min',
       'chroma_cens_04_std', 'chroma_cens_05_max', 'chroma_cens_05_mean',
       'chroma_cens_05_median', 'chroma_cens_05_min',
       'chroma_cens_05_std', 'chroma_cens_06_max', 'chroma_cens_06_mean',
       'chroma_cens_06_median', 'chroma_cens_06_min',
       'chroma_cens_06_std', 'chroma_cens_07_max', 'chroma_cens_07_mean',
       'chroma_cens_07_median', 'chroma_cens_07_min'

In [107]:
def count_value_changes(arr):
    """Count how many elements differ from the element right before them.

    Args:
        arr: Indexable sequence supporting ``len()`` (e.g. a list or a
            1-D array).

    Returns:
        int: Number of positions ``i >= 1`` where ``arr[i] != arr[i - 1]``;
        0 for sequences with fewer than two elements.
    """
    return sum(1 for idx in range(1, len(arr)) if arr[idx] != arr[idx - 1])


def calculate_features_for_single_record(file_path):
    """Extract a flat feature list (plus genre label) from one audio file.

    The file is loaded with ``librosa.load`` using its defaults. Each
    frame-level feature is summarised per row by five statistics, emitted in
    the order max, mean, median, min, std. Features are emitted in
    alphabetical order of their column prefix, so the result lines up with a
    lexicographically sorted list of ``<feature>_<row>_<statistic>`` names.

    Args:
        file_path: Path (``str`` or path-like) to the audio file. The name
            of its parent directory is used as the genre label.

    Returns:
        list: Aggregated statistics of all frame-level features, followed by
        the number of detected onsets, the number of beats, the global tempo,
        the number of changes in the frame-wise tempo, and finally the genre
        string (empty if the path has no parent directory component).
    """
    y, sr = librosa.load(file_path)

    chroma_stft = librosa.feature.chroma_stft(y=y, sr=sr, n_chroma=12)
    chroma_cqt = librosa.feature.chroma_cqt(y=y, sr=sr, n_chroma=12)
    chroma_cens = librosa.feature.chroma_cens(y=y, sr=sr)
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=12)
    rms = librosa.feature.rms(y=y)

    spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)
    spectral_bandwidth = librosa.feature.spectral_bandwidth(y=y, sr=sr)
    spectral_contrast = librosa.feature.spectral_contrast(y=y, sr=sr)
    spectral_flatness = librosa.feature.spectral_flatness(y=y)
    spectral_rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)

    poly_features = librosa.feature.poly_features(y=y, sr=sr, order=2)
    tonnetz = librosa.feature.tonnetz(y=librosa.effects.harmonic(y), sr=sr)
    zcr = librosa.feature.zero_crossing_rate(y)

    onset_env = librosa.onset.onset_strength(y=y, sr=sr)
    plp = librosa.beat.plp(onset_envelope=onset_env, sr=sr)

    # aggregate=None yields a tempo estimate per frame instead of one global value.
    dtempo = librosa.feature.tempo(onset_envelope=onset_env, sr=sr, aggregate=None)
    tempogram_ratio = librosa.feature.tempogram_ratio(
        tg=librosa.feature.tempogram(y=y, sr=sr), sr=sr
    )

    # Single (scalar) features
    dtempo_changes = count_value_changes(dtempo)
    tempo, beats = librosa.beat.beat_track(y=y, sr=sr)
    onset_num = len(librosa.onset.onset_detect(onset_envelope=onset_env, sr=sr))

    def aggregate_feature(feature):
        # Statistic order is alphabetical (max, mean, median, min, std) so it
        # matches lexicographically sorted column names.
        return [np.max(feature), np.mean(feature), np.median(feature), np.min(feature), np.std(feature)]

    # Keys are the column-name prefixes; iterating them in sorted order keeps
    # the output aligned with sorted column names without a hand-ordered list.
    frame_features = {
        'chroma_stft': chroma_stft,
        'chroma_cqt': chroma_cqt,
        'chroma_cens': chroma_cens,
        'mfcc': mfcc,
        'rms': rms,
        'spectral_centroid': spectral_centroid,
        'spectral_bandwidth': spectral_bandwidth,
        'spectral_contrast': spectral_contrast,
        'spectral_flatness': spectral_flatness,
        'spectral_rolloff': spectral_rolloff,
        'poly_features': poly_features,
        'tonnetz': tonnetz,
        'zcr': zcr,
        'dtempo': dtempo,
        'onset_strength': onset_env,
        'tempogram_ratio': tempogram_ratio,
        'plp': plp,
    }

    features = []
    for name in sorted(frame_features):
        values = frame_features[name]
        if values.ndim == 1:
            features.extend(aggregate_feature(values))
        else:
            # One group of five statistics per row of the feature matrix.
            features.extend(np.hstack([aggregate_feature(row) for row in values]))

    features.append(onset_num)
    features.append(len(beats))
    # beat_track may return the tempo as a scalar or as an array depending on
    # the librosa version; np.ravel handles both.
    features.append(np.ravel(tempo)[0])
    features.append(dtempo_changes)

    # Parent directory name is the genre; Path handles str and path-like
    # inputs as well as platform-specific separators.
    genre = Path(file_path).parent.name
    features.append(genre)

    return features

In [108]:
# Extract features for one sample track; the name of the file's parent
# folder ('Hip-Hop') becomes the genre label.
res = calculate_features_for_single_record('../datasets/fma/fma_small/Hip-Hop/000002.mp3')

In [109]:
# Length of the extracted feature list, for comparison with the number of column names.
len(res)

435

In [110]:
# Display the raw feature list for the sample track.
res

[np.float32(0.7376803),
 np.float32(0.49256533),
 np.float32(0.48467067),
 np.float32(0.31425408),
 np.float32(0.062148973),
 np.float32(0.47996426),
 np.float32(0.30732974),
 np.float32(0.30164075),
 np.float32(0.17959258),
 np.float32(0.049466945),
 np.float32(0.39072192),
 np.float32(0.27581272),
 np.float32(0.2707575),
 np.float32(0.09297817),
 np.float32(0.03838634),
 np.float32(0.35007122),
 np.float32(0.2573156),
 np.float32(0.25383344),
 np.float32(0.15650176),
 np.float32(0.026604397),
 np.float32(0.34356406),
 np.float32(0.2526637),
 np.float32(0.2478726),
 np.float32(0.13039277),
 np.float32(0.030183008),
 np.float32(0.46906856),
 np.float32(0.28693238),
 np.float32(0.27072838),
 np.float32(0.14936936),
 np.float32(0.062998176),
 np.float32(0.4230646),
 np.float32(0.26416123),
 np.float32(0.25045657),
 np.float32(0.16732943),
 np.float32(0.038644467),
 np.float32(0.4509913),
 np.float32(0.27331513),
 np.float32(0.25737765),
 np.float32(0.2281471),
 np.float32(0.039544135),
 