In [None]:
import musdb
import essentia
import essentia.standard as estd
import matplotlib
import matplotlib.pyplot as plt
import numpy as np

# Handle on the MUSDB dataset stored under the local 'musdb/' directory
mus = musdb.DB(root_dir='musdb/')

# Track lists of the two dataset subsets
train_tracks = mus.load_mus_tracks(subsets=['train'])
test_tracks = mus.load_mus_tracks(subsets=['test'])

# Essentia Extractor configuration.
# One analysis frame spans frame_size * tcontext samples and the hop is a
# quarter of that, so consecutive frames overlap by 75%.
# The rhythm and highLevel options are turned off.
frame_size = 1024
tcontext = 120  # in number of frames
extractor = estd.Extractor(
    lowLevelFrameSize=int(frame_size * tcontext),
    lowLevelHopSize=int(frame_size * tcontext / 4),
    rhythm=False,
    highLevel=False
)

# Pool keys of the selected descriptors. The order of this list is the order
# in which extract_features lays the descriptors out as columns.
intraFrame_features = [
    'lowLevel.barkbands',
    'lowLevel.barkbands_kurtosis',
    'lowLevel.barkbands_skewness',
    'lowLevel.barkbands_spread',
    'lowLevel.dissonance',
    'lowLevel.hfc',
    'lowLevel.mfcc',
    'lowLevel.pitch',
    'lowLevel.pitch_instantaneous_confidence',
    'lowLevel.pitch_salience',
    'lowLevel.sccoeffs',
    'lowLevel.scvalleys',
    'lowLevel.silence_rate_20dB',
    'lowLevel.silence_rate_30dB',
    'lowLevel.silence_rate_60dB',
    'lowLevel.spectral_centroid',
    'lowLevel.spectral_complexity',
    'lowLevel.spectral_crest',
    'lowLevel.spectral_decrease',
    'lowLevel.spectral_energy',
    'lowLevel.spectral_energyband_high',
    'lowLevel.spectral_energyband_low',
    'lowLevel.spectral_energyband_middle_high',
    'lowLevel.spectral_energyband_middle_low',
    'lowLevel.spectral_flatness_db',
    'lowLevel.spectral_flux',
    'lowLevel.spectral_kurtosis',
    'lowLevel.spectral_rms',
    'lowLevel.spectral_rolloff',
    'lowLevel.spectral_skewness',
    'lowLevel.spectral_spread',
    'lowLevel.spectral_strongpeak',
    'lowLevel.zerocrossingrate',
    'sfx.inharmonicity',
    'sfx.oddtoevenharmonicenergyratio',
    'sfx.tristimulus',
]

# Read features_length.txt into a list of strings, one entry per line with the
# trailing newline removed (the values are kept as text, not converted).
with open("features_length.txt", 'r') as f:
    features_length = [line.rstrip('\n') for line in f]
    
def extract_features(track, intraFrame_features, label):
    """Run the Essentia extractor on one signal and build a labelled feature matrix.

    The signal is cast to 32-bit floats and converted to an Essentia array,
    then passed to the module-level ``extractor``. The requested descriptors
    are read from the resulting pool and placed side by side, in the order
    given, followed by one extra column holding ``label`` on every row.

    Parameters
    ----------
    track : array-like
        Audio samples of the signal to analyse.
        NOTE(review): callers pass a mono downmix; this is not checked here.
    intraFrame_features : sequence of str
        Pool keys of the descriptors to extract; defines the column order.
    label : scalar
        Value repeated down the last column of the result.

    Returns
    -------
    numpy.ndarray
        Array with one row per row of the first descriptor. The columns are
        the concatenated descriptors plus the trailing label column.

    Raises
    ------
    IndexError
        If ``intraFrame_features`` is empty.
    """
    # Cast to 32 bits and then to an Essentia array in order to apply the Extractor
    signal = essentia.array(np.float32(track))
    pool = extractor(signal)

    # Gather every descriptor as a column block. A one-dimensional descriptor
    # is turned into a single column so that any descriptor, including the
    # first one in the list, can be concatenated along axis 1.
    columns = []
    for name in intraFrame_features:
        values = np.asarray(pool[name])
        if values.ndim == 1:
            values = values[:, np.newaxis]
        columns.append(values)

    # Label column: one copy of `label` per row of the first descriptor
    n_frames = columns[0].shape[0]
    columns.append(np.tile(label, n_frames)[:, np.newaxis])

    # Single concatenation instead of re-copying a growing matrix per descriptor
    return np.concatenate(columns, axis=1)

# Integer class label written to the last column of every frame, per stem.
# The same mapping is applied to the train and the test subset so that a
# given label always denotes the same stem.
STEM_LABELS = (('vocals', 0), ('drums', 1), ('bass', 2), ('other', 3))

# Per-stem feature matrices are collected here and stacked once at the end,
# which avoids re-copying the whole accumulated matrix after every stem.
feature_chunks = []
for subset_name, subset_tracks in (('train', train_tracks), ('test', test_tracks)):
    for i, track in enumerate(subset_tracks):
        for stem, label in STEM_LABELS:
            # Read the stem audio once, then downmix to mono by averaging
            # its first two channels
            stereo = track.targets[stem].audio
            aux = stereo[:, 0]*0.5 + stereo[:, 1]*0.5
            feature_chunks.append(extract_features(aux, intraFrame_features, label))
        print(str(len(subset_tracks) - i-1) + ' ' + subset_name + ' tracks remaining')

# Row order: all train tracks, then all test tracks; within a track the stems
# follow the order of STEM_LABELS.
features = np.concatenate(feature_chunks)
    

In [None]:
# Write the feature matrix to a .npy file whose name records the tcontext and
# frame_size values in use
np.save(''.join(['features_tcontext_', str(tcontext), '_frameSize_', str(frame_size), '.npy']), features)

In [None]:
import musdb
import essentia
import essentia.standard as estd
import matplotlib
import matplotlib.pyplot as plt
import numpy as np

# Handle on the MUSDB dataset stored under the local 'musdb/' directory
mus = musdb.DB(root_dir='musdb/')

# Track lists of the two dataset subsets
train_tracks = mus.load_mus_tracks(subsets=['train'])
test_tracks = mus.load_mus_tracks(subsets=['test'])

# Essentia Extractor configuration.
# One analysis frame spans frame_size * tcontext samples and the hop has the
# same length, so consecutive frames do not overlap.
# The rhythm and highLevel options are turned off.
frame_size = 1024
tcontext = 120  # in number of frames
extractor = estd.Extractor(
    lowLevelFrameSize=int(frame_size * tcontext),
    lowLevelHopSize=int(frame_size * tcontext),
    rhythm=False,
    highLevel=False
)

# Pool keys of the selected descriptors. The order of this list is the order
# in which extract_features lays the descriptors out as columns.
intraFrame_features = [
    'lowLevel.barkbands',
    'lowLevel.barkbands_kurtosis',
    'lowLevel.barkbands_skewness',
    'lowLevel.barkbands_spread',
    'lowLevel.dissonance',
    'lowLevel.hfc',
    'lowLevel.mfcc',
    'lowLevel.pitch',
    'lowLevel.pitch_instantaneous_confidence',
    'lowLevel.pitch_salience',
    'lowLevel.sccoeffs',
    'lowLevel.scvalleys',
    'lowLevel.silence_rate_20dB',
    'lowLevel.silence_rate_30dB',
    'lowLevel.silence_rate_60dB',
    'lowLevel.spectral_centroid',
    'lowLevel.spectral_complexity',
    'lowLevel.spectral_crest',
    'lowLevel.spectral_decrease',
    'lowLevel.spectral_energy',
    'lowLevel.spectral_energyband_high',
    'lowLevel.spectral_energyband_low',
    'lowLevel.spectral_energyband_middle_high',
    'lowLevel.spectral_energyband_middle_low',
    'lowLevel.spectral_flatness_db',
    'lowLevel.spectral_flux',
    'lowLevel.spectral_kurtosis',
    'lowLevel.spectral_rms',
    'lowLevel.spectral_rolloff',
    'lowLevel.spectral_skewness',
    'lowLevel.spectral_spread',
    'lowLevel.spectral_strongpeak',
    'lowLevel.zerocrossingrate',
    'sfx.inharmonicity',
    'sfx.oddtoevenharmonicenergyratio',
    'sfx.tristimulus',
]

# Read features_length.txt into a list of strings, one entry per line with the
# trailing newline removed (the values are kept as text, not converted).
with open("features_length.txt", 'r') as f:
    features_length = [line.rstrip('\n') for line in f]
    
def extract_features(track, intraFrame_features, label):
    """Run the Essentia extractor on one signal and build a labelled feature matrix.

    The signal is cast to 32-bit floats and converted to an Essentia array,
    then passed to the module-level ``extractor``. The requested descriptors
    are read from the resulting pool and placed side by side, in the order
    given, followed by one extra column holding ``label`` on every row.

    Parameters
    ----------
    track : array-like
        Audio samples of the signal to analyse.
        NOTE(review): callers pass a mono downmix; this is not checked here.
    intraFrame_features : sequence of str
        Pool keys of the descriptors to extract; defines the column order.
    label : scalar
        Value repeated down the last column of the result.

    Returns
    -------
    numpy.ndarray
        Array with one row per row of the first descriptor. The columns are
        the concatenated descriptors plus the trailing label column.

    Raises
    ------
    IndexError
        If ``intraFrame_features`` is empty.
    """
    # Cast to 32 bits and then to an Essentia array in order to apply the Extractor
    signal = essentia.array(np.float32(track))
    pool = extractor(signal)

    # Gather every descriptor as a column block. A one-dimensional descriptor
    # is turned into a single column so that any descriptor, including the
    # first one in the list, can be concatenated along axis 1.
    columns = []
    for name in intraFrame_features:
        values = np.asarray(pool[name])
        if values.ndim == 1:
            values = values[:, np.newaxis]
        columns.append(values)

    # Label column: one copy of `label` per row of the first descriptor
    n_frames = columns[0].shape[0]
    columns.append(np.tile(label, n_frames)[:, np.newaxis])

    # Single concatenation instead of re-copying a growing matrix per descriptor
    return np.concatenate(columns, axis=1)

# Extract features for a single signal: the vocals stem of the first training
# track, downmixed to mono by averaging its first two channels, with label 0
first_vocals = train_tracks[0].targets['vocals']
aux = first_vocals.audio[:, 0]*0.5 + first_vocals.audio[:, 1]*0.5
f = extract_features(aux, intraFrame_features, 0)
