# FMA: A Dataset For Music Analysis

Kirell Benzi, Michaël Defferrard, Pierre Vandergheynst, Xavier Bresson, EPFL LTS2.

## Baselines

We explore three types of baselines:
1. simple algorithms,
2. state-of-the-art in genre recognition,
3. deep learning approaches,

using different input features:
1. raw audio,
2. echonest features,
3. audio features from librosa or [kapre](https://github.com/keunwoochoi/kapre).

We aim to show that, given sufficient data, DL approaches can outperform all the others without domain-specific / expert knowledge.

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import utils
import keras
from keras.layers import Activation, Dense, Conv1D, Conv2D, MaxPooling1D, Flatten, Reshape
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import IPython.display as ipd
import time
import os

from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder, LabelBinarizer, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
#from sklearn.gaussian_process import GaussianProcessClassifier
#from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.multiclass import OneVsRestClassifier

Using TensorFlow backend.


In [2]:
# Root folder of the dataset, read from the DATA_DIR environment variable.
DATA_DIR = os.environ.get('DATA_DIR')

# Track metadata of the small subset.
metadata_file = os.path.join(DATA_DIR, 'fma_small.json')
df = pd.read_json(metadata_file)

# `path` is called with a track index by the cells below, e.g. `path(0)`.
path = utils.build_path(df, DATA_DIR)

## 1 Simple classifiers on Echonest features

Todo:
* Cross-validation for hyper-parameters.
* Dimensionality reduction?

### 1.1 Pre-processing

In [3]:
# Select features.
#features = utils.ECHONEST_AUDIO_FEATURES + utils.ECHONEST_SOCIAL_FEATURES
features = utils.ECHONEST_AUDIO_FEATURES

# Discard songs with NaN Echonest features.
# TODO: fix dataset.
# Vectorized row mask: True for tracks where every selected feature is present
# (replaces a per-row Python lambda, which gives the same mask but is slower).
keep = df[features].notnull().all(axis=1)
# Explicit copy, so that columns assigned later go to an independent frame.
df = df[keep].copy()

In [4]:
def pre_process(df, features, multi_label=False):
    """Encode the genres, split the data and standardize the features.

    Parameters
    ----------
    df : pandas.DataFrame
        Track table. Must contain the columns listed in `features`, a 'train'
        column flagging the training tracks, and either 'top_genre'
        (single-label) or 'genres' (multi-label).
    features : list
        Names of the columns of `df` used as input features.
    multi_label : bool, optional
        If False (default), 'top_genre' is encoded as one integer per track.
        If True, 'genres' is encoded as a binary indicator matrix.

    Returns
    -------
    y_train, y_val, y_test, X_train, X_val, X_test
        Targets and standardized features for the training, validation and
        testing sets. The validation set is a random 20% (fixed seed) of the
        tracks flagged as training; the testing set holds all other tracks.
    """
    if multi_label:
        # Create an indicator matrix.
        enc = MultiLabelBinarizer()
        y = enc.fit_transform(df['genres'])
    else:
        # Assign an integer value to each genre.
        enc = LabelEncoder()
        y = enc.fit_transform(df['top_genre'])
    print('Genres ({}): {}'.format(len(enc.classes_), enc.classes_))

    # `.values` instead of the deprecated `as_matrix()`.
    X = df[features].values

    # Split in training, validation and testing sets.
    # A plain boolean array avoids indexing NumPy arrays with a pandas Series.
    train = (df['train'] == True).values
    y_train = y[train]
    y_test = y[~train]
    X_train = X[train]
    X_test = X[~train]
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=.2, random_state=42)
    print('{} training examples, {} validation examples, {} testing examples'.format(y_train.shape[0], y_val.shape[0], y_test.shape[0]))
    print('{} features'.format(X_train.shape[1]))

    # Standardize features by removing the mean and scaling to unit variance.
    # Statistics come from the training set only. The transformed arrays are
    # kept explicitly instead of relying on in-place modification, which is
    # silently skipped whenever the scaler has to convert its input first.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_val = scaler.transform(X_val)
    X_test = scaler.transform(X_test)

    return y_train, y_val, y_test, X_train, X_val, X_test

### 1.2 Single genre

Maximum observed is around 38%, both on `fma_small` and `fma_medium`.

In [5]:
# Single-label setting: the target is the top genre of each track.
y_train, y_val, y_test, X_train, X_val, X_test = pre_process(df, features)

# Candidate models, all trained on the same training split.
classifiers = [
    LogisticRegression(),
    KNeighborsClassifier(n_neighbors=200),
    SVC(),
    SVC(kernel="linear"),
    LinearSVC(),
    #GaussianProcessClassifier(1.0 * RBF(1.0), warm_start=True),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    AdaBoostClassifier(n_estimators=10),
    MLPClassifier(max_iter=400),
    GaussianNB(),
    QuadraticDiscriminantAnalysis(),
]

# For each model, report: test accuracy, CPU time for fit + score, class name.
for clf in classifiers:
    start = time.process_time()
    accuracy = clf.fit(X_train, y_train).score(X_test, y_test)
    elapsed = time.process_time() - start
    print('{:.2f}% {:.2f}s {}'.format(accuracy * 100, elapsed, type(clf).__name__))

Genres (10): ['Electronic' 'Folk' 'Hip-Hop' 'Indie-Rock' 'Jazz' 'Old-Time / Historic'
 'Pop' 'Psych-Rock' 'Punk' 'Rock']
2524 training examples, 631 validation examples, 788 testing examples
8 features
33.25% 0.04s LogisticRegression
31.98% 0.07s KNeighborsClassifier
36.04% 0.36s SVC
33.50% 0.19s SVC
33.25% 0.83s LinearSVC
34.14% 0.01s DecisionTreeClassifier
34.52% 0.02s RandomForestClassifier
31.09% 0.04s AdaBoostClassifier
36.29% 2.80s MLPClassifier
31.47% 0.03s GaussianNB
31.22% 0.05s QuadraticDiscriminantAnalysis


### 1.3 Multiple genres

Maximum observed is around 11% for `fma_small` and 7.6% for `fma_medium`.

TODO:
* Eliminate rare genres. On `fma_small`, only the 10 selected genres are meaningful.

In [6]:
# Multi-label setting: the target is a binary indicator matrix of all genres.
y_train, y_val, y_test, X_train, X_val, X_test = pre_process(df, features, multi_label=True)

# One binary classifier per genre.
classifiers = [
    #LogisticRegression(),
    OneVsRestClassifier(LogisticRegression()),
    OneVsRestClassifier(SVC()),
]

# For each model, report: test score, CPU time for fit + score, class name.
for clf in classifiers:
    start = time.process_time()
    accuracy = clf.fit(X_train, y_train).score(X_test, y_test)
    elapsed = time.process_time() - start
    print('{:.2f}% {:.2f}s {}'.format(accuracy * 100, elapsed, type(clf).__name__))

Genres (108): ['20th Century Classical' 'African' 'Afrobeat' 'Alternative Hip-Hop'
 'Americana' 'Asia-Far East' 'Balkan' 'Big Band/Swing' 'Bigbeat'
 'Bluegrass' 'Bollywood' 'Brazilian' 'Breakbeat' 'Breakcore - Hard'
 'British Folk' 'Chamber Music' 'Chill-out' 'Chip Music' 'Chiptune'
 'Classical' 'Composed Music' 'Country' 'Country & Western' 'Cumbia'
 'Dance' 'Disco' 'Downtempo' 'Drone' 'Dubstep' 'Easy Listening'
 'Easy Listening: Vocal' 'Electro-Punk' 'Electroacoustic' 'Electronic'
 'Europe' 'Flamenco' 'Folk' 'Freak-Folk' 'Free-Folk' 'Free-Jazz' 'French'
 'Funk' 'Gospel' 'Goth' 'Hardcore' 'Hip-Hop' 'Hip-Hop Beats' 'Holiday'
 'House' 'IDM' 'Improv' 'Indian' 'Indie-Rock' 'Industrial' 'Instrumental'
 'Interview' 'Jazz' 'Jazz: Out' 'Jazz: Vocal' 'Klezmer' 'Krautrock' 'Latin'
 'Latin America' 'Loud-Rock' 'Lounge' 'Metal' 'Middle East'
 'Minimal Electronic' 'Minimalism' 'Modern Jazz' 'Musique Concrete'
 'New Age' 'New Wave' 'No Wave' 'Nu-Jazz' 'Old-Time / Historic' 'Opera'
 'Polka' 'Pop' 'P

  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))


10.15% 0.47s OneVsRestClassifier


  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))


11.42% 2.08s OneVsRestClassifier


## 2 Deep learning on raw audio

In [7]:
# TODO: fix dataset.
# Shorten the 'Old-Time / Historic' genre name to 'Old-Time'.
df['top_genre'] = df['top_genre'].apply(lambda genre: 'Old-Time' if genre == 'Old-Time / Historic' else genre)

# TODO: fix dataset.
# Clips with less than 1321967 samples because lower sampling rate, or mono.
BAD_CLIPS = [16402, 16425, 16406, 16431, 33709, 16352, 16404, 33708, 31375, 33702, 22590, 22591, 16039,
             12856, 33716, 16426, 16422, 16421, 16405, 16427, 16401, 16038, 16424, 16429, 16351, 16428,
             16039, 33716, 33702, 31375, 16422, 16352, 22591, 16426, 16429, 16038, 16401, 12856, 16404,
             16402, 16428, 16425, 16405, 33708, 16424, 33709, 16427, 16431, 22590, 16351, 33714, 16421, 16406]
BAD_CLIPS.extend([11665, 12899, 12916, 12917, 16353, 16398, 16400, 16423, 16430, 18689, 18691])
# Most IDs above are listed twice: keep each one only once, in sorted order.
BAD_CLIPS = sorted(set(BAD_CLIPS))

# errors='ignore' keeps this cell re-runnable: on a second execution the clips
# are already gone and a strict drop would raise a KeyError.
df = df.drop(BAD_CLIPS, errors='ignore')
path = utils.build_path(df, DATA_DIR)

In [8]:
# One-hot encoding of the top genre, one row per track.
labels_onehot = LabelBinarizer().fit_transform(df['top_genre'])

# Positional (0-based) row indices of the training and testing tracks.
train = np.flatnonzero(df['train'] == True)
test = np.flatnonzero(df['train'] == False)

Load audio samples in parallel using `multiprocessing` so as to maximize CPU usage when decoding MP3s and performing optional pre-processing. There are multiple ways to load a waveform from a compressed MP3:
* librosa uses audioread in the backend which can use many native libraries, e.g. ffmpeg
    * resampling is very slow
    * does not work with multi-processing, for keras `fit_generator()`
* pydub is a high-level interface for audio modification, uses ffmpeg to load
    * store a temporary `.wav`
* directly pipe ffmpeg output
    * fastest method
* [pyAV](https://github.com/mikeboers/PyAV) may be a faster alternative by linking to ffmpeg libraries

In [9]:
# Smoke test, run in the main process first: multiprocessing is tricky to debug.
# 1. Decode a single clip.
utils.FfmpegLoader().load(path(0))
# 2. Build the batch generator and pull one batch of two training clips.
SampleLoader = utils.build_sample_loader(path, labels_onehot, utils.FfmpegLoader())
first_batch = next(SampleLoader(train, batch_size=2))
# Shape of the first element of the batch (displayed as the cell output).
first_batch[0].shape

(2, 1321967)

In [10]:
# Keras parameters.
# Number of usable CPUs. `os.sched_getaffinity` only exists on some Unix
# platforms; elsewhere fall back to the total CPU count (at least 1).
if hasattr(os, 'sched_getaffinity'):
    NB_WORKER = len(os.sched_getaffinity(0))
else:
    NB_WORKER = os.cpu_count() or 1
# Keyword arguments shared by all the *_generator() calls below.
params = {'pickle_safe': True, 'nb_worker': NB_WORKER, 'max_q_size': 10}

### 2.1 Fully connected neural network

* Two layers with 10 hidden units are no better than random, ~11%.

Optimize data loading to be CPU / GPU bound, not IO bound. Larger batches mean reduced training time, so increase the batch size until memory is exhausted. Number of workers and queue size have no influence on speed.

CPU
* batch 4, worker 8, queue 1, 600s
* batch 20, worker 24, queue 5, 190s
* batch 20, worker 12, queue 10, 185s
* batch 40, worker 12, queue 10, 135s
* batch 64, worker 12, queue 10, 110s
* batch 128, worker 12, queue 10, 100s

GPU Tesla K40c
* batch 4, worker 12, queue 10, 250s
* batch 16, worker 12, queue 10, 100s
* batch 32, worker 12, queue 10, 90s
* batch 64, worker 12, queue 10, 70s
* batch 96-128 --> memory error

In [11]:
# Raw audio, loaded at a sampling rate of 2000 to keep the input small.
loader = utils.FfmpegLoader(sampling_rate=2000)
SampleLoader = utils.build_sample_loader(path, labels_onehot, loader)
print('Dimensionality: {}'.format(loader.shape))

keras.backend.clear_session()

# Two hidden ReLU layers (1000 and 100 units) followed by a softmax over genres.
n_genres = labels_onehot.shape[1]
model = keras.models.Sequential([
    Dense(output_dim=1000, input_shape=loader.shape),
    Activation("relu"),
    Dense(output_dim=100),
    Activation("relu"),
    Dense(output_dim=n_genres),
    Activation("softmax"),
])

optimizer = keras.optimizers.SGD(lr=0.1, momentum=0.9, nesterov=True)
model.compile(optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

# Train for two epochs, then evaluate on the testing tracks.
train_batches = SampleLoader(train, batch_size=64)
model.fit_generator(train_batches, train.size, nb_epoch=2, **params)
test_batches = SampleLoader(test, batch_size=64)
loss = model.evaluate_generator(test_batches, test.size, **params)
#Y = model.predict_generator(SampleLoader(test, batch_size=64), test.size, **params);

# Displayed as the cell output.
loss

Dimensionality: (59953,)
Epoch 1/2
Epoch 2/2


[14.458574412872432, 0.10296010296969187]

### 2.2 Convolutional neural network

* Architecture from [End-to-end learning for music audio](http://www.mirlab.org/conference_papers/International_Conference/ICASSP%202014/papers/p7014-dieleman.pdf) by Sander Dieleman, Benjamin Schrauwen.
* Missing: track segmentation and majority voting

In [12]:
# 1-D ConvNet applied directly on the raw waveform, loaded at 16 kHz.
# The print() calls trace the output shape after each stage of the network.
loader = utils.FfmpegLoader(sampling_rate=16000)
#loader = utils.LibrosaLoader(sampling_rate=16000)
SampleLoader = utils.build_sample_loader(path, labels_onehot, loader)

keras.backend.clear_session()

model = keras.models.Sequential()
# Append a trailing axis of size 1 to each waveform.
model.add(Reshape((-1, 1), input_shape=loader.shape))
print(model.output_shape)

# First convolution: 128 filters of length 512 with subsample_length=512.
model.add(Conv1D(128, 512, subsample_length=512))
print(model.output_shape)
model.add(Activation("relu"))

# Two identical stages: 32 filters of length 8, ReLU, max-pooling by 4.
model.add(Conv1D(32, 8))
print(model.output_shape)
model.add(Activation("relu"))
model.add(MaxPooling1D(4))

model.add(Conv1D(32, 8))
print(model.output_shape)
model.add(Activation("relu"))
model.add(MaxPooling1D(4))

print(model.output_shape)
#model.add(Dropout(0.25))
# Classifier: flatten, 100 ReLU units, then a softmax with one unit per genre.
model.add(Flatten())
print(model.output_shape)
model.add(Dense(100))
model.add(Activation("relu"))
print(model.output_shape)
model.add(Dense(labels_onehot.shape[1]))
model.add(Activation("softmax"))
print(model.output_shape)

optimizer = keras.optimizers.SGD(lr=0.01, momentum=0.9, nesterov=True)
#optimizer = keras.optimizers.Adam()#lr=1e-5)#, momentum=0.9, nesterov=True)
model.compile(optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

# Train for two epochs on the training tracks, then evaluate on the testing
# tracks, both in batches of 10 clips.
model.fit_generator(SampleLoader(train, batch_size=10), train.size, nb_epoch=2, **params)
loss = model.evaluate_generator(SampleLoader(test, batch_size=10), test.size, **params)

# Displayed as the cell output.
loss

(None, 479625, 1)
(None, 936, 128)
(None, 929, 32)
(None, 225, 32)
(None, 56, 32)
(None, 1792)
(None, 100)
(None, 10)
Epoch 1/2
Epoch 2/2




[14.479318942802752, 0.10167310336074928]

### 2.3 Recurrent neural network

## 3 Deep learning on extracted audio features

### 3.1 ConvNet on MFCC

* Architecture from [Automatic Musical Pattern Feature Extraction Using Convolutional Neural Network](http://www.iaeng.org/publication/IMECS2010/IMECS2010_pp546-550.pdf) by Tom LH. Li, Antoni B. Chan and Andy HW. Chun
* Missing: track segmentation and majority voting.
* Best seen: 17.6%

In [13]:
class MfccLoader(utils.Loader):
    """Loader which returns the MFCCs of a clip instead of its waveform."""

    # Waveforms are decoded at 22050 Hz before the MFCCs are computed.
    raw_loader = utils.FfmpegLoader(sampling_rate=22050)
    #shape = (13, 190)  # For segmented tracks.
    # Shape of what load() returns for a full clip: (n_mfcc, frames).
    shape = (13, 2582)

    def load(self, filename):
        """Decode `filename` and return its matrix of 13 MFCCs per frame."""
        import librosa
        waveform = self.raw_loader.load(filename)
        # Each MFCC frame spans 23ms on the audio signal with 50% overlap with the adjacent frames.
        return librosa.feature.mfcc(waveform, sr=22050, n_mfcc=13, n_fft=512, hop_length=256)


loader = MfccLoader()
SampleLoader = utils.build_sample_loader(path, labels_onehot, loader)
# Check the loader on one clip: shape of its first MFCC coefficient.
first_clip_mfcc = loader.load(path(0))
first_clip_mfcc[0].shape

(2582,)

In [14]:
# 2-D ConvNet on the MFCC matrices produced by `loader`.
# The print() calls trace the output shape after each stage of the network.
keras.backend.clear_session()

model = keras.models.Sequential()
# Append a trailing axis of size 1 to each MFCC matrix.
model.add(Reshape((*loader.shape, 1),  input_shape=loader.shape))
print(model.output_shape)

# First convolution: 3 filters of size 13x10, i.e. as tall as the 13 MFCC
# rows of loader.shape, with subsample=(1, 4).
model.add(Conv2D(3, 13, 10, subsample=(1, 4)))
model.add(Activation("relu"))
print(model.output_shape)

# Two further convolutions with 1x10 filters (15, then 65 of them) and the
# same subsample=(1, 4).
model.add(Conv2D(15, 1, 10, subsample=(1, 4)))
model.add(Activation("relu"))
print(model.output_shape)

model.add(Conv2D(65, 1, 10, subsample=(1, 4)))
model.add(Activation("relu"))
print(model.output_shape)

# Classifier: flatten, then a softmax with one unit per genre.
model.add(Flatten())
print(model.output_shape)
model.add(Dense(labels_onehot.shape[1]))
model.add(Activation("softmax"))
print(model.output_shape)

optimizer = keras.optimizers.SGD(1e-3)#lr=0.01, momentum=0.9, nesterov=True)
#optimizer = keras.optimizers.Adam()#lr=1e-5)#
model.compile(optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

# Train for two epochs on the training tracks, then evaluate on the testing
# tracks, both in batches of 16 clips.
model.fit_generator(SampleLoader(train, batch_size=16), train.size, nb_epoch=2, **params)
loss = model.evaluate_generator(SampleLoader(test, batch_size=16), test.size, **params)
#Y = model.predict_generator(loader, test.size, pickle_safe=True, nb_worker=NB_WORKER, max_q_size=5)

# Displayed as the cell output.
loss

(None, 13, 2582, 1)
(None, 1, 644, 3)
(None, 1, 159, 15)
(None, 1, 38, 65)
(None, 2470)
(None, 10)
Epoch 1/2
Epoch 2/2


[2.3026296003137929, 0.10167310167310167]