In [1]:
# import comet_ml at the top of your file
from comet_ml import Experiment
experiment = Experiment(
            project_name="music-genre-multiclass-classification",
            workspace="wodenwang820118",
        )
import comet_ml
import logging

COMET INFO: Experiment is live on comet.ml https://www.comet.ml/wodenwang820118/music-genre-multiclass-classification/01254c17d1b74352b5c30297e7e02aaa



In [2]:
logging.basicConfig(level=logging.INFO)
LOGGER = logging.getLogger("comet_ml")

In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from tensorflow import keras
from keras.utils.np_utils import to_categorical

%matplotlib inline
sns.set_style('whitegrid')

INFO:numexpr.utils:Note: NumExpr detected 12 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
INFO:numexpr.utils:NumExpr defaulting to 8 threads.


In [4]:
# tensorflow 2.7 
import tensorflow as tf 
from tensorflow.keras.layers import Input,Flatten,Dense,Dropout,BatchNormalization,Conv2D,MaxPooling2D,Conv1D,MaxPooling1D
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras import layers,activations

In [5]:
# scale the numeric data
from sklearn.preprocessing import StandardScaler,RobustScaler,MinMaxScaler
scaler = StandardScaler()

In [7]:
df_train_features = pd.read_csv('data/train_features.csv')
df_train_labels = pd.read_csv('data/train_labels.csv')

df_valid_features = pd.read_csv('data/valid_features.csv')
df_valid_labels = pd.read_csv('data/valid_labels.csv')

df_test_features = pd.read_csv('data/test_features.csv')
df_test_labels = pd.read_csv('data/test_labels.csv')

In [8]:
num_train_data = df_train_features.iloc[:,9:]
num_valid_data = df_valid_features.iloc[:,9:]
num_test_data = df_test_features.iloc[:,9:]

In [10]:
num_train = num_train_data.astype('float64')
num_valid = num_valid_data.astype('float64')
num_test = num_test_data.astype('float64')

In [11]:
num_train = scaler.fit_transform(num_train_data)
num_valid = scaler.fit_transform(num_valid_data)
num_test = scaler.fit_transform(num_test_data)

In [12]:
num_train.shape[0], num_train.shape[1]

(7678, 148)

In [13]:
num_train = num_train.reshape(num_train.shape[0],148,1)
num_valid = num_valid.reshape(num_valid.shape[0],148,1)
num_test = num_test.reshape(num_test.shape[0],148,1)

In [14]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()

In [15]:
num_train_y = to_categorical(encoder.fit_transform(df_train_labels['genre']),num_classes=8)
num_valid_y = to_categorical(encoder.fit_transform(df_valid_labels['genre']),num_classes=8)

In [16]:
from sklearn.utils import class_weight
class_weight = class_weight.compute_class_weight(class_weight='balanced', classes=np.unique(df_train_labels['genre']),y=df_train_labels['genre'])
class_weight = {i :class_weight[i] for i in range(8)}

In [17]:
from tensorflow.keras.callbacks import EarlyStopping
early_stop = EarlyStopping(monitor='val_loss',patience=5,restore_best_weights=True)
reduce_lr = keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.7, patience=2, verbose=1, mode='auto', min_delta=0.00001, cooldown=0, min_lr=0.0001)

In [18]:
class AudioGenreClassifier:
    def __init__(self,num_train,num_train_y,num_valid,num_valid_y,class_weight,early_stop,experiment):
        self.num_train = num_train
        self.num_train_y = num_train_y
        self.num_valid = num_valid
        self.num_valid_y = num_valid_y
        self.class_weight = class_weight
        self.early_stop = early_stop
        self.experiment = experiment
    
    def build_model(self):
        # Build the model
        # kernel size: https://stats.stackexchange.com/questions/296679/what-does-kernel-size-mean/339265
        # strides: https://stackoverflow.com/questions/51542442/what-is-the-default-stride-length-in-keras-conv1d
        # filters: output dimensions -> from small to the larger number to gradually grab the abstraction features and details
        model = Sequential()
        model.add(
        Conv1D(
          filters=self.experiment.get_parameter("first_layer_filters"),
          kernel_size=self.experiment.get_parameter("first_layer_kernels"),
          strides=1,
          input_shape=(self.num_train.shape[1], self.num_train.shape[2]),
          activation='relu',
          padding='same'
          )
        )
        model.add(MaxPooling1D())
        model.add(BatchNormalization())
       
        model.add(Flatten())

        model.add(
            Dense(
                units=self.experiment.get_parameter("first_layer_dense_units"),
                )
            )
        model.add(layers.Activation(activations.elu))
        model.add(Dropout(self.experiment.get_parameter("first_layer_dropout_units")))
        model.add(BatchNormalization())

        model.add(
            Dense(
                units=self.experiment.get_parameter("second_layer_dense_units"),
                )
            )
        model.add(layers.Activation(activations.elu))
        model.add(Dropout(self.experiment.get_parameter("second_layer_dropout_units")))
        model.add(BatchNormalization())

        model.add(
            Dense(
                units=self.experiment.get_parameter("third_layer_dense_units"),
                )
            )
        model.add(layers.Activation(activations.elu))
        model.add(Dropout(self.experiment.get_parameter("third_layer_dropout_units")))
        model.add(BatchNormalization())

        model.add(Dense(8,activation='softmax'))
        optimizer = keras.optimizers.Adam(0.001)
        model.compile(
            optimizer=optimizer,
            loss='categorical_crossentropy',
            metrics=['accuracy']
        )
        return model
    
    def train_model(self):
        # Train the model
        model = self.build_model()
        model.fit(
            self.num_train,
            self.num_train_y,
            batch_size=self.experiment.get_parameter("batch_size"),
            epochs=self.experiment.get_parameter("epochs"),
            validation_data=(self.num_valid,self.num_valid_y),
            shuffle=True,
            class_weight=self.class_weight,
            callbacks=[self.early_stop,reduce_lr]
        )
        return model
    
    def evaluate_model(self):
        # Evaluate the model
        model = self.train_model()
        score = model.evaluate(self.num_valid,self.num_valid_y)
        LOGGER.info(f"{ score }")
    
    def grid_search(self, config_dict):
        opt = comet_ml.Optimizer(config_dict)
        for self.experiment in opt.get_experiments(project_name="music-genre-multiclass-classification"):

            self.build_model()
            self.train_model()
            self.evaluate_model()
            self.experiment.end()

In [19]:
audio_model = AudioGenreClassifier(num_train,num_train_y,num_valid,num_valid_y,class_weight,early_stop,experiment)
# mu is the mean number of units, sigma is the standard deviation
audio_model.grid_search({
    "algorithm": "bayes",
    "name": "Optimize Music Classification Network",
    "spec": {"maxCombo": 10, "objective": "minimize", "metric": "loss"},
    "parameters": {
        "first_layer_filters": {"type": "discrete", "values": [2,4,7,8,9,10,16]},
        "first_layer_kernels": {"type": "discrete", "values": [1,2,3,4]},
        "first_layer_dense_units": {"type": "discrete", "values":[700,800,900,1000,1100,1200,1300]},
        "first_layer_dropout_units": {"type": "discrete", "values":[0.4,0.5,0.6,0.7,0.8]},
        "second_layer_dense_units": {"type": "discrete", "values":[300,400,500,600]},
        "second_layer_dropout_units": {"type": "discrete", "values":[0,0.1,0.2,0.3,0.4]},
        "third_layer_dense_units": {"type": "discrete", "values":[50,60,70,80,90,100,200]},
        "third_layer_dropout_units": {"type": "discrete", "values":[0,0.1,0.2,0.3,0.4]},
        "batch_size": {"type": "discrete", "values": [16,32]},
        "epochs": {"type": "discrete", "values": [25]},
    },
    "trials": 1,
})

COMET INFO: COMET_OPTIMIZER_ID=892192d979524b1f8e391aec3f2457e3
COMET INFO: Using optimizer config: {'algorithm': 'bayes', 'configSpaceSize': 1372000, 'endTime': None, 'id': '892192d979524b1f8e391aec3f2457e3', 'lastUpdateTime': None, 'maxCombo': 10, 'name': 'Optimize Music Classification Network', 'parameters': {'batch_size': {'type': 'discrete', 'values': [16, 32]}, 'epochs': {'type': 'discrete', 'values': [25]}, 'first_layer_dense_units': {'type': 'discrete', 'values': [700, 800, 900, 1000, 1100, 1200, 1300]}, 'first_layer_dropout_units': {'type': 'discrete', 'values': [0.4, 0.5, 0.6, 0.7, 0.8]}, 'first_layer_filters': {'type': 'discrete', 'values': [2, 4, 7, 8, 9, 10, 16]}, 'first_layer_kernels': {'type': 'discrete', 'values': [1, 2, 3, 4]}, 'second_layer_dense_units': {'type': 'discrete', 'values': [300, 400, 500, 600]}, 'second_layer_dropout_units': {'type': 'discrete', 'values': [0, 0.1, 0.2, 0.3, 0.4]}, 'third_layer_dense_units': {'type': 'discrete', 'values': [50, 60, 70, 80, 9

Epoch 1/25


COMET INFO: ignoring tensorflow summary log of metrics because of keras; set `comet_ml.loggers.tensorboard_logger.LOG_METRICS = True` to override


In [None]:
num_train.shape

In [None]:
model = Sequential()
model.add(
Conv1D(
  filters=9,
  kernel_size=4,
  strides=1,
  input_shape=(num_train.shape[1], num_train.shape[2]),
  activation='elu',
  padding='same'
  )
)
model.add(MaxPooling1D())
model.add(BatchNormalization())

model.add(Flatten())

model.add(
    Dense(
        units=1100,
        )
    )
model.add(layers.Activation(activations.elu))
model.add(Dropout(0.6))
model.add(BatchNormalization())

model.add(
    Dense(
        units=300,
        )
    )
model.add(layers.Activation(activations.elu))
model.add(Dropout(0))
model.add(BatchNormalization())

model.add(
    Dense(
        units=70,
        )
    )
model.add(layers.Activation(activations.elu))
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(Dense(8,activation='softmax'))
optimizer = keras.optimizers.Adam(0.001)
model.compile(
    optimizer=optimizer,
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

In [None]:
model.fit(
    num_train,
    num_train_y,
    batch_size=32,
    epochs=50,
    validation_data=(num_valid,num_valid_y),
    class_weight=class_weight,
    callbacks=[early_stop,reduce_lr],shuffle=True
)

In [None]:
from sklearn.metrics import classification_report
validation = model.predict(num_valid)
validation = validation.argmax(axis=1)
classes_mapping = {
    0: 'classic pop and rock',
    1: 'dance and electronica',
    2: 'folk',
    3: 'jazz and blues',
    4: 'metal',
    5: 'pop',
    6: 'punk',
    7: 'soul and reggae',
}
predict_label_array = np.vectorize(classes_mapping.get)(validation)
correct_labels = df_valid_labels['genre'].values
print(classification_report(correct_labels,predict_label_array))

In [None]:
validation = model.predict(num_test)
validation = validation.argmax(axis=1)
classes_mapping = {
    0: 'classic pop and rock',
    1: 'dance and electronica',
    2: 'folk',
    3: 'jazz and blues',
    4: 'metal',
    5: 'pop',
    6: 'punk',
    7: 'soul and reggae',
}
predict_label_array = np.vectorize(classes_mapping.get)(validation)
correct_labels = df_test_labels['genre'].values
print(classification_report(correct_labels,predict_label_array))

In [None]:
# CNN
## Scaler comparison (based on the model valiation loss is not increasing)
### RobustScaler -> validation | test accurancy: 0.61 | 0.36
### MinMaxScaler -> validation | test accurancy: 0.29 | 0.18

### StandardScaler -> validation | test accurancy: 0.55 | 0.39
#### Optimized CNN
## Trial 1
# - float128
# - filters: 9
# - kernels: 4
# - strides: 1 -> for sure to use
# - MaxPooling1D: yes -> for sure to use
# - 1100,0.6 ; 300,0 ; 70,0.2 (3 hidden dense layers)
# - Conv1D: relu; Dense all relu
# - validation | test accurancy: 0.55 | 0.40

## Trial 2
# - float128
# - filters: 9
# - kernels: 4
# - strides: 1 -> for sure to use
# - MaxPooling1D: yes -> for sure to use
# - 1100,0.6 ; 300,0 ; 70,0.2 (3 hidden dense layers)
# - Conv1D: elu; Dense all elu
# - validation | test accurancy: 0.53 | 0.45 (the best, but could vary due to random state)

### Overall, the accuracy is around 0.39-0.45. The best trial uses all elu activation functions.