# Music genre classification

Music information retrieval is the interdisciplinary science of retrieving information from music.
Because deep learning has recently become so popular, a lot of related research is available on the internet.

I will use Keras for this project.

The first part was processing the audio files into a database and separating them into training data and test data.
The GTZAN genre data set has 10 genres; each genre has 100 songs, and each song lasts 30 seconds.
The basic idea of the processing is to transform the audio into a frequency spectrum and to use Mel Frequency Cepstral Coefficients, which were used in lab07. These processing steps are already available in the Python code that I include.


In [1]:
import os
device = 'cpu'
import argparse
import csv
import datetime
import glob
import math
import sys
import time
import numpy as np
import pandas as pd # Pandas for reading CSV files and easier Data handling in preparation
from os.path import join

from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn import __version__ as sklearn_version

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit

import keras
from keras.models import Sequential, Model
from keras.layers import Input, Conv2D, MaxPooling2D, Dense, Dropout, Activation, Flatten, merge
from keras.layers.normalization import BatchNormalization
from keras.optimizers import SGD

# Plotting

import matplotlib.pyplot as plt
#matplotlib inline 

# Local imports
import rp_extract as rp
from audiofile_read import audiofile_read
# Root folder of the audio collection -- SET YOUR OWN PATH HERE
AUDIO_PATH = 'data'

# The file list has no header row: its first column becomes the index and
# holds the audio file names, column 1 holds the class of each file.
csv_file = os.path.join(AUDIO_PATH, 'filelist_GTZAN_mp3_wclasses.txt')
metadata = pd.read_csv(csv_file, header=None, index_col=0)

# parallel lists: one file name and one class label per audio file
filelist = metadata.index.tolist()
classes = metadata[1].tolist()
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

# Map every class label to an integer id (fit and transform in one step)
labelencoder = LabelEncoder()
classes_num = labelencoder.fit_transform(classes)

# number of distinct classes found in the labels
n_classes = len(labelencoder.classes_)

# OneHotEncoder needs a 2-D column rather than a flat vector: (N,) -> (N, 1)
classes_num_col = classes_num[:, np.newaxis]

# Dense (non-sparse) one-hot matrix with one row per audio file
encoder = OneHotEncoder(sparse=False)
encoder.fit(classes_num_col)
classes_num_1hot = encoder.transform(classes_num_col)

list_spectrograms = [] # spectrograms are put into a list first

# desired output parameters
n_mel_bands = 40   # y axis
frames = 80        # x axis

# some FFT parameters
fft_window_size = 1024 #512
fft_overlap = 0.5
hop_size = int(fft_window_size * (1 - fft_overlap))
# number of samples needed to obtain exactly `frames` FFT frames
segment_size = fft_window_size + (frames - 1) * hop_size

# Lower bound applied before the log transform: log10(0) is -inf, and a single
# -inf turns the whole feature into NaN once the data is standardised.
log_floor = np.finfo(float).tiny

for filename in filelist:
    filepath = os.path.join(AUDIO_PATH, filename)
    samplerate, samplewidth, wavedata = audiofile_read(filepath, verbose=False)
    sample_length = wavedata.shape[0]

    # A file shorter than one segment would yield a negative start position
    # below and silently slice the wrong part of the signal.
    if sample_length < segment_size:
        raise ValueError(
            "%s has only %d samples, but %d are needed for %d frames"
            % (filepath, sample_length, segment_size, frames))

    # make Mono (in case of multiple channels / stereo)
    if wavedata.ndim > 1:
        wavedata = np.mean(wavedata, 1)

    # cut the segment out of the middle of the file
    pos = int(sample_length / 2 - segment_size / 2)
    wav_segment = wavedata[pos:pos+segment_size]

    # AUDIO PRE-PROCESSING

    # 1) FFT spectrogram
    spectrogram = rp.calc_spectrogram(wav_segment, fft_window_size, fft_overlap)

    # 2) Transform to perceptual Mel scale (uses librosa.filters.mel)
    spectrogram = rp.transform2mel(spectrogram, samplerate, fft_window_size, n_mel_bands)

    # 3) Log 10 transform (clamped to log_floor so that silent bins stay finite)
    spectrogram = np.log10(np.maximum(spectrogram, log_floor))

    list_spectrograms.append(spectrogram)

print ("\nRead", len(filelist), "audio files")

Using TensorFlow backend.



Read 1000 audio files


Here I made some changes to the code to improve the result. I assume that the human ear has trouble hearing very low bass sounds, so these sounds are less relevant for classifying a music genre. Their weights should therefore be small, and if the data set is not large enough for the network to learn to neglect them, they would have a negative effect on the result.
From the results, I found that the lowest 2 frequency bands reduce the accuracy, so I drop them (by setting them to zero).

In [266]:
# how many Mel bands (rows), counted from index 0, are blanked out
voidmel = 2

# overwrite the first `voidmel` rows of every spectrogram with zeros, in place
for spec in list_spectrograms:
    spec[:voidmel, :] = 0


I separate the data into training data and test data. For each genre, there are 75 training samples and 25 test samples.

In [267]:
from collections import Counter

# Stack all spectrograms into one array and flatten each of them into a
# feature vector, as required by the scaler.
data = np.array(list_spectrograms)
N, ydim, xdim = data.shape
data = data.reshape(N, xdim*ydim)

# Stratified split: every class keeps the same train/test proportion.
# The split is computed BEFORE scaling so that the scaler can be fitted on the
# training samples only.
testset_size = 0.25
splitter = StratifiedShuffleSplit(n_splits=1, test_size=testset_size, random_state=0)
splits = splitter.split(data, classes_num)
train_index, test_index = next(splits)  # n_splits=1, so there is exactly one split

# Standardise with statistics of the training samples only; fitting on the
# full data set would leak information about the test set into training.
scaler = preprocessing.StandardScaler()
scaler.fit(data[train_index])
data = scaler.transform(data)

# split the data
train_set = data[train_index]
test_set = data[test_index]

# and the numeric classes (groundtruth)
train_classes = classes_num[train_index]
train_classes_1hot = classes_num_1hot[train_index]  # 1 hot we need for training
test_classes = classes_num[test_index]

# number of training samples per class
cnt = Counter(train_classes)

n_channels = 1 # 1 for grey-scale, 3 for RGB (in this case usually already present in the data)

# restore the 2-D spectrogram layout and append the channel axis
train_set = train_set.reshape(train_set.shape[0], ydim, xdim, n_channels)
test_set = test_set.reshape(test_set.shape[0], ydim, xdim, n_channels)
input_shape = train_set.shape[1:]  # shape of one sample, without the sample axis

For the hidden layer, I chose the rectified linear unit (ReLU) as my activation function; compared with other activation functions, I found that this one works best.

In [268]:
n_filters = 16  # e.g. 16 or 32
dropout = 0.25  # None or 0 < dropout < 1

# Input only specifies the input shape
input = Input(input_shape)

# CNN layers: two parallel convolution pipelines on the same input.
# The kernel size is passed as an explicit (rows, columns) tuple; with the
# legacy positional form Conv2D(n, 10, 4) newer Keras versions read the third
# argument as the stride instead of the kernel width.

# The functional API allows to specify the predecessor in (brackets) after the new Layer function call
conv_layer1 = Conv2D(n_filters, (10, 4), activation='relu')(input)  # a vertical filter
conv_layer2 = Conv2D(n_filters, (4, 10), activation='relu')(input)  # a horizontal filter

# LARGER Pooling layers - complementary to vertical/horizontal filter
maxpool1 = MaxPooling2D(pool_size=(1, 5))(conv_layer1)
maxpool2 = MaxPooling2D(pool_size=(5, 1))(conv_layer2)  # used 4,1 first

# Dropout for both pipelines (skipped when dropout is None or 0)
if dropout:
    maxpool1 = Dropout(dropout)(maxpool1)
    maxpool2 = Dropout(dropout)(maxpool2)

# we have to flatten the Pooling output in order to be concatenated
poolflat1 = Flatten()(maxpool1)
poolflat2 = Flatten()(maxpool2)

# Merge the 2 parallel pipelines; concatenate() is the Keras 2 replacement for
# the deprecated merge(..., mode='concat')
merged = keras.layers.concatenate([poolflat1, poolflat2])

# fully connected layer followed by the softmax classification layer
full = Dense(256, activation='sigmoid')(merged)
output_layer = Dense(n_classes, activation='softmax')(full)

# finally create the model (Keras 2 keyword names: inputs / outputs)
model = Model(inputs=input, outputs=output_layer)

  # Remove the CWD from sys.path while we load stuff.
  # This is added back by InteractiveShellApp.init_path()
  name=name)


Stochastic gradient descent (SGD) could also produce a great result, but I found Adam was much faster and had slightly higher accuracy.

In [269]:
from keras import optimizers

# Loss function used for training ('binary_crossentropy' was the alternative
# tried here)
loss = 'categorical_crossentropy'

# Optimizer actually used: Adam, with every hyper-parameter spelled out
opt = optimizers.Adam(
    lr=0.001,
    beta_1=0.9,
    beta_2=0.999,
    epsilon=1e-08,
    decay=0.0,
)
# Alternative optimizer (plain SGD, selected by name); not passed to compile()
opt2 = 'sgd'

# Compiling the model
model.compile(optimizer=opt, loss=loss, metrics=['accuracy'])

In [270]:
# Train on the training spectrograms with their one-hot labels and keep the
# value returned by fit() in `History`
History = model.fit(
    x=train_set,
    y=train_classes_1hot,
    epochs=15,
    batch_size=32,
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [271]:
# The network outputs one score per class; the index of the highest score is
# the predicted class id of each test sample.
test_pred = np.argmax(model.predict(test_set), axis=1)

# fraction of test samples whose predicted class matches the groundtruth
accuracy_score(y_true=test_classes, y_pred=test_pred)

0.55200000000000005

The result accuracy is around 53% to 55%.