In [1]:
# Base
import librosa # alternativa pyAudioAnalysis ali audioFlux
import numpy as np
import os
import h5py
import time
import datetime
from scipy import signal
import matplotlib.pyplot as plt
import librosa

# Preprocessing, Metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# Keras, Classification
import keras
from keras import models
from keras import layers
from sklearn.svm import SVC
import tensorflow as tf
from keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.metrics import confusion_matrix
from keras.utils import to_categorical

ModuleNotFoundError: No module named 'h5py'

In [None]:
# Load a single example clip (only its first 5 seconds) to try out the feature extraction
fn = './genres/rock/rock.00000.wav'
sig, sr = librosa.load(fn, mono=True, duration=5)

# Try Mel - Parameters!
# Window and hop sizes are expressed in samples; both names are reused by the
# dataset-building cell, so they must stay defined at notebook level.
frame_length = int(0.010 * sr)  # 10 ms analysis window
frame_step = int(0.005 * sr)    # 5 ms hop between consecutive windows
mel_spec = librosa.feature.melspectrogram(
    y=sig, sr=sr, n_fft=512, hop_length=frame_step, win_length=frame_length,
    window='hann', n_mels=20, fmin=100, fmax=4000,
)

# Pass sr / hop_length and explicit axis types so the plot is labelled in seconds and Hz.
# NOTE(review): without y_axis='mel' the fmin / fmax arguments appear to be ignored by specshow.
img = librosa.display.specshow(
    librosa.power_to_db(mel_spec, ref=np.max),
    sr=sr, hop_length=frame_step, x_axis='time', y_axis='mel', fmin=100, fmax=4000,
)
plt.colorbar(img, format='%+2.0f dB')
plt.title('Mel spectrogram of the example clip')
plt.show()

# Shapes: first axis is the mel band, second axis is the time frame
print('Mel-Spectrogram:', np.shape(mel_spec))

In [None]:
# Parameters
# Genre names; a genre's position in this array is used as its integer class label.
genres = np.array([
    'pop', 'rock', 'classical', 'blues', 'country',
    'disco', 'metal', 'jazz', 'reggae', 'hiphop',
])
n_genres = 2  # Only two genres used (the first two entries); use len(genres) for all of them
n_genres_files = 100  # Number of rows pre-allocated per genre
filter_size = 20  # Size of the mel-filter axis of each stored spectrogram
n_windows = 1003  # For 5 second signal chunk - number of timestamps

In [None]:
# Feature tensor: [NumberOfSignalParts, NumberOfFilters, NumberOfWindows]
data = np.zeros((n_genres * n_genres_files, filter_size, n_windows))

# Labels: [NumberOfSignalParts, 1]
data_labels = np.zeros((n_genres * n_genres_files, 1))

# Dataset - Will take some time to generate
data_index = 0  # next free row; only advanced after a file was stored successfully
for i_genre in range(n_genres):
    print(f"genre: {genres[i_genre]}")
    genre_dir = f'./genres/{genres[i_genre]}'

    # sorted() makes the row order independent of the file system's listing order
    for filename in sorted(os.listdir(genre_dir)):
        fn = f'{genre_dir}/{filename}'

        # There is one problematic file - format problem (can try ffmpeg decoder)
        try:
            # Load file (sig-signal; sr-sampling rate)
            # For demo we will only use first 5 seconds of audio
            # Change this!
            # Be careful - the size of training data defines later usage
            sig, sr = librosa.load(fn, mono=True, duration=5)

            # frame_step / frame_length come from the single-file demo cell
            mel_spec = librosa.feature.melspectrogram(
                y=sig, sr=sr, n_fft=512, hop_length=frame_step, win_length=frame_length,
                window='hann', n_mels=filter_size, fmin=100, fmax=4000,
            )

            # Features - Data (raises if the clip yields a different number of frames,
            # or if there are more files than pre-allocated rows)
            data[data_index] = mel_spec

            # Genre - Label
            data_labels[data_index] = i_genre
        except Exception as exc:
            # Keep going (best effort), but report which file was skipped and why
            print(f"error: skipping {fn} ({type(exc).__name__}: {exc})")
            continue

        data_index += 1

# Drop pre-allocated rows that were never filled because a file was skipped;
# otherwise they would be saved as all-zero spectrograms carrying label 0.
data = data[:data_index]
data_labels = data_labels[:data_index]
print(f"stored {data_index} clips")

# Save to h5 file (the context manager closes the file even if writing fails)
with h5py.File('dataset_mel.h5', 'w') as hf:
    hf.create_dataset('data', data=data)
    hf.create_dataset('data_labels', data=data_labels)

In [None]:
# Load dataset from h5 file
# The context manager closes the file even if reading fails; indexing with [:]
# copies each dataset into an in-memory NumPy array before the file is closed.
# A missing dataset raises KeyError here instead of silently yielding None.
with h5py.File('dataset_mel.h5', 'r') as hf:
    data = hf['data'][:]
    data_labels = hf['data_labels'][:]

print('Data size:', np.shape(data))
print('Data_labels size:', np.shape(data_labels))

# Sanity check: show the first stored spectrogram in dB
img = librosa.display.specshow(librosa.power_to_db(data[0], ref=np.max), fmin=100, fmax=4000)

In [None]:
# Split first, normalize afterwards: the scaler is fitted on the training samples only,
# so no statistics from the validation / test samples leak into the features.
SPLIT_SEED = 42  # fixed so that Restart & Run All reproduces the same split


def _as_rows(batch):
    """Flatten a (samples, filters, windows) array to 2-D float rows of `windows` columns."""
    return np.reshape(batch, (batch.shape[0] * batch.shape[1], batch.shape[2])).astype(float)


def _print_split_shapes():
    """Print the current shape of every split array (reads the notebook-level names)."""
    print('Train X:', np.shape(X_train))
    print('Train Y:', np.shape(y_train))
    print('Test X:', np.shape(X_test))
    print('Test Y:', np.shape(y_test))
    print('Val X:', np.shape(X_val))
    print('Val Y:', np.shape(y_val))


sample_idx = np.arange(data.shape[0])

# Split into test and train
# Why stratify=data_labels?
# Check the histograms, try removing stratify
idx_trainval, idx_test = train_test_split(
    sample_idx, test_size=0.2, stratify=data_labels, random_state=SPLIT_SEED)

# Split into train and valid (0.25 of the remaining 80 % -> 60 / 20 / 20 overall)
# Why stratify=y_train?
# Check the histograms, try removing stratify
idx_train, idx_val = train_test_split(
    idx_trainval, test_size=0.25, stratify=data_labels[idx_trainval], random_state=SPLIT_SEED)

assert len(idx_train) + len(idx_val) + len(idx_test) == data.shape[0]

# Normalize: fit on training rows only, then apply the same transform to every sample
scaler = StandardScaler()
scaler.fit(_as_rows(data[idx_train]))

# X is the whole standardized dataset; kept under its original name for any cell that uses it
X = np.reshape(scaler.transform(_as_rows(data)), data.shape)

X_train, X_val, X_test = X[idx_train], X[idx_val], X[idx_test]
y_train, y_val, y_test = data_labels[idx_train], data_labels[idx_val], data_labels[idx_test]

# Sizes
print('Original dims')
_print_split_shapes()

# Why correction?
# A trailing axis of length 1 is appended to every feature array
print('Corrected dims')
X_train = np.expand_dims(X_train, 3)
X_test = np.expand_dims(X_test, 3)
X_val = np.expand_dims(X_val, 3)

_print_split_shapes()

# Label histograms, one figure per split
for split_name, split_labels in (('train', y_train), ('test', y_test), ('validation', y_val)):
    plt.hist(split_labels, bins=n_genres, rwidth=0.7)
    plt.title(f'Label distribution - {split_name}')
    plt.xlabel('Genre label')
    plt.ylabel('Number of samples')
    plt.show()

In [None]:
# Fix the model - add extra layers, change the number of neurons, number of filters, etc...

# NN model, declared as one ordered list of layers
model = models.Sequential([
    # Input shape is taken from one training sample (everything but the batch axis)
    layers.Input(X_train.shape[1:]),

    # Convolution block
    layers.Conv2D(filters=32, kernel_size=(5, 5), activation='relu'),
    layers.MaxPool2D(pool_size=(2, 2)),
    layers.Dropout(rate=0.25),

    # Dense head on the flattened feature maps
    layers.Flatten(),
    layers.Dense(256, activation='relu'),
    layers.Dropout(rate=0.5),

    # One output unit per genre; no activation is specified on this layer
    layers.Dense(n_genres),
])

In [None]:
# Optimizer, loss and metric all come from the same `keras` package that built the
# model, instead of mixing `keras.*` and `tf.keras.*` objects.
opt = keras.optimizers.Adam(learning_rate=0.0001)

# Computes the crossentropy loss between the labels and predictions.
# from_logits=True: the loss is told that the model outputs are unnormalized scores.
loss = keras.losses.SparseCategoricalCrossentropy(from_logits=True)

# Calculates how often predictions match integer labels
metr = keras.metrics.SparseCategoricalAccuracy()

model.compile(optimizer=opt, loss=loss, metrics=[metr])

model.summary()

In [None]:
# Training hyper-parameters
t_epochs = 10  # Needs to be tuned
b_size = 32  # Needs to be tuned as well - What is batch_size?

# Callbacks used during fitting:
#  - early_stopping: stopping criterion to avoid overfitting; `patience` is the number of
#    epochs with no improvement of the monitored value after which training is stopped
#  - model_checkpoint: writes weights only, keeping just the best ones seen so far
early_stopping = EarlyStopping(monitor='val_loss', patience=5)
model_checkpoint = ModelCheckpoint("cnn.weights.h5", save_best_only=True, save_weights_only=True)
training_callbacks = [early_stopping, model_checkpoint]

# Train
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=t_epochs,
    batch_size=b_size,
    callbacks=training_callbacks,
)

# Load best weights back from the checkpoint file
model.load_weights("cnn.weights.h5")

In [None]:
# Let's observe the loss on both the training and the validation set
# What do we notice?
plt.figure()
plt.plot(history.history['loss'], label='training')
plt.plot(history.history['val_loss'], label='validation')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training vs. validation loss')
plt.legend()  # identify the curves explicitly rather than by default line colour
plt.show()

In [None]:
# Accuracy per epoch on the training and the validation set
plt.figure()
plt.plot(history.history['sparse_categorical_accuracy'], label='training')
plt.plot(history.history['val_sparse_categorical_accuracy'], label='validation')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.title('Training vs. validation accuracy')
plt.legend()  # identify the curves explicitly rather than by default line colour
plt.show()

In [None]:
# Now to evaluate our model on the train, test and validation data

# One (report format, features, labels) entry per split, evaluated in this order
for report_fmt, split_features, split_labels in (
    ('Acc train NN: %.3f', X_train, y_train),  # Train NN
    ('Acc test NN: %.3f', X_test, y_test),     # Test NN
    ('Acc val NN: %.3f', X_val, y_val),        # Val NN
):
    # evaluate() returns the loss followed by the compiled metric
    loss, acc = model.evaluate(split_features, split_labels, verbose=0)
    print(report_fmt % acc)

In [None]:
# Test NN
# Predictions for additional analysis
predictions = model.predict(X_test)

# Confusion matrix
# The predicted class is the index of the largest output score of each sample
predicted_labels = np.argmax(predictions, axis=1)

# Labels were stored as a float column vector; flatten and cast them to integer class ids
true_labels = np.ravel(y_test).astype(int)

# labels= fixes the matrix to n_genres x n_genres even if a class never occurs,
# so it always lines up with the tick labels below.
class_ids = np.arange(n_genres)
conf = confusion_matrix(true_labels, predicted_labels, labels=class_ids, normalize="pred") # Normalize pred! Explain why?

# Visualise confusion matrix
class_names = genres[:n_genres]  # follows n_genres instead of a hard-coded slice
plt.figure()
plt.imshow(conf)
plt.ylabel("Actual")
plt.xlabel("Predicted")
plt.yticks(class_ids, class_names)
plt.xticks(class_ids, class_names, rotation='vertical')
plt.colorbar()
plt.show()