In [None]:
import os
import warnings

# Keras picks its backend once, at the moment `keras` is first imported, so
# KERAS_BACKEND must be set before any import that may pull Keras in.
# Setting it after `from keras import ...` has no effect.
os.environ["KERAS_BACKEND"] = "jax"

import librosa
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import tensorflow as tf
from keras import models
from sklearn.metrics import confusion_matrix

# Silence library warnings to keep cell output readable.
warnings.filterwarnings("ignore")

%matplotlib inline
%config InlineBackend.figure_format='retina'

## Read the dataset


In [None]:
# Dataset root; it is expected to hold one sub-directory per entry of `classes`.
data_dir = "../Data/genres_original/"

# Genre names. A genre's position in this list is its integer class label,
# so the order must stay the same between training and evaluation.
classes = [
    "blues", "classical", "country", "disco", "hiphop",
    "jazz", "metal", "pop", "reggae", "rock",
]

In [None]:
def _mel_chunks_from_file(file_path, target_shape, chunk_duration, overlap_duration):
    """Load one audio file and return its resized mel-spectrogram chunks as a list."""
    # sr=None keeps the file's native sampling rate instead of resampling.
    audio_data, sample_rate = librosa.load(file_path, sr=None)
    if len(audio_data) == 0:
        return []

    chunk_samples = max(1, int(round(chunk_duration * sample_rate)))
    overlap_samples = int(round(overlap_duration * sample_rate))
    # Distance between the starts of two consecutive chunks (at least 1 sample).
    hop_samples = max(1, chunk_samples - overlap_samples)

    # Enough windows to reach the end of the signal; clips shorter than one
    # chunk still yield a single (padded) chunk instead of being dropped.
    num_chunks = max(
        1, int(np.ceil((len(audio_data) - chunk_samples) / hop_samples)) + 1
    )

    spectrograms = []
    for i in range(num_chunks):
        start = i * hop_samples
        chunk = audio_data[start : start + chunk_samples]
        if len(chunk) < chunk_samples:
            # Zero-pad the trailing chunk so every spectrogram covers the same
            # duration; otherwise the resize below would stretch a short chunk
            # along the time axis.
            chunk = np.pad(chunk, (0, chunk_samples - len(chunk)))

        mel_spectrogram = librosa.feature.melspectrogram(y=chunk, sr=sample_rate)
        # Add a trailing channel axis, then resize to the common target shape.
        resized = tf.image.resize(
            np.expand_dims(mel_spectrogram, axis=-1), target_shape
        )
        # Convert right away so the final stacking works on plain NumPy arrays.
        spectrograms.append(np.asarray(resized))
    return spectrograms


def load_and_preprocess_data(
    data_dir,
    classes,
    target_shape=(150, 150),
    chunk_duration=4,
    overlap_duration=2,
):
    """Turn a folder-per-class audio dataset into mel-spectrogram chunks.

    Every ``.wav`` file in ``data_dir/<class_name>`` is cut into overlapping
    windows. Each window becomes a mel spectrogram resized to ``target_shape``
    with a trailing channel axis.

    Args:
        data_dir: Directory containing one sub-directory per class.
        classes: Class (sub-directory) names; the index in this sequence is
            used as the integer label.
        target_shape: ``(height, width)`` every spectrogram is resized to.
        chunk_duration: Window length in seconds.
        overlap_duration: Overlap between consecutive windows in seconds;
            must satisfy ``0 <= overlap_duration < chunk_duration``.

    Returns:
        ``(data, labels)`` as NumPy arrays: the stacked spectrogram chunks and
        one integer class index per chunk.

    Raises:
        ValueError: If the chunk/overlap durations are inconsistent.

    Files that cannot be processed are reported on stdout and skipped
    entirely, so a failing file contributes no chunks at all.
    """
    if chunk_duration <= 0:
        raise ValueError("chunk_duration must be positive")
    if not 0 <= overlap_duration < chunk_duration:
        raise ValueError(
            "overlap_duration must be non-negative and smaller than chunk_duration"
        )

    data = []
    labels = []
    for i_class, class_name in enumerate(classes):
        class_dir = os.path.join(data_dir, class_name)
        print("Processing--", class_name)
        # os.listdir returns entries in arbitrary order; sorting makes the
        # sample order, and therefore any seeded split of it, reproducible.
        for file_name in sorted(os.listdir(class_dir)):
            if not file_name.endswith(".wav"):
                continue
            file_path = os.path.join(class_dir, file_name)
            try:
                file_chunks = _mel_chunks_from_file(
                    file_path, target_shape, chunk_duration, overlap_duration
                )
            except Exception as e:
                # Best effort: report the unreadable/corrupted file and move on.
                print(f"Error processing file {file_path}: {e}")
                continue
            data.extend(file_chunks)
            labels.extend([i_class] * len(file_chunks))

    return np.array(data), np.array(labels)

In [None]:
# Build the full dataset: spectrogram chunks plus one integer class index per
# chunk, for every .wav file found under each genre folder.
data, labels = load_and_preprocess_data(data_dir, classes)

In [None]:
# Sanity check: (n_chunks, height, width, 1); height and width are 150 with the
# loader's default target_shape.
data.shape

In [None]:
# One integer label per chunk, so this should match data.shape[0].
labels.shape

In [None]:
# Number of genres; used as the width of the one-hot labels and of the softmax output.
len(classes)

In [None]:
from keras import utils

# One-hot encode the integer class indices for categorical cross-entropy.
# The ndim guard keeps this cell idempotent: `labels` is overwritten with its
# own encoding, so re-running the cell without the guard would one-hot encode
# the already encoded array and add a spurious third axis.
if labels.ndim == 1:
    labels = utils.to_categorical(labels, num_classes=len(classes))

# Preview a few rows instead of echoing the whole array.
labels[:5]

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20 % of the chunks for testing; the fixed seed keeps the shuffle
# repeatable for a given sample order.
# NOTE(review): the split is made per chunk, so overlapping chunks cut from
# the same recording can end up on both sides of the split - confirm this is
# acceptable for the reported scores.
partitions = train_test_split(data, labels, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = partitions

In [None]:
# Shapes of the 80/20 train/test partitions (features first, then labels).
X_train.shape, X_test.shape, y_train.shape, y_test.shape

## Build the model


In [None]:
from keras import layers

# (filters, num_layers) for each convolutional block.
conv_layers = [
    (32, 2),
    (64, 2),
    (128, 2),
    (256, 2),
    (512, 2),
]

model = models.Sequential()

# Declare the input explicitly: Keras 3 discourages passing `input_shape` to
# the first layer of a Sequential model and recommends an Input object.
model.add(layers.Input(shape=X_train.shape[1:]))

# Stem: the first, "same"-padded convolution of block 1.
model.add(
    layers.Conv2D(
        filters=conv_layers[0][0],
        kernel_size=3,
        padding="same",
        activation="relu",
    )
)

# Convolutional blocks: `num_layers - 1` unpadded 3x3 convolutions, a 2x2
# max-pool that halves the spatial size, and dropout on the wider blocks.
# NOTE(review): only block 1 ends up with `num_layers` convolutions (the stem
# counts as its first); blocks 2-5 get a single convolution each - confirm
# this is the intended architecture.
for filters, num_layers in conv_layers:
    for _ in range(num_layers - 1):
        model.add(layers.Conv2D(filters=filters, kernel_size=3, activation="relu"))
    model.add(layers.MaxPool2D(pool_size=2, strides=2))
    if filters >= 128:
        model.add(layers.Dropout(0.3))

# Classifier head: flatten, one wide hidden layer with dropout, then a softmax
# over the genres.
model.add(layers.Flatten())
model.add(layers.Dense(units=1200, activation="relu"))
model.add(layers.Dropout(0.45))
model.add(layers.Dense(units=len(classes), activation="softmax"))


In [None]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

In [None]:
from keras import optimizers

# Adam with a fixed learning rate of 1e-4; categorical cross-entropy matches
# the one-hot labels and the softmax output layer.
adam = optimizers.Adam(learning_rate=0.0001)
model.compile(optimizer=adam, loss="categorical_crossentropy", metrics=["accuracy"])

In [None]:
from keras import callbacks

# Keep only the model from the epoch with the lowest validation loss.
checkpoint_path = "working/model_weights.keras"
checkpoint = callbacks.ModelCheckpoint(
    filepath=checkpoint_path,
    monitor="val_loss",
    mode="min",
    save_best_only=True,
)

In [None]:
# Validate on a slice of the training data rather than on the test set.
# The checkpoint callback selects the best epoch by `val_loss`; if that loss
# came from X_test, the test metrics reported later would be computed on the
# data used for model selection and would be optimistically biased.
# `validation_split` takes the last 10 % of X_train/y_train, which are already
# shuffled by the train/test split.
training_history = model.fit(
    X_train,
    y_train,
    epochs=30,
    batch_size=32,
    validation_split=0.1,
    callbacks=[checkpoint],
)

In [None]:
# Training vs. validation curves, one panel per metric.
history = training_history.history
fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(12, 4))

# Left panel: loss per epoch.
loss_ax.plot(history["loss"], label="Training Loss")
loss_ax.plot(history["val_loss"], label="Validation Loss")
loss_ax.set_xlabel("Epochs")
loss_ax.set_ylabel("Loss")
loss_ax.set_title("Loss Curve")
loss_ax.legend()

# Right panel: accuracy per epoch.
acc_ax.plot(history["accuracy"], label="Training Accuracy")
acc_ax.plot(history["val_accuracy"], label="Validation Accuracy")
acc_ax.set_xlabel("Epochs")
acc_ax.set_ylabel("Accuracy")
acc_ax.set_title("Accuracy Curve")
acc_ax.legend()

plt.show()

In [None]:
# Evaluate the best checkpoint (lowest validation loss), not the last epoch.
model = models.load_model("working/model_weights.keras")
y_pred = model.predict(X_test)

# Collapse softmax outputs and one-hot targets back to integer class indices.
y_pred_classes = np.argmax(y_pred, axis=1)
y_true_classes = np.argmax(y_test, axis=1)

cm = confusion_matrix(y_true_classes, y_pred_classes)

plt.figure(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=classes,
    yticklabels=classes,
)
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("Confusion Matrix")

# Save the plot; create the output folder first so a fresh checkout does not
# fail with FileNotFoundError after the whole training run.
figure_path = "../report/graphics/cnn_evaluation.pdf"
os.makedirs(os.path.dirname(figure_path), exist_ok=True)
plt.savefig(figure_path)
plt.show()

In [None]:
# calculate accuracy
# Overall accuracy: the fraction of test chunks whose predicted class matches
# the true class.
accuracy = np.mean(y_pred_classes == y_true_classes)
accuracy

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score

# Macro averaging: compute each metric per genre, then take the unweighted
# mean, so every genre counts equally.
precision = precision_score(y_true_classes, y_pred_classes, average="macro")
recall = recall_score(y_true_classes, y_pred_classes, average="macro")
f1 = f1_score(y_true_classes, y_pred_classes, average="macro")

for caption, score in (
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1-score:", f1),
):
    print(caption, score)