In [1]:
import os

import librosa
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.image import resize
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import Conv2D, Dense, Dropout, Flatten, MaxPool2D
from tensorflow.keras.optimizers import Adam


## Read the dataset


In [2]:
# Disabled alternative: read a CSV feature table (`features_3_sec.csv`) and
# drop its "filename" column, instead of building spectrograms from raw audio.
# NOTE(review): re-enabling this needs `import pandas as pd`, which is not
# among the imports in the first cell, and it would rebind `data`, the name the
# later cells use for the spectrogram array. Delete this cell if it is no
# longer needed.
# file_path = "../Data/features_3_sec.csv"
# data = pd.read_csv(file_path)
# data.drop(labels="filename", axis=1, inplace=True)

In [3]:
# Root folder of the audio dataset; it is joined with each class name below to
# locate that class's files.
data_dir = "../Data/genres_original/"

# Genre names. The position of a name in this list is its integer class label.
classes = ["blues", "classical", "country", "disco", "hiphop"]
classes += ["jazz", "metal", "pop", "reggae", "rock"]

In [4]:
def _file_to_spectrograms(file_path, target_shape, chunk_duration, overlap_duration):
    """Load one audio file and return a list of resized per-chunk mel spectrograms."""
    # sr=None keeps the file's native sample rate instead of resampling.
    audio_data, sample_rate = librosa.load(file_path, sr=None)
    if len(audio_data) == 0:
        raise ValueError("audio file contains no samples")

    chunk_samples = int(round(chunk_duration * sample_rate))
    overlap_samples = int(round(overlap_duration * sample_rate))
    # Distance between the starts of two consecutive chunks.
    hop_samples = max(1, chunk_samples - overlap_samples)

    # ceil() keeps the tail of the recording; max() guarantees that a file
    # shorter than one hop still contributes a single (padded) chunk.
    num_chunks = max(
        1, int(np.ceil((len(audio_data) - chunk_samples) / hop_samples)) + 1
    )

    spectrograms = []
    for i in range(num_chunks):
        start = i * hop_samples
        chunk = audio_data[start : start + chunk_samples]
        if len(chunk) < chunk_samples:
            # The last chunk usually runs past the end of the audio. Zero-pad
            # it so every spectrogram spans the same duration; otherwise the
            # resize below would stretch a short chunk along the time axis.
            chunk = np.pad(chunk, (0, chunk_samples - len(chunk)))

        # NOTE(review): the spectrogram is used as returned by librosa (no
        # decibel conversion is applied) - confirm that is intended.
        mel_spectrogram = librosa.feature.melspectrogram(y=chunk, sr=sample_rate)

        # Add a trailing channel axis, then resize to the target shape.
        resized = resize(np.expand_dims(mel_spectrogram, axis=-1), target_shape)
        # Convert to NumPy right away so the final stacking is a plain
        # array-of-arrays operation rather than a list of framework tensors.
        spectrograms.append(np.asarray(resized))
    return spectrograms


def load_and_preprocess_data(
    data_dir,
    classes,
    target_shape=(150, 150),
    chunk_duration=4,
    overlap_duration=2,
):
    """Turn a folder-per-class tree of ``.wav`` files into mel-spectrogram arrays.

    Every audio file is cut into fixed-length, overlapping chunks. Each chunk
    is converted to a mel spectrogram and resized to ``target_shape``. Files
    that cannot be processed are reported and skipped as a whole.

    Args:
        data_dir: Directory containing one sub-directory per entry of
            ``classes``.
        classes: Sequence of class (sub-directory) names. The index of a name
            in this sequence is used as its integer label.
        target_shape: ``(height, width)`` each spectrogram is resized to.
        chunk_duration: Length of one chunk, in seconds.
        overlap_duration: Overlap between consecutive chunks, in seconds.
            Must be smaller than ``chunk_duration``.

    Returns:
        A ``(data, labels)`` pair of NumPy arrays. ``data`` stacks one resized
        spectrogram (with a trailing channel axis of size 1) per chunk, and
        ``labels`` holds the matching integer class index.

    Raises:
        ValueError: If ``chunk_duration`` is not positive or
            ``overlap_duration`` is outside ``[0, chunk_duration)``.
    """
    if chunk_duration <= 0 or not 0 <= overlap_duration < chunk_duration:
        raise ValueError(
            "chunk_duration must be positive and overlap_duration must be in "
            f"[0, chunk_duration); got {chunk_duration} and {overlap_duration}"
        )

    data = []
    labels = []
    for i_class, class_name in enumerate(classes):
        class_dir = os.path.join(data_dir, class_name)
        print("Processing--", class_name)
        # os.listdir() returns entries in arbitrary order; sorting keeps the
        # sample order (and any seeded split made from it) stable across runs.
        for file_name in sorted(os.listdir(class_dir)):
            if not file_name.endswith(".wav"):
                continue
            file_path = os.path.join(class_dir, file_name)
            try:
                file_spectrograms = _file_to_spectrograms(
                    file_path, target_shape, chunk_duration, overlap_duration
                )
            except Exception as e:
                # Best effort: report the unreadable file and move on. The
                # exception type is included because some decoder errors carry
                # no message of their own.
                print(f"Error processing file {file_path}: {type(e).__name__}: {e}")
                continue
            # Only extend the outputs once the whole file succeeded, so a
            # failure part-way through never leaves a partial file behind.
            data.extend(file_spectrograms)
            labels.extend([i_class] * len(file_spectrograms))

    return np.array(data), np.array(labels)

In [5]:
# Build the spectrogram array and the integer label array for every genre
# folder. This reads every .wav file, so it is the slow step of the notebook.
data, labels = load_and_preprocess_data(data_dir=data_dir, classes=classes)

Processing-- blues
Processing-- classical
Processing-- country
Processing-- disco
Processing-- hiphop
Processing-- jazz


  audio_data, sample_rate = librosa.load(file_path, sr=None)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Error processing file ../Data/genres_original/jazz/jazz.00054.wav: 
Processing-- metal
Processing-- pop
Processing-- reggae
Processing-- rock


In [6]:
from tensorflow.keras.utils import to_categorical

# One-hot encode the integer class ids. The guard makes the cell safe to
# re-run: once `labels` is already a 2-D one-hot matrix, encoding it again
# would produce a 3-D array and break the training cells below.
if labels.ndim == 1:
    labels = to_categorical(labels, num_classes=len(classes))
labels

array([[1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.]])

## Split the dataset into training and testing sets


In [7]:
# Stratify on the class id so both splits keep the same genre proportions.
# `labels` is one-hot at this point, so recover the ids with argmax (and fall
# back to the raw array if it is still 1-D).
class_ids = labels.argmax(axis=1) if labels.ndim == 2 else labels

# NOTE(review): samples are overlapping chunks cut from the same recordings,
# and this split is made per chunk. Chunks of one track (sharing audio through
# the overlap) can therefore land in both sets and inflate the test/validation
# score. A per-track (grouped) split would need track ids from the loader.
X_train, X_test, Y_train, Y_test = train_test_split(
    data, labels, test_size=0.2, random_state=42, stratify=class_ids
)

## Define the Convolutional Neural Network


In [8]:
# Display the training array's dimensions.
np.shape(X_train)

(11980, 150, 150, 1)

In [9]:
# CNN classifier: five convolutional stages followed by a dense head.
model = tf.keras.models.Sequential()

# Declare the input with an explicit Input layer. Passing `input_shape` to the
# first Conv2D is discouraged by current Keras and emits a UserWarning.
model.add(tf.keras.Input(shape=X_train.shape[1:]))

# Each stage is: same-padded conv -> unpadded conv -> 2x2 max pooling.
# Tuple layout: (number of filters, whether Dropout(0.3) follows the stage).
conv_stages = [(32, False), (64, False), (128, True), (256, False), (512, True)]
for n_filters, dropout_after in conv_stages:
    model.add(
        Conv2D(filters=n_filters, kernel_size=3, padding="same", activation="relu")
    )
    model.add(Conv2D(filters=n_filters, kernel_size=3, activation="relu"))
    model.add(MaxPool2D(pool_size=2, strides=2))
    if dropout_after:
        model.add(Dropout(0.3))

model.add(Flatten())

model.add(Dense(units=1200, activation="relu"))
model.add(Dropout(0.45))
# Output layer: one softmax unit per class.
model.add(Dense(units=len(classes), activation="softmax"))
model.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [10]:
# Configure training: Adam optimizer, cross-entropy over the one-hot labels,
# and accuracy as the reported metric.
model.compile(
    loss="categorical_crossentropy",
    metrics=["accuracy"],
    optimizer=Adam(learning_rate=1e-4),
)

In [11]:
# Save the model to disk only when the validation loss reaches a new minimum.
checkpoint = ModelCheckpoint(
    filepath="./working/model_weights.keras",
    monitor="val_loss",
    mode="min",
    save_best_only=True,
)

In [12]:
# Train for 30 epochs in mini-batches of 32, checkpointing through `checkpoint`.
# NOTE(review): the test split is also passed as the validation set, so the
# checkpoint is selected on the test data. Consider carving a separate
# validation split out of the training data.
training_history = model.fit(
    x=X_train,
    y=Y_train,
    batch_size=32,
    epochs=30,
    callbacks=[checkpoint],
    validation_data=(X_test, Y_test),
)

Epoch 1/30
375/375 ━━━━━━━━━━━━━━━━━━━━ 303s 804ms/step - accuracy: 0.1938 - loss: 2.1337 - val_accuracy: 0.4274 - val_loss: 1.5853
Epoch 2/30
 36/375 ━━━━━━━━━━━━━━━━━━━━ 4:23 778ms/step - accuracy: 0.3821 - loss: 1.6719

KeyboardInterrupt: 