# Music Genre Classifier using CNN

Test accuracy ≈ 79.88%

**Note:** The trained Keras model file is not uploaded due to memory constraints.

## Data Preprocessing

In [1]:
#Import required libraries
import pandas as pd
import numpy as np
import os
import librosa
import cv2

from sklearn.model_selection import train_test_split

import tensorflow as tf
import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, MaxPooling2D, Conv2D, Flatten, Input
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

In [2]:
# Genre names. Each name is also the sub-directory read by load_and_preprocess_data,
# and its position in this list is the integer class label assigned there (via enumerate).
classes = ['blues', 'classical','country','disco','hiphop','jazz','metal','pop','reggae','rock']

def _mel_images_from_audio(audio, sr, target_shape, chunk_duration=4, overlap_duration=2):
    """Cut a waveform into overlapping chunks and convert each to a mel-spectrogram image.

    Parameters
    ----------
    audio : 1-D array of audio samples.
    sr : int, sample rate of ``audio`` in Hz.
    target_shape : (height, width) of every returned image.
    chunk_duration : length of each chunk in seconds.
    overlap_duration : overlap between consecutive chunks in seconds.

    Returns
    -------
    list of arrays shaped ``(height, width, 1)``. Each spectrogram is min-max
    normalised before resizing. The list is empty when ``audio`` is shorter
    than one chunk.
    """
    chunk_samples = chunk_duration * sr
    hop_samples = chunk_samples - overlap_duration * sr
    height, width = target_shape[:2]

    images = []
    # Only full-length chunks are produced; a trailing partial chunk is dropped.
    for start in range(0, len(audio) - chunk_samples + 1, hop_samples):
        chunk = audio[start:start + chunk_samples]

        mel_spec = librosa.feature.melspectrogram(y=chunk, sr=sr, n_mels=128)
        mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)

        db_min = np.min(mel_spec_db)
        db_range = np.max(mel_spec_db) - db_min
        if db_range > 0:
            mel_spec_norm = (mel_spec_db - db_min) / db_range
        else:
            # A constant spectrogram (e.g. a silent chunk) would divide by zero
            # and fill the sample with NaN; map it to all zeros instead.
            mel_spec_norm = np.zeros_like(mel_spec_db)

        # cv2.resize expects dsize as (width, height), the reverse of array shape.
        mel_resized = cv2.resize(mel_spec_norm, (width, height), interpolation=cv2.INTER_AREA)
        images.append(np.expand_dims(mel_resized, axis=-1))

    return images


def load_and_preprocess_data(data_dir, classes, target_shape=(150, 150)):
    """Build a chunk-level mel-spectrogram dataset from a directory of ``.wav`` files.

    ``data_dir`` must contain one sub-directory per entry of ``classes``. Every
    ``.wav`` file inside is loaded at 22050 Hz and split into 4-second chunks
    with 2 seconds of overlap. Each chunk becomes one sample.

    Parameters
    ----------
    data_dir : path of the directory holding the per-genre sub-directories.
    classes : sequence of genre names; the index of a name is its label.
    target_shape : (height, width) of each spectrogram image.

    Returns
    -------
    (data, labels) : ``data`` is an array of images shaped
        ``(n_samples, height, width, 1)`` and ``labels`` the matching integer
        class indices.

    Notes
    -----
    A file that fails to load or process is reported and skipped as a whole,
    so one unreadable file does not abort the run.
    """
    data = []
    labels = []

    for label_idx, genre in enumerate(classes):
        genre_dir = os.path.join(data_dir, genre)
        print(f"Processing {genre}")

        # os.listdir returns entries in arbitrary order; sorting keeps the
        # sample order (and any seeded split made from it) reproducible.
        for filename in sorted(os.listdir(genre_dir)):
            if not filename.endswith('.wav'):
                continue

            file_path = os.path.join(genre_dir, filename)
            try:
                audio, sr = librosa.load(file_path, sr=22050)
                images = _mel_images_from_audio(audio, sr, target_shape)
            except Exception as e:
                # Deliberate best effort: report the file and move on.
                print(f"Error processing {filename}: {e}")
                continue

            data.extend(images)
            labels.extend([label_idx] * len(images))

    return np.array(data), np.array(labels)
    
# Build the chunk-level dataset, then turn the integer genre indices into
# one-hot rows (one column per entry of `classes`).
data, labels = load_and_preprocess_data(data_dir='Data/genres_original', classes=classes)
labels = to_categorical(labels, num_classes=len(classes))

# Quick sanity check on what was produced.
print("Final data shape:", data.shape)
print("Labels shape:", labels.shape)

Processing blues
Processing classical
Processing country
Processing disco
Processing hiphop
Processing jazz
Processing metal
Processing pop
Processing reggae
Processing rock
Final data shape: (13963, 150, 150, 1)
Labels shape: (13963, 10)


## Modelling

In [3]:
# Hold out 20% of the samples for the final evaluation.
# NOTE(review): the split is made over individual chunks, so overlapping chunks
# cut from the same track can end up in both train and test; the reported test
# accuracy may therefore be optimistic.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    labels,
    test_size=0.2,
    random_state=69,
)

# Three convolutional stages followed by a dense classifier head.
CNN = Sequential([
    Input(shape=(150, 150, 1)),

    # Stage 1
    Conv2D(32, (3, 3), activation='relu'),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),

    # Stage 2
    Conv2D(128, (3, 3), activation='relu'),
    Conv2D(256, (3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),

    # Stage 3
    Conv2D(512, (3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.35),

    # Classifier head: one softmax output per genre
    Flatten(),
    Dense(512, activation='relu'),
    Dropout(0.42),
    Dense(len(classes), activation='softmax'),
])

CNN.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
CNN.summary()

In [None]:
# All three callbacks watch the validation loss.
# Stop after 5 epochs without improvement and roll back to the best weights.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)
# Write the model to disk whenever the validation loss reaches a new minimum.
model_checkpoint = ModelCheckpoint(
    'best_model.keras',
    monitor='val_loss',
    save_best_only=True,
    mode='min',
    verbose=1,
)
# Multiply the learning rate by 0.2 after 3 stagnant epochs, down to 1e-6 at most.
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.2,
    patience=3,
    min_lr=1e-6,
    verbose=1,
)

# 20% of the training split is set aside for validation.
history = CNN.fit(
    X_train,
    y_train,
    epochs=50,
    batch_size=128,
    validation_split=0.2,
    callbacks=[early_stopping, model_checkpoint, reduce_lr],
)

Epoch 1/50
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24s/step - accuracy: 0.7857 - loss: 0.6118 
Epoch 1: val_loss improved from inf to 0.65539, saving model to best_model.keras
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1802s[0m 26s/step - accuracy: 0.7859 - loss: 0.6113 - val_accuracy: 0.7820 - val_loss: 0.6554 - learning_rate: 0.0010
Epoch 2/50
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20s/step - accuracy: 0.8236 - loss: 0.4928 
Epoch 2: val_loss did not improve from 0.65539
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1476s[0m 21s/step - accuracy: 0.8237 - loss: 0.4927 - val_accuracy: 0.7699 - val_loss: 0.6599 - learning_rate: 0.0010
Epoch 3/50
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20s/step - accuracy: 0.8477 - loss: 0.4409 
Epoch 3: val_loss improved from 0.65539 to 0.63320, saving model to best_model.keras
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1478s[0m 21s/step - 

In [None]:
# Score the model on the held-out test split; the cell output is [loss, accuracy].
CNN.evaluate(x=X_test, y=y_test)

[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m108s[0m 1s/step - accuracy: 0.8016 - loss: 0.5879


[0.6018626093864441, 0.7987826466560364]

## Deployment

In [5]:
def preprocess_audio(file_path, target_shape=(150, 150)):
    """Convert one audio file into a batch of mel-spectrogram images.

    The file is loaded at 22050 Hz and split into 4-second chunks with
    2 seconds of overlap. Each chunk becomes a 128-band mel spectrogram in dB,
    min-max normalised and resized to ``target_shape``.

    Parameters
    ----------
    file_path : path of the audio file to load.
    target_shape : (height, width) of each spectrogram image.

    Returns
    -------
    Array shaped ``(n_chunks, height, width, 1)``. ``n_chunks`` is 0 when the
    file is shorter than one chunk.
    """
    audio, sr = librosa.load(file_path, sr=22050)
    chunk_duration = 4  # seconds
    overlap_duration = 2  # seconds

    chunk_samples = chunk_duration * sr
    hop_samples = chunk_samples - overlap_duration * sr
    height, width = target_shape[:2]

    mel_specs = []

    # Only full-length chunks are produced; a trailing partial chunk is dropped.
    for start in range(0, len(audio) - chunk_samples + 1, hop_samples):
        chunk = audio[start:start + chunk_samples]

        mel_spec = librosa.feature.melspectrogram(y=chunk, sr=sr, n_mels=128)
        mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)

        db_min = np.min(mel_spec_db)
        db_range = np.max(mel_spec_db) - db_min
        if db_range > 0:
            mel_spec_norm = (mel_spec_db - db_min) / db_range
        else:
            # A constant spectrogram (e.g. a silent chunk) would divide by zero
            # and produce NaN inputs; map it to all zeros instead.
            mel_spec_norm = np.zeros_like(mel_spec_db)

        # cv2.resize expects dsize as (width, height), the reverse of array shape.
        mel_resized = cv2.resize(mel_spec_norm, (width, height), interpolation=cv2.INTER_AREA)
        mel_specs.append(np.expand_dims(mel_resized, axis=-1))

    if not mel_specs:
        # Keep the batch rank so callers can still check len() or shape.
        return np.empty((0, height, width, 1), dtype=np.float32)

    return np.array(mel_specs)

df = preprocess_audio('Data/genres_original/disco/disco.00008.wav')
if len(df) == 0:
    raise ValueError("Audio file is shorter than one 4-second chunk; nothing to classify.")

# Load the checkpoint written by ModelCheckpoint during training
# ('best_model.keras'); no cell in this notebook saves a 'CNN.keras' file.
model = keras.models.load_model('best_model.keras')
predictions = model.predict(df)

# One probability row per chunk: average them so the verdict reflects the
# whole clip rather than only its first four seconds.
mean_probabilities = predictions.mean(axis=0)

genres = {0: 'blues', 1: 'classical', 2: 'country', 3: 'disco', 4: 'hiphop', 5: 'jazz', 6: 'metal', 7: 'pop', 8: 'reggae', 9: 'rock'}
print(f"The predicted genre for the uploaded audio file is {genres[int(np.argmax(mean_probabilities))]}.")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step
The predicted genre for the uploaded audio file is disco.
