In [42]:
import librosa
import librosa.feature
import librosa.display
import numpy as np
import os
import pandas as pd
from PIL import Image

def load_fixed_audio(path, duration=30, sr=22050):
    audio, _ = librosa.load(path, sr=sr, duration=duration)
    desired_length = duration * sr
    
    # Pad with zeros if shorter
    if len(audio) < desired_length:
        audio = np.pad(audio, (0, desired_length - len(audio)))
    # Or truncate if longer
    else:
        audio = audio[:desired_length]
    
    return audio

def audio_to_melspec(audio, sr=22050, n_mels=128, hop_length=512):
    melspec = librosa.feature.melspectrogram(y=audio, sr=sr, n_mels=n_mels, hop_length=hop_length)
    melspec_db = librosa.power_to_db(melspec+1e-10, ref=np.max, amin=1e-10)
    return melspec_db

def normalize(melspec_db):
    min_val = np.min(melspec_db)
    max_val = np.max(melspec_db)
    # Ensure no division by zero
    if max_val - min_val == 0:
        return np.zeros_like(melspec_db)
    return (melspec_db - min_val) / (max_val - min_val)

def get_all_wav_files(directory):
    wav_files = []
    for dirpath, _, filenames in os.walk(directory):
        for filename in filenames:
            if filename.endswith('.wav'):
                wav_files.append(dirpath + '/' + filename)
    return wav_files


def get_all_image_files(directory):
    wav_files = []
    for dirpath, _, filenames in os.walk(directory):
        for filename in filenames:
            if filename.endswith('.png'):
                wav_files.append(dirpath + '/' + filename)
    return wav_files

In [5]:
files = get_all_wav_files("../data/genres")
mel_spectograms = []

for file in files:
    try:
        filename = file.split('/')[-1]
        mel_spectrogram = normalize(audio_to_melspec(load_fixed_audio(file)))
        flat_input = mel_spectrogram.flatten()
        entry = flat_input.tolist()
        mel_spectograms.append([filename] + entry)
    except Exception as e:
        print(e)
        print(f"Could not process {file}")

KeyboardInterrupt: 

In [None]:
df = pd.DataFrame(np.row_stack(mel_spectograms))
df.head(10)

In [None]:
df.to_csv("../data/files/mel_spectograms.csv")

In [43]:
files = get_all_image_files('../data/images/')
mel_spectograms = {}

for file in files:
    try:
        filename = file.split('/')[-1]
        print(filename)
        img = Image.open(file).convert('RGB')  # ensure RGB format
        img_array = np.asarray(img)
        img_array = img_array.astype(np.float16)
        mel_spectograms[filename] = img_array
        img.close()
    except Exception as e:
        print(e)
        print(f"Could not process {file}")
        


blues.00000.wav_mel.png
blues.00001.wav_mel.png
blues.00002.wav_mel.png
blues.00003.wav_mel.png
blues.00004.wav_mel.png
blues.00005.wav_mel.png
blues.00006.wav_mel.png
blues.00007.wav_mel.png
blues.00008.wav_mel.png
blues.00009.wav_mel.png
blues.00010.wav_mel.png
blues.00011.wav_mel.png
blues.00012.wav_mel.png
blues.00013.wav_mel.png
blues.00014.wav_mel.png
blues.00015.wav_mel.png
blues.00016.wav_mel.png
blues.00017.wav_mel.png
blues.00018.wav_mel.png
blues.00019.wav_mel.png
blues.00020.wav_mel.png
blues.00021.wav_mel.png
blues.00022.wav_mel.png
blues.00023.wav_mel.png
blues.00024.wav_mel.png
blues.00025.wav_mel.png
blues.00026.wav_mel.png
blues.00027.wav_mel.png
blues.00028.wav_mel.png
blues.00029.wav_mel.png
blues.00030.wav_mel.png
blues.00031.wav_mel.png
blues.00032.wav_mel.png
blues.00033.wav_mel.png
blues.00034.wav_mel.png
blues.00035.wav_mel.png
blues.00036.wav_mel.png
blues.00037.wav_mel.png
blues.00038.wav_mel.png
blues.00039.wav_mel.png
blues.00040.wav_mel.png
blues.00041.wav_

In [44]:
from sklearn.model_selection import train_test_split
from tensorflow.keras import utils

def clean_labels(s):
    return s.split('.')[0]

labelMap = {
    "blues": 0,
    "classical": 1,
    "country": 2,
    "disco": 3,
    "hiphop": 4,
    "jazz": 5,
    "metal": 6,
    "pop": 7,
    "reggae": 8,
    "rock": 9
}

X = np.array(list(mel_spectograms.values()))
y = np.array(list(map(lambda x: labelMap[clean_labels(x)], mel_spectograms.keys())))
y_cat = utils.to_categorical(y, num_classes=10)

X_train, X_test, y_train, y_test = train_test_split(X, y_cat, random_state=1122)

In [None]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import tensorflow as tf
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

In [66]:
from tensorflow.keras import Sequential, layers

labels = labelMap.values()
input_shape = X_train.shape[1:]
num_classes = len(labels)

model = Sequential([
    layers.Rescaling(1./255, input_shape=input_shape),
    layers.Conv2D(16, 3, padding='same', activation='relu'),
      layers.MaxPooling2D(),
      layers.Conv2D(32, 3, padding='same', activation='relu'),
      layers.MaxPooling2D(),
      layers.Conv2D(64, 3, padding='same', activation='relu'),
      layers.MaxPooling2D(),
      layers.Flatten(),
      layers.Dense(128, activation='relu'),
      layers.Dense(num_classes)
])

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

model.summary()

In [67]:
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

history = model.fit(
    X_train, y_train,
    epochs=100,
    batch_size=4,
    validation_split=0.2
)

Epoch 1/100
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 180ms/step - accuracy: 0.1021 - loss: 7.9381 - val_accuracy: 0.1000 - val_loss: 9.1336
Epoch 2/100
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 182ms/step - accuracy: 0.1261 - loss: 9.5842 - val_accuracy: 0.1000 - val_loss: 9.1336
Epoch 3/100
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 180ms/step - accuracy: 0.1003 - loss: 9.6438 - val_accuracy: 0.1000 - val_loss: 9.1336
Epoch 4/100
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 181ms/step - accuracy: 0.0826 - loss: 9.6688 - val_accuracy: 0.1000 - val_loss: 9.1336
Epoch 5/100
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 180ms/step - accuracy: 0.0919 - loss: 9.7278 - val_accuracy: 0.1000 - val_loss: 9.1336
Epoch 6/100
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 180ms/step - accuracy: 0.0956 - loss: 9.7014 - val_accuracy: 0.1000 - val_loss: 9.1336
Epoc

KeyboardInterrupt: 

In [None]:
test_loss, test_acc = model.evaluate(X_test, y_test, verbose=2)
print(f"Test Accuracy: {test_acc:.2%}")

y_pred = model.predict(X_test).argmax(axis=1)
y_true_labels = np.argmax(y_test, axis=1)
print(classification_report(y_true_labels, y_pred))

# Confusion Matrix
cm = confusion_matrix(y_true_labels, y_pred)
plt.figure(figsize=(10,7))
sns.heatmap(cm, annot=True, fmt='d', xticklabels=labels, yticklabels=labels, cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()