In [1]:
import librosa
import pandas as pd
import numpy as np
import keras
import tensorflow as tf
import matplotlib as plt
import os
import multiprocessing

# Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Check GPU
tf.config.experimental.list_physical_devices("GPU")

2024-04-22 19:04:12.563456: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-04-22 19:04:12.606194: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-04-22 19:04:14.294775: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:984] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-04-22 19:04:14.357245: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:984] could not open file to read NUMA node: /sys/bus/pci/devi

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

### Configurations

In [2]:
TEST_AUDIO = "../processed_data/CAL500_32kps/CAL500_32kps/10cc-for_you_and_i.mp3"

AUDIO_BASE_DIR = "../processed_data/CAL500_32kps/CAL500_32kps"
AUDIO_FILE_PATHS = [
    os.path.join(AUDIO_BASE_DIR, f)
    for f in os.listdir(AUDIO_BASE_DIR)
    if f.endswith(".mp3")
]
AUDIO_FILE_PATHS.sort()
AUDIO_FILE_PATHS = pd.Series(AUDIO_FILE_PATHS)

N_CHROMA = 12
N_MFCC = 20

In [3]:
def get_features(audio_path: str):
    """Extract features from audio file path"""
    
    # y = audio time series, sr = sampling rate
    y, sr = librosa.load(audio_path)

    rms = librosa.feature.rms(y=y)
    spec_cent = librosa.feature.spectral_centroid(y=y, sr=sr)
    spec_bw = librosa.feature.spectral_bandwidth(y=y, sr=sr)
    rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)
    zcr = librosa.feature.zero_crossing_rate(y)
    chroma_stft = librosa.feature.chroma_stft(y=y, sr=sr, n_chroma=N_CHROMA)
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=N_MFCC)

    result = {
        "audio_path": audio_path.split('/')[-1].replace('.mp3', ''),
        "rms": np.mean(rms),
        "spectral_centroid": np.mean(spec_cent),
        "spectral_bandwidth": np.mean(spec_bw),
        "rolloff": np.mean(rolloff),
        "zero_crossing_rate": np.mean(zcr),
    }
    result.update({f"chroma_stft_{i}": np.mean(value) for i, value in enumerate(chroma_stft)})
    result.update({f"mfcc_{i}": np.mean(value) for i, value in enumerate(mfcc)})

    return result

In [4]:
%%time
features = get_features(TEST_AUDIO)

# View sample output
print(features['audio_path'])
print(features['rms'])
print(features['spectral_centroid'])
print(features['spectral_bandwidth'])
print(features['rolloff'])
print(features['zero_crossing_rate'])
print(features['chroma_stft_11'])
print(features['mfcc_19'])

10cc-for_you_and_i
0.078477375
1242.8761088682588
1726.1277596491416
2474.1747054066545
0.05013854620521961
0.2546595
-12.683043
CPU times: user 2.56 s, sys: 3.17 s, total: 5.73 s
Wall time: 3.86 s


In [5]:
# %%time
# TEST_LIMIT = 502

# def process_audio(audio_path):
#     features = get_features(audio_path)
#     return features

# pool = multiprocessing.Pool()
# results = pool.map(process_audio, AUDIO_FILE_PATHS[:TEST_LIMIT])
# pool.close()
# pool.join()

# df_ls = results

In [6]:
# df = pd.DataFrame.from_records(df_ls, index="audio_path")
# df.to_csv("processed_data/CLEANED_chroma_features.csv")

In [7]:
df = pd.read_csv("../processed_data/CLEANED_chroma_features.csv", index_col="audio_path")
print(df.shape)
df.head()

(500, 37)


Unnamed: 0_level_0,rms,spectral_centroid,spectral_bandwidth,rolloff,zero_crossing_rate,chroma_stft_0,chroma_stft_1,chroma_stft_2,chroma_stft_3,chroma_stft_4,...,mfcc_10,mfcc_11,mfcc_12,mfcc_13,mfcc_14,mfcc_15,mfcc_16,mfcc_17,mfcc_18,mfcc_19
audio_path,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10cc-for_you_and_i,0.078477,1242.876109,1726.12776,2474.174705,0.050139,0.402762,0.232984,0.277037,0.313551,0.340628,...,-13.063147,-4.559336,-8.964912,-7.25837,-5.434934,-9.783126,-5.295118,-11.920966,-1.528779,-12.683043
2pac-trapped,0.238134,2429.877858,2475.507312,5477.636982,0.091782,0.531476,0.469931,0.461683,0.477741,0.489731,...,-0.361852,10.822704,6.678515,2.56076,6.720437,2.623295,7.223275,-7.491878,10.546954,-8.585839
5th_dimension-one_less_bell_to_answer,0.064127,1654.608737,1852.08037,3570.534366,0.079638,0.250085,0.313986,0.299618,0.233613,0.288843,...,-15.038347,-1.683385,-9.528951,-7.305714,-1.321918,-9.135875,0.357172,-10.904411,2.871611,-7.8164
a_tribe_called_quest-bonita_applebum,0.180943,1954.297377,2037.378982,4195.438543,0.084297,0.610729,0.543563,0.366985,0.297738,0.280891,...,-7.689218,2.494104,-3.390706,-0.134979,3.72972,-6.839774,0.683344,-9.631829,2.695987,-7.485964
aaron_neville-tell_it_like_it_is,0.10045,1663.30344,1737.428955,3524.073698,0.081335,0.512765,0.367458,0.403097,0.290497,0.389738,...,-6.668141,-1.155371,-0.712915,-3.925929,4.62171,-0.737825,1.114189,-8.386178,2.575307,-5.809391


In [8]:
annotations_to_use = [
"Genre--_Alternative",
"Genre--_Alternative_Folk",
"Genre--_Bebop",
"Genre--_Brit_Pop",
"Genre--_Classic_Rock",
"Genre--_Contemporary_Blues",
"Genre--_Contemporary_R&B",
"Genre--_Cool_Jazz",
"Genre--_Country_Blues",
"Genre--_Dance_Pop",
"Genre--_Electric_Blues",
"Genre--_Funk",
"Genre--_Gospel",
"Genre--_Metal/Hard_Rock",
"Genre--_Punk",
"Genre--_Roots_Rock",
"Genre--_Singer_/_Songwriter",
"Genre--_Soft_Rock",
"Genre--_Soul",
"Genre--_Swing",
"Genre-Bluegrass",
"Genre-Blues",
"Genre-Country",
"Genre-Electronica",
"Genre-Folk",
"Genre-Hip_Hop/Rap",
"Genre-Jazz",
"Genre-Pop",
"Genre-R&B",
"Genre-Rock",
"Genre-World",
]

annotations = pd.read_csv(
    "../processed_data/CLEANED_cal500_annotations.csv", index_col=0, header=0
)
annotations = annotations[annotations_to_use]

annotations.head()

Unnamed: 0_level_0,Genre--_Alternative,Genre--_Alternative_Folk,Genre--_Bebop,Genre--_Brit_Pop,Genre--_Classic_Rock,Genre--_Contemporary_Blues,Genre--_Contemporary_R&B,Genre--_Cool_Jazz,Genre--_Country_Blues,Genre--_Dance_Pop,...,Genre-Blues,Genre-Country,Genre-Electronica,Genre-Folk,Genre-Hip_Hop/Rap,Genre-Jazz,Genre-Pop,Genre-R&B,Genre-Rock,Genre-World
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10cc-for_you_and_i,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2pac-trapped,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
5th_dimension-one_less_bell_to_answer,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
a_tribe_called_quest-bonita_applebum,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
aaron_neville-tell_it_like_it_is,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0


In [9]:
# Final Dataset
dataset = df.join(annotations)
print(dataset.shape)
dataset

(500, 68)


Unnamed: 0_level_0,rms,spectral_centroid,spectral_bandwidth,rolloff,zero_crossing_rate,chroma_stft_0,chroma_stft_1,chroma_stft_2,chroma_stft_3,chroma_stft_4,...,Genre-Blues,Genre-Country,Genre-Electronica,Genre-Folk,Genre-Hip_Hop/Rap,Genre-Jazz,Genre-Pop,Genre-R&B,Genre-Rock,Genre-World
audio_path,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10cc-for_you_and_i,0.078477,1242.876109,1726.127760,2474.174705,0.050139,0.402762,0.232984,0.277037,0.313551,0.340628,...,0,0,0,0,0,0,0,0,0,0
2pac-trapped,0.238134,2429.877858,2475.507312,5477.636982,0.091782,0.531476,0.469931,0.461683,0.477741,0.489731,...,0,0,0,0,1,0,0,0,0,0
5th_dimension-one_less_bell_to_answer,0.064127,1654.608737,1852.080370,3570.534366,0.079638,0.250085,0.313986,0.299618,0.233613,0.288843,...,0,0,0,0,0,0,0,0,0,0
a_tribe_called_quest-bonita_applebum,0.180943,1954.297377,2037.378982,4195.438543,0.084297,0.610729,0.543563,0.366985,0.297738,0.280891,...,0,0,0,0,1,0,0,0,0,0
aaron_neville-tell_it_like_it_is,0.100450,1663.303440,1737.428955,3524.073698,0.081335,0.512765,0.367458,0.403097,0.290497,0.389738,...,1,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
yo_la_tengo-tom_courtenay,0.172157,1864.981965,1783.119431,3699.409293,0.101611,0.251368,0.267746,0.265208,0.436673,0.848932,...,0,0,0,0,0,0,0,0,1,0
young_mc-bust_a_move,0.087365,2434.502580,2226.556759,5078.985593,0.128865,0.464683,0.456970,0.431741,0.406984,0.450832,...,0,0,0,0,1,0,0,0,0,0
young_rascals-baby_lets_wait,0.095571,1847.016557,1814.081993,3846.262958,0.102006,0.485589,0.306733,0.316068,0.342269,0.330873,...,0,0,0,0,0,0,0,0,0,0
zapp-dance_floor,0.092320,2453.228833,2029.780697,4781.795242,0.154305,0.389291,0.306309,0.423998,0.340021,0.344456,...,0,0,0,0,0,0,1,0,0,0


In [10]:
X = dataset.drop(columns=annotations_to_use)
y = dataset[annotations_to_use]

# preprocessing
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(400, 37) (100, 37) (400, 31) (100, 31)


### Model 1 - ANN

In [18]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Define the input and output dimensions
input_dim = X_train.shape[1]
output_dim = y_train.shape[1]

# Create a Sequential model
ANN = Sequential()

# Add input layer
ANN.add(Dense(50, input_dim=input_dim, activation='relu'))

# Add hidden layers
ANN.add(Dense(128, activation='relu'))
ANN.add(Dense(64, activation='relu'))
ANN.add(Dense(64, activation='relu'))
ANN.add(Dense(64, activation='relu'))
ANN.add(Dense(64, activation='relu'))
ANN.add(Dense(64, activation='relu'))
ANN.add(Dense(16, activation='relu'))

# Add output layer with sigmoid activation (for binary classification)
ANN.add(Dense(output_dim, activation='softmax'))  # Adjusted output_dim

# Compile the model
ANN.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Print the model summary
ANN.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [21]:
%%time
history = ANN.fit(X_train, y_train, epochs=20, batch_size=5, validation_split=0.2)

Epoch 1/20


[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step - accuracy: 0.2960 - loss: 0.1099 - val_accuracy: 0.1500 - val_loss: 0.2295
Epoch 2/20
[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.3646 - loss: 0.1121 - val_accuracy: 0.1500 - val_loss: 0.2330
Epoch 3/20
[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.4270 - loss: 0.0900 - val_accuracy: 0.1375 - val_loss: 0.2407
Epoch 4/20
[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.4520 - loss: 0.0951 - val_accuracy: 0.1375 - val_loss: 0.2531
Epoch 5/20
[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.5034 - loss: 0.0904 - val_accuracy: 0.1500 - val_loss: 0.2574
Epoch 6/20
[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.4929 - loss: 0.0935 - val_accuracy: 0.1250 - val_loss: 0.2694
Epoch 7/20
[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━