# Neural Network Model - Aidan Meens

In [1]:
# Imports
import os
import pandas as pd
import numpy as np
import librosa
import glob
import random
import keras

from os import path
from matplotlib import pyplot as plt
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder


2025-04-24 19:04:46.581645: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-04-24 19:04:46.959498: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-04-24 19:04:47.318303: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745543087.604164   29176 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745543087.680335   29176 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1745543088.245039   29176 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linkin

### Step 1: Convert audio files to image files
Extracting features from audio files can be difficult, so we use Librosa to convert each audio file into Mel Spectrogram image files, which more ML models are designed for.

In [2]:
def mp3_to_png(file_name: str, overwrite: bool = False) -> str:
    '''
    If the given mp3 file doesn't already have a saved mel
    spectrogram, create it. Then, return the path of the image.
    :param file_name: the mp3 file to convert
    :param overwrite: if true, never fetches from cache
    :returns: the file path of the mel spectrogram
    '''
    out: str = f"{file_name}.png"
    if path.exists(out) and not overwrite:
        return out

    X, sample_rate = librosa.load(file_name, res_type='kaiser_fast')
    S = librosa.feature.melspectrogram(y = X, sr = sample_rate)

    plt.clf()
    ax = plt.axes()

    librosa.display.specshow(
        librosa.power_to_db(S, ref = np.max),
        x_axis = "time",
        y_axis = "mel",
        fmin = 50, fmax = 280, cmap = "gray"
    )

    plt.gcf().set_dpi(64)
    plt.gca().set_position((0, 0, 1, 1))

    plt.savefig(out)
    return(out)

Now, we can use the function on each audio file to create our images. NOTE: This can take a while to run.

In [3]:
ROOT_PATH: str = ""  # SET TO YOUR PATH FROM local TO csv
CSV_NAME: str = "clips.csv"  # SET TO .CSV FILE_NAME
INDEX_PATH: str = path.join(ROOT_PATH, CSV_NAME)
assert path.exists(INDEX_PATH)

index_df: pd.DataFrame = pd.read_csv(INDEX_PATH)
n: int = len(index_df)

for i, row in index_df.iterrows():
    file_name: str = path.join(ROOT_PATH, "clips", row["path"])
    percent_done: float = round(100.0 * (i / n), 2)
    print(f"{percent_done}%\t complete...")
    # mp3_to_png(file_name)

0.0%	 complete...
0.02%	 complete...
0.03%	 complete...
0.05%	 complete...
0.06%	 complete...
0.08%	 complete...
0.1%	 complete...
0.11%	 complete...
0.13%	 complete...
0.14%	 complete...
0.16%	 complete...
0.17%	 complete...
0.19%	 complete...
0.21%	 complete...
0.22%	 complete...
0.24%	 complete...
0.25%	 complete...
0.27%	 complete...
0.29%	 complete...
0.3%	 complete...
0.32%	 complete...
0.33%	 complete...
0.35%	 complete...
0.37%	 complete...
0.38%	 complete...
0.4%	 complete...
0.41%	 complete...
0.43%	 complete...
0.45%	 complete...
0.46%	 complete...
0.48%	 complete...
0.49%	 complete...
0.51%	 complete...
0.52%	 complete...
0.54%	 complete...
0.56%	 complete...
0.57%	 complete...
0.59%	 complete...
0.6%	 complete...
0.62%	 complete...
0.64%	 complete...
0.65%	 complete...
0.67%	 complete...
0.68%	 complete...
0.7%	 complete...
0.72%	 complete...
0.73%	 complete...
0.75%	 complete...
0.76%	 complete...
0.78%	 complete...
0.8%	 complete...
0.81%	 complete...
0.83%	 complete...


### Step 2: Load data as a Pandas DataFrame

In [4]:
# This can take about a minute to run.

X = index_df["path"].apply(lambda n: path.join(ROOT_PATH, "images", f"{n}.png"))
X = X.map(
    lambda x: keras.preprocessing.image.img_to_array(
        keras.preprocessing.image.load_img(
            x, target_size = (64, 64)
        )
    )
)

# One-hot encode y for three outputs (male, female, nonbinary)
encoder: OneHotEncoder = OneHotEncoder(sparse_output = False)
y = encoder.fit_transform(index_df[["gender"]])

# print(encoder.get_feature_names_out(["gender"]))  # DEBUG

### Step 3: Prepare the Data Set

In [5]:
# Split for training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1234)

# Type conversions for Keras
X_train = X_train.tolist()
X_test = X_test.tolist()
y_train = [np.array(y) for y in y_train]
y_test = [np.array(y) for y in y_test]

### Step 4: Create the Model

In [6]:
# Model Construction
model: keras.models.Sequential = keras.models.Sequential([
    keras.Input(shape = (64, 64, 3)),
    keras.layers.Rescaling(1.0 / 255.0),
    keras.layers.Flatten(),
    keras.layers.Dense(16, activation = "relu"),
    keras.layers.Dense(32, activation = "relu"),
    keras.layers.Dense(3, activation = "softmax"),
])

model.summary()

2025-04-24 19:06:25.466241: E external/local_xla/xla/stream_executor/cuda/cuda_platform.cc:51] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


### Step 5: Compile and Fit the Model

In [7]:
model.compile(loss = "categorical_crossentropy",
              optimizer = "adam",
              metrics = ["accuracy"])

model.fit(
    np.array(X_train),
    np.array(y_train),
    verbose = True,
    epochs = 3,
    batch_size = 32
    )

model.save("trained_nn.keras", overwrite = True)

Epoch 1/3


2025-04-24 19:06:26.928932: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 247136256 exceeds 10% of free system memory.


[1m158/158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 12ms/step - accuracy: 0.6464 - loss: 0.6702
Epoch 2/3
[1m158/158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step - accuracy: 0.8600 - loss: 0.3522
Epoch 3/3
[1m158/158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step - accuracy: 0.9118 - loss: 0.2476


### Step 6: Test the Model

In [9]:
_, training_accuracy = model.evaluate(
    np.array(X_train), np.array(y_train),
    batch_size = 4)

_, testing_accuracy = model.evaluate(
    np.array(X_test), np.array(y_test),
    batch_size = 4)

# Convert to percentages for style
training_accuracy = round(training_accuracy * 100.0, 3)
testing_accuracy = round(testing_accuracy * 100.0, 3)

# Print final results!
print(f"Training Accuracy:\t{training_accuracy}%")
print(f"Testing Accuracy:\t{testing_accuracy}%")

[1m   1/1257[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m2:04[0m 99ms/step - accuracy: 1.0000 - loss: 0.0580

2025-04-24 19:06:57.647916: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 247136256 exceeds 10% of free system memory.


[1m1257/1257[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 6ms/step - accuracy: 0.9374 - loss: 0.1906
[1m  8/315[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m2s[0m 7ms/step - accuracy: 0.9724 - loss: 0.0879  

2025-04-24 19:07:05.859060: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 61833216 exceeds 10% of free system memory.


[1m315/315[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.9273 - loss: 0.2136
Training Accuracy:	93.854%
Testing Accuracy:	91.494%
