# Convolutional Neural Network Classifier

Jordan Dehmel

In [13]:
# Imports
from os import path
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import librosa
from matplotlib import pyplot as plt
import matplotlib.ticker as ticker
import tensorflow.image
import tensorflow.io
import keras

In [14]:
# Load index file
root_path: str = '/home/jorb/voice-data/en'
index_path: str = path.join(
    root_path, 'resampled_validated_with_durations.csv')
assert path.exists(index_path)

index_df: pd.DataFrame = pd.read_csv(index_path)

In [15]:
# Resource fn def

def mp3_to_png(filename: str, overwrite: bool = False) -> str:
    '''
    If the given mp3 file does not already have a saved
    spectrograph image, creates it. Either way, returns the path
    of the image.
    :param filename: The mp3 file to convert
    :param overwrite: If true, never fetches from cache
    :returns: The filepath of the spectograph version
    '''

    out: str = filename + '.png'
    if path.exists(out) and not overwrite:
        return out

    X, sample_rate = librosa.load(filename, res_type='kaiser_fast')
    S = librosa.feature.melspectrogram(y=X, sr=sample_rate)

    plt.clf()
    ax = plt.axes()

    librosa.display.specshow(
        librosa.power_to_db(S, ref=np.max), x_axis='time',
        y_axis='mel', fmin=50, fmax=280, cmap='gray')

    plt.gcf().set_dpi(64)
    plt.gca().set_position((0, 0, 1, 1))

    plt.savefig(out)

    return out


In [16]:
# Image-ify all of the things
# This cell will take a long time: At least an hour
# If cancelled, progress will be saved.

n: int = len(index_df)
for i, row in index_df.iterrows():
    filename: str = path.join(root_path, 'clips', row['path'])
    percent_done: float = round(100.0 * (i / n), 2)
    print(f'{percent_done}%\tdone...')
    mp3_to_png(filename)


0.0%	done...
0.02%	done...
0.03%	done...
0.05%	done...
0.06%	done...
0.08%	done...
0.1%	done...
0.11%	done...
0.13%	done...
0.14%	done...
0.16%	done...
0.17%	done...
0.19%	done...
0.21%	done...
0.22%	done...
0.24%	done...
0.25%	done...
0.27%	done...
0.29%	done...
0.3%	done...
0.32%	done...
0.33%	done...
0.35%	done...
0.37%	done...
0.38%	done...
0.4%	done...
0.41%	done...
0.43%	done...
0.45%	done...
0.46%	done...
0.48%	done...
0.49%	done...
0.51%	done...
0.52%	done...
0.54%	done...
0.56%	done...
0.57%	done...
0.59%	done...
0.6%	done...
0.62%	done...
0.64%	done...
0.65%	done...
0.67%	done...
0.68%	done...
0.7%	done...
0.72%	done...
0.73%	done...
0.75%	done...
0.76%	done...
0.78%	done...
0.8%	done...
0.81%	done...
0.83%	done...
0.84%	done...
0.86%	done...
0.87%	done...
0.89%	done...
0.91%	done...
0.92%	done...
0.94%	done...
0.95%	done...
0.97%	done...
0.99%	done...
1.0%	done...
1.02%	done...
1.03%	done...
1.05%	done...
1.07%	done...
1.08%	done...
1.1%	done...
1.11%	done...
1.13%	done...
1

In [17]:
# Prepare dataset
# This cell should take about a minute to run: It's loading
# thousands of files to memory

X = index_df['path'].apply(
    lambda n: path.join(root_path, 'clips', f'{n}.png'))

# One-hot encoding in y
encoder: OneHotEncoder = OneHotEncoder(sparse_output=False)

y = encoder.fit_transform(index_df[['gender']])

assert len(y) == len(X)

# Bring X to proper format for keras
X = X.map(
    lambda x: keras.preprocessing.image.img_to_array(
            keras.preprocessing.image.load_img(
                x, target_size=(64, 64))))


In [18]:

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1234)

assert len(X_train) == len(y_train)
assert len(X_test) == len(y_test)

X_train = X_train.tolist()
X_test = X_test.tolist()

y_train = [np.array(y) for y in y_train]
y_test = [np.array(y) for y in y_test]

assert len(X_train) == len(y_train)
assert len(X_test) == len(y_test)

for x in X_train:
    assert x.shape == (64, 64, 3)

for x in X_test:
    assert x.shape == (64, 64, 3)

for y in y_train:
    assert y.shape == (3,)

for y in y_test:
    assert y.shape == (3,)

In [19]:
# Model construction
model: keras.models.Sequential = keras.models.Sequential([
    keras.Input(shape=(64, 64, 3)),
    keras.layers.Rescaling(1.0 / 255.0),
    keras.layers.Conv2D(32, kernel_size=(3, 3), activation='relu'),
    keras.layers.Conv2D(64, kernel_size=(3, 3), activation='relu'),
    keras.layers.Flatten(),
    keras.layers.Dense(3, activation='softmax'),
])

model.summary()

# Compile the model
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# Fit data to model
model.fit(
    np.array(X_train),
    np.array(y_train),
    verbose=True,
    epochs=3
)

model.save('trained_cnn.keras', overwrite=True)

Epoch 1/3


2025-04-14 09:26:05.408080: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 247136256 exceeds 10% of free system memory.


[1m158/158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 274ms/step - accuracy: 0.6864 - loss: 0.6393
Epoch 2/3
[1m158/158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 257ms/step - accuracy: 0.9640 - loss: 0.1263
Epoch 3/3
[1m158/158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 261ms/step - accuracy: 0.9791 - loss: 0.0646


In [20]:
# Test the model
y_pred = model.predict(np.array(X_test))
mae = keras.losses.MeanAbsoluteError()(np.array(y_test), y_pred)

print(f'MAE: {mae}')

[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 70ms/step
MAE: 0.04115210846066475
