In [None]:
import numpy as np
import tensorflow as tf

## Dataset

In [None]:
directory = "dataset"

# Options shared by the training and validation subsets.
#
# Keras cuts the train/validation split from the indexed file list *after*
# the optional shuffle, so both calls must use the same `shuffle` and `seed`.
# Mixing shuffle=True (training) with shuffle=False (validation) slices two
# differently ordered lists: validation files can then also appear in the
# training set, and the validation subset is drawn only from the tail of the
# sorted file list.
dataset_options = dict(
    labels="inferred",            # class labels come from the sub-directory names
    label_mode="categorical",     # one-hot labels, matching categorical_crossentropy
    batch_size=4,
    sampling_rate=None,           # no resampling is requested
    output_sequence_length=48000, # every clip is returned with 48000 samples
    ragged=False,
    shuffle=True,
    validation_split=0.2,
    seed=7,
)

# 80 % of the files, used for fitting the model.
train_data = tf.keras.utils.audio_dataset_from_directory(
    directory, subset="training", **dataset_options
)

# Remaining 20 % of the files, disjoint from the training subset because the
# same shuffle/seed pair is used for both calls.
validation_data = tf.keras.utils.audio_dataset_from_directory(
    directory, subset="validation", **dataset_options
)

## Model

In [None]:
# 1-D convolutional classifier working directly on raw waveforms of shape
# (48000, 1). A wide, strided first convolution is followed by three small
# Conv1D + MaxPooling1D stages and a 29-way softmax output layer.
model = tf.keras.models.Sequential(
    [
        # Front end: long kernel with stride 16 to shorten the sequence early.
        tf.keras.layers.Conv1D(
            32,
            kernel_size=80,
            activation='relu',
            strides=16,
            input_shape=(48000, 1),
        ),
        tf.keras.layers.MaxPooling1D(4),
        # Three identical stages that differ only in the number of filters.
        *[
            stage_layer
            for n_filters in (32, 64, 64)
            for stage_layer in (
                tf.keras.layers.Conv1D(n_filters, kernel_size=3, activation='relu'),
                tf.keras.layers.MaxPooling1D(4),
            )
        ],
        tf.keras.layers.Flatten(),
        # Optional hidden layer, currently disabled: Dense(256, activation='relu')
        tf.keras.layers.Dense(29, activation='softmax'),
    ]
)

In [None]:
# Print the layer-by-layer overview of the model defined above.
model.summary()

In [None]:
# Configure training: Adam with its default settings, categorical
# cross-entropy loss (labels are one-hot) and accuracy as the reported metric.
model.compile(
    loss='categorical_crossentropy',
    metrics=['accuracy'],
    optimizer=tf.keras.optimizers.Adam(),
)

## Train

In [None]:
# Fit for 20 epochs on the training subset while monitoring the validation
# subset; the returned History object is kept in `history`.
history = model.fit(
    train_data,
    epochs=20,
    validation_data=validation_data,
)

In [None]:
# Write the trained model to disk; the ".h5" extension selects the HDF5 format.
model.save("weights/audio_classification.h5")

## Inference

In [None]:
# Classify a single WAV file with the trained model.
# (numpy is imported in the notebook's import cell at the top.)
file_path = "input/voice_63.wav"

# Decode to mono and force exactly 48000 samples so the clip matches the
# model's (48000, 1) input shape.
x = tf.io.read_file(file_path)
x, sample_rate = tf.audio.decode_wav(x, desired_channels=1, desired_samples=48000)

# Add a leading batch axis: (48000, 1) -> (1, 48000, 1).
x = x[tf.newaxis, ...]

prediction = model.predict(x)

# Index of the highest-scoring class for this one clip.
np.argmax(prediction)