# CRNNs for footstep classification

Download the ESC-50 dataset, which contains footstep audio clips.

ONLY NEEDS TO BE RUN ONCE

In [None]:
!wget https://github.com/karoldvl/ESC-50/archive/master.zip -O esc50.zip
!unzip esc50.zip

--2025-11-30 13:12:43--  https://github.com/karoldvl/ESC-50/archive/master.zip
Resolving github.com (github.com)... 140.82.116.3
Connecting to github.com (github.com)|140.82.116.3|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://github.com/karolpiczak/ESC-50/archive/master.zip [following]
--2025-11-30 13:12:43--  https://github.com/karolpiczak/ESC-50/archive/master.zip
Reusing existing connection to github.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://codeload.github.com/karolpiczak/ESC-50/zip/refs/heads/master [following]
--2025-11-30 13:12:44--  https://codeload.github.com/karolpiczak/ESC-50/zip/refs/heads/master
Resolving codeload.github.com (codeload.github.com)... 140.82.116.10
Connecting to codeload.github.com (codeload.github.com)|140.82.116.10|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [application/zip]
Saving to: ‘esc50.zip’

esc50.zip               [    

## Import libraries

In [None]:
# data handling
import pandas as pd
import os
from sklearn.model_selection import train_test_split

# specifically for handling audio
import librosa

# machine learning libraries
import tensorflow as tf
from tensorflow.keras import layers, Sequential

# for saving the model
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import ModelCheckpoint

## Apply settings

In [None]:
# Directory containing the ESC-50 audio clips; load_audio joins it with each filename
AUDIO_DIR = "ESC-50-master/audio"
# Sample rate passed to librosa.load, so every clip is resampled to this rate
SR = 22050
# Number of clips per batch produced by make_dataset
BATCH_SIZE = 8
# Intended STFT frame length (same value as the Spectrogram layer's default n_fft)
N_FFT = 1024
# Intended STFT hop size (same value as the Spectrogram layer's default hop_length)
HOP_LENGTH = 512
# Number of passes over the training data made by model.fit
EPOCHS = 3

## Load and format the dataset

The esc-50 dataset contains other classes besides "footsteps". This code assigns "1" to the "footsteps" class, and "0" to every other class.

Here, we also use a dataset object. While the ESC-50 dataset is still fairly small, larger datasets quickly encounter problems when being loaded into RAM all at once. The dataset object handles this by loading the data in stages, only loading the files which the model is currently using.

In [None]:
# read the ESC-50 metadata table (filenames and their categories)
metadata = pd.read_csv("ESC-50-master/meta/esc50.csv")

# binary target: 1 for clips in the "footsteps" category, 0 for everything else
metadata["binary_label"] = metadata["category"].eq("footsteps").astype(int)

# loads audio files from dataset
def load_audio(filename):
    """Read one clip from AUDIO_DIR and return it as a float32 tensor.

    Args:
        filename: Name of the audio file, relative to AUDIO_DIR.

    Returns:
        A tf.float32 tensor holding the waveform, resampled to SR.
    """
    clip_path = os.path.join(AUDIO_DIR, filename)
    # librosa resamples on load when sr is given, so the returned rate is
    # always SR and does not need to be kept
    waveform, _ = librosa.load(clip_path, sr=SR)
    return tf.convert_to_tensor(waveform, dtype=tf.float32)

# generates dataset
def make_dataset(df, shuffle=False):
    """Build a batched tf.data pipeline that streams the clips listed in *df*.

    Args:
        df: Metadata rows with a "filename" and a "binary_label" column.
        shuffle: When True, the file order is re-drawn every time the dataset
            is iterated (i.e. once per epoch). Only the row order is shuffled,
            so no audio has to be buffered in memory. Defaults to False, which
            keeps the rows in the order given.

    Returns:
        A tf.data.Dataset yielding (audio, label) batches of up to BATCH_SIZE
        clips, with the audio padded to the longest clip in each batch.
    """
    def gen():
        # from_generator calls gen() afresh for every pass over the dataset,
        # so sampling here gives a new file order each epoch
        rows = df.sample(frac=1) if shuffle else df
        # iterate the two needed columns directly instead of building a
        # Series per row with iterrows()
        for filename, label in zip(rows["filename"], rows["binary_label"]):
            yield load_audio(filename), int(label)

    dataset = tf.data.Dataset.from_generator(
        gen,
        output_signature=(                                    # shapes and dtypes of what gen() yields
            tf.TensorSpec(shape=(None,), dtype=tf.float32),   # variable-length waveform
            tf.TensorSpec(shape=(), dtype=tf.int32),          # scalar label (0 or 1)
        ),
    )

    # pad so that every audio clip within a batch has the same length;
    # the scalar labels need no padding
    return dataset.padded_batch(
        batch_size=BATCH_SIZE,
        padded_shapes=([None], []),
    )


# split the data into train / validation
train_df, test_df = train_test_split(
    metadata,
    test_size=0.2,
    random_state=42,
    stratify=metadata["binary_label"],  # keeps the class ratio equal in both splits
)

# Shuffle the training files inside the generator rather than calling
# .shuffle() on the batched dataset: shuffling after batching only reorders
# whole batches and holds the shuffle buffer's worth of batches in RAM,
# which defeats the point of streaming the data.
train_dataset = make_dataset(train_df, shuffle=True).prefetch(tf.data.AUTOTUNE)
test_dataset = make_dataset(test_df).prefetch(tf.data.AUTOTUNE)



## Spectrograms
Keras/TensorFlow lets you write custom layers. As we are not using a predefined "spectrogram" layer, we write our own. This code creates a custom spectrogram layer for use later in the Keras Sequential model.

It should be noted that we use a log spectrogram here since human hearing is logarithmic. This generally performs better on audio within the range of human hearing.

In [None]:
class Spectrogram(layers.Layer):
    """Keras layer that converts raw waveforms into log-magnitude spectrograms.

    Args:
        n_fft: STFT frame length in samples.
        hop_length: Step between successive frames in samples.
        **kwargs: Standard Keras layer arguments (name, dtype, ...). Accepting
            them lets Keras rebuild the layer from its saved config.
    """

    def __init__(self, n_fft=1024, hop_length=512, **kwargs):
        super().__init__(**kwargs)
        self.n_fft = n_fft
        self.hop_length = hop_length

    def call(self, audio):
        """Return log(1 + |STFT(audio)|), computed along the last axis of *audio*."""
        stft = tf.signal.stft(
            audio,                            # data to pre-process
            frame_length=self.n_fft,          # length of section to apply FFT to
            frame_step=self.hop_length,       # stride length
            window_fn=tf.signal.hann_window   # window processing method
        )
        spec = tf.abs(stft)                   # magnitude of the complex STFT
        return tf.math.log1p(spec)            # log-spectrogram; log1p stays finite at 0

    def get_config(self):
        """Return the constructor arguments so the layer survives save/load."""
        config = super().get_config()
        config.update({"n_fft": self.n_fft, "hop_length": self.hop_length})
        return config

## Build model

In [None]:
# Take the STFT parameters from the settings cell so they are defined in
# exactly one place (n_fft is kept as a name for any later cell that reads it).
n_fft = N_FFT

model = Sequential([
    layers.Input(shape=(None,), dtype=tf.float32),        # raw waveform of any length
    Spectrogram(n_fft=n_fft, hop_length=HOP_LENGTH),      # log spectrogram

    # add channel dimension for Conv2D
    layers.Lambda(lambda x: tf.expand_dims(x, -1)),  # (time, freq, 1)

    # CNN block
    layers.Conv2D(32, (3, 3), padding='same', activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(64, (3, 3), padding='same', activation='relu'),
    layers.MaxPooling2D((2, 2)),

    # reshape for RNN: flatten freq*channels per timestep
    layers.TimeDistributed(layers.Flatten()),

    # RNN block (switched from GRU to SimpleRNN)
    layers.Bidirectional(layers.SimpleRNN(64, return_sequences=True)),
    layers.Bidirectional(layers.SimpleRNN(64)),

    # Output: a single probability that the clip contains footsteps
    layers.Dense(1, activation='sigmoid')
])

# compile the model
# NOTE(review): "footsteps" is one class among many, so a model that always
# predicts 0 can still score a high accuracy; consider also tracking
# precision/recall.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',  # use BCE as the loss function as we only have 2 classes
    metrics=['accuracy']
)

model.summary()  # For debugging purposes


## Train model

In [None]:
# Save the model during training. With save_best_only=False a save happens
# after every epoch; because the filename is fixed, each save overwrites the
# previous one, so the file always holds the most recent epoch.
checkpoint_cb = ModelCheckpoint(
    filepath="model_weights.keras",
    verbose=1,                # print a short message each time the model is saved
    save_weights_only=False,  # store the architecture together with the weights
    save_best_only=False,     # save regardless of whether the model improved
)

# Train on the training set and evaluate on the held-out set after each epoch.
model.fit(
    train_dataset,
    epochs=EPOCHS,                 # number of passes through the dataset
    callbacks=[checkpoint_cb],     # callbacks invoked by Keras during training
    validation_data=test_dataset,  # unseen data used for the per-epoch validation metrics
)

## Load model from file
If you have an existing saved model, you can run it from the file in python like this...

In [None]:
# Load the model written by the training checkpoint above.
model = load_model(
    "model_weights.keras",                       # same path the ModelCheckpoint callback writes to
    custom_objects={"Spectrogram": Spectrogram}, # include the custom spectrogram layer
    # The model contains a Lambda layer, which Keras refuses to deserialize
    # in safe mode. Only disable safe mode for files you created yourself.
    safe_mode=False,
)

model.evaluate(test_dataset)
# To run inference on your own data: predictions = model.predict(my_dataset)

...or you can export it to a TensorFlow Lite model for use on a microcontroller, as seen below. Once you have generated your `.tflite` file, run `xxd -i model.tflite > model_data.cc` from a terminal in the same directory as your `.tflite` file to convert it to a C array. It is then ready to be compiled into C/C++ code, which can be uploaded to a microcontroller as firmware.

In [None]:
# Reload the checkpointed model. The custom Spectrogram layer must be passed
# in explicitly, and safe mode must be off because the model contains a
# Lambda layer (only do this for files you created yourself).
model = tf.keras.models.load_model(
    "model_weights.keras",
    custom_objects={"Spectrogram": Spectrogram},
    safe_mode=False,
)

# Convert the Keras model to a TensorFlow Lite flatbuffer
converter = tf.lite.TFLiteConverter.from_keras_model(model)
tflite = converter.convert()

# convert() returns raw bytes, so write the file in binary mode
with open("model.tflite", "wb") as f:
    f.write(tflite)