# Lab 5: Google Speech Commands

## Imports

In [2]:
import copy
import wave
from pathlib import Path
import numpy as np
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Input, Conv1D, AvgPool1D, MaxPool1D, ZeroPadding1D, BatchNormalization, Flatten, Dense, Activation
from keras.utils.data_utils import get_file
from keras.utils.np_utils import to_categorical

## Load raw spoken digits data from Google Speech Commands

In [36]:
import os
import glob
import numpy as np
import soundfile as sf # install this in the kernel under Settings
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.optimizers import SGD
from pathlib import Path
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from keras.models import Sequential
from keras.layers import Input, Conv1D, AvgPool1D, MaxPool1D, ZeroPadding1D, BatchNormalization, Flatten, Dense, Activation
import tensorflow as tf

dataset_dir = 'datasets'
CLASSES = ["nightingale", "greenfinch", "quail"]

x = []
y = []

for class_name in CLASSES:
    class_dir = os.path.join(dataset_dir, class_name)
    for mp3_file in glob.glob(os.path.join(class_dir, "*.mp3")):
        data, sr = sf.read(mp3_file)
        data = data.astype(np.float32) # Convert to 32-bit floating-point
        data.resize((16000, 1)) # Resize to 1s (16kHz) with zero-padding, 1 channel
        
        x.append(data)
        y.append(CLASSES.index(class_name))
        

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)   

x_train = np.array(x_train)
y_train = to_categorical(np.array(y_train))
x_test = np.array(x_test)
y_test = to_categorical(np.array(y_test))

print("\nTRAIN\n")
print("DATA")
print(x_train)
print("\nLABEL")
print(y_train)

print("\nTEST\n")
print("DATA")
print(x_test)
print("\nLABEL")
print(y_test)




TRAIN

DATA
[[[ 0.0000000e+00]
  [ 0.0000000e+00]
  [ 0.0000000e+00]
  ...
  [ 3.2614678e-02]
  [-4.1543241e-03]
  [ 4.8602957e-02]]

 [[ 2.0583554e-03]
  [-7.7218879e-03]
  [ 2.1314288e-03]
  ...
  [ 7.5069265e-03]
  [ 9.1339042e-03]
  [ 7.2912467e-03]]

 [[ 0.0000000e+00]
  [ 7.1350287e-11]
  [ 2.1943704e-11]
  ...
  [-1.6543496e-02]
  [-1.6452722e-02]
  [-1.5552191e-02]]

 ...

 [[ 0.0000000e+00]
  [ 0.0000000e+00]
  [-3.0598895e-11]
  ...
  [ 6.8211480e-04]
  [ 2.3256075e-03]
  [ 2.9792378e-03]]

 [[-3.2963223e-06]
  [-5.5894852e-06]
  [-3.7297962e-06]
  ...
  [ 1.2903062e-04]
  [-1.5235022e-03]
  [-4.9856215e-05]]

 [[ 1.5592575e-03]
  [ 1.3773441e-03]
  [ 8.2981586e-04]
  ...
  [ 5.4007769e-03]
  [ 4.5046806e-03]
  [ 3.1348467e-03]]]

LABEL
[[1. 0. 0.]
 [1. 0. 0.]
 [0. 0. 1.]
 [0. 1. 0.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 1. 0.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 1. 0.]
 [0. 0. 1.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 0. 1.]
 [0. 1. 0.]
 [0. 1. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [



## Prepare for inference with fixed-point Q7.9 samples by scaling input data accordingly

In [37]:
FIXED_POINT = 9
x_train /= 2**FIXED_POINT
x_test  /= 2**FIXED_POINT

## Export small dataset (250 random vectors)

In [38]:
perms = np.random.permutation(len(y_test))[0:250]
x_test_250 = x_test[perms]
y_test_250 = y_test[perms]
np.savetxt('x_test_gsc_250.csv', x_test_250.reshape((x_test_250.shape[0], -1)), delimiter=',', fmt='%s')
np.savetxt('y_test_gsc_250.csv', y_test_250, delimiter=',', fmt='%s')

## Build model M5

In [39]:
x_train = np.expand_dims(x_train, axis=-1)
model = Sequential()

model.add(Input(shape=(16000, 1)))

model.add(Conv1D(filters=32, kernel_size=80, strides=16, activation='relu'))
model.add(BatchNormalization(momentum=0.1))
model.add(MaxPool1D(pool_size=4))

model.add(Conv1D(filters=32, kernel_size=3, activation='relu'))
model.add(BatchNormalization(momentum=0.1))

model.add(MaxPool1D(pool_size=4))

model.add(Conv1D(filters=64, kernel_size=3, activation='relu'))
model.add(BatchNormalization(momentum=0.1))
model.add(MaxPool1D(pool_size=4))

model.add(Conv1D(filters=64, kernel_size=3, activation='relu'))
model.add(BatchNormalization(momentum=0.1))

model.add(MaxPool1D(pool_size=4))

model.add(Flatten())
model.add(Dense(units=len(CLASSES)))
model.add(Activation('softmax')) # SoftMax activation needs to be separate from Dense to remove it later on

# EXPLORE Learning Rate
opt = tf.keras.optimizers.Adam(learning_rate=10e-3)
model.summary()
model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['categorical_accuracy'])

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d_13 (Conv1D)          (None, 996, 32)           2592      
                                                                 
 batch_normalization_12 (Bat  (None, 996, 32)          128       
 chNormalization)                                                
                                                                 
 max_pooling1d_12 (MaxPoolin  (None, 249, 32)          0         
 g1D)                                                            
                                                                 
 conv1d_14 (Conv1D)          (None, 247, 32)           3104      
                                                                 
 batch_normalization_13 (Bat  (None, 247, 32)          128       
 chNormalization)                                                
                                                      

## Train model

In [40]:
model.fit(x_train, y_train, epochs=4, batch_size=50, validation_data=(x_test, y_test))

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7f27bc24ed70>

## Evaluate model on test dataset

In [41]:
model.evaluate(x_test, y_test, verbose=2)
pred_test = model.predict(x_test)
print(tf.math.confusion_matrix(y_test.argmax(axis=1), pred_test.argmax(axis=1)))

1/1 - 0s - loss: 160.9853 - categorical_accuracy: 0.4167 - 22ms/epoch - 22ms/step
tf.Tensor(
[[5 0 0]
 [5 0 0]
 [2 0 0]], shape=(3, 3), dtype=int32)


## Evaluate model on small dataset

In [42]:
model.evaluate(x_test_250, y_test_250, verbose=2)
pred_test_250 = model.predict(x_test_250)
print(tf.math.confusion_matrix(y_test_250.argmax(axis=1), pred_test_250.argmax(axis=1)))

1/1 - 0s - loss: 160.9853 - categorical_accuracy: 0.4167 - 21ms/epoch - 21ms/step
tf.Tensor(
[[5 0 0]
 [5 0 0]
 [2 0 0]], shape=(3, 3), dtype=int32)


## Save trained model

In [49]:
model.save('bird_recognition.h5')





## Remove SoftMax layer

In [50]:
model = tf.keras.Model(model.input, model.layers[-2].output, name=model.name)

## Install MicroAI for C inference code generation (kerascnn2c module)

In [51]:
%pip install https://bitbucket.org/edge-team-leat/microai_public/get/6adfbcb347d3.zip#subdirectory=third_party/kerascnn2c_fixed
import kerascnn2c

Defaulting to user installation because normal site-packages is not writeable
Collecting https://bitbucket.org/edge-team-leat/microai_public/get/6adfbcb347d3.zip#subdirectory=third_party/kerascnn2c_fixed
  Downloading https://bitbucket.org/edge-team-leat/microai_public/get/6adfbcb347d3.zip (1.9 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m[31m1.6 MB/s[0m eta [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Note: you may need to restart the kernel to use updated packages.


## Generate C code for the trained model with 16-bit fixed-point representation

In [52]:
import copy
res = kerascnn2c.Converter(output_path=Path('gsc_output_fixed'),
                           fixed_point=FIXED_POINT, # Number of bits for the fractional part, Q7.9 format
                           number_type='int16_t', # Data type for weights/activations (16 bits quantization)
                           long_number_type='int32_t', # Data type for intermediate results
                           number_min=-(2**15), # Minimum value for 16-bit signed integers
                           number_max=(2**15)-1 # Maximum value for 16-bit signed integers
                          ).convert_model(copy.deepcopy(model))
with open('gsc_model_fixed.h', 'w') as f:
    f.write(res)





INFO:tensorflow:Assets written to: ram://5cdd2f3f-233b-4502-aefb-5f60342b07b0/assets


INFO:tensorflow:Assets written to: ram://5cdd2f3f-233b-4502-aefb-5f60342b07b0/assets






———————————————————————————————————————————————————————————————————————————————————————————————————————
Inputs                           | Layer                            | Outputs                         
———————————————————————————————————————————————————————————————————————————————————————————————————————
                                 | input_5                          | conv1d_13                       
-------------------------------------------------------------------------------------------------------
input_5                          | conv1d_13                        | batch_normalization_12          
-------------------------------------------------------------------------------------------------------
conv1d_13                        | batch_normalization_12           | max_pooling1d_12                
-------------------------------------------------------------------------------------------------------
batch_normalization_12           | max_pooling1d_12                 

## Compile the 16-bit fixed-point C code for x86 and evaluate on small dataset

In [56]:
!g++ -Wall -Wextra -pedantic -Ofast -o gsc_fixed -Igsc_output_fixed/ gsc_output_fixed/model.c main.cpp 
!./gsc_fixed x_test_gsc_250.csv y_test_gsc_250.csv

[01m[Kgsc_output_fixed/model.c:[m[K In function ‘[01m[Kvoid cnn(const number_t (*)[16000], number_t*)[m[K’:
  184 |     [01;35m[Kactivations2.max_pooling1d_15_output[m[K, // Last layer uses output passed as model parameter
      |     [01;35m[K~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~[m[K
Testing accuracy: 0.416667
