<font size="5">Create edge data set</font>

Create a balanced data set for edge learning.

In [1]:
# Import modules and dependencies

import os
from os import listdir
from os.path import isdir, join
import pathlib
import shutil

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from tensorflow import keras

from tensorflow.keras import layers
from tensorflow.keras import models
from IPython import display
from tensorflow.python.ops import gen_audio_ops as audio_ops


import akida
import akida_models
from akida_models import kws
from akida_models.kws import preprocessing
import cnn2snn
from cnn2snn import check_model_compatibility
from cnn2snn import quantize
from cnn2snn import quantize_layer
from cnn2snn import convert

import librosa
import scipy
from scipy import io
from scipy.io import wavfile
from scipy.io.wavfile import read
import python_speech_features
import random

<font size="5">1. Load the data set</font>

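In this section the Google Speech Commands data set is loaded as a list of file names with matching class labels. The list is then shuffled, reduced to a balanced subset and split into a training and a validation set.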

In [2]:
# Define directories of the data
# data_dir:    root folder of the data set; its sub-folders are used as class names
# silence_dir: the 'silence' sub-folder of data_dir; its recordings are also used as background noise
# edge_dir:    folder whose sub-folders are the keywords added for edge learning (backward, follow and forward)

data_dir = pathlib.Path('data/Modded_Google_Edge')
silence_dir = pathlib.Path('data/Modded_Google_Edge/silence')
edge_dir = pathlib.Path('data/Edge_Categories')

In [3]:
# Build the ordered list of class names.
# The README and the three edge keywords are removed from the folder listing of
# the base data set; the edge keywords are then appended again from edge_dir,
# which places them at the end of the list (= the highest class indices).

targets = np.array(tf.io.gfile.listdir(str(data_dir)))

keep_mask = (
    (targets != 'README.md')
    & (targets != 'follow')
    & (targets != 'forward')
    & (targets != 'backward')
)
targets = targets[keep_mask]

targets_edge = np.array(tf.io.gfile.listdir(str(edge_dir)))
targets = np.append(targets, targets_edge)

print('Known and unknown commands:', targets)
print('Newly added commands for edge learning:', targets_edge)

Known and unknown commands: ['off' 'up' 'down' 'on' 'stop' 'yes' 'right' 'unknown' 'left' 'go' 'no'
 'silence' 'forward' 'follow' 'backward']
Newly added commands for edge learning: ['forward' 'follow' 'backward']


In [4]:
# For every class, collect the file names found in its folder below data_dir
# and a label array of the same length filled with the class index (as float).
# Both results are lists with one entry per class; they are flattened later.

filenames = []
y = []
for i, target in enumerate(targets):
    class_dir = join(data_dir, target)
    print(class_dir)
    class_files = listdir(class_dir)
    filenames.append(class_files)
    y.append(np.ones(len(class_files)) * i)

data/Modded_Google_Edge/off
data/Modded_Google_Edge/up
data/Modded_Google_Edge/down
data/Modded_Google_Edge/on
data/Modded_Google_Edge/stop
data/Modded_Google_Edge/yes
data/Modded_Google_Edge/right
data/Modded_Google_Edge/unknown
data/Modded_Google_Edge/left
data/Modded_Google_Edge/go
data/Modded_Google_Edge/no
data/Modded_Google_Edge/silence
data/Modded_Google_Edge/forward
data/Modded_Google_Edge/follow
data/Modded_Google_Edge/backward


In [5]:
# Collect the file names of the silence folder.
# Labels are not necessary here because these files are only used later as
# background noise that is added to other recordings.
# The names are stored as a list containing one list, hence the printed shape (1, n).

silence_filenames = [listdir(silence_dir)]
print(np.shape(silence_filenames))

(1, 402)


In [6]:
# Flatten the per-class lists into one flat list of file names and one flat
# list of labels. The order is kept, so entry k of both lists belongs together.
# NOTE: run this cell only once per kernel session; it replaces its own inputs.

filenames_flat = []
for class_files in filenames:
    filenames_flat.extend(class_files)
filenames = filenames_flat

y_flat = []
for class_labels in y:
    y_flat.extend(class_labels)
y = y_flat

# Both lists must have the same size

print('Filenames: ', len(filenames))
print('y: ', len(y))

Filenames:  101462
y:  101462


In [7]:
# Keep only the recordings that decode to exactly 16000 mono samples
# (one second of audio); every other file is removed together with its label.

filenames_temp = []
y_temp = []

for index, filename in enumerate(filenames):
    label = y[index]

    # The label is the class index, which selects the class folder
    path = join(data_dir, targets[int(label)], filename)
    wav_loader = tf.io.read_file(path)
    wav_decoder = tf.audio.decode_wav(wav_loader, desired_channels=1, desired_samples=-1)

    if wav_decoder[0].shape != (16000, 1):
        continue

    filenames_temp.append(filename)
    y_temp.append(label)

filenames = filenames_temp
y = y_temp

print('Filenames: ', len(filenames))
print('y: ', len(y))

2022-01-17 15:26:11.512568: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-01-17 15:26:11.576585: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-01-17 15:26:11.576763: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-01-17 15:26:11.577380: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

Filenames:  91359
y:  91359


In [9]:
# Distribution of all labels
# Maps every class index (position of the class in `targets`) to the number of
# recordings that are left for it after the length check

unique, counts = np.unique(y, return_counts=True)
dict(zip(unique, counts))

{0.0: 3427,
 1.0: 3269,
 2.0: 3580,
 3.0: 3471,
 4.0: 3563,
 5.0: 3692,
 6.0: 3448,
 7.0: 55955,
 8.0: 3502,
 9.0: 3478,
 10.0: 3545,
 11.0: 398,
 12.0: 10,
 13.0: 11,
 14.0: 10}

In [10]:
# Number of samples that are kept for each of the classes 'unknown' and 'silence'.
# The intention is that each of them makes up roughly 10% of the final data set
# (every regular keyword contributes 50 utterances, every new edge keyword 40
# training entries). The value is hard-coded; it can be checked against the
# distribution of the final data set.

amount_sil_un = 80
print(amount_sil_un)

80


In [11]:
# Turn the nested list [[name, name, ...]] of silence file names into one flat
# 1-D array. reshape(-1) is used instead of np.squeeze because squeeze would
# collapse a folder with a single file into a 0-d array, on which len() fails.

silence_filenames = np.asarray(silence_filenames).reshape(-1)

# Print the length in order to check if something has changed

print(len(silence_filenames))

402


In [12]:
# Keep only the silence files that decode to exactly 16000 mono samples
# (one second of audio).

silence_temp = []

for filename in silence_filenames:
    wav_loader = tf.io.read_file(join(silence_dir, filename))
    wav_decoder = tf.audio.decode_wav(wav_loader, desired_channels=1, desired_samples=-1)

    if wav_decoder[0].shape != (16000, 1):
        continue

    silence_temp.append(filename)

silence_filenames = silence_temp

print('Silence Filenames: ', len(silence_filenames))

Silence Filenames:  398


In [13]:
# Shuffle file names of the silence category.
# The random generator is seeded first, because this is the first use of
# `random` in the notebook: with a fixed seed this shuffle and all later random
# selections give the same result on every Restart & Run All (as long as the
# folders are listed in the same order).
# The first entry is printed before and after the shuffle to check if it worked.

RANDOM_SEED = 42
random.seed(RANDOM_SEED)

print(silence_filenames[0])
random.shuffle(silence_filenames)
print(silence_filenames[0])

31dude_miaowing.wav
4doing_the_dishes.wav


In [14]:
# Only keep `amount_sil_un` randomly selected files of the class 'unknown'.
# The (file name, label) pairs are shuffled first; afterwards the first
# `amount_sil_un` pairs of 'unknown' are kept and the rest is discarded.
# Pairs of all other classes are kept unchanged.

filenames_target = list(zip(filenames, y))
random.shuffle(filenames_target)

# Look the class index up by name instead of hard-coding it, because it depends
# on the order in which the class folders were listed. Raises ValueError if
# there is no class 'unknown'.
unknown_id = [str(name) for name in targets].index('unknown')

counter = 0
result = []
for sample in filenames_target:
    if sample[1] != unknown_id:
        result.append(sample)
    elif counter < amount_sil_un:
        result.append(sample)
        counter += 1
filenames_target = result

print(len(filenames_target))

35484


In [15]:
# Check if the distribution fits
# Unzipping the (file name, label) pairs also replaces `filenames` and `y` by
# the reduced selection; the class 'unknown' should now have amount_sil_un entries

filenames, y = zip(*filenames_target)
unique, counts = np.unique(y, return_counts=True)
dict(zip(unique, counts))

{0.0: 3427,
 1.0: 3269,
 2.0: 3580,
 3.0: 3471,
 4.0: 3563,
 5.0: 3692,
 6.0: 3448,
 7.0: 80,
 8.0: 3502,
 9.0: 3478,
 10.0: 3545,
 11.0: 398,
 12.0: 10,
 13.0: 11,
 14.0: 10}

In [16]:
# Do the same for the silence category: keep only the first `amount_sil_un`
# 'silence' pairs of the already shuffled list, all other pairs stay unchanged.

# Look the class index up by name instead of hard-coding it, because it depends
# on the order in which the class folders were listed. Raises ValueError if
# there is no class 'silence'.
silence_id = [str(name) for name in targets].index('silence')

counter = 0
result = []
for sample in filenames_target:
    if sample[1] != silence_id:
        result.append(sample)
    elif counter < amount_sil_un:
        result.append(sample)
        counter += 1
filenames_target = result

print(len(filenames_target))

35166


In [17]:
# Check if it worked
# `filenames` and `y` are replaced by the reduced selection again; the classes
# 'unknown' and 'silence' should now both have amount_sil_un entries

filenames, y = zip(*filenames_target)
unique, counts = np.unique(y, return_counts=True)
dict(zip(unique, counts))

{0.0: 3427,
 1.0: 3269,
 2.0: 3580,
 3.0: 3471,
 4.0: 3563,
 5.0: 3692,
 6.0: 3448,
 7.0: 80,
 8.0: 3502,
 9.0: 3478,
 10.0: 3545,
 11.0: 80,
 12.0: 10,
 13.0: 11,
 14.0: 10}

In [18]:
# Take `target_amount` samples from every keyword besides backward, follow and
# forward. All samples of these edge keywords and of 'silence' / 'unknown' are
# kept. filenames_target was shuffled before, so taking the first
# `target_amount` samples of a class is a random selection.

# Class indices are looked up by name instead of being hard-coded, because they
# depend on the order in which the class folders were listed. A missing class
# raises a KeyError that names it.
class_ids = {str(name): class_id for class_id, name in enumerate(targets)}

forward_id = class_ids['forward']
backward_id = class_ids['backward']
follow_id = class_ids['follow']
silence_id = class_ids['silence']
unknown_id = class_ids['unknown']

keep_all_ids = {backward_id, forward_id, follow_id, silence_id, unknown_id}

filenames_target_temp = []

target_amount = 50

# One pass over the data per class. The classes that are kept completely are
# copied during the first pass only, so they are not added once per class.
for index, target in enumerate(targets):
    counter = 0
    for sample in filenames_target:
        label = int(sample[1])
        if label in keep_all_ids:
            if index == 0:
                filenames_target_temp.append(sample)
        elif label == index and counter < target_amount:
            filenames_target_temp.append(sample)
            counter += 1

filenames_target = filenames_target_temp
            


In [19]:
# Check the distribution
# `filenames` and `y` are replaced by the reduced selection again; every regular
# keyword should now have target_amount entries

filenames, y = zip(*filenames_target)
unique, counts = np.unique(y, return_counts=True)
dict(zip(unique, counts))

{0.0: 50,
 1.0: 50,
 2.0: 50,
 3.0: 50,
 4.0: 50,
 5.0: 50,
 6.0: 50,
 7.0: 80,
 8.0: 50,
 9.0: 50,
 10.0: 50,
 11.0: 80,
 12.0: 10,
 13.0: 11,
 14.0: 10}

In [20]:
# Data is split in 10% validation and 90% training
# Compute what 90% / 10% of the data is. The samples of the edge keywords are
# not part of this computation; they are counted from the data instead of
# being subtracted as a hard-coded number.

edge_names = {str(name) for name in targets_edge}
edge_ids = {class_id for class_id, name in enumerate(targets) if str(name) in edge_names}
num_edge_samples = sum(1 for sample in filenames_target if int(sample[1]) in edge_ids)

num_samples = len(filenames_target) - num_edge_samples
train_amount = num_samples * 0.9
val_amount = num_samples - train_amount
print(train_amount)



594.0


In [21]:
# Split the data set
#
# For every class the first `quota` samples (in the current order of
# filenames_target) go to the training set, all remaining samples of that class
# go to the validation set:
#   * regular keywords:      train_non_edge_amount training samples
#   * 'unknown' / 'silence': silence_unknown_amount training samples
#   * edge keywords:         activ_train_amount_per_word training samples, each
#                            of them added edge_repeat_factor times
#
# The rules are keyed by class name and translated to class indices through
# `targets`, so they do not depend on the order in which the class folders were
# listed. Samples of a class without a rule are skipped.

filenames_target_train = []
filenames_target_val = []

activ_train_amount_per_word = 4
train_non_edge_amount = 45
silence_unknown_amount = 72
edge_repeat_factor = 10

# class name -> (number of training samples, repetitions per training sample)
split_rules = {}
for word in ('off', 'up', 'down', 'on', 'stop', 'yes', 'right', 'left', 'go', 'no'):
    split_rules[word] = (train_non_edge_amount, 1)
for word in ('unknown', 'silence'):
    split_rules[word] = (silence_unknown_amount, 1)
for word in ('backward', 'forward', 'follow'):
    split_rules[word] = (activ_train_amount_per_word, edge_repeat_factor)

# class index -> rule, and class index -> training samples taken so far
rules_by_id = {
    class_id: split_rules[str(name)]
    for class_id, name in enumerate(targets)
    if str(name) in split_rules
}
train_counts = dict.fromkeys(rules_by_id, 0)

for sample in filenames_target:
    class_id = int(sample[1])
    if class_id not in rules_by_id:
        continue

    quota, repeats = rules_by_id[class_id]
    if train_counts[class_id] < quota:
        filenames_target_train.extend([sample] * repeats)
        train_counts[class_id] += 1
    else:
        filenames_target_val.append(sample)

In [22]:
# Check training data distribution
# Note: this replaces `filenames` and `y` by the content of the training set

filenames, y = zip(*filenames_target_train)
unique, counts = np.unique(y, return_counts=True)
dict(zip(unique, counts))

{0.0: 45,
 1.0: 45,
 2.0: 45,
 3.0: 45,
 4.0: 45,
 5.0: 45,
 6.0: 45,
 7.0: 72,
 8.0: 45,
 9.0: 45,
 10.0: 45,
 11.0: 72,
 12.0: 40,
 13.0: 40,
 14.0: 40}

In [23]:
# Check validation data distribution
# Note: this replaces `filenames` and `y` by the content of the validation set

filenames, y = zip(*filenames_target_val)
unique, counts = np.unique(y, return_counts=True)
dict(zip(unique, counts))

{0.0: 5,
 1.0: 5,
 2.0: 5,
 3.0: 5,
 4.0: 5,
 5.0: 5,
 6.0: 5,
 7.0: 8,
 8.0: 5,
 9.0: 5,
 10.0: 5,
 11.0: 8,
 12.0: 6,
 13.0: 7,
 14.0: 6}

In [24]:
# Shuffle both sets
# The lists are shuffled in place; in the training set this also separates the
# repeated edge samples, which were added directly one after another

random.shuffle(filenames_target_train)
random.shuffle(filenames_target_val)

In [25]:
# Separate the (file name, label) pairs of both sets into a tuple of file names
# and a tuple of labels, then print the size of both sets

train_files, y_train = zip(*filenames_target_train)
filenames_train = train_files

val_files, y_val = zip(*filenames_target_val)
filenames_val = val_files

print('Training set size', len(train_files))
print('Validation set size', len(val_files))

Training set size 714
Validation set size 85


In [26]:
# Does the label size match?
# Every file needs exactly one label, so stop here if files and labels of a set
# differ in length instead of relying on comparing the printed numbers by eye.

assert len(y_train) == len(train_files), 'Training labels and training files differ in length'
assert len(y_val) == len(val_files), 'Validation labels and validation files differ in length'

print('Training set size', len(y_train))
print('Validation set size', len(y_val))


Training set size 714
Validation set size 85


<font size="5">2. Define MFCC functions </font>

In this chapter the functions for calculating the MFCCs are defined.

In [27]:
# Helper function of the Akida library that derives the MFCC settings
# (number of samples per clip, window size and stride in samples, ...) from the
# sample rate and the durations given in milliseconds

model_settings = akida_models.kws.preprocessing.prepare_model_settings(
    sample_rate=16000,
    clip_duration_ms=1000,
    window_size_ms=30,
    window_stride_ms=10,
    feature_bin_count=40,
)
print(model_settings)

{'desired_samples': 16000, 'window_size_samples': 480, 'window_stride_samples': 160, 'spectrogram_length': 98, 'fingerprint_width': 40, 'fingerprint_size': 3920}


In [28]:
# Function: Create MFCC from given path

def calc_mfcc(path,num_ceps):
    """Load a wave file, augment it and return the MFCCs of the result.

    The recording is shifted by 10% of the clip length in a random direction
    (the gap is filled with zeros) and a randomly chosen silence recording,
    scaled by a random factor between 0 and 0.1, is added as background noise.

    Args:
        path: Path of the wave file.
        num_ceps: Not used by this function. The number of coefficients is
            taken from model_settings['fingerprint_width'].

    Returns:
        The result of librosa.feature.mfcc for the augmented signal
        (presumably an array of shape (coefficients, frames) - confirm against
        the librosa documentation). If the file does not decode to exactly
        16000 mono samples, the undecorated audio tensor is returned instead,
        without augmentation or MFCC computation.
    """

    # Load wave file

    wav_decoder = tf.audio.decode_wav(tf.io.read_file(path), desired_channels=1, desired_samples=-1)
    if wav_decoder[0].shape != (16000,1):
        return wav_decoder.audio

    clip_samples = model_settings['desired_samples']
    shift_samples = int(clip_samples*0.1)

    # Randomly shift the signal: either cut off the beginning and pad the end,
    # or pad the beginning and cut off the end

    if random.randint(0,1) == 0:
        paddings = [[0, shift_samples], [0, 0]]
        begin = [shift_samples,0]
    else:
        paddings = [[shift_samples, 0], [0, 0]]
        begin = [0,0]

    # The factor 1 leaves the volume of the recording unchanged
    shifted_wav = tf.slice(
        tf.pad(tf.multiply(wav_decoder.audio, 1), paddings),
        begin = begin,
        size = [clip_samples, -1],
    )

    # Take random silence file and add it to the signal in range between 0 and 0.1

    noise_index = random.randint(0,len(silence_filenames) - 1)
    noise_gain = random.uniform(0, 0.1)
    noise_loader = tf.io.read_file(join(silence_dir,silence_filenames[noise_index]))
    noise_decoder = tf.audio.decode_wav(noise_loader,desired_channels=1, desired_samples=clip_samples)
    noisy_wav = tf.add(shifted_wav, tf.multiply(noise_decoder.audio, noise_gain))
    noisy_wav = tf.squeeze(noisy_wav).numpy()

    # Create MFCC of the augmented signal

    return librosa.feature.mfcc(
        y=noisy_wav,
        sr=clip_samples,
        n_mfcc=model_settings['fingerprint_width'],
        n_fft=512,
        win_length=model_settings['window_size_samples'],
        hop_length=model_settings['window_stride_samples'],
    )


In [31]:
# Function that creates the MFCC and check if they have the desired length 

def extract_features(samples, labels, num_ceps):
    """Compute the MFCCs of all samples and drop the ones with a wrong size.

    Args:
        samples: File names; the file of entry k is looked up in the folder of
            the class given by labels[k].
        labels: Class indices (positions in `targets`), one per sample.
        num_ceps: Required size of the second dimension of the result of
            calc_mfcc. Results with a different size are dropped and reported.

    Returns:
        A tuple (features, kept_labels, dropped): the list of accepted MFCC
        results, the list of their labels and the number of dropped samples.
    """
    features = []
    kept_labels = []
    dropped = 0

    for index, filename in enumerate(samples):
        label = labels[index]

        # Create path from given filename and target item
        path = join(data_dir, targets[int(label)], filename)

        # Create MFCCs
        mfccs = calc_mfcc(path,num_ceps)

        if mfccs.shape[1] != num_ceps:
            print('Dropped:', index, mfccs.shape)
            dropped += 1
            continue

        features.append(mfccs)
        kept_labels.append(label)

    return features, kept_labels, dropped

<font size="5">3. Save the MFCCs </font>

Save the MFCCs together with their targets, separated into a training set and a validation set.

In [32]:
# Create train and validation set
#
# num_ceps is the size that the second dimension of every MFCC result must
# have. calc_mfcc calls librosa.feature.mfcc, whose result has one column per
# frame; with librosa's default centered framing a clip yields
# 1 + samples // hop_length frames (101 for these settings). This is not the
# same as model_settings['spectrogram_length'].
num_ceps = 1 + model_settings['desired_samples'] // model_settings['window_stride_samples']

# The labels are taken from the (file name, label) pairs, which have the same
# order as train_files / val_files. This keeps the cell re-runnable although it
# replaces y_train and y_val by the labels of the accepted samples.
train_labels = [pair[1] for pair in filenames_target_train]
val_labels = [pair[1] for pair in filenames_target_val]

x_train, y_train, prob = extract_features(train_files, train_labels, num_ceps)
# Fraction of removed samples, relative to the number of samples before removal
print('Removed percentage:', prob / len(train_labels) if train_labels else 0.0)
x_val, y_val, prob = extract_features(val_files, val_labels, num_ceps)
print('Removed percentage:', prob / len(val_labels) if val_labels else 0.0)

Removed percentage: 0.0
Removed percentage: 0.0


In [33]:
# Save the MFCCs and their targets of both sets in one .npz archive;
# every array is stored under the name of its variable

feature_sets_file = 'stored_files_targets_edge.npz'

arrays_to_store = {
    'x_train': x_train,
    'y_train': y_train,
    'x_val': x_val,
    'y_val': y_val,
}
np.savez(feature_sets_file, **arrays_to_store)



In [34]:
# Check if everything worked correctly
# Reload the stored archive and show the names of the arrays it contains

feature_sets = np.load(feature_sets_file)
feature_sets.files

['x_train', 'y_train', 'x_val', 'y_val']