<font size="5">Create data set</font>

Create a data set from the "Google Speech Command Data Set"

In [1]:
# Import modules and dependencies

import os
from os import listdir
from os.path import isdir, join
import pathlib
import shutil

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from tensorflow import keras

from tensorflow.keras import layers
from tensorflow.keras import models
from IPython import display
from tensorflow.python.ops import gen_audio_ops as audio_ops


import akida
import akida_models
from akida_models import kws
from akida_models.kws import preprocessing
import cnn2snn
from cnn2snn import check_model_compatibility
from cnn2snn import quantize
from cnn2snn import quantize_layer
from cnn2snn import convert

import librosa
import scipy
from scipy import io
from scipy.io import wavfile
from scipy.io.wavfile import read
import python_speech_features
import random

<font size="5">1. Load the data set</font>

The Google Speech Command data set is loaded in this section of the code as a randomly ordered list of directories.

In [2]:
# If needed, set seed for experiment reproducibility

seed = 42
tf.random.set_seed(seed)
np.random.seed(seed)

In [3]:
# Define directories of the data
# Folder Silence only includes the silence category

data_dir = pathlib.Path('data/Modded_Google')
silence_dir = pathlib.Path('data/Modded_Google/silence')

In [4]:
# Check if the keywords are the desired ones

targets = np.array(tf.io.gfile.listdir(str(data_dir)))

targets = targets[targets != 'README.md']

print('Known and unknown commands:', targets)

Known and unknown commands: ['off' 'up' 'down' 'on' 'stop' 'yes' 'right' 'unknown' 'left' 'go' 'no'
 'silence']


In [5]:
# Create list of file names, along with its target 

filenames = []
y = []
for i, target in enumerate(targets):
    print(join(data_dir, target))
    filenames.append(listdir(join(data_dir, target)))
    y.append(np.ones(len(filenames[i])) * i)

data/Modded_Google/off
data/Modded_Google/up
data/Modded_Google/down
data/Modded_Google/on
data/Modded_Google/stop
data/Modded_Google/yes
data/Modded_Google/right
data/Modded_Google/unknown
data/Modded_Google/left
data/Modded_Google/go
data/Modded_Google/no
data/Modded_Google/silence


In [6]:
# Create list with all the silence file names
# The targets in this case are note neccessary because these files are used later as added noise

silence_filenames = []
silence_filenames.append(listdir(silence_dir))
print(np.shape(silence_filenames))

(1, 402)


In [7]:
# Flatten the array to make it a list instead of a list of arrays

filenames = [item for sublist in filenames for item in sublist]
y = [item for sublist in y for item in sublist]
print('Filenames: ', len(filenames))
print('y: ', len(y))

Filenames:  106231
y:  106231


In [8]:
# Exclude files that are not one second in length

filenames_temp = []
y_temp=[]



for index, filename in enumerate(filenames):

    path = join(data_dir, targets[int(y[index])], filename)
    wav_loader = tf.io.read_file(path)
    wav_decoder = tf.audio.decode_wav(wav_loader,desired_channels=1, desired_samples=-1)
    
    if wav_decoder[0].shape == (16000,1):
        filenames_temp.append(filename)
        y_temp.append(y[index])

filenames = filenames_temp
y = y_temp

print('Filenames: ', len(filenames))
print('y: ', len(y))

2022-01-21 14:24:20.406635: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-01-21 14:24:20.467036: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-01-21 14:24:20.467217: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-01-21 14:24:20.468119: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

Filenames:  95792
y:  95792


In [9]:
# Distribution of all labels after the conversion

unique, counts = np.unique(y, return_counts=True)
dict(zip(unique, counts))

{0.0: 3427,
 1.0: 3269,
 2.0: 3580,
 3.0: 3471,
 4.0: 3563,
 5.0: 3692,
 6.0: 3448,
 7.0: 60419,
 8.0: 3502,
 9.0: 3478,
 10.0: 3545,
 11.0: 398}

In [10]:
# There are 34975 files that are not unkown or silence 
# Create number that should be equal to 10 % of the final data set
# This number is later used, such that the category "silence" and "unkown" make up 10% of the data.

amount_sil_un = 34975 / 0.8
amount_sil_un = amount_sil_un * 0.1
print(int(amount_sil_un))


4371


In [11]:
# Make file names of silence to one long list 

silence_filenames = np.squeeze(silence_filenames)

# Print the length in order to check if something has changed

print(len(silence_filenames))

402


In [12]:
# Include only the silence files which have a total of one second in length

silence_temp = []


for index, filename in enumerate(silence_filenames):

    path = join(silence_dir, filename)
    wav_loader = tf.io.read_file(path)
    wav_decoder = tf.audio.decode_wav(wav_loader,desired_channels=1, desired_samples=-1)
    
    if wav_decoder[0].shape == (16000,1):
        silence_temp.append(filename)

silence_filenames = silence_temp

print('Silence Filenames: ', len(silence_filenames))

Silence Filenames:  398


In [13]:
# Shuffle file names of the silence category
# Print the first position of the list to check if it worked

print(silence_filenames[0])
random.shuffle(silence_filenames)
print(silence_filenames[0])

31dude_miaowing.wav
19exercise_bike.wav


In [14]:
# Randomly select files of the "unknown" category in order to match the 10% constraint

filenames_target = list(zip(filenames, y))
random.shuffle(filenames_target)
#print(filenames_target)
counter = 0
index = 0
unknown_id = 7
result = []
for i in filenames_target:
    if filenames_target[index][1] != unknown_id:
        result.append(i)
    if filenames_target[index][1] == unknown_id:
        counter += 1
        if counter <= amount_sil_un:
            result.append(i)
    index += 1
filenames_target = result
    
print(len(filenames_target))
#print(filenames_target)

39744


In [15]:
# Check if the distribution still matches the one from the beginning. 

filenames, y = zip(*filenames_target)
unique, counts = np.unique(y, return_counts=True)
dict(zip(unique, counts))

{0.0: 3427,
 1.0: 3269,
 2.0: 3580,
 3.0: 3471,
 4.0: 3563,
 5.0: 3692,
 6.0: 3448,
 7.0: 4371,
 8.0: 3502,
 9.0: 3478,
 10.0: 3545,
 11.0: 398}

In [16]:
# Duplicate silence files in random order till it is 10% of the data
# The randomness was introduced by previously shuffling the set


counter = 398
silence_id = 11
result = filenames_target
index = 0

while counter <= amount_sil_un:
    for i in filenames_target:
        if filenames_target[index][1] == silence_id:
            if counter <= amount_sil_un:
                result.append(i)
                counter +=1
        index += 1
    
    

filenames_target = result
#print(counter)
#print(len(filenames_target))
#print(filenames_target)

In [17]:
# Shuffle filenames again and unzip

#filenames_target = list(zip(filenames, y))
random.shuffle(filenames_target)
filenames, y = zip(*filenames_target)

# Check if data has the right length

print(len(filenames))

43718


In [18]:
# Check if nothing changed besideds "unknown" and "silence"

unique, counts = np.unique(y, return_counts=True)
dict(zip(unique, counts))

{0.0: 3427,
 1.0: 3269,
 2.0: 3580,
 3.0: 3471,
 4.0: 3563,
 5.0: 3692,
 6.0: 3448,
 7.0: 4371,
 8.0: 3502,
 9.0: 3478,
 10.0: 3545,
 11.0: 4372}

In [19]:
# Indexing such that no file gets left out due to the integer precision

num_samples = (len(filenames_target))

a = num_samples * 0.8
b = num_samples * 0.1
c = num_samples - int(a) - int(b) - int(b)
int(a)
int(b)
print('80% : ', int(a))
print('10% : ', int(b))
print('Number of total samples: ', num_samples)
print('80% + 10% + 10% : ', int(a) + int(b) + int(b))
print('Number of samples that would be left out : ', int(c))

80% :  34974
10% :  4371
Number of total samples:  43718
80% + 10% + 10% :  43716
Number of samples that would be left out :  2


In [20]:
# Split and print all sets

train_files = filenames[:int(a)+int(c)]
val_files = filenames[int(a)+int(c): int(a)+int(c) + int(b)]
test_files = filenames[-int(b):]

print('Training set size', len(train_files))
print('Validation set size', len(val_files))
print('Test set size', len(test_files))

Training set size 34976
Validation set size 4371
Test set size 4371


In [21]:
# Same splitting procedure for target labels 

y_train = y[:int(a)+int(c)]
y_val = y[int(a)+int(c): int(a)+int(c) + int(b)]
y_test = y[-int(b):]

print('Training set size', len(y_train))
print('Validation set size', len(y_val))
print('Test set size', len(y_test))

Training set size 34976
Validation set size 4371
Test set size 4371


<font size="5">2. Define MFCC functions </font>

In this chapter the functions for calculating the MFCCs are defined.

In [22]:
 model_settings = akida_models.kws.preprocessing.prepare_model_settings(sample_rate = 16000, clip_duration_ms = 1000, window_size_ms = 30, window_stride_ms = 10, feature_bin_count = 40)
print(model_settings)

{'desired_samples': 16000, 'window_size_samples': 480, 'window_stride_samples': 160, 'spectrogram_length': 98, 'fingerprint_width': 40, 'fingerprint_size': 3920}


In [23]:
# Function: Create MFCC from given path

def calc_mfcc(path,num_ceps):
    
    # Load wave file
    
    wav_loader = tf.io.read_file(path)
    #wav_decoder = tf.audio.decode_wav(wav_loader,desired_channels=1, desired_samples=model_settings['desired_samples'])
    wav_decoder = tf.audio.decode_wav(wav_loader,desired_channels=1, desired_samples=-1)
    if wav_decoder[0].shape != (16000,1):
        return wav_decoder.audio
        
    
    # Randomly shift position to either 100ms forward or backward 

        
    position = random.randint(0,1)
    if position == 0:
        paddings = [[0, int(model_settings['desired_samples']*0.1)], [0, 0]]
        begin = [int(model_settings['desired_samples']*0.1),0]
    if position == 1:
        paddings = [[int(model_settings['desired_samples']*0.1), 0], [0, 0]]
        begin = [0,0]
    scaled_wav_decoder = tf.multiply(wav_decoder.audio, 1)
    padded_wav_decoder = tf.pad(scaled_wav_decoder, paddings)
    sliced_wav_decoder = tf.slice(padded_wav_decoder, begin = begin, size = [model_settings['desired_samples'], -1])

    # Take random silence file and add it to the signal in range between 0 and 0.1

    rand_silence = random.randint(0,len(silence_filenames) - 1)
    rand_amount = random.uniform(0, 0.1)
    silence_loader = tf.io.read_file(join(silence_dir,silence_filenames[rand_silence]))
    silence_decoder = tf.audio.decode_wav(silence_loader,desired_channels=1, desired_samples=model_settings['desired_samples'])
    silence_padded_wav = tf.add(sliced_wav_decoder, tf.multiply(silence_decoder.audio, rand_amount))
    silence_padded_wav = tf.squeeze(silence_padded_wav)
    silence_padded_wav = silence_padded_wav.numpy()

    # Create MFCC of the augmented signal

    output = librosa.feature.mfcc(y=silence_padded_wav,sr=model_settings['desired_samples'],n_mfcc=model_settings['fingerprint_width'],n_fft=512,win_length=model_settings['window_size_samples'],hop_length=model_settings['window_stride_samples'])



    return output


In [24]:
# Test if the function works properly and outputs the desired dimension of the UltraTrail setup

prob_cnt = 0
x_test_s = []
y_test_s = []
num_ceps = 101

for index, filename in enumerate(train_files):
    
    # Stop after 500
    if index >= 100:
        break
    
    # Create path from given filename and target item
    path = join(data_dir, targets[int(y_train[index])], 
                filename)
    
    # Create MFCCs
    mfccs = calc_mfcc(path,num_ceps)
    
    if mfccs.shape[1] == num_ceps:
        x_test_s.append(mfccs)
        y_test_s.append(y_train[index])
    else:
        print('Dropped:', index, mfccs.shape)
        prob_cnt += 1
        
print('% of problematic samples:', prob_cnt / 100)


% of problematic samples: 0.0


In [25]:
print(x_test_s[1].shape)

(40, 101)


In [26]:
# Function that creates the MFCC and check if they have the desired length 

def extract_features(samples, labels, num_ceps):
    prob_cnt = 0
    out_x = []
    out_y = []
        
    for index, filename in enumerate(samples):

        # Create path from given filename and target item
        path = join(data_dir, targets[int(labels[index])], 
                    filename)

        # Create MFCCs
        mfccs = calc_mfcc(path,num_ceps)

        if mfccs.shape[1] == num_ceps:
            out_x.append(mfccs)
            out_y.append(labels[index])
        else:
            print('Dropped:', index, mfccs.shape)
            prob_cnt += 1

    return out_x, out_y, prob_cnt

<font size="5">3. Save the MFCCs </font>

Save the MFCCs including their targets seperated into training, validation and test set.

In [28]:
# Create train, validation, and test sets

x_train, y_train, prob = extract_features(train_files, y_train, num_ceps)
print('Removed percentage:', prob / len(y_train))
x_val, y_val, prob = extract_features(val_files, y_val, num_ceps)
print('Removed percentage:', prob / len(y_val))
x_test, y_test, prob = extract_features(test_files, y_test, num_ceps)
print('Removed percentage:', prob / len(y_test))

Removed percentage: 0.0
Removed percentage: 0.0
Removed percentage: 0.0


In [29]:
# Save MFCCs and their target

feature_sets_file = 'stored_files_targets.npz'
np.savez(feature_sets_file, 
         x_train=x_train, 
         y_train=y_train, 
         x_val=x_val, 
         y_val= y_val, 
         x_test=x_test, 
         y_test=y_test)



In [30]:
# Check if everything worked correctly 

feature_sets = np.load(feature_sets_file)
feature_sets.files

['x_train', 'y_train', 'x_val', 'y_val', 'x_test', 'y_test']