# Classifying recorded audio files

In [7]:
import numpy as np
import pandas as pd

from scipy import signal
from scipy.io import wavfile
import matplotlib.pyplot as plt

from PIL import Image
import matplotlib
import random

import scipy
import librosa
import matplotlib.image

import tensorflow as tf
from tensorflow import keras
from keras import layers


# NOTE(review): this line only references the function (there is no call), so it
# does not change TensorFlow's execution mode; the cell merely echoes the function
# object as its output. If eager execution was intended, the function would have
# to be called with an explicit boolean argument -- TODO confirm the intent.
tf.config.run_functions_eagerly

<function tensorflow.python.eager.polymorphic_function.polymorphic_function.run_functions_eagerly(run_eagerly)>

In [28]:
import warnings
# Silence every warning for the rest of the session so that the cell outputs
# below stay readable. Note that this also hides deprecation notices.
warnings.simplefilter('ignore')

In [4]:
# Load the best model saved earlier (see RNN.ipynb). The HDF5 file is expected
# to sit next to this notebook.
model = tf.keras.models.load_model('./model3.h5')

In [76]:
# Number of samples every clip is stretched to before the spectrogram is taken;
# per the original notes this corresponds to 64 spectrogram columns ("pixels").
REFERENCE_LENGTH = 14550.0

# Folder with the recorded .wav files and the side length of the model input.
RECORDINGS_DIR = "./recorded_audio_wav"
IMAGE_SIZE = 64


def audio_to_spectrogram(path, reference_length=REFERENCE_LENGTH, size=IMAGE_SIZE):
    """Turn one audio file into a ``size`` x ``size`` uint8 spectrogram image.

    Parameters
    ----------
    path : str
        Location of the audio file to load.
    reference_length : float
        Sample count the clip is time-stretched to.
    size : int
        Number of frequency rows and time columns kept from the spectrogram.

    Returns
    -------
    numpy.ndarray
        Spectrogram cropped to at most ``size`` x ``size`` and rescaled so that
        its maximum maps to 255, stored as ``uint8``.
    """
    samples, sample_rate = librosa.load(path)

    # `rate` must be passed by keyword: recent librosa releases reject it as a
    # positional argument. The ratio brings the clip to ~reference_length samples.
    stretched = librosa.effects.time_stretch(
        samples, rate=samples.shape[0] / reference_length
    )

    # Only the power values are needed; the frequency/time axes are discarded.
    _, _, power = signal.spectrogram(stretched, sample_rate)
    power = power[0:size, 0:size]

    peak = power.max()
    if peak == 0:
        # A completely silent clip would otherwise divide by zero and produce NaNs.
        return np.zeros(power.shape, dtype=np.uint8)

    # Scale to the 0 - 255 integer range.
    return (np.round(power / (peak / 255.0))).astype(np.uint8)


# (file prefix, spoken word) for every recording, in evaluation order.
# "s_" files: the ten words recorded by another volunteer.
recordings = [
    ("s", word)
    for word in ["go", "down", "left", "no", "off", "on", "right", "stop", "up", "yes"]
]
# "p_" files: additional recordings of stop, right and yes (recorded by Patryk).
recordings += [("p", "stop"), ("p", "right"), ("p", "yes")]

test_labels = [word for _, word in recordings]

spectrograms = [
    audio_to_spectrogram("{}/{}_{}.wav".format(RECORDINGS_DIR, prefix, word))
    for prefix, word in recordings
]

In [77]:

# One indicator column per distinct ground-truth word.
dummies2 = pd.get_dummies(test_labels)

# Pair every spectrogram with its label row, then feed the clips one at a time.
dataset_test = tf.data.Dataset.from_tensor_slices((spectrograms, dummies2))
dataset_test = dataset_test.batch(1)

In [78]:
# Display the raw model output for the recordings (rows of 10 class scores).
model.predict(dataset_test)



array([[2.52355152e-04, 1.02243954e-02, 4.63765446e-06, 4.51811843e-07,
        8.57586265e-01, 1.30661756e-01, 1.08830291e-05, 3.19388841e-04,
        9.39792488e-04, 9.26538273e-12],
       [1.60823110e-02, 9.82429385e-01, 3.38265913e-08, 2.38241100e-05,
        2.92846118e-04, 1.00447750e-03, 1.73617900e-05, 2.24947107e-05,
        1.27327861e-04, 7.95751919e-12],
       [4.05616277e-07, 7.82548886e-06, 3.28175016e-02, 2.99161329e-04,
        8.49080237e-08, 8.99308290e-08, 8.57531846e-01, 3.34665237e-05,
        9.73415598e-02, 1.19680334e-02],
       [3.20640829e-05, 7.03668714e-01, 7.89307614e-06, 1.13613217e-03,
        9.94015019e-04, 5.33086350e-05, 1.35897544e-05, 1.09773675e-04,
        2.93984443e-01, 2.77075113e-10],
       [6.69290777e-04, 2.47741537e-03, 3.62146558e-04, 1.89485319e-03,
        7.77622452e-03, 1.69263060e-07, 4.36469636e-05, 3.41994502e-03,
        9.83353853e-01, 2.41673820e-06],
       [3.74716730e-03, 8.25264899e-04, 8.67528899e-04, 1.27036415e-07,
   

In [79]:
# Score every recording with the loaded model.
pred = model.predict(dataset_test)

# For each row keep the position of the highest score, i.e. the predicted class index.
aux = pred.argmax(axis=1)



In [80]:
# Lookup table from class index to spoken word, used to decode the argmax
# of the model output. Index i maps to the i-th word of the list below.
NumbersToWordsMap = dict(enumerate(
    ["down", "go", "left", "no", "off", "on", "right", "stop", "up", "yes"]
))


In [81]:
# Translate each predicted class index back into its word, element by element.
index_to_word = np.vectorize(NumbersToWordsMap.get)
predicted_labels = index_to_word(aux)

predicted_labels

array(['off', 'go', 'right', 'go', 'up', 'on', 'up', 'up', 'up', 'left',
       'stop', 'right', 'left'], dtype='<U5')

In [82]:
# Ground-truth words in recording order, shown for comparison with predicted_labels.
test_labels

['go',
 'down',
 'left',
 'no',
 'off',
 'on',
 'right',
 'stop',
 'up',
 'yes',
 'stop',
 'right',
 'yes']

# Conclusions

Perhaps the 10 audio files recorded by another volunteer were too quiet, which would explain why the model classified only 2 out of the 10 audio files from this person correctly.

After adding 2 audio files ("stop" and "right"), the model predicted both true labels correctly.

In [68]:
from IPython.display import Video

# Embed the "stop" recording (presumably the source video of p_stop.wav -- TODO confirm).
Video("./recorded_audio_mp4/p_stop.mp4")

Even though there is background noise (I recorded it at the Swietokrzyska metro station in Warsaw), the model could discern the spoken word.

In [69]:
# Embed the "right" recording (presumably the source video of p_right.wav -- TODO confirm).
Video("./recorded_audio_mp4/p_right.mp4")

In [83]:
# Embed the "yes" recording (presumably the source video of p_yes.wav -- TODO confirm).
Video("./recorded_audio_mp4/p_yes.mp4")

'Yes' was misclassified as 'left'.

According to the confusion matrix, 'left' was the most common wrong answer when the true label was 'yes' - perhaps it is a matter of accent.

Nevertheless, the results on real-life recorded audio files correspond to the expectations from the confusion matrix. Still, it is recommended to provide clearly audible audio files spoken with a good English accent.