In [1]:
import tensorflow as tf
import os
import numpy as np

from helper_functions import (
    create_spectrogram_features,
    lite_model_from_file_predicts_dataset,
    get_file_size, 
    convert_bytes,
    evaluate_prediction
)

from sklearn.metrics import confusion_matrix



In [7]:
# Sample rate in Hz (the model file names below reference "sr_48000").
sample_rate = 48000
# Every clip is padded or truncated to this many samples: 3 s at 48 kHz = 144000.
desired_length_of_audio = 3 * sample_rate

### CNN

##### CNN initial model in Keras format

In [4]:
# Path to the CNN model saved in Keras format; report its size on disk in KB.
cnn_initial_model_path = 'time_series_models_to_test_with_RIOT_ML/cnn_time_series_sr48000.keras'
cnn_initial_model_size = get_file_size(cnn_initial_model_path)
convert_bytes(cnn_initial_model_size, "KB")

File size: 2562.857 Kilobytes


In [11]:
# CNN model predicts all data from directory
directory = 'C:/Users/polin/Bird_song_detection/dataset/testing'

# Build the evaluation set: one fixed-length waveform and one integer label per file.
x_data = []
y_data = []
for root, dirs, files in os.walk(directory):
    for file in files:
        full_file_name = os.path.join(root, file)

        # The label is derived from the path. "non_target" must be tested first
        # because it also contains the substring "target".
        if "non_target" in full_file_name:
            class_encoded = 0
        elif "target" in full_file_name:
            class_encoded = 1
        else:
            # Without this branch an unlabelled file silently reused the label of
            # the previous file (or raised NameError if it was the first file).
            print(f"Skipping file without a class label in its path: {full_file_name}")
            continue

        # desired_channels=1 guarantees a (samples, 1) tensor, so the squeeze
        # below cannot fail on a multi-channel recording.
        audio, sr = tf.audio.decode_wav(tf.io.read_file(full_file_name), desired_channels=1)
        audio_length = tf.shape(audio)[0]
        audio = tf.squeeze(audio, axis=-1)
        # Zero-pad short clips at the end, truncate long ones.
        if audio_length < desired_length_of_audio:
            audio = tf.pad(audio, [[0, desired_length_of_audio - audio_length]], mode='CONSTANT')
        else:
            audio = audio[:desired_length_of_audio]
        # Restore the trailing channel axis: (desired_length_of_audio, 1).
        audio_with_channel = tf.expand_dims(audio, axis=-1).numpy()

        x_data.append(audio_with_channel)
        y_data.append(class_encoded)

# os.walk yields nothing for a missing directory, so fail with a clear message
# instead of an obscure error inside predict().
if not x_data:
    raise FileNotFoundError(f"No labelled audio files found under: {directory}")

cnn_initial_model = tf.keras.models.load_model(cnn_initial_model_path)
# input data should be in numpy array
y_pred_prob = cnn_initial_model.predict(np.array(x_data), verbose=0)
# Predicted class = index of the highest score along axis 1.
y_pred = tf.argmax(y_pred_prob, axis=1).numpy()

# Evaluate
evaluate_prediction(y_data, y_pred)

Accuracy: 93.18%
Recall: 93.14%
Precision: 87.12%
F1-score: 89.97%


##### CNN TensorFlow Lite model (without any additional quantization techniques)

In [13]:
# Path to the CNN model converted to TensorFlow Lite; report its size on disk in KB.
cnn_tflite_model_path = 'time_series_models_to_test_with_RIOT_ML/cnn_time_series_sr_48000.tflite'
cnn_tflite_model_size = get_file_size(cnn_tflite_model_path)
convert_bytes(cnn_tflite_model_size, "KB")

File size: 850.012 Kilobytes


In [21]:
# CNN model predicts all data from directory
directory = 'C:/Users/polin/Bird_song_detection/dataset/testing'

# Build the evaluation set: one fixed-length waveform and one integer label per file.
x_data = []
y_data = []
for root, dirs, files in os.walk(directory):
    for file in files:
        full_file_name = os.path.join(root, file)

        # The label is derived from the path. "non_target" must be tested first
        # because it also contains the substring "target".
        if "non_target" in full_file_name:
            class_encoded = 0
        elif "target" in full_file_name:
            class_encoded = 1
        else:
            # Without this branch an unlabelled file silently reused the label of
            # the previous file (or raised NameError if it was the first file).
            print(f"Skipping file without a class label in its path: {full_file_name}")
            continue

        # desired_channels=1 guarantees a (samples, 1) tensor, so the squeeze
        # below cannot fail on a multi-channel recording.
        audio, sr = tf.audio.decode_wav(tf.io.read_file(full_file_name), desired_channels=1)
        audio_length = tf.shape(audio)[0]
        audio = tf.squeeze(audio, axis=-1)
        # Zero-pad short clips at the end, truncate long ones.
        if audio_length < desired_length_of_audio:
            audio = tf.pad(audio, [[0, desired_length_of_audio - audio_length]], mode='CONSTANT')
        else:
            audio = audio[:desired_length_of_audio]
        # Restore the trailing channel axis: (desired_length_of_audio, 1).
        audio_with_channel = tf.expand_dims(audio, axis=-1).numpy()

        x_data.append(audio_with_channel)
        y_data.append(class_encoded)

# os.walk yields nothing for a missing directory, so fail with a clear message
# instead of an obscure error inside the interpreter helper.
if not x_data:
    raise FileNotFoundError(f"No labelled audio files found under: {directory}")

# NOTE(review): the helper presumably also prints accuracy/recall/precision/F1
# (the recorded outputs of the sibling cells show them) - confirm in helper_functions.
y_pred = lite_model_from_file_predicts_dataset(cnn_tflite_model_path, x_data, y_data)
if all(x == y for x, y in zip(y_data, y_pred)):
    print("All data points were predicted correctly!")
else:
    print("Prediction was not correct for some points.")

# labels=[0, 1] keeps the matrix 2x2 even when one class never occurs, so the
# four-way unpacking cannot fail.
tn, fp, fn, tp = confusion_matrix(y_data, y_pred, labels=[0, 1]).ravel()
if tn + fp > 0:
    specificity = tn / (tn+fp)
    print(f'Specificity: {specificity * 100:.2f}%')
else:
    # No negative samples at all: the ratio is undefined rather than 0/0.
    specificity = float('nan')
    print('Specificity: undefined (no negative samples in the data)')

(144000, 1)
(144000, 1)
(144000, 1)
(144000, 1)
(144000, 1)
(144000, 1)
(144000, 1)
(144000, 1)
(144000, 1)
(144000, 1)
(144000, 1)
(144000, 1)
(144000, 1)
(144000, 1)
(144000, 1)
(144000, 1)
(144000, 1)
(144000, 1)
(144000, 1)
(144000, 1)
(144000, 1)
(144000, 1)
(144000, 1)
(144000, 1)
(144000, 1)
(144000, 1)
(144000, 1)
(144000, 1)
(144000, 1)
(144000, 1)
(144000, 1)
(144000, 1)
(144000, 1)
(144000, 1)
(144000, 1)
(144000, 1)
(144000, 1)
(144000, 1)
(144000, 1)
(144000, 1)
(144000, 1)
(144000, 1)
(144000, 1)
(144000, 1)
(144000, 1)
(144000, 1)
(144000, 1)
(144000, 1)
(144000, 1)
(144000, 1)
(144000, 1)
(144000, 1)
(144000, 1)
(144000, 1)
(144000, 1)
(144000, 1)
(144000, 1)
(144000, 1)
(144000, 1)
(144000, 1)
(144000, 1)
(144000, 1)
(144000, 1)
(144000, 1)
(144000, 1)
(144000, 1)
(144000, 1)
(144000, 1)
(144000, 1)
(144000, 1)
(144000, 1)
(144000, 1)
(144000, 1)
(144000, 1)
(144000, 1)
(144000, 1)
(144000, 1)
(144000, 1)
(144000, 1)
(144000, 1)
(144000, 1)
(144000, 1)
(144000, 1)
(144

##### CNN TensorFlow Lite model + post-training dynamic range quantization

In [16]:
# Path to the TensorFlow Lite CNN with dynamic range quantization; report its size on disk in KB.
cnn_tflite_drq_model_path = 'time_series_models_to_test_with_RIOT_ML/cnn_time_series_sr_48000_dynamic_range_quantization.tflite'
cnn_tflite_drq_model_size = get_file_size(cnn_tflite_drq_model_path)
convert_bytes(cnn_tflite_drq_model_size, "KB")

File size: 217.617 Kilobytes


In [17]:
# CNN model predicts all data from directory
directory = 'C:/Users/polin/Bird_song_detection/dataset/testing'

# Build the evaluation set: one fixed-length waveform and one integer label per file.
x_data = []
y_data = []
for root, dirs, files in os.walk(directory):
    for file in files:
        full_file_name = os.path.join(root, file)

        # The label is derived from the path. "non_target" must be tested first
        # because it also contains the substring "target".
        if "non_target" in full_file_name:
            class_encoded = 0
        elif "target" in full_file_name:
            class_encoded = 1
        else:
            # Without this branch an unlabelled file silently reused the label of
            # the previous file (or raised NameError if it was the first file).
            print(f"Skipping file without a class label in its path: {full_file_name}")
            continue

        # desired_channels=1 guarantees a (samples, 1) tensor, so the squeeze
        # below cannot fail on a multi-channel recording.
        audio, sr = tf.audio.decode_wav(tf.io.read_file(full_file_name), desired_channels=1)
        audio_length = tf.shape(audio)[0]
        audio = tf.squeeze(audio, axis=-1)
        # Zero-pad short clips at the end, truncate long ones.
        if audio_length < desired_length_of_audio:
            audio = tf.pad(audio, [[0, desired_length_of_audio - audio_length]], mode='CONSTANT')
        else:
            audio = audio[:desired_length_of_audio]
        # Restore the trailing channel axis: (desired_length_of_audio, 1).
        audio_with_channel = tf.expand_dims(audio, axis=-1).numpy()

        x_data.append(audio_with_channel)
        y_data.append(class_encoded)

# os.walk yields nothing for a missing directory, so fail with a clear message
# instead of an obscure error inside the interpreter helper.
if not x_data:
    raise FileNotFoundError(f"No labelled audio files found under: {directory}")

# The recorded output of this cell shows accuracy/recall/precision/F1 before the
# lines printed below, so the helper presumably prints them - confirm in helper_functions.
y_pred = lite_model_from_file_predicts_dataset(cnn_tflite_drq_model_path, x_data, y_data)
if all(x == y for x, y in zip(y_data, y_pred)):
    print("All data points were predicted correctly!")
else:
    print("Prediction was not correct for some points.")

# labels=[0, 1] keeps the matrix 2x2 even when one class never occurs, so the
# four-way unpacking cannot fail.
tn, fp, fn, tp = confusion_matrix(y_data, y_pred, labels=[0, 1]).ravel()
if tn + fp > 0:
    specificity = tn / (tn+fp)
    print(f'Specificity: {specificity * 100:.2f}%')
else:
    # No negative samples at all: the ratio is undefined rather than 0/0.
    specificity = float('nan')
    print('Specificity: undefined (no negative samples in the data)')

Accuracy: 93.18%
Recall: 93.14%
Precision: 87.12%
F1-score: 89.97%
Prediction was not correct for some points.
Specificity: 93.26%


### SqueezeNet

##### SqueezeNet initial model in Keras format

In [29]:
# Path to the SqueezeNet model saved in Keras format; report its size on disk in KB.
squeezenet_initial_model_path = 'time_series_models_to_test_with_RIOT_ML/squeezenet_30%_time_series_sr_48000.keras'
squeezenet_initial_model_size = get_file_size(squeezenet_initial_model_path)
convert_bytes(squeezenet_initial_model_size, "KB")

File size: 2165.623 Kilobytes


In [30]:
# SqueezeNet model predicts all data from directory
directory = 'C:/Users/polin/Bird_song_detection/dataset/testing'

# Build the evaluation set: one fixed-length waveform and one integer label per file.
x_data = []
y_data = []
for root, dirs, files in os.walk(directory):
    for file in files:
        full_file_name = os.path.join(root, file)

        # The label is derived from the path. "non_target" must be tested first
        # because it also contains the substring "target".
        if "non_target" in full_file_name:
            class_encoded = 0
        elif "target" in full_file_name:
            class_encoded = 1
        else:
            # Without this branch an unlabelled file silently reused the label of
            # the previous file (or raised NameError if it was the first file).
            print(f"Skipping file without a class label in its path: {full_file_name}")
            continue

        # desired_channels=1 guarantees a (samples, 1) tensor, so the squeeze
        # below cannot fail on a multi-channel recording.
        audio, sr = tf.audio.decode_wav(tf.io.read_file(full_file_name), desired_channels=1)
        audio_length = tf.shape(audio)[0]
        audio = tf.squeeze(audio, axis=-1)
        # Zero-pad short clips at the end, truncate long ones.
        if audio_length < desired_length_of_audio:
            audio = tf.pad(audio, [[0, desired_length_of_audio - audio_length]], mode='CONSTANT')
        else:
            audio = audio[:desired_length_of_audio]
        # Restore the trailing channel axis: (desired_length_of_audio, 1).
        audio_with_channel = tf.expand_dims(audio, axis=-1).numpy()

        x_data.append(audio_with_channel)
        y_data.append(class_encoded)

# os.walk yields nothing for a missing directory, so fail with a clear message
# instead of an obscure error inside predict().
if not x_data:
    raise FileNotFoundError(f"No labelled audio files found under: {directory}")

squeezenet_initial_model = tf.keras.models.load_model(squeezenet_initial_model_path)
# input data should be in numpy array
y_pred_prob = squeezenet_initial_model.predict(np.array(x_data), verbose=0)
# Predicted class = index of the highest score along axis 1.
y_pred = tf.argmax(y_pred_prob, axis=1).numpy()

# Evaluate
evaluate_prediction(y_data, y_pred)

Accuracy: 89.30%
Recall: 90.47%
Precision: 78.04%
F1-score: 85.23%


##### SqueezeNet TensorFlow Lite model (without any additional quantization techniques)

In [31]:
# Path to the SqueezeNet model converted to TensorFlow Lite; report its size on disk in KB.
squeezenet_tflite_model_path = 'time_series_models_to_test_with_RIOT_ML/squeezenet_30%_time_series_sr_48000.tflite'
squeezenet_tflite_model_size = get_file_size(squeezenet_tflite_model_path)
convert_bytes(squeezenet_tflite_model_size, "KB")

File size: 707.406 Kilobytes


In [33]:
# SqueezeNet TensorFlow Lite model predicts all data from directory
directory = 'C:/Users/polin/Bird_song_detection/dataset/testing'

# Build the evaluation set: one fixed-length waveform and one integer label per file.
x_data = []
y_data = []
for root, dirs, files in os.walk(directory):
    for file in files:
        full_file_name = os.path.join(root, file)

        # The label is derived from the path. "non_target" must be tested first
        # because it also contains the substring "target".
        if "non_target" in full_file_name:
            class_encoded = 0
        elif "target" in full_file_name:
            class_encoded = 1
        else:
            # Without this branch an unlabelled file silently reused the label of
            # the previous file (or raised NameError if it was the first file).
            print(f"Skipping file without a class label in its path: {full_file_name}")
            continue

        # desired_channels=1 guarantees a (samples, 1) tensor, so the squeeze
        # below cannot fail on a multi-channel recording.
        audio, sr = tf.audio.decode_wav(tf.io.read_file(full_file_name), desired_channels=1)
        audio_length = tf.shape(audio)[0]
        audio = tf.squeeze(audio, axis=-1)
        # Zero-pad short clips at the end, truncate long ones.
        if audio_length < desired_length_of_audio:
            audio = tf.pad(audio, [[0, desired_length_of_audio - audio_length]], mode='CONSTANT')
        else:
            audio = audio[:desired_length_of_audio]
        # Restore the trailing channel axis: (desired_length_of_audio, 1).
        audio_with_channel = tf.expand_dims(audio, axis=-1).numpy()

        x_data.append(audio_with_channel)
        y_data.append(class_encoded)

# os.walk yields nothing for a missing directory, so fail with a clear message
# instead of an obscure error inside the interpreter helper.
if not x_data:
    raise FileNotFoundError(f"No labelled audio files found under: {directory}")

# The recorded output of this cell shows accuracy/recall/precision/F1 before the
# lines printed below, so the helper presumably prints them - confirm in helper_functions.
y_pred = lite_model_from_file_predicts_dataset(squeezenet_tflite_model_path, x_data, y_data)
if all(x == y for x, y in zip(y_data, y_pred)):
    print("All data points were predicted correctly!")
else:
    print("Prediction was not correct for some points.")

# labels=[0, 1] keeps the matrix 2x2 even when one class never occurs, so the
# four-way unpacking cannot fail.
tn, fp, fn, tp = confusion_matrix(y_data, y_pred, labels=[0, 1]).ravel()
if tn + fp > 0:
    specificity = tn / (tn+fp)
    print(f'Specificity: {specificity * 100:.2f}%')
else:
    # No negative samples at all: the ratio is undefined rather than 0/0.
    specificity = float('nan')
    print('Specificity: undefined (no negative samples in the data)')

Accuracy: 89.30%
Recall: 90.47%
Precision: 78.04%
F1-score: 85.23%
Prediction was not correct for some points.
Specificity: 87.06%


##### SqueezeNet TensorFlow Lite model + post-training dynamic range quantization

In [35]:
# Path to the TensorFlow Lite SqueezeNet with dynamic range quantization; report its size on disk in KB.
squeezenet_tflite_drq_model_path = 'time_series_models_to_test_with_RIOT_ML/squeezenet_30%_time_series_sr_dynamic_range_quantization.tflite'
squeezenet_tflite_drq_model_size = get_file_size(squeezenet_tflite_drq_model_path)
convert_bytes(squeezenet_tflite_drq_model_size, "KB")

File size: 220.531 Kilobytes


In [36]:
# SqueezeNet model predicts all data from directory
directory = 'C:/Users/polin/Bird_song_detection/dataset/testing'

# Build the evaluation set: one fixed-length waveform and one integer label per file.
x_data = []
y_data = []
for root, dirs, files in os.walk(directory):
    for file in files:
        full_file_name = os.path.join(root, file)

        # The label is derived from the path. "non_target" must be tested first
        # because it also contains the substring "target".
        if "non_target" in full_file_name:
            class_encoded = 0
        elif "target" in full_file_name:
            class_encoded = 1
        else:
            # Without this branch an unlabelled file silently reused the label of
            # the previous file (or raised NameError if it was the first file).
            print(f"Skipping file without a class label in its path: {full_file_name}")
            continue

        # desired_channels=1 guarantees a (samples, 1) tensor, so the squeeze
        # below cannot fail on a multi-channel recording.
        audio, sr = tf.audio.decode_wav(tf.io.read_file(full_file_name), desired_channels=1)
        audio_length = tf.shape(audio)[0]
        audio = tf.squeeze(audio, axis=-1)
        # Zero-pad short clips at the end, truncate long ones.
        if audio_length < desired_length_of_audio:
            audio = tf.pad(audio, [[0, desired_length_of_audio - audio_length]], mode='CONSTANT')
        else:
            audio = audio[:desired_length_of_audio]
        # Restore the trailing channel axis: (desired_length_of_audio, 1).
        audio_with_channel = tf.expand_dims(audio, axis=-1).numpy()

        x_data.append(audio_with_channel)
        y_data.append(class_encoded)

# os.walk yields nothing for a missing directory, so fail with a clear message
# instead of an obscure error inside the interpreter helper.
if not x_data:
    raise FileNotFoundError(f"No labelled audio files found under: {directory}")

# The recorded output of this cell shows accuracy/recall/precision/F1 before the
# lines printed below, so the helper presumably prints them - confirm in helper_functions.
y_pred = lite_model_from_file_predicts_dataset(squeezenet_tflite_drq_model_path, x_data, y_data)
if all(x == y for x, y in zip(y_data, y_pred)):
    print("All data points were predicted correctly!")
else:
    print("Prediction was not correct for some points.")

# labels=[0, 1] keeps the matrix 2x2 even when one class never occurs, so the
# four-way unpacking cannot fail.
tn, fp, fn, tp = confusion_matrix(y_data, y_pred, labels=[0, 1]).ravel()
if tn + fp > 0:
    specificity = tn / (tn+fp)
    print(f'Specificity: {specificity * 100:.2f}%')
else:
    # No negative samples at all: the ratio is undefined rather than 0/0.
    specificity = float('nan')
    print('Specificity: undefined (no negative samples in the data)')

Accuracy: 89.02%
Recall: 89.70%
Precision: 78.50%
F1-score: 84.59%
Prediction was not correct for some points.
Specificity: 87.70%
