In [1]:
import numpy as np
import keras
import tensorflow as tf
import torch
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix

DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

2024-06-24 12:01:08.884422: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-06-24 12:01:08.935504: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
from enum import Enum, auto

class ModelType(Enum):
    TORCH = auto()
    TFLITE = auto()

class InputType(Enum):
    MEL_SPEC = auto()
    TIME_SERIES = auto()

class ModelMeta:
    def __init__(self, name, path, model_type, input_type,quantized=False):
        self.name = name
        self.path = path
        self.type = model_type
        self.input_type = input_type
        self.quantized = quantized

#1 - Target, 0 - Non-target
def calculate_binary_classification_performance(predicted_labels, true_labels):
    y_true, y_pred = true_labels, predicted_labels
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()

    accuracy = accuracy_score(y_true, y_pred)

    f1 = f1_score(y_true, y_pred)

    recall = tp / (tp + fn)
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    f2 = (1 + 2**2) * (precision * recall) / (2**2 * precision + recall)

    return {
        "TP": tp,
        "TN": tn,
        "FP": fp,
        "FN": fn,
        "Accuracy": accuracy,
        "Recall": recall,
        "Precision": precision,
        "TPR": recall,
        "FPR": fp / (fp + tn),
        "F1": f1,
        "F2": f2
    }

# data -> model -> softmax
# return softmax preds val idx 1 - Target, 0 - Non-target
#        true_labels
def inference_torch_model(model, data_loader, quantized=False):
    model.eval()
    model.to(DEVICE)
    prediction_softmax = []
    true_labels = []
    with torch.no_grad():
        for inputs, targets in data_loader:
            inputs, targets = inputs.to(DEVICE), targets.to(DEVICE)
            outputs = model(inputs)
            preds = torch.softmax(outputs, dim=1).cpu().numpy()
            prediction_softmax.append(preds)
            true_labels.append(targets.cpu().numpy())


    prediction_softmax = np.concatenate(prediction_softmax)
    true_labels = np.concatenate(true_labels)
    prediction_softmax[:, [0, 1]] = prediction_softmax[:, [1, 0]]
    true_labels = 1 - true_labels
    return prediction_softmax, true_labels

def evaluate_torch_model(model, data_loader, quantized=False, threshold_vals=None):
    prediction_score, true_labels = inference_torch_model(model, data_loader, quantized)
    metrics_on_t = []
    for t in threshold_vals:
        preds = prediction_score[:, 1]
        preds = preds > t
        metrics = calculate_binary_classification_performance(preds, true_labels)
        metrics['t'] = t
        metrics_on_t.append(metrics)
    return metrics_on_t


In [3]:
metas = [
    ModelMeta("StreamingCNNTiny", 
              "./AudioClassifierCNNNoReluTiny_16K_3s_raw_signal/model.51.0.9442029595375061.pth",
              ModelType.TORCH, InputType.TIME_SERIES
             ),
    ModelMeta("StreamingTransformer", "./model_transformer.pth", ModelType.TORCH, InputType.TIME_SERIES),
    
    ModelMeta("SqueezNet-Time-Series", 
              "./tf_implementation/time_series_models_to_test_with_RIOT_ML/squeezenet/squeezenet30%_time_series_16kHz_full_int_q.tflite", 
              ModelType.TFLITE, InputType.TIME_SERIES, quantized=True
             ),
    
    ModelMeta("CNN-Mel-Spec", "./tf_implementation/spectrogram_models_to_test_with_RIOT_ML/cnn/cnn_mel_spec_16kHz_full_int_q.tflite", 
              ModelType.TFLITE, InputType.MEL_SPEC, quantized=True
             ),
    
    ModelMeta("SqueezNet-Mel-Spec", 
              "./tf_implementation/spectrogram_models_to_test_with_RIOT_ML/squeezenet/squeezenet_spec_16kHz_full_int_q.tflite", 
              ModelType.TFLITE, InputType.MEL_SPEC, quantized=True),
]

In [4]:
#pytorch init of data and model class
import librosa
def resample_audio(y, orig_sr, target_sr):
    y = librosa.resample(y, orig_sr=orig_sr, target_sr=target_sr, res_type='zero_order_hold')
    return y

from CNNModels import AudioClassifierCNNNoRelu, AudioClassifierCNNNoReluSmall, AudioClassifierCNNNoReluTiny, AudioClassifierCNNNoReluPico
from TransformerModel import RawAudioTransformerModel

from torch.utils.data import DataLoader
from AudioDataset import AudioDataset, RawAudioDataset

train_target_folder = "../BirdSound/roman_data/cut-data/training/target/"
train_non_target_folder = "../BirdSound/roman_data/cut-data/training/non_target/"

validation_target_folder = "../BirdSound/roman_data/cut-data/validation/target/"
validation_non_target_folder = "../BirdSound/roman_data/cut-data/validation/non_target/"

test_target_folder = "../BirdSound/roman_data/cut-data/testing/target/"
test_non_target_folder = "../BirdSound/roman_data/cut-data/testing/non_target/"

SAMPLE_RATE = 48000
RESAMPLE_RATE = 16000
FIXED_LEN_IN_SEC = 3
FIXED_LEN_WAVE = RESAMPLE_RATE * FIXED_LEN_IN_SEC
NUM_WORKERS = 10
BATCH_SIZE = 256
RESAMPLE_TRANSFORM = lambda x : torch.from_numpy(resample_audio(x.numpy(), SAMPLE_RATE, RESAMPLE_RATE))


train_dataset = RawAudioDataset(train_target_folder, train_non_target_folder,fixed_length_wave= FIXED_LEN_WAVE, transform=RESAMPLE_TRANSFORM)
train_dataloader = DataLoader(train_dataset, batch_size = BATCH_SIZE, shuffle = True, num_workers=NUM_WORKERS)

validation_dataset = RawAudioDataset(validation_target_folder, validation_non_target_folder,fixed_length_wave = FIXED_LEN_WAVE, transform=RESAMPLE_TRANSFORM)
validation_dataloader = DataLoader(validation_dataset, batch_size = BATCH_SIZE,  num_workers = NUM_WORKERS, shuffle = True)

test_dataset = RawAudioDataset(test_target_folder, test_non_target_folder,fixed_length_wave = FIXED_LEN_WAVE, transform=RESAMPLE_TRANSFORM)
test_dataloader = DataLoader(validation_dataset, batch_size = BATCH_SIZE,  num_workers = NUM_WORKERS, shuffle = False)

In [5]:
import tensorflow as tf
import os
import numpy as np
import tensorflow_io as tfio

from tf_implementation.helper_functions import (
    create_spectrogram_features,
    lite_model_from_file_predicts_dataset,
)

# Take all audio from testing dataset and create spectrograms from them
# We will use spectrograms for models testing
directory = '../BirdSound/roman_data/cut-data/testing/'

#create mel-spec

# x_data = []
# y_data = []
# for root, dirs, files in os.walk(directory):
#     for file in files:
#         full_file_name = os.path.join(root, file)
#         if "non_target" in str(full_file_name):
#             class_encoded = 0
#         elif "target" in str(full_file_name):
#             class_encoded = 1

#         audio, sr = tf.audio.decode_wav(tf.io.read_file(full_file_name))
#         audio = tf.squeeze(audio, axis=-1)
#         resampled_audio = tfio.audio.resample(audio, rate_in=SAMPLE_RATE, rate_out=RESAMPLE_RATE)
#         # Prepare log mel spectrogram from audio
#         spectrogram_feature = create_spectrogram_features(resampled_audio, desired_length=FIXED_LEN_WAVE, sample_rate = RESAMPLE_RATE)
#         x_data.append(spectrogram_feature)
#         y_data.append(class_encoded)

# # input data should be in numpy array, not in list
# x_data_mel = np.array(x_data)
# y_data_labels = np.array(y_data)

# np.save("x_data_mel_test_tf.npy", x_data_mel)
# np.save("y_data_labels_test_tf.npy", y_data_labels)


#create raw audio
# x_data = []
# y_data = []
# desired_length_of_audio = FIXED_LEN_WAVE
# for root, dirs, files in os.walk(directory):
#     for file in files:
#         full_file_name = os.path.join(root, file)

#         if "non_target" in str(full_file_name):
#             class_encoded = 0
#         elif "target" in str(full_file_name):
#             class_encoded = 1

#         audio, sr = tf.audio.decode_wav(tf.io.read_file(full_file_name))
#         audio = tf.squeeze(audio, axis=-1)
#         resampled_audio = tfio.audio.resample(audio, rate_in=SAMPLE_RATE, rate_out=RESAMPLE_RATE)
#         audio_length = tf.shape(resampled_audio)[0]
#         if audio_length < desired_length_of_audio:
#             resampled_audio = tf.pad(resampled_audio, [[0, desired_length_of_audio - audio_length]], mode='CONSTANT')
#         else:
#             resampled_audio = resampled_audio[:desired_length_of_audio]
#         resampled_audio = tf.expand_dims(resampled_audio, axis=-1).numpy()

#         x_data.append(resampled_audio)
#         y_data.append(class_encoded)

# # input data should be in numpy array, not in list
# x_data_raw = np.array(x_data)
# y_data_raw = np.array(y_data)

In [8]:
x_data_mel_train = np.load("x_data_mel_train_tf.npy")
x_data_ts_train = np.load("x_data_ts_train_tf.npy")
y_data_labels_train = np.load("y_data_labels_train_tf.npy")
x_data_mel_test = np.load("x_data_mel_test_tf.npy")
x_data_ts_test = np.load("x_data_ts_test_tf.npy")
y_data_labels_test = np.load("y_data_labels_test_tf.npy")

def run_full_int_q_tflite_model(tflite_file, test_image_indices, x_data):
    # Initialize the interpreter
    interpreter = tf.lite.Interpreter(model_path=str(tflite_file))
    interpreter.allocate_tensors()

    input_details = interpreter.get_input_details()[0]
    output_details = interpreter.get_output_details()[0]

    predictions = np.zeros((len(test_image_indices),), dtype=float)
    for i, test_image_index in enumerate(test_image_indices):
        test_data_point = x_data[test_image_index]

        # Check if the input type is quantized, then rescale input data to uint8
        if input_details['dtype'] == np.uint8:
            input_scale, input_zero_point = input_details["quantization"]
            test_data_point = test_data_point / input_scale + input_zero_point

        test_data_point = np.expand_dims(test_data_point, axis=0).astype(input_details["dtype"])
        interpreter.set_tensor(input_details["index"], test_data_point)
        interpreter.invoke()
        output = interpreter.get_tensor(output_details["index"])[0]

        predictions[i] = tf.nn.softmax(tf.cast(output, tf.float32))[1]

    return predictions

def full_int_model_predict(tflite_file, x_data):
    test_image_indices = range(len(x_data))
    predictions = run_full_int_q_tflite_model(tflite_file, test_image_indices, x_data)
    return predictions

# data -> model -> softmax
# return softmax validx 1 - Target, 0 - Non-target
def inference_tflite_model(model_path, data, quantized=False):
    if quantized:
        prediction_score = full_int_model_predict(model_path, data)
    return prediction_score


def evaluate_tflite_model(model_path, x_data, y_data, quantized=False, threshold_vals=None):
    prediction_score, true_labels = inference_tflite_model(model_path, x_data, quantized), y_data
    metrics_on_t = []
    for t in threshold_vals:
        preds = prediction_score
        preds = preds >= t
        metrics = calculate_binary_classification_performance(preds, true_labels)
        metrics['t'] = t
        metrics_on_t.append(metrics)
    return metrics_on_t

In [9]:
#Metrics on training dataset


t = [x * 0.01 for x in range(0, 101)]

metrics_diff_models = []

for meta in metas:
    if meta.type == ModelType.TORCH:
        if meta.name == "StreamingCNNTiny":
            model = AudioClassifierCNNNoReluTiny()
        if meta.name == "StreamingTransformer":
            model = RawAudioTransformerModel(num_classes=2, n_embd=16, n_head=1, block_size=16, hidden_size=32, n_layers=1)
        model.load_state_dict(torch.load(meta.path))
        metrics = evaluate_torch_model(model, train_dataloader, False, t)

    elif meta.type == ModelType.TFLITE:
        if meta.input_type == InputType.MEL_SPEC:
            metrics = evaluate_tflite_model(meta.path, x_data_mel_train, y_data_labels_train, meta.quantized, t)
        elif meta.input_type == InputType.TIME_SERIES:
            metrics = evaluate_tflite_model(meta.path, x_data_ts_train, y_data_labels_train, meta.quantized, t)
    metrics_diff_models.append({'name': meta.name, 'path':  meta.path, 'metrics': metrics})
import pickle
with open('metrics_diff_models_train.pkl', 'wb') as f:
    pickle.dump(metrics_diff_models, f)

  f2 = (1 + 2**2) * (precision * recall) / (2**2 * precision + recall)


tensor([[[0.0512],
         [0.0000],
         [0.0283],
         ...,
         [0.0328],
         [0.0870],
         [0.0414]],

        [[0.0156],
         [0.0000],
         [0.0116],
         ...,
         [0.0187],
         [0.0190],
         [0.0398]],

        [[0.0445],
         [0.0033],
         [0.0485],
         ...,
         [0.0801],
         [0.0952],
         [0.0415]],

        ...,

        [[0.0340],
         [0.0075],
         [0.0801],
         ...,
         [0.1460],
         [0.1250],
         [0.0418]],

        [[0.0315],
         [0.0000],
         [0.0210],
         ...,
         [0.0297],
         [0.0412],
         [0.0403]],

        [[0.0176],
         [0.0046],
         [0.1105],
         ...,
         [0.1774],
         [0.0617],
         [0.0405]]], device='cuda:0')
tensor([[ 0.0827,  0.2663, -2.5262,  ...,  1.7148, -0.0072,  0.0568],
        [ 0.0671,  0.3053, -0.6931,  ..., -0.5864, -0.0161,  0.1282],
        [ 0.0282,  0.1059, -1.7672,  ...,  2.4431

tensor([[ 7.3637e-02,  3.0743e-01, -9.6164e-01,  ..., -2.9922e-01,
         -1.5512e-02,  1.2751e-01],
        [-3.4190e-02,  3.0722e-02,  1.9714e-01,  ...,  1.6550e+00,
          1.4169e-02, -3.5155e-02],
        [-1.2321e-03,  6.9179e-02, -1.0553e+00,  ...,  2.2376e+00,
          1.4835e-02, -2.7924e-02],
        ...,
        [ 6.9672e-02,  3.0621e-01, -8.0155e-01,  ..., -4.2336e-01,
         -1.6255e-02,  1.2940e-01],
        [-1.3868e-01,  1.9463e-01,  2.4458e+00,  ...,  1.4945e+00,
         -9.0336e-02,  1.3384e-01],
        [ 6.0375e-02,  2.9975e-01, -3.7985e-01,  ..., -8.1275e-01,
         -1.5750e-02,  1.3455e-01]], device='cuda:0')
tensor([[ -0.3522,   0.4187,   0.2136,  ...,   0.0427,  -0.2285,  -0.2504],
        [  1.8978,   5.1859,  -0.1545,  ...,  -0.8246,  -0.0446,  -1.2673],
        [ -0.6937,   2.0397,   0.9384,  ...,   0.3503,  -1.4247,  -1.5293],
        ...,
        [ -0.3347,   0.4112,   0.1655,  ...,   0.0384,  -0.2196,  -0.2457],
        [  7.7629,   6.7374,   1.5

tensor([[[3.5535e-02],
         [0.0000e+00],
         [2.9547e-02],
         ...,
         [3.5153e-02],
         [3.8140e-02],
         [4.0263e-02]],

        [[1.3172e-03],
         [0.0000e+00],
         [2.5852e-02],
         ...,
         [9.5937e-04],
         [2.9207e-05],
         [4.0071e-02]],

        [[1.7720e-02],
         [1.2790e-02],
         [7.9400e-02],
         ...,
         [1.3161e-01],
         [5.4312e-02],
         [4.0460e-02]],

        ...,

        [[1.6291e-01],
         [2.2311e-02],
         [1.0227e-01],
         ...,
         [2.0715e-01],
         [4.1551e-01],
         [4.8576e-02]],

        [[7.0700e-02],
         [1.2316e-04],
         [3.3554e-02],
         ...,
         [4.5937e-02],
         [1.2986e-01],
         [4.2321e-02]],

        [[1.6277e-02],
         [0.0000e+00],
         [1.0202e-02],
         ...,
         [2.0712e-02],
         [2.7207e-02],
         [3.9993e-02]]], device='cuda:0')
tensor([[ 7.1845e-02,  3.0637e-01, -8.6085e-0

tensor([[[4.8991e-03],
         [0.0000e+00],
         [2.4532e-03],
         ...,
         [1.4397e-02],
         [8.0331e-03],
         [3.9564e-02]],

        [[1.2926e-02],
         [2.2778e-02],
         [1.6931e-01],
         ...,
         [2.6619e-01],
         [5.5005e-02],
         [4.0334e-02]],

        [[1.3243e-01],
         [8.4250e-06],
         [1.0598e-01],
         ...,
         [8.1368e-02],
         [1.3718e-01],
         [4.2708e-02]],

        ...,

        [[5.2794e-03],
         [2.2139e-03],
         [7.2082e-02],
         ...,
         [1.1660e-01],
         [9.7618e-03],
         [3.9541e-02]],

        [[2.9706e-02],
         [0.0000e+00],
         [2.2536e-02],
         ...,
         [3.0483e-02],
         [3.6384e-02],
         [4.0185e-02]],

        [[5.3560e-04],
         [0.0000e+00],
         [4.8807e-04],
         ...,
         [1.2272e-02],
         [1.1005e-04],
         [3.9359e-02]]], device='cuda:0')
tensor([[ 0.0850,  0.2091, -2.9059,  ...,  2.

tensor([[[2.6068e-02],
         [2.0889e-03],
         [9.0831e-02],
         ...,
         [1.4595e-01],
         [6.5480e-02],
         [4.0638e-02]],

        [[1.4854e-01],
         [1.0645e-01],
         [3.4920e-01],
         ...,
         [5.5079e-01],
         [4.0304e-01],
         [4.8680e-02]],

        [[5.0429e-02],
         [6.8182e-03],
         [1.2211e-01],
         ...,
         [1.8293e-01],
         [9.7223e-02],
         [4.1390e-02]],

        ...,

        [[1.7551e-02],
         [4.5022e-03],
         [7.0016e-02],
         ...,
         [1.0985e-01],
         [2.9857e-02],
         [3.9977e-02]],

        [[5.1501e-03],
         [3.5098e-06],
         [4.2252e-02],
         ...,
         [7.2044e-02],
         [1.6824e-02],
         [3.9661e-02]],

        [[3.4504e-02],
         [0.0000e+00],
         [3.0851e-02],
         ...,
         [3.4060e-02],
         [3.3435e-02],
         [4.0178e-02]]], device='cuda:0')
tensor([[ 3.6853e-02,  1.1667e-01, -2.0033e+0

  f2 = (1 + 2**2) * (precision * recall) / (2**2 * precision + recall)


In [10]:
#Metrics on test dataset


t = [x * 0.01 for x in range(0, 101)]

metrics_diff_models = []

for meta in metas:
    if meta.type == ModelType.TORCH:
        if meta.name == "StreamingCNNTiny":
            model = AudioClassifierCNNNoReluTiny()
        if meta.name == "StreamingTransformer":
            model = RawAudioTransformerModel(num_classes=2, n_embd=16, n_head=1, block_size=16, hidden_size=32, n_layers=1)
        model.load_state_dict(torch.load(meta.path))
        metrics = evaluate_torch_model(model, test_dataloader, False, t)

    elif meta.type == ModelType.TFLITE:
        if meta.input_type == InputType.MEL_SPEC:
            metrics = evaluate_tflite_model(meta.path, x_data_mel_test, y_data_labels_test, meta.quantized, t)
        elif meta.input_type == InputType.TIME_SERIES:
            metrics = evaluate_tflite_model(meta.path, x_data_ts_test, y_data_labels_test, meta.quantized, t)
    metrics_diff_models.append({'name': meta.name, 'path':  meta.path, 'metrics': metrics})

# np.save("metrics_diff_models_test.npy", metrics_diff_models)
import pickle
with open('metrics_diff_models_test.pkl', 'wb') as f:
    pickle.dump(metrics_diff_models, f)

  f2 = (1 + 2**2) * (precision * recall) / (2**2 * precision + recall)


tensor([[[4.6554e-02],
         [0.0000e+00],
         [4.2209e-02],
         ...,
         [4.1034e-02],
         [4.5999e-02],
         [4.0479e-02]],

        [[2.8087e-02],
         [0.0000e+00],
         [2.2852e-02],
         ...,
         [2.7179e-02],
         [2.7513e-02],
         [4.0041e-02]],

        [[2.3926e-02],
         [0.0000e+00],
         [1.9530e-02],
         ...,
         [2.5265e-02],
         [2.1039e-02],
         [3.9887e-02]],

        ...,

        [[6.0151e-02],
         [0.0000e+00],
         [4.5165e-02],
         ...,
         [4.1661e-02],
         [7.2657e-02],
         [4.1089e-02]],

        [[2.6910e-02],
         [0.0000e+00],
         [2.0220e-02],
         ...,
         [2.6289e-02],
         [3.0083e-02],
         [4.0094e-02]],

        [[1.9915e-03],
         [0.0000e+00],
         [3.8741e-02],
         ...,
         [1.4750e-03],
         [7.3388e-05],
         [4.0418e-02]]], device='cuda:0')
tensor([[ 0.0711,  0.3032, -0.6857,  ..., -0.

  f2 = (1 + 2**2) * (precision * recall) / (2**2 * precision + recall)
