In [None]:
import os, math, glob, pathlib, time
import librosa
import matplotlib.pyplot as plt
import numpy as np
import sklearn
# from sklearn.model_selection import train_test_split
import tensorflow as tf
import keras
from keras.utils import Sequence
# from keras.utils import to_categorical
import sys

sys.path.append('..')

from preprocess.spectrogram import plot_mel_spect
# from preprocess.wav_helper import trim_audio_to_np_float

from preprocess.preprocess import preprocess, make_spects

from model.load import load_model


2025-10-28 11:35:51.626579: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-10-28 11:35:51.822378: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-10-28 11:35:56.585531: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


In [None]:
# Class labels, in the order the inference cell pairs them with the
# classifier's output scores.
LABEL_NAMES = [
    '3S',
    'BC',
    'BD',
    'BE',
    'BhBl',
    'BlBh',
    'XlB',
    'XsB',
]

# Audio settings.
SAMPLE_RATE = 48_000        # Hz; default target rate of load_and_normalize_audio
SAMPLE_SECONDS = 4          # presumably the clip length in seconds -- TODO confirm
AUDIO_EXTENSIONS = ['wav']  # audio file extensions, written without the dot

# Training settings handed to compressor_model.fit().
EPOCHS = 60
BATCH_SIZE = 32

In [None]:
def load_and_normalize_audio(file_path, target_sr=SAMPLE_RATE):
    """
    Read an audio file as a single-channel, normalized waveform.

    The file is loaded through librosa at ``target_sr`` with its channels
    kept separate. Multi-channel audio is then reduced to one channel: the
    channels are averaged when the second channel contains any non-zero
    sample, otherwise only the first channel is kept. The result is passed
    through ``librosa.util.normalize``.

    Returns the waveform array, or ``None`` if anything fails; the error is
    printed rather than raised so a bad file does not abort the caller.
    """
    try:
        waveform, _ = librosa.load(file_path, sr=target_sr, mono=False)

        if waveform.ndim > 1:
            # A silent second channel would only halve the level if averaged
            # in, so fall back to the first channel in that case.
            second_channel_active = waveform.shape[0] > 1 and np.any(waveform[1])
            waveform = np.mean(waveform, axis=0) if second_channel_active else waveform[0]

        return librosa.util.normalize(waveform)

    except Exception as e:
        print(f"Error loading {file_path}: {e}")
        return None


def audio_generator(files, shuffle):
    """Yield one normalized waveform per audio file, loaded on demand.

    Args:
        files: Sequence of audio file paths.
        shuffle: When true, visit the files in a random order drawn from
            NumPy's global RNG; otherwise keep the given order.

    Yields:
        The value returned by ``load_and_normalize_audio`` for each file.
        Files that fail to load (the loader returns ``None``) are skipped,
        so consumers never receive ``None`` in place of audio. Only audio is
        yielded; no labels are produced.
    """
    order = list(range(len(files)))

    if shuffle:
        np.random.shuffle(order)

    for idx in order:
        audio = load_and_normalize_audio(files[idx])
        if audio is None:
            # The loader has already printed the error; drop the file
            # instead of handing None to the training pipeline.
            continue
        yield audio



def make_audio_generator(directory, extensions=None):
    """
    Build a factory of shuffled audio generators for a directory tree.

    The directory is scanned once, recursively, when this function is
    called; the resulting file list is sorted so the base order does not
    depend on the filesystem.

    Args:
        directory: Root directory to search for audio files.
        extensions: Iterable of file extensions to include, with or without
            the leading dot, matched case-insensitively. Defaults to the
            module-level ``AUDIO_EXTENSIONS``.

    Returns:
        A callable (not a dataset) that ignores any arguments it is given
        and returns a fresh ``audio_generator(files, True)`` on every call.
        It can be called with no arguments, as a plain generator function,
        or with the single dummy argument that older call sites pass.
    """
    if extensions is None:
        extensions = AUDIO_EXTENSIONS
    wanted_suffixes = {"." + ext.lower().lstrip(".") for ext in extensions}

    files = sorted(
        path
        for path in pathlib.Path(directory).rglob("*")
        if path.is_file() and path.suffix.lower() in wanted_suffixes
    )

    def generator_factory(*_args, **_kwargs):
        # A new generator per call, so every pass gets its own shuffle.
        return audio_generator(files, True)

    return generator_factory

In [3]:
# Load the previously saved classifier; later cells tap one of its layers
# as the feature source for the compressor.
classifier_model = load_model('../../.tstdata/models/zdenda-resnet-2.keras')

2025-10-28 10:33:09.149047: E external/local_xla/xla/stream_executor/cuda/cuda_platform.cc:51] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: CUDA_ERROR_UNKNOWN: unknown error
2025-10-28 10:33:09.149126: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:171] verbose logging is disabled. Rerun with verbose logging (usually --v=1 or --vmodule=cuda_diagnostics=1) to get more diagnostic output from this module
2025-10-28 10:33:09.149140: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:176] retrieving CUDA diagnostic information for host: x3nomMint
2025-10-28 10:33:09.149150: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:183] hostname: x3nomMint
2025-10-28 10:33:09.149482: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:190] libcuda reported version is: 535.261.3
2025-10-28 10:33:09.149535: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:194] kernel reported version is: 535.261.3
2

In [4]:
# Show the classifier's layers; the layer name used by get_layer() in the
# next cell can be checked against this listing.
classifier_model.summary()

In [5]:
# Tap the classifier at its "resnet152v2" layer: that layer's output is the
# feature tensor the bottleneck below compresses and reconstructs.
pre_encoder_layer = classifier_model.get_layer("resnet152v2")
encoder_input = pre_encoder_layer.output

In [6]:
# Freeze every layer of the loaded classifier. The Dense layers added in the
# following cells are created after this loop, so it does not touch them.
for layer in classifier_model.layers:
    layer.trainable = False

In [7]:
# Bottleneck autoencoder stacked on the features in `encoder_input`.

# Encoder: progressively narrower ReLU layers.
x = encoder_input
for units in (512, 128, 16):
    x = keras.layers.Dense(units, activation='relu')(x)

# "FINAL" compressed layer: a 2-unit linear code.
compressed = keras.layers.Dense(2, activation='linear', name='compressed')(x)

# Decoder: mirror the encoder widths, then project back to the same
# last-axis size as `encoder_input`.
x = compressed
for units in (16, 128, 512):
    x = keras.layers.Dense(units, activation='relu')(x)
reconstructed = keras.layers.Dense(
    encoder_input.shape[-1], activation='linear', name='reconstructed'
)(x)



def feature_reconstruction_loss(y_true, y_pred):
    """Mean squared error between the tapped features and their reconstruction.

    ``y_true`` is a placeholder and is ignored. ``y_pred`` is indexed by the
    keys "encoder_in" and "encoder_out"; the result is the mean over all
    elements of their squared difference.

    NOTE(review): this requires ``y_pred`` to be the model's whole output
    dict. Keras appears to call a single compiled loss once per output with
    that output's tensor -- confirm this function actually receives a dict.
    """
    original_features = y_pred["encoder_in"]
    rebuilt_features = y_pred["encoder_out"]
    squared_error = tf.square(original_features - rebuilt_features)
    return tf.reduce_mean(squared_error)


# The compressor shares the classifier's input and exposes three named
# outputs: the tapped features, their reconstruction, and the 2-unit code.
compressor_model = keras.Model(
    inputs=classifier_model.input,
    outputs=dict(
        encoder_in=encoder_input,
        encoder_out=reconstructed,
        compressed=compressed,
    ),
)

In [8]:
# compressor_model.summary()

# Train with Adam on the feature-reconstruction loss defined above.
# NOTE(review): 'accuracy' is a classification metric and there are no
# class targets here -- confirm it is meaningful for this model.
compressor_model.compile(
    optimizer='adam',
    loss=feature_reconstruction_loss,
    metrics=['accuracy']
)


In [None]:
# Root directory that is searched recursively for training audio.
DATA_DIR = "../../.tstdata/dataset"

# Note: make_audio_generator returns a callable that creates generators,
# not a generator or dataset object itself.
train_data_gen = make_audio_generator(DATA_DIR)

In [None]:
# Unix timestamp in whole seconds (an int, despite the name); it gives this
# run's checkpoint file a unique name.
training_start_time_str = int(time.time())

# NOTE(review): train_data_gen is the callable returned by
# make_audio_generator. Confirm fit() accepts it in this form, and whether
# batch_size may be passed together with generator-style input.
history = compressor_model.fit(
    train_data_gen,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    # steps_per_epoch=train_steps,
    # validation_steps=val_steps,
    callbacks=[
        keras.callbacks.ModelCheckpoint(
            filepath=f'../../.tstdata/ckpt/{training_start_time_str}_compressor-checkpoint.keras',
            # No validation data is passed to fit(), so a 'val_*' metric
            # never shows up in the logs and save_best_only would never
            # write a checkpoint. Track the training loss instead.
            monitor='loss',
            mode='min',
            save_best_only=True,
            save_freq="epoch"
        )
    ]
)


In [None]:
def _plot_metric_panel(position, train_values, val_values, metric_name):
    """Draw one training/validation curve panel in slot `position` of a 1x2 grid."""
    plt.subplot(1, 2, position)

    drew_curve = False
    if train_values:
        plt.plot(range(1, len(train_values) + 1), train_values, 'bo-',
                 label=f'Training {metric_name}')
        drew_curve = True
    if val_values:
        plt.plot(range(1, len(val_values) + 1), val_values, 'ro-',
                 label=f'Validation {metric_name}')
        drew_curve = True

    plt.title(f'Training and Validation {metric_name}')
    plt.xlabel('Epoch')
    plt.ylabel(metric_name)
    if drew_curve:
        # legend() on an empty axes only produces a warning, so skip it.
        plt.legend()
    plt.grid(True, linestyle='--', alpha=0.6)


def plot_history(history):
    """
    Plots training and validation accuracy/loss curves from a Keras History object.

    Args:
        history: Object whose ``history`` attribute maps metric names to
            per-epoch value lists. The keys 'accuracy', 'val_accuracy',
            'loss' and 'val_loss' are read.

    Any of the four series may be missing or empty (for example, no
    validation data or no accuracy metric); that curve is then left out
    instead of raising. The left panel shows accuracy, the right one loss.
    """
    metrics = history.history

    plt.figure(figsize=(12, 5))

    _plot_metric_panel(1, metrics.get('accuracy'), metrics.get('val_accuracy'), 'Accuracy')
    _plot_metric_panel(2, metrics.get('loss'), metrics.get('val_loss'), 'Loss')

    plt.tight_layout()
    plt.show()

In [None]:
# Plot the curves recorded by the compressor_model.fit() run above.
plot_history(history)

In [None]:
# NOTE(review): machine-specific absolute path; point FP at a file under the
# project's data directory before sharing this notebook.
FP = "/home/x3nom/Downloads/F000037.wav"

samples = preprocess(FP)

for sample in samples:
    # Shape the sample into a single-row float32 batch for the classifier.
    audio_tensor = tf.convert_to_tensor(
        np.asarray(sample).reshape(1, -1), dtype=tf.float32
    )

    prediction = classifier_model.predict(audio_tensor)

    # Pair each label with its score as a whole-number percentage. The '%'
    # format spec avoids float artefacts such as "56.99999999999999%" that
    # round(x, 2) * 100 can produce.
    pred_percent = {
        label: f"{float(score):.0%}"
        for label, score in zip(LABEL_NAMES, prediction.flatten())
    }
    percent_str = ' | '.join(f"{label}:{percent}" for label, percent in pred_percent.items())

    print(percent_str)
    