In [6]:
import numpy as np
import librosa
import pywt
from scipy.fftpack import dct
from tensorflow.keras.models import load_model

# ----------------------------#
# 2. LOAD TRAINED MODEL
# ----------------------------#
model = load_model("saved_models/audio_classification.hdf5")
print("Model Loaded Successfully!")

Model Loaded Successfully!


In [21]:
# DCT sparse transform
def sparse_dct(signal):
    return dct(signal, norm='ortho')

# DWT sparse transform
def sparse_dwt(signal):
    coeffs = pywt.wavedec(signal, 'db1', level=4)
    sparse, _ = pywt.coeffs_to_array(coeffs)
    return sparse


In [22]:
def gaussian_matrix(m, n):
    return np.random.randn(m, n)

def random_matrix(m, n):
    return np.random.rand(m, n)

def binary_random_matrix(m, n):
    return np.random.randint(0, 2, size=(m, n))

def bernoulli_matrix(m, n):
    return np.random.choice([-1, 1], size=(m, n))


In [23]:
def compress_signal(Phi, sparse_signal):
    return np.dot(Phi, sparse_signal)


In [24]:
def mfcc_from_compressed(comp, sr):
    comp = comp.astype(float)

    # Avoid error if length < 2048
    if len(comp) < 2048:
        comp = np.pad(comp, (0, 2048 - len(comp)))

    mfccs = librosa.feature.mfcc(y=comp, sr=sr, n_mfcc=40)
    return np.mean(mfccs.T, axis=0)


In [25]:
def cs_pipeline_block_based(audio, sr, 
                            sparse_method="dct", 
                            matrix_type="gaussian", 
                            keep_ratio=0.4):

    FRAME_SIZE = 1024
    features_list = []

    # Pad audio to multiple of 1024
    pad_len = FRAME_SIZE - (len(audio) % FRAME_SIZE)
    if pad_len != FRAME_SIZE:
        audio = np.pad(audio, (0, pad_len))

    # Process frame by frame (1024 samples)
    for i in range(0, len(audio), FRAME_SIZE):
        frame = audio[i:i+FRAME_SIZE]

        # 1. Sparse Transform
        if sparse_method == "dct":
            sparse = sparse_dct(frame)
        elif sparse_method == "dwt":
            sparse = sparse_dwt(frame)

        n = len(sparse)
        m = int(n * keep_ratio)

        # 2. Measurement Matrix
        if matrix_type == "gaussian":
            Phi = gaussian_matrix(m, n)
        elif matrix_type == "random":
            Phi = random_matrix(m, n)
        elif matrix_type == "binary":
            Phi = binary_random_matrix(m, n)
        elif matrix_type == "bernoulli":
            Phi = bernoulli_matrix(m, n)

        # 3. Compressed Signal
        comp = compress_signal(Phi, sparse)

        # 4. MFCC Extraction
        mfcc = mfcc_from_compressed(comp, sr)
        features_list.append(mfcc)

    # Averaging features of all frames â†’ 40 features
    final_features = np.mean(features_list, axis=0)
    return final_features


In [27]:
classes = ["car_horn", "children_playing", "dog_bark", 
           "drilling", "engine_idling", "gun_shot", "street_music"]

predicted_class = classes[np.argmax(prediction)]
confidence = np.max(prediction)

print("Predicted Class:", predicted_class)
print("Confidence:", confidence)

filename = "dog.wav"

audio, sr = librosa.load(filename, sr=None)

features = cs_pipeline_block_based(
    audio,
    sr,
    sparse_method="dct",       # dct OR dwt
    matrix_type="gaussian",    # gaussian, random, binary, bernoulli
    keep_ratio=0.4             # 40% retained
)

features = features.reshape(1, 40)

prediction = model.predict(features)
print("Prediction:", prediction)


Predicted Class: dog_bark
Confidence: 0.65117925
Prediction: [[1.2232539e-04 1.8145913e-03 6.4513850e-01 4.9266402e-02 4.0083389e-05
  2.0512629e-01 9.8491795e-02]]


In [30]:
filename = "12812-5-0-0.wav"

audio, sr = librosa.load(filename, sr=None)

features = cs_pipeline_block_based(
    audio,
    sr,
    sparse_method="dct",       # or "dwt"
    matrix_type="gaussian",    # or random, binary, bernoulli
    keep_ratio=0.4
)

features = features.reshape(1, 40)

prediction = model.predict(features)
print("Raw Prediction:", prediction)


Raw Prediction: [[0.06391614 0.10069289 0.24405399 0.16552915 0.05523312 0.20272852
  0.16784622]]


In [31]:
classes = [
    "car_horn",
    "children_playing",
    "dog_bark",
    "drilling",
    "engine_idling",
    "gun_shot",
    "street_music"
]

predicted_class = classes[np.argmax(prediction)]
confidence = np.max(prediction)

print("Predicted Class:", predicted_class)
print("Confidence:", confidence)


Predicted Class: dog_bark
Confidence: 0.24405399
