In [None]:
# setup
import numpy as np
import sys
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
sys.path.append(os.path.abspath('../src'))
from model import build_cnn_model, build_cnn_lstm_model

# Read the pre-extracted feature arrays and their labels from disk.
# NOTE(review): the three arrays are presumably row-aligned (row i of each
# describes the same clip) — confirm against the feature-extraction step.
mfccs = np.load("../data/features/mfccs.npy")
spectrograms = np.load("../data/features/spectrograms.npy")
labels = np.load("../data/features/feature_labels.npy")

# Turn raw labels into integer ids, then into one-hot rows for the classifiers.
le = LabelEncoder()
labels_int = le.fit_transform(labels)
labels_onehot = to_categorical(labels_int)

# Stage 1 of the 80/10/10 split: keep 80% for training and hold out 20%.
# Both feature sets and the targets go through one call so they stay aligned.
(
    X_mfcc_train, X_mfcc_temp,
    X_spec_train, X_spec_temp,
    y_train, y_temp,
) = train_test_split(
    mfccs,
    spectrograms,
    labels_onehot,
    test_size=0.2,
    stratify=labels_int,
    random_state=42,
)

# Stage 2: cut the held-out 20% in half to get validation and test sets.
# The targets are one-hot here, so the stratification key is recovered
# with argmax.
temp_class_ids = np.argmax(y_temp, axis=1)
(
    X_mfcc_val, X_mfcc_test,
    X_spec_val, X_spec_test,
    y_val, y_test,
) = train_test_split(
    X_mfcc_temp,
    X_spec_temp,
    y_temp,
    test_size=0.5,
    stratify=temp_class_ids,
    random_state=42,
)

In [None]:
# check current shape
print("Original shape:", X_mfcc_train.shape)

# Reshape every MFCC split — train, val AND test — to (batch, time, features, 1).
# The test split has to be included: the evaluation cell feeds X_mfcc_test to
# models whose input shape is taken from the reshaped training array.
# Each array is only touched while it is still 3D, so re-running this cell
# is a no-op instead of stacking extra trailing axes.
mfcc_splits = []
for split in (X_mfcc_train, X_mfcc_val, X_mfcc_test):
    if split.ndim == 3:
        # transpose to (batch, time, features) when the 13 coefficients sit on axis 1
        if split.shape[1] == 13:
            split = np.transpose(split, (0, 2, 1))
        # add channel dimension, 4D
        split = split[..., np.newaxis]
    mfcc_splits.append(split)
X_mfcc_train, X_mfcc_val, X_mfcc_test = mfcc_splits

print("Transformed shape:", X_mfcc_train.shape)


Original shape: (14555, 13, 400)
Transformed shape: (14555, 400, 13, 1)


In [7]:
from tensorflow.keras.callbacks import EarlyStopping

# build model
# The input shape is everything after the batch axis of the prepared MFCC array.
input_shape = X_mfcc_train.shape[1:]
# Width of the one-hot label rows; also read by the later model-building cells.
num_classes = y_train.shape[1]
# NOTE(review): fit() is called without a compile step here, so build_cnn_model
# presumably returns an already-compiled model — confirm in src/model.py.
mfcc_model = build_cnn_model(input_shape, num_classes)

# train model
# Keep the History object instead of letting it fall out as the cell's repr:
# the per-epoch curves stay available for inspection/plotting.
# NOTE(review): in the recorded run accuracy and val_accuracy stay at ~0.23 for
# every epoch, so early stopping fires after epoch 4. Worth checking input
# scaling and the model definition before trusting these results.
mfcc_history = mfcc_model.fit(
    X_mfcc_train, y_train,
    validation_data=(X_mfcc_val, y_val),
    epochs=10,
    batch_size=32,
    callbacks=[EarlyStopping(monitor='val_accuracy', patience=3, restore_best_weights=True)],
    verbose=2
)

Epoch 1/10
455/455 - 49s - 108ms/step - accuracy: 0.2347 - loss: 1.7032 - val_accuracy: 0.2314 - val_loss: 1.5978
Epoch 2/10
455/455 - 46s - 102ms/step - accuracy: 0.2315 - loss: 1.5964 - val_accuracy: 0.2314 - val_loss: 1.5953
Epoch 3/10
455/455 - 45s - 100ms/step - accuracy: 0.2315 - loss: 1.5954 - val_accuracy: 0.2314 - val_loss: 1.5950
Epoch 4/10
455/455 - 46s - 101ms/step - accuracy: 0.2315 - loss: 1.5958 - val_accuracy: 0.2314 - val_loss: 1.5949


<keras.src.callbacks.history.History at 0x27efc5f2bd0>

In [None]:
# check current shape
print("Original spectrogram shape:", X_spec_train.shape)

# Reshape every spectrogram split — train, val AND test — to
# (batch, time, frequency, 1). The test split has to be included: the
# evaluation cell feeds X_spec_test to models whose input shape is taken from
# the reshaped training array.
# Each array is only touched while it is still 3D, so re-running this cell
# is a no-op instead of stacking extra trailing axes.
spec_splits = []
for split in (X_spec_train, X_spec_val, X_spec_test):
    if split.ndim == 3:
        # transpose to (batch, time, frequency) when the 128 bins sit on axis 1
        if split.shape[1] == 128:
            split = np.transpose(split, (0, 2, 1))
        # add channel dimension, 4D
        split = split[..., np.newaxis]
    spec_splits.append(split)
X_spec_train, X_spec_val, X_spec_test = spec_splits

print("Transformed spectrogram shape:", X_spec_train.shape)


Original spectrogram shape: (14555, 128, 400)
Transformed spectrogram shape: (14555, 400, 128, 1)


In [10]:
# build model
# The input shape is everything after the batch axis of the prepared
# spectrogram array. `num_classes` and `EarlyStopping` come from the
# MFCC training cell, so that cell must have been run first.
input_shape = X_spec_train.shape[1:]
# NOTE(review): fit() is called without a compile step here, so
# build_cnn_lstm_model presumably returns an already-compiled model — confirm
# in src/model.py.
spec_model = build_cnn_lstm_model(input_shape, num_classes)

# train model
# Keep the History object instead of letting it fall out as the cell's repr:
# the per-epoch curves stay available for inspection/plotting.
# NOTE(review): the recorded run takes ~510-530 s per epoch while val_accuracy
# never moves beyond ~0.23. Worth checking input scaling and the model
# definition before spending more training time.
spec_history = spec_model.fit(
    X_spec_train, y_train,
    validation_data=(X_spec_val, y_val),
    epochs=10,
    batch_size=32,
    callbacks=[EarlyStopping(monitor='val_accuracy', patience=3, restore_best_weights=True)],
    verbose=2
)

Epoch 1/10
455/455 - 528s - 1s/step - accuracy: 0.2189 - loss: 1.6247 - val_accuracy: 0.2314 - val_loss: 1.5969
Epoch 2/10
455/455 - 523s - 1s/step - accuracy: 0.2273 - loss: 1.5992 - val_accuracy: 0.2314 - val_loss: 1.5963
Epoch 3/10
455/455 - 522s - 1s/step - accuracy: 0.2298 - loss: 1.5973 - val_accuracy: 0.2336 - val_loss: 1.5945
Epoch 4/10
455/455 - 517s - 1s/step - accuracy: 0.2269 - loss: 1.5965 - val_accuracy: 0.2336 - val_loss: 1.5946
Epoch 5/10
455/455 - 511s - 1s/step - accuracy: 0.2285 - loss: 1.5956 - val_accuracy: 0.2336 - val_loss: 1.5944
Epoch 6/10
455/455 - 512s - 1s/step - accuracy: 0.2288 - loss: 1.5958 - val_accuracy: 0.2336 - val_loss: 1.5942


<keras.src.callbacks.history.History at 0x28100462a50>

In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Concatenate, Dense, Dropout

from model import build_mfcc_branch, build_spec_branch

# One Keras input per modality, shaped like the prepared (4D) training arrays
# minus the batch axis.
mfcc_input = Input(shape=X_mfcc_train.shape[1:])
spec_input = Input(shape=X_spec_train.shape[1:])

# build branches
# Each builder takes an input tensor and returns the tensor that is fed to
# Concatenate below; the layers inside are defined in src/model.py.
mfcc_branch = build_mfcc_branch(mfcc_input)
spec_branch = build_spec_branch(spec_input)

# combine and build model
# Fuse both branches, then classify with a small dense head.
x = Concatenate()([mfcc_branch, spec_branch])
x = Dense(128, activation='relu')(x)
x = Dropout(0.5)(x)
# `num_classes` is defined in the MFCC training cell.
output = Dense(num_classes, activation='softmax')(x)

dual_model = Model(inputs=[mfcc_input, spec_input], outputs=output)
dual_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# train model
# Inputs are passed as a list in the same order as `inputs=` above.
# Keep the History object instead of letting it fall out as the cell's repr;
# `EarlyStopping` is imported in the MFCC training cell.
dual_history = dual_model.fit(
    [X_mfcc_train, X_spec_train], y_train,
    validation_data=([X_mfcc_val, X_spec_val], y_val),
    epochs=10,
    batch_size=32,
    callbacks=[EarlyStopping(monitor='val_accuracy', patience=3, restore_best_weights=True)],
    verbose=2
)


Epoch 1/10


In [None]:
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

def evaluate_model(model, X_test, y_test, label_encoder, dual_input=False, spec_X_test=None):
    """Print a classification report and confusion matrix for a model on a test set.

    Args:
        model: Object exposing ``predict``; its output is reduced with
            ``argmax`` over axis 1, so it should return per-class scores.
        X_test: Test features. For a dual-input model this is the first input.
        y_test: One-hot test targets; converted to class ids with ``argmax``.
        label_encoder: Fitted encoder whose ``classes_`` supplies the row names
            of the report (index ``i`` names class id ``i``).
        dual_input: When True, ``model.predict`` is called with
            ``[X_test, spec_X_test]`` instead of ``X_test`` alone.
        spec_X_test: Second input for a dual-input model; required when
            ``dual_input`` is True.

    Returns:
        None. Both summaries are printed.

    Raises:
        ValueError: If ``dual_input`` is True but ``spec_X_test`` is missing.
    """
    if dual_input:
        # Fail early with a clear message rather than passing None into predict().
        if spec_X_test is None:
            raise ValueError("spec_X_test is required when dual_input=True")
        predictions = model.predict([X_test, spec_X_test])
    else:
        predictions = model.predict(X_test)

    y_true = np.argmax(y_test, axis=1)
    y_pred = np.argmax(predictions, axis=1)

    # Pin the label set to every encoder class. Without this, a class absent
    # from both y_true and y_pred makes target_names disagree with the labels
    # sklearn infers, and the confusion matrix silently drops its row/column.
    class_names = [str(name) for name in label_encoder.classes_]
    class_ids = np.arange(len(class_names))

    print("Classification Report:\n")
    # zero_division=0 reports 0.0 for classes that are never predicted without
    # emitting an undefined-metric warning per class.
    print(classification_report(
        y_true, y_pred, labels=class_ids, target_names=class_names, zero_division=0
    ))
    print("\nConfusion Matrix:\n")
    print(confusion_matrix(y_true, y_pred, labels=class_ids))

In [None]:
# Score each trained model on the held-out test split; every call prints its
# own classification report and confusion matrix.

# Single-input model fed MFCC features.
print("MFCC-Only Model Evaluation:")
evaluate_model(
    model=mfcc_model,
    X_test=X_mfcc_test,
    y_test=y_test,
    label_encoder=le,
)

# Single-input model fed spectrogram features.
print("\nSpectrogram-Only Model Evaluation:")
evaluate_model(
    model=spec_model,
    X_test=X_spec_test,
    y_test=y_test,
    label_encoder=le,
)

# Two-input model: MFCCs go in as X_test, spectrograms as spec_X_test.
print("\nCombined Model Evaluation:")
evaluate_model(
    model=dual_model,
    X_test=X_mfcc_test,
    y_test=y_test,
    label_encoder=le,
    dual_input=True,
    spec_X_test=X_spec_test,
)
