In [11]:
import numpy as np
import librosa as lr
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras import layers, models
import pickle


## Feature extraction

In [12]:
def cnn_extract_features(audio_path, max_length=345, n_mfcc=40):
    """Load an audio file and return a fixed-width MFCC matrix.

    Parameters
    ----------
    audio_path : str or path-like
        Audio file to read with ``librosa.load``.
    max_length : int, default 345
        Number of frames (columns) in the result. Inputs with fewer frames
        are zero-padded on the right; inputs with more frames are truncated.
    n_mfcc : int, default 40
        Number of MFCC coefficients (rows) to compute.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_mfcc, max_length)``.
    """
    # sr=None keeps the file's native sampling rate instead of resampling.
    audio_data, sample_rate = lr.load(audio_path, sr=None)
    # Use the rate the audio was actually loaded at: a hard-coded 44100 would
    # build the wrong mel filterbank for any file recorded at another rate.
    mfccs = lr.feature.mfcc(y=audio_data, sr=sample_rate, n_mfcc=n_mfcc)
    n_frames = mfccs.shape[1]
    if n_frames < max_length:
        # Pad only the time axis, on the right, with zeros.
        mfccs = np.pad(mfccs, ((0, 0), (0, max_length - n_frames)), mode='constant')
    elif n_frames > max_length:
        mfccs = mfccs[:, :max_length]
    return mfccs

In [13]:
# Sanity check: one sample file should yield a fixed-size MFCC matrix.
sample_path = '../../data/snsd/generated/noise_high/clnsp0-high.wav'
cnn_extract_features(sample_path).shape

(40, 345)

## Model

In [14]:
def build_cnn_model(input_shape, num_classes=3):
    """Build and compile a small 2-D CNN classifier.

    Parameters
    ----------
    input_shape : tuple
        Shape of a single input sample, passed to the model's ``Input`` layer
        (``train_model`` passes ``(rows, cols, 1)``).
    num_classes : int, default 3
        Width of the softmax output layer. The default matches the three
        labels used in this notebook (low, medium, high).

    Returns
    -------
    keras.Model
        A ``Sequential`` model compiled with the Adam optimizer,
        sparse categorical cross-entropy loss and an accuracy metric.
    """
    model = models.Sequential([
        # Declare the input with an explicit Input layer rather than passing
        # input_shape to the first Conv2D, which Keras 3 warns against.
        layers.Input(shape=input_shape),
        layers.Conv2D(32, kernel_size=(3, 3), activation='relu'),
        layers.MaxPooling2D(pool_size=(2, 2)),
        layers.Conv2D(64, kernel_size=(3, 3), activation='relu'),
        layers.MaxPooling2D(pool_size=(2, 2)),
        layers.Conv2D(128, kernel_size=(3, 3), activation='relu'),
        layers.MaxPooling2D(pool_size=(2, 2)),
        layers.Flatten(),
        layers.Dense(128, activation='relu'),
        layers.Dropout(0.5),
        # One softmax unit per class; labels are integer-encoded, hence the
        # sparse categorical loss below.
        layers.Dense(num_classes, activation='softmax'),
    ])
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model


In [19]:
def train_model(X_train, y_train, X_val, y_val, epochs=20, batch_size=32):
    """Build the CNN and fit it on the given training/validation split.

    Parameters
    ----------
    X_train, X_val : numpy.ndarray
        Feature arrays indexed as ``(samples, rows, cols)``; a trailing
        channel axis of size 1 is added before they are fed to the model.
    y_train, y_val : array-like
        Targets passed straight through to ``model.fit``.
    epochs : int, default 20
        Number of training epochs.
    batch_size : int, default 32
        Mini-batch size used by ``model.fit``.

    Returns
    -------
    The fitted model returned by ``build_cnn_model``.
    """
    # Conv2D needs a channel axis; the features are single-channel.
    input_shape = (X_train.shape[1], X_train.shape[2], 1)
    # Keep the caller's arrays untouched by binding the reshaped views to new names.
    train_inputs = X_train.reshape(X_train.shape[0], *input_shape)
    val_inputs = X_val.reshape(X_val.shape[0], *input_shape)
    model = build_cnn_model(input_shape)
    model.fit(
        train_inputs,
        y_train,
        validation_data=(val_inputs, y_val),
        epochs=epochs,
        batch_size=batch_size,
    )
    return model


In [16]:
def load_data(files_path='files-dict-list.pkl'):
    """Load the pickled file manifest and build the feature/label arrays.

    Parameters
    ----------
    files_path : str or path-like, default 'files-dict-list.pkl'
        Pickle file holding an iterable of dicts, each with a ``'path'``
        (audio file) and a ``'label'`` entry.

    Returns
    -------
    X : numpy.ndarray
        Stacked outputs of ``cnn_extract_features``, one per manifest entry.
    y : numpy.ndarray
        Integer labels: 0 for ``'low'``, 1 for ``'medium'``, 2 otherwise.
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    # The context manager closes the handle that the bare open() used to leak.
    with open(files_path, 'rb') as manifest:
        files = pickle.load(manifest)

    X = np.array([cnn_extract_features(entry['path']) for entry in files])
    # Any label other than 'low'/'medium' falls through to class 2.
    y = np.array([
        0 if entry['label'] == 'low'
        else 1 if entry['label'] == 'medium'
        else 2
        for entry in files
    ])
    return X, y

In [None]:
# Extract features for every file in the manifest and encode the labels as integers.
X, y = load_data()


In [20]:
# Hold out 20% of the samples for validation (fixed seed for a repeatable split),
# then fit the CNN on the remaining 80%.
split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = split
model = train_model(X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val)

Epoch 1/20
[1m83/83[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 409ms/step - accuracy: 0.3658 - loss: 5.7504 - val_accuracy: 0.4621 - val_loss: 1.0616
Epoch 2/20
[1m83/83[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 347ms/step - accuracy: 0.4578 - loss: 1.0486 - val_accuracy: 0.4015 - val_loss: 1.0761
Epoch 3/20
[1m83/83[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 379ms/step - accuracy: 0.4492 - loss: 1.0263 - val_accuracy: 0.4667 - val_loss: 0.9951
Epoch 4/20
[1m83/83[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 381ms/step - accuracy: 0.4910 - loss: 0.9821 - val_accuracy: 0.4091 - val_loss: 1.0029
Epoch 5/20
[1m83/83[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 405ms/step - accuracy: 0.4962 - loss: 0.9550 - val_accuracy: 0.4455 - val_loss: 0.9879
Epoch 6/20
[1m83/83[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 401ms/step - accuracy: 0.5245 - loss: 0.9360 - val_accuracy: 0.4667 - val_loss: 0.9822
Epoch 7/20
[1m83/83[

In [24]:
# Persist the trained model to disk so it can be reloaded without retraining.
model.save('bgnoise_cnn_model.keras')

In [None]:
# Build the prediction input from a sample file (the name `extract` is not
# defined anywhere in the notebook and raises NameError on a fresh kernel).
sample_mfcc = cnn_extract_features('../../data/snsd/generated/noise_high/clnsp0-high.wav')
# The model expects (batch, rows, cols, channels), mirroring the reshape in
# train_model, so add a leading batch axis and a trailing channel axis.
model.predict(sample_mfcc[np.newaxis, ..., np.newaxis])