# Audio Classification - Environmental Sounds - CNN-DNN-Librosa

We are going to use a subset of the data from the ESC-50 dataset, available at https://dagshub.com/kinkusuma/esc50-dataset. The ESC-50 dataset is a labeled collection of 2000 environmental audio recordings suitable for benchmarking methods of environmental sound classification.
We will develop and train a model to classify 8 different environmental sounds from the above dataset, which has 50+ environmental sound audio files available for classification.

# Common Imports

In [40]:
import tensorflow as tf
from tensorflow.keras.models import Sequential,load_model
from tensorflow.keras.layers import Conv2D,MaxPooling2D,Flatten,Dense,Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.image import resize
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import Callback
import librosa
from sklearn.model_selection import train_test_split
import os
import numpy as np

# Defining labels for classification

In [30]:
audio_data_path=r'D:\Kaggle-Competitions\Audio Classification\Environmental-Sound-Classification\data'
# Each sub-directory of the data folder is treated as one class.
# os.listdir() returns entries in arbitrary order, and the position of a name
# in this list is the integer label used for training, so the list is sorted
# to keep the index -> class-name mapping identical across sessions (e.g. when
# the saved model is reloaded later for inference).
# Plain files sitting next to the class folders are skipped so they do not
# become empty classes.
inference_categories=sorted(
    entry for entry in os.listdir(audio_data_path)
    if os.path.isdir(os.path.join(audio_data_path, entry))
)
category_count=len(inference_categories)

# Data Processing

In [32]:
# Load and preprocess audio data
def load_and_preprocess_data(data_dir, classes, target_shape=(200, 200)):
    """Build Mel-spectrogram features and integer labels from class folders.

    Args:
        data_dir: Directory that contains one sub-directory per class.
        classes: Sequence of class (sub-directory) names. The position of a
            name in this sequence is used as its integer label.
        target_shape: (height, width) every spectrogram is resized to.

    Returns:
        Tuple ``(data, labels)`` of NumPy arrays: one resized spectrogram
        (with a trailing channel axis) per ``.wav`` file found, and the
        matching class index for each of them. Both arrays are empty when
        no ``.wav`` file is found.
    """
    data = []
    labels = []

    for i, class_name in enumerate(classes):
        class_dir = os.path.join(data_dir, class_name)
        # os.listdir() gives no ordering guarantee; sorting keeps the sample
        # order (and therefore any seeded train/test split) reproducible.
        for filename in sorted(os.listdir(class_dir)):
            # Compare the extension case-insensitively so '.WAV' is not dropped
            if not filename.lower().endswith('.wav'):
                continue
            file_path = os.path.join(class_dir, filename)
            # sr=None keeps the sample rate stored in the file
            audio_data, sample_rate = librosa.load(file_path, sr=None)
            # Convert to a Mel spectrogram, add a channel axis, then resize
            mel_spectrogram = librosa.feature.melspectrogram(y=audio_data, sr=sample_rate)
            mel_spectrogram = resize(np.expand_dims(mel_spectrogram, axis=-1), target_shape)
            data.append(mel_spectrogram)
            labels.append(i)

    return np.array(data), np.array(labels)

In [33]:
# Build the full feature/label arrays from the class folders
data, labels = load_and_preprocess_data(
    data_dir=audio_data_path,
    classes=inference_categories,
)

# One-hot encode the integer class indices
labels = to_categorical(labels, num_classes=len(inference_categories))

# Hold out 20% of the samples for testing (fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    data,
    labels,
    test_size=0.2,
    random_state=42,
)

In [34]:
# Inspect the shape of a single training sample
X_train[0].shape

(200, 200, 1)

# Defining callbacks

In [35]:
class myCallback(Callback):
    """Callback that stops training once the epoch's loss drops below 0.1."""

    def on_epoch_end(self, epoch, logs=None):
        '''
        Halts the training when the loss falls below 0.1

        Args:
          epoch (integer) - index of epoch (required but unused in the function definition below)
          logs (dict) - metric results from the training epoch; may be None
        '''
        # Avoid a shared mutable default argument and tolerate logs=None
        logs = logs or {}
        loss = logs.get('loss')

        # 'loss' may be missing; comparing None with a float would raise TypeError
        if loss is not None and loss < 0.1:
            # Stop if threshold is met
            print("\nLoss is lower than 0.1 so cancelling training!")
            self.model.stop_training = True


# Instantiate class
callbacks = myCallback()

# Creating and compiling model

In [36]:
def create_model():
    """Build and compile the CNN classifier.

    Architecture: three Conv2D (3x3, ReLU) + 2x2 max-pooling stages with
    64, 128 and 256 filters, a Flatten layer, Dense(512) and Dense(256)
    (ReLU) each followed by Dropout(0.3), and a softmax output layer with
    ``category_count`` units. The input shape is taken from ``X_train[0]``.

    Returns:
        The model compiled with categorical cross-entropy loss, the Adam
        optimizer and the accuracy metric.
    """
    network=Sequential()

    # Convolutional stages; only the first layer declares the input shape
    network.add(Conv2D(64,(3,3),activation='relu',input_shape=X_train[0].shape))
    network.add(MaxPooling2D(2,2))
    for filters in (128,256):
        network.add(Conv2D(filters,(3,3),activation='relu'))
        network.add(MaxPooling2D(2,2))

    # Fully connected head with dropout after each hidden layer
    network.add(Flatten())
    for units in (512,256):
        network.add(Dense(units,activation='relu'))
        network.add(Dropout(0.3))
    network.add(Dense(category_count,activation='softmax'))

    network.compile(optimizer=Adam(),loss='categorical_crossentropy',metrics=['accuracy'])

    return network

# Training the model

In [37]:
# Build a fresh model and train it for up to 100 epochs in batches of 25,
# validating on (X_test, y_test) after every epoch
model=create_model()
model.fit(
    X_train,
    y_train,
    batch_size=25,
    epochs=100,
    verbose=1,
    validation_data=(X_test,y_test),
    callbacks=[callbacks],
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Loss is lower than 0.1 so cancelling training!


<keras.src.callbacks.History at 0x254d84b4810>

# Evaluate and save the model

In [38]:
# Score the trained model on the test split (X_test, y_test)
test_accuracy=model.evaluate(X_test,y_test,verbose=0)
# Element 1 of the evaluation result is the value that gets reported
held_out_score=test_accuracy[1]
print(held_out_score)

# Persist the trained model to disk
model.save('audio_classification_model.h5')

0.625


  saving_api.save_model(


# Classify a single .wav file

In [44]:
# Spectrogram size that input audio is resized to before prediction
target_shape = (200, 200)

# Restore the trained model from disk
model = load_model('audio_classification_model.h5')

# Function to preprocess and classify an audio file
def test_audio(file_path, model):
    """Classify one audio file with the given model.

    The file is loaded with librosa, converted to a Mel spectrogram, given a
    channel axis, resized to the module-level ``target_shape`` and reshaped
    into a single-item batch before being passed to ``model.predict``.

    Args:
        file_path: Path of the audio file to classify.
        model: Model object exposing ``predict``.

    Returns:
        Tuple ``(class_probabilities, predicted_class_index)``: the first row
        of the prediction output and the index of its largest entry.
    """
    # Load the waveform and turn it into a resized spectrogram
    waveform, rate = librosa.load(file_path, sr=None)
    spectrogram = librosa.feature.melspectrogram(y=waveform, sr=rate)
    spectrogram = resize(np.expand_dims(spectrogram, axis=-1), target_shape)

    # Wrap the single spectrogram in a batch of size one
    batch_shape = (1,) + target_shape + (1,)
    batch = tf.reshape(spectrogram, batch_shape)

    # Only one sample was submitted, so keep the first prediction row
    class_probabilities = model.predict(batch)[0]

    return class_probabilities, np.argmax(class_probabilities)

# Test an audio file
test_audio_file = './tmp/audio-test1.wav'
class_probabilities, predicted_class_index = test_audio(test_audio_file, model)

# Report the winning class together with the probability the model assigned
# to it. That value is the model's output for this single prediction (a
# confidence score), not an accuracy, so it is labelled accordingly.
predicted_class = inference_categories[predicted_class_index]
confidence = class_probabilities[predicted_class_index]
print(f'The audio is classified as: {predicted_class}')
print(f'Confidence: {confidence:.4f}')

The audio is classified as: dog
Accuracy: 1.0000
