In [1]:
import os
import librosa
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow.keras import layers, models

In [2]:
# Sanity check on one clip: load up to 0.1 s at 22050 Hz and inspect its MFCCs.
file_path = 'sound_folder/clip_0.wav' 
audio, _ = librosa.load(file_path, sr=22050, duration=0.1)
# Request 13 coefficients; the printed matrix has one row per coefficient.
mfccs = librosa.feature.mfcc(y=audio, sr=22050, n_mfcc=13)
print(mfccs)
# NOTE(review): a bare expression that is not the last line of a cell is not
# displayed, so this shape never appears in the cell output.
mfccs.shape
# Last expression of the cell: the transposed matrix is what gets displayed.
mfccs.T


[[-8.62672180e+02 -6.29520630e+02 -3.89662781e+02 -3.14668488e+02
  -2.89507355e+02]
 [ 0.00000000e+00  5.48909492e+01  2.99132233e+01  4.06137390e+01
   8.44670563e+01]
 [ 0.00000000e+00 -5.14149017e+01 -4.34424591e+01 -2.81129761e+01
  -1.47698698e+01]
 [ 0.00000000e+00 -4.58642244e+00  1.18799162e+00  9.45555401e+00
  -4.95697290e-01]
 [ 0.00000000e+00 -3.93269615e+01 -4.49370956e+01 -3.73889084e+01
  -4.23351860e+01]
 [ 0.00000000e+00  6.43753099e+00  5.65182304e+00  7.29870462e+00
   1.72041166e+00]
 [ 0.00000000e+00  4.85045147e+00  3.33478403e+00  3.69341493e+00
   6.71556807e+00]
 [ 0.00000000e+00 -1.73491669e+00 -5.17387247e+00 -4.79320526e+00
   1.21403193e+00]
 [ 0.00000000e+00  1.36123390e+01  2.99189448e-01 -8.77024651e+00
  -1.14590988e+01]
 [ 0.00000000e+00  1.36239994e+00  3.19477749e+00 -6.24655819e+00
  -1.56038942e+01]
 [ 0.00000000e+00 -7.03666973e+00 -1.16040325e+01 -1.83685265e+01
  -2.02493973e+01]
 [ 0.00000000e+00 -3.70566678e+00  7.55454600e-02 -5.53397942e+00

array([[-8.62672180e+02,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00],
       [-6.29520630e+02,  5.48909492e+01, -5.14149017e+01,
        -4.58642244e+00, -3.93269615e+01,  6.43753099e+00,
         4.85045147e+00, -1.73491669e+00,  1.36123390e+01,
         1.36239994e+00, -7.03666973e+00, -3.70566678e+00,
        -2.24396477e+01],
       [-3.89662781e+02,  2.99132233e+01, -4.34424591e+01,
         1.18799162e+00, -4.49370956e+01,  5.65182304e+00,
         3.33478403e+00, -5.17387247e+00,  2.99189448e-01,
         3.19477749e+00, -1.16040325e+01,  7.55454600e-02,
        -1.99098511e+01],
       [-3.14668488e+02,  4.06137390e+01, -2.81129761e+01,
         9.45555401e+00, -3.73889084e+01,  7.29870462e+00,
         3.69341493e+00, -4.79320526e+00, -8.77024651e+00,
        -6.24655819e+00, -1.83685265e

In [3]:
# Function to extract MFCC features from audio files
def extract_features(file_path, sr=22050, duration=0.1, n_mfcc=13):
    """Load the beginning of an audio file and return its MFCC matrix.

    Parameters
    ----------
    file_path : str or path-like
        Audio file to read.
    sr : int or None, default 22050
        Sampling rate passed to ``librosa.load``.
    duration : float or None, default 0.1
        Seconds of audio to read from the start of the file. A clip that is
        shorter than this is right-padded with zeros so that every file
        yields the same number of frames. ``None`` disables the padding.
    n_mfcc : int, default 13
        Number of MFCC coefficients to compute.

    Returns
    -------
    numpy.ndarray
        The array returned by ``librosa.feature.mfcc`` (coefficients along
        the first axis, frames along the second).
    """
    # Use the rate librosa reports back so the maths stays right if sr=None.
    audio, loaded_sr = librosa.load(file_path, sr=sr, duration=duration)

    if duration is not None:
        # Short clips would otherwise produce fewer frames than the rest of
        # the dataset and break stacking into a single array later on.
        target_len = int(round(loaded_sr * duration))
        missing = target_len - len(audio)
        if missing > 0:
            audio = np.pad(audio, (0, missing), mode='constant')

    return librosa.feature.mfcc(y=audio, sr=loaded_sr, n_mfcc=n_mfcc)



In [4]:
# Function to prepare the dataset
def prepare_dataset(sound_folder, no_sound_folder):
    """Build the feature tensor and integer labels from two folders of clips.

    Every regular file in ``sound_folder`` is labelled 'sound' and every
    regular file in ``no_sound_folder`` is labelled 'no_sound'. Features come
    from ``extract_features`` and are transposed so frames are the first axis
    of each sample.

    Parameters
    ----------
    sound_folder : str
        Directory holding the positive ('sound') clips.
    no_sound_folder : str
        Directory holding the negative ('no_sound') clips.

    Returns
    -------
    X : numpy.ndarray
        Stacked features, one sample per file: all 'sound' files first, then
        all 'no_sound' files, each group in sorted file-name order.
    y : numpy.ndarray
        Integer labels aligned with ``X``: 1 for 'sound', 0 for 'no_sound'.

    Raises
    ------
    ValueError
        If neither folder contains a file, or (from ``np.stack``) if the
        per-file feature matrices do not all share one shape.
    """
    # Fixed encoding. It equals the alphabetical encoding LabelEncoder gives
    # when both classes are present, but does not shift if one class is empty.
    label_ids = {'no_sound': 0, 'sound': 1}

    features = []
    labels = []
    for folder, label in ((sound_folder, 'sound'), (no_sound_folder, 'no_sound')):
        # os.listdir returns names in arbitrary order; sort for a reproducible
        # row order (and therefore a reproducible train/test split).
        for name in sorted(os.listdir(folder)):
            path = os.path.join(folder, name)
            if not os.path.isfile(path):
                continue  # sub-directories cannot be loaded as audio
            features.append(extract_features(path).T)
            labels.append(label_ids[label])

    if not features:
        raise ValueError(
            'No audio files found in %r or %r' % (sound_folder, no_sound_folder)
        )

    X = np.stack(features)
    y = np.array(labels, dtype=np.int64)

    return X, y

In [5]:
# Build the full feature tensor and label vector from the two clip folders.
X, y = prepare_dataset(
    sound_folder='sound_folder',
    no_sound_folder='nosound_folder',
)

In [6]:
# Hold out 20% of the clips for evaluation. Stratifying on the labels keeps
# the sound / no_sound ratio the same in the training and held-out sets.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [7]:
# Shapes of the four split arrays, in the order train/test features, train/test labels.
tuple(arr.shape for arr in (X_train, X_test, y_train, y_test))

((896, 5, 13), (224, 5, 13), (896,), (224,))

In [8]:
# Conv2D expects a trailing channel axis, so append a size-1 dimension
# to each MFCC matrix.
X_train_cnn = np.expand_dims(X_train, axis=-1)
X_test_cnn = np.expand_dims(X_test, axis=-1)

In [9]:
# Compare the training tensor's shape with and without the channel axis.
tuple(arr.shape for arr in (X_train_cnn, X_train))

((896, 5, 13, 1), (896, 5, 13))

In [10]:
# Fix TensorFlow's global seed before any layer is created so that weight
# initialisation starts from the same state on every Restart & Run All.
# (Some GPU kernels may still introduce run-to-run differences.)
tf.random.set_seed(42)

# Small CNN over each MFCC matrix; the input shape is taken from the training
# tensor with the batch dimension dropped.
model = models.Sequential([
    layers.Conv2D(32, (3, 3), activation='relu', input_shape=X_train_cnn.shape[1:]),
    layers.MaxPooling2D((2, 2)),
    layers.Flatten(),
    layers.Dense(128, activation='relu'),
    layers.Dense(1, activation='sigmoid')  # binary classification (sound or no sound)
])

In [11]:
# Configure training: Adam optimiser, binary cross-entropy to match the single
# sigmoid output, and accuracy as the reported metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [12]:
# Train for 10 epochs in mini-batches of 32; the held-out split is passed as
# validation data so it is evaluated after every epoch.
model.fit(
    x=X_train_cnn,
    y=y_train,
    epochs=10,
    batch_size=32,
    validation_data=(X_test_cnn, y_test),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1f1f8da08b0>

In [13]:
# import accuracy_score, confusion_matrix
from sklearn.metrics import accuracy_score, confusion_matrix




In [14]:
# Persist the trained model to disk so it can be reloaded without retraining.
model.save(filepath='sound_detection_module.h5')

In [15]:
# Read the saved model back from disk; the TFLite conversion below uses this copy.
loaded_model = tf.keras.models.load_model(
    filepath='sound_detection_module.h5',
)

In [16]:
# Convert the reloaded Keras model to TensorFlow Lite.
# Use the TFLite file on mobile or low-end devices that do not have full
# TensorFlow installed.
converter = tf.lite.TFLiteConverter.from_keras_model(loaded_model)
tflite_model = converter.convert()

# Write the converted model to disk as raw bytes.
with open('sound_detection_model.tflite', 'wb') as tflite_file:
    tflite_file.write(tflite_model)



INFO:tensorflow:Assets written to: C:\Users\baral\AppData\Local\Temp\tmp7eoi46ml\assets


INFO:tensorflow:Assets written to: C:\Users\baral\AppData\Local\Temp\tmp7eoi46ml\assets
