In [1]:
import os
import numpy as np
import librosa
import tensorflow as tf
import matplotlib.pyplot as plt
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix




In [2]:
# Define constants for data preprocessing
num_mfcc_coefficients = 13  # Number of MFCC coefficients (adjust as needed)
desired_shape = (457, num_mfcc_coefficients, 1)  # Replace with your model's input shape
main_folder = ['belly_pain', 'burping', 'discomfort', 'hungry', 'tired']  # Replace with your class names
folder_name = 'donateacry'  # Replace with your dataset folder path
num_classes = len(main_folder)  # Number of classes

# Initialize lists to store preprocessed data
preprocessed_data = []
preprocessed_labels = []

In [3]:
# Function to preprocess an audio file
def preprocess_audio(audio_file, label):
    # Load audio
    audio, sr = librosa.load(audio_file, sr=None)

    # Perform feature extraction (e.g., MFCCs)
    mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=num_mfcc_coefficients)

    # Normalize the MFCCs (optional but recommended)
    mfccs = (mfccs - np.mean(mfccs)) / np.std(mfccs)

    # Reshape or pad the MFCCs to match the desired input shape
    num_frames = mfccs.shape[1]
    if num_frames < desired_shape[0]:
        mfccs = np.pad(mfccs, ((0, 0), (0, desired_shape[0] - num_frames)), mode='constant')
    elif num_frames > desired_shape[0]:
        mfccs = mfccs[:, :desired_shape[0]]

    # Append the preprocessed data and label
    preprocessed_data.append(mfccs.T[:, :, np.newaxis])  # Transpose the data
    preprocessed_labels.append(label)

In [4]:
# Loop through each class folder and preprocess audio files
for index, cls in enumerate(main_folder):
    class_folder = os.path.join(folder_name, cls)
    for file in os.listdir(class_folder)[:15]:
        audio_file = os.path.join(class_folder, file)
        preprocess_audio(audio_file, label=index)

# Convert lists to NumPy arrays
preprocessed_data = np.array(preprocessed_data)
preprocessed_labels = np.array(preprocessed_labels)

print("Total audio files:", len(preprocessed_data))


Total audio files: 68


In [5]:
# Define the CNN model
model = keras.Sequential([
    layers.Input(shape=desired_shape),  # Specify the input shape (e.g., (num_frames, num_features, num_channels))
    
    # Convolutional layers
    layers.Conv2D(64, kernel_size=(3, 3), activation='relu', padding='same'),
    layers.MaxPooling2D(pool_size=(2, 2)),
    layers.Conv2D(128, kernel_size=(3, 3), activation='relu', padding='same'),
    layers.MaxPooling2D(pool_size=(2, 2)),
    layers.Conv2D(128, kernel_size=(3, 3), activation='relu', padding='same'),
    layers.MaxPooling2D(pool_size=(2, 2)),
    
    # Flatten the output
    layers.Flatten(),
    
    # Fully connected layers
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.3),  # Dropout layer to reduce overfitting
    layers.Dense(num_classes, activation='softmax')  # Output layer with the number of classes
])





In [6]:
# Compile the model
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',  # Use 'categorical_crossentropy' if one-hot encoding
              metrics=['accuracy'])

# Print the model summary
model.summary()




Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 457, 13, 64)       640       
                                                                 
 max_pooling2d (MaxPooling2  (None, 228, 6, 64)        0         
 D)                                                              
                                                                 
 conv2d_1 (Conv2D)           (None, 228, 6, 128)       73856     
                                                                 
 max_pooling2d_1 (MaxPoolin  (None, 114, 3, 128)       0         
 g2D)                                                            
                                                                 
 conv2d_2 (Conv2D)           (None, 114, 3, 128)       147584    
                                                                 
 max_pooling2d_2 (MaxPoolin  (None, 57, 1, 128)        0

In [7]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    preprocessed_data, preprocessed_labels, test_size=0.1, random_state=42
)



In [8]:
len(X_train)*len(X_train)

3721

In [9]:
len(X_test)*len(X_test)

49

In [10]:
len(preprocessed_labels)*len(preprocessed_labels)

4624

In [11]:
# Early stopping to avoid overfitting of model
early_stop=EarlyStopping(monitor='val_accuracy',mode='max', verbose=1, patience=15, restore_best_weights=True)

In [12]:
# Fit the model
model.fit(X_train, y_train, epochs=30, batch_size=36,callbacks=[early_stop], validation_data=(X_test,y_test))

Epoch 1/30




Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 25: early stopping


<keras.src.callbacks.History at 0x1ab1b4c98b0>

In [13]:
loss, accuracy = model.evaluate(X_train, y_train)
print(f"Training loss: {loss:.4f}")
print(f"Training accuracy: {accuracy:.4f}")

Training loss: 1.1267
Training accuracy: 0.6230


In [14]:
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test loss: {loss:.4f}")
print(f"Test accuracy: {accuracy:.4f}")

Test loss: 2.2172
Test accuracy: 0.4286


In [15]:
Test_preprocess_data = []
Test_preprocess_label = []

def Test_preprocess_audio(audio_file, label):
    # Load audio
    audio, sr = librosa.load(audio_file, sr=None)

    # Perform feature extraction (e.g., MFCCs)
    mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=num_mfcc_coefficients)

    # Normalize the MFCCs (optional but recommended)
    mfccs = (mfccs - np.mean(mfccs)) / np.std(mfccs)

    # Reshape or pad the MFCCs to match the desired input shape
    num_frames = mfccs.shape[1]
    if num_frames < desired_shape[0]:
        mfccs = np.pad(mfccs, ((0, 0), (0, desired_shape[0] - num_frames)), mode='constant')
    elif num_frames > desired_shape[0]:
        mfccs = mfccs[:, :desired_shape[0]]

    # Append the preprocessed data and label
    Test_preprocess_data.append(mfccs.T[:, :, np.newaxis])  # Transpose the data
    Test_preprocess_label.append(label)

    return np.array(Test_preprocess_data),np.array(Test_preprocess_label)

In [16]:
##['belly_pain', 'burping', 'discomfort', 'hungry', 'tired']
def Predict_Label(audio_file):
    processed_data,processed_label = (Test_preprocess_audio(audio_file, label=None))
    y_pred=model.predict(processed_data)
    y_pred=np.argmax(y_pred,axis=1).any()
    if y_pred == [0]:
        print('belly_pain')
    if y_pred == [1]:
        print('burping')
    if y_pred == [2]:
        print('discomfort')
    if y_pred == [3]:
        print('hungry')
    if y_pred == [4]:
        print('tired')

In [17]:
Predict_Label("donateacry/burping/F24DE44B-762C-4149-AC92-96A5E57ED118-1430816949-1.0-m-04-bu.wav")

burping


In [18]:
model.save("Neonatal_cry_model.h5", save_format="h5")

  saving_api.save_model(
