In [1]:
import os
import pathlib
import librosa
import seaborn
import numpy as np
import tensorflow as tf
from IPython import display
import matplotlib.pyplot as plt
from tensorflow.keras import layers
from tensorflow.keras import models
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [2]:
# Mount Google Drive into the Colab filesystem; the dataset directory and the
# saved arrays/model used by the cells below all live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Root folder of the dataset on the mounted Drive
extract_dir = '/content/drive/MyDrive/data0-9'

# Enumerate the top-level entries under the dataset root and report how many there are
extracted_files = os.listdir(extract_dir)
print("Extracted files and folders:", len(extracted_files), sep="\n")

Extracted files and folders:
10


In [5]:
# Extract a 40-coefficient MFCC matrix from every WAV file in the dataset.
# Each sub-folder of `extract_dir` is named after the digit its clips contain.
all_mfccs = []  # one 2-D MFCC matrix (40 rows) per audio clip
labels = []     # integer digit label of the clip at the same index

# Sort the listings: os.listdir() returns entries in arbitrary order, and the
# resulting sample order feeds the seeded train/test split further down.
for digit_folder in sorted(os.listdir(extract_dir)):
    digit_folder_path = os.path.join(extract_dir, digit_folder)

    # Skip stray entries (hidden files, checkpoint folders, ...): anything that
    # is not a directory with a purely numeric name cannot be a digit class and
    # would otherwise crash os.listdir() or int() below.
    if not (digit_folder.isdecimal() and os.path.isdir(digit_folder_path)):
        continue
    digit_label = int(digit_folder)  # the folder name is the digit label

    # Loop through all files (audio samples) in the digit folder
    for file_name in sorted(os.listdir(digit_folder_path)):
        # Only WAV files are audio samples
        if not file_name.endswith(".wav"):
            continue

        audio_file_path = os.path.join(digit_folder_path, file_name)

        # No `sr` is passed, so the clip is loaded at librosa's default sample rate
        audio_data, sample_rate = librosa.load(audio_file_path)

        # Compute MFCCs for the audio file
        mfccs = librosa.feature.mfcc(y=audio_data, sr=sample_rate, n_mfcc=40)

        all_mfccs.append(mfccs)
        labels.append(digit_label)


In [35]:
# Bring every MFCC matrix to the same number of columns so they can be stacked
# into a single array of shape (n_samples, n_mfcc, max_length).
if len(all_mfccs) == 0:
    # Fail with a clear message instead of the opaque error max() raises on
    # an empty sequence.
    raise ValueError("No MFCCs were extracted; check the dataset directory before padding.")

# Column count of the longest MFCC matrix
max_length = max(mfcc.shape[1] for mfcc in all_mfccs)

# Zero-pad each matrix on the right up to max_length. Because max_length is the
# maximum over all matrices, nothing is ever longer than it, so no trimming is
# required (a pad width of 0 leaves a matrix unchanged).
all_mfccs_padded = [
    np.pad(mfcc, ((0, 0), (0, max_length - mfcc.shape[1])), mode='constant')
    for mfcc in all_mfccs
]

# Convert the lists to NumPy arrays
all_mfccs = np.array(all_mfccs_padded)
labels = np.array(labels)

print(all_mfccs.shape)


(23678, 40, 44)


In [36]:
# Features (X) are the padded MFCC array; targets (y) are the digit labels
X, y = all_mfccs, labels

In [37]:
# Learn the set of class labels present in y, then map each label to its
# integer class id
label_encoder = LabelEncoder()
label_encoder.fit(y)
y_encoded = label_encoder.transform(y)

print(y_encoded)

[0 0 0 ... 9 9 9]


In [38]:
# Cache the feature array and the encoded labels on Drive as .npy files
np.save(file='/content/drive/My Drive/X.npy', arr=X)
np.save(file='/content/drive/My Drive/y_encoded.npy', arr=y_encoded)

In [39]:
# Read the cached feature array and encoded labels back from Drive
X = np.load(file='/content/drive/My Drive/X.npy')
y_encoded = np.load(file='/content/drive/My Drive/y_encoded.npy')

In [40]:
# Hold out 20% of the samples as a test set (shuffled, fixed seed so the split
# is repeatable).
# The targets are `y_encoded` -- the array that is saved to and reloaded from
# Drive alongside X -- rather than the in-memory `y`, which only exists when
# the feature-extraction cells have been run in the current session.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, shuffle=True
)

print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

X_train shape: (18942, 40, 44)
y_train shape: (18942,)
X_test shape: (4736, 40, 44)
y_test shape: (4736,)


In [44]:
# Append a trailing channel axis of size 1 so each sample matches the model's
# (40, 44, 1) input; only the first three dimensions are read, so re-running
# this cell leaves already-reshaped arrays unchanged.
X_train = X_train.reshape(X_train.shape[:3] + (1,))
X_test = X_test.reshape(X_test.shape[:3] + (1,))

print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)

X_train shape: (18942, 40, 44, 1)
X_test shape: (4736, 40, 44, 1)


In [45]:
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers

# Shape of one input sample: 40 MFCC coefficients x 44 columns x 1 channel.
# This is the single source of truth for the model's input shape.
INPUTSHAPE = (40, 44, 1)

# CNN classifier: three Conv -> BatchNorm -> MaxPool stages (dropout after the
# 2nd and 3rd), global average pooling, then a stack of dense layers ending in
# a 10-way softmax (one unit per digit class).
model = Sequential([
    # Identity reshape whose only job is to declare the input shape; it is left
    # in place so the layer stack is unchanged.
    layers.Reshape(INPUTSHAPE, input_shape=INPUTSHAPE),
    layers.Conv2D(128, (2, 2), activation='relu', padding='same'),
    layers.BatchNormalization(),
    layers.MaxPooling2D(2, padding='same'),
    layers.Conv2D(128, (2, 2), activation='relu', padding='same'),
    layers.BatchNormalization(),
    layers.MaxPooling2D(2, padding='same'),
    layers.Dropout(0.3),
    layers.Conv2D(128, (2, 2), activation='relu', padding='same'),
    layers.BatchNormalization(),
    layers.MaxPooling2D(2, padding='same'),
    layers.Dropout(0.3),
    layers.GlobalAveragePooling2D(),
    # NOTE(review): the global pooling output is already flat (batch, channels),
    # so this Flatten has no effect; kept so the layer stack is unchanged.
    layers.Flatten(),
    layers.Dense(512, activation='relu'),
    layers.BatchNormalization(),
    layers.Dense(256, activation='relu'),
    layers.Dense(128, activation='relu'),
    layers.Dense(64, activation='relu'),
    layers.Dense(32, activation='relu'),
    layers.Dense(16, activation='relu'),
    layers.Dense(10, activation='softmax')
])

# The sparse loss takes integer class ids as targets (no one-hot encoding needed)
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['acc'])
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 reshape_3 (Reshape)         (None, 40, 44, 1)         0         
                                                                 
 conv2d_9 (Conv2D)           (None, 40, 44, 128)       640       
                                                                 
 batch_normalization_12 (Ba  (None, 40, 44, 128)       512       
 tchNormalization)                                               
                                                                 
 max_pooling2d_9 (MaxPoolin  (None, 20, 22, 128)       0         
 g2D)                                                            
                                                                 
 conv2d_10 (Conv2D)          (None, 20, 22, 128)       65664     
                                                                 
 batch_normalization_13 (Ba  (None, 20, 22, 128)      

In [46]:
batch_size = 16

# Stop once val_loss has not improved for 8 consecutive epochs, and roll the
# model back to the weights of its best epoch. With restore_best_weights=False
# the model would keep the weights of the final epoch, i.e. `patience` epochs
# after the best validation loss was seen.
callback = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', min_delta=0, patience=8, verbose=0, mode='auto',
    baseline=None, restore_best_weights=True)

# NOTE(review): the test split doubles as the validation set, so early stopping
# is tuned on the same data that is evaluated afterwards; the reported test
# accuracy is therefore optimistic. Consider carving a validation split out of
# X_train instead.
history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=40,
    callbacks=[callback],
    batch_size=batch_size,
)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40


In [47]:
# Score the trained model on the held-out test split; the model was compiled
# with a single metric, so evaluate() yields the loss followed by the accuracy
test_scores = model.evaluate(X_test, y_test)
loss, acc = test_scores
print("Test accuracy:", acc)

Test accuracy: 0.9619932174682617


In [48]:
# Destination on Google Drive for the trained model (HDF5 file, per the .h5 suffix)
model_save_path = '/content/drive/My Drive/audio_classification_model.h5'

# Persist the trained model to that path
model.save(filepath=model_save_path)

  saving_api.save_model(
