In [1]:
import os
def get_dataset_size_and_labels(data_path):
    class_labels = []
    dataset_size = 0
    for folder in os.listdir(data_path):
        folder_path = os.path.join(data_path, folder)
        if os.path.isdir(folder_path):
            class_labels.append(folder)
            for filename in os.listdir(folder_path):
                dataset_size += 1
    return dataset_size, class_labels
data_path = r"C:\Users\SRINI\Downloads\archive (3)"
dataset_size, class_labels = get_dataset_size_and_labels(data_path)
print("Dataset Size:", dataset_size)

Dataset Size: 28536


In [2]:
import os
import librosa
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
def load_and_preprocess_data(data_path, sample_rate=16000, duration=1):
    labels = []
    features = []
    max_pad_len = 20 
    class_labels = os.listdir(data_path)
    for label in class_labels:
        label_path = os.path.join(data_path, label)
        for audio_file in os.listdir(label_path):
            audio_path = os.path.join(label_path, audio_file)
            if not audio_file.endswith(('.wav', '.mp3', '.flac')):
                continue
            try:
                audio, _ = librosa.load(audio_path, sr=sample_rate, duration=duration)
                mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=13)
                if mfccs.shape[1] > max_pad_len:
                    mfccs = mfccs[:, :max_pad_len]
                else:
                    pad_width = max_pad_len - mfccs.shape[1]
                    mfccs = np.pad(mfccs, pad_width=((0, 0), (0, pad_width)), mode='constant')
                features.append(mfccs)
                labels.append(label)
            except Exception as e:
                print(f"Error processing {audio_path}: {e}")
    return np.array(features), np.array(labels)
data_path = r"C:\Users\SRINI\Downloads\archive (3)"
X, y = load_and_preprocess_data(data_path)
le = LabelEncoder()
y_encoded = le.fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("Number of classes:", len(np.unique(y)))

X_train shape: (22828, 13, 20)
X_test shape: (5707, 13, 20)
Number of classes: 15


In [3]:
import os
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
def load_and_preprocess_data(data_path, sample_rate=16000, duration=1):
    labels = []
    features = []
    max_pad_len = 20  
    class_labels = os.listdir(data_path)
    for label in class_labels:
        label_path = os.path.join(data_path, label) 
        for audio_file in os.listdir(label_path):
            audio_path = os.path.join(label_path, audio_file)
            if not audio_file.endswith(('.wav', '.mp3', '.flac')):
                continue
            try:
                audio, _ = librosa.load(audio_path, sr=sample_rate, duration=duration)
                mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=13)
                if mfccs.shape[1] > max_pad_len:
                    mfccs = mfccs[:, :max_pad_len]
                else:
                    pad_width = max_pad_len - mfccs.shape[1]
                    mfccs = np.pad(mfccs, pad_width=((0, 0), (0, pad_width)), mode='constant')
                features.append(mfccs)
                labels.append(label)
            except Exception as e:
                print(f"Error processing {audio_path}: {e}")
    return np.array(features), np.array(labels)
data_path = r"C:\Users\SRINI\Downloads\archive (3)"
X, y = load_and_preprocess_data(data_path)
le = LabelEncoder()
y_encoded = le.fit_transform(y)
X = np.expand_dims(X, axis=-1)
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)
model = Sequential()
model.add(Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=(X_train.shape[1], X_train.shape[2], 1)))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(len(np.unique(y)), activation='softmax'))
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
epochs = 10  
model.fit(X_train, y_train, epochs=epochs, batch_size=32, validation_data=(X_test, y_test))
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Accuracy: 7.99%


In [4]:
from sklearn.metrics import classification_report, confusion_matrix
y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)
y_test_original = le.inverse_transform(y_test)
y_pred_original = le.inverse_transform(y_pred_classes)
print("Classification Report:")
print(classification_report(y_test_original, y_pred_original,zero_division=1))
conf_mat = confusion_matrix(y_test_original, y_pred_original)
print("Confusion Matrix:")
print(conf_mat)

Classification Report:
              precision    recall  f1-score   support

         bed       1.00      0.00      0.00       325
        bird       1.00      0.00      0.00       348
         cat       1.00      0.00      0.00       336
         dog       1.00      0.00      0.00       359
        down       1.00      0.00      0.00       460
       eight       1.00      0.00      0.00       476
        five       1.00      0.00      0.00       512
        four       1.00      0.00      0.00       501
          go       0.08      1.00      0.15       456
       happy       1.00      0.00      0.00       357
       house       1.00      0.00      0.00       370
        left       1.00      0.00      0.00       457
      marvin       1.00      0.00      0.00       332
        nine       1.00      0.00      0.00       418

    accuracy                           0.08      5707
   macro avg       0.93      0.07      0.01      5707
weighted avg       0.93      0.08      0.01      5707

Co

In [6]:
import librosa
import numpy as np
def preprocess_single_audio(audio_path, sample_rate=16000, duration=1, max_pad_len=20):
    try:
        audio, _ = librosa.load(audio_path, sr=sample_rate, duration=duration)
        mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=13)
        if mfccs.shape[1] > max_pad_len:
            mfccs = mfccs[:, :max_pad_len]
        else:
            pad_width = max_pad_len - mfccs.shape[1]
            mfccs = np.pad(mfccs, pad_width=((0, 0), (0, pad_width)), mode='constant')
        mfccs = np.expand_dims(mfccs, axis=-1)
        return mfccs
    except Exception as e:
        print(f"Error processing {audio_path}: {e}")
        return None
single_audio_path = r"C:\Users\SRINI\Downloads\archive (3)\go\ffd2ba2f_nohash_4.wav"
preprocessed_audio = preprocess_single_audio(single_audio_path)
if preprocessed_audio is not None:
    predictions = model.predict(np.array([preprocessed_audio]))
    predicted_class = np.argmax(predictions)
    predicted_label = le.inverse_transform([predicted_class])[0]
    print(f"Predicted Class: {predicted_class}")
    print(f"Predicted Label: {predicted_label}")

Predicted Class: 9
Predicted Label: go
