In [587]:
import os
import numpy as np
import librosa
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras import models, layers, utils
# import Sequential
from tensorflow.keras import Sequential
import sounddevice as sd


In [588]:
# Step 1: Data Collection
sound_folder = "sound"
words = []
sound_data = []

In [589]:
# Iterate over sound files in the folder
for filename in os.listdir(sound_folder):
    file_path = os.path.join(sound_folder, filename)
    word = filename.split(".")[0]  # Extract the word from the filename
    words.append(word)
    audio_data, sample_rate = librosa.load(file_path, sr=None)  # Load audio file
    sound_data.append(audio_data)

In [590]:
word

'xa'

In [591]:
# Step 2: Preprocessing
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(words)
sound_data = [librosa.feature.mfcc(y=data, sr=sample_rate) for data in sound_data]

In [592]:
labels.shape

(4,)

In [593]:
len(sound_data)

4

In [594]:
sound_data

[array([[-6.2614807e+02, -6.2614807e+02, -6.2614807e+02, ...,
         -3.4769238e+02, -3.5615570e+02, -4.2055841e+02],
        [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
          1.7945123e+02,  1.7194107e+02,  1.4836133e+02],
        [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
         -2.2997681e+01, -1.9970545e+01, -8.0884438e+00],
        ...,
        [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
         -8.3571301e+00, -6.0509596e+00, -6.1979780e+00],
        [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
         -1.1280119e+01, -3.8040481e+00,  3.1977230e-01],
        [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
         -3.9512634e-01,  5.2454481e+00,  5.4574995e+00]], dtype=float32),
 array([[-5.7717633e+02, -5.7717633e+02, -5.7717633e+02, ...,
         -3.6652533e+02, -4.3389639e+02, -5.7697833e+02],
        [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
          1.4595978e+02,  1.1842316e+02,  2.7988636e-01],
        [ 

In [595]:
sound_data[0]

array([[-6.2614807e+02, -6.2614807e+02, -6.2614807e+02, ...,
        -3.4769238e+02, -3.5615570e+02, -4.2055841e+02],
       [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
         1.7945123e+02,  1.7194107e+02,  1.4836133e+02],
       [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
        -2.2997681e+01, -1.9970545e+01, -8.0884438e+00],
       ...,
       [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
        -8.3571301e+00, -6.0509596e+00, -6.1979780e+00],
       [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
        -1.1280119e+01, -3.8040481e+00,  3.1977230e-01],
       [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
        -3.9512634e-01,  5.2454481e+00,  5.4574995e+00]], dtype=float32)

In [596]:
sound_data[0].shape

(20, 77)

In [597]:
# Split the dataset into training and testing sets

In [598]:
import numpy as np

# Determine the maximum length of the arrays in x_train
max_length = max(data.shape[1] for data in sound_data)

# Pad the sequences to the maximum length
padded_x_train = []
for data in sound_data:
    if data.shape[1] < max_length:
        padding = np.zeros((data.shape[0], max_length - data.shape[1]))
        padded_data = np.concatenate([data, padding], axis=1)
    else:
        padded_data = data
    padded_x_train.append(padded_data)

# Convert padded_x_train to a NumPy array
padded_x_train = np.array(padded_x_train)

# Print the padded sequences
print(padded_x_train)


[[[-6.26148071e+02 -6.26148071e+02 -6.26148071e+02 ...  0.00000000e+00
    0.00000000e+00  0.00000000e+00]
  [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  0.00000000e+00
    0.00000000e+00  0.00000000e+00]
  [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  0.00000000e+00
    0.00000000e+00  0.00000000e+00]
  ...
  [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  0.00000000e+00
    0.00000000e+00  0.00000000e+00]
  [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  0.00000000e+00
    0.00000000e+00  0.00000000e+00]
  [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  0.00000000e+00
    0.00000000e+00  0.00000000e+00]]

 [[-5.77176331e+02 -5.77176331e+02 -5.77176331e+02 ...  0.00000000e+00
    0.00000000e+00  0.00000000e+00]
  [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  0.00000000e+00
    0.00000000e+00  0.00000000e+00]
  [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  0.00000000e+00
    0.00000000e+00  0.00000000e+00]
  ...
  [ 0.00000000e+00  0.0

In [599]:
sound_data = padded_x_train

In [600]:
sound_data.shape

(4, 20, 433)

In [601]:
sound_data

array([[[-6.26148071e+02, -6.26148071e+02, -6.26148071e+02, ...,
          0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
        [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
          0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
        [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
          0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
        ...,
        [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
          0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
        [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
          0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
        [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
          0.00000000e+00,  0.00000000e+00,  0.00000000e+00]],

       [[-5.77176331e+02, -5.77176331e+02, -5.77176331e+02, ...,
          0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
        [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
          0.00000000e+00,  0.00000000e

In [602]:
x_train, x_test, y_train, y_test = train_test_split(sound_data, labels, test_size=0.2, random_state=42)


In [603]:
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((3, 20, 433), (1, 20, 433), (3,), (1,))

In [604]:
y_train[0]

3

In [605]:
padded_x_train[1].shape

(20, 433)

In [606]:
# y_train = utils.to_categorical(y_train, num_classes=len(label_encoder.classes_))
# y_test = utils.to_categorical(y_test, num_classes=len(label_encoder.classes_))

In [607]:
y_train, y_test

(array([3, 0, 2], dtype=int64), array([1], dtype=int64))

In [608]:
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((3, 20, 433), (1, 20, 433), (3,), (1,))

In [609]:
# Step 3: Model Training
model = models.Sequential()
model.add(layers.Conv2D(32, (3, 3), activation='relu', input_shape=(x_train.shape[1], x_train.shape[2], 1)))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Flatten())
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(len(label_encoder.classes_), activation='softmax'))


In [610]:
x_train[0].shape, x_train[1].shape, x_train[2].shape


((20, 433), (20, 433), (20, 433))

In [611]:
y_train = utils.to_categorical(y_train, num_classes=len(label_encoder.classes_))
y_test = utils.to_categorical(y_test, num_classes=len(label_encoder.classes_))

In [612]:
y_train

array([[0., 0., 0., 1.],
       [1., 0., 0., 0.],
       [0., 0., 1., 0.]], dtype=float32)

In [613]:
np.expand_dims(x_train, axis=-1).shape, y_train.shape, np.expand_dims(x_test, axis=-1).shape, y_test.shape

((3, 20, 433, 1), (3, 4), (1, 20, 433, 1), (1, 4))

In [614]:
y_train.shape, y_test.shape

((3, 4), (1, 4))

In [615]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(np.expand_dims(x_train, axis=-1), y_train, epochs=10, batch_size=32, validation_data=(np.expand_dims(x_test, axis=-1), y_test))

Epoch 1/10


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x29f3fca0970>

In [None]:
new_sound_file = "path_to_new_sound_file.wav"
new_sound_data, sample_rate = librosa.load(new_sound_file, sr=None)
new_sound_data = librosa.feature.mfcc(y=new_sound_data, sr=sample_rate)
new_sound_data = np.expand_dims(new_sound_data, axis=0)
new_sound_data = np.expand_dims(new_sound_data, axis=-1)