In [93]:
import os
import numpy as np
import librosa
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras import models, layers, utils
# import Sequential
from tensorflow.keras import Sequential
import sounddevice as sd
import re


In [110]:
# Step 1: Data Collection
sound_folder = "hello_split"
words = []
sound_data = []
# sound_name = ['Miro.', 'Naam.', 'Saurabh.', 'Baral.','ho.','tapai.','Ko.', 'xa.','I.','love.','you.']

In [111]:
# Iterate over sound files in the folder
for filename in os.listdir(sound_folder):
    file_path = os.path.join(sound_folder, filename)
    word = filename.split(".")[0]  # Extract the word from the filename
    # Removes the number and _ from the word Ex. 1_hello.wav -> hello
    word = re.sub(r'[^A-Za-z]+', '', word)
    if word not in words:
        words.append(word)
    audio_data, sample_rate = librosa.load(file_path, sr=None)  # Load audio file
    sound_data.append(audio_data)

In [118]:
audio_data

array([ 0.04330444,  0.04238892,  0.04058838, ..., -0.03952026,
       -0.03607178, -0.03314209], dtype=float32)

In [54]:
# Step 2: Preprocessing
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(words)
sound_data = [librosa.feature.mfcc(y=data, sr=sample_rate) for data in sound_data]

In [55]:
labels.shape

(14,)

In [56]:
len(sound_data)

14

In [57]:
sound_data

[array([[-6.2614807e+02, -6.2614807e+02, -6.2614807e+02, ...,
         -3.4769238e+02, -3.5615570e+02, -4.2055841e+02],
        [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
          1.7945123e+02,  1.7194107e+02,  1.4836133e+02],
        [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
         -2.2997681e+01, -1.9970545e+01, -8.0884438e+00],
        ...,
        [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
         -8.3571301e+00, -6.0509596e+00, -6.1979780e+00],
        [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
         -1.1280119e+01, -3.8040481e+00,  3.1977230e-01],
        [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
         -3.9512634e-01,  5.2454481e+00,  5.4574995e+00]], dtype=float32),
 array([[-581.26733  , -581.26733  , -581.26733  , ..., -362.37054  ,
         -364.9951   , -405.26328  ],
        [   0.       ,    0.       ,    0.       , ...,  164.43127  ,
          164.48148  ,  151.402    ],
        [   0.       ,    0.      

In [58]:
sound_data[0]

array([[-6.2614807e+02, -6.2614807e+02, -6.2614807e+02, ...,
        -3.4769238e+02, -3.5615570e+02, -4.2055841e+02],
       [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
         1.7945123e+02,  1.7194107e+02,  1.4836133e+02],
       [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
        -2.2997681e+01, -1.9970545e+01, -8.0884438e+00],
       ...,
       [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
        -8.3571301e+00, -6.0509596e+00, -6.1979780e+00],
       [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
        -1.1280119e+01, -3.8040481e+00,  3.1977230e-01],
       [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
        -3.9512634e-01,  5.2454481e+00,  5.4574995e+00]], dtype=float32)

In [59]:
sound_data[0].shape

(20, 77)

In [60]:
# Split the dataset into training and testing sets

In [61]:
import numpy as np

# Determine the maximum length of the arrays in x_train
max_length = max(data.shape[1] for data in sound_data)

# Pad the sequences to the maximum length
padded_x_train = []
for data in sound_data:
    if data.shape[1] < max_length:
        padding = np.zeros((data.shape[0], max_length - data.shape[1]))
        padded_data = np.concatenate([data, padding], axis=1)
    else:
        padded_data = data
    padded_x_train.append(padded_data)

# Convert padded_x_train to a NumPy array
padded_x_train = np.array(padded_x_train)

# Print the padded sequences
print(padded_x_train)


[[[-626.14807129 -626.14807129 -626.14807129 ...    0.
      0.            0.        ]
  [   0.            0.            0.         ...    0.
      0.            0.        ]
  [   0.            0.            0.         ...    0.
      0.            0.        ]
  ...
  [   0.            0.            0.         ...    0.
      0.            0.        ]
  [   0.            0.            0.         ...    0.
      0.            0.        ]
  [   0.            0.            0.         ...    0.
      0.            0.        ]]

 [[-581.26733398 -581.26733398 -581.26733398 ...    0.
      0.            0.        ]
  [   0.            0.            0.         ...    0.
      0.            0.        ]
  [   0.            0.            0.         ...    0.
      0.            0.        ]
  ...
  [   0.            0.            0.         ...    0.
      0.            0.        ]
  [   0.            0.            0.         ...    0.
      0.            0.        ]
  [   0.            0.       

In [62]:
sound_data = padded_x_train

In [63]:
sound_data.shape

(14, 20, 199)

In [64]:
sound_data

array([[[-626.14807129, -626.14807129, -626.14807129, ...,
            0.        ,    0.        ,    0.        ],
        [   0.        ,    0.        ,    0.        , ...,
            0.        ,    0.        ,    0.        ],
        [   0.        ,    0.        ,    0.        , ...,
            0.        ,    0.        ,    0.        ],
        ...,
        [   0.        ,    0.        ,    0.        , ...,
            0.        ,    0.        ,    0.        ],
        [   0.        ,    0.        ,    0.        , ...,
            0.        ,    0.        ,    0.        ],
        [   0.        ,    0.        ,    0.        , ...,
            0.        ,    0.        ,    0.        ]],

       [[-581.26733398, -581.26733398, -581.26733398, ...,
            0.        ,    0.        ,    0.        ],
        [   0.        ,    0.        ,    0.        , ...,
            0.        ,    0.        ,    0.        ],
        [   0.        ,    0.        ,    0.        , ...,
            0.

In [65]:
new_label = utils.to_categorical(labels, num_classes=len(label_encoder.classes_))
new_label

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.]],
      dtype=float32)

In [66]:
x_train, x_test, y_train, y_test = train_test_split(sound_data, labels, test_size=0.2, random_state=42)


In [67]:
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((11, 20, 199), (3, 20, 199), (11,), (3,))

In [68]:
y_train[0]

12

In [69]:
padded_x_train[1].shape

(20, 199)

In [70]:
# y_train = utils.to_categorical(y_train, num_classes=len(label_encoder.classes_))
# y_test = utils.to_categorical(y_test, num_classes=len(label_encoder.classes_))

In [71]:
y_train, y_test

(array([12,  4,  7,  1,  0, 13,  3,  6,  9,  2,  5], dtype=int64),
 array([ 8, 10, 11], dtype=int64))

In [72]:
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((11, 20, 199), (3, 20, 199), (11,), (3,))

In [73]:
# Step 3: Model Training
model = models.Sequential()
model.add(layers.Conv2D(32, (3, 3), activation='relu', input_shape=(x_train.shape[1], x_train.shape[2], 1)))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Flatten())
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(len(label_encoder.classes_), activation='softmax'))


In [74]:
x_train[0].shape, x_train[1].shape, x_train[2].shape


((20, 199), (20, 199), (20, 199))

In [75]:
y_train = utils.to_categorical(y_train, num_classes=len(label_encoder.classes_))
y_test = utils.to_categorical(y_test, num_classes=len(label_encoder.classes_))

In [76]:
labels

array([11,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 12, 13],
      dtype=int64)

In [77]:
np.expand_dims(x_train, axis=-1).shape, y_train.shape, np.expand_dims(x_test, axis=-1).shape, y_test.shape

((11, 20, 199, 1), (11, 14), (3, 20, 199, 1), (3, 14))

In [78]:
y_train.shape, y_test.shape

((11, 14), (3, 14))

In [79]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(np.expand_dims(x_train, axis=-1), y_train, epochs=10, batch_size=32, validation_data=(np.expand_dims(x_test, axis=-1), y_test))

Epoch 1/10


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1e9bad98b50>

In [80]:
y_pred = model.predict(np.expand_dims(x_test, axis=-1))



In [81]:
y_pred

array([[7.8824902e-05, 2.4161686e-20, 2.6174180e-06, 2.6891285e-25,
        1.3244120e-07, 1.9204458e-09, 0.0000000e+00, 4.0246675e-17,
        2.4076865e-08, 6.5730978e-03, 2.9268620e-16, 2.4138533e-15,
        9.9334526e-01, 1.0305218e-13],
       [1.5241039e-04, 1.7200381e-14, 4.5731207e-04, 2.0606879e-24,
        1.0061948e-10, 7.2921196e-08, 0.0000000e+00, 3.2368727e-18,
        6.4054660e-08, 1.2705149e-05, 4.8108192e-17, 2.6716057e-16,
        9.9937743e-01, 3.4866101e-13],
       [6.8584033e-10, 1.6862705e-20, 9.2845243e-05, 2.4298627e-18,
        2.0837663e-14, 2.9009389e-08, 0.0000000e+00, 7.1192670e-11,
        3.6818037e-13, 6.5682354e-20, 3.5315995e-19, 1.5513531e-10,
        9.9904662e-01, 8.6048857e-04]], dtype=float32)

In [82]:
y_test

array([[0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.]],
      dtype=float32)

In [83]:
# import accuracy_score
y_test, y_train

(array([[0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.]],
       dtype=float32),
 array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
        [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.]],
       dtype=float32))

In [84]:
new_sound_file = "path_to_new_sound_file.wav"
new_sound_data, sample_rate = librosa.load(new_sound_file, sr=None)
new_sound_data = librosa.feature.mfcc(y=new_sound_data, sr=sample_rate)
new_sound_data = np.expand_dims(new_sound_data, axis=0)
new_sound_data = np.expand_dims(new_sound_data, axis=-1)

  new_sound_data, sample_rate = librosa.load(new_sound_file, sr=None)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


FileNotFoundError: [Errno 2] No such file or directory: 'path_to_new_sound_file.wav'