In [412]:
import os
import numpy as np
# import models, layers, utils
import librosa
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
# import Sequential
from tensorflow.keras import models, layers, utils
from tensorflow.keras import Sequential
import sounddevice as sd
import re


In [413]:
# Step 1: Data Collection
sound_folder = "hello_split"
words = []
sound_data = []
label = []
name = []
# sound_name = ['Miro.', 'Naam.', 'Saurabh.', 'Baral.','ho.','tapai.','Ko.', 'xa.','I.','love.','you.']

In [414]:
# Iterate over sound files in the folder
i = -1
for filename in os.listdir(sound_folder):
    file_path = os.path.join(sound_folder, filename)
    word = filename.split(".")[0]  # Extract the word from the filename
    # Removes the number and _ from the word Ex. 1_hello.wav -> hello
    word = re.sub(r'[^A-Za-z]+', '', word)
    if word not in words:
        words.append(word)
    
    label.append(words.index(word))
    name.append(word)

    audio_data, sample_rate = librosa.load(file_path, sr=None)  # Load audio file
    sound_data.append(audio_data)

In [415]:
label

[0,
 0,
 1,
 1,
 1,
 1,
 2,
 2,
 2,
 3,
 4,
 3,
 5,
 6,
 7,
 6,
 8,
 9,
 10,
 11,
 4,
 4,
 4,
 4,
 0,
 0,
 0,
 0]

In [416]:
words

['Naam',
 'Saurabh',
 'Baral',
 'ho',
 'Miro',
 'tapai',
 'Ko',
 'tapay',
 'xa',
 'I',
 'love',
 'you']

In [417]:
sample_rate

48000

In [418]:
len(sound_data)

28

In [419]:
# Step 2: Preprocessing
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(words)
sound_data = [librosa.feature.mfcc(y=data, sr=sample_rate) for data in sound_data]

In [420]:
label_encoder.classes_

array(['Baral', 'I', 'Ko', 'Miro', 'Naam', 'Saurabh', 'ho', 'love',
       'tapai', 'tapay', 'xa', 'you'], dtype='<U7')

In [421]:
labels = utils.to_categorical(label, num_classes=len(label_encoder.classes_))


In [422]:
labels[:5]

array([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]], dtype=float32)

In [423]:
labels.shape

(28, 12)

In [424]:
len(sound_data)

28

In [425]:
sound_data[0].shape

(20, 126)

In [426]:
# Split the dataset into training and testing sets

In [427]:
import numpy as np

# Determine the maximum length of the arrays in x_train
max_length = max(data.shape[1] for data in sound_data)

# Pad the sequences to the maximum length
padded_x_train = []
for data in sound_data:
    if data.shape[1] < max_length:
        padding = np.zeros((data.shape[0], max_length - data.shape[1]))
        padded_data = np.concatenate([data, padding], axis=1)
    else:
        padded_data = data
    padded_x_train.append(padded_data)

# Convert padded_x_train to a NumPy array
padded_x_train = np.array(padded_x_train)

# Print the padded sequences
print(padded_x_train)


[[[-2.99812622e+02 -2.68945435e+02 -2.52896881e+02 ... -2.60914032e+02
   -2.64518402e+02 -2.22407089e+02]
  [ 1.78257141e+02  2.12234970e+02  2.32832626e+02 ...  2.45919907e+02
    2.43404022e+02  2.03795563e+02]
  [ 6.77690220e+00 -2.28766918e+01 -2.92725182e+01 ... -5.17024326e+00
    2.25792122e+00  3.54791565e+01]
  ...
  [-6.11440277e+00 -9.37044716e+00 -1.29157467e+01 ...  1.59050226e-02
   -9.85843420e-01  2.01994514e+00]
  [-1.17798119e+01 -1.04930725e+01 -4.18545485e+00 ...  3.72642446e+00
    6.65921402e+00  5.80807257e+00]
  [-5.29475069e+00 -4.86009884e+00 -4.92752504e+00 ... -1.08880687e+00
   -2.02197981e+00  4.78213596e+00]]

 [[-2.23669174e+02 -2.45219788e+02 -2.68495178e+02 ... -2.28076248e+02
   -2.23399124e+02 -2.24804199e+02]
  [ 1.91913513e+02  2.09324524e+02  2.32184387e+02 ...  2.61799927e+02
    2.63875793e+02  2.66698914e+02]
  [ 3.92412262e+01  2.60527668e+01  7.46072340e+00 ... -5.39751005e+00
   -4.00017071e+00  7.70471931e-01]
  ...
  [-1.29654717e+00 -4.0

In [428]:
sound_data[0].shape, sound_data[1].shape

((20, 126), (20, 126))

In [429]:
sound_data = padded_x_train

In [430]:
sound_data.shape

(28, 20, 126)

In [431]:
sound_data

array([[[-2.99812622e+02, -2.68945435e+02, -2.52896881e+02, ...,
         -2.60914032e+02, -2.64518402e+02, -2.22407089e+02],
        [ 1.78257141e+02,  2.12234970e+02,  2.32832626e+02, ...,
          2.45919907e+02,  2.43404022e+02,  2.03795563e+02],
        [ 6.77690220e+00, -2.28766918e+01, -2.92725182e+01, ...,
         -5.17024326e+00,  2.25792122e+00,  3.54791565e+01],
        ...,
        [-6.11440277e+00, -9.37044716e+00, -1.29157467e+01, ...,
          1.59050226e-02, -9.85843420e-01,  2.01994514e+00],
        [-1.17798119e+01, -1.04930725e+01, -4.18545485e+00, ...,
          3.72642446e+00,  6.65921402e+00,  5.80807257e+00],
        [-5.29475069e+00, -4.86009884e+00, -4.92752504e+00, ...,
         -1.08880687e+00, -2.02197981e+00,  4.78213596e+00]],

       [[-2.23669174e+02, -2.45219788e+02, -2.68495178e+02, ...,
         -2.28076248e+02, -2.23399124e+02, -2.24804199e+02],
        [ 1.91913513e+02,  2.09324524e+02,  2.32184387e+02, ...,
          2.61799927e+02,  2.63875793e

In [432]:
# new_label = utils.to_categorical(labels, num_classes=len(label_encoder.classes_))
# new_label

In [433]:
sound_data.shape

(28, 20, 126)

In [434]:
labels.shape

(28, 12)

In [435]:
label = np.array(label)

In [436]:
x_train, x_test, y_train, y_test = train_test_split(sound_data, label, test_size=0.2, random_state=42)


In [437]:
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((22, 20, 126), (6, 20, 126), (22,), (6,))

In [438]:
x_train.shape

(22, 20, 126)

In [439]:
y_train[0]

9

In [440]:
padded_x_train[1].shape

(20, 126)

In [441]:
# y_train = utils.to_categorical(y_train, num_classes=len(label_encoder.classes_))
# y_test = utils.to_categorical(y_test, num_classes=len(label_encoder.classes_))

In [442]:
y_train, y_test

(array([ 9,  4,  3,  6,  6,  0,  1,  1,  1,  8,  4,  1,  0,  0, 10,  0,  4,
         2,  4,  7, 11,  2]),
 array([3, 0, 2, 4, 0, 5]))

In [443]:
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((22, 20, 126), (6, 20, 126), (22,), (6,))

In [444]:
from tensorflow import keras
model = keras.Sequential()
# Adding the first CNN layers
# input shape= (height, width, 1)
model.add(keras.layers.Conv2D(filters=32, kernel_size=(3, 3), activation='relu', input_shape=(x_train.shape[1], x_train.shape[2], 1)))
# Adding the second layer with maxpooling
model.add(keras.layers.MaxPooling2D(pool_size=(2, 2)))


# Adding the third layer
model.add(keras.layers.Conv2D(filters=64, kernel_size=(3, 3), activation='relu'))
# Adding the fourth layer with maxpooling
model.add(keras.layers.MaxPooling2D(pool_size=(2, 2)))

# Adding the flatten layer
model.add(keras.layers.Flatten())
# Adding the first dense layer
# units = 128 is the number of neurons
model.add(keras.layers.Dense(units=128, activation='relu'))
model.add(keras.layers.Dropout(rate=0.5))



model.add(keras.layers.Dense(units=64, activation='relu'))
model.add(keras.layers.Dropout(rate=0.5))


# Add the second dense layer (output layer)
model.add(keras.layers.Dense(units=len(words), activation='softmax'))

In [445]:
x_train = np.expand_dims(x_train, axis=-1)
x_test = np.expand_dims(x_test, axis=-1)

In [446]:
model.summary()

Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_14 (Conv2D)          (None, 18, 124, 32)       320       
                                                                 
 max_pooling2d_14 (MaxPoolin  (None, 9, 62, 32)        0         
 g2D)                                                            
                                                                 
 conv2d_15 (Conv2D)          (None, 7, 60, 64)         18496     
                                                                 
 max_pooling2d_15 (MaxPoolin  (None, 3, 30, 64)        0         
 g2D)                                                            
                                                                 
 flatten_7 (Flatten)         (None, 5760)              0         
                                                                 
 dense_20 (Dense)            (None, 128)              

In [447]:
x_train[0].shape, x_train[1].shape, x_train[2].shape


((20, 126, 1), (20, 126, 1), (20, 126, 1))

In [448]:
# y_train = utils.to_categorical(y_train, num_classes=len(label_encoder.classes_))
# y_test = utils.to_categorical(y_test, num_classes=len(label_encoder.classes_))

In [449]:
labels

array([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.,

In [450]:
np.expand_dims(x_train, axis=-1).shape, y_train.shape, np.expand_dims(x_test, axis=-1).shape, y_test.shape

((22, 20, 126, 1, 1), (22,), (6, 20, 126, 1, 1), (6,))

In [451]:
y_train.shape, y_test.shape

((22,), (6,))

In [452]:
x_train.shape

(22, 20, 126, 1)

In [453]:
np.expand_dims(x_train, axis=-1).shape

(22, 20, 126, 1, 1)

In [454]:
y_train.shape

(22,)

In [455]:
y_train

array([ 9,  4,  3,  6,  6,  0,  1,  1,  1,  8,  4,  1,  0,  0, 10,  0,  4,
        2,  4,  7, 11,  2])

In [456]:
y_test

array([3, 0, 2, 4, 0, 5])

In [457]:
x_train.shape

(22, 20, 126, 1)

In [458]:
y_train.shape

(22,)

In [459]:
x_train.shape

(22, 20, 126, 1)

In [460]:
y_train.shape

(22,)

In [461]:
y_train

array([ 9,  4,  3,  6,  6,  0,  1,  1,  1,  8,  4,  1,  0,  0, 10,  0,  4,
        2,  4,  7, 11,  2])

In [462]:
y_train = utils.to_categorical(y_train, num_classes=len(label_encoder.classes_))
y_test = utils.to_categorical(y_test, num_classes=len(label_encoder.classes_))


In [463]:
y_train

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.,

In [464]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(x_train, y_train, epochs=10, batch_size=32, validation_data=(x_test, y_test))

Epoch 1/10


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x16c633083d0>

In [465]:
y_pred = model.predict(np.expand_dims(x_test, axis=-1))



In [466]:
# import  accuracy score, matrix
from sklearn.metrics import accuracy_score, confusion_matrix
accuracy_score(y_test, y_pred.round())
confusion_matrix(y_test.argmax(axis=1), y_pred.argmax(axis=1))



array([[1, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 1],
       [0, 0, 1, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0]], dtype=int64)

In [467]:
y_test

array([[0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.]], dtype=float32)

In [468]:
y_test.argmax(axis=1), y_pred.argmax(axis=1)

(array([3, 0, 2, 4, 0, 5], dtype=int64),
 array([6, 1, 0, 2, 0, 2], dtype=int64))

In [469]:
new_sound_file = "baral.wav"
new_sound_data, sample_rate = librosa.load(new_sound_file, sr=None)
new_sound_data = librosa.feature.mfcc(y=new_sound_data, sr=sample_rate)


In [470]:
max_length = max(new_sound_data.shape[1], max_length)
padding = np.zeros((new_sound_data.shape[0], max_length - new_sound_data.shape[1]))
padded_new_sound_data = np.concatenate([new_sound_data, padding], axis=1)

In [471]:
padded_new_sound_data = np.expand_dims(padded_new_sound_data, axis=-1)
padded_new_sound_data = np.expand_dims(padded_new_sound_data, axis=0)


In [472]:
prediction = model.predict(padded_new_sound_data)



In [473]:
words

['Naam',
 'Saurabh',
 'Baral',
 'ho',
 'Miro',
 'tapai',
 'Ko',
 'tapay',
 'xa',
 'I',
 'love',
 'you']

In [474]:
np.argmax(prediction)

0

In [475]:
words[np.argmax(prediction)]

'Naam'