In [170]:
import os
import numpy as np
import librosa
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras import models, layers, utils
# import Sequential
from tensorflow.keras import Sequential
import sounddevice as sd


In [171]:
# Step 1: Data Collection
sound_folder = "sound"
words = []
sound_data = []

In [172]:
# Iterate over sound files in the folder
for filename in os.listdir(sound_folder):
    file_path = os.path.join(sound_folder, filename)
    word = filename.split(".")[0]  # Extract the word from the filename
    words.append(word)
    audio_data, sample_rate = librosa.load(file_path, sr=None)  # Load audio file
    sound_data.append(audio_data)

In [173]:
# Step 2: Preprocessing
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(words)
sound_data = [librosa.feature.mfcc(y=data, sr=sample_rate) for data in sound_data]

In [174]:
len(sound_data)

4

In [175]:
sound_data

[array([[-6.2614807e+02, -6.2614807e+02, -6.2614807e+02, ...,
         -3.4769238e+02, -3.5615570e+02, -4.2055841e+02],
        [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
          1.7945123e+02,  1.7194107e+02,  1.4836133e+02],
        [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
         -2.2997681e+01, -1.9970545e+01, -8.0884438e+00],
        ...,
        [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
         -8.3571301e+00, -6.0509596e+00, -6.1979780e+00],
        [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
         -1.1280119e+01, -3.8040481e+00,  3.1977230e-01],
        [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
         -3.9512634e-01,  5.2454481e+00,  5.4574995e+00]], dtype=float32),
 array([[-5.7717633e+02, -5.7717633e+02, -5.7717633e+02, ...,
         -3.6652533e+02, -4.3389639e+02, -5.7697833e+02],
        [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
          1.4595978e+02,  1.1842316e+02,  2.7988636e-01],
        [ 

In [176]:
sound_data[0]

array([[-6.2614807e+02, -6.2614807e+02, -6.2614807e+02, ...,
        -3.4769238e+02, -3.5615570e+02, -4.2055841e+02],
       [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
         1.7945123e+02,  1.7194107e+02,  1.4836133e+02],
       [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
        -2.2997681e+01, -1.9970545e+01, -8.0884438e+00],
       ...,
       [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
        -8.3571301e+00, -6.0509596e+00, -6.1979780e+00],
       [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
        -1.1280119e+01, -3.8040481e+00,  3.1977230e-01],
       [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
        -3.9512634e-01,  5.2454481e+00,  5.4574995e+00]], dtype=float32)

In [177]:
sound_data[0].shape

(20, 77)

In [178]:
# Split the dataset into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(sound_data, labels, test_size=0.2, random_state=42)

In [179]:
y_train

array([3, 0, 2], dtype=int64)

In [188]:
x_train[0].shape[1]

87

In [189]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Determine the maximum length of the arrays in x_train
max_length = max(data.shape[1] for data in x_train)

# Pad the sequences to the maximum length
padded_x_train = pad_sequences(x_train, maxlen=max_length, padding='post', truncating='post', dtype='float32')

# Convert padded_x_train to a NumPy array
padded_x_train = np.array(padded_x_train)

ValueError: Shape of sample (77,) of sequence at position 1 is different from expected shape (87,)

In [180]:
# Convert data to numpy arrays
X_train = np.array(x_train)
X_test = np.array(x_test)

  X_train = np.array(x_train)


ValueError: could not broadcast input array from shape (20,87) into shape (20,)

In [None]:
y_train = utils.to_categorical(y_train, num_classes=len(label_encoder.classes_))
y_test = utils.to_categorical(y_test, num_classes=len(label_encoder.classes_))

In [None]:
y_train, y_test

(array([[0., 0., 0., 1.],
        [1., 0., 0., 0.],
        [0., 0., 1., 0.]], dtype=float32),
 array([[0., 1., 0., 0.]], dtype=float32))

In [None]:
model = models.Sequential()
model.add(layers.Conv2D(32, (3, 3), activation='relu', input_shape=(x_train[0].shape[0], x_train[0].shape[1], 1)))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Flatten())
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(len(label_encoder.classes_), activation='softmax'))

In [None]:
model.summary()

Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_1 (Conv2D)           (None, 18, 85, 32)        320       
                                                                 
 max_pooling2d_1 (MaxPooling  (None, 9, 42, 32)        0         
 2D)                                                             
                                                                 
 flatten_1 (Flatten)         (None, 12096)             0         
                                                                 
 dense_2 (Dense)             (None, 64)                774208    
                                                                 
 dense_3 (Dense)             (None, 4)                 260       
                                                                 
Total params: 774,788
Trainable params: 774,788
Non-trainable params: 0
________________________________________________

In [None]:
x_train[0].shape, x_train[1].shape, x_train[2].shape

((20, 87), (20, 77), (20, 433))

In [None]:
x_train = np.stack(x_train)

ValueError: all input arrays must have the same shape