In [1]:
import os

import keras
import librosa
import numpy as np

# Root folder of the training audio; get_labels() lists its entries as the class labels.
# NOTE(review): absolute, machine-specific Windows path - adjust before running elsewhere.
DATA_PATH = 'E:\\Datasets\\Speech\\train\\audio\\'

Using TensorFlow backend.


# Process audio files

In [None]:
from tqdm import tqdm

def wav2mfcc(file_path, max_len=11, decimation=3):
    """Load an audio file and return its MFCC matrix with a fixed number of frames.

    The waveform is loaded at its native sampling rate, thinned by keeping
    every ``decimation``-th sample, converted to MFCCs, and finally zero-padded
    or truncated along the frame axis so the result always has ``max_len``
    columns.

    Args:
        file_path: Path of the audio file, handed to ``librosa.load``.
        max_len: Number of frames (columns) in the returned matrix.
        decimation: Keep one sample out of this many before computing the
            MFCCs. The default of 3 is the value that used to be hard-coded.

    Returns:
        A 2-D array whose second dimension is exactly ``max_len``; the first
        dimension is whatever ``librosa.feature.mfcc`` produces by default.
    """
    wave, sr = librosa.load(file_path, mono=True, sr=None)

    # Plain slicing: samples are dropped without any low-pass filtering.
    wave = wave[::decimation]

    # After thinning, the signal's real sampling rate is sr / decimation.
    # Use that instead of a hard-coded 16000 so the mel filter bank is laid
    # out for the frequencies actually present. `y` is passed by keyword
    # because recent librosa releases reject it as a positional argument.
    mfcc = librosa.feature.mfcc(y=wave, sr=sr / decimation)

    n_frames = mfcc.shape[1]
    if n_frames < max_len:
        # Too short: append zero-valued frames on the right.
        mfcc = np.pad(mfcc, pad_width=((0, 0), (0, max_len - n_frames)), mode='constant')
    else:
        # Long enough: keep only the first max_len frames.
        mfcc = mfcc[:, :max_len]

    return mfcc

def save_data_to_array(path=DATA_PATH, max_len=11):
    """Compute MFCCs for every label folder under ``path`` and cache them on disk.

    For each label returned by ``get_labels(path)``, every wav file in that
    label's folder is converted with ``wav2mfcc`` and the stacked result is
    saved as ``<label>.npy`` in the relative output folder used throughout
    this notebook.

    Args:
        path: Root folder that contains one sub-folder per label.
        max_len: Number of MFCC frames kept per file (forwarded to ``wav2mfcc``).
    """
    labels, _, _ = get_labels(path)

    for label in labels:
        label_dir = os.path.join(path, label)

        # Sorted so the row order inside each saved array is reproducible;
        # anything that is not a wav file is ignored.
        wavfiles = sorted(
            os.path.join(label_dir, name)
            for name in os.listdir(label_dir)
            if name.lower().endswith('.wav')
        )

        mfcc_vectors = [
            wav2mfcc(wavfile, max_len=max_len)
            for wavfile in tqdm(wavfiles, "Saving vectors of label - '{}'".format(label))
        ]

        # Same file name the loading code expects; create its folder on first use.
        out_file = 'data\\speech\\' + label + '.npy'
        out_dir = os.path.dirname(out_file)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

        np.save(out_file, np.array(mfcc_vectors))
        print('saved data : ', label)
        
# Runs the feature extraction for every label and writes one .npy file per label.
# NOTE: get_labels is defined in the following cell; on a fresh kernel run that cell first.
save_data_to_array()

In [12]:
import numpy as np
import os
from keras.utils import to_categorical

def get_labels(path=DATA_PATH):
    """Return the class labels found under ``path`` with index and one-hot encodings.

    Every sub-directory of ``path`` is one label. The names are sorted so the
    label-to-index mapping is the same on every call and every platform
    (``os.listdir`` gives no ordering guarantee), and plain files are ignored.

    Args:
        path: Folder that contains one sub-folder per label.

    Returns:
        A tuple ``(labels, label_indices, one_hot)``: the sorted label names,
        ``np.arange(len(labels))``, and the matching one-hot matrix produced
        by ``to_categorical``.

    Raises:
        ValueError: If ``path`` contains no sub-directories.
    """
    labels = sorted(
        entry for entry in os.listdir(path)
        if os.path.isdir(os.path.join(path, entry))
    )
    if not labels:
        raise ValueError("no label directories found in {!r}".format(path))

    label_indices = np.arange(0, len(labels))
    return labels, label_indices, to_categorical(label_indices)

In [13]:
# Display the labels together with their integer indices and one-hot rows.
get_labels()

(['bed',
  'bird',
  'cat',
  'dog',
  'down',
  'eight',
  'five',
  'four',
  'go',
  'happy',
  'house',
  'left',
  'marvin',
  'nine',
  'no',
  'off',
  'on',
  'one',
  'right',
  'seven',
  'sheila',
  'six',
  'stop',
  'three',
  'tree',
  'two',
  'up',
  'wow',
  'yes',
  'zero'],
 array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]),
 array([[ 1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.],
        [ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.],
        [ 0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  1.,  0., 

In [14]:
from sklearn.model_selection import train_test_split

def get_train_test(split_ratio=0.9, random_state=42):
    """Load the cached per-label MFCC arrays and split them into train and test sets.

    One ``.npy`` file is read per label returned by ``get_labels()``. The
    arrays are joined along the first axis and each row is given its label's
    position in that list as target value.

    Args:
        split_ratio: Fraction of the samples that goes to the training set.
        random_state: Seed forwarded to ``train_test_split``.

    Returns:
        Whatever ``train_test_split`` returns for ``(X, y)``; the notebook
        unpacks it as ``train_x, test_x, train_y, test_y``.
    """
    # get available class labels
    labels, _, _ = get_labels()

    # Collect every class first and join once at the end, instead of
    # re-copying the growing arrays on each iteration.
    features, targets = [], []
    for class_index, label in enumerate(labels):
        class_features = np.load('data\\speech\\' + label + '.npy')
        features.append(class_features)
        # float64 targets, as produced by the original np.zeros / np.append code
        targets.append(np.full(class_features.shape[0], fill_value=class_index, dtype=np.float64))

    X = np.concatenate(features, axis=0)
    y = np.concatenate(targets)

    assert X.shape[0] == len(y)

    return train_test_split(X, y, test_size=(1 - split_ratio), random_state=random_state, shuffle=True)


In [15]:
# Load the cached features and split them into train / test partitions.
train_x, test_x, train_y, test_y = get_train_test()

# Print the shape of each partition, one per line.
for _part in (train_x, test_x, train_y, test_y):
    print(_part.shape)

(58248, 20, 11)
(6473, 20, 11)
(58248,)
(6473,)


In [16]:
# Add a trailing channel axis so the arrays match the Conv2D input layout.
# Only the first three dimensions are read, so re-running this cell after the
# arrays already carry the channel axis is harmless.
x, y, z = train_x.shape[:3]
train_x = train_x.reshape(x, y, z, 1)
test_x = test_x.reshape(test_x.shape[0], y, z, 1)

# One-hot encode with an explicit class count so both matrices get the same
# width even if the highest class id is missing from one of the splits.
num_classes = int(max(train_y.max(), test_y.max())) + 1
train_y_categorical = to_categorical(train_y, num_classes=num_classes)
test_y_categorical = to_categorical(test_y, num_classes=num_classes)

print(train_x.shape)
print(test_x.shape)
print(train_y_categorical.shape)
print(test_y_categorical.shape)

(58248, 20, 11, 1)
(6473, 20, 11, 1)
(58248, 30)
(6473, 30)


# Build the network

In [17]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPooling2D

# Small convolutional classifier: one conv + pooling stage, a 128-unit dense
# layer, and a 30-way softmax output, with dropout after pooling and after
# the dense layer.
model = Sequential([
    Conv2D(32, kernel_size=(2, 2), activation='relu', input_shape=(20, 11, 1)),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.25),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.25),
    Dense(30, activation='softmax'),
])

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1 (Conv2D)            (None, 19, 10, 32)        160       
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 9, 5, 32)          0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 9, 5, 32)          0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 1440)              0         
_________________________________________________________________
dense_2 (Dense)              (None, 128)               184448    
_________________________________________________________________
dropout_2 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 30)                3870      
Total para

In [None]:
# Configure and train the CNN.
# NOTE(review): the test split doubles as validation data here, so the
# reported validation metrics are not an independent hold-out estimate.
model.compile(
    loss=keras.losses.categorical_crossentropy,
    optimizer=keras.optimizers.Adadelta(),
    metrics=['categorical_accuracy'],
)
model.fit(
    x=train_x,
    y=train_y_categorical,
    batch_size=100,
    epochs=200,
    validation_data=(test_x, test_y_categorical),
)

# Recurrent model (GRU)

In [18]:
# Reload the split so train_x / test_x are 3-D again (the CNN section had
# added a trailing channel axis to them).
train_x, test_x, train_y, test_y = get_train_test()

# Print the shape of each partition, one per line.
for _part in (train_x, test_x, train_y, test_y):
    print(_part.shape)

(58248, 20, 11)
(6473, 20, 11)
(58248,)
(6473,)


In [19]:
from keras import layers
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPooling2D

# Single GRU layer followed by a 30-way softmax classifier.
# NOTE(review): the input arrays are (samples, 20, 11), and wav2mfcc pads or
# truncates the axis of length 11. With input_shape=(None, 11) the GRU
# therefore appears to step over the 20-long axis and read 11 values per
# step - TODO confirm whether transposing to (samples, 11, 20) was intended.
model = Sequential([
    layers.GRU(128, activation='relu', input_shape=(None, 11)),
    layers.Dense(30, activation='softmax'),
])

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gru_2 (GRU)                  (None, 128)               53760     
_________________________________________________________________
dense_4 (Dense)              (None, 30)                3870      
Total params: 57,630
Trainable params: 57,630
Non-trainable params: 0
_________________________________________________________________


In [22]:
# Configure and train the GRU model.
# train_y_categorical / test_y_categorical are the one-hot targets built in
# the CNN section; they line up with this split because get_train_test uses
# the same fixed random_state on every call.
model.compile(
    loss=keras.losses.categorical_crossentropy,
    optimizer=keras.optimizers.Adam(lr=5e-4),
    metrics=['categorical_accuracy'],
)
model.fit(
    x=train_x,
    y=train_y_categorical,
    batch_size=100,
    epochs=50,
    validation_data=(test_x, test_y_categorical),
)

Train on 58248 samples, validate on 6473 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50


Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x22181366908>