# Training

In [3]:
import json
import numpy as np
import tensorflow.keras as keras # Keras is a high-level API to build and train deep learning models.
from sklearn.model_selection import train_test_split

# Input dataset (JSON) and destination file for the trained model.
DATA_PATH = "speech_commands_v0.01_data.json"
SAVED_MODEL_PATH = "model.h5"

# Training hyperparameters.
LEARNING_RATE = 0.0001
EPOCHS = 40
BATCH_SIZE = 32
# Backward-compatible alias: the constant was originally misspelled "BATH_SIZE".
BATH_SIZE = BATCH_SIZE

# Number of units in the final softmax layer (one per target class).
NUM_KEYWORDS = 14

In [1]:
def load_dataset(data_path):
    """Read a JSON dataset file and return its inputs and targets.

    Args:
        data_path: Path of the JSON file. The parsed object must provide the
            keys "MFCCs" (inputs) and "labels" (targets).

    Returns:
        A tuple ``(X, y)`` of NumPy arrays built from the "MFCCs" and
        "labels" entries respectively.
    """
    with open(data_path, "r") as json_file:
        dataset = json.load(json_file)

    # Extract features (inputs) and targets as NumPy arrays.
    features, targets = (np.array(dataset[key]) for key in ("MFCCs", "labels"))

    return features, targets
    
    
def get_data_splits(data_path, test_size = 0.1, test_validation = 0.1, random_state=None):
    """Load the dataset and split it into train, validation and test sets.

    Args:
        data_path: Path of the JSON dataset, forwarded to ``load_dataset``.
        test_size: ``test_size`` argument of the first ``train_test_split``
            call, which separates the test set from the full dataset.
        test_validation: ``test_size`` argument of the second
            ``train_test_split`` call. It is applied to the data remaining
            after the test set was removed, not to the full dataset.
        random_state: Optional seed forwarded to both ``train_test_split``
            calls so the splits are reproducible. The default ``None`` keeps
            the original unseeded behaviour.

    Returns:
        ``(X_train, X_validation, X_test, y_train, y_validation, y_test)``.
        Each ``X_*`` array has one extra trailing axis of size 1 compared to
        the loaded inputs.
    """
    # Load inputs and targets.
    X, y = load_dataset(data_path)

    # First carve out the test set, then split the remainder into
    # train/validation.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)
    X_train, X_validation, y_train, y_validation = train_test_split(
        X_train, y_train, test_size=test_validation, random_state=random_state)

    # Append a trailing channel axis of size 1 to every input array; `...`
    # keeps all existing axes. main() reads X_train.shape[1..3], so the
    # result is expected to be 4-D:
    # (# samples, # segments, # coefficients) -> (# samples, # segments, # coefficients, 1)
    X_train = X_train[..., np.newaxis]
    X_validation = X_validation[..., np.newaxis]
    X_test = X_test[..., np.newaxis]

    return X_train, X_validation, X_test, y_train, y_validation, y_test

In [4]:
# Build the CNN model
# A detailed explanation of the individual layers is given in the "Audio DeepLearning" notebook.

def build_model(input_shape, learning_rate, error="sparse_categorical_crossentropy", num_keywords=None):
    """Build and compile the CNN classifier.

    The network is three Conv2D -> BatchNormalization -> MaxPooling2D stages
    followed by Flatten, a 64-unit dense layer with dropout, and a softmax
    output layer.

    Args:
        input_shape: Shape of a single input sample, passed to the first
            Conv2D layer.
        learning_rate: Learning rate for the Adam optimiser.
        error: Loss passed to ``model.compile``.
        num_keywords: Number of units in the softmax output layer. When
            ``None`` (default) the module-level ``NUM_KEYWORDS`` is used, so
            existing callers are unaffected.

    Returns:
        The compiled ``keras.Sequential`` model. As a side effect the model
        summary is printed.
    """
    # Resolve the class count at call time so the module-level constant is
    # still honoured when the caller does not override it.
    if num_keywords is None:
        num_keywords = NUM_KEYWORDS

    # Build network topology.
    # A Sequential model is a linear stack of layers, each feeding the next.
    model = keras.Sequential()

    # conv. layer 1
    # args: number of filters (64), kernel size (3, 3), activation (relu),
    # input shape (shape of one input sample), L2 kernel regulariser
    # (penalises large weights to reduce overfitting).
    model.add(keras.layers.Conv2D(64, (3, 3), activation="relu",
                                  input_shape=input_shape,
                                  kernel_regularizer=keras.regularizers.l2(0.001)))
    # Batch normalisation of the conv. layer output.
    model.add(keras.layers.BatchNormalization())
    # Max pooling downsamples the feature maps produced by the conv. layer.
    model.add(keras.layers.MaxPooling2D((3, 3), strides=(2, 2), padding="same"))

    # conv. layer 2
    model.add(keras.layers.Conv2D(32, (3, 3), activation="relu",
                                  kernel_regularizer=keras.regularizers.l2(0.001)))
    model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.MaxPooling2D((3, 3), strides=(2, 2), padding="same"))

    # conv. layer 3
    model.add(keras.layers.Conv2D(32, (2, 2), activation="relu",
                                  kernel_regularizer=keras.regularizers.l2(0.001)))
    model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.MaxPooling2D((2, 2), strides=(2, 2), padding="same"))

    # Flatten the feature maps and feed them into a dense layer.
    model.add(keras.layers.Flatten())
    model.add(keras.layers.Dense(64, activation="relu"))
    # Dropout randomly zeroes 30% of the dense layer's outputs during
    # training to reduce overfitting.
    model.add(keras.layers.Dropout(0.3))

    # Softmax classifier: outputs one probability per class, e.g.
    # [0.1, 0.7, 0.2]; the predicted class is the argmax (index 1 here).
    model.add(keras.layers.Dense(num_keywords, activation="softmax"))

    # Compile the model.
    optimiser = keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimiser,
                  loss=error, metrics=["accuracy"])

    # Print model overview.
    model.summary()

    return model

In [5]:
def main():
    """Train the CNN on the dataset, evaluate it and save the results.

    Steps: load the train/validation/test splits from ``DATA_PATH``, build
    the model, fit it for ``EPOCHS`` epochs, print the test loss and
    accuracy, save the model to ``SAVED_MODEL_PATH`` and dump the test
    inputs to "test_files.json".
    """
    # Load train/validation/test data splits.
    X_train, X_validation, X_test, y_train, y_validation, y_test = get_data_splits(DATA_PATH)

    # Build the CNN model.
    # The per-sample input shape is axes 1-3 of X_train (axis 0 indexes the
    # samples): (# segments, # coefficients, # channels). The channel axis
    # has size 1, like a grayscale image.
    input_shape = (X_train.shape[1], X_train.shape[2], X_train.shape[3])
    model = build_model(input_shape, LEARNING_RATE)

    # Train the model. Use the configured batch-size constant instead of a
    # hard-coded literal (the module-level constant is spelled BATH_SIZE).
    model.fit(X_train, y_train, validation_data=(X_validation, y_validation),
              batch_size=BATH_SIZE, epochs=EPOCHS)

    # Evaluate the model on the held-out test split.
    test_error, test_accuracy = model.evaluate(X_test, y_test)
    print(f"Test error: {test_error}, test accuracy: {test_accuracy}")

    # Save the model.
    model.save(SAVED_MODEL_PATH)

    # Dump the test-set inputs to JSON.
    # NOTE(review): despite the "path" key and the file name, this stores the
    # X_test feature arrays themselves, not file paths, and y_test is not
    # saved alongside them.
    with open("test_files.json", "w") as fp:
        json.dump({"path": X_test.tolist()}, fp)


if __name__ == "__main__":
    main()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 42, 11, 64)        640       
                                                                 
 batch_normalization (Batch  (None, 42, 11, 64)        256       
 Normalization)                                                  
                                                                 
 max_pooling2d (MaxPooling2  (None, 21, 6, 64)         0         
 D)                                                              
                                                                 
 conv2d_1 (Conv2D)           (None, 19, 4, 32)         18464     
                                                                 
 batch_normalization_1 (Bat  (None, 19, 4, 32)         128       
 chNormalization)                                                
                                                        

  saving_api.save_model(
