In [None]:
import os
import warnings
import sklearn.decomposition

with warnings.catch_warnings():
    # Suppress TF and Keras warnings when importing
    warnings.simplefilter("ignore")
    from keras.models import Model
    from keras.layers import (
        Input, Conv2D, BatchNormalization, MaxPooling2D,
        Flatten, Activation, Lambda
    )
    import keras.regularizers as regularizers
    from kapre.time_frequency import Spectrogram, Melspectrogram

In [None]:
def _conv_bn_relu(x, n_filters, weight_decay, kernel_size=(3, 3)):
    """
    Apply one Conv2D -> BatchNormalization -> ReLU stage to a tensor.

    Parameters
    ----------
    x : tensor
        Input Keras tensor.
    n_filters : int
        Number of convolution filters.
    weight_decay : float
        L2 regularization factor applied to the convolution kernel.
    kernel_size : tuple of int
        Convolution kernel size.

    Returns
    -------
    tensor
        Output of the ReLU activation.
    """
    # A fresh regularizer instance is created for every convolution, exactly
    # as in the hand-unrolled version of this network.
    x = Conv2D(n_filters, kernel_size, padding='same',
               kernel_initializer='he_normal',
               kernel_regularizer=regularizers.l2(weight_decay))(x)
    x = BatchNormalization()(x)
    return Activation('relu')(x)


def _construct_mel128_audio_network(pool_size=(4, 8)):
    """
    Returns an uninitialized model object for an audio network with a Mel
    spectrogram input (with 128 frequency bins).

    The network takes one second of 48 kHz mono audio, shaped (1, 48000),
    converts it to a 128-band decibel Mel spectrogram inside the graph, and
    passes it through four convolutional blocks (64, 128, 256 and 512
    filters). The last convolution is named ``'audio_embedding_layer'``; its
    output is max-pooled and flattened to form the model output.

    Parameters
    ----------
    pool_size : tuple of int
        Pool size of the final max-pooling layer, applied to the
        16 x 24 x 512 output of ``'audio_embedding_layer'``. The default
        ``(4, 8)`` reproduces the original behavior and leaves a
        4 x 3 x 512 map (6144 values once flattened). ``(16, 24)`` pools over
        the whole map, leaving 512 values.

    Returns
    -------
    model : keras.models.Model
        Model object.
    """

    weight_decay = 1e-5
    n_dft = 2048
    n_mels = 128
    n_hop = 242
    asr = 48000
    audio_window_dur = 1

    # INPUT
    x_a = Input(shape=(1, asr * audio_window_dur), dtype='float32')

    # MELSPECTROGRAM PREPROCESSING
    # 128 x 199 x 1
    y_a = Melspectrogram(n_dft=n_dft, n_hop=n_hop, n_mels=n_mels,
                         sr=asr, power_melgram=1.0, htk=True,
                         return_decibel_melgram=True, padding='same')(x_a)
    y_a = BatchNormalization()(y_a)

    # CONV BLOCKS 1-3
    # Each block is two conv/BN/ReLU stages followed by a 2x2 max-pool with
    # stride 2. Layers are created in the same order as before so that the
    # auto-generated Keras layer names do not change.
    for n_filters in (64, 128, 256):
        y_a = _conv_bn_relu(y_a, n_filters, weight_decay)
        y_a = _conv_bn_relu(y_a, n_filters, weight_decay)
        y_a = MaxPooling2D(pool_size=(2, 2), strides=2)(y_a)

    # CONV BLOCK 4
    # The second convolution is the named embedding layer; it is deliberately
    # NOT followed by batch normalization or an activation.
    n_filter_a_4 = 512
    filt_size_a_4 = (3, 3)
    y_a = _conv_bn_relu(y_a, n_filter_a_4, weight_decay,
                        kernel_size=filt_size_a_4)
    y_a = Conv2D(n_filter_a_4, filt_size_a_4,
                 kernel_initializer='he_normal',
                 name='audio_embedding_layer', padding='same',
                 kernel_regularizer=regularizers.l2(weight_decay))(y_a)

    # EMBEDDING POOLING
    # Strides default to the pool size, so the pool windows do not overlap.
    y_a = MaxPooling2D(pool_size=pool_size, padding='same')(y_a)
    y_a = Flatten()(y_a)

    return Model(inputs=x_a, outputs=y_a)

In [None]:
# Instantiate the Mel-spectrogram (128-band) audio network.
model = _construct_mel128_audio_network()

In [None]:
# Print the layer-by-layer summary (output shapes and parameter counts).
model.summary()

In [None]:
################# Output #################
# Model: "model_2"
# _________________________________________________________________
# Layer (type)                 Output Shape              Param #   
# =================================================================
# input_2 (InputLayer)         (None, 1, 48000)          0         
# _________________________________________________________________
# melspectrogram_2 (Melspectro (None, 128, 199, 1)       4329600   
# _________________________________________________________________
# batch_normalization_9 (Batch (None, 128, 199, 1)       4         
# _________________________________________________________________
# conv2d_8 (Conv2D)            (None, 128, 199, 64)      640       
# _________________________________________________________________
# batch_normalization_10 (Batc (None, 128, 199, 64)      256       
# _________________________________________________________________
# activation_8 (Activation)    (None, 128, 199, 64)      0         
# _________________________________________________________________
# conv2d_9 (Conv2D)            (None, 128, 199, 64)      36928     
# _________________________________________________________________
# batch_normalization_11 (Batc (None, 128, 199, 64)      256       
# _________________________________________________________________
# activation_9 (Activation)    (None, 128, 199, 64)      0         
# _________________________________________________________________
# max_pooling2d_4 (MaxPooling2 (None, 64, 99, 64)        0         
# _________________________________________________________________
# conv2d_10 (Conv2D)           (None, 64, 99, 128)       73856     
# _________________________________________________________________
# batch_normalization_12 (Batc (None, 64, 99, 128)       512       
# _________________________________________________________________
# activation_10 (Activation)   (None, 64, 99, 128)       0         
# _________________________________________________________________
# conv2d_11 (Conv2D)           (None, 64, 99, 128)       147584    
# _________________________________________________________________
# batch_normalization_13 (Batc (None, 64, 99, 128)       512       
# _________________________________________________________________
# activation_11 (Activation)   (None, 64, 99, 128)       0         
# _________________________________________________________________
# max_pooling2d_5 (MaxPooling2 (None, 32, 49, 128)       0         
# _________________________________________________________________
# conv2d_12 (Conv2D)           (None, 32, 49, 256)       295168    
# _________________________________________________________________
# batch_normalization_14 (Batc (None, 32, 49, 256)       1024      
# _________________________________________________________________
# activation_12 (Activation)   (None, 32, 49, 256)       0         
# _________________________________________________________________
# conv2d_13 (Conv2D)           (None, 32, 49, 256)       590080    
# _________________________________________________________________
# batch_normalization_15 (Batc (None, 32, 49, 256)       1024      
# _________________________________________________________________
# activation_13 (Activation)   (None, 32, 49, 256)       0         
# _________________________________________________________________
# max_pooling2d_6 (MaxPooling2 (None, 16, 24, 256)       0         
# _________________________________________________________________
# conv2d_14 (Conv2D)           (None, 16, 24, 512)       1180160   
# _________________________________________________________________
# batch_normalization_16 (Batc (None, 16, 24, 512)       2048      