<a href="https://colab.research.google.com/github/xiangj1/E2E/blob/master/train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf
import keras.backend as K
from keras.models import Model
from keras.layers import *
from keras.regularizers import l2
from keras.initializers import random_normal
from keras.utils.conv_utils import conv_output_length
from keras.layers import GaussianNoise

from keras.activations import *

In [None]:
def clipped_relu(x):
    """ReLU activation with its output capped at 20.

    Delegates to the `relu` activation already in scope, passing the cap
    through its `max_value` argument.
    """
    ceiling = 20
    return relu(x, max_value=ceiling)

In [None]:
def ctc_lambda_func(args):
    """Evaluate the batch CTC cost; intended as the body of a Lambda layer.

    `args` is a 4-item sequence, in this order:
    (y_pred, labels, input_length, label_length).

    Input requirements quoted from TF:
    1. sequence_length(b) <= time for all b
    2. max(labels.indices(labels.indices[:, 1] == b, 2)) <= sequence_length(b) for all b.

    The shape of every input is printed each time the function is called.
    The original author's notes record example shapes of (?, 778, 30) for
    y_pred, (?, 80) for labels and (?, 1) for both length tensors.

    Returns whatever K.ctc_batch_cost produces for these inputs.
    """
    predictions, targets, prediction_lengths, target_lengths = args

    # Function-local import kept on purpose: the original flags it as a
    # workaround ("hack") needed for load_model.
    import tensorflow as tf

    print("CTC lambda inputs / shape")
    described = (
        ("y_pred:", predictions),
        ("labels:", targets),
        ("input_length:", prediction_lengths),
        ("label_length:", target_lengths),
    )
    for caption, tensor in described:
        print(caption, tensor.shape)

    # ctc_batch_cost takes the labels first, then the predictions.
    return K.ctc_batch_cost(targets, predictions, prediction_lengths, target_lengths)

In [None]:
def ds2_gru_model(input_dim=161, fc_size=1024, rnn_size=512, output_dim=29, initialization='glorot_uniform',
                  conv_layers=1, gru_layers=1, use_conv=True):
    """ DeepSpeech 2 implementation with the CTC loss built into the graph.

    Architecture:
        Input Spectrogram TIMEx`input_dim`
        1 Batch Normalisation layer on input
        `conv_layers` Convolutional Layers (kernel 11, stride 2), or
            `conv_layers` time-distributed Dense layers when use_conv=False
        1 Batch Normalisation layer
        `gru_layers` BiDirectional GRU Layers (both directions summed)
        1 Batch Normalisation layer
        1 Fully connected Dense (clipped ReLU)
        1 Softmax output
        1 Lambda layer computing the CTC cost

    Details:
       - Uses Spectrogram as input rather than MFCC
       - Network does not dynamically adapt to maximum audio size in the first
         convolutional layer: the time axis is zero-padded with 4096 extra
         frames at the end before the convolutions. Pass use_conv=False to
         avoid the padding.

    Args:
        input_dim: number of features per input frame.
        fc_size: width of the convolutional / dense layers.
        rnn_size: number of units in each GRU.
        output_dim: size of the softmax output.
        initialization: kernel initializer handed to the GRU layers.
        conv_layers: how many Conv1D (or, with use_conv=False, Dense) layers
            are stacked in front of the recurrent layers.
        gru_layers: how many bidirectional GRU layers are stacked.
        use_conv: choose the convolutional front end (True) or the
            time-distributed Dense front end (False).

    Returns:
        A Model whose inputs are [the_input, the_labels, input_length,
        label_length] and whose single output is the 'ctc' Lambda layer.

    Side effects:
        Calls K.set_learning_phase(1), which is a backend-wide setting.

    Reference:
        https://arxiv.org/abs/1512.02595
    """

    K.set_learning_phase(1)

    input_data = Input(shape=(None, input_dim), name='the_input')
    x = BatchNormalization(axis=-1, momentum=0.99,
                           epsilon=1e-3, center=True, scale=True)(input_data)

    if use_conv:
        x = ZeroPadding1D(padding=(0, 4096))(x)
        for layer_idx in range(conv_layers):
            # Feed each convolution with the previous layer's output so that
            # conv_layers > 1 really stacks them. Previously every Conv1D read
            # the same padded input and all but the last one were discarded.
            x = Conv1D(filters=fc_size, name='conv_{}'.format(layer_idx + 1),
                       kernel_size=11, padding='valid', activation='relu',
                       strides=2)(x)
    else:
        for layer_idx in range(conv_layers):
            x = TimeDistributed(Dense(fc_size, name='fc_{}'.format(
                layer_idx + 1), activation='relu'))(x)  # >>(?, time, fc_size)

    x = BatchNormalization(axis=-1, momentum=0.99,
                           epsilon=1e-3, center=True, scale=True)(x)

    for layer_idx in range(gru_layers):
        # NOTE(review): the GRU layers carry the 'fc_N' name prefix, same as
        # the Dense layers above; left untouched so existing layer/weight
        # names do not change.
        x = Bidirectional(GRU(rnn_size, name='fc_{}'.format(layer_idx + 1), return_sequences=True,
                              activation='relu', kernel_initializer=initialization),
                          merge_mode='sum')(x)

    x = BatchNormalization(axis=-1, momentum=0.99,
                           epsilon=1e-3, center=True, scale=True)(x)

    # Last Layer 5+6 Time Dist Dense Layer & Softmax
    x = TimeDistributed(Dense(fc_size, activation=clipped_relu))(x)
    y_pred = TimeDistributed(
        Dense(output_dim, name="y_pred", activation="softmax"))(x)

    # Extra inputs consumed only by the CTC loss.
    labels = Input(name='the_labels', shape=[None, ], dtype='int32')
    input_length = Input(name='input_length', shape=[1], dtype='int32')
    label_length = Input(name='label_length', shape=[1], dtype='int32')

    # Keras doesn't currently support loss funcs with extra parameters
    # so CTC loss is implemented in a lambda layer
    loss_out = Lambda(ctc_lambda_func, output_shape=(1,), name='ctc')([y_pred,
                                                                       labels,
                                                                       input_length,
                                                                       label_length])

    model = Model(inputs=[input_data, labels, input_length,
                          label_length], outputs=loss_out)

    return model


In [None]:
# Build the network with every hyper-parameter left at its default value.
# Constructing the 'ctc' Lambda layer runs ctc_lambda_func, which prints the
# input shapes shown in this cell's output.
model = ds2_gru_model()

CTC lambda inputs / shape
y_pred: (None, None, 29)
labels: (None, None)
input_length: (None, 1)
label_length: (None, 1)


In [None]:
# Display the assembled model's configuration dictionary (its input layers
# and the per-layer settings).
model.get_config()

{'input_layers': [['the_input', 0, 0],
  ['the_labels', 0, 0],
  ['input_length', 0, 0],
  ['label_length', 0, 0]],
 'layers': [{'class_name': 'InputLayer',
   'config': {'batch_input_shape': (None, None, 161),
    'dtype': 'float32',
    'name': 'the_input',
    'ragged': False,
    'sparse': False},
   'inbound_nodes': [],
   'name': 'the_input'},
  {'class_name': 'BatchNormalization',
   'config': {'axis': ListWrapper([2]),
    'beta_constraint': None,
    'beta_initializer': {'class_name': 'Zeros', 'config': {}},
    'beta_regularizer': None,
    'center': True,
    'dtype': 'float32',
    'epsilon': 0.001,
    'gamma_constraint': None,
    'gamma_initializer': {'class_name': 'Ones', 'config': {}},
    'gamma_regularizer': None,
    'momentum': 0.99,
    'moving_mean_initializer': {'class_name': 'Zeros', 'config': {}},
    'moving_variance_initializer': {'class_name': 'Ones', 'config': {}},
    'name': 'batch_normalization_10',
    'scale': True,
    'trainable': True},
   'inbound

In [3]:
# Install the kenlm package into the notebook runtime (no version is pinned).
!pip install kenlm

Collecting kenlm
[?25l  Downloading https://files.pythonhosted.org/packages/57/54/0cc492b8d7aceb17a9164c6e6b9c9afc2c73706bb39324e8f6fa02f7134a/kenlm-0.tar.gz (1.4MB)
[K     |████████████████████████████████| 1.5MB 4.1MB/s 
[?25hBuilding wheels for collected packages: kenlm
  Building wheel for kenlm (setup.py) ... [?25l[?25hdone
  Created wheel for kenlm: filename=kenlm-0.0.0-cp36-cp36m-linux_x86_64.whl size=2277862 sha256=5cd4ce4ddb014c38daca31ac50ff7c43fc571bc4758e7634cfee5d75d44a8407
  Stored in directory: /root/.cache/pip/wheels/e9/cf/f4/1a1aab56f87f4132667a7a47045a750384f19d646099ab4858
Successfully built kenlm
Installing collected packages: kenlm
Successfully installed kenlm-0.0.0
