From 1a319fbf49df61a38e148b8462be5bf35c80e334 Mon Sep 17 00:00:00 2001
From: dangqingqing <dangqingqing@baidu.com>
Date: Wed, 7 Jun 2017 16:37:13 +0800
Subject: [PATCH 1/3] Support variable input batch and sortagrad.

---
 deep_speech_2/audio_data_utils.py | 56 ++++++++++++++++++++--------
 deep_speech_2/train.py            | 61 +++++++++++--------------------
 2 files changed, 62 insertions(+), 55 deletions(-)

diff --git a/deep_speech_2/audio_data_utils.py b/deep_speech_2/audio_data_utils.py
index c717bcf182..abb7f1e993 100644
--- a/deep_speech_2/audio_data_utils.py
+++ b/deep_speech_2/audio_data_utils.py
@@ -8,6 +8,7 @@
 import random
 import soundfile
 import numpy as np
+import itertools
 import os
 
 RANDOM_SEED = 0
@@ -62,6 +63,7 @@ def __init__(self,
         self.__stride_ms__ = stride_ms
         self.__window_ms__ = window_ms
         self.__max_frequency__ = max_frequency
+        self.__epoc__ = 0
         self.__random__ = random.Random(RANDOM_SEED)
         # load vocabulary (dictionary)
         self.__vocab_dict__, self.__vocab_list__ = \
@@ -245,9 +247,33 @@ def __padding_batch__(self, batch, padding_to=-1, flatten=False):
             new_batch.append((padded_audio, text))
         return new_batch
 
+    def __batch_shuffle__(self, manifest, batch_size):
+        """
+        1. Sort the audio clips by duration.
+        2. Generate a random number `k`, k in [0, batch_size).
+        3. Randomly remove `k` instances in order to make different mini-batches,
+           then make minibatches and each minibatch size is batch_size.
+        4. Shuffle the minibatches.
+
+        :param manifest: manifest file.
+        :type manifest: list
+        :param batch_size: batch size.
+        :type batch_size: int
+        """
+        manifest.sort(key=lambda x: x["duration"])
+        shift_len = self.__random__.randint(0, batch_size - 1)
+        batch_manifest = zip(*[iter(manifest[shift_len:])] * batch_size)
+        self.__random__.shuffle(batch_manifest)
+        batch_manifest = list(sum(batch_manifest, ()))
+        res_len = len(manifest) - shift_len - len(batch_manifest)
+        batch_manifest.extend(manifest[len(manifest) - res_len:])
+        batch_manifest.extend(manifest[0:shift_len])
+        return batch_manifest
+
     def instance_reader_creator(self,
                                 manifest_path,
-                                sort_by_duration=True,
+                                batch_size,
+                                sortagrad=True,
                                 shuffle=False):
         """
         Instance reader creator for audio data. Creat a callable function to
@@ -258,18 +284,14 @@ def instance_reader_creator(self,
 
         :param manifest_path: Filepath of manifest for audio clip files.
         :type manifest_path: basestring
-        :param sort_by_duration: Sort the audio clips by duration if set True
-                                 (for SortaGrad).
-        :type sort_by_duration: bool
+        :param sortagrad: Sort the audio clips by duration in the first epoc
+                          if set True.
+        :type sortagrad: bool
         :param shuffle: Shuffle the audio clips if set True.
         :type shuffle: bool
         :return: Data reader function.
         :rtype: callable
         """
-        if sort_by_duration and shuffle:
-            sort_by_duration = False
-            logger.warn("When shuffle set to true, "
-                        "sort_by_duration is forced to set False.")
 
         def reader():
             # read manifest
@@ -278,16 +300,17 @@ def reader():
                 max_duration=self.__max_duration__,
                 min_duration=self.__min_duration__)
             # sort (by duration) or shuffle manifest
-            if sort_by_duration:
+            if self.__epoc__ == 0 and sortagrad:
                 manifest.sort(key=lambda x: x["duration"])
-            if shuffle:
-                self.__random__.shuffle(manifest)
+            elif shuffle:
+                manifest = self.__batch_shuffle__(manifest, batch_size)
             # extract spectrogram feature
             for instance in manifest:
                 spectrogram = self.__audio_featurize__(
                     instance["audio_filepath"])
                 transcript = self.__text_featurize__(instance["text"])
                 yield (spectrogram, transcript)
+            self.__epoc__ += 1
 
         return reader
 
@@ -296,7 +319,7 @@ def batch_reader_creator(self,
                              batch_size,
                              padding_to=-1,
                              flatten=False,
-                             sort_by_duration=True,
+                             sortagrad=False,
                              shuffle=False):
         """
         Batch data reader creator for audio data. Creat a callable function to
@@ -317,9 +340,9 @@ def batch_reader_creator(self,
         :param flatten: If set True, audio data will be flatten to be a 1-dim
                         ndarray. Otherwise, 2-dim ndarray. Default is False.
         :type flatten: bool
-        :param sort_by_duration: Sort the audio clips by duration if set True
-                                 (for SortaGrad).
-        :type sort_by_duration: bool
+        :param sortagrad: Sort the audio clips by duration in the first epoc
+                          if set True.
+        :type sortagrad: bool
         :param shuffle: Shuffle the audio clips if set True.
         :type shuffle: bool
         :return: Batch reader function, producing batches of data when called.
@@ -329,7 +352,8 @@ def batch_reader_creator(self,
         def batch_reader():
             instance_reader = self.instance_reader_creator(
                 manifest_path=manifest_path,
-                sort_by_duration=sort_by_duration,
+                batch_size=batch_size,
+                sortagrad=sortagrad,
                 shuffle=shuffle)
             batch = []
             for instance in instance_reader():
diff --git a/deep_speech_2/train.py b/deep_speech_2/train.py
index e6a7d076bb..55577b0d87 100644
--- a/deep_speech_2/train.py
+++ b/deep_speech_2/train.py
@@ -85,23 +85,27 @@ def train():
     """
     DeepSpeech2 training.
     """
+
     # initialize data generator
-    data_generator = DataGenerator(
-        vocab_filepath=args.vocab_filepath,
-        normalizer_manifest_path=args.normalizer_manifest_path,
-        normalizer_num_samples=200,
-        max_duration=20.0,
-        min_duration=0.0,
-        stride_ms=10,
-        window_ms=20)
+    def data_generator():
+        return DataGenerator(
+            vocab_filepath=args.vocab_filepath,
+            normalizer_manifest_path=args.normalizer_manifest_path,
+            normalizer_num_samples=200,
+            max_duration=20.0,
+            min_duration=0.0,
+            stride_ms=10,
+            window_ms=20)
 
+    train_generator = data_generator()
+    test_generator = data_generator()
     # create network config
-    dict_size = data_generator.vocabulary_size()
+    dict_size = train_generator.vocabulary_size()
+    # paddle.data_type.dense_array is used for variable batch input.
+    # The size 161 * 161 is only a placeholder value; the real shape
+    # of the input batch data will be set at each batch.
     audio_data = paddle.layer.data(
-        name="audio_spectrogram",
-        height=161,
-        width=2000,
-        type=paddle.data_type.dense_vector(322000))
+        name="audio_spectrogram", type=paddle.data_type.dense_array(161 * 161))
     text_data = paddle.layer.data(
         name="transcript_text",
         type=paddle.data_type.integer_value_sequence(dict_size))
@@ -122,28 +126,16 @@ def train():
         cost=cost, parameters=parameters, update_equation=optimizer)
 
     # prepare data reader
-    train_batch_reader_sortagrad = data_generator.batch_reader_creator(
-        manifest_path=args.train_manifest_path,
-        batch_size=args.batch_size,
-        padding_to=2000,
-        flatten=True,
-        sort_by_duration=True,
-        shuffle=False)
-    train_batch_reader_nosortagrad = data_generator.batch_reader_creator(
+    train_batch_reader = train_generator.batch_reader_creator(
         manifest_path=args.train_manifest_path,
         batch_size=args.batch_size,
-        padding_to=2000,
-        flatten=True,
-        sort_by_duration=False,
+        sortagrad=True,
         shuffle=True)
-    test_batch_reader = data_generator.batch_reader_creator(
+    test_batch_reader = test_generator.batch_reader_creator(
         manifest_path=args.dev_manifest_path,
         batch_size=args.batch_size,
-        padding_to=2000,
-        flatten=True,
-        sort_by_duration=False,
         shuffle=False)
-    feeding = data_generator.data_name_feeding()
+    feeding = train_generator.data_name_feeding()
 
     # create event handler
     def event_handler(event):
@@ -169,17 +161,8 @@ def event_handler(event):
                 time.time() - start_time, event.pass_id, result.cost)
 
     # run train
-    # first pass with sortagrad
-    if args.use_sortagrad:
-        trainer.train(
-            reader=train_batch_reader_sortagrad,
-            event_handler=event_handler,
-            num_passes=1,
-            feeding=feeding)
-        args.num_passes -= 1
-    # other passes without sortagrad
     trainer.train(
-        reader=train_batch_reader_nosortagrad,
+        reader=train_batch_reader,
         event_handler=event_handler,
         num_passes=args.num_passes,
         feeding=feeding)

From 1738700ee6982250cae5454549884b440f8d7d2f Mon Sep 17 00:00:00 2001
From: dangqingqing <dangqingqing@baidu.com>
Date: Mon, 12 Jun 2017 19:06:55 +0800
Subject: [PATCH 2/3] refine audio_data_utils.py

---
 deep_speech_2/audio_data_utils.py | 68 ++++++++++++++-----------------
 1 file changed, 31 insertions(+), 37 deletions(-)

diff --git a/deep_speech_2/audio_data_utils.py b/deep_speech_2/audio_data_utils.py
index abb7f1e993..692a42809f 100644
--- a/deep_speech_2/audio_data_utils.py
+++ b/deep_speech_2/audio_data_utils.py
@@ -247,22 +247,25 @@ def __padding_batch__(self, batch, padding_to=-1, flatten=False):
             new_batch.append((padded_audio, text))
         return new_batch
 
-    def __batch_shuffle__(self, manifest, batch_size):
+    def __batch_shuffle__(self, manifest, batch_shuffle_size):
         """
         1. Sort the audio clips by duration.
-        2. Generate a random number `k`, k in [0, batch_size).
+        2. Generate a random number `k`, k in [0, batch_shuffle_size).
         3. Randomly remove `k` instances in order to make different mini-batches,
-           then make minibatches and each minibatch size is batch_size.
+           then make minibatches and each minibatch size is batch_shuffle_size.
         4. Shuffle the minibatches.
 
         :param manifest: manifest file.
         :type manifest: list
-        :param batch_size: batch size.
-        :type batch_size: int
+        :param batch_shuffle_size: This size is uesed to generate a random number,
+                                   it usually equals to batch size.
+        :type batch_shuffle_size: int
+        :return: batch shuffled mainifest.
+        :rtype: list
         """
         manifest.sort(key=lambda x: x["duration"])
-        shift_len = self.__random__.randint(0, batch_size - 1)
-        batch_manifest = zip(*[iter(manifest[shift_len:])] * batch_size)
+        shift_len = self.__random__.randint(0, batch_shuffle_size - 1)
+        batch_manifest = zip(*[iter(manifest[shift_len:])] * batch_shuffle_size)
         self.__random__.shuffle(batch_manifest)
         batch_manifest = list(sum(batch_manifest, ()))
         res_len = len(manifest) - shift_len - len(batch_manifest)
@@ -270,11 +273,7 @@ def __batch_shuffle__(self, manifest, batch_size):
         batch_manifest.extend(manifest[0:shift_len])
         return batch_manifest
 
-    def instance_reader_creator(self,
-                                manifest_path,
-                                batch_size,
-                                sortagrad=True,
-                                shuffle=False):
+    def instance_reader_creator(self, manifest):
         """
         Instance reader creator for audio data. Creat a callable function to
         produce instances of data.
@@ -282,35 +281,19 @@ def instance_reader_creator(self,
         Instance: a tuple of a numpy ndarray of audio spectrogram and a list of
         tokenized and indexed transcription text.
 
-        :param manifest_path: Filepath of manifest for audio clip files.
-        :type manifest_path: basestring
-        :param sortagrad: Sort the audio clips by duration in the first epoc
-                          if set True.
-        :type sortagrad: bool
-        :param shuffle: Shuffle the audio clips if set True.
-        :type shuffle: bool
+        :param manifest: List of manifest entries for audio clip files.
+        :type manifest: list
         :return: Data reader function.
         :rtype: callable
         """
 
         def reader():
-            # read manifest
-            manifest = self.__read_manifest__(
-                manifest_path=manifest_path,
-                max_duration=self.__max_duration__,
-                min_duration=self.__min_duration__)
-            # sort (by duration) or shuffle manifest
-            if self.__epoc__ == 0 and sortagrad:
-                manifest.sort(key=lambda x: x["duration"])
-            elif shuffle:
-                manifest = self.__batch_shuffle__(manifest, batch_size)
             # extract spectrogram feature
             for instance in manifest:
                 spectrogram = self.__audio_featurize__(
                     instance["audio_filepath"])
                 transcript = self.__text_featurize__(instance["text"])
                 yield (spectrogram, transcript)
-            self.__epoc__ += 1
 
         return reader
 
@@ -320,7 +303,7 @@ def batch_reader_creator(self,
                              padding_to=-1,
                              flatten=False,
                              sortagrad=False,
-                             shuffle=False):
+                             batch_shuffle=False):
         """
         Batch data reader creator for audio data. Creat a callable function to
         produce batches of data.
@@ -343,18 +326,28 @@ def batch_reader_creator(self,
         :param sortagrad: Sort the audio clips by duration in the first epoc
                           if set True.
         :type sortagrad: bool
-        :param shuffle: Shuffle the audio clips if set True.
-        :type shuffle: bool
+        :param batch_shuffle: Shuffle the audio clips if set True. It is
+                              not a thorough instance-wise shuffle,
+                              but a specific batch-wise shuffle.
+        :type batch_shuffle: bool
         :return: Batch reader function, producing batches of data when called.
         :rtype: callable
         """
 
         def batch_reader():
-            instance_reader = self.instance_reader_creator(
+            # read manifest
+            manifest = self.__read_manifest__(
                 manifest_path=manifest_path,
-                batch_size=batch_size,
-                sortagrad=sortagrad,
-                shuffle=shuffle)
+                max_duration=self.__max_duration__,
+                min_duration=self.__min_duration__)
+
+            # sort (by duration) or shuffle manifest
+            if self.__epoc__ == 0 and sortagrad:
+                manifest.sort(key=lambda x: x["duration"])
+            elif batch_shuffle:
+                manifest = self.__batch_shuffle__(manifest, batch_size)
+
+            instance_reader = self.instance_reader_creator(manifest)
             batch = []
             for instance in instance_reader():
                 batch.append(instance)
@@ -363,6 +356,7 @@ def batch_reader():
                     batch = []
             if len(batch) > 0:
                 yield self.__padding_batch__(batch, padding_to, flatten)
+            self.__epoc__ += 1
 
         return batch_reader
 

From cb6da079919f448eb8b29bae6373ca19190a2c80 Mon Sep 17 00:00:00 2001
From: dangqingqing <dangqingqing@baidu.com>
Date: Mon, 12 Jun 2017 19:53:41 +0800
Subject: [PATCH 3/3] add more comments and update train.py

---
 deep_speech_2/audio_data_utils.py | 30 ++++++++++++++++++++----------
 deep_speech_2/train.py            |  6 +++---
 2 files changed, 23 insertions(+), 13 deletions(-)

diff --git a/deep_speech_2/audio_data_utils.py b/deep_speech_2/audio_data_utils.py
index 692a42809f..1cd29be114 100644
--- a/deep_speech_2/audio_data_utils.py
+++ b/deep_speech_2/audio_data_utils.py
@@ -247,25 +247,34 @@ def __padding_batch__(self, batch, padding_to=-1, flatten=False):
             new_batch.append((padded_audio, text))
         return new_batch
 
-    def __batch_shuffle__(self, manifest, batch_shuffle_size):
+    def __batch_shuffle__(self, manifest, batch_size):
         """
+        Instances have different lengths, so they cannot be combined
+        into a single matrix multiplication directly. The usual
+        approach is to sort the training examples by length, combine
+        only similarly-sized instances into minibatches, and pad with
+        silence where necessary so that all instances in a batch
+        have the same length. This batch shuffle function groups
+        similarly-sized instances into minibatches and then
+        shuffles at the batch level.
+
         1. Sort the audio clips by duration.
-        2. Generate a random number `k`, k in [0, batch_shuffle_size).
+        2. Generate a random number `k`, k in [0, batch_size).
         3. Randomly remove `k` instances in order to make different mini-batches,
-           then make minibatches and each minibatch size is batch_shuffle_size.
+           then make minibatches and each minibatch size is batch_size.
         4. Shuffle the minibatches.
 
         :param manifest: manifest file.
         :type manifest: list
-        :param batch_shuffle_size: This size is uesed to generate a random number,
-                                   it usually equals to batch size.
-        :type batch_shuffle_size: int
+        :param batch_size: Batch size. This size is also used to generate
+                           a random number for batch shuffle.
+        :type batch_size: int
         :return: batch shuffled mainifest.
         :rtype: list
         """
         manifest.sort(key=lambda x: x["duration"])
-        shift_len = self.__random__.randint(0, batch_shuffle_size - 1)
-        batch_manifest = zip(*[iter(manifest[shift_len:])] * batch_shuffle_size)
+        shift_len = self.__random__.randint(0, batch_size - 1)
+        batch_manifest = zip(*[iter(manifest[shift_len:])] * batch_size)
         self.__random__.shuffle(batch_manifest)
         batch_manifest = list(sum(batch_manifest, ()))
         res_len = len(manifest) - shift_len - len(batch_manifest)
@@ -327,8 +336,9 @@ def batch_reader_creator(self,
                           if set True.
         :type sortagrad: bool
         :param batch_shuffle: Shuffle the audio clips if set True. It is
-                              not a thorough instance-wise shuffle,
-                              but a specific batch-wise shuffle.
+                              not a thorough instance-wise shuffle, but a
+                              specific batch-wise shuffle. For more details,
+                              please see `__batch_shuffle__` function.
         :type batch_shuffle: bool
         :return: Batch reader function, producing batches of data when called.
         :rtype: callable
diff --git a/deep_speech_2/train.py b/deep_speech_2/train.py
index eb9b56de7f..957c24267c 100644
--- a/deep_speech_2/train.py
+++ b/deep_speech_2/train.py
@@ -143,12 +143,12 @@ def data_generator():
     train_batch_reader = train_generator.batch_reader_creator(
         manifest_path=args.train_manifest_path,
         batch_size=args.batch_size,
-        sortagrad=True,
-        shuffle=True)
+        sortagrad=True if args.init_model_path is None else False,
+        batch_shuffle=True)
     test_batch_reader = test_generator.batch_reader_creator(
         manifest_path=args.dev_manifest_path,
         batch_size=args.batch_size,
-        shuffle=False)
+        batch_shuffle=False)
     feeding = train_generator.data_name_feeding()
 
     # create event handler