From 1a319fbf49df61a38e148b8462be5bf35c80e334 Mon Sep 17 00:00:00 2001 From: dangqingqing Date: Wed, 7 Jun 2017 16:37:13 +0800 Subject: [PATCH 1/3] Support variable input batch and sortagrad. --- deep_speech_2/audio_data_utils.py | 56 ++++++++++++++++++++-------- deep_speech_2/train.py | 61 +++++++++++-------------------- 2 files changed, 62 insertions(+), 55 deletions(-) diff --git a/deep_speech_2/audio_data_utils.py b/deep_speech_2/audio_data_utils.py index c717bcf182..abb7f1e993 100644 --- a/deep_speech_2/audio_data_utils.py +++ b/deep_speech_2/audio_data_utils.py @@ -8,6 +8,7 @@ import random import soundfile import numpy as np +import itertools import os RANDOM_SEED = 0 @@ -62,6 +63,7 @@ def __init__(self, self.__stride_ms__ = stride_ms self.__window_ms__ = window_ms self.__max_frequency__ = max_frequency + self.__epoc__ = 0 self.__random__ = random.Random(RANDOM_SEED) # load vocabulary (dictionary) self.__vocab_dict__, self.__vocab_list__ = \ @@ -245,9 +247,33 @@ def __padding_batch__(self, batch, padding_to=-1, flatten=False): new_batch.append((padded_audio, text)) return new_batch + def __batch_shuffle__(self, manifest, batch_size): + """ + 1. Sort the audio clips by duration. + 2. Generate a random number `k`, k in [0, batch_size). + 3. Randomly remove `k` instances in order to make different mini-batches, + then make minibatches and each minibatch size is batch_size. + 4. Shuffle the minibatches. + + :param manifest: manifest file. + :type manifest: list + :param batch_size: batch size. 
+ :type batch_size: int + """ + manifest.sort(key=lambda x: x["duration"]) + shift_len = self.__random__.randint(0, batch_size - 1) + batch_manifest = zip(*[iter(manifest[shift_len:])] * batch_size) + self.__random__.shuffle(batch_manifest) + batch_manifest = list(sum(batch_manifest, ())) + res_len = len(manifest) - shift_len - len(batch_manifest) + batch_manifest.extend(manifest[-res_len:]) + batch_manifest.extend(manifest[0:shift_len]) + return batch_manifest + def instance_reader_creator(self, manifest_path, - sort_by_duration=True, + batch_size, + sortagrad=True, shuffle=False): """ Instance reader creator for audio data. Creat a callable function to @@ -258,18 +284,14 @@ def instance_reader_creator(self, :param manifest_path: Filepath of manifest for audio clip files. :type manifest_path: basestring - :param sort_by_duration: Sort the audio clips by duration if set True - (for SortaGrad). - :type sort_by_duration: bool + :param sortagrad: Sort the audio clips by duration in the first epoc + if set True. + :type sortagrad: bool :param shuffle: Shuffle the audio clips if set True. :type shuffle: bool :return: Data reader function. 
:rtype: callable """ - if sort_by_duration and shuffle: - sort_by_duration = False - logger.warn("When shuffle set to true, " - "sort_by_duration is forced to set False.") def reader(): # read manifest @@ -278,16 +300,17 @@ def reader(): max_duration=self.__max_duration__, min_duration=self.__min_duration__) # sort (by duration) or shuffle manifest - if sort_by_duration: + if self.__epoc__ == 0 and sortagrad: manifest.sort(key=lambda x: x["duration"]) - if shuffle: - self.__random__.shuffle(manifest) + elif shuffle: + manifest = self.__batch_shuffle__(manifest, batch_size) # extract spectrogram feature for instance in manifest: spectrogram = self.__audio_featurize__( instance["audio_filepath"]) transcript = self.__text_featurize__(instance["text"]) yield (spectrogram, transcript) + self.__epoc__ += 1 return reader @@ -296,7 +319,7 @@ def batch_reader_creator(self, batch_size, padding_to=-1, flatten=False, - sort_by_duration=True, + sortagrad=False, shuffle=False): """ Batch data reader creator for audio data. Creat a callable function to @@ -317,9 +340,9 @@ def batch_reader_creator(self, :param flatten: If set True, audio data will be flatten to be a 1-dim ndarray. Otherwise, 2-dim ndarray. Default is False. :type flatten: bool - :param sort_by_duration: Sort the audio clips by duration if set True - (for SortaGrad). - :type sort_by_duration: bool + :param sortagrad: Sort the audio clips by duration in the first epoc + if set True. + :type sortagrad: bool :param shuffle: Shuffle the audio clips if set True. :type shuffle: bool :return: Batch reader function, producing batches of data when called. 
@@ -329,7 +352,8 @@ def batch_reader_creator(self, def batch_reader(): instance_reader = self.instance_reader_creator( manifest_path=manifest_path, - sort_by_duration=sort_by_duration, + batch_size=batch_size, + sortagrad=sortagrad, shuffle=shuffle) batch = [] for instance in instance_reader(): diff --git a/deep_speech_2/train.py b/deep_speech_2/train.py index e6a7d076bb..55577b0d87 100644 --- a/deep_speech_2/train.py +++ b/deep_speech_2/train.py @@ -85,23 +85,27 @@ def train(): """ DeepSpeech2 training. """ + # initialize data generator - data_generator = DataGenerator( - vocab_filepath=args.vocab_filepath, - normalizer_manifest_path=args.normalizer_manifest_path, - normalizer_num_samples=200, - max_duration=20.0, - min_duration=0.0, - stride_ms=10, - window_ms=20) + def data_generator(): + return DataGenerator( + vocab_filepath=args.vocab_filepath, + normalizer_manifest_path=args.normalizer_manifest_path, + normalizer_num_samples=200, + max_duration=20.0, + min_duration=0.0, + stride_ms=10, + window_ms=20) + train_generator = data_generator() + test_generator = data_generator() # create network config - dict_size = data_generator.vocabulary_size() + dict_size = train_generator.vocabulary_size() + # paddle.data_type.dense_array is used for variable batch input. + # the size 161 * 161 is only a placeholder value and the real shape + # of input batch data will be set at each batch. 
audio_data = paddle.layer.data( - name="audio_spectrogram", - height=161, - width=2000, - type=paddle.data_type.dense_vector(322000)) + name="audio_spectrogram", type=paddle.data_type.dense_array(161 * 161)) text_data = paddle.layer.data( name="transcript_text", type=paddle.data_type.integer_value_sequence(dict_size)) @@ -122,28 +126,16 @@ def train(): cost=cost, parameters=parameters, update_equation=optimizer) # prepare data reader - train_batch_reader_sortagrad = data_generator.batch_reader_creator( - manifest_path=args.train_manifest_path, - batch_size=args.batch_size, - padding_to=2000, - flatten=True, - sort_by_duration=True, - shuffle=False) - train_batch_reader_nosortagrad = data_generator.batch_reader_creator( + train_batch_reader = train_generator.batch_reader_creator( manifest_path=args.train_manifest_path, batch_size=args.batch_size, - padding_to=2000, - flatten=True, - sort_by_duration=False, + sortagrad=True, shuffle=True) - test_batch_reader = data_generator.batch_reader_creator( + test_batch_reader = test_generator.batch_reader_creator( manifest_path=args.dev_manifest_path, batch_size=args.batch_size, - padding_to=2000, - flatten=True, - sort_by_duration=False, shuffle=False) - feeding = data_generator.data_name_feeding() + feeding = train_generator.data_name_feeding() # create event handler def event_handler(event): @@ -169,17 +161,8 @@ def event_handler(event): time.time() - start_time, event.pass_id, result.cost) # run train - # first pass with sortagrad - if args.use_sortagrad: - trainer.train( - reader=train_batch_reader_sortagrad, - event_handler=event_handler, - num_passes=1, - feeding=feeding) - args.num_passes -= 1 - # other passes without sortagrad trainer.train( - reader=train_batch_reader_nosortagrad, + reader=train_batch_reader, event_handler=event_handler, num_passes=args.num_passes, feeding=feeding) From 1738700ee6982250cae5454549884b440f8d7d2f Mon Sep 17 00:00:00 2001 From: dangqingqing Date: Mon, 12 Jun 2017 19:06:55 +0800 Subject: 
[PATCH 2/3] refine audio_data_utils.py --- deep_speech_2/audio_data_utils.py | 68 ++++++++++++++----------------- 1 file changed, 31 insertions(+), 37 deletions(-) diff --git a/deep_speech_2/audio_data_utils.py b/deep_speech_2/audio_data_utils.py index abb7f1e993..692a42809f 100644 --- a/deep_speech_2/audio_data_utils.py +++ b/deep_speech_2/audio_data_utils.py @@ -247,22 +247,25 @@ def __padding_batch__(self, batch, padding_to=-1, flatten=False): new_batch.append((padded_audio, text)) return new_batch - def __batch_shuffle__(self, manifest, batch_size): + def __batch_shuffle__(self, manifest, batch_shuffle_size): """ 1. Sort the audio clips by duration. - 2. Generate a random number `k`, k in [0, batch_size). + 2. Generate a random number `k`, k in [0, batch_shuffle_size). 3. Randomly remove `k` instances in order to make different mini-batches, - then make minibatches and each minibatch size is batch_size. + then make minibatches and each minibatch size is batch_shuffle_size. 4. Shuffle the minibatches. :param manifest: manifest file. :type manifest: list - :param batch_size: batch size. - :type batch_size: int + :param batch_shuffle_size: This size is uesed to generate a random number, + it usually equals to batch size. + :type batch_shuffle_size: int + :return: batch shuffled mainifest. 
+ :rtype: list """ manifest.sort(key=lambda x: x["duration"]) - shift_len = self.__random__.randint(0, batch_size - 1) - batch_manifest = zip(*[iter(manifest[shift_len:])] * batch_size) + shift_len = self.__random__.randint(0, batch_shuffle_size - 1) + batch_manifest = zip(*[iter(manifest[shift_len:])] * batch_shuffle_size) self.__random__.shuffle(batch_manifest) batch_manifest = list(sum(batch_manifest, ())) res_len = len(manifest) - shift_len - len(batch_manifest) @@ -270,11 +273,7 @@ def __batch_shuffle__(self, manifest, batch_size): batch_manifest.extend(manifest[0:shift_len]) return batch_manifest - def instance_reader_creator(self, - manifest_path, - batch_size, - sortagrad=True, - shuffle=False): + def instance_reader_creator(self, manifest): """ Instance reader creator for audio data. Creat a callable function to produce instances of data. @@ -282,35 +281,19 @@ def instance_reader_creator(self, Instance: a tuple of a numpy ndarray of audio spectrogram and a list of tokenized and indexed transcription text. - :param manifest_path: Filepath of manifest for audio clip files. - :type manifest_path: basestring - :param sortagrad: Sort the audio clips by duration in the first epoc - if set True. - :type sortagrad: bool - :param shuffle: Shuffle the audio clips if set True. - :type shuffle: bool + :param manifest: Filepath of manifest for audio clip files. + :type manifest: basestring :return: Data reader function. 
:rtype: callable """ def reader(): - # read manifest - manifest = self.__read_manifest__( - manifest_path=manifest_path, - max_duration=self.__max_duration__, - min_duration=self.__min_duration__) - # sort (by duration) or shuffle manifest - if self.__epoc__ == 0 and sortagrad: - manifest.sort(key=lambda x: x["duration"]) - elif shuffle: - manifest = self.__batch_shuffle__(manifest, batch_size) # extract spectrogram feature for instance in manifest: spectrogram = self.__audio_featurize__( instance["audio_filepath"]) transcript = self.__text_featurize__(instance["text"]) yield (spectrogram, transcript) - self.__epoc__ += 1 return reader @@ -320,7 +303,7 @@ def batch_reader_creator(self, padding_to=-1, flatten=False, sortagrad=False, - shuffle=False): + batch_shuffle=False): """ Batch data reader creator for audio data. Creat a callable function to produce batches of data. @@ -343,18 +326,28 @@ def batch_reader_creator(self, :param sortagrad: Sort the audio clips by duration in the first epoc if set True. :type sortagrad: bool - :param shuffle: Shuffle the audio clips if set True. - :type shuffle: bool + :param batch_shuffle: Shuffle the audio clips if set True. It is + not a thorough instance-wise shuffle, + but a specific batch-wise shuffle. + :type batch_shuffle: bool :return: Batch reader function, producing batches of data when called. 
:rtype: callable """ def batch_reader(): - instance_reader = self.instance_reader_creator( + # read manifest + manifest = self.__read_manifest__( manifest_path=manifest_path, - batch_size=batch_size, - sortagrad=sortagrad, - shuffle=shuffle) + max_duration=self.__max_duration__, + min_duration=self.__min_duration__) + + # sort (by duration) or shuffle manifest + if self.__epoc__ == 0 and sortagrad: + manifest.sort(key=lambda x: x["duration"]) + elif batch_shuffle: + manifest = self.__batch_shuffle__(manifest, batch_size) + + instance_reader = self.instance_reader_creator(manifest) batch = [] for instance in instance_reader(): batch.append(instance) @@ -363,6 +356,7 @@ def batch_reader(): batch = [] if len(batch) > 0: yield self.__padding_batch__(batch, padding_to, flatten) + self.__epoc__ += 1 return batch_reader From cb6da079919f448eb8b29bae6373ca19190a2c80 Mon Sep 17 00:00:00 2001 From: dangqingqing Date: Mon, 12 Jun 2017 19:53:41 +0800 Subject: [PATCH 3/3] add more comments and update train.py --- deep_speech_2/audio_data_utils.py | 30 ++++++++++++++++++++---------- deep_speech_2/train.py | 6 +++--- 2 files changed, 23 insertions(+), 13 deletions(-) diff --git a/deep_speech_2/audio_data_utils.py b/deep_speech_2/audio_data_utils.py index 692a42809f..1cd29be114 100644 --- a/deep_speech_2/audio_data_utils.py +++ b/deep_speech_2/audio_data_utils.py @@ -247,25 +247,34 @@ def __padding_batch__(self, batch, padding_to=-1, flatten=False): new_batch.append((padded_audio, text)) return new_batch - def __batch_shuffle__(self, manifest, batch_shuffle_size): + def __batch_shuffle__(self, manifest, batch_size): """ + The instances have different lengths and they cannot be + combined into a single matrix multiplication. It usually + sorts the training examples by length and combines only + similarly-sized instances into minibatches, pads with + silence when necessary so that all instances in a batch + have the same length. 
This batch shuffle function is used + to make similarly-sized instances into minibatches and + make a batch-wise shuffle. + 1. Sort the audio clips by duration. - 2. Generate a random number `k`, k in [0, batch_shuffle_size). + 2. Generate a random number `k`, k in [0, batch_size). 3. Randomly remove `k` instances in order to make different mini-batches, - then make minibatches and each minibatch size is batch_shuffle_size. + then make minibatches and each minibatch size is batch_size. 4. Shuffle the minibatches. :param manifest: manifest file. :type manifest: list - :param batch_shuffle_size: This size is uesed to generate a random number, - it usually equals to batch size. - :type batch_shuffle_size: int + :param batch_size: Batch size. This size is also used to generate + a random number for batch shuffle. + :type batch_size: int :return: batch shuffled mainifest. :rtype: list """ manifest.sort(key=lambda x: x["duration"]) - shift_len = self.__random__.randint(0, batch_shuffle_size - 1) - batch_manifest = zip(*[iter(manifest[shift_len:])] * batch_shuffle_size) + shift_len = self.__random__.randint(0, batch_size - 1) + batch_manifest = zip(*[iter(manifest[shift_len:])] * batch_size) self.__random__.shuffle(batch_manifest) batch_manifest = list(sum(batch_manifest, ())) res_len = len(manifest) - shift_len - len(batch_manifest) @@ -327,8 +336,9 @@ def batch_reader_creator(self, if set True. :type sortagrad: bool :param batch_shuffle: Shuffle the audio clips if set True. It is - not a thorough instance-wise shuffle, - but a specific batch-wise shuffle. + not a thorough instance-wise shuffle, but a + specific batch-wise shuffle. For more details, + please see `__batch_shuffle__` function. :type batch_shuffle: bool :return: Batch reader function, producing batches of data when called. 
:rtype: callable diff --git a/deep_speech_2/train.py b/deep_speech_2/train.py index eb9b56de7f..957c24267c 100644 --- a/deep_speech_2/train.py +++ b/deep_speech_2/train.py @@ -143,12 +143,12 @@ def data_generator(): train_batch_reader = train_generator.batch_reader_creator( manifest_path=args.train_manifest_path, batch_size=args.batch_size, - sortagrad=True, - shuffle=True) + sortagrad=True if args.init_model_path is None else False, + batch_shuffle=True) test_batch_reader = test_generator.batch_reader_creator( manifest_path=args.dev_manifest_path, batch_size=args.batch_size, - shuffle=False) + batch_shuffle=False) feeding = train_generator.data_name_feeding() # create event handler