1 change: 1 addition & 0 deletions README.md
@@ -19,6 +19,7 @@ TensorFlowASR implements some automatic speech recognition architectures such as

## What's New?

+- (11/14/2020) Supported Gradient Accumulation for Training in Larger Batch Size
- (11/3/2020) Reduce differences between `librosa.stft` and `tf.signal.stft`
- (10/31/2020) Update DeepSpeech2 and Supported Jasper [https://arxiv.org/abs/1904.03288](https://arxiv.org/abs/1904.03288)
- (10/18/2020) Supported Streaming Transducer [https://arxiv.org/abs/1811.06621](https://arxiv.org/abs/1811.06621)
14 changes: 7 additions & 7 deletions examples/conformer/config.yml
@@ -68,12 +68,12 @@ learning_config:

  dataset_config:
    train_paths:
-      - /mnt/Data/ML/ASR/Raw/LibriSpeech/train-clean-100/transcripts.tsv
+      - /mnt/d/SpeechProcessing/Datasets/LibriSpeech/train-clean-100/transcripts.tsv
    eval_paths:
-      - /mnt/Data/ML/ASR/Raw/LibriSpeech/dev-clean/transcripts.tsv
-      - /mnt/Data/ML/ASR/Raw/LibriSpeech/dev-other/transcripts.tsv
+      - /mnt/d/SpeechProcessing/Datasets/LibriSpeech/dev-clean/transcripts.tsv
+      - /mnt/d/SpeechProcessing/Datasets/LibriSpeech/dev-other/transcripts.tsv
    test_paths:
-      - /mnt/Data/ML/ASR/Raw/LibriSpeech/test-clean/transcripts.tsv
+      - /mnt/d/SpeechProcessing/Datasets/LibriSpeech/test-clean/transcripts.tsv
    tfrecords_dir: null

  optimizer_config:
@@ -83,10 +83,10 @@ learning_config:
    epsilon: 1e-9

  running_config:
-    batch_size: 2
-    accumulation_steps: 1
+    batch_size: 4
+    accumulation_steps: 4
    num_epochs: 20
-    outdir: /mnt/Projects/asrk16/trained/local/librispeech/conformer
+    outdir: /mnt/d/SpeechProcessing/Trained/local/conformer
    log_interval_steps: 300
    eval_interval_steps: 500
    save_interval_steps: 1000
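The `running_config` change above is the crux of this PR: with `accumulation_steps: 4`, gradients from four micro-batches are summed before each optimizer update, so the effective batch size grows to `batch_size × accumulation_steps` per replica while GPU memory usage stays at `batch_size`. A rough sketch of the arithmetic (illustrative values, not project code):

```python
# Effective batch size under gradient accumulation (illustrative values).
batch_size = 4           # micro-batch per replica, from running_config
accumulation_steps = 4   # micro-batches summed before each update
num_replicas = 1         # assuming a single GPU

effective_batch_size = batch_size * accumulation_steps * num_replicas
print(effective_batch_size)  # 16 samples contribute to every weight update
```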
2 changes: 1 addition & 1 deletion examples/conformer/tflite_subword_conformer.py
@@ -58,7 +58,7 @@
# build model
conformer = Conformer(**config.model_config, vocabulary_size=text_featurizer.num_classes)
conformer._build(speech_featurizer.shape)
-conformer.load_weights(args.saved)
+conformer.load_weights(args.saved, by_name=True)
conformer.summary(line_length=150)
conformer.add_featurizers(speech_featurizer, text_featurizer)

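The `by_name=True` change above makes weight restoration match layers by name instead of by topological order, which is the safer choice when a freshly rebuilt model may not enumerate layers in exactly the order the checkpoint was written. A minimal, self-contained sketch of the semantics (toy model and file name are hypothetical; in tf.keras this option applies to HDF5 weight files):

```python
import tensorflow as tf

# Build a tiny model with named layers and save its weights to HDF5.
model = tf.keras.Sequential([
    tf.keras.layers.Dense(8, name="encoder"),
    tf.keras.layers.Dense(2, name="decoder"),
])
model.build(input_shape=(None, 4))
model.save_weights("weights.h5")

# Rebuild the architecture and restore weights layer-by-layer by name;
# layers whose names are absent from the file are simply left untouched.
restored = tf.keras.Sequential([
    tf.keras.layers.Dense(8, name="encoder"),
    tf.keras.layers.Dense(2, name="decoder"),
])
restored.build(input_shape=(None, 4))
restored.load_weights("weights.h5", by_name=True)
```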
6 changes: 5 additions & 1 deletion examples/conformer/train_ga_conformer.py
@@ -41,6 +41,9 @@
parser.add_argument("--ebs", type=int, default=None,
help="Evaluation batch size per replica")

parser.add_argument("--acs", type=int, default=None,
help="Train accumulation steps")

parser.add_argument("--devices", type=int, nargs="*", default=[0],
help="Devices' ids to apply distributed training")

@@ -125,4 +128,5 @@
conformer_trainer.compile(model=conformer, optimizer=optimizer,
                          max_to_keep=args.max_ckpts)

-conformer_trainer.fit(train_dataset, eval_dataset, train_bs=args.tbs, eval_bs=args.ebs)
+conformer_trainer.fit(train_dataset, eval_dataset,
+                      train_bs=args.tbs, eval_bs=args.ebs, train_acs=args.acs)
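With the new flag, a run might be launched as, e.g., `python examples/conformer/train_ga_conformer.py --tbs 4 --ebs 4 --acs 4 --devices 0` (values illustrative). When `--acs` is omitted it stays `None` and the trainer falls back to `accumulation_steps` from the config, per the `if not train_acs` guard in `base_runners.py` below.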
6 changes: 5 additions & 1 deletion examples/conformer/train_ga_subword_conformer.py
@@ -41,6 +41,9 @@
parser.add_argument("--ebs", type=int, default=None,
help="Evaluation batch size per replica")

parser.add_argument("--acs", type=int, default=None,
help="Train accumulation steps")

parser.add_argument("--devices", type=int, nargs="*", default=[0],
help="Devices' ids to apply distributed training")

@@ -141,4 +144,5 @@
conformer_trainer.compile(model=conformer, optimizer=optimizer,
                          max_to_keep=args.max_ckpts)

-conformer_trainer.fit(train_dataset, eval_dataset, train_bs=args.tbs, eval_bs=args.ebs)
+conformer_trainer.fit(train_dataset, eval_dataset,
+                      train_bs=args.tbs, eval_bs=args.ebs, train_acs=args.acs)
15 changes: 8 additions & 7 deletions examples/deepspeech2/config.yml
@@ -24,11 +24,11 @@ speech_config:
  normalize_per_feature: False

decoder_config:
-  vocabulary: ./vocabularies/vietnamese.characters
+  vocabulary: null
  blank_at_zero: False
  beam_width: 500
  lm_config:
-    model_path: /mnt/Data/ML/NLP/vntc_asr_5gram_trie.binary
+    model_path: null
    alpha: 2.0
    beta: 1.0

@@ -53,12 +53,13 @@ learning_config:

  dataset_config:
    train_paths:
-      - /mnt/Data/ML/ASR/Preprocessed/Vivos/train/train_transcripts.tsv
+      - /mnt/d/SpeechProcessing/Datasets/LibriSpeech/train-clean-100/transcripts.tsv
    eval_paths:
-      - /mnt/Data/ML/ASR/Preprocessed/Vivos/train/eval_transcripts.tsv
+      - /mnt/d/SpeechProcessing/Datasets/LibriSpeech/dev-clean/transcripts.tsv
+      - /mnt/d/SpeechProcessing/Datasets/LibriSpeech/dev-other/transcripts.tsv
    test_paths:
-      - /mnt/Data/ML/ASR/Preprocessed/Vivos/test/transcripts.tsv
-    tfrecords_dir: /mnt/Data/ML/ASR/Preprocessed/Vivos/TFRecords
+      - /mnt/d/SpeechProcessing/Datasets/LibriSpeech/test-clean/transcripts.tsv
+    tfrecords_dir: null

  optimizer_config:
    class_name: adam
@@ -68,7 +69,7 @@ learning_config:
  running_config:
    batch_size: 8
    num_epochs: 20
-    outdir: /mnt/Projects/asrk16/trained/local/vivos
+    outdir: /mnt/d/SpeechProcessing/Trained/local/deepspeech2
    log_interval_steps: 400
    save_interval_steps: 400
    eval_interval_steps: 800
15 changes: 8 additions & 7 deletions examples/jasper/config.yml
@@ -24,11 +24,11 @@ speech_config:
  normalize_per_feature: False

decoder_config:
-  vocabulary: ./vocabularies/vietnamese.characters
+  vocabulary: null
  blank_at_zero: False
  beam_width: 500
  lm_config:
-    model_path: /mnt/Data/ML/NLP/vntc_asr_5gram_trie.binary
+    model_path: null
    alpha: 2.0
    beta: 1.0

@@ -60,12 +60,13 @@ learning_config:

  dataset_config:
    train_paths:
-      - /mnt/Data/ML/ASR/Preprocessed/Vivos/train/train_transcripts.tsv
+      - /mnt/d/SpeechProcessing/Datasets/LibriSpeech/train-clean-100/transcripts.tsv
    eval_paths:
-      - /mnt/Data/ML/ASR/Preprocessed/Vivos/train/eval_transcripts.tsv
+      - /mnt/d/SpeechProcessing/Datasets/LibriSpeech/dev-clean/transcripts.tsv
+      - /mnt/d/SpeechProcessing/Datasets/LibriSpeech/dev-other/transcripts.tsv
    test_paths:
-      - /mnt/Data/ML/ASR/Preprocessed/Vivos/test/transcripts.tsv
-    tfrecords_dir: /mnt/Data/ML/ASR/Preprocessed/Vivos/TFRecords
+      - /mnt/d/SpeechProcessing/Datasets/LibriSpeech/test-clean/transcripts.tsv
+    tfrecords_dir: null

  optimizer_config:
    class_name: adam
@@ -75,7 +76,7 @@ learning_config:
  running_config:
    batch_size: 8
    num_epochs: 20
-    outdir: /mnt/Projects/asrk16/trained/local/jasper
+    outdir: /mnt/d/SpeechProcessing/Trained/local/jasper
    log_interval_steps: 400
    save_interval_steps: 400
    eval_interval_steps: 800
10 changes: 5 additions & 5 deletions examples/streaming_transducer/config.yml
@@ -63,12 +63,12 @@ learning_config:

  dataset_config:
    train_paths:
-      - /mnt/Data/ML/ASR/Raw/LibriSpeech/train-clean-100/transcripts.tsv
+      - /mnt/d/SpeechProcessing/Datasets/LibriSpeech/train-clean-100/transcripts.tsv
    eval_paths:
-      - /mnt/Data/ML/ASR/Raw/LibriSpeech/dev-clean/transcripts.tsv
-      - /mnt/Data/ML/ASR/Raw/LibriSpeech/dev-other/transcripts.tsv
+      - /mnt/d/SpeechProcessing/Datasets/LibriSpeech/dev-clean/transcripts.tsv
+      - /mnt/d/SpeechProcessing/Datasets/LibriSpeech/dev-other/transcripts.tsv
    test_paths:
-      - /mnt/Data/ML/ASR/Raw/LibriSpeech/test-clean/transcripts.tsv
+      - /mnt/d/SpeechProcessing/Datasets/LibriSpeech/test-clean/transcripts.tsv
    tfrecords_dir: null

  optimizer_config:
@@ -80,7 +80,7 @@ learning_config:
    batch_size: 2
    accumulation_steps: 1
    num_epochs: 20
-    outdir: /mnt/Projects/asrk16/trained/local/librispeech/streaming_transducer
+    outdir: /mnt/SpeechProcessing/Trained/local/streaming_transducer
    log_interval_steps: 300
    eval_interval_steps: 500
    save_interval_steps: 1000
6 changes: 5 additions & 1 deletion examples/streaming_transducer/train_ga_streaming_transducer.py
@@ -40,6 +40,9 @@
parser.add_argument("--ebs", type=int, default=None,
                    help="Evaluation batch size per replica")

+parser.add_argument("--acs", type=int, default=None,
+                    help="Train accumulation steps")
+
parser.add_argument("--devices", type=int, nargs="*", default=[0],
                    help="Devices' ids to apply distributed training")

@@ -116,4 +119,5 @@
streaming_transducer_trainer.compile(model=streaming_transducer, optimizer=optimizer,
                                     max_to_keep=args.max_ckpts)

-streaming_transducer_trainer.fit(train_dataset, eval_dataset, train_bs=args.tbs, eval_bs=args.ebs)
+streaming_transducer_trainer.fit(train_dataset, eval_dataset,
+                                 train_bs=args.tbs, eval_bs=args.ebs, train_acs=args.acs)
6 changes: 5 additions & 1 deletion examples/streaming_transducer/train_ga_subword_streaming_transducer.py
@@ -40,6 +40,9 @@
parser.add_argument("--ebs", type=int, default=None,
                    help="Evaluation batch size per replica")

+parser.add_argument("--acs", type=int, default=None,
+                    help="Train accumulation steps")
+
parser.add_argument("--devices", type=int, nargs="*", default=[0],
                    help="Devices' ids to apply distributed training")

@@ -132,4 +135,5 @@
streaming_transducer_trainer.compile(model=streaming_transducer, optimizer=optimizer,
                                     max_to_keep=args.max_ckpts)

-streaming_transducer_trainer.fit(train_dataset, eval_dataset, train_bs=args.tbs, eval_bs=args.ebs)
+streaming_transducer_trainer.fit(train_dataset, eval_dataset,
+                                 train_bs=args.tbs, eval_bs=args.ebs, train_acs=args.acs)
9 changes: 5 additions & 4 deletions setup.py
@@ -26,18 +26,19 @@
"soundfile>=0.10.3",
"PyYAML>=5.3.1",
"matplotlib>=3.2.1",
"numpy>=1.18.5,<1.19.0",
"numpy>=1.16.0,<1.19.0",
"sox>=1.3.7",
"nltk>=3.5",
"numba==0.49.1",
"tqdm>=4.47.0",
"tqdm>=4.51.0",
"colorama>=0.4.3",
"nlpaug>=1.0.1"
"nlpaug>=1.0.1",
"absl-py>=0.9,<0.11"
]

setuptools.setup(
name="TensorFlowASR",
version="0.2.10",
version="0.3.0",
author="Huy Le Nguyen",
author_email="nlhuy.cs.16@gmail.com",
description="Almost State-of-the-art Automatic Speech Recognition using Tensorflow 2",
2 changes: 2 additions & 0 deletions tensorflow_asr/models/conformer.py
@@ -372,6 +372,7 @@ def __init__(self,
                 num_heads: int = 4,
                 mha_type: str = "relmha",
                 kernel_size: int = 32,
+                 depth_multiplier: int = 1,
                 fc_factor: float = 0.5,
                 dropout: float = 0,
                 embed_dim: int = 512,
@@ -395,6 +396,7 @@
            num_heads=num_heads,
            mha_type=mha_type,
            kernel_size=kernel_size,
+            depth_multiplier=depth_multiplier,
            fc_factor=fc_factor,
            dropout=dropout,
            kernel_regularizer=kernel_regularizer,
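The new `depth_multiplier` argument is threaded through to the Conformer convolution module; in Keras depthwise convolutions this parameter sets how many output channels each input channel expands to. A standalone illustration of the effect (the shapes below are arbitrary examples, not the Conformer block itself):

```python
import tensorflow as tf

# Each of the 144 input channels yields depth_multiplier output channels.
x = tf.random.normal([1, 100, 1, 144])      # (batch, time, 1, channels)
dw = tf.keras.layers.DepthwiseConv2D(
    kernel_size=(32, 1), depth_multiplier=2, padding="same")
print(dw(x).shape)                          # (1, 100, 1, 288) = 144 * 2
```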
20 changes: 6 additions & 14 deletions tensorflow_asr/optimizers/accumulation.py
@@ -19,25 +19,17 @@ class GradientAccumulation:
    def __init__(self, trainable_variables):
        self.gradients = [
            tf.Variable(
-                tf.zeros_like(self.flat_gradients(g)),
+                tf.zeros_like(g),
                trainable=False,
                synchronization=tf.VariableSynchronization.ON_READ
            ) for g in trainable_variables
        ]

-    @staticmethod
-    def flat_gradients(gradient):
-        """ Convert gradients if it's tf.IndexedSlices. """
-        if type(gradient) == tf.IndexedSlices:
-            return tf.scatter_nd(
-                tf.expand_dims(gradient.indices, 1),
-                gradient.values,
-                gradient.dense_shape
-            )
-        return gradient
-
    def reset(self):
-        for g in self.gradients: g.assign(tf.zeros_like(g))
+        for i, g in enumerate(self.gradients):
+            self.gradients[i].assign(tf.zeros_like(g))

    def accumulate(self, step_gradients):
        for i, g in enumerate(step_gradients):
-            self.gradients[i].assign_add(self.flat_gradients(g))
+            if g is None: continue
+            self.gradients[i].assign_add(g)
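The rewrite keeps one dense `tf.Variable` buffer per trainable variable, drops the `tf.IndexedSlices` flattening, and skips `None` gradients during accumulation. A self-contained sketch of the accumulate → apply → reset cycle this class supports (the toy model, data, and loop are illustrative, not the project's trainer):

```python
import tensorflow as tf

class GradientAccumulation:
    # Same shape as the class in this diff.
    def __init__(self, trainable_variables):
        self.gradients = [
            tf.Variable(tf.zeros_like(g), trainable=False,
                        synchronization=tf.VariableSynchronization.ON_READ)
            for g in trainable_variables
        ]

    def reset(self):
        for i, g in enumerate(self.gradients):
            self.gradients[i].assign(tf.zeros_like(g))

    def accumulate(self, step_gradients):
        for i, g in enumerate(step_gradients):
            if g is None: continue
            self.gradients[i].assign_add(g)

model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(4,))])
optimizer = tf.keras.optimizers.Adam()
accumulation_steps = 4
ga = GradientAccumulation(model.trainable_variables)

for step in range(8):
    x, y = tf.random.normal([2, 4]), tf.random.normal([2, 1])
    with tf.GradientTape() as tape:
        # Scale the loss so the accumulated sum matches the micro-batch mean.
        loss = tf.reduce_mean(tf.square(model(x) - y)) / accumulation_steps
    ga.accumulate(tape.gradient(loss, model.trainable_variables))
    if (step + 1) % accumulation_steps == 0:
        optimizer.apply_gradients(zip(ga.gradients, model.trainable_variables))
        ga.reset()
```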
4 changes: 2 additions & 2 deletions tensorflow_asr/runners/__init__.py
@@ -28,8 +28,8 @@ def save_from_checkpoint(func,
        max_to_keep: number of checkpoints to keep
        **kwargs: contains built models, optimizers
    """
-    steps = tf.Variable(0, dtype=tf.int64)  # Step must be int64
-    epochs = tf.Variable(1)
+    steps = tf.Variable(0, trainable=False, dtype=tf.int64)  # Step must be int64
+    epochs = tf.Variable(1, trainable=False)
    checkpoint_dir = os.path.join(outdir, "checkpoints")
    if not os.path.exists(checkpoint_dir):
        raise ValueError(f"checkpoint directory not found: {checkpoint_dir}")
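Marking the step and epoch counters `trainable=False` makes explicit that they are bookkeeping state, keeping them out of any trainable-variable collection that an optimizer, or a gradient-accumulation buffer sized over `trainable_variables`, might iterate.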
9 changes: 6 additions & 3 deletions tensorflow_asr/runners/base_runners.py
@@ -72,7 +72,8 @@ def __init__(self,
        super(BaseTrainer, self).__init__(config)
        self.set_strategy(strategy)
        # Steps and Epochs start from 0
-        self.steps = tf.Variable(0, dtype=tf.int64)  # Step must be int64 to use tf.summary
+        # Step must be int64 to use tf.summary
+        self.steps = tf.Variable(0, trainable=False, dtype=tf.int64)
        self.train_steps_per_epoch = None
        self.eval_steps_per_epoch = None
        # Dataset
@@ -120,12 +121,14 @@ def set_train_data_loader(self, train_dataset, train_bs=None, train_acs=None):
            self.config.batch_size = train_bs  # Update batch size fed from arguments

        if not train_acs: train_acs = self.config.accumulation_steps
-        self.accumulation_bs = train_bs // train_acs
        self.config.accumulation_steps = train_acs  # update accum steps fed from arguments

        self.train_data = train_dataset.create(self.global_batch_size)
        self.train_data_loader = self.strategy.experimental_distribute_dataset(self.train_data)
-        self.train_steps_per_epoch = train_dataset.total_steps
+        if hasattr(self, "accumulation"):
+            self.train_steps_per_epoch = train_dataset.total_steps // self.config.accumulation_steps
+        else:
+            self.train_steps_per_epoch = train_dataset.total_steps

    def set_eval_data_loader(self, eval_dataset, eval_bs=None):
        """ Set eval data loader (MUST).
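Counting an epoch in optimizer updates rather than micro-batches keeps the interval settings meaningful under accumulation: for example, a dataset yielding 10,000 micro-batches with `accumulation_steps` of 4 now reports `10000 // 4 = 2500` train steps per epoch.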