TensorSpeech · nglehuy · Oct 25, 2020
diff --git a/examples/deepspeech2/README.md b/examples/deepspeech2/README.md
@@ -6,22 +6,19 @@ References: [https://arxiv.org/abs/1512.02595](https://arxiv.org/abs/1512.02595)
 
 ```yaml
 model_config:
-  conv_conf:
-    conv_type: 2
-    conv_kernels: [[11, 41], [11, 21], [11, 11]]
-    conv_strides: [[2, 2], [1, 2], [1, 2]]
-    conv_filters: [32, 32, 96]
-    conv_dropout: 0
-  rnn_conf:
-    rnn_layers:        5
-    rnn_type:          lstm
-    rnn_units:         512
-    rnn_bidirectional: True
-    rnn_rowconv:       False
-    rnn_dropout:       0
-  fc_conf:
-    fc_units: [1024]
-    fc_dropout: 0
+  conv_type: conv2d
+  conv_kernels: [[11, 41], [11, 21], [11, 11]]
+  conv_strides: [[2, 2], [1, 2], [1, 2]]
+  conv_filters: [32, 32, 96]
+  conv_dropout: 0.1
+  rnn_nlayers: 5
+  rnn_type: lstm
+  rnn_units: 512
+  rnn_bidirectional: True
+  rnn_rowconv: 0
+  rnn_dropout: 0.1
+  fc_nlayers: 0
+  fc_units: 1024
 ```
 
 ## Architecture
@@ -30,24 +27,6 @@ model_config:
 
 ## Training and Testing
 
-See `python examples/deepspeech2/run_ds2.py --help`
+See `python examples/deepspeech2/train_ds2.py --help`
 
-## Results on VIVOS Dataset
-
-* Features: Spectrogram with `80` frequency channels
-* KenLM: `alpha = 2.0` and `beta = 1.0`
-* Epochs: `20`
-* Train set split ratio: `90:10`
-* Augmentation: `None`
-* Model architecture: same as [vivos.yaml](./configs/vivos.yml)
-
-**CTC Loss**
-
-<img src="./figs/ds2_vivos_ctc_loss.svg" alt="ds2_vivos_ctc_loss" width="300px" />
-
-**Error rates**
-
-|                 |    WER (%)     |    CER (%)     |
-| :-------------- | :------------: | :------------: |
-| *BeamSearch*    |    43.75243    |   17.991581    |
-| *BeamSearch LM* | **20.7561836** | **11.0304441** |
+See `python examples/deepspeech2/test_ds2.py --help`
diff --git a/examples/deepspeech2/configs/vivos.yml b/examples/deepspeech2/config.yml
@@ -24,7 +24,7 @@ speech_config:
   normalize_per_feature: False
 
 decoder_config:
-  vocabulary: /mnt/Projects/asrk16/TiramisuASR/vocabularies/vietnamese.txt
+  vocabulary: ./vocabularies/vietnamese.characters
   blank_at_zero: False
   beam_width: 500
   lm_config:
@@ -33,21 +33,20 @@ decoder_config:
     beta: 1.0
 
 model_config:
-  conv_conf:
-    conv_type: 2
-    conv_kernels: [[11, 41], [11, 21], [11, 11]]
-    conv_strides: [[2, 2], [1, 2], [1, 2]]
-    conv_filters: [32, 32, 96]
-    conv_dropout: 0
-  rnn_conf:
-    rnn_layers: 5
-    rnn_type: lstm
-    rnn_units: 512
-    rnn_bidirectional: True
-    rnn_rowconv: False
-    rnn_dropout: 0
-  fc_conf:
-    fc_units: null
+  name: deepspeech2
+  conv_type: conv2d
+  conv_kernels: [[11, 41], [11, 21], [11, 11]]
+  conv_strides: [[2, 2], [1, 2], [1, 2]]
+  conv_filters: [32, 32, 96]
+  conv_dropout: 0.1
+  rnn_nlayers: 5
+  rnn_type: lstm
+  rnn_units: 512
+  rnn_bidirectional: True
+  rnn_rowconv: 0
+  rnn_dropout: 0.1
+  fc_nlayers: 0
+  fc_units: 1024
 
 learning_config:
   augmentations: null

diff --git a/examples/deepspeech2/figs/ds2_vivos_ctc_loss.svg b/examples/deepspeech2/figs/ds2_vivos_ctc_loss.svg
diff --git a/examples/deepspeech2/model.py b/examples/deepspeech2/model.py
diff --git a/examples/deepspeech2/test_ds2.py b/examples/deepspeech2/test_ds2.py
@@ -19,7 +19,7 @@
 setup_environment()
 import tensorflow as tf
 
-DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "configs", "vivos.yml")
+DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.yml")
 
 tf.keras.backend.clear_session()
 
@@ -54,7 +54,7 @@
 from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
 from tensorflow_asr.featurizers.text_featurizers import CharFeaturizer
 from tensorflow_asr.runners.base_runners import BaseTester
-from model import DeepSpeech2
+from tensorflow_asr.models.deepspeech2 import DeepSpeech2
 
 tf.random.set_seed(0)
 assert args.export
@@ -63,13 +63,10 @@
 speech_featurizer = TFSpeechFeaturizer(config["speech_config"])
 text_featurizer = CharFeaturizer(config["decoder_config"])
 # Build DS2 model
-ds2_model = DeepSpeech2(input_shape=speech_featurizer.shape,
-                        arch_config=config["model_config"],
-                        num_classes=text_featurizer.num_classes,
-                        name="deepspeech2")
+ds2_model = DeepSpeech2(**config["model_config"], vocabulary_size=text_featurizer.num_classes)
 ds2_model._build(speech_featurizer.shape)
 ds2_model.load_weights(args.saved, by_name=True)
-ds2_model.summary(line_length=150)
+ds2_model.summary(line_length=120)
 ds2_model.add_featurizers(speech_featurizer, text_featurizer)
 
 if args.tfrecords:

diff --git a/examples/deepspeech2/train_ds2.py b/examples/deepspeech2/train_ds2.py
@@ -19,7 +19,7 @@
 setup_environment()
 import tensorflow as tf
 
-DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "configs", "vivos.yml")
+DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.yml")
 
 tf.keras.backend.clear_session()
 
@@ -60,7 +60,7 @@
 from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
 from tensorflow_asr.featurizers.text_featurizers import CharFeaturizer
 from tensorflow_asr.runners.ctc_runners import CTCTrainer
-from model import DeepSpeech2
+from tensorflow_asr.models.deepspeech2 import DeepSpeech2
 
 config = UserConfig(DEFAULT_YAML, args.config, learning=True)
 speech_featurizer = TFSpeechFeaturizer(config["speech_config"])
@@ -100,12 +100,9 @@
 ctc_trainer = CTCTrainer(text_featurizer, config["learning_config"]["running_config"])
 # Build DS2 model
 with ctc_trainer.strategy.scope():
-    ds2_model = DeepSpeech2(input_shape=speech_featurizer.shape,
-                            arch_config=config["model_config"],
-                            num_classes=text_featurizer.num_classes,
-                            name="deepspeech2")
+    ds2_model = DeepSpeech2(**config["model_config"], vocabulary_size=text_featurizer.num_classes)
     ds2_model._build(speech_featurizer.shape)
-    ds2_model.summary(line_length=150)
+    ds2_model.summary(line_length=120)
 # Compile
 ctc_trainer.compile(ds2_model, config["learning_config"]["optimizer_config"],
                     max_to_keep=args.max_ckpts)

diff --git a/examples/jasper/README.md b/examples/jasper/README.md
@@ -0,0 +1,20 @@
+# Jasper
+
+References: [https://arxiv.org/abs/1904.03288](https://arxiv.org/abs/1904.03288)
+
+## Model YAML Config Structure
+
+```yaml
+model_config:
+
+```
+
+## Architecture
+
+<img src="./figs/jasper_arch.png" alt="jasper_arch" width="500px" />
+
+## Training and Testing
+
+See `python examples/jasper/train_jasper.py --help`
+
+See `python examples/jasper/test_jasper.py --help`