diff --git a/examples/conformer/train_conformer.py b/examples/conformer/train_conformer.py index 32e17a8974..5697130463 100644 --- a/examples/conformer/train_conformer.py +++ b/examples/conformer/train_conformer.py @@ -26,29 +26,25 @@ parser = argparse.ArgumentParser(prog="Conformer Training") -parser.add_argument("--config", type=str, default=DEFAULT_YAML, - help="The file path of model configuration file") +parser.add_argument("--config", type=str, default=DEFAULT_YAML, help="The file path of model configuration file") -parser.add_argument("--max_ckpts", type=int, default=10, - help="Max number of checkpoints to keep") +parser.add_argument("--max_ckpts", type=int, default=10, help="Max number of checkpoints to keep") -parser.add_argument("--tfrecords", default=False, action="store_true", - help="Whether to use tfrecords") +parser.add_argument("--tfrecords", default=False, action="store_true", help="Whether to use tfrecords") -parser.add_argument("--tbs", type=int, default=None, - help="Train batch size per replica") +parser.add_argument("--tfrecords_shards", type=int, default=16, help="Number of tfrecords shards") -parser.add_argument("--ebs", type=int, default=None, - help="Evaluation batch size per replica") +parser.add_argument("--tbs", type=int, default=None, help="Train batch size per replica") -parser.add_argument("--devices", type=int, nargs="*", default=[0], - help="Devices' ids to apply distributed training") +parser.add_argument("--ebs", type=int, default=None, help="Evaluation batch size per replica") -parser.add_argument("--mxp", default=False, action="store_true", - help="Enable mixed precision") +parser.add_argument("--devices", type=int, nargs="*", default=[0], help="Devices' ids to apply distributed training") -parser.add_argument("--cache", default=False, action="store_true", - help="Enable caching for dataset") +parser.add_argument("--mxp", default=False, action="store_true", help="Enable mixed precision") + +parser.add_argument("--cache", default=False, action="store_true", help="Enable caching for dataset") + +parser.add_argument("--bfs", type=int, default=100, help="Buffer size for shuffling") args = parser.parse_args() @@ -75,14 +71,18 @@ speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, augmentations=config.learning_config.augmentations, - stage="train", cache=args.cache, shuffle=True + tfrecords_shards=args.tfrecords_shards, + stage="train", cache=args.cache, + shuffle=True, buffer_size=args.bfs, ) eval_dataset = ASRTFRecordDataset( data_paths=config.learning_config.dataset_config.eval_paths, tfrecords_dir=config.learning_config.dataset_config.tfrecords_dir, + tfrecords_shards=args.tfrecords_shards, speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, - stage="eval", cache=args.cache, shuffle=True + stage="eval", cache=args.cache, + shuffle=True, buffer_size=args.bfs, ) else: train_dataset = ASRSliceDataset( @@ -90,13 +90,15 @@ speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, augmentations=config.learning_config.augmentations, - stage="train", cache=args.cache, shuffle=True + stage="train", cache=args.cache, + shuffle=True, buffer_size=args.bfs, ) eval_dataset = ASRSliceDataset( data_paths=config.learning_config.dataset_config.eval_paths, speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, - stage="eval", cache=args.cache, shuffle=True + stage="eval", cache=args.cache, + shuffle=True, buffer_size=args.bfs, ) conformer_trainer = TransducerTrainer( diff --git 
a/examples/conformer/train_ga_conformer.py b/examples/conformer/train_ga_conformer.py index 9e29ac29ee..d5be194c4d 100644 --- a/examples/conformer/train_ga_conformer.py +++ b/examples/conformer/train_ga_conformer.py @@ -26,32 +26,27 @@ parser = argparse.ArgumentParser(prog="Conformer Training") -parser.add_argument("--config", type=str, default=DEFAULT_YAML, - help="The file path of model configuration file") +parser.add_argument("--config", type=str, default=DEFAULT_YAML, help="The file path of model configuration file") -parser.add_argument("--max_ckpts", type=int, default=10, - help="Max number of checkpoints to keep") +parser.add_argument("--max_ckpts", type=int, default=10, help="Max number of checkpoints to keep") -parser.add_argument("--tfrecords", default=False, action="store_true", - help="Whether to use tfrecords") +parser.add_argument("--tfrecords", default=False, action="store_true", help="Whether to use tfrecords") -parser.add_argument("--tbs", type=int, default=None, - help="Train batch size per replica") +parser.add_argument("--tfrecords_shards", type=int, default=16, help="Number of tfrecords shards") -parser.add_argument("--ebs", type=int, default=None, - help="Evaluation batch size per replica") +parser.add_argument("--tbs", type=int, default=None, help="Train batch size per replica") -parser.add_argument("--acs", type=int, default=None, - help="Train accumulation steps") +parser.add_argument("--ebs", type=int, default=None, help="Evaluation batch size per replica") -parser.add_argument("--devices", type=int, nargs="*", default=[0], - help="Devices' ids to apply distributed training") +parser.add_argument("--acs", type=int, default=None, help="Train accumulation steps") -parser.add_argument("--mxp", default=False, action="store_true", - help="Enable mixed precision") +parser.add_argument("--devices", type=int, nargs="*", default=[0], help="Devices' ids to apply distributed training") -parser.add_argument("--cache", default=False, action="store_true", - help="Enable caching for dataset") +parser.add_argument("--mxp", default=False, action="store_true", help="Enable mixed precision") + +parser.add_argument("--cache", default=False, action="store_true", help="Enable caching for dataset") + +parser.add_argument("--bfs", type=int, default=100, help="Buffer size for shuffling") args = parser.parse_args() @@ -78,14 +73,18 @@ speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, augmentations=config.learning_config.augmentations, - stage="train", cache=args.cache, shuffle=True + tfrecords_shards=args.tfrecords_shards, + stage="train", cache=args.cache, + shuffle=True, buffer_size=args.bfs, ) eval_dataset = ASRTFRecordDataset( data_paths=config.learning_config.dataset_config.eval_paths, tfrecords_dir=config.learning_config.dataset_config.tfrecords_dir, + tfrecords_shards=args.tfrecords_shards, speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, - stage="eval", cache=args.cache, shuffle=True + stage="eval", cache=args.cache, + shuffle=True, buffer_size=args.bfs, ) else: train_dataset = ASRSliceDataset( @@ -93,13 +92,15 @@ speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, augmentations=config.learning_config.augmentations, - stage="train", cache=args.cache, shuffle=True + stage="train", cache=args.cache, + shuffle=True, buffer_size=args.bfs, ) eval_dataset = ASRSliceDataset( data_paths=config.learning_config.dataset_config.eval_paths, speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, - stage="eval", 
cache=args.cache, shuffle=True + stage="eval", cache=args.cache, + shuffle=True, buffer_size=args.bfs, ) conformer_trainer = TransducerTrainerGA( diff --git a/examples/conformer/train_ga_subword_conformer.py b/examples/conformer/train_ga_subword_conformer.py index 1896f8cede..609c27e6ad 100644 --- a/examples/conformer/train_ga_subword_conformer.py +++ b/examples/conformer/train_ga_subword_conformer.py @@ -26,41 +26,33 @@ parser = argparse.ArgumentParser(prog="Conformer Training") -parser.add_argument("--config", type=str, default=DEFAULT_YAML, - help="The file path of model configuration file") +parser.add_argument("--config", type=str, default=DEFAULT_YAML, help="The file path of model configuration file") -parser.add_argument("--max_ckpts", type=int, default=10, - help="Max number of checkpoints to keep") +parser.add_argument("--max_ckpts", type=int, default=10, help="Max number of checkpoints to keep") -parser.add_argument("--tfrecords", default=False, action="store_true", - help="Whether to use tfrecords") +parser.add_argument("--tfrecords", default=False, action="store_true", help="Whether to use tfrecords") -parser.add_argument("--tbs", type=int, default=None, - help="Train batch size per replica") +parser.add_argument("--tfrecords_shards", type=int, default=16, help="Number of tfrecords shards") -parser.add_argument("--ebs", type=int, default=None, - help="Evaluation batch size per replica") +parser.add_argument("--tbs", type=int, default=None, help="Train batch size per replica") -parser.add_argument("--acs", type=int, default=None, - help="Train accumulation steps") +parser.add_argument("--ebs", type=int, default=None, help="Evaluation batch size per replica") -parser.add_argument("--sentence_piece", default=False, action="store_true", - help="Whether to use `SentencePiece` model") +parser.add_argument("--acs", type=int, default=None, help="Train accumulation steps") -parser.add_argument("--devices", type=int, nargs="*", default=[0], - help="Devices' ids to apply distributed training") +parser.add_argument("--sentence_piece", default=False, action="store_true", help="Whether to use `SentencePiece` model") -parser.add_argument("--mxp", default=False, action="store_true", - help="Enable mixed precision") +parser.add_argument("--devices", type=int, nargs="*", default=[0], help="Devices' ids to apply distributed training") -parser.add_argument("--cache", default=False, action="store_true", - help="Enable caching for dataset") +parser.add_argument("--mxp", default=False, action="store_true", help="Enable mixed precision") -parser.add_argument("--subwords", type=str, default=None, - help="Path to file that stores generated subwords") +parser.add_argument("--cache", default=False, action="store_true", help="Enable caching for dataset") -parser.add_argument("--subwords_corpus", nargs="*", type=str, default=[], - help="Transcript files for generating subwords") +parser.add_argument("--subwords", type=str, default=None, help="Path to file that stores generated subwords") + +parser.add_argument("--subwords_corpus", nargs="*", type=str, default=[], help="Transcript files for generating subwords") + +parser.add_argument("--bfs", type=int, default=100, help="Buffer size for shuffling") args = parser.parse_args() @@ -100,14 +92,18 @@ speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, augmentations=config.learning_config.augmentations, - stage="train", cache=args.cache, shuffle=True + tfrecords_shards=args.tfrecords_shards, + stage="train", cache=args.cache, + shuffle=True, 
buffer_size=args.bfs, ) eval_dataset = ASRTFRecordDataset( data_paths=config.learning_config.dataset_config.eval_paths, tfrecords_dir=config.learning_config.dataset_config.tfrecords_dir, + tfrecords_shards=args.tfrecords_shards, speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, - stage="eval", cache=args.cache, shuffle=True + stage="eval", cache=args.cache, + shuffle=True, buffer_size=args.bfs, ) else: train_dataset = ASRSliceDataset( @@ -115,13 +111,15 @@ speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, augmentations=config.learning_config.augmentations, - stage="train", cache=args.cache, shuffle=True + stage="train", cache=args.cache, + shuffle=True, buffer_size=args.bfs, ) eval_dataset = ASRSliceDataset( data_paths=config.learning_config.dataset_config.eval_paths, speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, - stage="eval", cache=args.cache, shuffle=True + stage="eval", cache=args.cache, + shuffle=True, buffer_size=args.bfs, ) conformer_trainer = TransducerTrainerGA( diff --git a/examples/conformer/train_subword_conformer.py b/examples/conformer/train_subword_conformer.py index 528e437dee..07abf8c8cd 100644 --- a/examples/conformer/train_subword_conformer.py +++ b/examples/conformer/train_subword_conformer.py @@ -26,38 +26,31 @@ parser = argparse.ArgumentParser(prog="Conformer Training") -parser.add_argument("--config", type=str, default=DEFAULT_YAML, - help="The file path of model configuration file") +parser.add_argument("--config", type=str, default=DEFAULT_YAML, help="The file path of model configuration file") -parser.add_argument("--max_ckpts", type=int, default=10, - help="Max number of checkpoints to keep") +parser.add_argument("--max_ckpts", type=int, default=10, help="Max number of checkpoints to keep") -parser.add_argument("--tfrecords", default=False, action="store_true", - help="Whether to use tfrecords") +parser.add_argument("--tfrecords", default=False, action="store_true", help="Whether to use tfrecords") -parser.add_argument("--sentence_piece", default=False, action="store_true", - help="Whether to use `SentencePiece` model") +parser.add_argument("--tfrecords_shards", type=int, default=16, help="Number of tfrecords shards") -parser.add_argument("--tbs", type=int, default=None, - help="Train batch size per replica") +parser.add_argument("--sentence_piece", default=False, action="store_true", help="Whether to use `SentencePiece` model") -parser.add_argument("--ebs", type=int, default=None, - help="Evaluation batch size per replica") +parser.add_argument("--tbs", type=int, default=None, help="Train batch size per replica") -parser.add_argument("--devices", type=int, nargs="*", default=[0], - help="Devices' ids to apply distributed training") +parser.add_argument("--ebs", type=int, default=None, help="Evaluation batch size per replica") -parser.add_argument("--mxp", default=False, action="store_true", - help="Enable mixed precision") +parser.add_argument("--devices", type=int, nargs="*", default=[0], help="Devices' ids to apply distributed training") -parser.add_argument("--cache", default=False, action="store_true", - help="Enable caching for dataset") +parser.add_argument("--mxp", default=False, action="store_true", help="Enable mixed precision") -parser.add_argument("--subwords", type=str, default=None, - help="Path to file that stores generated subwords") +parser.add_argument("--cache", default=False, action="store_true", help="Enable caching for dataset") -parser.add_argument("--subwords_corpus", 
nargs="*", type=str, default=[], - help="Transcript files for generating subwords") +parser.add_argument("--subwords", type=str, default=None, help="Path to file that stores generated subwords") + +parser.add_argument("--subwords_corpus", nargs="*", type=str, default=[], help="Transcript files for generating subwords") + +parser.add_argument("--bfs", type=int, default=100, help="Buffer size for shuffling") args = parser.parse_args() @@ -97,14 +90,18 @@ speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, augmentations=config.learning_config.augmentations, - stage="train", cache=args.cache, shuffle=True + tfrecords_shards=args.tfrecords_shards, + stage="train", cache=args.cache, + shuffle=True, buffer_size=args.bfs, ) eval_dataset = ASRTFRecordDataset( data_paths=config.learning_config.dataset_config.eval_paths, tfrecords_dir=config.learning_config.dataset_config.tfrecords_dir, + tfrecords_shards=args.tfrecords_shards, speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, - stage="eval", cache=args.cache, shuffle=True + stage="eval", cache=args.cache, + shuffle=True, buffer_size=args.bfs, ) else: train_dataset = ASRSliceDataset( @@ -112,13 +109,15 @@ speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, augmentations=config.learning_config.augmentations, - stage="train", cache=args.cache, shuffle=True + stage="train", cache=args.cache, + shuffle=True, buffer_size=args.bfs, ) eval_dataset = ASRSliceDataset( data_paths=config.learning_config.dataset_config.eval_paths, speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, - stage="eval", cache=args.cache, shuffle=True + stage="eval", cache=args.cache, + shuffle=True, buffer_size=args.bfs, ) conformer_trainer = TransducerTrainer( diff --git a/examples/contextnet/train_contextnet.py b/examples/contextnet/train_contextnet.py index 6cc9be72b5..e52ab642b0 100644 --- a/examples/contextnet/train_contextnet.py +++ b/examples/contextnet/train_contextnet.py @@ -26,29 +26,25 @@ parser = argparse.ArgumentParser(prog="ContextNet Training") -parser.add_argument("--config", type=str, default=DEFAULT_YAML, - help="The file path of model configuration file") +parser.add_argument("--config", type=str, default=DEFAULT_YAML, help="The file path of model configuration file") -parser.add_argument("--max_ckpts", type=int, default=10, - help="Max number of checkpoints to keep") +parser.add_argument("--max_ckpts", type=int, default=10, help="Max number of checkpoints to keep") -parser.add_argument("--tfrecords", default=False, action="store_true", - help="Whether to use tfrecords") +parser.add_argument("--tfrecords", default=False, action="store_true", help="Whether to use tfrecords") -parser.add_argument("--tbs", type=int, default=None, - help="Train batch size per replica") +parser.add_argument("--tfrecords_shards", type=int, default=16, help="Number of tfrecords shards") -parser.add_argument("--ebs", type=int, default=None, - help="Evaluation batch size per replica") +parser.add_argument("--tbs", type=int, default=None, help="Train batch size per replica") -parser.add_argument("--devices", type=int, nargs="*", default=[0], - help="Devices' ids to apply distributed training") +parser.add_argument("--ebs", type=int, default=None, help="Evaluation batch size per replica") -parser.add_argument("--mxp", default=False, action="store_true", - help="Enable mixed precision") +parser.add_argument("--devices", type=int, nargs="*", default=[0], help="Devices' ids to apply distributed training") 
-parser.add_argument("--cache", default=False, action="store_true", - help="Enable caching for dataset") +parser.add_argument("--mxp", default=False, action="store_true", help="Enable mixed precision") + +parser.add_argument("--cache", default=False, action="store_true", help="Enable caching for dataset") + +parser.add_argument("--bfs", type=int, default=100, help="Buffer size for shuffling") args = parser.parse_args() @@ -75,14 +71,18 @@ speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, augmentations=config.learning_config.augmentations, - stage="train", cache=args.cache, shuffle=True + tfrecords_shards=args.tfrecords_shards, + stage="train", cache=args.cache, + shuffle=True, buffer_size=args.bfs, ) eval_dataset = ASRTFRecordDataset( data_paths=config.learning_config.dataset_config.eval_paths, tfrecords_dir=config.learning_config.dataset_config.tfrecords_dir, + tfrecords_shards=args.tfrecords_shards, speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, - stage="eval", cache=args.cache, shuffle=True + stage="eval", cache=args.cache, + shuffle=True, buffer_size=args.bfs, ) else: train_dataset = ASRSliceDataset( @@ -90,13 +90,15 @@ speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, augmentations=config.learning_config.augmentations, - stage="train", cache=args.cache, shuffle=True + stage="train", cache=args.cache, + shuffle=True, buffer_size=args.bfs, ) eval_dataset = ASRSliceDataset( data_paths=config.learning_config.dataset_config.eval_paths, speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, - stage="eval", cache=args.cache, shuffle=True + stage="eval", cache=args.cache, + shuffle=True, buffer_size=args.bfs, ) contextnet_trainer = TransducerTrainer( diff --git a/examples/contextnet/train_ga_contextnet.py b/examples/contextnet/train_ga_contextnet.py index 45361181b1..d5e969ae54 100644 --- a/examples/contextnet/train_ga_contextnet.py +++ b/examples/contextnet/train_ga_contextnet.py @@ -26,32 +26,27 @@ parser = argparse.ArgumentParser(prog="ContextNet Training") -parser.add_argument("--config", type=str, default=DEFAULT_YAML, - help="The file path of model configuration file") +parser.add_argument("--config", type=str, default=DEFAULT_YAML, help="The file path of model configuration file") -parser.add_argument("--max_ckpts", type=int, default=10, - help="Max number of checkpoints to keep") +parser.add_argument("--max_ckpts", type=int, default=10, help="Max number of checkpoints to keep") -parser.add_argument("--tfrecords", default=False, action="store_true", - help="Whether to use tfrecords") +parser.add_argument("--tfrecords", default=False, action="store_true", help="Whether to use tfrecords") -parser.add_argument("--tbs", type=int, default=None, - help="Train batch size per replica") +parser.add_argument("--tfrecords_shards", type=int, default=16, help="Number of tfrecords shards") -parser.add_argument("--ebs", type=int, default=None, - help="Evaluation batch size per replica") +parser.add_argument("--tbs", type=int, default=None, help="Train batch size per replica") -parser.add_argument("--acs", type=int, default=None, - help="Train accumulation steps") +parser.add_argument("--ebs", type=int, default=None, help="Evaluation batch size per replica") -parser.add_argument("--devices", type=int, nargs="*", default=[0], - help="Devices' ids to apply distributed training") +parser.add_argument("--acs", type=int, default=None, help="Train accumulation steps") -parser.add_argument("--mxp", default=False, 
action="store_true", - help="Enable mixed precision") +parser.add_argument("--devices", type=int, nargs="*", default=[0], help="Devices' ids to apply distributed training") -parser.add_argument("--cache", default=False, action="store_true", - help="Enable caching for dataset") +parser.add_argument("--mxp", default=False, action="store_true", help="Enable mixed precision") + +parser.add_argument("--cache", default=False, action="store_true", help="Enable caching for dataset") + +parser.add_argument("--bfs", type=int, default=100, help="Buffer size for shuffling") args = parser.parse_args() @@ -78,14 +73,18 @@ speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, augmentations=config.learning_config.augmentations, - stage="train", cache=args.cache, shuffle=True + tfrecords_shards=args.tfrecords_shards, + stage="train", cache=args.cache, + shuffle=True, buffer_size=args.bfs, ) eval_dataset = ASRTFRecordDataset( data_paths=config.learning_config.dataset_config.eval_paths, tfrecords_dir=config.learning_config.dataset_config.tfrecords_dir, + tfrecords_shards=args.tfrecords_shards, speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, - stage="eval", cache=args.cache, shuffle=True + stage="eval", cache=args.cache, + shuffle=True, buffer_size=args.bfs, ) else: train_dataset = ASRSliceDataset( @@ -93,13 +92,15 @@ speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, augmentations=config.learning_config.augmentations, - stage="train", cache=args.cache, shuffle=True + stage="train", cache=args.cache, + shuffle=True, buffer_size=args.bfs, ) eval_dataset = ASRSliceDataset( data_paths=config.learning_config.dataset_config.eval_paths, speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, - stage="eval", cache=args.cache, shuffle=True + stage="eval", cache=args.cache, + shuffle=True, buffer_size=args.bfs, ) contextnet_trainer = TransducerTrainerGA( diff --git a/examples/contextnet/train_ga_subword_contextnet.py b/examples/contextnet/train_ga_subword_contextnet.py index 7a33956b42..32cf6a75df 100644 --- a/examples/contextnet/train_ga_subword_contextnet.py +++ b/examples/contextnet/train_ga_subword_contextnet.py @@ -26,38 +26,31 @@ parser = argparse.ArgumentParser(prog="ContextNet Training") -parser.add_argument("--config", type=str, default=DEFAULT_YAML, - help="The file path of model configuration file") +parser.add_argument("--config", type=str, default=DEFAULT_YAML, help="The file path of model configuration file") -parser.add_argument("--max_ckpts", type=int, default=10, - help="Max number of checkpoints to keep") +parser.add_argument("--max_ckpts", type=int, default=10, help="Max number of checkpoints to keep") -parser.add_argument("--tfrecords", default=False, action="store_true", - help="Whether to use tfrecords") +parser.add_argument("--tfrecords", default=False, action="store_true", help="Whether to use tfrecords") -parser.add_argument("--tbs", type=int, default=None, - help="Train batch size per replica") +parser.add_argument("--tfrecords_shards", type=int, default=16, help="Number of tfrecords shards") -parser.add_argument("--ebs", type=int, default=None, - help="Evaluation batch size per replica") +parser.add_argument("--tbs", type=int, default=None, help="Train batch size per replica") -parser.add_argument("--acs", type=int, default=None, - help="Train accumulation steps") +parser.add_argument("--ebs", type=int, default=None, help="Evaluation batch size per replica") -parser.add_argument("--devices", type=int, nargs="*", 
default=[0], - help="Devices' ids to apply distributed training") +parser.add_argument("--acs", type=int, default=None, help="Train accumulation steps") -parser.add_argument("--mxp", default=False, action="store_true", - help="Enable mixed precision") +parser.add_argument("--devices", type=int, nargs="*", default=[0], help="Devices' ids to apply distributed training") -parser.add_argument("--cache", default=False, action="store_true", - help="Enable caching for dataset") +parser.add_argument("--mxp", default=False, action="store_true", help="Enable mixed precision") -parser.add_argument("--subwords", type=str, default=None, - help="Path to file that stores generated subwords") +parser.add_argument("--cache", default=False, action="store_true", help="Enable caching for dataset") -parser.add_argument("--subwords_corpus", nargs="*", type=str, default=[], - help="Transcript files for generating subwords") +parser.add_argument("--subwords", type=str, default=None, help="Path to file that stores generated subwords") + +parser.add_argument("--subwords_corpus", nargs="*", type=str, default=[], help="Transcript files for generating subwords") + +parser.add_argument("--bfs", type=int, default=100, help="Buffer size for shuffling") args = parser.parse_args() @@ -94,14 +87,18 @@ speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, augmentations=config.learning_config.augmentations, - stage="train", cache=args.cache, shuffle=True + tfrecords_shards=args.tfrecords_shards, + stage="train", cache=args.cache, + shuffle=True, buffer_size=args.bfs, ) eval_dataset = ASRTFRecordDataset( data_paths=config.learning_config.dataset_config.eval_paths, tfrecords_dir=config.learning_config.dataset_config.tfrecords_dir, + tfrecords_shards=args.tfrecords_shards, speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, - stage="eval", cache=args.cache, shuffle=True + stage="eval", cache=args.cache, + shuffle=True, buffer_size=args.bfs, ) else: train_dataset = ASRSliceDataset( @@ -109,13 +106,15 @@ speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, augmentations=config.learning_config.augmentations, - stage="train", cache=args.cache, shuffle=True + stage="train", cache=args.cache, + shuffle=True, buffer_size=args.bfs, ) eval_dataset = ASRSliceDataset( data_paths=config.learning_config.dataset_config.eval_paths, speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, - stage="eval", cache=args.cache, shuffle=True + stage="eval", cache=args.cache, + shuffle=True, buffer_size=args.bfs, ) contextnet_trainer = TransducerTrainerGA( diff --git a/examples/contextnet/train_subword_contextnet.py b/examples/contextnet/train_subword_contextnet.py index 50f90892ba..2d186fbe42 100644 --- a/examples/contextnet/train_subword_contextnet.py +++ b/examples/contextnet/train_subword_contextnet.py @@ -26,35 +26,29 @@ parser = argparse.ArgumentParser(prog="ContextNet Training") -parser.add_argument("--config", type=str, default=DEFAULT_YAML, - help="The file path of model configuration file") +parser.add_argument("--config", type=str, default=DEFAULT_YAML, help="The file path of model configuration file") -parser.add_argument("--max_ckpts", type=int, default=10, - help="Max number of checkpoints to keep") +parser.add_argument("--max_ckpts", type=int, default=10, help="Max number of checkpoints to keep") -parser.add_argument("--tfrecords", default=False, action="store_true", - help="Whether to use tfrecords") +parser.add_argument("--tfrecords", default=False, action="store_true", 
help="Whether to use tfrecords") -parser.add_argument("--tbs", type=int, default=None, - help="Train batch size per replica") +parser.add_argument("--tfrecords_shards", type=int, default=16, help="Number of tfrecords shards") -parser.add_argument("--ebs", type=int, default=None, - help="Evaluation batch size per replica") +parser.add_argument("--tbs", type=int, default=None, help="Train batch size per replica") -parser.add_argument("--devices", type=int, nargs="*", default=[0], - help="Devices' ids to apply distributed training") +parser.add_argument("--ebs", type=int, default=None, help="Evaluation batch size per replica") -parser.add_argument("--mxp", default=False, action="store_true", - help="Enable mixed precision") +parser.add_argument("--devices", type=int, nargs="*", default=[0], help="Devices' ids to apply distributed training") -parser.add_argument("--cache", default=False, action="store_true", - help="Enable caching for dataset") +parser.add_argument("--mxp", default=False, action="store_true", help="Enable mixed precision") -parser.add_argument("--subwords", type=str, default=None, - help="Path to file that stores generated subwords") +parser.add_argument("--cache", default=False, action="store_true", help="Enable caching for dataset") -parser.add_argument("--subwords_corpus", nargs="*", type=str, default=[], - help="Transcript files for generating subwords") +parser.add_argument("--subwords", type=str, default=None, help="Path to file that stores generated subwords") + +parser.add_argument("--subwords_corpus", nargs="*", type=str, default=[], help="Transcript files for generating subwords") + +parser.add_argument("--bfs", type=int, default=100, help="Buffer size for shuffling") args = parser.parse_args() @@ -91,14 +85,18 @@ speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, augmentations=config.learning_config.augmentations, - stage="train", cache=args.cache, shuffle=True + tfrecords_shards=args.tfrecords_shards, + stage="train", cache=args.cache, + shuffle=True, buffer_size=args.bfs, ) eval_dataset = ASRTFRecordDataset( data_paths=config.learning_config.dataset_config.eval_paths, tfrecords_dir=config.learning_config.dataset_config.tfrecords_dir, + tfrecords_shards=args.tfrecords_shards, speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, - stage="eval", cache=args.cache, shuffle=True + stage="eval", cache=args.cache, + shuffle=True, buffer_size=args.bfs, ) else: train_dataset = ASRSliceDataset( @@ -106,13 +104,15 @@ speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, augmentations=config.learning_config.augmentations, - stage="train", cache=args.cache, shuffle=True + stage="train", cache=args.cache, + shuffle=True, buffer_size=args.bfs, ) eval_dataset = ASRSliceDataset( data_paths=config.learning_config.dataset_config.eval_paths, speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, - stage="eval", cache=args.cache, shuffle=True + stage="eval", cache=args.cache, + shuffle=True, buffer_size=args.bfs, ) contextnet_trainer = TransducerTrainer( diff --git a/scripts/create_tfrecords.py b/scripts/create_tfrecords.py index 7a787589a3..509d6b891b 100644 --- a/scripts/create_tfrecords.py +++ b/scripts/create_tfrecords.py @@ -20,14 +20,15 @@ parser = argparse.ArgumentParser(prog="TFRecords Creation") -parser.add_argument("--mode", "-m", type=str, - default=None, help=f"Mode in {modes}") +parser.add_argument("--mode", "-m", type=str, default=None, help=f"Mode in {modes}") -parser.add_argument("--tfrecords_dir", 
type=str, default=None, - help="Directory to tfrecords") +parser.add_argument("--tfrecords_dir", type=str, default=None, help="Directory to tfrecords") -parser.add_argument("transcripts", nargs="+", type=str, - default=None, help="Paths to transcript files") +parser.add_argument("--tfrecords_shards", type=int, default=16, help="Number of tfrecords shards") + +parser.add_argument("--shuffle", default=False, action="store_true", help="Shuffle data or not") + +parser.add_argument("transcripts", nargs="+", type=str, default=None, help="Paths to transcript files") args = parser.parse_args() @@ -36,9 +37,8 @@ transcripts = preprocess_paths(args.transcripts) tfrecords_dir = preprocess_paths(args.tfrecords_dir) -if args.mode == "train": - ASRTFRecordDataset(transcripts, tfrecords_dir, None, None, - args.mode, shuffle=True).create_tfrecords() -else: - ASRTFRecordDataset(transcripts, tfrecords_dir, None, None, - args.mode, shuffle=False).create_tfrecords() +ASRTFRecordDataset( + data_paths=transcripts, tfrecords_dir=tfrecords_dir, + speech_featurizer=None, text_featurizer=None, + stage=args.mode, shuffle=args.shuffle, tfrecords_shards=args.tfrecords_shards +).create_tfrecords() diff --git a/tensorflow_asr/datasets/asr_dataset.py b/tensorflow_asr/datasets/asr_dataset.py index 7ef06a1196..ab2e8690fc 100755 --- a/tensorflow_asr/datasets/asr_dataset.py +++ b/tensorflow_asr/datasets/asr_dataset.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. import abc -import glob import multiprocessing import os @@ -20,7 +19,7 @@ import tensorflow as tf from ..augmentations.augments import Augmentation -from .base_dataset import BaseDataset +from .base_dataset import BaseDataset, BUFFER_SIZE from ..featurizers.speech_featurizers import read_raw_audio, SpeechFeaturizer from ..featurizers.text_featurizers import TextFeaturizer from ..utils.utils import bytestring_feature, print_one_line, get_num_batches @@ -58,8 +57,12 @@ def __init__(self, data_paths: list, augmentations: Augmentation = Augmentation(None), cache: bool = False, - shuffle: bool = False): - super(ASRDataset, self).__init__(data_paths, augmentations, cache, shuffle, stage) + shuffle: bool = False, + buffer_size: int = BUFFER_SIZE): + super(ASRDataset, self).__init__( + data_paths=data_paths, augmentations=augmentations, + cache=cache, shuffle=shuffle, stage=stage, buffer_size=buffer_size + ) self.speech_featurizer = speech_featurizer self.text_featurizer = text_featurizer @@ -105,7 +108,7 @@ def process(self, dataset, batch_size): dataset = dataset.cache() if self.shuffle: - dataset = dataset.shuffle(TFRECORD_SHARDS, reshuffle_each_iteration=True) + dataset = dataset.shuffle(self.buffer_size, reshuffle_each_iteration=True) # PADDED BATCH the dataset dataset = dataset.padded_batch( @@ -150,13 +153,17 @@ def __init__(self, text_featurizer: TextFeaturizer, stage: str, augmentations: Augmentation = Augmentation(None), + tfrecords_shards: int = TFRECORD_SHARDS, cache: bool = False, - shuffle: bool = False): + shuffle: bool = False, + buffer_size: int = BUFFER_SIZE): super(ASRTFRecordDataset, self).__init__( - stage, speech_featurizer, text_featurizer, - data_paths, augmentations, cache, shuffle + stage=stage, speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, + data_paths=data_paths, augmentations=augmentations, cache=cache, shuffle=shuffle, buffer_size=buffer_size ) self.tfrecords_dir = tfrecords_dir + if tfrecords_shards <= 0: raise ValueError("tfrecords_shards must 
be positive") + self.tfrecords_shards = tfrecords_shards if not tf.io.gfile.exists(self.tfrecords_dir): tf.io.gfile.makedirs(self.tfrecords_dir) @@ -177,10 +184,10 @@ def create_tfrecords(self): def get_shard_path(shard_id): return os.path.join(self.tfrecords_dir, f"{self.stage}_{shard_id}.tfrecord") - shards = [get_shard_path(idx) for idx in range(1, TFRECORD_SHARDS + 1)] + shards = [get_shard_path(idx) for idx in range(1, self.tfrecords_shards + 1)] - splitted_entries = np.array_split(entries, TFRECORD_SHARDS) - with multiprocessing.Pool(TFRECORD_SHARDS) as pool: + splitted_entries = np.array_split(entries, self.tfrecords_shards) + with multiprocessing.Pool(self.tfrecords_shards) as pool: pool.map(write_tfrecord_file, zip(shards, splitted_entries)) return True @@ -273,7 +280,7 @@ def process(self, dataset, batch_size): dataset = dataset.cache() if self.shuffle: - dataset = dataset.shuffle(TFRECORD_SHARDS, reshuffle_each_iteration=True) + dataset = dataset.shuffle(self.buffer_size, reshuffle_each_iteration=True) # PADDED BATCH the dataset dataset = dataset.padded_batch( @@ -337,7 +344,7 @@ def process(self, dataset, batch_size): dataset = dataset.cache() if self.shuffle: - dataset = dataset.shuffle(TFRECORD_SHARDS, reshuffle_each_iteration=True) + dataset = dataset.shuffle(self.buffer_size, reshuffle_each_iteration=True) # PADDED BATCH the dataset dataset = dataset.padded_batch( diff --git a/tensorflow_asr/datasets/base_dataset.py b/tensorflow_asr/datasets/base_dataset.py index 864eee8864..d5e018de1a 100644 --- a/tensorflow_asr/datasets/base_dataset.py +++ b/tensorflow_asr/datasets/base_dataset.py @@ -15,6 +15,8 @@ from ..augmentations.augments import Augmentation +BUFFER_SIZE = 100 + class BaseDataset(metaclass=abc.ABCMeta): """ Based dataset for all models """ @@ -24,11 +26,15 @@ def __init__(self, augmentations: Augmentation = Augmentation(None), cache: bool = False, shuffle: bool = False, + buffer_size: int = BUFFER_SIZE, stage: str = "train"): self.data_paths = data_paths self.augmentations = augmentations # apply augmentation self.cache = cache # whether to cache WHOLE transformed dataset to memory self.shuffle = shuffle # whether to shuffle tf.data.Dataset + if buffer_size <= 0 and shuffle: + raise ValueError("buffer_size must be positive when shuffle is on") + self.buffer_size = buffer_size # shuffle buffer size self.stage = stage # for defining tfrecords files self.total_steps = None # for better training visualization diff --git a/tensorflow_asr/featurizers/text_featurizers.py b/tensorflow_asr/featurizers/text_featurizers.py index df32d44089..9c78746d51 100755 --- a/tensorflow_asr/featurizers/text_featurizers.py +++ b/tensorflow_asr/featurizers/text_featurizers.py @@ -337,6 +337,7 @@ def build_from_corpus(cls, decoder_config: dict): The input sentence must be pretokenized when using word type.""" decoder_cfg = DecoderConfig(decoder_config) # Train SentencePiece Model + def corpus_iterator(): for file in decoder_cfg.corpus_files: with open(file, "r", encoding="utf-8") as f: @@ -349,7 +350,7 @@ def corpus_iterator(): sp.SentencePieceTrainer.Train( sentence_iterator=corpus_iterator(), model_prefix=decoder_cfg.output_path_prefix, - model_type=decoder_cfg.model_type, + model_type=decoder_cfg.model_type, vocab_size=decoder_cfg.target_vocab_size, num_threads=cpu_count(), unk_id=cls.UNK_TOKEN_ID, @@ -357,7 +358,7 @@ def corpus_iterator(): eos_id=cls.EOS_TOKEN_ID, pad_id=cls.PAD_TOKEN_ID, unk_surface='__UNKNOWN__' # change default unk surface U+2047("⁇") by "__UNKNOWN__" - ) + ) # Export 
fairseq dictionary processor = sp.SentencePieceProcessor() processor.Load(decoder_cfg.output_path_prefix + ".model") @@ -399,7 +400,7 @@ def extract(self, text: str) -> tf.Tensor: Returns: sequence of ints in tf.Tensor - """ + """ text = self.preprocess_text(text) text = text.strip() # remove trailing space indices = self.model.encode_as_ids(text) @@ -449,4 +450,3 @@ def indices2upoints(self, indices: tf.Tensor) -> tf.Tensor: indices = self.normalize_indices(indices) upoints = tf.gather_nd(self.upoints, tf.expand_dims(indices, axis=-1)) return tf.gather_nd(upoints, tf.where(tf.not_equal(upoints, 0))) - diff --git a/tensorflow_asr/losses/rnnt_losses.py b/tensorflow_asr/losses/rnnt_losses.py index 14b8cea9af..bebdc5b536 100644 --- a/tensorflow_asr/losses/rnnt_losses.py +++ b/tensorflow_asr/losses/rnnt_losses.py @@ -1,4 +1,4 @@ -# Copyright 2020 Huy Le Nguyen (@usimarit) +# Copyright 2020 Huy Le Nguyen (@usimarit) and M. Yusuf Sarıgöz (@monatis) # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -19,16 +19,19 @@ use_warprnnt = True except ImportError: print("Cannot import RNNT loss in warprnnt. Falls back to RNNT in TensorFlow") + print("Note: The RNNT in Tensorflow is not supported for CPU yet") from tensorflow.python.ops.gen_array_ops import matrix_diag_part_v2 use_warprnnt = False def rnnt_loss(logits, labels, label_length, logit_length, blank=0, name=None): if use_warprnnt: - return rnnt_loss_warprnnt(logits=logits, labels=labels, label_length=label_length, logit_length=logit_length, blank=blank) + return rnnt_loss_warprnnt(logits=logits, labels=labels, + label_length=label_length, logit_length=logit_length, blank=blank) else: return rnnt_loss_tf(logits=logits, labels=labels, label_length=label_length, logit_length=logit_length, name=name) + @tf.function def rnnt_loss_warprnnt(logits, labels, label_length, logit_length, blank=0): if not tf.config.list_physical_devices('GPU'): @@ -120,7 +123,8 @@ def next_state(x, mask_and_trans_probs): beta_t = tf.concat([x[:, :-1] + truth_probs, LOG_0 * tf.ones(shape=[batch_size, 1])], axis=1) beta_next = reduce_logsumexp(tf.stack([beta_b, beta_t], axis=0), axis=0) - masked_beta_next = nan_to_zero(beta_next * tf.expand_dims(mask_s, axis=1)) + nan_to_zero(x * tf.expand_dims((1.0 - mask_s), axis=1)) + masked_beta_next = (nan_to_zero(beta_next * tf.expand_dims(mask_s, axis=1)) + + nan_to_zero(x * tf.expand_dims((1.0 - mask_s), axis=1))) return masked_beta_next # Initial beta for batches. 
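
# --- Reviewer note (illustrative, not part of the patch): the two print() calls added near the top
# --- of rnnt_losses.py belong to its optional-import fallback. The import line itself is outside
# --- this hunk, so the module name and alias below (warprnnt_tensorflow / warp_rnnt_loss) are
# --- assumptions used only to show the pattern: use the native warp-rnnt kernel when it is
# --- installed, otherwise fall back to the pure-TensorFlow implementation (GPU-only per the note).
try:
    from warprnnt_tensorflow import rnnt_loss as warp_rnnt_loss  # assumed package/alias
    use_warprnnt = True
except ImportError:
    print("Cannot import RNNT loss in warprnnt. Falls back to RNNT in TensorFlow")
    print("Note: The RNNT in Tensorflow is not supported for CPU yet")
    use_warprnnt = False

def pick_rnnt_impl():
    # Mirrors the dispatch in rnnt_loss(): native kernel when available, TF fallback otherwise.
    return "warp-rnnt kernel" if use_warprnnt else "pure TensorFlow fallback"
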
@@ -155,14 +159,10 @@ def compute_rnnt_loss_and_grad_helper(logits, labels, label_length, logit_length bp_diags = extract_diagonals(blank_probs) tp_diags = extract_diagonals(truth_probs) - label_mask = tf.expand_dims(tf.sequence_mask( - label_length + 1, maxlen=target_max_len, dtype=tf.float32), axis=1) - small_label_mask = tf.expand_dims(tf.sequence_mask( - label_length, maxlen=target_max_len, dtype=tf.float32), axis=1) - input_mask = tf.expand_dims(tf.sequence_mask( - logit_length, maxlen=input_max_len, dtype=tf.float32), axis=2) - small_input_mask = tf.expand_dims(tf.sequence_mask( - logit_length - 1, maxlen=input_max_len, dtype=tf.float32), axis=2) + label_mask = tf.expand_dims(tf.sequence_mask(label_length + 1, maxlen=target_max_len, dtype=tf.float32), axis=1) + small_label_mask = tf.expand_dims(tf.sequence_mask(label_length, maxlen=target_max_len, dtype=tf.float32), axis=1) + input_mask = tf.expand_dims(tf.sequence_mask(logit_length, maxlen=input_max_len, dtype=tf.float32), axis=2) + small_input_mask = tf.expand_dims(tf.sequence_mask(logit_length - 1, maxlen=input_max_len, dtype=tf.float32), axis=2) mask = label_mask * input_mask grad_blank_mask = (label_mask * small_input_mask)[:, :-1, :] grad_truth_mask = (small_label_mask * input_mask)[:, :, :-1] @@ -172,30 +172,43 @@ def compute_rnnt_loss_and_grad_helper(logits, labels, label_length, logit_length indices = tf.stack([logit_length - 1, label_length], axis=1) blank_sl = tf.gather_nd(blank_probs, indices, batch_dims=1) - beta = backward_dp(bp_diags, tp_diags, batch_size, input_max_len, target_max_len, label_length, logit_length, - blank_sl) * mask + beta = backward_dp(bp_diags, tp_diags, batch_size, input_max_len, + target_max_len, label_length, logit_length, blank_sl) * mask beta = tf.where(tf.math.is_nan(beta), tf.zeros_like(beta), beta) final_state_probs = beta[:, 0, 0] # Compute gradients of loss w.r.t. blank log-probabilities. - grads_blank = -tf.exp((alpha[:, :-1, :] + beta[:, 1:, :] - tf.reshape(final_state_probs, - shape=[batch_size, 1, 1]) + blank_probs[:, - :-1, - :]) * grad_blank_mask) * grad_blank_mask + grads_blank = -tf.exp( + ( + alpha[:, :-1, :] + beta[:, 1:, :] + - tf.reshape(final_state_probs, shape=[batch_size, 1, 1]) + + blank_probs[:, :-1, :] + ) * grad_blank_mask + ) * grad_blank_mask grads_blank = tf.concat([grads_blank, tf.zeros(shape=(batch_size, 1, target_max_len))], axis=1) last_grads_blank = -1 * tf.scatter_nd( - tf.concat([tf.reshape(tf.range(batch_size, dtype=tf.int64), shape=[batch_size, 1]), tf.cast(indices, dtype=tf.int64)], axis=1), - tf.ones(batch_size, dtype=tf.float32), [batch_size, input_max_len, target_max_len]) + tf.concat([tf.reshape(tf.range(batch_size, dtype=tf.int64), shape=[batch_size, 1]), + tf.cast(indices, dtype=tf.int64)], axis=1), + tf.ones(batch_size, dtype=tf.float32), + [batch_size, input_max_len, target_max_len] + ) grads_blank = grads_blank + last_grads_blank # Compute gradients of loss w.r.t. truth log-probabilities. - grads_truth = -tf.exp((alpha[:, :, :-1] + beta[:, :, 1:] - tf.reshape(final_state_probs, shape=[batch_size, 1, - 1]) + truth_probs) * grad_truth_mask) * grad_truth_mask + grads_truth = -tf.exp( + ( + alpha[:, :, :-1] + beta[:, :, 1:] + - tf.reshape(final_state_probs, shape=[batch_size, 1, 1]) + + truth_probs + ) + * grad_truth_mask + ) * grad_truth_mask # Compute gradients of loss w.r.t. activations. 
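
# --- Reviewer note (illustrative, not part of the patch): a quick shape check for the masks
# --- rebuilt a few lines above, with toy sizes chosen here (batch 2, input_max_len 5,
# --- target_max_len 4). label_mask broadcasts over time and input_mask over labels, so their
# --- product is the (batch, T, U) validity mask applied to the loss lattice below.
import tensorflow as tf

label_length = tf.constant([2, 3])   # label count per utterance
logit_length = tf.constant([4, 5])   # frame count per utterance
target_max_len, input_max_len = 4, 5

label_mask = tf.expand_dims(tf.sequence_mask(label_length + 1, maxlen=target_max_len, dtype=tf.float32), axis=1)
input_mask = tf.expand_dims(tf.sequence_mask(logit_length, maxlen=input_max_len, dtype=tf.float32), axis=2)
print(label_mask.shape, input_mask.shape, (label_mask * input_mask).shape)  # (2, 1, 4) (2, 5, 1) (2, 5, 4)
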
a = tf.tile(tf.reshape(tf.range(target_max_len - 1, dtype=tf.int64), shape=(1, 1, target_max_len - 1, 1)), multiples=[batch_size, 1, 1, 1]) b = tf.cast(tf.reshape(labels - 1, shape=(batch_size, 1, target_max_len - 1, 1)), dtype=tf.int64) + # b = tf.where(tf.equal(b, -1), tf.zeros_like(b), b) # for cpu testing (index -1 on cpu will raise errors) c = tf.concat([a, b], axis=3) d = tf.tile(c, multiples=(1, input_max_len, 1, 1)) e = tf.tile(tf.reshape(tf.range(input_max_len, dtype=tf.int64), shape=(1, input_max_len, 1, 1)), @@ -208,8 +221,8 @@ def compute_rnnt_loss_and_grad_helper(logits, labels, label_length, logit_length probs = tf.exp(log_probs) grads_truth_scatter = tf.scatter_nd(scatter_idx, grads_truth, [batch_size, input_max_len, target_max_len, vocab_size - 1]) - grads = tf.concat( - [tf.reshape(grads_blank, shape=(batch_size, input_max_len, target_max_len, -1)), grads_truth_scatter], axis=3) + grads = tf.concat([tf.reshape(grads_blank, shape=(batch_size, input_max_len, target_max_len, -1)), + grads_truth_scatter], axis=3) grads_logits = grads - probs * (tf.reduce_sum(grads, axis=3, keepdims=True)) loss = -final_state_probs @@ -244,5 +257,4 @@ def grad(grad_loss): return result[0], grad - - return compute_rnnt_loss_and_grad(*args) \ No newline at end of file + return compute_rnnt_loss_and_grad(*args) diff --git a/tensorflow_asr/utils/__init__.py b/tensorflow_asr/utils/__init__.py index 8fc814dbec..e7becd8f27 100644 --- a/tensorflow_asr/utils/__init__.py +++ b/tensorflow_asr/utils/__init__.py @@ -64,7 +64,10 @@ def setup_strategy(devices): def setup_tpu(tpu_address=None): import tensorflow as tf - resolver = tf.distribute.cluster_resolver.TPUClusterResolver() if tpu_address is None else tf.distribute.cluster_resolver.TPUClusterResolver(tpu='grpc://' + tpu_address) + if tpu_address is None: + resolver = tf.distribute.cluster_resolver.TPUClusterResolver() + else: + resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='grpc://' + tpu_address) tf.config.experimental_connect_to_cluster(resolver) tf.tpu.experimental.initialize_tpu_system(resolver) print("All TPUs: ", tf.config.list_logical_devices('TPU')) diff --git a/tests/featurizer/test_sentencepiece.py b/tests/featurizer/test_sentencepiece.py index 4042eb6c8b..e289576915 100644 --- a/tests/featurizer/test_sentencepiece.py +++ b/tests/featurizer/test_sentencepiece.py @@ -48,13 +48,13 @@ def test_featurizer(): "normalize_per_feature": False} text_featurizer_sentencepiece = SentencePieceFeaturizer.load_from_file(config, None) - subwords_path = os.path.join(os.path.abspath(os.path.dirname(__file__)), + subwords_path = os.path.join(os.path.abspath(os.path.dirname(__file__)), os.pardir, os.pardir, "vocabularies", - "librispeech_train_4_1030.subwords") + "librispeech_train_4_1030.subwords") text_featurizer_subwords = SubwordFeaturizer.load_from_file(config, subwords_path) - speech_featurizer = TFSpeechFeaturizer(config_speech) + speech_featurizer = TFSpeechFeaturizer(config_speech) data_path = os.path.join(os.path.abspath(os.path.dirname(__file__)), "transcripts_librispeech_train_clean_100.tsv") def get_data(featurizer: TextFeaturizer):
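
Note (illustrative, not part of the patch): with the scripts/create_tfrecords.py change above, shard count and shuffling are now controlled by the new "--tfrecords_shards" and "--shuffle" flags instead of being hard-coded per mode, e.g. `python scripts/create_tfrecords.py --mode train --tfrecords_dir /data/tfrecords --tfrecords_shards 32 --shuffle train.tsv`. The programmatic equivalent of that command, with placeholder paths, looks like this:

from tensorflow_asr.datasets.asr_dataset import ASRTFRecordDataset

ASRTFRecordDataset(
    data_paths=["train.tsv"],            # placeholder transcript file(s)
    tfrecords_dir="/data/tfrecords",     # placeholder output directory
    speech_featurizer=None, text_featurizer=None,  # featurizers are not needed just to write records
    stage="train", shuffle=True, tfrecords_shards=32,
).create_tfrecords()
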
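
Closing note (illustrative, not part of the patch): the thread running through the dataset changes above is the shuffle buffer. Previously every ASRDataset shuffled with dataset.shuffle(TFRECORD_SHARDS, ...), i.e. a buffer of only 16 elements; now it uses the configurable buffer_size (BUFFER_SIZE = 100 by default, exposed to the scripts as --bfs, and validated to be positive whenever shuffle is enabled). A tiny tf.data demo of why the buffer size matters:

import tensorflow as tf

data = tf.data.Dataset.range(1000)
small = list(data.shuffle(16, reshuffle_each_iteration=True).take(10).as_numpy_iterator())
large = list(data.shuffle(100, reshuffle_each_iteration=True).take(10).as_numpy_iterator())
print("buffer=16 :", small)   # samples come from roughly the first dozen-plus elements only
print("buffer=100:", large)   # samples are drawn from a much wider window, i.e. better mixing
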